diff --git a/.ci/all_requirements.txt b/.ci/all_requirements.txt index ac9682a09bec1..4918d7519291f 100644 --- a/.ci/all_requirements.txt +++ b/.ci/all_requirements.txt @@ -12,6 +12,94 @@ certifi==2025.8.3 \ --hash=sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407 \ --hash=sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5 # via requests +cffi==2.0.0 \ + --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ + --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ + --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ + --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ + --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \ + --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \ + --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ + --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ + --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \ + --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ + --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \ + --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ + --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ + --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \ + --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ + --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ + --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ + --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ + --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ + --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ + --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ + --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ + --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ + --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \ + --hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \ + --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ + --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ + --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \ + --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ + --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \ + --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \ + --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ + --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \ + --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ + --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ + --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ + --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ + --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ + 
--hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ + --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ + --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ + --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ + --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ + --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ + --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ + --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ + --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \ + --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \ + --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ + --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ + --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \ + --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ + --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ + --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ + --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \ + --hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \ + --hash=sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739 \ + --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \ + --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \ + --hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \ + --hash=sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9 \ + --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \ + --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \ + --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \ + --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \ + --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \ + --hash=sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f \ + --hash=sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495 \ + --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \ + --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \ + --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \ + --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \ + --hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \ + --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \ + --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \ + --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \ + --hash=sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7 \ + --hash=sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5 \ + --hash=sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534 \ + --hash=sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49 \ + 
--hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \ + --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 \ + --hash=sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453 \ + --hash=sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf + # via + # cryptography + # pynacl charset-normalizer==3.4.3 \ --hash=sha256:00237675befef519d9af72169d8604a067d92755e84fe76492fef5441db05b91 \ --hash=sha256:02425242e96bcf29a49711b0ca9f37e451da7c70562bc10e8ed992a5a7a25cc0 \ @@ -93,6 +181,62 @@ charset-normalizer==3.4.3 \ --hash=sha256:fd10de089bcdcd1be95a2f73dbe6254798ec1bda9f450d5828c96f93e2536b9c \ --hash=sha256:fdabf8315679312cfa71302f9bd509ded4f2f263fb5b765cf1433b39106c3cc9 # via requests +cryptography==46.0.3 \ + --hash=sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217 \ + --hash=sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d \ + --hash=sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc \ + --hash=sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71 \ + --hash=sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971 \ + --hash=sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a \ + --hash=sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926 \ + --hash=sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc \ + --hash=sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d \ + --hash=sha256:191bb60a7be5e6f54e30ba16fdfae78ad3a342a0599eb4193ba88e3f3d6e185b \ + --hash=sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20 \ + --hash=sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044 \ + --hash=sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3 \ + --hash=sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715 \ + --hash=sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4 \ + --hash=sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506 \ + --hash=sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f \ + --hash=sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0 \ + --hash=sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683 \ + --hash=sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3 \ + --hash=sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21 \ + --hash=sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91 \ + --hash=sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c \ + --hash=sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8 \ + --hash=sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df \ + --hash=sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c \ + --hash=sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb \ + --hash=sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7 \ + --hash=sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04 \ + --hash=sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db \ + --hash=sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459 \ + --hash=sha256:7ce938a99998ed3c8aa7e7272dca1a610401ede816d36d0693907d863b10d9ea \ + --hash=sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914 \ + 
--hash=sha256:9394673a9f4de09e28b5356e7fff97d778f8abad85c9d5ac4a4b7e25a0de7717 \ + --hash=sha256:94cd0549accc38d1494e1f8de71eca837d0509d0d44bf11d158524b0e12cebf9 \ + --hash=sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac \ + --hash=sha256:a23582810fedb8c0bc47524558fb6c56aac3fc252cb306072fd2815da2a47c32 \ + --hash=sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec \ + --hash=sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1 \ + --hash=sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb \ + --hash=sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac \ + --hash=sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665 \ + --hash=sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e \ + --hash=sha256:c70cc23f12726be8f8bc72e41d5065d77e4515efae3690326764ea1b07845cfb \ + --hash=sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5 \ + --hash=sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936 \ + --hash=sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de \ + --hash=sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372 \ + --hash=sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54 \ + --hash=sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422 \ + --hash=sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849 \ + --hash=sha256:e7aec276d68421f9574040c26e2a7c3771060bc0cff408bae1dcb19d3ab1e63c \ + --hash=sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963 \ + --hash=sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018 + # via pyjwt google-api-core==2.25.1 \ --hash=sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7 \ --hash=sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8 @@ -303,6 +447,47 @@ pybind11==2.13.6 \ --hash=sha256:237c41e29157b962835d356b370ededd57594a26d5894a795960f0047cb5caf5 \ --hash=sha256:ba6af10348c12b24e92fa086b39cfba0eff619b61ac77c406167d813b096d39a # via -r mlir/python/requirements.txt +pycparser==2.23 \ + --hash=sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2 \ + --hash=sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934 + # via cffi +pygithub==2.8.1 \ + --hash=sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0 \ + --hash=sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9 + # via -r .ci/requirements.txt +pyjwt[crypto]==2.10.1 \ + --hash=sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953 \ + --hash=sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb + # via pygithub +pynacl==1.6.0 \ + --hash=sha256:04f20784083014e265ad58c1b2dd562c3e35864b5394a14ab54f5d150ee9e53e \ + --hash=sha256:10d755cf2a455d8c0f8c767a43d68f24d163b8fe93ccfaabfa7bafd26be58d73 \ + --hash=sha256:140373378e34a1f6977e573033d1dd1de88d2a5d90ec6958c9485b2fd9f3eb90 \ + --hash=sha256:16c60daceee88d04f8d41d0a4004a7ed8d9a5126b997efd2933e08e93a3bd850 \ + --hash=sha256:16dd347cdc8ae0b0f6187a2608c0af1c8b7ecbbe6b4a06bff8253c192f696990 \ + --hash=sha256:25720bad35dfac34a2bcdd61d9e08d6bfc6041bebc7751d9c9f2446cf1e77d64 \ + --hash=sha256:2d6cd56ce4998cb66a6c112fda7b1fdce5266c9f05044fa72972613bef376d15 \ + --hash=sha256:347dcddce0b4d83ed3f32fd00379c83c425abee5a9d2cd0a2c84871334eaff64 \ + 
--hash=sha256:4853c154dc16ea12f8f3ee4b7e763331876316cc3a9f06aeedf39bcdca8f9995 \ + --hash=sha256:49c336dd80ea54780bcff6a03ee1a476be1612423010472e60af83452aa0f442 \ + --hash=sha256:4a25cfede801f01e54179b8ff9514bd7b5944da560b7040939732d1804d25419 \ + --hash=sha256:51fed9fe1bec9e7ff9af31cd0abba179d0e984a2960c77e8e5292c7e9b7f7b5d \ + --hash=sha256:536703b8f90e911294831a7fbcd0c062b837f3ccaa923d92a6254e11178aaf42 \ + --hash=sha256:5789f016e08e5606803161ba24de01b5a345d24590a80323379fc4408832d290 \ + --hash=sha256:6b08eab48c9669d515a344fb0ef27e2cbde847721e34bba94a343baa0f33f1f4 \ + --hash=sha256:6b393bc5e5a0eb86bb85b533deb2d2c815666665f840a09e0aa3362bb6088736 \ + --hash=sha256:84709cea8f888e618c21ed9a0efdb1a59cc63141c403db8bf56c469b71ad56f2 \ + --hash=sha256:8bfaa0a28a1ab718bad6239979a5a57a8d1506d0caf2fba17e524dbb409441cf \ + --hash=sha256:bbcc4452a1eb10cd5217318c822fde4be279c9de8567f78bad24c773c21254f8 \ + --hash=sha256:cb36deafe6e2bce3b286e5d1f3e1c246e0ccdb8808ddb4550bb2792f2df298f2 \ + --hash=sha256:cf831615cc16ba324240de79d925eacae8265b7691412ac6b24221db157f6bd1 \ + --hash=sha256:dcdeb41c22ff3c66eef5e63049abf7639e0db4edee57ba70531fc1b6b133185d \ + --hash=sha256:dea103a1afcbc333bc0e992e64233d360d393d1e63d0bc88554f572365664348 \ + --hash=sha256:ef214b90556bb46a485b7da8258e59204c244b1b5b576fb71848819b468c44a7 \ + --hash=sha256:f3482abf0f9815e7246d461fab597aa179b7524628a4bc36f86a7dc418d2608d \ + --hash=sha256:f46386c24a65383a9081d68e9c2de909b1834ec74ff3013271f1bca9c2d233eb \ + --hash=sha256:f4b3824920e206b4f52abd7de621ea7a44fd3cb5c8daceb7c3612345dfc54f2e + # via pygithub pyyaml==6.0.1 \ --hash=sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5 \ --hash=sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc \ @@ -362,6 +547,7 @@ requests==2.32.5 \ # via # google-api-core # google-cloud-storage + # pygithub rsa==4.9.1 \ --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 @@ -386,8 +572,12 @@ swig==4.3.1 \ typing-extensions==4.15.0 \ --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \ --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548 - # via -r mlir/python/requirements.txt + # via + # -r mlir/python/requirements.txt + # pygithub urllib3==2.5.0 \ --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \ --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc - # via requests + # via + # pygithub + # requests diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py index 6785e82f3440b..18c5e078a5064 100644 --- a/.ci/generate_test_report_github.py +++ b/.ci/generate_test_report_github.py @@ -4,19 +4,9 @@ """Script to generate a build report for Github.""" import argparse -import platform import generate_test_report_lib -def compute_platform_title() -> str: - logo = ":window:" if platform.system() == "Windows" else ":penguin:" - # On Linux the machine value is x86_64 on Windows it is AMD64. 
-    if platform.machine() == "x86_64" or platform.machine() == "AMD64":
-        arch = "x64"
-    else:
-        arch = platform.machine()
-    return f"{logo} {platform.system()} {arch} Test Results"
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -27,7 +17,9 @@ def compute_platform_title() -> str:
     args = parser.parse_args()
 
     report = generate_test_report_lib.generate_report_from_files(
-        compute_platform_title(), args.return_code, args.build_test_logs
+        generate_test_report_lib.compute_platform_title(),
+        args.return_code,
+        args.build_test_logs,
     )
 
     print(report)
diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 36c95852452ac..ce8262f0dc73f 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -3,8 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Library to parse JUnit XML files and return a markdown report."""
 
+from typing import TypedDict, Optional
+import platform
+
 from junitparser import JUnitXml, Failure
 
+
+# This data structure should match the definition in llvm-zorg in
+# premerge/advisor/advisor_lib.py
+# TODO(boomanaiden154): Drop the Optional here and switch to str | None when
+# we require Python 3.10.
+class FailureExplanation(TypedDict):
+    name: str
+    explained: bool
+    reason: Optional[str]
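+# Example of a single advisor entry (illustrative values, not real advisor
+# output): {"name": "touch test/4.stamp", "explained": True,
+# "reason": "Flaky at HEAD"}. "reason" may be None when no explanation exists.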
") + if failure_explanation: + output.extend( + [ + f"{failed_action} (Likely Already Failing)" "", + failure_explanation["reason"], + "", + ] + ) + else: + output.extend([f"{failed_action}", ""]) output.extend( [ - "
", - f"{failed_action}", - "", "```", failure_message, "```", @@ -98,6 +127,7 @@ def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]: ) return output + def get_failures(junit_objects) -> dict[str, list[tuple[str, str]]]: failures = {} for results in junit_objects: @@ -129,12 +159,19 @@ def generate_report( ninja_logs: list[list[str]], size_limit=1024 * 1024, list_failures=True, + failure_explanations_list: list[FailureExplanation] = [], ): failures = get_failures(junit_objects) tests_run = 0 tests_skipped = 0 tests_failed = 0 + failure_explanations: dict[str, FailureExplanation] = {} + for failure_explanation in failure_explanations_list: + if not failure_explanation["explained"]: + continue + failure_explanations[failure_explanation["name"]] = failure_explanation + for results in junit_objects: for testsuite in results: tests_run += testsuite.tests @@ -173,7 +210,7 @@ def generate_report( "", ] ) - report.extend(_format_ninja_failures(ninja_failures)) + report.extend(_format_failures(ninja_failures, failure_explanations)) report.extend( [ "", @@ -209,18 +246,7 @@ def plural(num_tests): for testsuite_name, failures in failures.items(): report.extend(["", f"### {testsuite_name}"]) - for name, output in failures: - report.extend( - [ - "
", - f"{name}", - "", - "```", - output, - "```", - "
", - ] - ) + report.extend(_format_failures(failures, failure_explanations)) elif return_code != 0: # No tests failed but the build was in a failed state. Bring this to the user's # attention. @@ -245,7 +271,7 @@ def plural(num_tests): "", ] ) - report.extend(_format_ninja_failures(ninja_failures)) + report.extend(_format_failures(ninja_failures, failure_explanations)) if failures or return_code != 0: report.extend(["", UNRELATED_FAILURES_STR]) @@ -282,3 +308,13 @@ def load_info_from_files(build_log_files): def generate_report_from_files(title, return_code, build_log_files): junit_objects, ninja_logs = load_info_from_files(build_log_files) return generate_report(title, return_code, junit_objects, ninja_logs) + + +def compute_platform_title() -> str: + logo = ":window:" if platform.system() == "Windows" else ":penguin:" + # On Linux the machine value is x86_64 on Windows it is AMD64. + if platform.machine() == "x86_64" or platform.machine() == "AMD64": + arch = "x64" + else: + arch = platform.machine() + return f"{logo} {platform.system()} {arch} Test Results" diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index a8659e1d6a3e3..341cf3037b921 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -407,7 +435,6 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ] ], ) - print(test) self.assertEqual( generate_test_report_lib.generate_report( "Foo", @@ -449,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
-                <summary>test/2.stamp</summary>
+                <summary>touch test/2.stamp</summary>
 
                 ```
                 FAILED: touch test/2.stamp
 
                 ```
                 </details>
@@ -457,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self):
                 <details>
-                <summary>test/4.stamp</summary>
+                <summary>touch test/4.stamp</summary>
 
                 ```
                 FAILED: touch test/4.stamp
 
                 ```
                 </details>
@@ -754,6 +781,160 @@ def test_report_size_limit(self):
             ),
         )
 
+    def test_report_ninja_explanation(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [],
+                [
+                    [
+                        "[1/5] test/1.stamp",
+                        "[2/5] test/2.stamp",
+                        "[3/5] test/3.stamp",
+                        "[4/5] test/4.stamp",
+                        "FAILED: test/4.stamp",
+                        "touch test/4.stamp",
+                        "Half Moon Bay.",
+                        "[5/5] test/5.stamp",
+                    ]
+                ],
+                failure_explanations_list=[
+                    {
+                        "name": "test/4.stamp",
+                        "explained": True,
+                        "reason": "Failing at head",
+                    }
+                ],
+            ),
+            dedent(
+                """\
+                # Foo
+
+                The build failed before running any tests. Click on a failure below to see the details.
+
+                <details>
+                <summary>test/4.stamp (Likely Already Failing)</summary>
+                Failing at head
+
+                ```
+                FAILED: test/4.stamp
+                touch test/4.stamp
+                Half Moon Bay.
+                ```
+                </details>
+
+                If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+            ),
+        )
+
+    def test_report_test_failure_explanation(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [
+                    junit_from_xml(
+                        dedent(
+                            """\
+                            <?xml version="1.0" encoding="UTF-8"?>
+                            <testsuites time="8.89">
+                            <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+                            <testcase classname="Bar/test_3" name="test_3" time="0.02">
+                            <failure><![CDATA[Error! Expected Big Sur to be next to the ocean.]]></failure>
+                            </testcase>
+                            </testsuite>
+                            </testsuites>"""
+                        )
+                    )
+                ],
+                [],
+                failure_explanations_list=[
+                    {
+                        "name": "Bar/test_3/test_3",
+                        "explained": True,
+                        "reason": "Big Sur is next to the Pacific.",
+                    }
+                ],
+            ),
+            (
+                dedent(
+                    """\
+                    # Foo
+
+                    * 1 test failed
+
+                    ## Failed Tests
+                    (click on a test name to see its output)
+
+                    ### Bar
+                    <details>
+                    <summary>Bar/test_3/test_3 (Likely Already Failing)</summary>
+                    Big Sur is next to the Pacific.
+
+                    ```
+                    Error! Expected Big Sur to be next to the ocean.
+                    ```
+                    </details>
+
+                    If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+                )
+            ),
+        )
+
+    def test_report_test_failure_have_explanation_explained_false(self):
+        self.assertEqual(
+            generate_test_report_lib.generate_report(
+                "Foo",
+                1,
+                [
+                    junit_from_xml(
+                        dedent(
+                            """\
+                            <?xml version="1.0" encoding="UTF-8"?>
+                            <testsuites time="8.89">
+                            <testsuite name="Bar" tests="1" failures="1" skipped="0" time="410.63">
+                            <testcase classname="Bar/test_3" name="test_3" time="0.02">
+                            <failure><![CDATA[Error! Expected Mt. Shasta to be next in the Eastern Sierras.]]></failure>
+                            </testcase>
+                            </testsuite>
+                            </testsuites>"""
+                        )
+                    )
+                ],
+                [],
+                failure_explanations_list=[
+                    {
+                        "name": "Bar/test_3/test_3",
+                        "explained": False,
+                        "reason": "Mt. Shasta is in the Cascades",
+                    }
+                ],
+            ),
+            (
+                dedent(
+                    """\
+                    # Foo
+
+                    * 1 test failed
+
+                    ## Failed Tests
+                    (click on a test name to see its output)
+
+                    ### Bar
+                    <details>
+                    <summary>Bar/test_3/test_3</summary>
+
+                    ```
+                    Error! Expected Mt. Shasta to be next in the Eastern Sierras.
+                    ```
+                    </details>
+
+                    If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the `infrastructure` label."""
+                )
+            ),
+        )
+
     def test_generate_report_end_to_end(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             junit_xml_file = os.path.join(temp_dir, "junit.xml")
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 5fb8f69528e89..beaed71f49f65 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -32,8 +32,6 @@ export LD=link
 # see https://github.com/llvm/llvm-project/pull/82393 and
 # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
 # for further information.
-# We limit the number of parallel compile jobs to 24 control memory
-# consumption and improve build reliability.
 cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
   -D LLVM_ENABLE_PROJECTS="${projects}" \
   -G Ninja \
@@ -49,7 +47,6 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
   -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
   -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
   -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
-  -D CMAKE_CXX_FLAGS="-Wno-c++98-compat -Wno-c++14-compat -Wno-unsafe-buffer-usage -Wno-old-style-cast" \
   -D LLVM_ENABLE_RUNTIMES="${runtimes}"
 
 start-group "ninja"
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 06c6cb9aaa46b..e1bc59f389b36 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -4,20 +4,90 @@
 """Script for getting explanations from the premerge advisor."""
 
 import argparse
-import os
 import platform
 import sys
+import json
+
+# TODO(boomanaiden154): Remove the Optional import once we can require Python
+# 3.10.
+from typing import Optional
 
 import requests
+import github
+import github.PullRequest
 
 import generate_test_report_lib
 
 PREMERGE_ADVISOR_URL = (
     "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/explain"
 )
+COMMENT_TAG = "<!--LLVM PREMERGE ADVISOR COMMENT: {platform}-->"
+
+
+def get_comment_id(platform: str, pr: github.PullRequest.PullRequest) -> Optional[int]:
+    platform_comment_tag = COMMENT_TAG.format(platform=platform)
+    for comment in pr.as_issue().get_comments():
+        if platform_comment_tag in comment.body:
+            return comment.id
+    return None
+
+
+def get_comment(
+    github_token: str,
+    pr_number: int,
+    body: str,
+) -> dict[str, str]:
+    repo = github.Github(github_token).get_repo("llvm/llvm-project")
+    pr = repo.get_issue(pr_number).as_pull_request()
+    comment = {"body": body}
+    comment_id = get_comment_id(platform.system(), pr)
+    if comment_id:
+        comment["id"] = comment_id
+    return comment
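+
+
+# The issue-write workflow consumes the "comment" file written below: a JSON
+# list of comment dicts, e.g. [{"body": "...", "id": 123456}] (illustrative
+# values). "id" is present only when an existing comment should be updated.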
+
+
+def main(
+    commit_sha: str,
+    build_log_files: list[str],
+    github_token: str,
+    pr_number: int,
+    return_code: int,
+):
+    """The main entrypoint for the script.
+
+    This function parses failures from files, requests information from the
+    premerge advisor, and may write a Github comment depending upon the output.
+    There are four different scenarios:
+    1. There has never been a previous failure and the job passes - We do not
+      create a comment. We write out an empty file to the comment path so the
+      issue-write workflow knows not to create anything.
+    2. There has never been a previous failure and the job fails - We create a
+      new comment containing the failure information and any possible premerge
+      advisor findings.
+    3. There has been a previous failure and the job passes - We update the
+      existing comment by passing its ID and a passed message to the
+      issue-write workflow.
+    4. There has been a previous failure and the job fails - We update the
+      existing comment in the same manner as above, but generate the comment
+      as if we have a failure.
+
+    Args:
+      commit_sha: The base commit SHA for this PR run.
+      build_log_files: The list of JUnit XML files and ninja logs.
+      github_token: The token to use to access the Github API.
+      pr_number: The number of the PR associated with this run.
+      return_code: The numerical return code of ninja/CMake.
+    """
+    if return_code == 0:
+        with open("comment", "w") as comment_file_handle:
+            comment = get_comment(
+                github_token,
+                pr_number,
+                ":white_check_mark: With the latest revision this PR passed "
+                "the premerge checks.",
+            )
+            if "id" in comment:
+                json.dump([comment], comment_file_handle)
+        return
     junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
         build_log_files
     )
@@ -40,9 +110,26 @@ def main(commit_sha: str, build_log_files: list[str]):
         explanation_request["failures"].append(
             {"name": name, "message": failure_message}
         )
-    advisor_response = requests.get(PREMERGE_ADVISOR_URL, json=explanation_request)
+    advisor_response = requests.get(
+        PREMERGE_ADVISOR_URL, json=explanation_request, timeout=5
+    )
     if advisor_response.status_code == 200:
         print(advisor_response.json())
+        comments = [
+            get_comment(
+                github_token,
+                pr_number,
+                generate_test_report_lib.generate_report(
+                    generate_test_report_lib.compute_platform_title(),
+                    return_code,
+                    junit_objects,
+                    ninja_logs,
+                    failure_explanations_list=advisor_response.json(),
+                ),
+            )
+        ]
+        with open("comment", "w") as comment_file_handle:
+            json.dump(comments, comment_file_handle)
     else:
         print(advisor_response.reason)
 
@@ -50,6 +137,9 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("commit_sha", help="The base commit SHA for the test.")
+    parser.add_argument("return_code", help="The build's return code", type=int)
+    parser.add_argument("github_token", help="Github authentication token", type=str)
+    parser.add_argument("pr_number", help="The PR number", type=int)
     parser.add_argument(
         "build_log_files", help="Paths to JUnit report files and ninja logs.", nargs="*"
     )
@@ -60,4 +150,10 @@
     if platform.machine() == "arm64":
         sys.exit(0)
 
-    main(args.commit_sha, args.build_log_files)
+    main(
+        args.commit_sha,
+        args.build_log_files,
+        args.github_token,
+        args.pr_number,
+        args.return_code,
+    )
diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py
index cb379b0e77cd6..9e14743c7cc07 100644
--- a/.ci/premerge_advisor_upload.py
+++ b/.ci/premerge_advisor_upload.py
@@ -45,7 +45,7 @@ def main(commit_sha, workflow_run_number, build_log_files):
     for name, failure_message in ninja_failures:
         failure_info["failures"].append({"name": name, "message": failure_message})
     for premerge_advisor_url in PREMERGE_ADVISOR_URLS:
-        requests.post(premerge_advisor_url, json=failure_info)
+        requests.post(premerge_advisor_url, json=failure_info, timeout=5)
 
 
 if __name__ == "__main__":
diff --git a/.ci/requirements.txt b/.ci/requirements.txt
index 2fec1baf25fdc..45eb253548496 100644
--- a/.ci/requirements.txt
+++ b/.ci/requirements.txt
@@ -1,2 +1,3 @@
 junitparser==3.2.0
 google-cloud-storage==3.3.0
+PyGithub==2.8.1
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 540acfa8d5cc5..c364f9395d67b 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,17 +33,18 @@ function at-exit {
   # If building fails there will be no results files.
   shopt -s nullglob
 
-  if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+  if [[ "$GITHUB_ACTIONS" != "" ]]; then
     python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
       $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log \
       >> $GITHUB_STEP_SUMMARY
+    python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+      $(git rev-parse HEAD~1) $retcode "${GITHUB_TOKEN}" \
+      $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+      "${MONOREPO_ROOT}"/ninja*.log
   fi
 
   if [[ "$retcode" != "0" ]]; then
     if [[ "$GITHUB_ACTIONS" != "" ]]; then
-      python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-        $(git rev-parse HEAD~1) "${BUILD_DIR}"/test-results.*.xml \
-        "${MONOREPO_ROOT}"/ninja*.log
       python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
         $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
         "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
diff --git a/.clang-tidy b/.clang-tidy
index 06bb0f18e9d2e..2cda1b81de808 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,3 +1,4 @@
+HeaderFilterRegex: ''
 Checks: >
   -*,
   clang-diagnostic-*,
diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
new file mode 100644
index 0000000000000..595c3f8dd2070
--- /dev/null
+++ b/.github/actions/build-container/action.yml
@@ -0,0 +1,95 @@
+name: Build Container
+description: >-
+  Build and test a container using the standard llvm naming scheme for containers.
+
+inputs:
+  tag:
+    description: >-
+      The tag to use for this container.
+    required: false
+  container-name:
+    description: >-
+      The name for the container.
+    required: true
+  dockerfile:
+    description: >-
+      Path to the Dockerfile.
+    required: false
+  target:
+    description: >-
+      The container target to build (passed to podman via the --target option).
+    required: false
+  context:
+    description: >-
+      Path to context for the container build.
+    required: false
+  test-command:
+    description: >-
+      Test command to run to ensure the container is working correctly.
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    # podman is not installed by default on the ARM64 images.
+ - name: Install Podman + if: runner.arch == 'ARM64' + shell: bash + run: | + sudo apt-get install podman + + - name: Build Container + shell: bash + env: + INPUT_TAG: ${{inputs.tag }} + INPUT_CONTAINER_NAME: ${{ inputs.container-name }} + INPUT_TARGET: ${{ inputs.target }} + INPUT_DOCKERFILE: ${{ inputs.dockerfile }} + INPUT_CONTEXT: ${{ inputs.context }} + id: build + run: | + env + tag="${INPUT_TAG:-$(git rev-parse --short=12 HEAD)}" + + case "$RUNNER_ARCH" in + ARM64) + container_arch="arm64v8" + ;; + *) + container_arch="amd64" + ;; + esac + + container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/$container_arch/$INPUT_CONTAINER_NAME:$tag" + container_filename="$(echo $container_name | sed -e 's/\//-/g' -e 's/:/-/g').tar" + if [ -n "$INPUT_TARGET" ]; then + podman_options="$podman_options --target $INPUT_TARGET" + fi + if [ -n "$INPUT_DOCKERFILE" ]; then + podman_options="$podman_options -f $INPUT_DOCKERFILE" + fi + podman_options="$podman_options ${INPUT_CONTEXT:-.}" + echo "Podman Options: $podman_options" + + podman build -t $container_name $podman_options + + podman save $container_name > $container_filename + + echo "container-full-name=$container_name" >> $GITHUB_OUTPUT + + - name: Create container artifact + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: ${{ inputs.container-name }}-${{ runner.arch }} + path: "*.tar" + retention-days: 14 + + - name: Test container + shell: bash + if: inputs.test-command + env: + INPUT_TEST_COMMAND: ${{ inputs.test-command }} + CONTAINER_FULL_NAME: ${{ steps.build.outputs.container-full-name }} + run: | + podman run --pull=never --rm -it $CONTAINER_FULL_NAME /usr/bin/bash -x -c "$INPUT_TEST_COMMAND" + diff --git a/.github/actions/push-container/action.yml b/.github/actions/push-container/action.yml new file mode 100644 index 0000000000000..087e3dcb2718c --- /dev/null +++ b/.github/actions/push-container/action.yml @@ -0,0 +1,44 @@ +name: Push Container +description: >- + Download all container artifacts for this job and push them to the GitHub registry. + +inputs: + token: + description: >- + Token to use to authenticate with the container registry. + required: true + +runs: + using: "composite" + steps: + - name: Download container + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + + - name: Push Container + env: + GITHUB_TOKEN: ${{ inputs.token }} + shell: bash + run: | + function push_container { + image_name=$1 + latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') + podman tag $image_name $latest_name + echo "Pushing $image_name ..." + podman push --compression-format=zstd $image_name + echo "Pushing $latest_name ..." + podman push --compression-format=zstd $latest_name + } + + podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io + for f in $(find . -iname '*.tar'); do + image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') + push_container $image_name + + if echo $image_name | grep '/amd64/'; then + # For amd64, create an alias with the arch component removed. + # This matches the convention used on dockerhub. 
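+            # e.g. ghcr.io/llvm/amd64/ci-ubuntu-24.04:TAG becomes
+            # ghcr.io/llvm/ci-ubuntu-24.04:TAG (illustrative owner and tag).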
+            default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
+            podman tag $image_name $default_image_name
+            push_container $default_image_name
+          fi
+        done
diff --git a/.github/instructions/lldb.instructions.md b/.github/instructions/lldb.instructions.md
new file mode 100644
index 0000000000000..35bcd27b1b42f
--- /dev/null
+++ b/.github/instructions/lldb.instructions.md
@@ -0,0 +1,79 @@
+---
+applyTo: lldb/**/*
+---
+
+When reviewing code, focus on:
+
+## Language, Libraries & Standards
+
+- Target C++17 and avoid vendor-specific extensions.
+- For Python scripts, follow PEP 8.
+- Prefer standard library or LLVM support libraries instead of reinventing data structures.
+
+## Comments & Documentation
+
+- Each source file should include the standard LLVM file header.
+- Header files must have proper header guards.
+- Non-trivial classes and public methods should have Doxygen documentation.
+- Use `//` or `///` comments normally; avoid block comments unless necessary.
+- Non-trivial code should have comments explaining what it does and why. Avoid comments that explain how it does it at a micro level.
+
+## Language & Compiler Issues
+
+- Write portable code; wrap non-portable code in interfaces.
+- Do not use RTTI or exceptions.
+- Prefer C++-style casts over C-style casts.
+- Do not use static constructors.
+- Use `class` or `struct` consistently; `struct` only for all-public data.
+- When the same class is declared or defined multiple times, make sure it's consistently done using either `class` or `struct`.
+
+## Headers & Library Layering
+
+- Include order: module header → local/private headers → project headers → system headers.
+- Headers must compile standalone (include all dependencies).
+- Maintain proper library layering; avoid circular dependencies.
+- Include minimally; use forward declarations where possible.
+- Keep internal headers private to modules.
+- Use full namespace qualifiers for out-of-line definitions.
+
+## Control Flow & Structure
+
+- Prefer early exits over deep nesting.
+- Do not use `else` after `return`, `continue`, `break`, or `goto`.
+- Encapsulate loops that compute predicates into helper functions.
+
+## Naming
+
+- LLDB's code style differs from LLVM's coding style.
+- Variables are `snake_case`.
+- Functions and methods are `UpperCamelCase`.
+- Static, global and member variables have `s_`, `g_` and `m_` prefixes respectively (see the sketch near the end of this document).
+
+## General Guidelines
+
+- Use `assert` liberally; prefer `llvm_unreachable` for unreachable states.
+- Do not use `using namespace std;` in headers.
+- Provide a virtual method anchor for classes defined in headers.
+- Do not use default labels in fully covered switches over enumerations.
+- Use range-based for loops wherever possible.
+- Capture `end()` outside loops if not using range-based iteration.
+- Including `<iostream>` is forbidden. Use LLVM’s `raw_ostream` instead.
+- Don’t use `inline` when defining a function in a class definition.
+
+## Microscopic Details
+
+- Preserve existing style in modified code.
+- Prefer pre-increment (`++i`) when value is unused.
+- Use `private`, `protected`, or `public` keyword as appropriate to restrict class member visibility.
+- Omit braces for single-statement `if`, `else`, `while`, `for` unless needed.
+
+## Review Style
+
+- Be specific and actionable in feedback.
+- Explain the "why" behind recommendations.
+- Link back to the LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html.
+- Ask clarifying questions when code intent is unclear.
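+
+## Naming Example
+
+A minimal sketch of the naming conventions above (hypothetical class and
+member names, not taken from the LLDB codebase):
+
+```cpp
+#include <string>
+
+static int g_debugger_count = 0; // Global variable: g_ prefix.
+
+class ExampleTarget {
+public:
+  // Methods are UpperCamelCase; locals and parameters are snake_case.
+  void SetTitle(const std::string &new_title) { m_title = new_title; }
+
+  bool HasTitle() const {
+    bool has_title = !m_title.empty(); // Local variable: snake_case.
+    return has_title;
+  }
+
+private:
+  static int s_target_count; // Static member: s_ prefix.
+  std::string m_title;       // Instance member: m_ prefix.
+};
+```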
+ +Ignore formatting and assume that's handled by external tools like `clang-format` and `black`. +Remember that these standards are **guidelines**. +Always prioritize consistency with the style that is already being used by the surrounding code. diff --git a/.github/copilot-instructions.md b/.github/instructions/llvm.instructions.md similarity index 90% rename from .github/copilot-instructions.md rename to .github/instructions/llvm.instructions.md index 03748938700e3..3f1308f51e676 100644 --- a/.github/copilot-instructions.md +++ b/.github/instructions/llvm.instructions.md @@ -1,3 +1,7 @@ +--- +applyTo: llvm/**/* +--- + When performing a code review, pay close attention to code modifying a function's control flow. Could the change result in the corruption of performance profile data? Could the change result in invalid debug information, in particular for diff --git a/.github/renovate.json b/.github/renovate.json index 6ce98c4e7b105..8e89ba8c4b32a 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -8,5 +8,12 @@ "minimumReleaseAge": "3 days", "assignees": ["boomanaiden154"], "ignorePaths": [".github/workflows/containers/**"], - "groupName": "[Github] Update GHA Dependencies" + "groupName": "[Github] Update GHA Dependencies", + "packageRules": [ + { + "matchPackageNames": ["windows", "macos"], + "matchManagers": ["github-actions"], + "enabled": false + } + ] } diff --git a/.github/workflows/bazel-checks.yml b/.github/workflows/bazel-checks.yml index 65d51649dd9e7..27092d9326aeb 100644 --- a/.github/workflows/bazel-checks.yml +++ b/.github/workflows/bazel-checks.yml @@ -30,3 +30,32 @@ jobs: - name: Run Buildifier run: | buildifier --mode=check $(find ./utils/bazel -name *BUILD*) + + bazel-build: + name: "Bazel Build/Test" + # Only run on US Central workers so we only have to keep one cache warm as + # the cache buckets are per cluster. + runs-on: + group: llvm-premerge-cluster-us-central + labels: llvm-premerge-linux-runners + if: github.repository == 'llvm/llvm-project' + steps: + - name: Fetch LLVM sources + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + # TODO(boomanaiden154): We should use a purpose built container for this. Move + # over when we have fixed the issues with using custom containers with Github + # ARC in GKE. + - name: Setup System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmpfr-dev libpfm4-dev m4 libedit-dev + sudo curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-amd64.deb > /tmp/bazelisk.deb + sudo apt-get install -y /tmp/bazelisk.deb + rm /tmp/bazelisk.deb + - name: Build/Test + working-directory: utils/bazel + run: | + bazelisk test --config=ci --sandbox_base="" \ + --remote_cache=https://storage.googleapis.com/$CACHE_GCS_BUCKET-bazel \ + --google_default_credentials \ + @llvm-project//... //... 
diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..531da2ccbd446 100644 --- a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -12,17 +12,33 @@ on: - '.github/workflows/containers/github-action-ci-tooling/**' - llvm/utils/git/requirements_formatting.txt - llvm/utils/git/requirements_linting.txt + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' pull_request: paths: - .github/workflows/build-ci-container-tooling.yml - '.github/workflows/containers/github-action-ci-tooling/**' - llvm/utils/git/requirements_formatting.txt - llvm/utils/git/requirements_linting.txt + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' jobs: build-ci-container-tooling: + name: Build Container ${{ matrix.container-name }} if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + strategy: + fail-fast: false + matrix: + include: + - container-name: format + test-command: 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black' + - container-name: lint + test-command: 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage' + - container-name: abi-tests + test-command: 'cd $HOME && abi-compliance-checker --help' + target: abi-tests steps: - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -32,48 +48,15 @@ jobs: llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_linting.txt clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py + .github/actions/build-container - - name: Write Variables - id: vars - run: | - tag=$(git rev-parse --short=12 HEAD) - container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/amd64/ci-ubuntu-24.04" - echo "container-name-format=$container_name-code-format" >> $GITHUB_OUTPUT - echo "container-name-lint=$container_name-code-lint" >> $GITHUB_OUTPUT - echo "container-name-format-tag=$container_name-format:$tag" >> $GITHUB_OUTPUT - echo "container-name-lint-tag=$container_name-lint:$tag" >> $GITHUB_OUTPUT - echo "container-format-filename=$(echo $container_name-format:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - echo "container-lint-filename=$(echo $container_name-lint:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT - - - name: Build container - run: | - podman build --target ci-container-code-format \ - -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \ - -t ${{ steps.vars.outputs.container-name-format-tag }} . - podman build --target ci-container-code-lint \ - -f .github/workflows/containers/github-action-ci-tooling/Dockerfile \ - -t ${{ steps.vars.outputs.container-name-lint-tag }} . - - # Save the container so we have it in case the push fails. This also - # allows us to separate the push step into a different job so we can - # maintain minimal permissions while building the container. 
- - name: Save container image - run: | - podman save ${{ steps.vars.outputs.container-name-format-tag }} > ${{ steps.vars.outputs.container-format-filename }} - podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - - - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + - name: Build Container + uses: ./.github/actions/build-container with: - name: container-amd64 - path: "*.tar" - retention-days: 14 - - - name: Test Container - run: | - # Use --pull=never to ensure we are testing the just built image. - podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-format-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-format --version | grep version && git-clang-format -h | grep usage && black --version | grep black' - podman run --pull=never --rm -it ${{ steps.vars.outputs.container-name-lint-tag }} /usr/bin/bash -x -c 'cd $HOME && clang-tidy --version | grep version && clang-tidy-diff.py -h | grep usage' + container-name: ci-ubuntu-24.04-${{ matrix.container-name }} + dockerfile: .github/workflows/containers/github-action-ci-tooling/Dockerfile + target: ci-container-${{ matrix.target || format('code-{0}', matrix.container-name) }} + test-command: ${{ matrix.test-command }} push-ci-container: if: github.event_name == 'push' @@ -82,34 +65,13 @@ jobs: permissions: packages: write runs-on: ubuntu-24.04 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 - - - name: Push Container - run: | - function push_container { - image_name=$1 - latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') - podman tag $image_name $latest_name - echo "Pushing $image_name ..." - podman push $image_name - echo "Pushing $latest_name ..." - podman push $latest_name - } - - podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io - for f in $(find . -iname '*.tar'); do - image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') - push_container $image_name + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/actions/push-container - if echo $image_name | grep '/amd64/'; then - # For amd64, create an alias with the arch component removed. - # This matches the convention used on dockerhub. 
- default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name)) - podman tag $image_name $default_image_name - push_container $default_image_name - fi - done + - uses: ./.github/actions/push-container + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 14c349b1b2fe5..3996948bb44e0 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -44,7 +44,7 @@ jobs: run: | docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload container image - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: container path: ${{ steps.vars.outputs.container-filename }} @@ -56,18 +56,22 @@ jobs: - build-ci-container-windows permissions: packages: write - runs-on: windows-2022 + runs-on: ubuntu-24.04 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: container - name: Push Container run: | - docker load -i ${{ needs.build-ci-container-windows.outputs.container-filename }} - docker tag ${{ needs.build-ci-container-windows.outputs.container-name-tag }} ${{ needs.build-ci-container-windows.outputs.container-name }}:latest - docker login -u ${{ github.actor }} -p $env:GITHUB_TOKEN ghcr.io - docker push ${{ needs.build-ci-container-windows.outputs.container-name-tag }} - docker push ${{ needs.build-ci-container-windows.outputs.container-name }}:latest + sudo apt-get update + sudo apt-get install -y skopeo + skopeo login -u ${{ github.actor }} -p ${{ secrets.GITHUB_TOKEN }} ghcr.io + skopeo copy docker-archive:${{ needs.build-ci-container-windows.outputs.container-filename }} \ + --dest-compress-format zstd \ + docker://${{ needs.build-ci-container-windows.outputs.container-name-tag }} + skopeo copy docker-archive:${{ needs.build-ci-container-windows.outputs.container-filename }} \ + --dest-compress-format zstd \ + docker://${{ needs.build-ci-container-windows.outputs.container-name }}:latest diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 027c558afdd0b..ddb803fb969ff 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -10,72 +10,46 @@ on: paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' pull_request: paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' + - '.github/actions/build-container/**' + - '.github/actions/push-container/**' jobs: build-ci-container: + name: Build Container ${{ matrix.container-name }} ${{ (contains(matrix.runs-on, 'arm') && 'ARM64') || 'X64' }} if: github.repository_owner == 'llvm' runs-on: ${{ matrix.runs-on }} strategy: matrix: - include: - # The arch names should match the names used on dockerhub. 
-          # See https://github.com/docker-library/official-images#architectures-other-than-amd64
-          - arch: amd64
-            runs-on: depot-ubuntu-24.04-16
-          - arch: arm64v8
-            runs-on: depot-ubuntu-24.04-arm-16
+        runs-on:
+          - depot-ubuntu-24.04-16
+          - depot-ubuntu-24.04-arm-16
+        container-name:
+          - ''
+          - agent
+        test-command:
+          - cd $HOME && printf '#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }' | clang++ -x c++ - && ./a.out | grep Hello
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          sparse-checkout: .github/workflows/containers/github-action-ci/
-      # podman is not installed by default on the ARM64 images.
-      - name: Install Podman
-        if: runner.arch == 'ARM64'
-        run: |
-          sudo apt-get install podman
-      - name: Write Variables
-        id: vars
-        run: |
-          tag=$(git rev-parse --short=12 HEAD)
-          container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/${{ matrix.arch }}/ci-ubuntu-24.04"
-          echo "container-name=$container_name" >> $GITHUB_OUTPUT
-          echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT
-          echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
-          echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT
-          echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-          echo "container-agent-filename=$(echo $container_name-agent:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-      - name: Build container
-        working-directory: ./.github/workflows/containers/github-action-ci/
-        run: |
-          podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} .
-          podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} .
+          sparse-checkout: |
+            .github/workflows/containers/github-action-ci/
+            .github/actions/build-container
 
-      # Save the container so we have it in case the push fails. This also
-      # allows us to separate the push step into a different job so we can
-      # maintain minimal permissions while building the container.
-      - name: Save container image
-        run: |
-          podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
-          podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }}
-
-      - name: Upload container image
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+      - name: Build Container
+        uses: ./.github/actions/build-container
         with:
-          name: container-${{ matrix.arch }}
-          path: "*.tar"
-          retention-days: 14
-
-      - name: Test Container
-        run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }}; do
-            # Use --pull=never to ensure we are testing the just built image.
- podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello' - done + container-name: ci-ubuntu-24.04${{ matrix.container-name && format('-{0}', matrix.container-name)}} + context: .github/workflows/containers/github-action-ci/ + dockerfile: .github/workflows/containers/github-action-ci/Dockerfile + target: ci-container${{ matrix.container-name && format('-{0}', matrix.container-name) }} + test-command: ${{ matrix.test-command }} push-ci-container: if: github.event_name == 'push' @@ -87,31 +61,12 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - name: Download container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 - - - name: Push Container - run: | - function push_container { - image_name=$1 - latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g') - podman tag $image_name $latest_name - echo "Pushing $image_name ..." - podman push $image_name - echo "Pushing $latest_name ..." - podman push $latest_name - } - - podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io - for f in $(find . -iname '*.tar'); do - image_name=$(podman load -q -i $f | sed 's/Loaded image: //g') - push_container $image_name + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/actions/push-container - if echo $image_name | grep '/amd64/'; then - # For amd64, create an alias with the arch component removed. - # This matches the convention used on dockerhub. - default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name)) - podman tag $image_name $default_image_name - push_container $default_image_name - fi - done + - uses: ./.github/actions/push-container + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml index 69b571575f40c..786c41214d853 100644 --- a/.github/workflows/build-metrics-container.yml +++ b/.github/workflows/build-metrics-container.yml @@ -49,7 +49,7 @@ jobs: run: | podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload Container Image - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: container path: ${{ steps.vars.outputs.container-filename }} @@ -66,7 +66,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Download Container - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: container - name: Push Container diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..7fecb010a64ff 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -26,9 +26,9 @@ jobs: with: sparse-checkout: .ci - name: Setup Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml index
49cf4100dd71c..59df0b68a8ad7 100644 --- a/.github/workflows/ci-post-commit-analyzer.yml +++ b/.github/workflows/ci-post-commit-analyzer.yml @@ -87,7 +87,7 @@ jobs: scan-build --generate-index-only build/analyzer-results - name: Upload Results - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: analyzer-results diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml index 734dc212fa648..7cdcfca532990 100644 --- a/.github/workflows/commit-access-review.yml +++ b/.github/workflows/commit-access-review.yml @@ -28,7 +28,7 @@ jobs: python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN - name: Upload Triage List - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: triagers path: triagers.log diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile index 8aaa2e88f2bab..b78c99efb9be3 100644 --- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile +++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile @@ -22,6 +22,7 @@ RUN apt-get update && \ FROM docker.io/library/ubuntu:24.04 AS base ENV LLVM_SYSROOT=/opt/llvm +ENV PATH=${LLVM_SYSROOT}/bin:${PATH} # Need nodejs for some of the GitHub actions. # Need git for git-clang-format. @@ -46,6 +47,28 @@ RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers # as root in 'ci-container-code-format' and 'ci-container-code-lint' containers +FROM base AS ci-container-build-tools +ARG LLVM_VERSION +ARG LLVM_VERSION_MAJOR + +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ + ${LLVM_SYSROOT}/bin/ +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ + ${LLVM_SYSROOT}/lib/clang/${LLVM_VERSION_MAJOR}/include +RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \ + ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++ + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + cmake \ + ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV CC=${LLVM_SYSROOT}/bin/clang +ENV CXX=${LLVM_SYSROOT}/bin/clang++ + + FROM base AS ci-container-code-format ARG LLVM_VERSION @@ -53,7 +76,6 @@ COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/cla /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/git-clang-format \ ${LLVM_SYSROOT}/bin/ -ENV PATH=${LLVM_SYSROOT}/bin:${PATH} # Install dependencies for 'pr-code-format.yml' job COPY llvm/utils/git/requirements_formatting.txt requirements_formatting.txt @@ -63,32 +85,38 @@ USER gha WORKDIR /home/gha -FROM base AS ci-container-code-lint +FROM ci-container-build-tools AS ci-container-code-lint ARG LLVM_VERSION ARG LLVM_VERSION_MAJOR COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-tidy \ - /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ ${LLVM_SYSROOT}/bin/ -COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ - ${LLVM_SYSROOT}/lib/clang/${LLVM_VERSION_MAJOR}/include COPY clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py ${LLVM_SYSROOT}/bin/clang-tidy-diff.py 
-RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \ - ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++ +# Install dependencies for 'pr-code-lint.yml' job +COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt +RUN pip install -r requirements_linting.txt --break-system-packages && \ + rm requirements_linting.txt +USER gha +WORKDIR /home/gha -ENV PATH=${LLVM_SYSROOT}/bin:${PATH} + +FROM ci-container-build-tools as ci-container-abi-tests RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y \ - cmake \ - ninja-build && \ + abi-compliance-checker \ + abi-dumper \ + autoconf \ + parallel \ + pkg-config && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Install dependencies for 'pr-code-lint.yml' job -COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt -RUN pip install -r requirements_linting.txt --break-system-packages && \ - rm requirements_linting.txt -USER gha -WORKDIR /home/gha +RUN git clone https://github.com/universal-ctags/ctags.git && \ + cd ctags && \ + ./autogen.sh && \ + ./configure && \ + sudo make install && \ + rm -Rf ../ctags + diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..3eb146d21dc40 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -60,7 +60,7 @@ jobs: fetch-depth: 2 - name: Get subprojects that have doc changes id: docs-changed-subprojects - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: skip_initial_fetch: true base_sha: 'HEAD~1' @@ -95,9 +95,9 @@ jobs: workflow: - '.github/workflows/docs.yml' - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies @@ -209,7 +209,7 @@ jobs: mkdir built-docs/flang cp -r flang-build/docs/* built-docs/flang/ - name: Upload docs - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: docs-output path: built-docs/ diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 981c6fa62cb19..ba625b2b3b062 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -39,7 +39,7 @@ jobs: [{"body" : "$COMMENT"}] EOF - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..4b9df6b668451 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: 
github/codeql-action/analyze@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..ce6ccfa23df6a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -54,7 +54,7 @@ jobs: path: golden-images - name: Setup Windows if: runner.os == 'Windows' - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 - name: Build DXC @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index 26cd60c070251..4f8fd7a48aff6 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -7,6 +7,7 @@ on: - "Check for private emails used in PRs" - "PR Request Release Note" - "Code lint" + - "CI Checks" types: - completed @@ -40,7 +41,7 @@ jobs: - name: 'Comment on PR' if: steps.download-artifact.outputs.artifact-id != '' - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 5ccf976848197..b92b61de05088 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -84,6 +84,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f strategy: matrix: name: @@ -100,18 +102,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -131,7 +122,7 @@ jobs: sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi done - name: Upload ABI file - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: ${{ matrix.name }} path: '*${{ matrix.ref }}.abi' @@ -139,25 +130,23 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f needs: - abi-dump-setup - abi-dump steps: - name: Download baseline - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: 
actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-latest path: build-latest - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-compliance-checker - name: Compare ABI run: | for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do @@ -165,7 +154,7 @@ jobs: done - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index 8fb8cec3b4f00..0d66f5d595e0e 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -34,7 +34,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: ${{ matrix.python-version }} - name: Setup ccache diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 6c8f2cb45ee0a..6b80d4291c0ee 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -60,7 +60,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -105,7 +105,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() # Upload artifacts even if the build or test suite fails with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -169,7 +169,7 @@ jobs: env: CC: clang-22 CXX: clang++-22 - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: ${{ matrix.config }}-results @@ -222,8 +222,8 @@ jobs: python3 -m venv .venv source .venv/bin/activate python -m pip install psutil - bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + xcrun bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() # Upload artifacts even if the build or test suite fails with: name: macos-${{ matrix.config }}-results diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: 
docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml index f73d180bb0005..f75dd9c3abd9e 100644 --- a/.github/workflows/llvm-abi-tests.yml +++ b/.github/workflows/llvm-abi-tests.yml @@ -10,13 +10,13 @@ on: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' pull_request: branches: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' concurrency: # Skip intermediate builds: always. @@ -72,6 +72,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b" strategy: matrix: name: @@ -87,19 +89,6 @@ jobs: ref: ${{ github.sha }} repo: ${{ github.repository }} steps: - - name: Install Ninja - uses: llvm/actions/install-ninja@main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -128,14 +117,14 @@ jobs: # Remove symbol versioning from dumps, so we can compare across major versions. 
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: ${{ matrix.name }} path: ${{ matrix.ref }}.abi - name: Upload symbol list file if: matrix.name == 'build-baseline' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: symbol-list path: llvm.symbols @@ -143,30 +132,28 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b needs: - abi-dump-setup - abi-dump steps: - name: Download baseline - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-baseline path: build-baseline - name: Download latest - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: build-latest path: build-latest - name: Download symbol list - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: name: symbol-list path: symbol-list - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-compliance-checker - name: Compare ABI run: | if [ -s symbol-list/llvm.symbols ]; then @@ -179,7 +166,7 @@ jobs: abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..96fc553abfe35 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -14,13 +14,13 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + - uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0 with: - node-version: 18 + node-version: 24 check-latest: true - run: npm install mailgun.js form-data - name: Send notification - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 env: MAILGUN_API_KEY: ${{ secrets.LLVM_BUGS_KEY }} with: @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0, + 15000) + + "Please see the issue for the entire body." 
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, @@ -46,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index ac0689b4d3243..dc253e4fbae98 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -27,7 +27,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: separator: "," skip_initial_fetch: true @@ -56,7 +56,7 @@ jobs: --end-rev HEAD \ --changed-files "$CHANGED_FILES" - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index 8ba9378703739..5444a29c22205 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -27,13 +27,13 @@ jobs: cancel-in-progress: true steps: - name: Fetch LLVM sources - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 2 - name: Get changed files id: changed-files - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 with: separator: "," skip_initial_fetch: true @@ -91,7 +91,7 @@ jobs: --changed-files "$CHANGED_FILES" - name: Upload results - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml index 8162a8984ee5f..c2dc2de65f133 100644 --- a/.github/workflows/pr-request-release-note.yml +++ b/.github/workflows/pr-request-release-note.yml @@ -41,7 +41,7 @@ jobs: request-release-note \ --pr-number ${{ github.event.pull_request.number}} - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 if: always() with: name: workflow-args diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..02a6f3b868d85 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -64,6 +64,9 @@ jobs: - name: Build and Test timeout-minutes: 120 continue-on-error: ${{ runner.arch == 'ARM64' }} + env: + GITHUB_TOKEN: ${{ 
github.token }} + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} run: | git config --global --add safe.directory '*' @@ -110,7 +113,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: Premerge Artifacts (Linux ${{ runner.arch }}) path: artifacts/ @@ -153,6 +156,9 @@ jobs: timeout-minutes: 180 if: ${{ steps.vars.outputs.windows-projects != '' }} shell: cmd + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} run: | call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 # See the comments above in the Linux job for why we define each of @@ -165,7 +171,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: Premerge Artifacts (Windows) path: artifacts/ @@ -190,7 +196,8 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + run: | + brew install ninja - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml index 8b24948b568eb..b658167d1db36 100644 --- a/.github/workflows/release-asset-audit.yml +++ b/.github/workflows/release-asset-audit.yml @@ -38,7 +38,7 @@ jobs: if: >- github.event_name != 'pull_request' && failure() - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} script: | diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index 0b52a08202f1a..eef49b5e3625d 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -90,7 +90,6 @@ jobs: runs-on: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 uses: ./.github/workflows/release-binaries.yml diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8145926265256..1ffa088e99fa1 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -21,7 +21,6 @@ on: options: - ubuntu-22.04 - ubuntu-22.04-arm - - macos-13 - macos-14 workflow_call: @@ -69,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -130,8 +129,6 @@ jobs: target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF" if [ "$RUNNER_ARCH" = "ARM64" ]; then arches=arm64 - else - arches=x86_64 fi target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches" fi @@ -141,20 +138,11 @@ jobs: target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF" fi - echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT case "${{ 
inputs.runs-on }}" in ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" test_runs_on=$build_runs_on ;; - macos-13) - if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then - build_runs_on="${{ inputs.runs-on }}" - else - build_runs_on="macos-13-large" - fi - test_runs_on="${{ inputs.runs-on }}" - ;; macos-14) if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then build_runs_on="${{ inputs.runs-on }}" @@ -168,6 +156,23 @@ jobs: build_runs_on=$test_runs_on ;; esac + + case "$build_runs_on" in + # These runners cannot build the full release package faster than + # the 6 hours timeout limit, so we need to use a configuration + # that builds more quickly. + macos-14) + bootstrap_prefix="BOOTSTRAP" + target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF" + ;; + *) + bootstrap_prefix="BOOTSTRAP_BOOTSTRAP" + ;; + esac + + target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename" + + echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT @@ -184,13 +189,7 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main - - - name: Setup Windows - if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main - with: - arch: amd64 + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Set Build Prefix id: setup-stage @@ -211,8 +210,7 @@ jobs: # so we need to set some extra cmake flags to disable this. cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ - -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" + -C clang/cmake/caches/Release.cmake - name: Build shell: bash @@ -221,7 +219,7 @@ jobs: release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'` mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} . 
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ runner.os }}-${{ runner.arch }}-release-binary # Due to path differences on Windows when running in bash vs running on node, @@ -259,14 +257,14 @@ jobs: sparse-checkout-cone-mode: false - name: 'Download artifact' - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 with: pattern: '*-release-binary' merge-multiple: true - name: Attest Build Provenance id: provenance - uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-path: ${{ needs.prepare.outputs.release-binary-filename }} @@ -275,7 +273,7 @@ jobs: mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl - name: Upload Build Provenance - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 4cf973d000a4b..c09ad57066711 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -41,7 +41,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' @@ -63,7 +63,7 @@ jobs: ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen - name: Create Release Notes Artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # 5.0.0 with: name: release-notes path: docs-build/html-export/ diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 79e509e5e6a8b..c31319e47833d 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml index 2278b96dbe242..4c47bd7575d99 100644 --- a/.github/workflows/release-sources.yml +++ b/.github/workflows/release-sources.yml @@ -92,14 +92,14 @@ jobs: - name: Attest Build Provenance if: github.event_name != 'pull_request' id: provenance - uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-path: "*.xz" - if: github.event_name != 'pull_request' run: | mv ${{ 
steps.provenance.outputs.bundle-path }} . - name: Create Tarball Artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: path: | *.xz diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..05a6d98a81bad 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif @@ -49,7 +49,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: SARIF file path: results.sarif @@ -57,6 +57,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2.28.1 + uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v4.31.2 with: sarif_file: results.sarif diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml new file mode 100644 index 0000000000000..a9c0912b0f44e --- /dev/null +++ b/.github/workflows/test-unprivileged-download-artifact.yml @@ -0,0 +1,54 @@ +name: Test Unprivileged Download Artifact Action + +permissions: + contents: read + +on: + push: + branches: + - main + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + pull_request: + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + +jobs: + upload-test-artifact: + name: Upload Test Artifact + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + steps: + - name: Create Test File + run: | + echo "test" > comment + - name: Upload Test File + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: workflow-args + path: | + comment + + test-download: + name: Test Unprivileged Download Artifact + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + needs: [ upload-test-artifact ] + steps: + - name: Checkout LLVM + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + .github/workflows/unprivileged-download-artifact/action.yml + - name: Download Artifact + uses: ./.github/workflows/unprivileged-download-artifact + id: download-artifact + with: + run-id: ${{ github.run_id }} + artifact-name: workflow-args + - name: Assert That Contents are the Same + run: | + cat comment + [[ "$(cat comment)" == "test" ]] diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml index 5b50d7ce3d3fb..72815b26bcf41 100644 --- a/.github/workflows/unprivileged-download-artifact/action.yml +++ b/.github/workflows/unprivileged-download-artifact/action.yml @@ -27,7 +27,7 @@ outputs: runs: using: "composite" steps: - - uses:
actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 + - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 id: artifact-url with: script: | diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 5cbc28fb38a33..2af1d330b7545 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -354,9 +354,6 @@ class BinaryContext { /// Newly created segments. std::vector<SegmentInfo> NewSegments; - /// Symbols that are expected to be undefined in MCContext during emission. - std::unordered_set<const MCSymbol *> UndefinedSymbols; - /// [name] -> [BinaryData*] map used for global symbol resolution. using SymbolMapType = StringMap<BinaryData *>; SymbolMapType GlobalSymbols; @@ -500,7 +497,7 @@ class BinaryContext { /// /// As we fold identical functions, multiple symbols can point /// to the same BinaryFunction. - std::unordered_map<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap; + DenseMap<const MCSymbol *, BinaryFunction *> SymbolToFunctionMap; /// A mutex that is used to control parallel accesses to SymbolToFunctionMap mutable llvm::sys::RWMutex SymbolToFunctionMapMutex; @@ -932,6 +929,16 @@ class BinaryContext { std::pair<MCSymbol *, uint64_t> handleAddressRef(uint64_t Address, BinaryFunction &BF, bool IsPCRel); + /// When \p Address inside function \p BF is a target of a control transfer + /// instruction (branch) from another function, return a corresponding symbol + /// that should be used by the branch. For example, main or secondary entry + /// point. + /// + /// If \p Address is an invalid destination, such as a constant island, return + /// nullptr and mark \p BF as ignored, since we cannot properly handle a + /// branch to a constant island. + MCSymbol *handleExternalBranchTarget(uint64_t Address, BinaryFunction &BF); + /// Analyze memory contents at the given \p Address and return the type of /// memory contents (such as a possible jump table). MemoryContentsType analyzeMemoryAt(uint64_t Address, BinaryFunction &BF); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index b215a1558cbb4..a720c6af216d7 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -281,6 +281,14 @@ class BinaryFunction { /// goto labels. std::set<uint64_t> ExternallyReferencedOffsets; + /// Relocations from data sections targeting internals of this function, i.e.
+ /// some code not at an entry point. These include, but are not limited to, + /// jump table relocations and computed goto tables. + /// + /// Since relocations can be removed/deallocated, we store relocation offsets + /// instead of pointers. + DenseSet<uint64_t> InternalRefDataRelocations; + /// Offsets of indirect branches with unknown destinations. std::set<uint64_t> UnknownIndirectBranchOffsets; @@ -640,6 +648,20 @@ class BinaryFunction { Islands->CodeOffsets.emplace(Offset); } + /// Register a relocation from data section referencing code at a non-zero + /// offset in this function. + void registerInternalRefDataRelocation(uint64_t FuncOffset, + uint64_t RelOffset) { + assert(FuncOffset != 0 && "Relocation should reference function internals"); + registerReferencedOffset(FuncOffset); + InternalRefDataRelocations.insert(RelOffset); + const MCSymbol *ReferencedSymbol = + getOrCreateLocalLabel(getAddress() + FuncOffset); + + // Track the symbol mapping since it's used in relocation handling. + BC.setSymbolToFunctionMap(ReferencedSymbol, this); + } + /// Register an internal offset in a function referenced from outside. void registerReferencedOffset(uint64_t Offset) { ExternallyReferencedOffsets.emplace(Offset); @@ -1299,6 +1321,12 @@ class BinaryFunction { void addRelocation(uint64_t Address, MCSymbol *Symbol, uint32_t RelType, uint64_t Addend, uint64_t Value); + /// Return locations (offsets) of data section relocations targeting internals + /// of this function. + const DenseSet<uint64_t> &getInternalRefDataRelocations() const { + return InternalRefDataRelocations; + } + /// Return the name of the section this function originated from. std::optional<StringRef> getOriginSectionName() const { if (!OriginSection) diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 7c8ea12ee3ee3..faf7bb62c6bee 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -471,6 +471,12 @@ class DebugStrOffsetsWriter { return std::move(StrOffsetsBuffer); } + /// Returns strings of .debug_str_offsets. + StringRef getBufferStr() { + return StringRef(reinterpret_cast<const char *>(StrOffsetsBuffer->data()), + StrOffsetsBuffer->size()); + } + /// Initializes Buffer and Stream. void initialize(DWARFUnit &Unit); @@ -507,6 +513,12 @@ class DebugStrWriter { return std::move(StrBuffer); } + /// Returns strings of .debug_str. + StringRef getBufferStr() { + return StringRef(reinterpret_cast<const char *>(StrBuffer->data()), + StrBuffer->size()); + } + /// Adds string to .debug_str. /// On first invocation it initializes internal data structures. uint32_t addString(StringRef Str); diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..69ae4fb8ddcc9 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -784,6 +784,11 @@ class MCPlusBuilder { virtual bool isPop(const MCInst &Inst) const { return false; } + /// Determine if a basic block looks like an epilogue. For now it is only + /// called at the final stage of building CFG to check basic block ending + /// with an indirect call that has unknown control flow attribute. + virtual bool isEpilogue(const BinaryBasicBlock &BB) const { return false; } + /// Return true if the instruction is used to terminate an indirect branch.
virtual bool isTerminateBranch(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } @@ -840,6 +845,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } @@ -1361,20 +1376,13 @@ class MCPlusBuilder { /// Return true if \p Inst has RestoreState annotation. bool hasRestoreState(const MCInst &Inst) const; - /// Stores RA Signed annotation on \p Inst. - void setRASigned(MCInst &Inst) const; + /// Sets kRASigned or kRAUnsigned annotation on \p Inst. + /// Fails if \p Inst has either annotation already set. + void setRAState(MCInst &Inst, bool State) const; - /// Return true if \p Inst has Signed RA annotation. - bool isRASigned(const MCInst &Inst) const; - - /// Stores RA Unsigned annotation on \p Inst. - void setRAUnsigned(MCInst &Inst) const; - - /// Return true if \p Inst has Unsigned RA annotation. - bool isRAUnsigned(const MCInst &Inst) const; - - /// Return true if \p Inst doesn't have any annotation related to RA state. - bool isRAStateUnknown(const MCInst &Inst) const; + /// Return true if \p Inst has kRASigned annotation, false if it has + /// kRAUnsigned annotation, and std::nullopt if neither annotation is set. + std::optional<bool> getRAState(const MCInst &Inst) const; /// Return true if the instruction is a call with an exception handling info. virtual bool isInvoke(const MCInst &Inst) const { @@ -1789,6 +1797,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations.
Such problems are usually connected with errata 843419 -https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to the small offset +// range of the ADR and LDR instructions (+- 1MB), which could easily be +// overflowed after BOLT optimizations. Such problems are usually connected +// with errata 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt<bool> &PrintPass) + explicit AArch64RelaxationPass(const cl::opt<bool> &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index 41e2bd1651efd..b393c85321b7d 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -29,6 +29,10 @@ struct CallSiteInfo { uint32_t EntryDiscriminator{0}; /// multiple entry discriminator uint64_t Count{0}; uint64_t Mispreds{0}; + // Pseudo probe information, optional + uint32_t Probe{0}; + bool Indirect = false; + uint32_t InlineTreeNode{0}; bool operator==(const CallSiteInfo &Other) const { return Offset == Other.Offset && DestId == Other.DestId && @@ -63,6 +67,9 @@ template <> struct MappingTraits { YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0); YamlIO.mapRequired("cnt", CSI.Count); YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0); + YamlIO.mapOptional("pp", CSI.Probe, 0); + YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0); + YamlIO.mapOptional("ind", CSI.Indirect, false); } static const bool flow = true; @@ -95,29 +102,20 @@ template <> struct MappingTraits { namespace bolt { struct PseudoProbeInfo { - uint32_t InlineTreeIndex = 0; - uint64_t BlockMask = 0; // bitset with probe indices from 1 to 64 - std::vector BlockProbes; // block probes with indices above 64 - std::vector CallProbes; - std::vector IndCallProbes; + std::vector BlockProbes; std::vector InlineTreeNodes; bool operator==(const PseudoProbeInfo &Other) const { - return InlineTreeIndex == Other.InlineTreeIndex && - BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes && - IndCallProbes == Other.IndCallProbes; + return InlineTreeNodes == Other.InlineTreeNodes && + BlockProbes == Other.BlockProbes; } }; } // end namespace bolt template <> struct MappingTraits { static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) { - YamlIO.mapOptional("blx", PI.BlockMask, 0); - YamlIO.mapOptional("blk", PI.BlockProbes, std::vector()); - YamlIO.mapOptional("call", PI.CallProbes, std::vector()); - YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector()); - YamlIO.mapOptional("id", PI.InlineTreeIndex, 0); - YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector()); + YamlIO.mapOptional("blk", PI.BlockProbes, std::vector(1, 1)); + YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector(1, 0)); } static const bool flow = true; diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h index d4d7217464cc8..50ee78d342df8 100644 --- a/bolt/include/bolt/Profile/YAMLProfileWriter.h +++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h @@ -74,25 +74,24 @@ class YAMLProfileWriter { collectInlineTree(const MCPseudoProbeDecoder &Decoder, const MCDecodedPseudoProbeInlineTree &Root); - // 0 - block probe, 1 - indirect call, 2 - direct call - using ProbeList = std::array, 3>; - using NodeIdToProbes = DenseMap; - static std::vector - convertNodeProbes(NodeIdToProbes &NodeProbes); - public: - template - static std::vector - writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) { - NodeIdToProbes NodeProbes; - for (const MCDecodedPseudoProbe &Probe : Probes) { - auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode()); - if (It == InlineTreeNodeId.end()) - continue; - NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex()); - } - return convertNodeProbes(NodeProbes); - } + class BlockProbeCtx { + struct Call { + uint64_t Id; + uint32_t Node; + bool Indirect; + bool Used; + }; + // Group block probes by node id. 
+ DenseMap> NodeToProbes; + // Offset -> call probe + DenseMap CallProbes; + + public: + void addBlockProbe(const InlineTreeMapTy &Map, + const MCDecodedPseudoProbe &Probe, uint32_t ProbeOffset); + void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB); + }; }; } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Rewrite/MetadataRewriters.h b/bolt/include/bolt/Rewrite/MetadataRewriters.h index 2c09c879b9128..6b74b0e776997 100644 --- a/bolt/include/bolt/Rewrite/MetadataRewriters.h +++ b/bolt/include/bolt/Rewrite/MetadataRewriters.h @@ -19,12 +19,14 @@ class BinaryContext; // The list of rewriter build functions. -std::unique_ptr<MetadataRewriter> createLinuxKernelRewriter(BinaryContext &); - std::unique_ptr<MetadataRewriter> createBuildIDRewriter(BinaryContext &); +std::unique_ptr<MetadataRewriter> createLinuxKernelRewriter(BinaryContext &); + std::unique_ptr<MetadataRewriter> createPseudoProbeRewriter(BinaryContext &); +std::unique_ptr<MetadataRewriter> createRSeqRewriter(BinaryContext &); + std::unique_ptr<MetadataRewriter> createSDTRewriter(BinaryContext &); std::unique_ptr<MetadataRewriter> createGNUPropertyRewriter(BinaryContext &); diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index d680850bf2ea9..a6d0ca9481154 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -22,8 +22,6 @@ namespace llvm { namespace bolt { -constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET; - bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { return LHS.Index < RHS.Index; } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index c33540ada8a05..6a285f5538dbd 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -78,6 +78,11 @@ cl::opt<std::string> CompDirOverride( "to *.dwo files."), cl::Hidden, cl::init(""), cl::cat(BoltCategory)); +static cl::opt<bool> CloneConstantIsland("clone-constant-island", + cl::desc("clone constant islands"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore, cl::cat(BoltCategory)); + static cl::opt<bool> FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false), cl::desc("treat invalid code padding as error"), cl::cat(BoltCategory)); @@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, // of dynamic relocs, as we currently do not support cloning them. // Notice: we might fail to link because of this, if the original constant // island we are referring would be emitted too far away.
-    if (IslandIter->second->hasDynamicRelocationAtIsland()) {
+    if (IslandIter->second->hasDynamicRelocationAtIsland() ||
+        !opts::CloneConstantIsland) {
       MCSymbol *IslandSym =
           IslandIter->second->getOrCreateIslandAccess(Address);
       if (IslandSym)
@@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
     } else if (MCSymbol *IslandSym =
                    IslandIter->second->getOrCreateProxyIslandAccess(Address,
                                                                     BF)) {
+      LLVM_DEBUG(
+          dbgs() << "BOLT-DEBUG: clone constant island at address 0x"
+                 << Twine::utohexstr(IslandIter->first) << " with size of 0x"
+                 << Twine::utohexstr(
+                        IslandIter->second->estimateConstantIslandSize())
+                 << " bytes, referenced by " << BF << "\n");
       BF.createIslandDependency(IslandSym, IslandIter->second);
       return std::make_pair(IslandSym, 0);
     }
@@ -518,6 +530,23 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   return std::make_pair(TargetSymbol, 0);
 }

+MCSymbol *BinaryContext::handleExternalBranchTarget(uint64_t Address,
+                                                    BinaryFunction &BF) {
+  if (BF.isInConstantIsland(Address)) {
+    BF.setIgnored();
+    this->outs() << "BOLT-WARNING: ignoring entry point at address 0x"
+                 << Twine::utohexstr(Address)
+                 << " in constant island of function " << BF << '\n';
+    return nullptr;
+  }
+
+  const uint64_t Offset = Address - BF.getAddress();
+  assert(Offset < BF.getSize() &&
+         "Address should be inside the referenced function");
+
+  return Offset ? BF.addEntryPointAtOffset(Offset) : BF.getSymbol();
+}
+
 MemoryContentsType BinaryContext::analyzeMemoryAt(uint64_t Address,
                                                   BinaryFunction &BF) {
   if (!isX86())
@@ -761,13 +790,17 @@ void BinaryContext::populateJumpTables() {
   }

   if (opts::StrictMode && DataPCRelocations.size()) {
-    LLVM_DEBUG({
-      dbgs() << DataPCRelocations.size()
-             << " unclaimed PC-relative relocations left in data:\n";
-      for (uint64_t Reloc : DataPCRelocations)
-        dbgs() << Twine::utohexstr(Reloc) << '\n';
-    });
-    assert(0 && "unclaimed PC-relative relocations left in data\n");
+    this->errs() << "BOLT-ERROR: " << DataPCRelocations.size()
+                 << " unclaimed PC-relative relocation(s) left in data";
+    if (opts::Verbosity) {
+      this->errs() << ":\n";
+      for (uint64_t RelocOffset : DataPCRelocations)
+        this->errs() << "  @0x" << Twine::utohexstr(RelocOffset) << '\n';
+    } else {
+      this->errs() << ". Re-run with -v=1 to see the list\n";
+    }
+    this->errs() << "BOLT-ERROR: unable to proceed with --strict\n";
+    exit(1);
   }
   clearList(DataPCRelocations);
 }
@@ -977,14 +1010,12 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) {
     return Offset - StartOffset;
   };

-  // Skip a sequence of zero bytes. For AArch64 we only skip 4 bytes of zeros
-  // in case the following zeros belong to constant island or veneer.
+  // Skip a sequence of zero bytes. For AArch64, only skip zeros in exact
+  // multiples of four bytes in case the following zeros belong to a veneer.
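  // (Illustrative example of the rule above, with assumed counts: 12 trailing
  // zero bytes are skipped in full, while for 13 only 12 are skipped so that
  // a following 4-byte-aligned veneer is not consumed.)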
auto skipZeros = [&]() { const uint64_t StartOffset = Offset; uint64_t CurrentOffset = Offset; - for (; CurrentOffset < BF.getMaxSize() && - (!isAArch64() || CurrentOffset < StartOffset + 4); - ++CurrentOffset) + for (; CurrentOffset < BF.getMaxSize(); ++CurrentOffset) if ((*FunctionData)[CurrentOffset] != 0) break; @@ -1399,17 +1430,10 @@ void BinaryContext::processInterproceduralReferences() { << Function.getPrintName() << " and " << TargetFunction->getPrintName() << '\n'; } - if (uint64_t Offset = Address - TargetFunction->getAddress()) { - if (!TargetFunction->isInConstantIsland(Address)) { - TargetFunction->addEntryPointAtOffset(Offset); - } else { - TargetFunction->setIgnored(); - this->outs() << "BOLT-WARNING: Ignoring entry point at address 0x" - << Twine::utohexstr(Address) - << " in constant island of function " << *TargetFunction - << '\n'; - } - } + + // Create an extra entry point if needed. Can also render the target + // function ignored if the reference is invalid. + handleExternalBranchTarget(Address, *TargetFunction); continue; } @@ -1496,6 +1520,17 @@ void BinaryContext::foldFunction(BinaryFunction &ChildBF, } ChildBF.getSymbols().clear(); + // Reset function mapping for local symbols. + for (uint64_t RelOffset : ChildBF.getInternalRefDataRelocations()) { + const Relocation *Rel = getRelocationAt(RelOffset); + if (!Rel || !Rel->Symbol) + continue; + + WriteSymbolMapLock.lock(); + SymbolToFunctionMap[Rel->Symbol] = nullptr; + WriteSymbolMapLock.unlock(); + } + // Move other names the child function is known under. llvm::move(ChildBF.Aliases, std::back_inserter(ParentBF.Aliases)); ChildBF.Aliases.clear(); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 84023efe1084e..a0d8385aa3824 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1697,11 +1697,12 @@ bool BinaryFunction::scanExternalRefs() { if (!TargetFunction || ignoreFunctionRef(*TargetFunction)) continue; - const uint64_t FunctionOffset = - TargetAddress - TargetFunction->getAddress(); + // Get a reference symbol for the function when address is a valid code + // reference. BranchTargetSymbol = - FunctionOffset ? TargetFunction->addEntryPointAtOffset(FunctionOffset) - : TargetFunction->getSymbol(); + BC.handleExternalBranchTarget(TargetAddress, *TargetFunction); + if (!BranchTargetSymbol) + continue; } // Can't find more references. Not creating relocations since we are not @@ -1895,16 +1896,6 @@ bool BinaryFunction::scanExternalRefs() { } } - // Inform BinaryContext that this function symbols will not be defined and - // relocations should not be created against them. - if (BC.HasRelocations) { - for (std::pair &LI : Labels) - BC.UndefinedSymbols.insert(LI.second); - for (MCSymbol *const EndLabel : FunctionEndLabels) - if (EndLabel) - BC.UndefinedSymbols.insert(EndLabel); - } - clearList(Relocations); clearList(ExternallyReferencedOffsets); @@ -2176,13 +2167,10 @@ bool BinaryFunction::postProcessIndirectBranches( continue; } - // If this block contains an epilogue code and has an indirect branch, - // then most likely it's a tail call. Otherwise, we cannot tell for sure - // what it is and conservatively reject the function's CFG. - bool IsEpilogue = llvm::any_of(BB, [&](const MCInst &Instr) { - return BC.MIB->isLeave(Instr) || BC.MIB->isPop(Instr); - }); - if (IsEpilogue) { + // If this block contains epilogue code and has an indirect branch, + // then most likely it's a tail call. 
Otherwise, we cannot tell for + // sure what it is and conservatively reject the function's CFG. + if (BC.MIB->isEpilogue(BB)) { BC.MIB->convertJmpToTailCall(Instr); BB.removeAllSuccessors(); continue; @@ -3233,14 +3221,6 @@ void BinaryFunction::clearDisasmState() { clearList(Instructions); clearList(IgnoredBranches); clearList(TakenBranches); - - if (BC.HasRelocations) { - for (std::pair &LI : Labels) - BC.UndefinedSymbols.insert(LI.second); - for (MCSymbol *const EndLabel : FunctionEndLabels) - if (EndLabel) - BC.UndefinedSymbols.insert(EndLabel); - } } void BinaryFunction::setTrapOnEntry() { diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp index 6f07017c26060..e803d17021f8b 100644 --- a/bolt/lib/Core/BinarySection.cpp +++ b/bolt/lib/Core/BinarySection.cpp @@ -112,8 +112,10 @@ void BinarySection::emitAsData(MCStreamer &Streamer, RI = ROE; // Skip undefined symbols. - auto HasUndefSym = [this](const auto &Relocation) { - return BC.UndefinedSymbols.count(Relocation.Symbol); + auto HasUndefSym = [](const auto &Relocation) { + return Relocation.Symbol && Relocation.Symbol->isTemporary() && + Relocation.Symbol->isUndefined() && + !Relocation.Symbol->isRegistered(); }; if (std::any_of(ROI, ROE, HasUndefSym)) diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp index 6be2c5aa4e6c1..5272d402be7f3 100644 --- a/bolt/lib/Core/DebugNames.cpp +++ b/bolt/lib/Core/DebugNames.cpp @@ -555,7 +555,7 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() { void DWARF5AcceleratorTable::writeEntry(BOLTDWARF5AccelTableData &Entry) { const uint64_t EntryID = getEntryID(Entry); - if (EntryRelativeOffsets.find(EntryID) != EntryRelativeOffsets.end()) + if (EntryRelativeOffsets.contains(EntryID)) EntryRelativeOffsets[EntryID] = EntriesBuffer->size(); const std::optional EntryRet = diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp index 1d9818777596e..64a6d12b76e82 100644 --- a/bolt/lib/Core/DynoStats.cpp +++ b/bolt/lib/Core/DynoStats.cpp @@ -51,8 +51,6 @@ PrintDynoOpcodeStat("print-dyno-opcode-stats", namespace llvm { namespace bolt { -constexpr const char *DynoStats::Desc[]; - bool DynoStats::operator<(const DynoStats &Other) const { return std::lexicographical_compare( &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT], diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index e96de80bfa701..0cb4ba1ebfbd7 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -186,26 +186,21 @@ bool MCPlusBuilder::hasRestoreState(const MCInst &Inst) const { return hasAnnotation(Inst, MCAnnotation::kRestoreState); } -void MCPlusBuilder::setRASigned(MCInst &Inst) const { +void MCPlusBuilder::setRAState(MCInst &Inst, bool State) const { assert(!hasAnnotation(Inst, MCAnnotation::kRASigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); -} - -bool MCPlusBuilder::isRASigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRASigned); -} - -void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kRAUnsigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); + if (State) + setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); + else + setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); } -bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRAUnsigned); -} - -bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const { - return !(isRAUnsigned(Inst) || 
isRASigned(Inst));
+std::optional<bool> MCPlusBuilder::getRAState(const MCInst &Inst) const {
+  if (hasAnnotation(Inst, MCAnnotation::kRASigned))
+    return true;
+  if (hasAnnotation(Inst, MCAnnotation::kRAUnsigned))
+    return false;
+  return std::nullopt;
 }

 std::optional<MCPlus::MCLandingPad> MCPlusBuilder::getEHInfo(const MCInst &Inst) const {
diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index 4b827b647b06c..f872db2cae0ce 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -1018,41 +1018,15 @@ void Relocation::print(raw_ostream &OS) const {
   default:
     OS << "RType:" << Twine::utohexstr(Type);
     break;
-
-  case Triple::aarch64: {
-    static const char *const AArch64RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(AArch64RelocNames).size());
-    OS << AArch64RelocNames[Type];
-  } break;
-
+  case Triple::aarch64:
+    OS << object::getELFRelocationTypeName(ELF::EM_AARCH64, Type);
+    break;
   case Triple::riscv64:
-    // RISC-V relocations are not sequentially numbered so we cannot use an
-    // array
-    switch (Type) {
-    default:
-      llvm_unreachable("illegal RISC-V relocation");
-#define ELF_RELOC(name, value)                                                 \
-  case value:                                                                  \
-    OS << #name;                                                               \
+    OS << object::getELFRelocationTypeName(ELF::EM_RISCV, Type);
     break;
-#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
-#undef ELF_RELOC
-    }
+  case Triple::x86_64:
+    OS << object::getELFRelocationTypeName(ELF::EM_X86_64, Type);
     break;
-
-  case Triple::x86_64: {
-    static const char *const X86RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(X86RelocNames).size());
-    OS << X86RelocNames[Type];
-  } break;
   }
   OS << ", 0x" << Twine::utohexstr(Offset);
   if (Symbol) {
diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp
similarity index 67%
rename from bolt/lib/Passes/ADRRelaxationPass.cpp
rename to bolt/lib/Passes/AArch64RelaxationPass.cpp
index c3954c94a7f92..610adad58cfcb 100644
--- a/bolt/lib/Passes/ADRRelaxationPass.cpp
+++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp
@@ -1,4 +1,4 @@
-//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===//
+//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the ADRRelaxationPass class.
+// This file implements the AArch64RelaxationPass class.
 //
 //===----------------------------------------------------------------------===//

-#include "bolt/Passes/ADRRelaxationPass.h"
+#include "bolt/Passes/AArch64RelaxationPass.h"
 #include "bolt/Core/ParallelUtilities.h"
 #include "bolt/Utils/CommandLineOpts.h"
 #include <iterator>
@@ -20,10 +20,10 @@ using namespace llvm;

 namespace opts {
 extern cl::OptionCategory BoltCategory;

-static cl::opt<bool>
-    AdrPassOpt("adr-relaxation",
-               cl::desc("Replace ARM non-local ADR instructions with ADRP"),
-               cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden);
+static cl::opt<bool> AArch64PassOpt(
+    "aarch64-relaxation",
+    cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"),
+    cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden);
 } // namespace opts

 namespace llvm {
@@ -35,7 +35,7 @@ namespace bolt {
 // jobs and checking the exit flag after it.
 static bool PassFailed = false;

-void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
+void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) {
   if (PassFailed)
     return;

@@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
   for (BinaryBasicBlock &BB : BF) {
     for (auto It = BB.begin(); It != BB.end(); ++It) {
       MCInst &Inst = *It;
-      if (!BC.MIB->isADR(Inst))
+      bool IsADR = BC.MIB->isADR(Inst);
+
+      // TODO: Handle other types of LDR (literal, PC-relative) instructions.
+      if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst))
         continue;

-      const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst);
+      const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1);
       if (!Symbol)
         continue;

@@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
         continue;
       }

-      // Don't relax ADR if it points to the same function and is in the main
-      // fragment and BF initial size is < 1MB.
+      // Don't relax ADR/LDR if it points to the same function and is in the
+      // main fragment and BF initial size is < 1MB.
       const unsigned OneMB = 0x100000;
       if (BF.getSize() < OneMB) {
         BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol);
         if (TargetBF == &BF && !BB.isSplit())
           continue;

-        // No relaxation needed if ADR references a basic block in the same
+        // No relaxation needed if ADR/LDR references a basic block in the same
         // fragment.
         if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol))
           if (BB.getFragmentNum() == TargetBB->getFragmentNum())
             continue;
       }

-      InstructionListType AdrpAdd;
+      InstructionListType AdrpMaterialization;
       {
         auto L = BC.scopeLock();
-        AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get());
+        AdrpMaterialization =
+            IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get())
+                  : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get());
       }

       if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) {
@@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) {
         // invalidate this offset, so we have to rely on linker-inserted NOP to
         // replace it with ADRP, and abort if it is not present.
         auto L = BC.scopeLock();
-        BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function "
-                  << BF << '\n';
+        BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? "ADR" : "LDR")
+                  << " in non-simple function " << BF << '\n';
         PassFailed = true;
         return;
       }

-      It = BB.replaceInstruction(It, AdrpAdd);
+      It = BB.replaceInstruction(It, AdrpMaterialization);
     }
   }
 }

-Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
-  if (!opts::AdrPassOpt || !BC.HasRelocations)
+Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) {
+  if (!opts::AArch64PassOpt || !BC.HasRelocations)
     return Error::success();

   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
@@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) {
   ParallelUtilities::runOnEachFunction(
       BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr,
-      "ADRRelaxationPass");
+      "AArch64RelaxationPass");

   if (PassFailed)
     return createFatalBOLTError("");
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index d7519518f186f..3197e62faad21 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_llvm_library(LLVMBOLTPasses
-  ADRRelaxationPass.cpp
+  AArch64RelaxationPass.cpp
   Aligner.cpp
   AllocCombiner.cpp
   AsmDump.cpp
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 33664e1160a7b..775b7795e77c5 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -21,7 +21,12 @@ using namespace llvm;
 namespace llvm {
 namespace bolt {

+static bool PassFailed = false;
+
 void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
+  if (PassFailed)
+    return;
+
   BinaryContext &BC = BF.getBinaryContext();

   if (BF.getState() == BinaryFunction::State::Empty)
@@ -39,7 +44,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
     coverFunctionFragmentStart(BF, FF);
     bool FirstIter = true;
-    MCInst PrevInst;
+    bool PrevRAState = false;
     // As this pass runs after function splitting, we should only check
     // consecutive instructions inside FunctionFragments.
     for (BinaryBasicBlock *BB : FF) {
@@ -47,18 +52,23 @@
         MCInst &Inst = *It;
         if (BC.MIB->isCFI(Inst))
           continue;
+        auto RAState = BC.MIB->getRAState(Inst);
+        if (!RAState) {
+          BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates"
+                    << " in function " << BF.getPrintName() << "\n";
+          PassFailed = true;
+          return;
+        }
         if (!FirstIter) {
           // Consecutive instructions with different RAStates mean we need to
           // add an OpNegateRAState.
-          if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
-              (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
+          if (*RAState != PrevRAState)
             It = BF.addCFIInstruction(
                 BB, It, MCCFIInstruction::createNegateRAState(nullptr));
-          }
         } else {
           FirstIter = false;
         }
-        PrevInst = *It;
+        PrevRAState = *RAState;
       }
     }
   }
@@ -81,10 +91,17 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
   });
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
-  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
-    BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
-                         MCCFIInstruction::createNegateRAState(nullptr));
+  auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+  auto RAState = BC.MIB->getRAState(*II);
+  if (!RAState) {
+    BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates"
+              << " in function " << BF.getPrintName() << "\n";
+    PassFailed = true;
+    return;
   }
+  if (*RAState)
+    BF.addCFIInstruction(*FirstNonEmpty, II,
+                         MCCFIInstruction::createNegateRAState(nullptr));
 }

 void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
@@ -96,15 +113,21 @@
     if (BC.MIB->isCFI(Inst))
       continue;

-    if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) {
-      if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isPSignOnLR(PrevInst)) {
-        BC.MIB->setRASigned(Inst);
-      } else if (BC.MIB->isRAUnsigned(PrevInst) ||
-                 BC.MIB->isPAuthOnLR(PrevInst)) {
-        BC.MIB->setRAUnsigned(Inst);
+    auto RAState = BC.MIB->getRAState(Inst);
+    if (!FirstIter && !RAState) {
+      if (BC.MIB->isPSignOnLR(PrevInst))
+        RAState = true;
+      else if (BC.MIB->isPAuthOnLR(PrevInst))
+        RAState = false;
+      else {
+        auto PrevRAState = BC.MIB->getRAState(PrevInst);
+        RAState = PrevRAState ? *PrevRAState : false;
       }
+      BC.MIB->setRAState(Inst, *RAState);
     } else {
       FirstIter = false;
+      if (!RAState)
+        BC.MIB->setRAState(Inst, BF.getInitialRAState());
     }
     PrevInst = Inst;
   }
@@ -135,6 +158,8 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
            << " functions "
            << format("(%.2lf%%).\n",
                      (100.0 * FunctionsModified) / BC.getBinaryFunctions().size());
+  if (PassFailed)
+    return createFatalBOLTError("");
   return Error::success();
 }
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index b262d66732b7d..51075be0e1ac2 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -72,9 +72,6 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
         BF.setIgnored();
         return false;
       }
-      // The signing instruction itself is unsigned, the next will be
-      // signed.
-      BC.MIB->setRAUnsigned(Inst);
     } else if (BC.MIB->isPAuthOnLR(Inst)) {
       if (!RAState) {
         // RA authenticating instructions should only follow signed RA state.
@@ -86,15 +83,10 @@
         BF.setIgnored();
         return false;
       }
-      // The authenticating instruction itself is signed, but the next will be
-      // unsigned.
-      BC.MIB->setRASigned(Inst);
-    } else if (RAState) {
-      BC.MIB->setRASigned(Inst);
-    } else {
-      BC.MIB->setRAUnsigned(Inst);
     }

+    BC.MIB->setRAState(Inst, RAState);
+
     // Updating RAState. All updates are valid from the next instruction.
     // Because the same instruction can have remember and restore, the order
     // here is relevant. This is the reason to loop over Annotations instead
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index dc3d918d14bd6..6b969011df589 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -159,8 +159,6 @@ std::vector<SectionNameAndRange> getTextSections(const BinaryContext *BC) {
   }
 }

-constexpr uint64_t DataAggregator::KernelBaseAddr;
-
 DataAggregator::~DataAggregator() { deleteTempFiles(); }

 namespace {
@@ -564,13 +562,18 @@ void DataAggregator::imputeFallThroughs() {
     // Skip fall-throughs in external code.
if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ? AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +581,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. - if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; @@ -1321,7 +1319,8 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { } using SSI = StringSwitch; - AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2); + AddrNum = + SSI(Str).Cases({"T", "R"}, 3).Case("S", 1).Case("E", 0).Default(2); CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1); } @@ -2215,7 +2214,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { OutFile << "boltedcollection\n"; if (opts::BasicAggregation) { OutFile << "no_lbr"; - for (const StringMapEntry &Entry : EventNames) + for (const StringMapEntry &Entry : EventNames) OutFile << " " << Entry.getKey(); OutFile << "\n"; @@ -2291,7 +2290,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, ListSeparator LS(","); raw_string_ostream EventNamesOS(BP.Header.EventNames); - for (const StringMapEntry &EventEntry : EventNames) + for (const StringMapEntry &EventEntry : EventNames) EventNamesOS << LS << EventEntry.first().str(); BP.Header.Flags = opts::BasicAggregation ? 
BinaryFunction::PF_BASIC @@ -2398,10 +2397,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, PseudoProbeDecoder->getAddress2ProbesMap(); BinaryFunction::FragmentsSetTy Fragments(BF->Fragments); Fragments.insert(BF); - DenseMap< - uint32_t, - std::vector>> - BlockProbes; + DenseMap BlockCtx; for (const BinaryFunction *F : Fragments) { const uint64_t FuncAddr = F->getAddress(); for (const MCDecodedPseudoProbe &Probe : @@ -2409,15 +2405,14 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); - const unsigned BlockIndex = getBlock(InputOffset).second; - BlockProbes[BlockIndex].emplace_back(Probe); + const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset); + BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe, + InputOffset - BlockOffset); } } - for (auto &[Block, Probes] : BlockProbes) { - YamlBF.Blocks[Block].PseudoProbes = - YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId); - } + for (auto &[Block, Ctx] : BlockCtx) + Ctx.finalize(YamlBF.Blocks[Block]); } // Skip printing if there's no profile data llvm::erase_if( diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 1a61949d77472..5fb65153cf313 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -348,26 +348,10 @@ class StaleMatcher { return It->second; }; - auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo - &ProfileProbe, - uint32_t NodeId) { - for (uint64_t Index = 0; Index < 64; ++Index) - if (ProfileProbe.BlockMask & 1ull << Index) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)]; - for (const auto &ProfileProbes : - {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes, - ProfileProbe.CallProbes}) - for (uint64_t ProfileProbe : ProfileProbes) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)]; - }; - - for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) { - if (!ProfileProbe.InlineTreeNodes.empty()) - for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes) - matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode); - else - matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex); - } + for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) + for (uint32_t Node : ProfileProbe.InlineTreeNodes) + for (uint64_t Probe : ProfileProbe.BlockProbes) + ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)]; uint32_t BestMatchCount = 0; uint32_t TotalMatchCount = 0; const FlowBlock *BestMatchBlock = nullptr; diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 1632aa1c6bfe2..cd4e77b0dbb60 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const MCPseudoProbeDecoder &Decoder) { return {Desc, InlineTree}; } -std::vector -YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) { - struct BlockProbeInfoHasher { - size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const { - return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes), - llvm::hash_combine_range(BPI.CallProbes), - llvm::hash_combine_range(BPI.IndCallProbes)); +void YAMLProfileWriter::BlockProbeCtx::addBlockProbe( + const InlineTreeMapTy &Map, const MCDecodedPseudoProbe 
&Probe,
+    uint32_t ProbeOffset) {
+  auto It = Map.find(Probe.getInlineTreeNode());
+  if (It == Map.end())
+    return;
+  auto NodeId = It->second;
+  uint32_t Index = Probe.getIndex();
+  if (Probe.isCall())
+    CallProbes[ProbeOffset] =
+        Call{Index, NodeId, Probe.isIndirectCall(), false};
+  else
+    NodeToProbes[NodeId].emplace_back(Index);
+}
+
+void YAMLProfileWriter::BlockProbeCtx::finalize(
+    yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
+  // Hash block probes by their sorted index vector.
+  struct ProbeHasher {
+    size_t operator()(const ArrayRef<uint64_t> Probes) const {
+      return llvm::hash_combine_range(Probes);
     }
   };

-  // Check identical BlockProbeInfo structs and merge them
-  std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>,
-                     BlockProbeInfoHasher>
-      BPIToNodes;
-  for (auto &[NodeId, Probes] : NodeProbes) {
-    yaml::bolt::PseudoProbeInfo BPI;
-    BPI.BlockProbes = std::vector(Probes[0].begin(), Probes[0].end());
-    BPI.IndCallProbes = std::vector(Probes[1].begin(), Probes[1].end());
-    BPI.CallProbes = std::vector(Probes[2].begin(), Probes[2].end());
-    BPIToNodes[BPI].push_back(NodeId);
+  // Check identical block probes and merge them
+  std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher>
+      ProbesToNodes;
+  for (auto &[NodeId, Probes] : NodeToProbes) {
+    llvm::sort(Probes);
+    ProbesToNodes[Probes].emplace_back(NodeId);
   }
-
-  auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) {
-    for (auto Id : Ids)
-      if (Id > 64)
-        Vec.emplace_back(Id);
-      else
-        Mask |= 1ull << (Id - 1);
-  };
-
-  // Add to YAML with merged nodes/block mask optimizations
-  std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes;
-  YamlProbes.reserve(BPIToNodes.size());
-  for (const auto &[BPI, Nodes] : BPIToNodes) {
-    auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo());
-    YamlBPI.CallProbes = BPI.CallProbes;
-    YamlBPI.IndCallProbes = BPI.IndCallProbes;
-    if (Nodes.size() == 1)
-      YamlBPI.InlineTreeIndex = Nodes.front();
-    else
-      YamlBPI.InlineTreeNodes = Nodes;
-    handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask);
+  for (auto &[Probes, Nodes] : ProbesToNodes) {
+    llvm::sort(Nodes);
+    YamlBB.PseudoProbes.emplace_back(
+        yaml::bolt::PseudoProbeInfo{Probes, Nodes});
+  }
+  for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) {
+    auto It = CallProbes.find(CSI.Offset);
+    if (It == CallProbes.end())
+      continue;
+    Call &Probe = It->second;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    Probe.Used = true;
+  }
+  for (const auto &[Offset, Probe] : CallProbes) {
+    if (Probe.Used)
+      continue;
+    yaml::bolt::CallSiteInfo CSI;
+    CSI.Offset = Offset;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    YamlBB.CallSites.emplace_back(CSI);
   }
-  return YamlProbes;
 }

 std::tuple<std::vector<yaml::bolt::InlineTreeNode>,
@@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
         const AddressProbesMap &ProbeMap =
             PseudoProbeDecoder->getAddress2ProbesMap();
         const uint64_t FuncAddr = BF.getAddress();
-        const std::pair<uint32_t, uint32_t> &BlockRange =
-            BB->getInputAddressRange();
-        const std::pair<uint64_t, uint64_t> BlockAddrRange = {
-            FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
-        auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second);
-        YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId);
+        auto [Start, End] = BB->getInputAddressRange();
+        Start += FuncAddr;
+        End += FuncAddr;
+        BlockProbeCtx Ctx;
+        for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End))
+          Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start);
+        Ctx.finalize(YamlBB);
       }

       YamlBF.Blocks.emplace_back(YamlBB);
@@ -382,7 +395,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
   StringSet<> EventNames = RI.getProfileReader()->getEventNames();
   if (!EventNames.empty()) {
     std::string Sep;
-    for (const StringMapEntry &EventEntry : EventNames) {
+    for (const StringMapEntry &EventEntry : EventNames) {
       BP.Header.EventNames += Sep + EventEntry.first().str();
       Sep = ",";
     }
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 782137e807662..1a0f6d75d63e8 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "bolt/Rewrite/BinaryPassManager.h"
-#include "bolt/Passes/ADRRelaxationPass.h"
+#include "bolt/Passes/AArch64RelaxationPass.h"
 #include "bolt/Passes/Aligner.h"
 #include "bolt/Passes/AllocCombiner.h"
 #include "bolt/Passes/AsmDump.h"
@@ -129,10 +129,10 @@ static cl::opt<bool> PrintJTFootprintReduction(
     cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden,
     cl::cat(BoltOptCategory));

-static cl::opt<bool>
-    PrintAdrRelaxation("print-adr-relaxation",
-                       cl::desc("print functions after ADR Relaxation pass"),
-                       cl::Hidden, cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintAArch64Relaxation(
+    "print-adr-ldr-relaxation",
+    cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden,
+    cl::cat(BoltOptCategory));

 static cl::opt<bool>
     PrintLongJmp("print-longjmp",
@@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {

   if (BC.isAArch64()) {
     Manager.registerPass(
-        std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation));
+        std::make_unique<AArch64RelaxationPass>(PrintAArch64Relaxation));

     // Tighten branches according to offset differences between branch and
     // targets. No extra instructions after this pass, otherwise we may have
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 5b15edcacb482..bc1b2ed3c2e3c 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_library(LLVMBOLTRewrite
   BuildIDRewriter.cpp
   PseudoProbeRewriter.cpp
   RewriteInstance.cpp
+  RSeqRewriter.cpp
   SDTRewriter.cpp
   GNUPropertyRewriter.cpp
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 5e3fa931e826f..816acb229fec5 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1723,7 +1723,76 @@ StringRef getSectionName(const SectionRef &Section) {
   return Name;
 }

-// Extracts an appropriate slice if input is DWP.
+/// Extracts the slice of the .debug_str.dwo section for a given CU from a DWP
+/// file, based on the .debug_str_offsets.dwo section. This helps address DWO
+/// bloat that may occur after updates.
+///
+/// A slice of .debug_str.dwo may be composed of several non-contiguous
+/// fragments. These non-contiguous string views will be written out
+/// sequentially, avoiding the copying overhead caused by assembling them.
+///
+/// The .debug_str_offsets for the first CU often does not need to be updated,
+/// so copying is only performed when .debug_str_offsets requires updating.
+static void UpdateStrAndStrOffsets(StringRef StrDWOContent,
+                                   StringRef StrOffsetsContent,
+                                   SmallVectorImpl<StringRef> &StrDWOOutData,
+                                   std::string &StrOffsetsOutData,
+                                   unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  constexpr size_t SizeOfOffset = sizeof(int32_t);
+  const uint64_t NumOffsets =
+      (StrOffsetsContent.size() - HeaderOffset) / SizeOfOffset;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  const auto getStringLength = [](StringRef Content,
+                                  uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  const auto isContiguous = [](const StringFragment &Fragment,
+                               uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // First init.
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // Expand the current fragment.
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // Save the current fragment and start a new one.
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // Update str offsets.
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(
+          &StrOffsetsOutData[HeaderOffset + I * SizeOfOffset],
+          static_cast<uint32_t>(AccumulatedStrLen), Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
+// Extracts an appropriate slice if input is DWP.
 // Applies patches or overwrites the section.
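// (Worked example for UpdateStrAndStrOffsets above, with assumed values: if a
// CU's .debug_str_offsets.dwo references DWP string offsets [100, 105, 200]
// whose strings have lengths 5, 7 and 4, the slice is emitted as two
// fragments [100, 112) and [200, 204), and the offset table is rewritten to
// [0, 5, 12].)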
std::optional<StringRef> updateDebugData(
    DWARFContext &DWCtx, StringRef SectionName, StringRef SectionContents,
@@ -1772,6 +1841,8 @@ std::optional<StringRef> updateDebugData(
     errs() << "BOLT-WARNING: unsupported debug section: " << SectionName
            << "\n";
   if (StrWriter.isInitialized()) {
+    if (CUDWOEntry)
+      return StrWriter.getBufferStr();
     OutputBuffer = StrWriter.releaseBuffer();
     return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                      OutputBuffer->size());
@@ -1786,6 +1857,8 @@ std::optional<StringRef> updateDebugData(
   }
   case DWARFSectionKind::DW_SECT_STR_OFFSETS: {
     if (StrOffstsWriter.isFinalized()) {
+      if (CUDWOEntry)
+        return StrOffstsWriter.getBufferStr();
       OutputBuffer = StrOffstsWriter.releaseBuffer();
       return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                        OutputBuffer->size());
@@ -1888,6 +1961,10 @@ void DWARFRewriter::writeDWOFiles(
     }
   }

+  StringRef StrDWOContent;
+  StringRef StrOffsetsContent;
+  llvm::SmallVector<StringRef> StrDWOOutData;
+  std::string StrOffsetsOutData;
   for (const SectionRef &Section : File->sections()) {
     std::unique_ptr<DebugBufferVector> OutputData;
     StringRef SectionName = getSectionName(Section);
@@ -1895,11 +1972,50 @@ void DWARFRewriter::writeDWOFiles(
       continue;
     Expected<StringRef> ContentsExp = Section.getContents();
     assert(ContentsExp && "Invalid contents.");
+    if (IsDWP && SectionName == "debug_str.dwo") {
+      if (StrWriter.isInitialized())
+        StrDWOContent = StrWriter.getBufferStr();
+      else
+        StrDWOContent = *ContentsExp;
+      continue;
+    }
     if (std::optional<StringRef> OutData = updateDebugData(
             (*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
             *Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
-            LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+            LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+      if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+        StrOffsetsContent = *OutData;
+        continue;
+      }
       Streamer->emitBytes(*OutData);
+    }
+  }
+
+  if (IsDWP) {
+    // Handling both .debug_str.dwo and .debug_str_offsets.dwo concurrently. In
+    // the original DWP, .debug_str is a deduplicated global table, and the
+    // .debug_str.dwo slice for a single CU needs to be extracted according to
+    // .debug_str_offsets.dwo.
+    UpdateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+                           StrOffsetsOutData, CU.getVersion(),
+                           (*DWOCU)->getContext().isLittleEndian());
+    auto SectionIter = KnownSections.find("debug_str.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      for (size_t i = 0; i < StrDWOOutData.size(); ++i) {
+        StringRef OutData = StrDWOOutData[i];
+        if (!OutData.empty())
+          Streamer->emitBytes(OutData);
+      }
+    }
+    SectionIter = KnownSections.find("debug_str_offsets.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      if (!StrOffsetsOutData.empty())
+        Streamer->emitBytes(StrOffsetsOutData);
+      else
+        Streamer->emitBytes(StrOffsetsContent);
+    }
   }
   Streamer->finish();
   TempOut->keep();
diff --git a/bolt/lib/Rewrite/RSeqRewriter.cpp b/bolt/lib/Rewrite/RSeqRewriter.cpp
new file mode 100644
index 0000000000000..46bce66d13ddf
--- /dev/null
+++ b/bolt/lib/Rewrite/RSeqRewriter.cpp
@@ -0,0 +1,72 @@
+//===- bolt/Rewrite/RSeqRewriter.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic support for restartable sequences used by tcmalloc. To avoid breaking
+// rseq critical sections, functions that contain them are excluded from
+// optimization.
+//
+// References:
+// * https://google.github.io/tcmalloc/rseq.html
+// * tcmalloc/internal/percpu_rseq_x86_64.S
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Rewrite/MetadataRewriter.h"
+#include "bolt/Rewrite/MetadataRewriters.h"
+#include "llvm/Support/Errc.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace {
+
+class RSeqRewriter final : public MetadataRewriter {
+public:
+  RSeqRewriter(StringRef Name, BinaryContext &BC)
+      : MetadataRewriter(Name, BC) {}
+
+  Error preCFGInitializer() override {
+    for (const BinarySection &Section : BC.allocatableSections()) {
+      if (Section.getName() != "__rseq_cs")
+        continue;
+
+      auto handleRelocation = [&](const Relocation &Rel, bool IsDynamic) {
+        BinaryFunction *BF = nullptr;
+        if (Rel.Symbol)
+          BF = BC.getFunctionForSymbol(Rel.Symbol);
+        else if (Relocation::isRelative(Rel.Type))
+          BF = BC.getBinaryFunctionContainingAddress(Rel.Addend);
+
+        if (!BF) {
+          BC.errs() << "BOLT-WARNING: no function found matching "
+                    << (IsDynamic ? "dynamic " : "")
+                    << "relocation in __rseq_cs\n";
+        } else if (!BF->isIgnored()) {
+          BC.outs() << "BOLT-INFO: restartable sequence reference detected in "
+                    << *BF << ". Function will not be optimized\n";
+          BF->setIgnored();
+        }
+      };
+
+      for (const Relocation &Rel : Section.dynamicRelocations())
+        handleRelocation(Rel, /*IsDynamic*/ true);
+
+      for (const Relocation &Rel : Section.relocations())
+        handleRelocation(Rel, /*IsDynamic*/ false);
+    }
+
+    return Error::success();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<MetadataRewriter>
+llvm::bolt::createRSeqRewriter(BinaryContext &BC) {
+  return std::make_unique<RSeqRewriter>("rseq-cs-rewriter", BC);
+}
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 77e5688781d57..8d6731e7540a8 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -295,7 +295,6 @@ cl::bits<GadgetScannerKind> GadgetScannersToRun(
 } // namespace opts

 // FIXME: implement a better way to mark sections for replacement.
-constexpr const char *RewriteInstance::SectionsToOverwrite[];
 std::vector<std::string> RewriteInstance::DebugSectionsToOverwrite = {
     ".debug_abbrev", ".debug_aranges",  ".debug_line",   ".debug_line_str",
     ".debug_loc",    ".debug_loclists", ".debug_ranges", ".debug_rnglists",
@@ -2955,8 +2954,10 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
       // if-condition above) so we're handling a relocation from a function
       // to itself. RISC-V uses such relocations for branches, for example.
      // These should not be registered as externally referenced offsets.
-      if (!ContainingBF)
-        ReferencedBF->registerReferencedOffset(RefFunctionOffset);
+      if (!ContainingBF && !ReferencedBF->isInConstantIsland(Address)) {
+        ReferencedBF->registerInternalRefDataRelocation(RefFunctionOffset,
+                                                        Rel.getOffset());
+      }
     }

     if (opts::Verbosity > 1 && BinarySection(*BC, RelocatedSection).isWritable())
@@ -3345,6 +3346,8 @@ void RewriteInstance::initializeMetadataManager() {

   MetadataManager.registerRewriter(createPseudoProbeRewriter(*BC));

+  MetadataManager.registerRewriter(createRSeqRewriter(*BC));
+
   MetadataManager.registerRewriter(createSDTRewriter(*BC));

   MetadataManager.registerRewriter(createGNUPropertyRewriter(*BC));
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 7769162d67eaf..db3989d6b0b5f 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) {
   atomicAdd(Insts.back(), RegTo, RegTmp);
   return Insts;
 }
+
 class AArch64MCPlusBuilder : public MCPlusBuilder {
 public:
   using MCPlusBuilder::MCPlusBuilder;
@@ -163,11 +164,53 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {

   bool isPush(const MCInst &Inst) const override {
     return isStoreToStack(Inst);
-  };
+  }

   bool isPop(const MCInst &Inst) const override {
     return isLoadFromStack(Inst);
-  };
+  }
+
+  // We look for instructions that load from the stack or adjust the stack
+  // pointer, and assume the basic block is an epilogue if and only if such
+  // instructions are present and immediately precede the branch instruction
+  // that ends the basic block.
+  bool isEpilogue(const BinaryBasicBlock &BB) const override {
+    if (BB.succ_size())
+      return false;
+
+    bool SeenLoadFromStack = false;
+    bool SeenStackPointerAdjustment = false;
+    for (const MCInst &Instr : BB) {
+      // Skip CFI pseudo instructions.
+      if (isCFI(Instr))
+        continue;
+
+      bool IsPop = isPop(Instr);
+      // A load-from-stack instruction may also adjust SP in pre-index or
+      // post-index form; for the purpose of epilogue recognition, such
+      // instructions can be skipped.
+      bool IsSPAdj = (isADD(Instr) || isMOVW(Instr)) &&
+                     Instr.getOperand(0).isReg() &&
+                     Instr.getOperand(0).getReg() == AArch64::SP;
+      SeenLoadFromStack |= IsPop;
+      SeenStackPointerAdjustment |= IsSPAdj;
+
+      if (!SeenLoadFromStack && !SeenStackPointerAdjustment)
+        continue;
+      if (IsPop || IsSPAdj || isPAuthOnLR(Instr))
+        continue;
+      if (isReturn(Instr))
+        return true;
+      if (isBranch(Instr))
+        break;
+
+      // Any previously seen load-from-stack or stack-adjustment instruction
+      // is definitely not part of an epilogue code sequence, so reset these
+      // two.
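+      // (Illustrative, mirroring the epilogue-determination test below: in
+      //   "ldr w8, [sp]; adr x10, _jmptbl; br x10"
+      // the adr resets the state and the br stays unknown control flow, while
+      //   "add sp, sp, #0x10; ldp x29, x30, [sp], #0x10; br x2"
+      // keeps it set, so the br becomes a tail call.)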
+ SeenLoadFromStack = false; + SeenStackPointerAdjustment = false; + } + return SeenLoadFromStack || SeenStackPointerAdjustment; + } void createCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx) override { @@ -583,6 +626,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +653,40 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + isLDRXl(LDRInst) ? ELF::R_AARCH64_LDST64_ABS_LO12_NC + : ELF::R_AARCH64_LDST32_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || @@ -2762,7 +2847,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 5fca5e813515f..7c24c2ce136fa 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -219,6 +219,12 @@ class X86MCPlusBuilder : public MCPlusBuilder { return getPopSize(Inst) == 0 ? 
false : true; } + bool isEpilogue(const BinaryBasicBlock &BB) const override { + return ::llvm::any_of(BB, [&](const MCInst &Instr) { + return isLeave(Instr) || isPop(Instr); + }); + } + bool isTerminateBranch(const MCInst &Inst) const override { return Inst.getOpcode() == X86::ENDBR32 || Inst.getOpcode() == X86::ENDBR64; } diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s index 6567114eb980a..a82b876fde46d 100644 --- a/bolt/test/AArch64/constant-island-entry.s +++ b/bolt/test/AArch64/constant-island-entry.s @@ -1,11 +1,16 @@ -// This test checks that we ignore functions which add an entry point that -// is in a constant island. +## This test checks that we ignore functions which add an entry point that +## is in a constant island. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o # RUN: %clang %cflags %t.o -pie -Wl,-q -o %t.exe + +## Check when the caller is successfully disassembled. # RUN: llvm-bolt %t.exe -o %t.bolt 2>&1 | FileCheck %s -# CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func +## Skip caller to check the identical warning is triggered from ScanExternalRefs(). +# RUN: llvm-bolt %t.exe -o %t.bolt -skip-funcs=caller 2>&1 | FileCheck %s + +# CHECK: BOLT-WARNING: ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func .globl func .type func, %function diff --git a/bolt/test/AArch64/epilogue-determination.s b/bolt/test/AArch64/epilogue-determination.s new file mode 100644 index 0000000000000..437d8149c0d6b --- /dev/null +++ b/bolt/test/AArch64/epilogue-determination.s @@ -0,0 +1,48 @@ +# Test that we will not incorrectly take the first basic block in function +# `_foo` as epilogue due to the first load from stack instruction. + +# RUN: %clang %cflags %s -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt --print-cfg | FileCheck %s + + .text + .global _foo + .type _foo, %function +_foo: + ldr w8, [sp] + adr x10, _jmptbl + ldrsw x9, [x10, x9, lsl #2] + add x10, x10, x9 + br x10 +# CHECK-NOT: x10 # TAILCALL +# CHECK: x10 # UNKNOWN CONTROL FLOW + mov x0, 0 + ret + mov x0, 1 + ret + + .balign 4 +_jmptbl: + .long -16 + .long -8 + + .global _bar + .type _bar, %function +_bar: + stp x29, x30, [sp, #-0x10]! + mov x29, sp + sub sp, sp, #0x10 + ldr x8, [x29, #0x30] + blr x8 + add sp, sp, #0x10 + ldp x29, x30, [sp], #0x10 + br x2 +# CHECK-NOT: x2 # UNKNOWN CONTROL FLOW +# CHECK: x2 # TAILCALL + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. 
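+## (For reference, an illustrative relaxation of `ldr x0, _foo` as produced
+## by createAdrpLdr:
+##   adrp x0, _foo
+##   ldr  x0, [x0, :lo12:_foo]
+## The NOT_NEEDED case below checks that this rewrite does not happen when
+## the target is close.)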
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW + +# RELAXW: adrp x0 +# RELAXW-NEXT: ldr w0 + +.ifdef RELAX_SIMPLE_WREG + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr w0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + + .section .text_cold + .global _foo + .align 3 +_foo: + .long 0x12345678 +.size _foo, .-_foo diff --git a/bolt/test/AArch64/relocation-type-print.s b/bolt/test/AArch64/relocation-type-print.s new file mode 100644 index 0000000000000..111cbbb94bc54 --- /dev/null +++ b/bolt/test/AArch64/relocation-type-print.s @@ -0,0 +1,24 @@ +## Verify that llvm-bolt correctly prints relocation types. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -nostartfiles %s -o %t.exe -Wl,-q,--no-relax +# RUN: llvm-bolt %t.exe --print-cfg --print-relocations -o %t.bolt \ +# RUN: | FileCheck %s + + .section .text + .align 4 + .globl _start + .type _start, %function +_start: + + adrp x0, _start +# CHECK: adrp +# CHECK-SAME: R_AARCH64_ADR_PREL_PG_HI21 + + add x0, x0, :lo12:_start +# CHECK-NEXT: add +# CHECK-SAME: R_AARCH64_ADD_ABS_LO12_NC + + ret + .size _start, .-_start diff --git a/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s new file mode 100644 index 0000000000000..cc951b689a5c6 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s @@ -0,0 +1,330 @@ +#--- main.s +# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S main.cpp +# extern int getReturn(); +# int main() { +# return getReturn(); +# } + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "." 
"main.cpp" + .loc 1 2 0 # main.cpp:2:0 + .loc 1 3 10 prologue_end # main.cpp:3:10 + .loc 1 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad -9094791692727444213 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_GNU_dwo_name + .quad -9094791692727444213 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 40 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 1 "." 
"helper.cpp" + .loc 1 1 0 # helper.cpp:1:0 + .loc 1 2 3 prologue_end # helper.cpp:2:3 + .loc 1 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x23 DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s new file mode 100644 index 0000000000000..5e938ea98bf95 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s @@ -0,0 +1,368 @@ +#--- main.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S main.cpp +# extern int getReturn(); +# int main() { +# return getReturn(); +# } + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "." 
"main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad + .loc 0 2 0 # main.cpp:2:0 + .loc 0 3 10 prologue_end # main.cpp:3:10 + .loc 0 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 24 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x1a DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 0 "." 
"helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d + .loc 0 1 0 # helper.cpp:1:0 + .loc 0 2 3 prologue_end # helper.cpp:2:3 + .loc 0 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 28 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 42 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x2a:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputation test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check that --impute-trace-fall-through accepts duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B
X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s diff --git a/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..a0e8721374a87 --- /dev/null +++ b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf4-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj main.s -o=main.o +; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj helper.s -o=helper.o +; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-dwp -e main.exe -o main.exe.dwp +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s + +;; For DWARF4, this test checks that strings are split correctly from the combined +;; section in the DWP file into the appropriate .dwo files.
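+;; +;; A note on reading the offsets below (stated as an editorial aid, grounded in +;; the check lines themselves): DWARF v4 string-offsets contributions are +;; headerless arrays of 4-byte offsets into .debug_str.dwo. In the DWP the two +;; CUs share the deduplicated strings "int" (0x05) and "clang version 22.0.0" +;; (0x09); once BOLT re-splits the sections, each .dwo.dwo file carries its own +;; string table, so e.g. "int" ends up at offset 0x18 in helper.dwo.dwo.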
+ +; PRE-BOLT-STR: 0x00000000: "main" +; PRE-BOLT-STR: 0x00000005: "int" +; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0" +; PRE-BOLT-STR: 0x0000001e: "main.cpp" +; PRE-BOLT-STR: 0x00000027: "main.dwo" +; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv" +; PRE-BOLT-STR: 0x0000003e: "getReturn" +; PRE-BOLT-STR: 0x00000048: "helper.cpp" +; PRE-BOLT-STR: 0x00000053: "helper.dwo" + +; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000000: 00000000 "main" +; PRE-BOLT-STR-OFFSETS: 0x00000004: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" +; PRE-BOLT-STR-OFFSETS: 0x00000014: Contribution size = 24, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000014: 00000030 "_Z9getReturnv" +; PRE-BOLT-STR-OFFSETS: 0x00000018: 0000003e "getReturn" +; PRE-BOLT-STR-OFFSETS: 0x0000001c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000020: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000048 "helper.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000028: 00000053 "helper.dwo" + +; BOLT-MAIN-STR: 0x00000000: "main" +; BOLT-MAIN-STR: 0x00000005: "int" +; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0" +; BOLT-MAIN-STR: 0x0000001e: "main.cpp" +; BOLT-MAIN-STR: 0x00000027: "main.dwo" + +; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; BOLT-MAIN-STR-OFFSETS: 0x00000000: 00000000 "main" +; BOLT-MAIN-STR-OFFSETS: 0x00000004: 00000005 "int" +; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" + +; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv" +; BOLT-HELPER-STR: 0x0000000e: "getReturn" +; BOLT-HELPER-STR: 0x00000018: "int" +; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0" +; BOLT-HELPER-STR: 0x00000031: "helper.cpp" +; BOLT-HELPER-STR: 0x0000003c: "helper.dwo" + +; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 4 +; BOLT-HELPER-STR-OFFSETS: 0x00000000: 00000000 "_Z9getReturnv" +; BOLT-HELPER-STR-OFFSETS: 0x00000004: 0000000e "getReturn" +; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000018 "int" +; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000001c "clang version 22.0.0" +; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000031 "helper.cpp" +; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000003c "helper.dwo" diff --git a/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..2e72c6a808924 --- /dev/null +++ b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf5-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj main.s -o=main.o +; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj helper.s -o=helper.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-dwp -e main.exe -o main.exe.dwp +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s +; RUN: llvm-bolt main.exe -o
main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s + +;; For DWARF5, this test checks that strings are split correctly from the combined +;; section in the DWP file into the appropriate .dwo files. + +; PRE-BOLT-STR: 0x00000000: "main" +; PRE-BOLT-STR: 0x00000005: "int" +; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0" +; PRE-BOLT-STR: 0x0000001e: "main.cpp" +; PRE-BOLT-STR: 0x00000027: "main.dwo" +; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv" +; PRE-BOLT-STR: 0x0000003e: "getReturn" +; PRE-BOLT-STR: 0x00000048: "helper.cpp" +; PRE-BOLT-STR: 0x00000053: "helper.dwo" + +; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5 +; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000000 "main" +; PRE-BOLT-STR-OFFSETS: 0x0000000c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000014: 0000001e "main.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000018: 00000027 "main.dwo" +; PRE-BOLT-STR-OFFSETS: 0x0000001c: Contribution size = 28, Format = DWARF32, Version = 5 +; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000030 "_Z9getReturnv" +; PRE-BOLT-STR-OFFSETS: 0x00000028: 0000003e "getReturn" +; PRE-BOLT-STR-OFFSETS: 0x0000002c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000030: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000034: 00000048 "helper.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000038: 00000053 "helper.dwo" + +; BOLT-MAIN-STR: 0x00000000: "main" +; BOLT-MAIN-STR: 0x00000005: "int" +; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0" +; BOLT-MAIN-STR: 0x0000001e: "main.cpp" +; BOLT-MAIN-STR: 0x00000027: "main.dwo" + +; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5 +; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000000 "main" +; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 00000005 "int" +; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0" +; BOLT-MAIN-STR-OFFSETS: 0x00000014: 0000001e "main.cpp" +; BOLT-MAIN-STR-OFFSETS: 0x00000018: 00000027 "main.dwo" + +; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv" +; BOLT-HELPER-STR: 0x0000000e: "getReturn" +; BOLT-HELPER-STR: 0x00000018: "int" +; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0" +; BOLT-HELPER-STR: 0x00000031: "helper.cpp" +; BOLT-HELPER-STR: 0x0000003c: "helper.dwo" + +; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5 +; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000000 "_Z9getReturnv" +; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000000e "getReturn" +; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000018 "int" +; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000001c "clang version 22.0.0" +; BOLT-HELPER-STR-OFFSETS: 0x00000018: 00000031 "helper.cpp" +; BOLT-HELPER-STR-OFFSETS: 0x0000001c: 0000003c "helper.dwo" diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test index accb4742851ea..9224cf163dbcc 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test +++
b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test @@ -30,7 +30,7 @@ functions: insns: 11 hash: 0x1 exec: 1 - probes: [ { blx: 9 } ] + probes: [ { blk: [ 1, 4 ] } ] inline_tree: [ { } ] - name: foo fid: 10 @@ -43,7 +43,7 @@ functions: hash: 0x2 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3 } ] + probes: [ { blk: [ 1, 2 ] } ] inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] - name: main fid: 11 @@ -56,7 +56,7 @@ functions: hash: 0x3 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3, id: 1 }, { blx: 1 } ] + probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { blk: [ 1 ] } ] inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] pseudo_probe_desc: gs: [ 0xE413754A191DB537, 0x5CF8C24CDB18BDAC, 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 40cb64ee82919..7be327d698b17 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -55,7 +55,7 @@ functions: hash: 0xFFFFFFFFFFFFFFF1 insns: 1 succ: [ { bid: 3, cnt: 1} ] - probes: [ { blx: 1 } ] + probes: [ { blk: [ 1 ] } ] inline_tree: [ { g: 0 } ] pseudo_probe_desc: gs: [ 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index e5e8aadc18f9e..9748fc1b6a4d4 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -14,17 +14,17 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3, id: 1 }, { blx: 1 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { } ] # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test b/bolt/test/X86/pseudoprobe-decoding-noinline.test index 36a2fab74e857..4ba51cdc96f9e 100644 --- a/bolt/test/X86/pseudoprobe-decoding-noinline.test +++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test @@ -15,17 +15,18 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 2 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 1, call: [ 2 ] } ] +# CHECK-YAML: calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ] +# CHECK-YAML: probes: [ { } ] # CHECK-YAML: inline_tree: [ { g: 1 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/rseq.s b/bolt/test/X86/rseq.s new file mode 100644 index 0000000000000..ef81bca02c8b7 --- /dev/null +++ b/bolt/test/X86/rseq.s @@ -0,0 +1,38 @@ +## Check that llvm-bolt avoids optimization of functions referenced from the +## __rseq_cs section, i.e. functions containing critical sections and abort +## handlers used by restartable sequences, as in tcmalloc.
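+## +## Editorial note: the __rseq_cs entry emitted below is a reduced stand-in. A +## real struct rseq_cs also carries version/flags, start_ip, +## post_commit_offset and abort_ip fields; this test keeps only the two code +## references (the critical section at .L1 and the abort handler at .L3) that +## llvm-bolt must detect. The abort handler is preceded by a 4-byte signature +## (0x42424242 here), mirroring the kernel's requirement for rseq abort IPs.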
+ +# RUN: %clang %cflags %s -o %t -nostdlib -no-pie -Wl,-q +# RUN: llvm-bolt %t -o %t.bolt --print-cfg 2>&1 | FileCheck %s +# RUN: %clang %cflags %s -o %t.pie -nostdlib -pie -Wl,-q +# RUN: llvm-bolt %t.pie -o %t.pie.bolt 2>&1 | FileCheck %s + +# CHECK: restartable sequence reference detected in _start +# CHECK: restartable sequence reference detected in __rseq_abort + +## Force relocations against .text + .text +.reloc 0, R_X86_64_NONE + + .global _start + .type _start, %function +_start: + pushq %rbp + mov %rsp, %rbp +.L1: + pop %rbp +.L2: + retq + .size _start, .-_start + + .section __rseq_abort, "ax" +## Signature for rseq abort IP. Unmarked in the symbol table. + .byte 0x0f, 0x1f, 0x05 + .long 0x42424242 +.L3: + jmp .L2 + +.section __rseq_cs, "aw" +.balign 32 + .quad .L1 + .quad .L3 diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- /dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..75066c855b9ed 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -81,11 +81,14 @@ # CHECK-ASM: bl{{.*}}: +# CHECK-ASM-LABEL: : +# CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}: +# CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}} //PN/-/-/10/COND/- //-/-/-/0//-\n" + " 4567 0xa002/0xa003/PN/-/-/10/COND/- 0x0/0xa001/-/-/-/0//-\n" + " 4567 0xb002/0xb003/P/-/-/4/RET/- 0x0/0xb001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xe005/0xe009/P/-/-/14/RET/- 0x0/0xe001/-/-/-/0//-\n" + " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n" + " 4567 0xf002/0xf003/MN/-/-/8/COND/- 0x0/0xf001/-/-/-/0//-\n" + " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n"; + + // ExpectedSamples contains the aggregated information about + // a branch {{From, To, TraceTo}, {TakenCount, MispredCount}}. + // Where + // - From: is the source address of the sampled branch operation. + // - To: is the target address of the sampled branch operation. + // - TraceTo could be either + // - A 'Type = Trace::BR_ONLY', which means the trace only contains branch + // data. + // - Or an address, when the trace contains information about the previous + // branch. 
+ // + // When FEAT_SPE_PBT is present, Arm SPE emits two records per sample: + // - the current branch (Spe.From/Spe.To), and + // - the previous taken branch target (PBT) record (PBT.From, PBT.To). + // + // Together they behave like a depth-1 branch stack where: + // - the PBT entry is always taken + // - the current branch entry may represent a taken branch or a fall-through + // - the destination (Spe.To) is the architecturally executed target + // + // Fall-throughs may need to be inferred between the PBT entry and + // the current branch (Spe.From), but never between the current branch's + // own source and target (Spe.From/Spe.To). + // + // PBT records only the target address (PBT.To), meaning we have no + // information about the branch source (PBT.From=0x0), the branch type, or + // the prediction bit. + // + // Consider the trace pair: + // {{Spe.From, Spe.To, Type}, {TK, MP}}, + // {{PBT.From, PBT.To, TraceTo}, {TK, MP}} + // {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, {{0x0, 0xd123, 0xd456}, {2, 0}} + // + // The first entry is the Spe record, which represents a trace from 0xd456 + // (Spe.From) to 0xd789 (Spe.To). Type = Trace::BR_ONLY, as BOLT processes the + // current branch event first. At this point we have no information about the + // previous trace (PBT). This entry has a TakenCount = 2, as we have two + // samples for (0xd456, 0xd789) in our input. It also has MispredsCount = 2, + // as the 'M' misprediction flag appears in both cases. + // + // The second entry is the PBT record. TakenCount = 2 because the + // (PBT.From = 0x0, PBT.To = 0xd123) branch target appears twice in the input, + // and MispredsCount = 0 because prediction data is absent. There is no branch + // source information, so the PBT.From field is zero (0x0). TraceTo = 0xd456 + // connects the flow from the previous taken branch at 0xd123 (PBT.To) to the + // current branch's source at 0xd456 (Spe.From), which then continues to 0xd789 + // (Spe.To). + std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = { + {{0xa002, 0xa003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xa001, 0xa002}, {1, 0}}, + {{0xb002, 0xb003, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xb001, 0xb002}, {1, 0}}, + {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 0}}, + {{0x0, 0xc123, 0xc456}, {2, 0}}, + {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, + {{0x0, 0xd123, 0xd456}, {2, 0}}, + {{0xe005, 0xe009, Trace::BR_ONLY}, {1, 0}}, + {{0x0, 0xe001, 0xe005}, {1, 0}}, + {{0xf002, 0xf003, Trace::BR_ONLY}, {1, 1}}, + {{0x0, 0xf001, 0xf002}, {1, 0}}}; + + parseAndCheckBrstackEvents(4567, ExpectedSamples); +} + #endif diff --git a/clang-tools-extra/Maintainers.txt b/clang-tools-extra/Maintainers.rst similarity index 91% rename from clang-tools-extra/Maintainers.txt rename to clang-tools-extra/Maintainers.rst index 43dfd48ad1f57..2603ebadf529c 100644 --- a/clang-tools-extra/Maintainers.txt +++ b/clang-tools-extra/Maintainers.rst @@ -2,9 +2,13 @@ Clang Tools Extra Maintainers ============================= -This file is a list of the maintainers -(https://llvm.org/docs/DeveloperPolicy.html#maintainers) for clang-tools-extra. +This file is a list of the +`maintainers <https://llvm.org/docs/DeveloperPolicy.html#maintainers>`_ +for the `Extra Clang Tools <https://clang.llvm.org/extra/>`_ project. +..
contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp index e23511bf63690..3a7ac6e2abcdd 100644 --- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp +++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp @@ -303,8 +303,6 @@ static const std::vector>> // AbbreviationMap -constexpr unsigned char BitCodeConstants::Signature[]; - void ClangDocBitcodeWriter::AbbreviationMap::add(RecordId RID, unsigned AbbrevID) { assert(RecordIdNameMap[RID] && "Unknown RecordId."); diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp index b4b9322b0500a..1e757101549c6 100644 --- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp @@ -29,7 +29,8 @@ namespace clang { namespace doc { static Error generateDocForJSON(json::Value &JSON, StringRef Filename, StringRef Path, raw_fd_ostream &OS, - const ClangDocContext &CDCtx); + const ClangDocContext &CDCtx, + StringRef HTMLRootPath); static Error createFileOpenError(StringRef FileName, std::error_code EC) { return createFileError("cannot open file " + FileName, EC); @@ -155,20 +156,27 @@ Error MustacheHTMLGenerator::generateDocs( SmallString<128> JSONPath; sys::path::native(RootDir.str() + "/json", JSONPath); - StringMap JSONFileMap; { llvm::TimeTraceScope TS("Iterate JSON files"); std::error_code EC; - sys::fs::directory_iterator JSONIter(JSONPath, EC); + sys::fs::recursive_directory_iterator JSONIter(JSONPath, EC); std::vector JSONFiles; JSONFiles.reserve(Infos.size()); if (EC) return createStringError("Failed to create directory iterator."); - SmallString<128> HTMLDirPath(RootDir.str() + "/html/"); + SmallString<128> HTMLDirPath(RootDir.str() + "/html"); if (auto EC = sys::fs::create_directories(HTMLDirPath)) return createFileError(HTMLDirPath, EC); - while (JSONIter != sys::fs::directory_iterator()) { + while (JSONIter != sys::fs::recursive_directory_iterator()) { + // create the same directory structure in the HTML dir + if (JSONIter->type() == sys::fs::file_type::directory_file) { + SmallString<128> HTMLClonedPath(JSONIter->path()); + sys::path::replace_path_prefix(HTMLClonedPath, JSONPath, HTMLDirPath); + if (auto EC = sys::fs::create_directories(HTMLClonedPath)) + return createFileError(HTMLClonedPath, EC); + } + if (EC) return createFileError("Failed to iterate: " + JSONIter->path(), EC); @@ -190,15 +198,16 @@ Error MustacheHTMLGenerator::generateDocs( return Parsed.takeError(); std::error_code FileErr; - SmallString<128> HTMLFilePath(HTMLDirPath); - sys::path::append(HTMLFilePath, sys::path::filename(Path)); + SmallString<128> HTMLFilePath(JSONIter->path()); + sys::path::replace_path_prefix(HTMLFilePath, JSONPath, HTMLDirPath); sys::path::replace_extension(HTMLFilePath, "html"); raw_fd_ostream InfoOS(HTMLFilePath, FileErr, sys::fs::OF_None); if (FileErr) return createFileOpenError(Path, FileErr); - if (Error Err = generateDocForJSON(*Parsed, sys::path::stem(HTMLFilePath), - HTMLFilePath, InfoOS, CDCtx)) + if (Error Err = + generateDocForJSON(*Parsed, sys::path::stem(HTMLFilePath), + HTMLFilePath, InfoOS, CDCtx, HTMLDirPath)) return Err; JSONIter.increment(EC); } @@ -207,16 +216,16 @@ Error MustacheHTMLGenerator::generateDocs( return Error::success(); } -static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { +static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V, 
+ SmallString<128> RelativeHTMLPath) { V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName}); json::Value StylesheetArr = Array(); - SmallString<128> RelativePath("./"); - sys::path::native(RelativePath, sys::path::Style::posix); + sys::path::native(RelativeHTMLPath, sys::path::Style::posix); auto *SSA = StylesheetArr.getAsArray(); SSA->reserve(CDCtx.UserStylesheets.size()); for (const auto &FilePath : CDCtx.UserStylesheets) { - SmallString<128> StylesheetPath = RelativePath; + SmallString<128> StylesheetPath = RelativeHTMLPath; sys::path::append(StylesheetPath, sys::path::Style::posix, sys::path::filename(FilePath)); SSA->emplace_back(StylesheetPath); @@ -227,7 +236,7 @@ static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { auto *SCA = ScriptArr.getAsArray(); SCA->reserve(CDCtx.JsScripts.size()); for (auto Script : CDCtx.JsScripts) { - SmallString<128> JsPath = RelativePath; + SmallString<128> JsPath = RelativeHTMLPath; sys::path::append(JsPath, sys::path::Style::posix, sys::path::filename(Script)); SCA->emplace_back(JsPath); @@ -238,7 +247,8 @@ static Error setupTemplateValue(const ClangDocContext &CDCtx, json::Value &V) { static Error generateDocForJSON(json::Value &JSON, StringRef Filename, StringRef Path, raw_fd_ostream &OS, - const ClangDocContext &CDCtx) { + const ClangDocContext &CDCtx, + StringRef HTMLRootPath) { auto StrValue = (*JSON.getAsObject())["InfoType"]; if (StrValue.kind() != json::Value::Kind::String) return createStringError("JSON file '%s' does not contain key: 'InfoType'.", @@ -249,13 +259,17 @@ static Error generateDocForJSON(json::Value &JSON, StringRef Filename, "JSON file '%s' does not contain 'InfoType' field as a string.", Filename.str().c_str()); + SmallString<128> PathVec(Path); + // Remove filename, or else the relative path will have an extra "../" + sys::path::remove_filename(PathVec); + auto RelativeHTMLPath = computeRelativePath(HTMLRootPath, PathVec); if (ObjTypeStr.value() == "namespace") { - if (auto Err = setupTemplateValue(CDCtx, JSON)) + if (auto Err = setupTemplateValue(CDCtx, JSON, RelativeHTMLPath)) return Err; assert(NamespaceTemplate && "NamespaceTemplate is nullptr."); NamespaceTemplate->render(JSON, OS); } else if (ObjTypeStr.value() == "record") { - if (auto Err = setupTemplateValue(CDCtx, JSON)) + if (auto Err = setupTemplateValue(CDCtx, JSON, RelativeHTMLPath)) return Err; assert(RecordTemplate && "RecordTemplate is nullptr."); RecordTemplate->render(JSON, OS); diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index b17cc80bdba34..9a770d7939a4b 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -468,7 +468,6 @@ static void insertArray(Object &Obj, json::Value &Array, StringRef Key) { static void serializeInfo(const RecordInfo &I, json::Object &Obj, const std::optional &RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); - Obj["FullName"] = I.FullName; Obj["TagType"] = getTagType(I.TagType); Obj["IsTypedef"] = I.IsTypeDef; Obj["MangledName"] = I.MangledName; @@ -582,22 +581,14 @@ static SmallString<16> determineFileName(Info *I, SmallString<128> &Path) { if (I->IT == InfoType::IT_record) { auto *RecordSymbolInfo = static_cast(I); FileName = RecordSymbolInfo->MangledName; - } else if (I->USR == GlobalNamespaceID) + } else if (I->IT == InfoType::IT_namespace) { FileName = "index"; - else if (I->IT == InfoType::IT_namespace) { - for (const auto &NS : I->Namespace) { - FileName 
+= NS.Name; - FileName += "_"; - } - FileName += I->Name; } else FileName = I->Name; sys::path::append(Path, FileName + ".json"); return FileName; } -// FIXME: Revert back to creating nested directories for namespaces instead of -// putting everything in a flat directory structure. Error JSONGenerator::generateDocs( StringRef RootDir, llvm::StringMap> Infos, const ClangDocContext &CDCtx) { @@ -610,6 +601,7 @@ Error JSONGenerator::generateDocs( auto RootDirStr = RootDir.str() + "/json"; StringRef JSONDir = StringRef(RootDirStr); sys::path::native(JSONDir, Path); + sys::path::append(Path, Info->getRelativeFilePath("")); if (!CreatedDirs.contains(Path)) { if (std::error_code Err = sys::fs::create_directories(Path); Err != std::error_code()) diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h index d8c2b9c0a5842..79e9bfc291c3a 100644 --- a/clang-tools-extra/clang-doc/Representation.h +++ b/clang-tools-extra/clang-doc/Representation.h @@ -437,10 +437,6 @@ struct FunctionInfo : public SymbolInfo { // (AS_public = 0, AS_protected = 1, AS_private = 2, AS_none = 3) AccessSpecifier Access = AccessSpecifier::AS_public; - // Full qualified name of this function, including namespaces and template - // specializations. - SmallString<16> FullName; - // Function Prototype SmallString<256> Prototype; @@ -460,10 +456,6 @@ struct RecordInfo : public SymbolInfo { // Type of this record (struct, class, union, interface). TagTypeKind TagType = TagTypeKind::Struct; - // Full qualified name of this record, including namespaces and template - // specializations. - SmallString<16> FullName; - // When present, this record is a template or specialization. std::optional Template; diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp index 186f634dd892a..7f8691d63622f 100644 --- a/clang-tools-extra/clang-doc/Serialize.cpp +++ b/clang-tools-extra/clang-doc/Serialize.cpp @@ -178,55 +178,6 @@ static llvm::SmallString<16> getTypeAlias(const TypeAliasDecl *Alias) { return Result; } -// extract full syntax for record declaration -static llvm::SmallString<16> getRecordPrototype(const CXXRecordDecl *CXXRD) { - llvm::SmallString<16> Result; - LangOptions LangOpts; - PrintingPolicy Policy(LangOpts); - Policy.SuppressTagKeyword = false; - Policy.FullyQualifiedName = true; - Policy.IncludeNewlines = false; - llvm::raw_svector_ostream OS(Result); - if (const auto *TD = CXXRD->getDescribedClassTemplate()) { - OS << "template <"; - bool FirstParam = true; - for (const auto *Param : *TD->getTemplateParameters()) { - if (!FirstParam) - OS << ", "; - Param->print(OS, Policy); - FirstParam = false; - } - OS << ">\n"; - } - - if (CXXRD->isStruct()) - OS << "struct "; - else if (CXXRD->isClass()) - OS << "class "; - else if (CXXRD->isUnion()) - OS << "union "; - - OS << CXXRD->getNameAsString(); - - // We need to make sure we have a good enough declaration to check. In the - // case where the class is a forward declaration, we'll fail assertions in - // DeclCXX. - if (CXXRD->isCompleteDefinition() && CXXRD->getNumBases() > 0) { - OS << " : "; - bool FirstBase = true; - for (const auto &Base : CXXRD->bases()) { - if (!FirstBase) - OS << ", "; - if (Base.isVirtual()) - OS << "virtual "; - OS << getAccessSpelling(Base.getAccessSpecifier()) << " "; - OS << Base.getType().getAsString(Policy); - FirstBase = false; - } - } - return Result; -} - // A function to extract the appropriate relative path for a given info's // documentation. 
The path returned is a composite of the parent namespaces. // @@ -1033,7 +984,6 @@ emitInfo(const RecordDecl *D, const FullComment *FC, Location Loc, parseFields(*RI, D, PublicOnly); if (const auto *C = dyn_cast(D)) { - RI->FullName = getRecordPrototype(C); if (const TypedefNameDecl *TD = C->getTypedefNameForAnonDecl()) { RI->Name = TD->getNameAsString(); RI->IsTypeDef = true; diff --git a/clang-tools-extra/clang-doc/assets/class-template.mustache b/clang-tools-extra/clang-doc/assets/class-template.mustache index b1a7470f7c33a..a320a938a91ff 100644 --- a/clang-tools-extra/clang-doc/assets/class-template.mustache +++ b/clang-tools-extra/clang-doc/assets/class-template.mustache @@ -141,9 +141,7 @@
{{#PublicMembers}}
-
-                            {{Type}} {{Name}}
-                        
+
{{Type}} {{Name}}
{{#MemberComments}}
{{>Comments}} @@ -160,9 +158,7 @@
{{#Obj}}
-
-{{Type}} {{Name}}
-                        
+
{{Type}} {{Name}}
{{#MemberComments}}
{{>Comments}} diff --git a/clang-tools-extra/clang-doc/assets/namespace-template.mustache b/clang-tools-extra/clang-doc/assets/namespace-template.mustache index d96bc5ce91f3a..f4a35cfe4c79a 100644 --- a/clang-tools-extra/clang-doc/assets/namespace-template.mustache +++ b/clang-tools-extra/clang-doc/assets/namespace-template.mustache @@ -92,9 +92,7 @@ {{#Records}}
  • -
    -                                        class {{Name}}
    -                                    
    +
    class {{Name}}
  • {{/Records}} diff --git a/clang-tools-extra/clang-tidy/.clang-tidy b/clang-tools-extra/clang-tidy/.clang-tidy index 0c2f34b529016..2cd9af494c1ec 100644 --- a/clang-tools-extra/clang-tidy/.clang-tidy +++ b/clang-tools-extra/clang-tidy/.clang-tidy @@ -15,7 +15,6 @@ Checks: > performance-*, -performance-enum-size, -performance-no-int-to-ptr, - -performance-unnecessary-value-param, readability-*, -readability-avoid-nested-conditional-operator, -readability-braces-around-statements, diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 7e18f3806a143..7b40c80653ebc 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -117,7 +117,8 @@ class ErrorReporter { void reportDiagnostic(const ClangTidyError &Error) { const tooling::DiagnosticMessage &Message = Error.Message; - SourceLocation Loc = getLocation(Message.FilePath, Message.FileOffset); + const SourceLocation Loc = + getLocation(Message.FilePath, Message.FileOffset); // Contains a pair for each attempted fix: location and whether the fix was // applied successfully. SmallVector, 4> FixLocations; @@ -157,11 +158,11 @@ class ErrorReporter { // FIXME: Implement better conflict handling. llvm::errs() << "Trying to resolve conflict: " << llvm::toString(std::move(Err)) << "\n"; - unsigned NewOffset = + const unsigned NewOffset = Replacements.getShiftedCodePosition(R.getOffset()); - unsigned NewLength = Replacements.getShiftedCodePosition( - R.getOffset() + R.getLength()) - - NewOffset; + const unsigned NewLength = Replacements.getShiftedCodePosition( + R.getOffset() + R.getLength()) - + NewOffset; if (NewLength == R.getLength()) { R = Replacement(R.getFilePath(), NewOffset, NewLength, R.getReplacementText()); @@ -200,7 +201,7 @@ class ErrorReporter { for (const auto &FileAndReplacements : FileReplacements) { Rewriter Rewrite(SourceMgr, LangOpts); - StringRef File = FileAndReplacements.first(); + const StringRef File = FileAndReplacements.first(); VFS.setCurrentWorkingDirectory(FileAndReplacements.second.BuildDir); llvm::ErrorOr> Buffer = SourceMgr.getFileManager().getBufferForFile(File); @@ -210,7 +211,7 @@ class ErrorReporter { // FIXME: Maybe don't apply fixes for other files as well. 
continue; } - StringRef Code = Buffer.get()->getBuffer(); + const StringRef Code = Buffer.get()->getBuffer(); auto Style = format::getStyle( *Context.getOptionsForFile(File).FormatStyle, File, "none"); if (!Style) { @@ -262,7 +263,7 @@ class ErrorReporter { if (!File) return {}; - FileID ID = SourceMgr.getOrCreateFileID(*File, SrcMgr::C_User); + const FileID ID = SourceMgr.getOrCreateFileID(*File, SrcMgr::C_User); return SourceMgr.getLocForStartOfFile(ID).getLocWithOffset(Offset); } @@ -284,7 +285,8 @@ class ErrorReporter { } void reportNote(const tooling::DiagnosticMessage &Message) { - SourceLocation Loc = getLocation(Message.FilePath, Message.FileOffset); + const SourceLocation Loc = + getLocation(Message.FilePath, Message.FileOffset); auto Diag = Diags.Report(Loc, Diags.getCustomDiagID(DiagnosticsEngine::Note, "%0")) << Message.Message; @@ -296,8 +298,9 @@ class ErrorReporter { CharSourceRange getRange(const FileByteRange &Range) { SmallString<128> AbsoluteFilePath{Range.FilePath}; Files.makeAbsolutePath(AbsoluteFilePath); - SourceLocation BeginLoc = getLocation(AbsoluteFilePath, Range.FileOffset); - SourceLocation EndLoc = BeginLoc.getLocWithOffset(Range.Length); + const SourceLocation BeginLoc = + getLocation(AbsoluteFilePath, Range.FileOffset); + const SourceLocation EndLoc = BeginLoc.getLocWithOffset(Range.Length); // Retrieve the source range for applicable highlights and fixes. Macro // definition on the command line have locations in a virtual buffer and // don't have valid file paths and are therefore not applicable. @@ -353,7 +356,8 @@ ClangTidyASTConsumerFactory::ClangTidyASTConsumerFactory( if (Context.canExperimentalCustomChecks() && custom::RegisterCustomChecks) custom::RegisterCustomChecks(Context.getOptions(), *CheckFactories); #endif - for (ClangTidyModuleRegistry::entry E : ClangTidyModuleRegistry::entries()) { + for (const ClangTidyModuleRegistry::entry E : + ClangTidyModuleRegistry::entries()) { std::unique_ptr Module = E.instantiate(); Module->addCheckFactories(*CheckFactories); } @@ -394,8 +398,9 @@ static CheckersList getAnalyzerCheckersAndPackages(ClangTidyContext &Context, // Always add all core checkers if any other static analyzer check is enabled. // This is currently necessary, as other path sensitive checks rely on the // core checkers. 
- for (StringRef CheckName : RegisteredCheckers) { - std::string ClangTidyCheckName((AnalyzerCheckNamePrefix + CheckName).str()); + for (const StringRef CheckName : RegisteredCheckers) { + const std::string ClangTidyCheckName( + (AnalyzerCheckNamePrefix + CheckName).str()); if (CheckName.starts_with("core") || Context.isCheckEnabled(ClangTidyCheckName)) { @@ -450,8 +455,8 @@ ClangTidyASTConsumerFactory::createASTConsumer( if (Context.canEnableModuleHeadersParsing() && Context.getLangOpts().Modules && OverlayFS != nullptr) { - auto ModuleExpander = - std::make_unique(&Compiler, OverlayFS); + auto ModuleExpander = std::make_unique( + &Compiler, *OverlayFS); ModuleExpanderPP = ModuleExpander->getPreprocessor(); PP->addPPCallbacks(std::move(ModuleExpander)); } @@ -504,7 +509,7 @@ std::vector ClangTidyASTConsumerFactory::getCheckNames() { ClangTidyOptions::OptionMap ClangTidyASTConsumerFactory::getCheckOptions() { ClangTidyOptions::OptionMap Options; - std::vector> Checks = + const std::vector> Checks = CheckFactories->createChecks(&Context); for (const auto &Check : Checks) Check->storeOptions(Options); @@ -564,7 +569,7 @@ runClangTidy(clang::tidy::ClangTidyContext &Context, std::make_shared(), BaseFS); // Add extra arguments passed by the clang-tidy command-line. - ArgumentsAdjuster PerFileExtraArgumentsInserter = + const ArgumentsAdjuster PerFileExtraArgumentsInserter = [&Context](const CommandLineArguments &Args, StringRef Filename) { ClangTidyOptions Opts = Context.getOptionsForFile(Filename); CommandLineArguments AdjustedArgs = Args; @@ -703,7 +708,7 @@ ChecksAndOptions getAllChecksAndOptions(bool AllowEnablingAnalyzerAlphaCheckers, #if CLANG_TIDY_ENABLE_STATIC_ANALYZER SmallString<64> Buffer(AnalyzerCheckNamePrefix); - size_t DefSize = Buffer.size(); + const size_t DefSize = Buffer.size(); for (const auto &AnalyzerCheck : AnalyzerOptions::getRegisteredCheckers( AllowEnablingAnalyzerAlphaCheckers)) { Buffer.truncate(DefSize); diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index 6e0c252a80bf8..b747657594ac0 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -161,7 +161,7 @@ ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, if (Iter == CheckOptions.end()) return std::nullopt; - StringRef Value = Iter->getValue().Value; + const StringRef Value = Iter->getValue().Value; StringRef Closest; unsigned EditDistance = 3; for (const auto &NameAndEnum : Mapping) { @@ -173,7 +173,7 @@ ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, EditDistance = 0; continue; } - unsigned Distance = + const unsigned Distance = Value.edit_distance(NameAndEnum.second, true, EditDistance); if (Distance < EditDistance) { EditDistance = Distance; diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index e53ae532d7e5f..905e419cdf0ca 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -458,7 +458,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { template std::enable_if_t, std::vector> typeEraseMapping() const { - ArrayRef> Mapping = + const ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); std::vector Result; Result.reserve(Mapping.size()); diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 65fd09f99ef0f..81a9f932e547d 100644 --- 
a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -61,7 +61,7 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { // FIXME: Remove this once there's a better way to pass check names than // appending the check name to the message in ClangTidyContext::diag and // using getCustomDiagID. - std::string CheckNameInMessage = " [" + Error.DiagnosticName + "]"; + const std::string CheckNameInMessage = " [" + Error.DiagnosticName + "]"; Message.consume_back(CheckNameInMessage); auto TidyMessage = @@ -77,7 +77,7 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { if (SourceRange.isCharRange()) return SourceRange; assert(SourceRange.isTokenRange()); - SourceLocation End = Lexer::getLocForEndOfToken( + const SourceLocation End = Lexer::getLocForEndOfToken( SourceRange.getEnd(), 0, Loc.getManager(), LangOpts); return CharSourceRange::getCharRange(SourceRange.getBegin(), End); }; @@ -114,14 +114,14 @@ class ClangTidyDiagnosticRenderer : public DiagnosticRenderer { Level == DiagnosticsEngine::Note ? &Error.Notes.back() : &Error.Message; for (const auto &FixIt : Hints) { - CharSourceRange Range = FixIt.RemoveRange; + const CharSourceRange Range = FixIt.RemoveRange; assert(Range.getBegin().isValid() && Range.getEnd().isValid() && "Invalid range in the fix-it hint."); assert(Range.getBegin().isFileID() && Range.getEnd().isFileID() && "Only file locations supported in fix-it hints."); - tooling::Replacement Replacement(Loc.getManager(), Range, - FixIt.CodeToInsert); + const tooling::Replacement Replacement(Loc.getManager(), Range, + FixIt.CodeToInsert); llvm::Error Err = DiagWithFix->Fix[Replacement.getFilePath()].add(Replacement); // FIXME: better error handling (at least, don't let other replacements be @@ -177,7 +177,7 @@ DiagnosticBuilder ClangTidyContext::diag( StringRef CheckName, SourceLocation Loc, StringRef Description, DiagnosticIDs::Level Level /* = DiagnosticIDs::Warning*/) { assert(Loc.isValid()); - unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( + const unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( Level, (Description + " [" + CheckName + "]").str()); CheckNamesByDiagnosticID.try_emplace(ID, CheckName); return DiagEngine->Report(Loc, ID); @@ -186,7 +186,7 @@ DiagnosticBuilder ClangTidyContext::diag( DiagnosticBuilder ClangTidyContext::diag( StringRef CheckName, StringRef Description, DiagnosticIDs::Level Level /* = DiagnosticIDs::Warning*/) { - unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( + const unsigned ID = DiagEngine->getDiagnosticIDs()->getCustomDiagID( Level, (Description + " [" + CheckName + "]").str()); CheckNamesByDiagnosticID.try_emplace(ID, CheckName); return DiagEngine->Report(ID); @@ -195,10 +195,11 @@ DiagnosticBuilder ClangTidyContext::diag( DiagnosticBuilder ClangTidyContext::diag(const tooling::Diagnostic &Error) { SourceManager &SM = DiagEngine->getSourceManager(); FileManager &FM = SM.getFileManager(); - FileEntryRef File = llvm::cantFail(FM.getFileRef(Error.Message.FilePath)); - FileID ID = SM.getOrCreateFileID(File, SrcMgr::C_User); - SourceLocation FileStartLoc = SM.getLocForStartOfFile(ID); - SourceLocation Loc = FileStartLoc.getLocWithOffset( + const FileEntryRef File = + llvm::cantFail(FM.getFileRef(Error.Message.FilePath)); + const FileID ID = SM.getOrCreateFileID(File, SrcMgr::C_User); + const SourceLocation FileStartLoc = SM.getLocForStartOfFile(ID); + const SourceLocation Loc = 
FileStartLoc.getLocWithOffset(
      static_cast<int>(Error.Message.FileOffset));
  return diag(Error.DiagnosticName, Loc, Error.Message.Message,
              static_cast<DiagnosticIDs::Level>(Error.DiagLevel));
@@ -214,7 +215,7 @@ bool ClangTidyContext::shouldSuppressDiagnostic(
    DiagnosticsEngine::Level DiagLevel, const Diagnostic &Info,
    SmallVectorImpl<tooling::Diagnostic> &NoLintErrors, bool AllowIO,
    bool EnableNoLintBlocks) {
-  std::string CheckName = getCheckName(Info.getID());
+  const std::string CheckName = getCheckName(Info.getID());
  return NoLintHandler.shouldSuppress(DiagLevel, Info, CheckName, NoLintErrors,
                                      AllowIO, EnableNoLintBlocks);
}
@@ -226,7 +227,7 @@ void ClangTidyContext::setSourceManager(SourceManager *SourceMgr) {
 static bool parseFileExtensions(llvm::ArrayRef<std::string> AllFileExtensions,
                                FileExtensionsSet &FileExtensions) {
  FileExtensions.clear();
-  for (StringRef Suffix : AllFileExtensions) {
+  for (const StringRef Suffix : AllFileExtensions) {
    StringRef Extension = Suffix.trim();
    if (!llvm::all_of(Extension, isAlphanumeric))
      return false;
@@ -294,11 +295,11 @@ bool ClangTidyContext::treatAsError(StringRef CheckName) const {
}
 std::string ClangTidyContext::getCheckName(unsigned DiagnosticID) const {
-  std::string ClangWarningOption = std::string(
+  const std::string ClangWarningOption = std::string(
      DiagEngine->getDiagnosticIDs()->getWarningOptionForDiag(DiagnosticID));
  if (!ClangWarningOption.empty())
    return "clang-diagnostic-" + ClangWarningOption;
-  llvm::DenseMap<unsigned, std::string>::const_iterator I =
+  const llvm::DenseMap<unsigned, std::string>::const_iterator I =
      CheckNamesByDiagnosticID.find(DiagnosticID);
  if (I != CheckNamesByDiagnosticID.end())
    return I->second;
@@ -316,7 +317,7 @@ ClangTidyDiagnosticConsumer::ClangTidyDiagnosticConsumer(
 void ClangTidyDiagnosticConsumer::finalizeLastError() {
  if (!Errors.empty()) {
-    ClangTidyError &Error = Errors.back();
+    const ClangTidyError &Error = Errors.back();
    if (Error.DiagnosticName == "clang-tidy-config") {
      // Never ignore these.
    } else if (!Context.isCheckEnabled(Error.DiagnosticName) &&
@@ -436,8 +437,8 @@ void ClangTidyDiagnosticConsumer::HandleDiagnostic(
      Level = ClangTidyError::Remark;
  }
-  bool IsWarningAsError = DiagLevel == DiagnosticsEngine::Warning &&
-                          Context.treatAsError(CheckName);
+  const bool IsWarningAsError = DiagLevel == DiagnosticsEngine::Warning &&
+                                Context.treatAsError(CheckName);
  Errors.emplace_back(CheckName, Level, Context.getCurrentBuildDirectory(),
                      IsWarningAsError);
}
@@ -491,8 +492,9 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) {
  // Acquire a diagnostic ID also in the external diagnostics engine.
  auto DiagLevelAndFormatString =
      Context.getDiagLevelAndFormatString(Info.getID(), Info.getLocation());
-  unsigned ExternalID = ExternalDiagEngine->getDiagnosticIDs()->getCustomDiagID(
-      DiagLevelAndFormatString.first, DiagLevelAndFormatString.second);
+  const unsigned ExternalID =
+      ExternalDiagEngine->getDiagnosticIDs()->getCustomDiagID(
+          DiagLevelAndFormatString.first, DiagLevelAndFormatString.second);
  // Forward the details.
auto Builder = ExternalDiagEngine->Report(Info.getLocation(), ExternalID); @@ -501,7 +503,7 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) { for (auto Range : Info.getRanges()) Builder << Range; for (unsigned Index = 0; Index < Info.getNumArgs(); ++Index) { - DiagnosticsEngine::ArgumentKind Kind = Info.getArgKind(Index); + const DiagnosticsEngine::ArgumentKind Kind = Info.getArgKind(Index); switch (Kind) { case clang::DiagnosticsEngine::ak_std_string: Builder << Info.getArgStdStr(Index); @@ -574,7 +576,7 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, // FIXME: We start with a conservative approach here, but the actual type of // location needed depends on the check (in particular, where this check wants // to apply fixes). - FileID FID = Sources.getDecomposedExpansionLoc(Location).first; + const FileID FID = Sources.getDecomposedExpansionLoc(Location).first; OptionalFileEntryRef File = Sources.getFileEntryRefForID(FID); // -DMACRO definitions on the command line have locations in a virtual buffer @@ -585,13 +587,13 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, return; } - StringRef FileName(File->getName()); + const StringRef FileName(File->getName()); LastErrorRelatesToUserCode = LastErrorRelatesToUserCode || Sources.isInMainFile(Location) || (getHeaderFilter()->match(FileName) && !getExcludeHeaderFilter()->match(FileName)); - unsigned LineNumber = Sources.getExpansionLineNumber(Location); + const unsigned LineNumber = Sources.getExpansionLineNumber(Location); LastErrorPassesLineFilter = LastErrorPassesLineFilter || passesLineFilter(FileName, LineNumber); } @@ -707,8 +709,8 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { for (unsigned I = 0; I < ErrorFixes.size(); ++I) { for (const auto &FileAndReplace : *ErrorFixes[I].second) { for (const auto &Replace : FileAndReplace.second) { - unsigned Begin = Replace.getOffset(); - unsigned End = Begin + Replace.getLength(); + const unsigned Begin = Replace.getOffset(); + const unsigned End = Begin + Replace.getLength(); auto &Events = FileEvents[Replace.getFilePath()]; if (Begin == End) { Events.emplace_back(Begin, End, Event::ET_Insert, I, Sizes[I]); @@ -767,7 +769,7 @@ struct LessClangTidyError { }; struct EqualClangTidyError { bool operator()(const ClangTidyError &LHS, const ClangTidyError &RHS) const { - LessClangTidyError Less; + const LessClangTidyError Less; return !Less(LHS, RHS) && !Less(RHS, LHS); } }; @@ -803,7 +805,7 @@ void ClangTidyDiagnosticConsumer::removeDuplicatedDiagnosticsOfAliasCheckers() { auto IT = Errors.begin(); while (IT != Errors.end()) { ClangTidyError &Error = *IT; - std::pair Inserted = + const std::pair Inserted = UniqueErrors.insert(&Error); // Unique error, we keep it and move along. 
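The change pattern applied throughout this patch, here and in the files that follow, is one mechanical transformation: a function-local variable that is initialized once and never reassigned gains a `const` qualifier. A minimal self-contained sketch of the before/after shape, for illustration only (the helper `isCoreCheck` and its strings are hypothetical, not code from this patch):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/Twine.h"
  #include <string>

  static bool isCoreCheck(llvm::StringRef CheckName) {
    // Before the pattern, 'Qualified' would be a plain std::string. It is
    // computed once and only read afterwards, so it can be 'const': the
    // compiler now rejects accidental reassignment, and the reader knows the
    // value is fixed for the rest of the function. Behavior is unchanged.
    const std::string Qualified =
        (llvm::Twine("clang-analyzer-") + CheckName).str();
    return llvm::StringRef(Qualified).starts_with("clang-analyzer-core");
  }

Note that the patch deliberately leaves locals that are later mutated or moved from non-const; for example, `ModuleExpander` above stays without `const` because it is consumed by `std::move`.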
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index c4b47a440e44b..550f7809d75f9 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -119,7 +119,7 @@ void yamlize(IO &IO, ClangTidyOptions::OptionMap &Val, bool, yamlize(IO, NOpts->Options, true, Ctx); } else if (isa(I.getCurrentNode())) { IO.beginMapping(); - for (StringRef Key : IO.keys()) { + for (const StringRef Key : IO.keys()) { // Requires 'llvm::yaml::IO' to accept 'StringRef' // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) IO.mapRequired(Key.data(), Val[Key].Value); @@ -392,7 +392,7 @@ llvm::ErrorOr> FileOptionsBaseProvider::getNormalizedAbsolutePath(llvm::StringRef Path) { assert(FS && "FS must be set."); llvm::SmallString<128> NormalizedAbsolutePath = {Path}; - std::error_code Err = FS->makeAbsolute(NormalizedAbsolutePath); + const std::error_code Err = FS->makeAbsolute(NormalizedAbsolutePath); if (Err) return Err; llvm::sys::path::remove_dots(NormalizedAbsolutePath, /*remove_dot_dot=*/true); @@ -463,7 +463,7 @@ FileOptionsProvider::getRawOptions(StringRef FileName) { LLVM_DEBUG(llvm::dbgs() << "Getting options for file " << FileName << "...\n"); - llvm::ErrorOr> AbsoluteFilePath = + const llvm::ErrorOr> AbsoluteFilePath = getNormalizedAbsolutePath(FileName); if (!AbsoluteFilePath) return {}; @@ -471,8 +471,8 @@ FileOptionsProvider::getRawOptions(StringRef FileName) { std::vector RawOptions = DefaultOptionsProvider::getRawOptions(AbsoluteFilePath->str()); addRawFileOptions(AbsoluteFilePath->str(), RawOptions); - OptionsSource CommandLineOptions(OverrideOptions, - OptionsSourceTypeCheckCommandLineOption); + const OptionsSource CommandLineOptions( + OverrideOptions, OptionsSourceTypeCheckCommandLineOption); RawOptions.push_back(CommandLineOptions); return RawOptions; @@ -502,7 +502,7 @@ FileOptionsBaseProvider::tryReadConfigFile(StringRef Directory) { llvm::ErrorOr> Text = FS->getBufferForFile(ConfigFile); - if (std::error_code EC = Text.getError()) { + if (const std::error_code EC = Text.getError()) { llvm::errs() << "Can't read " << ConfigFile << ": " << EC.message() << "\n"; continue; diff --git a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp index 8ea6b76819804..6fee154be448c 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyProfiling.cpp @@ -59,7 +59,8 @@ void ClangTidyProfiling::storeProfileData(llvm::TimerGroup &TG) { llvm::SmallString<256> OutputDirectory(Storage->StoreFilename); llvm::sys::path::remove_filename(OutputDirectory); - if (std::error_code EC = llvm::sys::fs::create_directories(OutputDirectory)) { + if (const std::error_code EC = + llvm::sys::fs::create_directories(OutputDirectory)) { llvm::errs() << "Unable to create output directory '" << OutputDirectory << "': " << EC.message() << "\n"; return; diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index 487e5e299d132..9a4fc7a30b472 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -65,8 +65,7 @@ class ExpandModularHeadersPPCallbacks::FileRecorder { }; ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks( - CompilerInstance *CI, - IntrusiveRefCntPtr OverlayFS) + CompilerInstance *CI, 
llvm::vfs::OverlayFileSystem &OverlayFS)
    : Recorder(std::make_unique<FileRecorder>()), Compiler(*CI),
      InMemoryFs(new llvm::vfs::InMemoryFileSystem),
      Sources(Compiler.getSourceManager()),
@@ -76,7 +75,7 @@ ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks(
      LangOpts(Compiler.getLangOpts()), HSOpts(Compiler.getHeaderSearchOpts()) {
  // Add a FileSystem containing the extra files needed in place of modular
  // headers.
-  OverlayFS->pushOverlay(InMemoryFs);
+  OverlayFS.pushOverlay(InMemoryFs);
  Diags.setSourceManager(&Sources);
  // FIXME: Investigate whether there is a better way to initialize DiagEngine
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
index aaa04107a11ec..95216368492ca 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
-#define LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PPCallbacks.h"
@@ -41,9 +41,8 @@ namespace tooling {
 /// non-modular way.
 class ExpandModularHeadersPPCallbacks : public PPCallbacks {
 public:
-  ExpandModularHeadersPPCallbacks(
-      CompilerInstance *CI,
-      IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFS);
+  ExpandModularHeadersPPCallbacks(CompilerInstance *CI,
+                                  llvm::vfs::OverlayFileSystem &OverlayFS);
  ~ExpandModularHeadersPPCallbacks() override;
  /// Returns the preprocessor that provides callbacks for the whole
@@ -144,4 +143,4 @@ class ExpandModularHeadersPPCallbacks : public PPCallbacks {
 } // namespace tooling
 } // namespace clang
-#endif // LLVM_CLANG_TOOLING_EXPANDMODULARHEADERSPPCALLBACKS_H_
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPANDMODULARHEADERSPPCALLBACKS_H
diff --git a/clang-tools-extra/clang-tidy/FileExtensionsSet.h b/clang-tools-extra/clang-tidy/FileExtensionsSet.h
index 95c221c84da2e..f97bb64ff946e 100644
--- a/clang-tools-extra/clang-tidy/FileExtensionsSet.h
+++ b/clang-tools-extra/clang-tidy/FileExtensionsSet.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
@@ -16,4 +16,4 @@ namespace clang::tidy {
 using FileExtensionsSet = llvm::SmallSet<StringRef, 5>;
 } // namespace clang::tidy
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILE_EXTENSIONS_SET_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FILEEXTENSIONSSET_H
diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp
index 667a25657a4c9..5d5a5f2d8d865 100644
--- a/clang-tools-extra/clang-tidy/GlobList.cpp
+++ b/clang-tools-extra/clang-tidy/GlobList.cpp
@@ -23,16 +23,17 @@ static bool consumeNegativeIndicator(StringRef &GlobList) {
 // removes it and the trailing comma from the GlobList and
 // returns the extracted glob.
static llvm::StringRef extractNextGlob(StringRef &GlobList) { - StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n")); - StringRef Glob = UntrimmedGlob.trim(); + const StringRef UntrimmedGlob = + GlobList.substr(0, GlobList.find_first_of(",\n")); + const StringRef Glob = UntrimmedGlob.trim(); GlobList = GlobList.substr(UntrimmedGlob.size() + 1); return Glob; } static llvm::Regex createRegexFromGlob(StringRef &Glob) { SmallString<128> RegexText("^"); - StringRef MetaChars("()^$|*+?.[]\\{}"); - for (char C : Glob) { + const StringRef MetaChars("()^$|*+?.[]\\{}"); + for (const char C : Glob) { if (C == '*') RegexText.push_back('.'); else if (MetaChars.contains(C)) diff --git a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp index ef20ee18347df..a74ab008fa7c7 100644 --- a/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp +++ b/clang-tools-extra/clang-tidy/NoLintDirectiveHandler.cpp @@ -134,7 +134,7 @@ static SmallVector getNoLints(StringRef Buffer) { // Get checks, if specified. std::optional Checks; if (Pos < Buffer.size() && Buffer[Pos] == '(') { - size_t ClosingBracket = Buffer.find_first_of("\n)", ++Pos); + const size_t ClosingBracket = Buffer.find_first_of("\n)", ++Pos); if (ClosingBracket != StringRef::npos && Buffer[ClosingBracket] == ')') { Checks = Buffer.slice(Pos, ClosingBracket); Pos = ClosingBracket + 1; @@ -183,13 +183,13 @@ static tooling::Diagnostic makeNoLintError(const SourceManager &SrcMgr, tooling::Diagnostic Error; Error.DiagLevel = tooling::Diagnostic::Error; Error.DiagnosticName = "clang-tidy-nolint"; - StringRef Message = + const StringRef Message = (NoLint.Type == NoLintType::NoLintBegin) ? ("unmatched 'NOLINTBEGIN' comment without a subsequent 'NOLINT" "END' comment") : ("unmatched 'NOLINTEND' comment without a previous 'NOLINT" "BEGIN' comment"); - SourceLocation Loc = SrcMgr.getComposedLoc(File, NoLint.Pos); + const SourceLocation Loc = SrcMgr.getComposedLoc(File, NoLint.Pos); Error.Message = tooling::DiagnosticMessage(Message, SrcMgr, Loc); return Error; } @@ -294,8 +294,8 @@ bool NoLintDirectiveHandler::Impl::diagHasNoLintInMacro( // this line. 
static std::pair getLineStartAndEnd(StringRef Buffer, size_t From) { - size_t StartPos = Buffer.find_last_of('\n', From) + 1; - size_t EndPos = std::min(Buffer.find('\n', From), Buffer.size()); + const size_t StartPos = Buffer.find_last_of('\n', From) + 1; + const size_t EndPos = std::min(Buffer.find('\n', From), Buffer.size()); return {StartPos, EndPos}; } diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h index 2ae3c00f7ee3e..982774ca4db5b 100644 --- a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h +++ b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H + #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include @@ -31,7 +34,7 @@ AST_POLYMORPHIC_MATCHER( isInAbseilFile, AST_POLYMORPHIC_SUPPORTED_TYPES(Decl, Stmt, TypeLoc, NestedNameSpecifierLoc)) { auto &SourceManager = Finder->getASTContext().getSourceManager(); - SourceLocation Loc = SourceManager.getSpellingLoc(Node.getBeginLoc()); + const SourceLocation Loc = SourceManager.getSpellingLoc(Node.getBeginLoc()); if (Loc.isInvalid()) return false; OptionalFileEntryRef FileEntry = @@ -42,7 +45,7 @@ AST_POLYMORPHIC_MATCHER( // [absl-library] is AbseilLibraries list entry. StringRef Path = FileEntry->getName(); static constexpr llvm::StringLiteral AbslPrefix("absl/"); - size_t PrefixPosition = Path.find(AbslPrefix); + const size_t PrefixPosition = Path.find(AbslPrefix); if (PrefixPosition == StringRef::npos) return false; Path = Path.drop_front(PrefixPosition + AbslPrefix.size()); @@ -57,3 +60,5 @@ AST_POLYMORPHIC_MATCHER( } } // namespace clang::ast_matchers + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_ABSEILMATCHER_H diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp index 03f78f1c96252..421e5973d4fe0 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.cpp @@ -41,7 +41,7 @@ void DurationAdditionCheck::check(const MatchFinder::MatchResult &Result) { if (!Scale) return; - llvm::StringRef TimeFactory = getTimeInverseForScale(*Scale); + const llvm::StringRef TimeFactory = getTimeInverseForScale(*Scale); FixItHint Hint; if (Call == Binop->getLHS()->IgnoreParenImpCasts()) { diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h index b728118c3da03..f5bab53035f87 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class DurationAdditionCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEADDITIONCHECK_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONADDITIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp index 16a244b7e9997..f00877754f952 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.cpp @@ -38,9 +38,9 @@ void DurationComparisonCheck::check(const MatchFinder::MatchResult &Result) { // if nothing needs to be done. if (isInMacro(Result, Binop->getLHS()) || isInMacro(Result, Binop->getRHS())) return; - std::string LhsReplacement = + const std::string LhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getLHS()); - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getRHS()); diag(Binop->getBeginLoc(), "perform comparison in the duration domain") diff --git a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp index 11d6017c22e9d..ef06a9e2ba572 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.cpp @@ -41,7 +41,7 @@ void DurationConversionCastCheck::check( const auto *FuncDecl = Result.Nodes.getNodeAs("func_decl"); const auto *Arg = Result.Nodes.getNodeAs("arg"); - StringRef ConversionFuncName = FuncDecl->getName(); + const StringRef ConversionFuncName = FuncDecl->getName(); std::optional Scale = getScaleForDurationInverse(ConversionFuncName); @@ -51,7 +51,8 @@ void DurationConversionCastCheck::check( // Casting a double to an integer. if (MatchedCast->getTypeAsWritten()->isIntegerType() && ConversionFuncName.contains("Double")) { - llvm::StringRef NewFuncName = getDurationInverseForScale(*Scale).second; + const llvm::StringRef NewFuncName = + getDurationInverseForScale(*Scale).second; diag(MatchedCast->getBeginLoc(), "duration should be converted directly to an integer rather than " @@ -66,7 +67,8 @@ void DurationConversionCastCheck::check( // Casting an integer to a double. 
if (MatchedCast->getTypeAsWritten()->isRealFloatingType() && ConversionFuncName.contains("Int64")) { - llvm::StringRef NewFuncName = getDurationInverseForScale(*Scale).first; + const llvm::StringRef NewFuncName = + getDurationInverseForScale(*Scale).first; diag(MatchedCast->getBeginLoc(), "duration should be converted directly to " "a floating-point number rather than " diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp index 334629767aff2..9e403fb8be3dd 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.cpp @@ -158,7 +158,7 @@ void DurationFactoryScaleCheck::check(const MatchFinder::MatchResult &Result) { if (!MaybeScale) return; - DurationScale Scale = *MaybeScale; + const DurationScale Scale = *MaybeScale; const Expr *Remainder = nullptr; std::optional NewScale; diff --git a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp index ee1979658aaed..a78d07d2e5861 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationRewriter.cpp @@ -20,7 +20,7 @@ namespace clang::tidy::abseil { /// Returns an integer if the fractional part of a `FloatingLiteral` is `0`. static std::optional truncateIfIntegral(const FloatingLiteral &FloatLiteral) { - double Value = FloatLiteral.getValueAsApproximateDouble(); + const double Value = FloatLiteral.getValueAsApproximateDouble(); if (std::fmod(Value, 1) == 0) { if (Value >= static_cast(1U << 31)) return std::nullopt; @@ -69,7 +69,7 @@ rewriteInverseDurationCall(const MatchFinder::MatchResult &Result, static std::optional rewriteInverseTimeCall(const MatchFinder::MatchResult &Result, DurationScale Scale, const Expr &Node) { - llvm::StringRef InverseFunction = getTimeInverseForScale(Scale); + const llvm::StringRef InverseFunction = getTimeInverseForScale(Scale); if (const auto *MaybeCallArg = selectFirst( "e", match(callExpr(callee(functionDecl(hasName(InverseFunction))), hasArgument(0, expr().bind("e"))), diff --git a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp index c5d93ad51ad17..42a7df496f6ad 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.cpp @@ -41,7 +41,7 @@ void DurationSubtractionCheck::check(const MatchFinder::MatchResult &Result) { if (!Scale) return; - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToDuration(Result, *Scale, Binop->getRHS()); const Expr *LhsArg = Result.Nodes.getNodeAs("lhs_arg"); diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp index 805d7dacd4eec..5867fb630315d 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.cpp @@ -19,10 +19,10 @@ namespace clang::tidy::abseil { void DurationUnnecessaryConversionCheck::registerMatchers(MatchFinder *Finder) { for (const auto &Scale : {"Hours", "Minutes", "Seconds", "Milliseconds", "Microseconds", "Nanoseconds"}) { - std::string DurationFactory = (llvm::Twine("::absl::") + Scale).str(); - std::string FloatConversion = + const std::string 
DurationFactory = (llvm::Twine("::absl::") + Scale).str(); + const std::string FloatConversion = (llvm::Twine("::absl::ToDouble") + Scale).str(); - std::string IntegerConversion = + const std::string IntegerConversion = (llvm::Twine("::absl::ToInt64") + Scale).str(); // Matcher which matches the current scale's factory with a `1` argument, diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h index 59af8968e8b38..f5d25116b5bc1 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class DurationUnnecessaryConversionCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMEDOUBLECONVERSIONCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_DURATIONUNNECESSARYCONVERSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp index d9f6551739d9e..0827526ba3b5d 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp @@ -29,7 +29,7 @@ makeCharacterLiteral(const StringLiteral *Literal, const ASTContext &Context) { assert(Literal->getCharByteWidth() == 1 && "StrSplit doesn't support wide char"); std::string Result = clang::tooling::fixit::getText(*Literal, Context).str(); - bool IsRawStringLiteral = StringRef(Result).starts_with(R"(R")"); + const bool IsRawStringLiteral = StringRef(Result).starts_with(R"(R")"); // Since raw string literal might contain unescaped non-printable characters, // we normalize them using `StringLiteral::outputString`. 
if (IsRawStringLiteral) { diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp index c090e5ac54222..5f4cb66424700 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.cpp @@ -32,7 +32,7 @@ void NoInternalDependenciesCheck::check( const auto *InternalDependency = Result.Nodes.getNodeAs("InternalDep"); - SourceLocation LocAtFault = + const SourceLocation LocAtFault = Result.SourceManager->getSpellingLoc(InternalDependency->getBeginLoc()); if (!LocAtFault.isValid()) diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h index 2911a1ad14ae8..22918311398f1 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class NoInternalDependenciesCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_NOINTERNALDEPENDENCIESCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp index 92d63057caf65..e1063c4f8a46e 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp @@ -92,7 +92,7 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { const auto *FindFun = Result.Nodes.getNodeAs("findfun"); assert(FindFun != nullptr); - bool Rev = FindFun->getName().contains("rfind"); + const bool Rev = FindFun->getName().contains("rfind"); if (ComparisonExpr->getBeginLoc().isMacroID()) return; @@ -107,7 +107,7 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { Context.getLangOpts()); // Create the StartsWith string, negating if comparison was "!=". - bool Neg = ComparisonExpr->getOpcode() == BO_NE; + const bool Neg = ComparisonExpr->getOpcode() == BO_NE; // Create the warning message and a FixIt hint replacing the original expr. auto Diagnostic = diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp index 7a97a1895ad02..5d80b16239838 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.cpp @@ -39,9 +39,9 @@ void TimeComparisonCheck::check(const MatchFinder::MatchResult &Result) { // want to handle the case of rewriting both sides. This is much simpler if // we unconditionally try and rewrite both, and let the rewriter determine // if nothing needs to be done. 
- std::string LhsReplacement = + const std::string LhsReplacement = rewriteExprFromNumberToTime(Result, *Scale, Binop->getLHS()); - std::string RhsReplacement = + const std::string RhsReplacement = rewriteExprFromNumberToTime(Result, *Scale, Binop->getRHS()); diag(Binop->getBeginLoc(), "perform comparison in the time domain") diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h index 703d9514e8c07..74a877a84d9e8 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class TimeComparisonCheck : public ClangTidyCheck { } // namespace clang::tidy::abseil -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_TIMECOMPARISONCHECK_H diff --git a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp index 228d974cd5e23..4ae49d285930d 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.cpp @@ -93,7 +93,7 @@ void TimeSubtractionCheck::emitDiagnostic(const Expr *Node, void TimeSubtractionCheck::registerMatchers(MatchFinder *Finder) { for (const char *ScaleName : {"Hours", "Minutes", "Seconds", "Millis", "Micros", "Nanos"}) { - std::string TimeInverse = (llvm::Twine("ToUnix") + ScaleName).str(); + const std::string TimeInverse = (llvm::Twine("ToUnix") + ScaleName).str(); std::optional Scale = getScaleForTimeInverse(TimeInverse); assert(Scale && "Unknown scale encountered"); @@ -127,7 +127,7 @@ void TimeSubtractionCheck::registerMatchers(MatchFinder *Finder) { void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { const auto *BinOp = Result.Nodes.getNodeAs("binop"); - std::string InverseName = + const std::string InverseName = Result.Nodes.getNodeAs("func_decl")->getNameAsString(); if (insideMacroDefinition(Result, BinOp->getSourceRange())) return; @@ -144,7 +144,7 @@ void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { // We're working with the first case of matcher, and need to replace the // entire 'Duration' factory call. (Which also means being careful about // our order-of-operations and optionally putting in some parenthesis. - bool NeedParens = parensRequired(Result, OuterCall); + const bool NeedParens = parensRequired(Result, OuterCall); emitDiagnostic( OuterCall, @@ -169,7 +169,7 @@ void TimeSubtractionCheck::check(const MatchFinder::MatchResult &Result) { // converts it from the inverse to a Duration. In this case, we replace // the outer with just the subtraction expression, which gives the right // type and scale, taking care again about parenthesis. 
- bool NeedParens = parensRequired(Result, MaybeCallArg); + const bool NeedParens = parensRequired(Result, MaybeCallArg); emitDiagnostic( MaybeCallArg, diff --git a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp index 8b197e5b939e7..1a6ff30fc8d96 100644 --- a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.cpp @@ -117,10 +117,10 @@ void UpgradeDurationConversionsCheck::check( "implicit conversion to 'int64_t' is deprecated in this context; use an " "explicit cast instead"; - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); const auto *ArgExpr = Result.Nodes.getNodeAs("arg"); - SourceLocation Loc = ArgExpr->getBeginLoc(); + const SourceLocation Loc = ArgExpr->getBeginLoc(); const auto *OuterExpr = Result.Nodes.getNodeAs("OuterExpr"); @@ -139,13 +139,13 @@ void UpgradeDurationConversionsCheck::check( // We gather source locations from template matches not in template // instantiations for future matches. - internal::Matcher IsInsideTemplate = + const internal::Matcher IsInsideTemplate = hasAncestor(decl(anyOf(classTemplateDecl(), functionTemplateDecl()))); if (!match(IsInsideTemplate, *ArgExpr, *Result.Context).empty()) MatchedTemplateLocations.insert(Loc); - DiagnosticBuilder Diag = diag(Loc, Message); - CharSourceRange SourceRange = Lexer::makeFileCharRange( + const DiagnosticBuilder Diag = diag(Loc, Message); + const CharSourceRange SourceRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), *Result.SourceManager, Result.Context->getLangOpts()); if (SourceRange.isInvalid()) diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp index 49ba17ce643fe..519d90914580f 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.cpp @@ -76,7 +76,7 @@ void IdDependentBackwardBranchCheck::registerMatchers(MatchFinder *Finder) { this); } -IdDependentBackwardBranchCheck::IdDependencyRecord * +const IdDependentBackwardBranchCheck::IdDependencyRecord * IdDependentBackwardBranchCheck::hasIdDepVar(const Expr *Expression) { if (!Expression) return nullptr; @@ -94,12 +94,12 @@ IdDependentBackwardBranchCheck::hasIdDepVar(const Expr *Expression) { } for (const auto *Child : Expression->children()) if (const auto *ChildExpression = dyn_cast_if_present(Child)) - if (IdDependencyRecord *Result = hasIdDepVar(ChildExpression)) + if (const IdDependencyRecord *Result = hasIdDepVar(ChildExpression)) return Result; return nullptr; } -IdDependentBackwardBranchCheck::IdDependencyRecord * +const IdDependentBackwardBranchCheck::IdDependencyRecord * IdDependentBackwardBranchCheck::hasIdDepField(const Expr *Expression) { if (!Expression) return nullptr; @@ -116,7 +116,7 @@ IdDependentBackwardBranchCheck::hasIdDepField(const Expr *Expression) { } for (const auto *Child : Expression->children()) if (const auto *ChildExpression = dyn_cast_if_present(Child)) - if (IdDependencyRecord *Result = hasIdDepField(ChildExpression)) + if (const IdDependencyRecord *Result = hasIdDepField(ChildExpression)) return Result; return nullptr; } @@ -239,7 +239,7 @@ void IdDependentBackwardBranchCheck::check( const auto *Loop = 
Result.Nodes.getNodeAs("backward_branch"); if (!Loop) return; - LoopType Type = getLoopType(Loop); + const LoopType Type = getLoopType(Loop); if (CondExpr) { if (IDCall) { // Conditional expression calls an ID function directly. diag(CondExpr->getBeginLoc(), @@ -249,8 +249,8 @@ void IdDependentBackwardBranchCheck::check( return; } // Conditional expression has DeclRefExpr(s), check ID-dependency. - IdDependencyRecord *IdDepVar = hasIdDepVar(CondExpr); - IdDependencyRecord *IdDepField = hasIdDepField(CondExpr); + const IdDependencyRecord *IdDepVar = hasIdDepVar(CondExpr); + const IdDependencyRecord *IdDepField = hasIdDepField(CondExpr); if (IdDepVar) { diag(CondExpr->getBeginLoc(), "backward branch (%select{do|while|for}0 loop) is ID-dependent due " diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h index b777918ab7e7b..297e7751e4f49 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h @@ -44,10 +44,10 @@ class IdDependentBackwardBranchCheck : public ClangTidyCheck { std::map IdDepFieldsMap; /// Returns an IdDependencyRecord if the Expression contains an ID-dependent /// variable, returns a nullptr otherwise. - IdDependencyRecord *hasIdDepVar(const Expr *Expression); + const IdDependencyRecord *hasIdDepVar(const Expr *Expression); /// Returns an IdDependencyRecord if the Expression contains an ID-dependent /// field, returns a nullptr otherwise. - IdDependencyRecord *hasIdDepField(const Expr *Expression); + const IdDependencyRecord *hasIdDepField(const Expr *Expression); /// Stores the location an ID-dependent variable is created from a call to /// an ID function in IdDepVarsMap. void saveIdDepVar(const Stmt *Statement, const VarDecl *Variable); diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp index 4c740e31ae7be..8aa23b8fc7b11 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.cpp @@ -77,7 +77,7 @@ void KernelNameRestrictionPPCallbacks::EndOfMainFile() { // Check main file for restricted names. OptionalFileEntryRef Entry = SM.getFileEntryRefForID(SM.getMainFileID()); - StringRef FileName = llvm::sys::path::filename(Entry->getName()); + const StringRef FileName = llvm::sys::path::filename(Entry->getName()); if (fileNameIsRestricted(FileName)) Check.diag(SM.getLocForStartOfFile(SM.getMainFileID()), "compiling '%0' may cause additional compilation errors due " @@ -90,7 +90,7 @@ void KernelNameRestrictionPPCallbacks::EndOfMainFile() { // Check included files for restricted names. 
for (const IncludeDirective &ID : IncludeDirectives) { - StringRef FileName = llvm::sys::path::filename(ID.FileName); + const StringRef FileName = llvm::sys::path::filename(ID.FileName); if (fileNameIsRestricted(FileName)) Check.diag(ID.Loc, "including '%0' may cause additional compilation errors due " diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h index 182d10b5539e5..441cf36446c9a 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class KernelNameRestrictionCheck : public ClangTidyCheck { } // namespace clang::tidy::altera -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNEL_NAME_RESTRICTION_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_KERNELNAMERESTRICTIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h index dab3dbce50371..dcfefcb0a1b29 100644 --- a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h +++ b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class SingleWorkItemBarrierCheck : public ClangTidyCheck { } // namespace clang::tidy::altera -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLE_WORK_ITEM_BARRIER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ALTERA_SINGLEWORKITEMBARRIERCHECK_H diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp index 0a19378949f46..2b1312c8967da 100644 --- a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.cpp @@ -60,10 +60,10 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { // For each StructField, record how big it is (in bits). // Would be good to use a pair of to advise a better // packing order. 
- QualType StructFieldTy = StructField->getType(); + const QualType StructFieldTy = StructField->getType(); if (StructFieldTy->isIncompleteType()) return; - unsigned int StructFieldWidth = + const unsigned int StructFieldWidth = (unsigned int)Result.Context->getTypeInfo(StructFieldTy.getTypePtr()) .Width; FieldSizes.emplace_back(StructFieldWidth, StructField->getFieldIndex()); @@ -72,21 +72,22 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { TotalBitSize += StructFieldWidth; } - uint64_t CharSize = Result.Context->getCharWidth(); - CharUnits CurrSize = Result.Context->getASTRecordLayout(Struct).getSize(); - CharUnits MinByteSize = + const uint64_t CharSize = Result.Context->getCharWidth(); + const CharUnits CurrSize = + Result.Context->getASTRecordLayout(Struct).getSize(); + const CharUnits MinByteSize = CharUnits::fromQuantity(std::max( std::ceil(static_cast(TotalBitSize) / CharSize), 1)); - CharUnits MaxAlign = CharUnits::fromQuantity( + const CharUnits MaxAlign = CharUnits::fromQuantity( std::ceil((float)Struct->getMaxAlignment() / CharSize)); - CharUnits CurrAlign = + const CharUnits CurrAlign = Result.Context->getASTRecordLayout(Struct).getAlignment(); - CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); + const CharUnits NewAlign = computeRecommendedAlignment(MinByteSize); - bool IsPacked = Struct->hasAttr(); - bool NeedsPacking = (MinByteSize < CurrSize) && (MaxAlign != NewAlign) && - (CurrSize != NewAlign); - bool NeedsAlignment = CurrAlign.getQuantity() != NewAlign.getQuantity(); + const bool IsPacked = Struct->hasAttr(); + const bool NeedsPacking = (MinByteSize < CurrSize) && + (MaxAlign != NewAlign) && (CurrSize != NewAlign); + const bool NeedsAlignment = CurrAlign.getQuantity() != NewAlign.getQuantity(); if (!NeedsAlignment && !NeedsPacking) return; @@ -111,7 +112,8 @@ void StructPackAlignCheck::check(const MatchFinder::MatchResult &Result) { FixItHint FixIt; auto *Attribute = Struct->getAttr(); - std::string NewAlignQuantity = std::to_string((int)NewAlign.getQuantity()); + const std::string NewAlignQuantity = + std::to_string((int)NewAlign.getQuantity()); if (Attribute) { FixIt = FixItHint::CreateReplacement( Attribute->getRange(), diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp index e90cdd00eb1fe..b0cd4cdb41e91 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp @@ -127,7 +127,7 @@ bool UnrollLoopsCheck::hasKnownBounds(const Stmt *Statement, if (const auto *InitDeclStatement = dyn_cast(Initializer)) { if (const auto *VariableDecl = dyn_cast(InitDeclStatement->getSingleDecl())) { - APValue *Evaluation = VariableDecl->evaluateValue(); + const APValue *Evaluation = VariableDecl->evaluateValue(); if (!Evaluation || !Evaluation->hasValue()) return false; } diff --git a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h index 02c4e0056ea15..5637fc8bb350e 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H +#define 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecAccept4Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT4_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPT4CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp index 9cd888cca023b..a624523b18137 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.cpp @@ -26,10 +26,11 @@ void CloexecAcceptCheck::registerMatchers(MatchFinder *Finder) { } void CloexecAcceptCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = (Twine("accept4(") + getSpellingArg(Result, 0) + - ", " + getSpellingArg(Result, 1) + ", " + - getSpellingArg(Result, 2) + ", SOCK_CLOEXEC)") - .str(); + const std::string ReplacementText = + (Twine("accept4(") + getSpellingArg(Result, 0) + ", " + + getSpellingArg(Result, 1) + ", " + getSpellingArg(Result, 2) + + ", SOCK_CLOEXEC)") + .str(); replaceFunc( Result, diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h index 4540f938fd478..332a97ace91a4 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecAcceptCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_ACCEPT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECACCEPTCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp index 48c54c0ae02c3..ff86fc52879d9 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp @@ -30,12 +30,13 @@ static std::string buildFixMsgForStringFlag(const Expr *Arg, " \"" + Twine(Mode) + "\"") .str(); - StringRef SR = cast(Arg->IgnoreParenCasts())->getString(); + const StringRef SR = + cast(Arg->IgnoreParenCasts())->getString(); return ("\"" + SR + Twine(Mode) + "\"").str(); } void CloexecCheck::registerMatchersImpl( - MatchFinder *Finder, internal::Matcher Function) { + MatchFinder *Finder, const internal::Matcher &Function) { // We assume all the checked APIs are C functions. 
Finder->addMatcher( callExpr( @@ -49,14 +50,14 @@ void CloexecCheck::insertMacroFlag(const MatchFinder::MatchResult &Result, const auto *MatchedCall = Result.Nodes.getNodeAs(FuncBindingStr); const auto *FlagArg = MatchedCall->getArg(ArgPos); const auto *FD = Result.Nodes.getNodeAs(FuncDeclBindingStr); - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (utils::exprHasBitFlagWithSpelling(FlagArg->IgnoreParenCasts(), SM, Result.Context->getLangOpts(), MacroFlag)) return; - SourceLocation EndLoc = + const SourceLocation EndLoc = Lexer::getLocForEndOfToken(SM.getFileLoc(FlagArg->getEndLoc()), 0, SM, Result.Context->getLangOpts()); @@ -84,7 +85,7 @@ void CloexecCheck::insertStringFlag( if (!ModeStr || ModeStr->getString().contains(Mode)) return; - std::string ReplacementText = buildFixMsgForStringFlag( + const std::string ReplacementText = buildFixMsgForStringFlag( ModeArg, *Result.SourceManager, Result.Context->getLangOpts(), Mode); diag(ModeArg->getBeginLoc(), "use %0 mode '%1' to set O_CLOEXEC") diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h index 858d96ab45b61..a6dcb57d488da 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H #include "../ClangTidyCheck.h" @@ -29,9 +29,9 @@ class CloexecCheck : public ClangTidyCheck { : ClangTidyCheck(Name, Context) {} protected: - void - registerMatchersImpl(ast_matchers::MatchFinder *Finder, - ast_matchers::internal::Matcher Function); + void registerMatchersImpl( + ast_matchers::MatchFinder *Finder, + const ast_matchers::internal::Matcher &Function); /// Currently, we have three types of fixes. 
/// @@ -97,4 +97,4 @@ class CloexecCheck : public ClangTidyCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h index ee2f51abf05fc..d7d2b42049cd8 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecCreatCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_CREAT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECCREATCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp index 5ac1b6fb632e1..5db57461a5909 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.cpp @@ -20,7 +20,7 @@ void CloexecDupCheck::registerMatchers(MatchFinder *Finder) { } void CloexecDupCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = + const std::string ReplacementText = (Twine("fcntl(") + getSpellingArg(Result, 0) + ", F_DUPFD_CLOEXEC)") .str(); diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h index f5699685ed086..4eae507b99b10 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H #include "CloexecCheck.h" @@ -28,4 +28,4 @@ class CloexecDupCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_DUP_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECDUPCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h index f467b87a6cf70..03a529f02a6d7 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecEpollCreate1Check : public CloexecCheck { } // namespace 
clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE1_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATE1CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h index a8d17c82d457d..243b9bd7b1317 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecEpollCreateCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_EPOLL_CREATE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECEPOLLCREATECHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h index 646b237a663e0..a018fc5deaddb 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H #include "CloexecCheck.h" @@ -30,4 +30,4 @@ class CloexecFopenCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_FOPEN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECFOPENCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h index 3960d05e2e1f0..c2e45332fd048 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecInotifyInit1Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT1_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINIT1CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h index cb9e6820571bc..cd202c2ad97f8 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h @@ -6,8 +6,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecInotifyInitCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_INOTIFY_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECINOTIFYINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h index dd96ee968f3b4..1a77c7fcb196f 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecMemfdCreateCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_MEMFD_CREATE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECMEMFDCREATECHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp index 8c24482c73251..9938027c53b0e 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.cpp @@ -30,7 +30,7 @@ void CloexecOpenCheck::registerMatchers(MatchFinder *Finder) { void CloexecOpenCheck::check(const MatchFinder::MatchResult &Result) { const auto *FD = Result.Nodes.getNodeAs(FuncDeclBindingStr); assert(FD->param_size() > 1); - int ArgPos = (FD->param_size() > 2) ? 2 : 1; + const int ArgPos = (FD->param_size() > 2) ? 
2 : 1; insertMacroFlag(Result, /*MacroFlag=*/"O_CLOEXEC", ArgPos); } diff --git a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h index d95fe21fb3e88..d30b456dcdc17 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecOpenCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H #include "CloexecCheck.h" @@ -32,4 +32,4 @@ class CloexecOpenCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_OPEN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECOPENCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h index 496bd6b6cbbc0..31653081ef98d 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecPipe2Check : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE2_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPE2CHECK_H diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp index a475dff4a2682..37e3c56e97021 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.cpp @@ -20,7 +20,7 @@ void CloexecPipeCheck::registerMatchers(MatchFinder *Finder) { } void CloexecPipeCheck::check(const MatchFinder::MatchResult &Result) { - std::string ReplacementText = + const std::string ReplacementText = (Twine("pipe2(") + getSpellingArg(Result, 0) + ", O_CLOEXEC)").str(); replaceFunc(Result, diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h index f0145e14eb49f..721a68883dd01 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecPipeCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_PIPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECPIPECHECK_H diff --git 
a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h index 0a29d7224e781..8865db3a16788 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H #include "CloexecCheck.h" @@ -27,4 +27,4 @@ class CloexecSocketCheck : public CloexecCheck { } // namespace clang::tidy::android -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXEC_SOCKET_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ANDROID_CLOEXECSOCKETCHECK_H diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp index 36ac9a44695c9..c42f069b487c3 100644 --- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp @@ -64,7 +64,7 @@ void ComparisonInTempFailureRetryCheck::check( const LangOptions &Opts = Result.Context->getLangOpts(); SourceLocation LocStart = Node.getBeginLoc(); while (LocStart.isMacroID()) { - SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart); + const SourceLocation Invocation = SM.getImmediateMacroCallerLoc(LocStart); Token Tok; if (!Lexer::getRawToken(SM.getSpellingLoc(Invocation), Tok, SM, Opts, /*IgnoreWhiteSpace=*/true)) { diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp index 34ecee5badb15..36dd3c94ee19f 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.cpp @@ -18,6 +18,7 @@ #include #include #include +#include // FixItHint - Let the docs script know that this class does provide fixits @@ -217,11 +218,11 @@ utils::UseRangesCheck::ReplacerMap UseRangesCheck::getReplacerMap() const { const auto AddFromStd = [&](llvm::IntrusiveRefCntPtr Replacer, std::initializer_list Names) { - AddFrom(Replacer, Names, "std"); + AddFrom(std::move(Replacer), Names, "std"); }; const auto AddFromBoost = - [&](llvm::IntrusiveRefCntPtr Replacer, + [&](const llvm::IntrusiveRefCntPtr &Replacer, std::initializer_list< std::pair>> NamespaceAndNames) { @@ -341,7 +342,7 @@ void UseRangesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } DiagnosticBuilder UseRangesCheck::createDiag(const CallExpr &Call) { - DiagnosticBuilder D = + const DiagnosticBuilder D = diag(Call.getBeginLoc(), "use a %0 version of this algorithm"); D << (Call.getDirectCallee()->isInStdNamespace() ? 
"boost" : "ranged"); return D; diff --git a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h index af87f15a1dc0b..dae3f7c125db4 100644 --- a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class UseToStringCheck : public ClangTidyCheck { } // namespace clang::tidy::boost -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USE_TO_STRING_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BOOST_USETOSTRINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp index c0a778a027377..ed30d01e645d1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp @@ -81,14 +81,16 @@ static std::vector> getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) { std::vector> Comments; auto &SM = Ctx->getSourceManager(); - std::pair BeginLoc = SM.getDecomposedLoc(Range.getBegin()), - EndLoc = SM.getDecomposedLoc(Range.getEnd()); + const std::pair BeginLoc = + SM.getDecomposedLoc(Range.getBegin()), + EndLoc = + SM.getDecomposedLoc(Range.getEnd()); if (BeginLoc.first != EndLoc.first) return Comments; bool Invalid = false; - StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid); + const StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid); if (Invalid) return Comments; @@ -106,7 +108,7 @@ getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) { break; if (Tok.is(tok::comment)) { - std::pair CommentLoc = + const std::pair CommentLoc = SM.getDecomposedLoc(Tok.getLocation()); assert(CommentLoc.first == BeginLoc.first); Comments.emplace_back( @@ -125,7 +127,7 @@ static std::vector> getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) { std::vector> Comments; while (Loc.isValid()) { - clang::Token Tok = utils::lexer::getPreviousToken( + const clang::Token Tok = utils::lexer::getPreviousToken( Loc, Ctx->getSourceManager(), Ctx->getLangOpts(), /*SkipComments=*/false); if (Tok.isNot(tok::comment)) @@ -142,11 +144,11 @@ getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) { static bool isLikelyTypo(llvm::ArrayRef Params, StringRef ArgName, unsigned ArgIndex) { - std::string ArgNameLowerStr = ArgName.lower(); - StringRef ArgNameLower = ArgNameLowerStr; + const std::string ArgNameLowerStr = ArgName.lower(); + const StringRef ArgNameLower = ArgNameLowerStr; // The threshold is arbitrary. 
- unsigned UpperBound = ((ArgName.size() + 2) / 3) + 1; - unsigned ThisED = ArgNameLower.edit_distance( + const unsigned UpperBound = ((ArgName.size() + 2) / 3) + 1; + const unsigned ThisED = ArgNameLower.edit_distance( Params[ArgIndex]->getIdentifier()->getName().lower(), /*AllowReplacements=*/true, UpperBound); if (ThisED >= UpperBound) @@ -155,7 +157,7 @@ static bool isLikelyTypo(llvm::ArrayRef Params, for (unsigned I = 0, E = Params.size(); I != E; ++I) { if (I == ArgIndex) continue; - IdentifierInfo *II = Params[I]->getIdentifier(); + const IdentifierInfo *II = Params[I]->getIdentifier(); if (!II) continue; @@ -163,9 +165,9 @@ static bool isLikelyTypo(llvm::ArrayRef Params, // Other parameters must be an edit distance at least Threshold more away // from this parameter. This gives us greater confidence that this is a // typo of this parameter and not one with a similar name. - unsigned OtherED = ArgNameLower.edit_distance(II->getName().lower(), - /*AllowReplacements=*/true, - ThisED + Threshold); + const unsigned OtherED = ArgNameLower.edit_distance( + II->getName().lower(), + /*AllowReplacements=*/true, ThisED + Threshold); if (OtherED < ThisED + Threshold) return false; } @@ -267,7 +269,8 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, return; Callee = Callee->getFirstDecl(); - unsigned NumArgs = std::min(Args.size(), Callee->getNumParams()); + const unsigned NumArgs = + std::min(Args.size(), Callee->getNumParams()); if ((NumArgs == 0) || (IgnoreSingleArgument && NumArgs == 1)) return; @@ -279,7 +282,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, for (unsigned I = 0; I < NumArgs; ++I) { const ParmVarDecl *PVD = Callee->getParamDecl(I); - IdentifierInfo *II = PVD->getIdentifier(); + const IdentifierInfo *II = PVD->getIdentifier(); if (!II) continue; if (FunctionDecl *Template = Callee->getTemplateInstantiationPattern()) { @@ -293,7 +296,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, } } - CharSourceRange BeforeArgument = + const CharSourceRange BeforeArgument = MakeFileCharRange(ArgBeginLoc, Args[I]->getBeginLoc()); ArgBeginLoc = Args[I]->getEndLoc(); @@ -302,7 +305,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, Comments = getCommentsInRange(Ctx, BeforeArgument); } else { // Fall back to parsing back from the start of the argument. - CharSourceRange ArgsRange = + const CharSourceRange ArgsRange = MakeFileCharRange(Args[I]->getBeginLoc(), Args[I]->getEndLoc()); Comments = getCommentsBeforeLoc(Ctx, ArgsRange.getBegin()); } @@ -312,7 +315,7 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, if (IdentRE.match(Comment.second, &Matches) && !sameName(Matches[2], II->getName(), StrictMode)) { { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Comment.first, "argument name '%0' in comment does not " "match parameter name %1") << Matches[2] << II; @@ -332,9 +335,9 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx, // If the argument comments are missing for literals add them. 
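The branch being const-qualified here is also where the check manufactures its fix-it: for a literal argument it builds the string /*<param>=*/ from the parameter's identifier and inserts it in front of the literal. A hypothetical before/after pair; setFlag and use are made up for illustration.

    void setFlag(bool Enabled);

    void use() {
      setFlag(true);             // diagnosed: argument comment missing for literal
      setFlag(/*Enabled=*/true); // the form the suggested fix-it produces
    }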
if (Comments.empty() && shouldAddComment(Args[I])) { - std::string ArgComment = + const std::string ArgComment = (llvm::Twine("/*") + II->getName() + "=*/").str(); - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Args[I]->getBeginLoc(), "argument comment missing for literal argument %0") << II diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp index 170050247014a..a29aa552b0953 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp @@ -29,7 +29,7 @@ AST_MATCHER_P2(Expr, hasSideEffect, bool, CheckFunctionCalls, const Expr *E = &Node; if (const auto *Op = dyn_cast(E)) { - UnaryOperator::Opcode OC = Op->getOpcode(); + const UnaryOperator::Opcode OC = Op->getOpcode(); return OC == UO_PostInc || OC == UO_PostDec || OC == UO_PreInc || OC == UO_PreDec; } @@ -44,7 +44,7 @@ AST_MATCHER_P2(Expr, hasSideEffect, bool, CheckFunctionCalls, if (MethodDecl->isConst()) return false; - OverloadedOperatorKind OpKind = OpCallExpr->getOperator(); + const OverloadedOperatorKind OpKind = OpCallExpr->getOperator(); return OpKind == OO_Equal || OpKind == OO_PlusEqual || OpKind == OO_MinusEqual || OpKind == OO_StarEqual || OpKind == OO_SlashEqual || OpKind == OO_AmpEqual || @@ -130,7 +130,7 @@ void AssertSideEffectCheck::check(const MatchFinder::MatchResult &Result) { StringRef AssertMacroName; while (Loc.isValid() && Loc.isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LangOpts); + const StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LangOpts); Loc = SM.getImmediateMacroCallerLoc(Loc); // Check if this macro is an assert. diff --git a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp index 2c8856298e7be..d5d8a29d96969 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.cpp @@ -66,7 +66,7 @@ void AssignmentInIfConditionCheck::check( } void AssignmentInIfConditionCheck::report(const Expr *AssignmentExpr) { - SourceLocation OpLoc = + const SourceLocation OpLoc = isa(AssignmentExpr) ? 
cast(AssignmentExpr)->getOperatorLoc() : cast(AssignmentExpr)->getOperatorLoc(); diff --git a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp index e1d0538ab1644..3e1188d5e2463 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.cpp @@ -40,7 +40,7 @@ void BadSignalToKillThreadCheck::check(const MatchFinder::MatchResult &Result) { const Token &T = MI->tokens().back(); if (!T.isLiteral() || !T.getLiteralData()) return std::nullopt; - StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); + const StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); llvm::APInt IntValue; constexpr unsigned AutoSenseRadix = 0; diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp index 07bb08166a006..8e0f0c55bdf94 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp @@ -281,8 +281,8 @@ static bool isIdenticalStmt(const ASTContext &Ctx, const Stmt *Stmt1, const auto *IntLit1 = cast(Stmt1); const auto *IntLit2 = cast(Stmt2); - llvm::APInt I1 = IntLit1->getValue(); - llvm::APInt I2 = IntLit2->getValue(); + const llvm::APInt I1 = IntLit1->getValue(); + const llvm::APInt I2 = IntLit2->getValue(); if (I1.getBitWidth() != I2.getBitWidth()) return false; return I1 == I2; @@ -352,7 +352,7 @@ void BranchCloneCheck::check(const MatchFinder::MatchResult &Result) { } } - size_t N = Branches.size(); + const size_t N = Branches.size(); llvm::BitVector KnownAsClone(N); for (size_t I = 0; I + 1 < N; I++) { @@ -375,7 +375,7 @@ void BranchCloneCheck::check(const MatchFinder::MatchResult &Result) { // We report the first occurrence only when we find the second one. 
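Background for this hunk: bugprone-branch-clone compares the bodies in a conditional chain and, per the comment above, diagnoses the first body only once a duplicate turns up. A hypothetical input it flags; classify is made up for illustration.

    int classify(int X) {
      if (X > 0)
        return 1;
      else if (X < 0)
        return 1; // repeated branch body: identical to the branch above
      else
        return 0;
    }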
diag(Branches[I]->getBeginLoc(), "repeated branch body in conditional chain"); - SourceLocation End = + const SourceLocation End = Lexer::getLocForEndOfToken(Branches[I]->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (End.isValid()) { diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index e6115f67656bc..6859dc97c112a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -22,13 +22,17 @@ #include "CommandProcessorCheck.h" #include "ComparePointerToMemberVirtualFunctionCheck.h" #include "CopyConstructorInitCheck.h" +#include "CopyConstructorMutatesArgumentCheck.h" #include "CrtpConstructorAccessibilityCheck.h" #include "DanglingHandleCheck.h" +#include "DefaultOperatorNewOnOveralignedTypeCheck.h" #include "DerivedMethodShadowingBaseMethodCheck.h" #include "DynamicStaticInitializersCheck.h" #include "EasilySwappableParametersCheck.h" #include "EmptyCatchCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "ExceptionEscapeCheck.h" +#include "FloatLoopCounterCheck.h" #include "FoldInitTypeCheck.h" #include "ForwardDeclarationNamespaceCheck.h" #include "ForwardingReferenceOverloadCheck.h" @@ -61,6 +65,8 @@ #include "ParentVirtualCallCheck.h" #include "PointerArithmeticOnPolymorphicObjectCheck.h" #include "PosixReturnCheck.h" +#include "RandomGeneratorSeedCheck.h" +#include "RawMemoryCallOnNonTrivialTypeCheck.h" #include "RedundantBranchConditionCheck.h" #include "ReservedIdentifierCheck.h" #include "ReturnConstRefFromParameterCheck.h" @@ -71,6 +77,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -137,8 +144,12 @@ class BugproneModule : public ClangTidyModule { "bugprone-compare-pointer-to-member-virtual-function"); CheckFactories.registerCheck( "bugprone-copy-constructor-init"); + CheckFactories.registerCheck( + "bugprone-copy-constructor-mutates-argument"); CheckFactories.registerCheck( "bugprone-dangling-handle"); + CheckFactories.registerCheck( + "bugprone-default-operator-new-on-overaligned-type"); CheckFactories.registerCheck( "bugprone-derived-method-shadowing-base-method"); CheckFactories.registerCheck( @@ -146,8 +157,12 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck( "bugprone-easily-swappable-parameters"); CheckFactories.registerCheck("bugprone-empty-catch"); + CheckFactories.registerCheck( + "bugprone-exception-copy-constructor-throws"); CheckFactories.registerCheck( "bugprone-exception-escape"); + CheckFactories.registerCheck( + "bugprone-float-loop-counter"); CheckFactories.registerCheck("bugprone-fold-init-type"); CheckFactories.registerCheck( "bugprone-forward-declaration-namespace"); @@ -216,6 +231,10 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck( "bugprone-parent-virtual-call"); CheckFactories.registerCheck("bugprone-posix-return"); + CheckFactories.registerCheck( + "bugprone-random-generator-seed"); + CheckFactories.registerCheck( + "bugprone-raw-memory-call-on-non-trivial-type"); CheckFactories.registerCheck( "bugprone-reserved-identifier"); CheckFactories.registerCheck( @@ -231,6 +250,8 @@ class BugproneModule : public ClangTidyModule { 
"bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck( "bugprone-standalone-empty"); + CheckFactories.registerCheck( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck( "bugprone-string-constructor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index c8943e5b22ef8..db1256d91d311 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -18,13 +18,17 @@ add_clang_library(clangTidyBugproneModule STATIC CommandProcessorCheck.cpp ComparePointerToMemberVirtualFunctionCheck.cpp CopyConstructorInitCheck.cpp + CopyConstructorMutatesArgumentCheck.cpp CrtpConstructorAccessibilityCheck.cpp DanglingHandleCheck.cpp + DefaultOperatorNewOnOveralignedTypeCheck.cpp DerivedMethodShadowingBaseMethodCheck.cpp DynamicStaticInitializersCheck.cpp EasilySwappableParametersCheck.cpp EmptyCatchCheck.cpp + ExceptionCopyConstructorThrowsCheck.cpp ExceptionEscapeCheck.cpp + FloatLoopCounterCheck.cpp FoldInitTypeCheck.cpp ForwardDeclarationNamespaceCheck.cpp ForwardingReferenceOverloadCheck.cpp @@ -62,6 +66,8 @@ add_clang_library(clangTidyBugproneModule STATIC ParentVirtualCallCheck.cpp PointerArithmeticOnPolymorphicObjectCheck.cpp PosixReturnCheck.cpp + RandomGeneratorSeedCheck.cpp + RawMemoryCallOnNonTrivialTypeCheck.cpp RedundantBranchConditionCheck.cpp ReservedIdentifierCheck.cpp ReturnConstRefFromParameterCheck.cpp @@ -73,6 +79,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp index 76bcbbbcdf680..ccbc86ae74cc6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.cpp @@ -31,7 +31,7 @@ void CopyConstructorInitCheck::registerMatchers(MatchFinder *Finder) { void CopyConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { const auto *Ctor = Result.Nodes.getNodeAs("ctor"); - std::string ParamName = Ctor->getParamDecl(0)->getNameAsString(); + const std::string ParamName = Ctor->getParamDecl(0)->getNameAsString(); // We want only one warning (and FixIt) for each ctor. 
std::string FixItInitList; @@ -40,7 +40,7 @@ void CopyConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { bool HasWrittenInitializer = false; SmallVector SafeFixIts; for (const auto *Init : Ctor->inits()) { - bool CtorInitIsWritten = Init->isWritten(); + const bool CtorInitIsWritten = Init->isWritten(); HasWrittenInitializer = HasWrittenInitializer || CtorInitIsWritten; if (!Init->isBaseInitializer()) continue; diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h index cba1a25d9bc19..e977bc2466dc8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class CopyConstructorInitCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPY_CONSTRUCTOR_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp similarity index 89% rename from clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp index fb9d72ce6bd31..cbbb1a0070a02 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.cpp @@ -6,19 +6,20 @@ // //===----------------------------------------------------------------------===// -#include "MutatingCopyCheck.h" +#include "CopyConstructorMutatesArgumentCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { static constexpr llvm::StringLiteral SourceDeclName = "ChangedPVD"; static constexpr llvm::StringLiteral MutatingOperatorName = "MutatingOp"; static constexpr llvm::StringLiteral MutatingCallName = "MutatingCall"; -void MutatingCopyCheck::registerMatchers(MatchFinder *Finder) { +void CopyConstructorMutatesArgumentCheck::registerMatchers( + MatchFinder *Finder) { const auto MemberExprOrSourceObject = anyOf( memberExpr(), declRefExpr(to(decl(equalsBoundNode(std::string(SourceDeclName)))))); @@ -60,7 +61,8 @@ void MutatingCopyCheck::registerMatchers(MatchFinder *Finder) { this); } -void MutatingCopyCheck::check(const MatchFinder::MatchResult &Result) { +void CopyConstructorMutatesArgumentCheck::check( + const MatchFinder::MatchResult &Result) { if (const auto *MemberCall = Result.Nodes.getNodeAs(MutatingCallName)) diag(MemberCall->getBeginLoc(), "call mutates copied object"); @@ -69,4 +71,4 @@ void MutatingCopyCheck::check(const MatchFinder::MatchResult &Result) { diag(Assignment->getBeginLoc(), "mutating copied object"); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h 
b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h similarity index 60% rename from clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h rename to clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h index c211fa004120c..0fed57258b0d8 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorMutatesArgumentCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Finds assignments to the copied object and its direct or indirect members /// in copy constructors and copy assignment operators. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop58-cpp.html -class MutatingCopyCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.html +class CopyConstructorMutatesArgumentCheck : public ClangTidyCheck { public: - MutatingCopyCheck(StringRef Name, ClangTidyContext *Context) + CopyConstructorMutatesArgumentCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class MutatingCopyCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_MUTATINGCOPYCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COPYCONSTRUCTORMUTATESARGUMENTCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp index 60f7be8996933..5ef72eac763e7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp @@ -116,9 +116,10 @@ void CrtpConstructorAccessibilityCheck::check( assert(DerivedTemplateParameter && "No template parameter corresponds to the derived class of the CRTP."); - bool NeedsFriend = !isDerivedParameterBefriended(CRTPDeclaration, - DerivedTemplateParameter) && - !isDerivedClassBefriended(CRTPDeclaration, DerivedRecord); + const bool NeedsFriend = + !isDerivedParameterBefriended(CRTPDeclaration, + DerivedTemplateParameter) && + !isDerivedClassBefriended(CRTPDeclaration, DerivedRecord); const FixItHint HintFriend = FixItHint::CreateInsertion( CRTPDeclaration->getBraceRange().getEnd(), diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h index 486562c30f79e..0b71bc43057d9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H 
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class DanglingHandleCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLING_HANDLE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DANGLINGHANDLECHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp similarity index 71% rename from clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp index 45c170ec20f4e..cb4f69ae96115 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.cpp @@ -6,26 +6,27 @@ // //===----------------------------------------------------------------------===// -#include "DefaultOperatorNewAlignmentCheck.h" +#include "DefaultOperatorNewOnOveralignedTypeCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/Basic/TargetInfo.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DefaultOperatorNewAlignmentCheck::registerMatchers(MatchFinder *Finder) { +void DefaultOperatorNewOnOveralignedTypeCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( cxxNewExpr(unless(hasAnyPlacementArg(anything()))).bind("new"), this); } -void DefaultOperatorNewAlignmentCheck::check( +void DefaultOperatorNewOnOveralignedTypeCheck::check( const MatchFinder::MatchResult &Result) { // Get the found 'new' expression. const auto *NewExpr = Result.Nodes.getNodeAs("new"); - QualType T = NewExpr->getAllocatedType(); + const QualType T = NewExpr->getAllocatedType(); // Dependent types do not have fixed alignment. if (T->isDependentType()) return; @@ -34,25 +35,25 @@ void DefaultOperatorNewAlignmentCheck::check( if (!D || !D->isCompleteDefinition()) return; - ASTContext &Context = D->getASTContext(); + const ASTContext &Context = D->getASTContext(); // Check if no alignment was specified for the type. if (!Context.isAlignmentRequired(T)) return; // The user-specified alignment (in bits). - unsigned SpecifiedAlignment = D->getMaxAlignment(); + const unsigned SpecifiedAlignment = D->getMaxAlignment(); // Double-check if no alignment was specified. if (!SpecifiedAlignment) return; // The alignment used by default 'operator new' (in bits). 
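Context for the renamed check: before C++17 there were no aligned allocation overloads, so a plain new expression routed through the default operator new can return storage with less alignment than an over-aligned type requires; that is also why the header below keeps the check off for C++17 and later (return !LangOpts.CPlusPlus17;). A hypothetical trigger, assuming a target whose default operator new guarantees only 16-byte alignment; Vec and make are made up for illustration.

    struct alignas(64) Vec {
      float Lanes[16];
    };

    Vec *make() {
      return new Vec; // pre-C++17: the returned pointer may be only 16-byte aligned
    }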
- unsigned DefaultNewAlignment = Context.getTargetInfo().getNewAlign(); + const unsigned DefaultNewAlignment = Context.getTargetInfo().getNewAlign(); - bool OverAligned = SpecifiedAlignment > DefaultNewAlignment; - bool HasDefaultOperatorNew = + const bool OverAligned = SpecifiedAlignment > DefaultNewAlignment; + const bool HasDefaultOperatorNew = !NewExpr->getOperatorNew() || NewExpr->getOperatorNew()->isImplicit(); - unsigned CharWidth = Context.getTargetInfo().getCharWidth(); + const unsigned CharWidth = Context.getTargetInfo().getCharWidth(); if (HasDefaultOperatorNew && OverAligned) diag(NewExpr->getBeginLoc(), "allocation function returns a pointer with alignment %0 but the " @@ -61,4 +62,4 @@ void DefaultOperatorNewAlignmentCheck::check( << (SpecifiedAlignment / CharWidth); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h similarity index 56% rename from clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h rename to clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h index 8f9d0e470a755..b5b365b441209 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h @@ -6,21 +6,22 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Checks if an object of type with extended alignment is allocated by using /// the default operator new. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/mem57-cpp.html -class DefaultOperatorNewAlignmentCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.html +class DefaultOperatorNewOnOveralignedTypeCheck : public ClangTidyCheck { public: - DefaultOperatorNewAlignmentCheck(StringRef Name, ClangTidyContext *Context) + DefaultOperatorNewOnOveralignedTypeCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return !LangOpts.CPlusPlus17; @@ -29,6 +30,6 @@ class DefaultOperatorNewAlignmentCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DEFAULTOPERATORNEWALIGNMENTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DEFAULTOPERATORNEWONOVERALIGNEDTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp index 743e6cd27509b..7c5867619cf4e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.cpp @@ -65,7 +65,7 @@ AST_MATCHER(CXXMethodDecl, nameCollidesWithMethodInBase) { for (const auto &BaseMethod : CurrentRecord->methods()) { if (namesCollide(*BaseMethod, Node)) { - ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); + const ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); Builder->setBinding("base_method", clang::DynTypedNode::create(*BaseMethod)); return true; diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp index 4d0428ec18598..48de7fbe7fad6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp @@ -43,7 +43,7 @@ void DynamicStaticInitializersCheck::registerMatchers(MatchFinder *Finder) { void DynamicStaticInitializersCheck::check( const MatchFinder::MatchResult &Result) { const auto *Var = Result.Nodes.getNodeAs<VarDecl>("var"); - SourceLocation Loc = Var->getLocation(); + const SourceLocation Loc = Var->getLocation(); if (!Loc.isValid() || !utils::isPresumedLocInHeaderFile( Loc, *Result.SourceManager, HeaderFileExtensions)) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h index e02c62a53ffa0..00e4bb1e75000 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H #include "../ClangTidyCheck.h" #include 
"../FileExtensionsSet.h" @@ -30,4 +30,4 @@ class DynamicStaticInitializersCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMIC_STATIC_INITIALIZERS_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_DYNAMICSTATICINITIALIZERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index b4ee35154f5f0..a07a68c8a3e65 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -417,7 +417,7 @@ struct MixData { void sanitize() { assert(Flags != MixFlags::Invalid && "sanitize() called on invalid bitvec"); - MixFlags CanonicalAndWorkaround = + const MixFlags CanonicalAndWorkaround = MixFlags::Canonical | MixFlags::WorkaroundDisableCanonicalEquivalence; if ((Flags & CanonicalAndWorkaround) == CanonicalAndWorkaround) { // A workaround for too eagerly equivalent canonical types was requested, @@ -483,7 +483,7 @@ struct MixData { if (CommonType.isNull()) return *this; - QualType NewCommonType = Func(CommonType); + const QualType NewCommonType = Func(CommonType); if (CreatedFromOneWayConversion) { MixData M{Flags, Conversion}; @@ -761,7 +761,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, return {MixFlags::None}; } - MixData UnqualifiedMixability = + const MixData UnqualifiedMixability = calculateMixability(Check, LType.getLocalUnqualifiedType(), RType.getLocalUnqualifiedType(), Ctx, ImplicitMode) .withCommonTypeTransformed([&AdditionalQuals, &Ctx](QualType QT) { @@ -813,7 +813,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, if (ImplicitMode > ImplicitConversionModellingMode::None) { LLVM_DEBUG(llvm::dbgs() << "--- calculateMixability. Start implicit...\n"); - MixData MixLTR = + const MixData MixLTR = approximateImplicitConversion(Check, LType, RType, Ctx, ImplicitMode); LLVM_DEBUG( if (hasFlag(MixLTR.Flags, MixFlags::ImplicitConversion)) llvm::dbgs() @@ -833,7 +833,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, // Otherwise if the invoker requested a full modelling, do the other // direction as well. - MixData MixRTL = + const MixData MixRTL = approximateImplicitConversion(Check, RType, LType, Ctx, ImplicitMode); LLVM_DEBUG( if (hasFlag(MixRTL.Flags, MixFlags::ImplicitConversion)) llvm::dbgs() @@ -868,7 +868,7 @@ calculateMixability(const TheCheck &Check, QualType LType, QualType RType, // If none of the previous logic found a match, try if Clang otherwise // believes the types to be the same. - QualType LCanonical = LType.getCanonicalType(); + const QualType LCanonical = LType.getCanonicalType(); if (LCanonical == RType.getCanonicalType()) { LLVM_DEBUG(llvm::dbgs() << "<<< calculateMixability. Same CanonicalType.\n"); @@ -983,9 +983,9 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From, // Numeric promotions and conversions. 
const auto *FromBuiltin = WorkType->getAs(); const auto *ToBuiltin = To->getAs(); - bool FromNumeric = FromBuiltin && (FromBuiltin->isIntegerType() || - FromBuiltin->isFloatingType()); - bool ToNumeric = + const bool FromNumeric = FromBuiltin && (FromBuiltin->isIntegerType() || + FromBuiltin->isFloatingType()); + const bool ToNumeric = ToBuiltin && (ToBuiltin->isIntegerType() || ToBuiltin->isFloatingType()); if (FromNumeric && ToNumeric) { // If both are integral types, the numeric conversion is performed. @@ -1150,9 +1150,9 @@ class UserDefinedConversionSelector { continue; } - bool BestConversionHasImplicit = + const bool BestConversionHasImplicit = hasFlag(BestConversion->Flags, MixFlags::ImplicitConversion); - bool ThisConversionHasImplicit = + const bool ThisConversionHasImplicit = hasFlag(Prepared.Flags, MixFlags::ImplicitConversion); if (!BestConversionHasImplicit && ThisConversionHasImplicit) // This is a worse conversion, because a better one was found earlier. @@ -1221,7 +1221,7 @@ tryConversionOperators(const TheCheck &Check, const CXXRecordDecl *RD, if (std::optional SelectedConversion = ConversionSet()) { - CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); + const CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); ConversionSequence Result{RecordType, ToType}; // The conversion from the operator call's return type to ToType was @@ -1272,7 +1272,7 @@ tryConvertingConstructors(const TheCheck &Check, QualType FromType, if (std::optional SelectedConversion = ConversionSet()) { - CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); + const CanQualType RecordType = RD->getASTContext().getCanonicalTagType(RD); ConversionSequence Result{FromType, RecordType}; Result.AfterFirstStandard = SelectedConversion->Seq.AfterFirstStandard; @@ -1385,7 +1385,7 @@ approximateImplicitConversion(const TheCheck &Check, QualType LType, LLVM_DEBUG( llvm::dbgs() << "--- approximateImplicitConversion. 
Try to find post-conversion.\n"); - MixData SecondStdConv = approximateImplicitConversion( + const MixData SecondStdConv = approximateImplicitConversion( Check, WorkType, RType, Ctx, ImplicitConversionModellingMode::OneWaySingleStandardOnly); if (SecondStdConv.indicatesMixability()) { @@ -1414,7 +1414,7 @@ approximateImplicitConversion(const TheCheck &Check, QualType LType, static MixableParameterRange modelMixingRange( const TheCheck &Check, const FunctionDecl *FD, std::size_t StartIndex, const filter::SimilarlyUsedParameterPairSuppressor &UsageBasedSuppressor) { - std::size_t NumParams = FD->getNumParams(); + const std::size_t NumParams = FD->getNumParams(); assert(StartIndex < NumParams && "out of bounds for start"); const ASTContext &Ctx = FD->getASTContext(); @@ -1424,7 +1424,7 @@ static MixableParameterRange modelMixingRange( for (std::size_t I = StartIndex + 1; I < NumParams; ++I) { const ParmVarDecl *Ith = FD->getParamDecl(I); - StringRef ParamName = Ith->getName(); + const StringRef ParamName = Ith->getName(); LLVM_DEBUG(llvm::dbgs() << "Check param #" << I << " '" << ParamName << "'...\n"); if (filter::isIgnoredParameter(Check, Ith)) { @@ -1432,7 +1432,7 @@ static MixableParameterRange modelMixingRange( break; } - StringRef PrevParamName = FD->getParamDecl(I - 1)->getName(); + const StringRef PrevParamName = FD->getParamDecl(I - 1)->getName(); if (!ParamName.empty() && !PrevParamName.empty() && filter::prefixSuffixCoverUnderThreshold( Check.NamePrefixSuffixSilenceDissimilarityThreshold, PrevParamName, @@ -1518,18 +1518,18 @@ static bool isIgnoredParameter(const TheCheck &Check, const ParmVarDecl *Node) { if (!Node->getIdentifier()) return llvm::is_contained(Check.IgnoredParameterNames, "\"\""); - StringRef NodeName = Node->getName(); + const StringRef NodeName = Node->getName(); if (llvm::is_contained(Check.IgnoredParameterNames, NodeName)) { LLVM_DEBUG(llvm::dbgs() << "\tName ignored.\n"); return true; } - StringRef NodeTypeName = [Node] { + const StringRef NodeTypeName = [Node] { const ASTContext &Ctx = Node->getASTContext(); const SourceManager &SM = Ctx.getSourceManager(); SourceLocation B = Node->getTypeSpecStartLoc(); SourceLocation E = Node->getTypeSpecEndLoc(); - LangOptions LO; + const LangOptions LO; LLVM_DEBUG(llvm::dbgs() << "\tType name code is '" << Lexer::getSourceText( @@ -1633,7 +1633,7 @@ class AppearsInSameExpr : public RecursiveASTVisitor { RootSetInCurrentStackFrame = true; } - bool Ret = Base::TraverseStmt(S); + const bool Ret = Base::TraverseStmt(S); if (RootSetInCurrentStackFrame) CurrentExprOnlyTreeRoot = nullptr; @@ -1684,7 +1684,7 @@ class PassedToSameFunction { continue; std::optional TargetIdx; - unsigned NumFnParams = CalledFn->getNumParams(); + const unsigned NumFnParams = CalledFn->getNumParams(); for (unsigned Idx = 0; Idx < NumFnParams; ++Idx) if (CalledFn->getParamDecl(Idx) == PassedToParam) TargetIdx.emplace(Idx); @@ -1837,16 +1837,16 @@ static void padStringAtBegin(SmallVectorImpl &Str, std::size_t ToLen) { static bool isCommonPrefixWithoutSomeCharacters(std::size_t N, StringRef S1, StringRef S2) { assert(S1.size() >= N && S2.size() >= N); - StringRef S1Prefix = S1.take_front(S1.size() - N), - S2Prefix = S2.take_front(S2.size() - N); + const StringRef S1Prefix = S1.take_front(S1.size() - N), + S2Prefix = S2.take_front(S2.size() - N); return S1Prefix == S2Prefix && !S1Prefix.empty(); } static bool isCommonSuffixWithoutSomeCharacters(std::size_t N, StringRef S1, StringRef S2) { assert(S1.size() >= N && S2.size() >= N); - StringRef S1Suffix = 
S1.take_back(S1.size() - N), - S2Suffix = S2.take_back(S2.size() - N); + const StringRef S1Suffix = S1.take_back(S1.size() - N), + S2Suffix = S2.take_back(S2.size() - N); return S1Suffix == S2Suffix && !S1Suffix.empty(); } @@ -1858,7 +1858,7 @@ static bool prefixSuffixCoverUnderThreshold(std::size_t Threshold, return false; // Pad the two strings to the longer length. - std::size_t BiggerLength = std::max(Str1.size(), Str2.size()); + const std::size_t BiggerLength = std::max(Str1.size(), Str2.size()); if (BiggerLength <= Threshold) // If the length of the strings is still smaller than the threshold, they @@ -1980,7 +1980,7 @@ struct FormattedConversionSequence { // However, the parameter's defined type might not be what the implicit // conversion started with, e.g. if a typedef is found to convert. - std::string SeqBeginTypeStr = Conv.Begin.getAsString(PP); + const std::string SeqBeginTypeStr = Conv.Begin.getAsString(PP); std::string SeqEndTypeStr = Conv.End.getAsString(PP); if (StartTypeAsDiagnosed != SeqBeginTypeStr) { OS << " (as '" << SeqBeginTypeStr << "')"; @@ -1995,7 +1995,7 @@ struct FormattedConversionSequence { ++NumElementsAdded; } }; - for (QualType InvolvedType : Conv.getInvolvedTypesInSequence()) + for (const QualType InvolvedType : Conv.getInvolvedTypesInSequence()) // Print every type that's unique in the sequence into the diagnosis. AddType(InvolvedType.getAsString(PP)); @@ -2073,12 +2073,14 @@ class UniqueTypeAliasDiagnosticHelper if (CommonType.isNull() || CommonType == LHSType || CommonType == RHSType) return Base::operator()({LHSType, RHSType, {}}); - TypeAliasDiagnosticTuple ThreeTuple{LHSType, RHSType, CommonType}; + const TypeAliasDiagnosticTuple ThreeTuple{LHSType, RHSType, CommonType}; if (!Base::operator()(ThreeTuple)) return false; - bool AlreadySaidLHSAndCommonIsSame = calledWith({LHSType, CommonType, {}}); - bool AlreadySaidRHSAndCommonIsSame = calledWith({RHSType, CommonType, {}}); + const bool AlreadySaidLHSAndCommonIsSame = + calledWith({LHSType, CommonType, {}}); + const bool AlreadySaidRHSAndCommonIsSame = + calledWith({RHSType, CommonType, {}}); if (AlreadySaidLHSAndCommonIsSame && AlreadySaidRHSAndCommonIsSame) { // "SomeInt == int" && "SomeOtherInt == int" => "Common(SomeInt, // SomeOtherInt) == int", no need to diagnose it. Save the 3-tuple only @@ -2154,12 +2156,12 @@ void EasilySwappableParametersCheck::check( assert(FD); const PrintingPolicy &PP = FD->getASTContext().getPrintingPolicy(); - std::size_t NumParams = FD->getNumParams(); + const std::size_t NumParams = FD->getNumParams(); std::size_t MixableRangeStartIndex = 0; // Spawn one suppressor and if the user requested, gather information from // the AST for the parameters' usages. 
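The suppressor constructed just below backs the SuppressParametersUsedTogether option: parameters that appear together in one expression or are passed to the same callee (see the PassedToSameFunction helper earlier in this file) are taken as deliberately related and are not reported as swappable. A hypothetical pair it silences; process and copyRange are made up for illustration.

    void process(const int *First, const int *Last);

    void copyRange(const int *Begin, const int *End) {
      process(Begin, End); // joint use links Begin and End for the suppressor
    }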
- filter::SimilarlyUsedParameterPairSuppressor UsageBasedSuppressor{ + const filter::SimilarlyUsedParameterPairSuppressor UsageBasedSuppressor{ FD, SuppressParametersUsedTogether}; LLVM_DEBUG(llvm::dbgs() << "Begin analysis of " << getName(FD) << " with " @@ -2182,11 +2184,13 @@ void EasilySwappableParametersCheck::check( continue; } - bool NeedsAnyTypeNote = llvm::any_of(R.Mixes, needsToPrintTypeInDiagnostic); - bool HasAnyImplicits = + const bool NeedsAnyTypeNote = + llvm::any_of(R.Mixes, needsToPrintTypeInDiagnostic); + const bool HasAnyImplicits = llvm::any_of(R.Mixes, needsToElaborateImplicitConversion); const ParmVarDecl *First = R.getFirstParam(), *Last = R.getLastParam(); - std::string FirstParamTypeAsWritten = First->getType().getAsString(PP); + const std::string FirstParamTypeAsWritten = + First->getType().getAsString(PP); { StringRef DiagText; @@ -2205,7 +2209,7 @@ void EasilySwappableParametersCheck::check( if (!NeedsAnyTypeNote) Diag << FirstParamTypeAsWritten; - CharSourceRange HighlightRange = CharSourceRange::getTokenRange( + const CharSourceRange HighlightRange = CharSourceRange::getTokenRange( First->getBeginLoc(), Last->getEndLoc()); Diag << HighlightRange; } @@ -2240,12 +2244,12 @@ void EasilySwappableParametersCheck::check( // emitted to a note diagnostic, so prepare it. const ParmVarDecl *LVar = M.First; const ParmVarDecl *RVar = M.Second; - QualType LType = LVar->getType(); - QualType RType = RVar->getType(); - QualType CommonType = M.commonUnderlyingType(); - std::string LTypeStr = LType.getAsString(PP); - std::string RTypeStr = RType.getAsString(PP); - std::string CommonTypeStr = CommonType.getAsString(PP); + const QualType LType = LVar->getType(); + const QualType RType = RVar->getType(); + const QualType CommonType = M.commonUnderlyingType(); + const std::string LTypeStr = LType.getAsString(PP); + const std::string RTypeStr = RType.getAsString(PP); + const std::string CommonTypeStr = CommonType.getAsString(PP); if (hasFlag(M.flags(), MixFlags::TypeAlias) && UniqueTypeAlias(LType, RType, CommonType)) { @@ -2274,8 +2278,9 @@ void EasilySwappableParametersCheck::check( if ((hasFlag(M.flags(), MixFlags::ReferenceBind) || hasFlag(M.flags(), MixFlags::Qualifiers)) && UniqueBindPower({LType, RType})) { - StringRef DiagText = "'%0' and '%1' parameters accept and bind the " - "same kind of values"; + const StringRef DiagText = + "'%0' and '%1' parameters accept and bind the " + "same kind of values"; diag(RVar->getOuterLocStart(), DiagText, DiagnosticIDs::Note) << LTypeStr << RTypeStr; } @@ -2286,8 +2291,8 @@ void EasilySwappableParametersCheck::check( M.leftToRightConversionSequence(); const model::ConversionSequence &RTL = M.rightToLeftConversionSequence(); - FormattedConversionSequence LTRFmt{PP, LTypeStr, LTR, RTypeStr}; - FormattedConversionSequence RTLFmt{PP, RTypeStr, RTL, LTypeStr}; + const FormattedConversionSequence LTRFmt{PP, LTypeStr, LTR, RTypeStr}; + const FormattedConversionSequence RTLFmt{PP, RTypeStr, RTL, LTypeStr}; StringRef DiagText = "'%0' and '%1' may be implicitly converted"; if (!LTRFmt.Trivial || !RTLFmt.Trivial) @@ -2302,7 +2307,7 @@ void EasilySwappableParametersCheck::check( Diag << LTRFmt.DiagnosticText << RTLFmt.DiagnosticText; } - StringRef ConversionFunctionDiagText = + const StringRef ConversionFunctionDiagText = "the implicit conversion involves the " "%select{|converting constructor|conversion operator}0 " "declared here"; diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp 
b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp index eebab847d1070..5dd2f62504c71 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp @@ -25,7 +25,7 @@ AST_MATCHER(CXXCatchStmt, isInMacro) { } AST_MATCHER_P(CXXCatchStmt, hasHandler, Matcher, InnerMatcher) { - Stmt *Handler = Node.getHandlerBlock(); + const Stmt *Handler = Node.getHandlerBlock(); if (!Handler) return false; return InnerMatcher.matches(*Handler, Finder, Builder); @@ -41,7 +41,7 @@ AST_MATCHER_P(CompoundStmt, hasAnyTextFromList, std::vector, return false; ASTContext &Context = Finder->getASTContext(); - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); StringRef Text = Lexer::getSourceText( CharSourceRange::getTokenRange(Node.getSourceRange()), SM, Context.getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp similarity index 75% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp index 2225a90aeece1..73658459b8e26 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp @@ -6,15 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "ThrownExceptionTypeCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { +void ExceptionCopyConstructorThrowsCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( traverse( TK_AsIs, @@ -25,10 +26,11 @@ void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { this); } -void ThrownExceptionTypeCheck::check(const MatchFinder::MatchResult &Result) { +void ExceptionCopyConstructorThrowsCheck::check( + const MatchFinder::MatchResult &Result) { const auto *E = Result.Nodes.getNodeAs("expr"); diag(E->getExprLoc(), "thrown exception type is not nothrow copy constructible"); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h similarity index 58% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h index 41a5145209686..f1d7cca0e5bad 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h @@ -6,20 +6,20 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Checks whether a 
thrown object is nothrow copy constructible. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/err60-cpp.html -class ThrownExceptionTypeCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/exception-copy-constructor-throws.html +class ExceptionCopyConstructorThrowsCheck : public ClangTidyCheck { public: - ThrownExceptionTypeCheck(StringRef Name, ClangTidyContext *Context) + ExceptionCopyConstructorThrowsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -28,6 +28,6 @@ class ThrownExceptionTypeCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp index 837a86ff8655e..b7de8395ffa05 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp @@ -36,13 +36,22 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawFunctionsThatShouldNotThrow(Options.get( "FunctionsThatShouldNotThrow", "")), - RawIgnoredExceptions(Options.get("IgnoredExceptions", "")) { + RawIgnoredExceptions(Options.get("IgnoredExceptions", "")), + RawCheckedSwapFunctions( + Options.get("CheckedSwapFunctions", "swap,iter_swap,iter_move")), + CheckDestructors(Options.get("CheckDestructors", true)), + CheckMoveMemberFunctions(Options.get("CheckMoveMemberFunctions", true)), + CheckMain(Options.get("CheckMain", true)), + CheckNothrowFunctions(Options.get("CheckNothrowFunctions", true)) { llvm::SmallVector FunctionsThatShouldNotThrowVec, - IgnoredExceptionsVec; + IgnoredExceptionsVec, CheckedSwapFunctionsVec; RawFunctionsThatShouldNotThrow.split(FunctionsThatShouldNotThrowVec, ",", -1, false); FunctionsThatShouldNotThrow.insert_range(FunctionsThatShouldNotThrowVec); + RawCheckedSwapFunctions.split(CheckedSwapFunctionsVec, ",", -1, false); + CheckedSwapFunctions.insert_range(CheckedSwapFunctionsVec); + llvm::StringSet<> IgnoredExceptions; RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false); IgnoredExceptions.insert_range(IgnoredExceptionsVec); @@ -54,20 +63,33 @@ void ExceptionEscapeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "FunctionsThatShouldNotThrow", RawFunctionsThatShouldNotThrow); Options.store(Opts, "IgnoredExceptions", RawIgnoredExceptions); + Options.store(Opts, "CheckedSwapFunctions", RawCheckedSwapFunctions); + Options.store(Opts, "CheckDestructors", CheckDestructors); + Options.store(Opts, "CheckMoveMemberFunctions", CheckMoveMemberFunctions); + Options.store(Opts, "CheckMain", CheckMain); + Options.store(Opts, "CheckNothrowFunctions", CheckNothrowFunctions); } void ExceptionEscapeCheck::registerMatchers(MatchFinder *Finder) { + auto MatchIf = [](bool Enabled, const auto &Matcher) { + ast_matchers::internal::Matcher Nothing = unless(anything()); + return Enabled ? 
Matcher : Nothing; + }; Finder->addMatcher( functionDecl( isDefinition(), - anyOf(isNoThrow(), - allOf(anyOf(cxxDestructorDecl(), - cxxConstructorDecl(isMoveConstructor()), - cxxMethodDecl(isMoveAssignmentOperator()), isMain(), - allOf(hasAnyName("swap", "iter_swap", "iter_move"), - hasAtLeastOneParameter())), - unless(isExplicitThrow())), - isEnabled(FunctionsThatShouldNotThrow))) + anyOf( + MatchIf(CheckNothrowFunctions, isNoThrow()), + allOf(anyOf(MatchIf(CheckDestructors, cxxDestructorDecl()), + MatchIf( + CheckMoveMemberFunctions, + anyOf(cxxConstructorDecl(isMoveConstructor()), + cxxMethodDecl(isMoveAssignmentOperator()))), + MatchIf(CheckMain, isMain()), + allOf(isEnabled(CheckedSwapFunctions), + hasAtLeastOneParameter())), + unless(isExplicitThrow())), + isEnabled(FunctionsThatShouldNotThrow))) .bind("thrower"), this); } diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h index bd1e7bae57f5d..c3bf4a4335273 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H #include "../ClangTidyCheck.h" #include "../utils/ExceptionAnalyzer.h" @@ -35,11 +35,18 @@ class ExceptionEscapeCheck : public ClangTidyCheck { private: StringRef RawFunctionsThatShouldNotThrow; StringRef RawIgnoredExceptions; + StringRef RawCheckedSwapFunctions; + + const bool CheckDestructors; + const bool CheckMoveMemberFunctions; + const bool CheckMain; + const bool CheckNothrowFunctions; llvm::StringSet<> FunctionsThatShouldNotThrow; + llvm::StringSet<> CheckedSwapFunctions; utils::ExceptionAnalyzer Tracer; }; } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTION_ESCAPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONESCAPECHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp similarity index 86% rename from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp rename to clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp index 01299e0e5ab48..adf2d2b4bcc07 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "FloatLoopCounter.h" +#include "FloatLoopCounterCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void FloatLoopCounter::registerMatchers(MatchFinder *Finder) { +void FloatLoopCounterCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( forStmt(hasIncrement(forEachDescendant( declRefExpr(hasType(realFloatingPointType()), @@ -29,7 +29,7 @@ void FloatLoopCounter::registerMatchers(MatchFinder *Finder) { this); } -void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) { +void 
FloatLoopCounterCheck::check(const MatchFinder::MatchResult &Result) { const auto *FS = Result.Nodes.getNodeAs("for"); diag(FS->getInc()->getBeginLoc(), "loop induction expression should not have " @@ -43,4 +43,4 @@ void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) { DiagnosticIDs::Note); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h similarity index 64% rename from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h rename to clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h index d00c036f00f24..43dd9c2c93515 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h +++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h @@ -6,27 +6,27 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// This check diagnoses when the loop induction expression of a for loop has /// floating-point type. The check corresponds to: /// https://www.securecoding.cert.org/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/flp30-c.html -class FloatLoopCounter : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/float-loop-counter.html +class FloatLoopCounterCheck : public ClangTidyCheck { public: - FloatLoopCounter(StringRef Name, ClangTidyContext *Context) + FloatLoopCounterCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h index 119728d972309..ef8b4d11d6517 100644 --- a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H #include "../ClangTidyCheck.h" @@ -39,4 +39,4 @@ class FoldInitTypeCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLD_INIT_TYPE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FOLDINITTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp 
index c3db8fa9b3af2..11270e7f34d79 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -46,7 +46,7 @@ void ForwardDeclarationNamespaceCheck::check( const MatchFinder::MatchResult &Result) { if (const auto *RecordDecl = Result.Nodes.getNodeAs("record_decl")) { - StringRef DeclName = RecordDecl->getName(); + const StringRef DeclName = RecordDecl->getName(); if (RecordDecl->isThisDeclarationADefinition()) { DeclNameToDefinitions[DeclName].push_back(RecordDecl); } else { diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp index d372cbd798b2e..c1e66f210b8b2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp @@ -40,7 +40,7 @@ AST_MATCHER(QualType, isEnableIf) { if (CheckTemplate(BaseType->getAs())) return true; // Case: enable_if_t< >. if (const auto *TT = BaseType->getAs()) - if (NestedNameSpecifier Q = TT->getQualifier(); + if (const NestedNameSpecifier Q = TT->getQualifier(); Q.getKind() == NestedNameSpecifier::Kind::Type) if (CheckTemplate(Q.getAsType()->getAs())) return true; // Case: enable_if< >::type. @@ -67,7 +67,7 @@ void ForwardingReferenceOverloadCheck::registerMatchers(MatchFinder *Finder) { unless(references(isConstQualified()))))) .bind("parm-var"); - DeclarationMatcher FindOverload = + const DeclarationMatcher FindOverload = cxxConstructorDecl( hasParameter(0, ForwardingRefParm), unless(isDeleted()), unless(hasAnyParameter( @@ -128,8 +128,9 @@ void ForwardingReferenceOverloadCheck::check( (OtherCtor->isCopyConstructor() ? EnabledCopy : EnabledMove) = true; } } - bool Copy = (!EnabledMove && !DisabledMove && !DisabledCopy) || EnabledCopy; - bool Move = !DisabledMove || EnabledMove; + const bool Copy = + (!EnabledMove && !DisabledMove && !DisabledCopy) || EnabledCopy; + const bool Move = !DisabledMove || EnabledMove; if (!Copy && !Move) return; diag(Ctor->getLocation(), diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp index 2211a0ba24ebc..634d54c2b9bd3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.cpp @@ -71,18 +71,18 @@ ImplicitWideningOfMultiplicationResultCheck::includeStddefHeader( void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( const ImplicitCastExpr *ICE) { - ASTContext *Context = Result->Context; + const ASTContext *Context = Result->Context; const Expr *E = ICE->getSubExpr()->IgnoreParens(); - QualType Ty = ICE->getType(); - QualType ETy = E->getType(); + const QualType Ty = ICE->getType(); + const QualType ETy = E->getType(); assert(!ETy->isDependentType() && !Ty->isDependentType() && "Don't expect to ever get here in template Context."); // This must be a widening cast. Else we do not care. 
- unsigned SrcWidth = Context->getIntWidth(ETy); - unsigned TgtWidth = Context->getIntWidth(Ty); + const unsigned SrcWidth = Context->getIntWidth(ETy); + const unsigned TgtWidth = Context->getIntWidth(Ty); if (TgtWidth <= SrcWidth) return; @@ -92,7 +92,7 @@ void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( !ETy->isUnsignedIntegerType()) { if (const auto ConstExprResult = E->getIntegerConstantExpr(*Context)) { const auto TypeSize = Context->getTypeSize(ETy); - llvm::APSInt WidenedResult = ConstExprResult->extOrTrunc(TypeSize); + const llvm::APSInt WidenedResult = ConstExprResult->extOrTrunc(TypeSize); if (WidenedResult <= llvm::APSInt::getMaxValue(TypeSize, false) && WidenedResult >= llvm::APSInt::getMinValue(TypeSize, false)) return; @@ -168,7 +168,7 @@ void ImplicitWideningOfMultiplicationResultCheck::handleImplicitCastExpr( void ImplicitWideningOfMultiplicationResultCheck::handlePointerOffsetting( const Expr *E) { - ASTContext *Context = Result->Context; + const ASTContext *Context = Result->Context; // We are looking for a pointer offset operation, // with one hand being a pointer, and another one being an offset. @@ -191,19 +191,20 @@ void ImplicitWideningOfMultiplicationResultCheck::handlePointerOffsetting( IndexExpr = IndexExpr->IgnoreParens(); - QualType IndexExprType = IndexExpr->getType(); + const QualType IndexExprType = IndexExpr->getType(); // If the index expression's type is not known (i.e. we are in a template), // we can't do anything here. if (IndexExprType->isDependentType()) return; - QualType SSizeTy = Context->getPointerDiffType(); - QualType USizeTy = Context->getSizeType(); - QualType SizeTy = IndexExprType->isSignedIntegerType() ? SSizeTy : USizeTy; + const QualType SSizeTy = Context->getPointerDiffType(); + const QualType USizeTy = Context->getSizeType(); + const QualType SizeTy = + IndexExprType->isSignedIntegerType() ? SSizeTy : USizeTy; // FIXME: is there a way to actually get the QualType for size_t/ptrdiff_t? // Note that SizeTy.getAsString() will be unsigned long/..., NOT size_t! - StringRef TyAsString = + const StringRef TyAsString = IndexExprType->isSignedIntegerType() ? "ptrdiff_t" : "size_t"; // So, is size_t actually wider than the result of the multiplication? 
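The hunks above sit in the core of this check: a multiplication is carried out in a narrow integer type and only afterwards implicitly widened, so any overflow (or truncation of the product) has already happened before the cast. A minimal sketch of the pattern, with illustrative names not taken from the patch:

    #include <cstddef>
    #include <cstdint>

    int64_t scaled(int I, int J) {
      // The multiply is evaluated in 'int'; a too-large product overflows
      // *before* the implicit widening to int64_t ever happens.
      return I * J;
    }

    int64_t scaledFixed(int I, int J) {
      // Widening one operand first makes the whole multiply 64-bit.
      return static_cast<int64_t>(I) * J;
    }

    char *at(char *Base, int I, int J) {
      // The pointer-offset case handled by handlePointerOffsetting(): the
      // index is computed up front in a type as wide as ptrdiff_t/size_t
      // instead of being widened only when added to the pointer.
      return Base + static_cast<std::ptrdiff_t>(I) * J;
    }
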
diff --git a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp index b0dd9017c8426..12fa3655ffcd6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InaccurateEraseCheck.cpp @@ -43,7 +43,7 @@ void InaccurateEraseCheck::check(const MatchFinder::MatchResult &Result) { if (!Loc.isMacroID() && EndExpr) { const auto *AlgCall = Result.Nodes.getNodeAs("alg"); - std::string ReplacementText = std::string(Lexer::getSourceText( + const std::string ReplacementText = std::string(Lexer::getSourceText( CharSourceRange::getTokenRange(EndExpr->getSourceRange()), *Result.SourceManager, getLangOpts())); const SourceLocation EndLoc = Lexer::getLocForEndOfToken( diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp index 84a99c36523ac..6181ac84f36e3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp @@ -22,7 +22,7 @@ AST_MATCHER_P(TemplateTypeParmDecl, hasUnnamedDefaultArgument, Node.getDefaultArgument().getArgument().isNull()) return false; - TypeLoc DefaultArgTypeLoc = + const TypeLoc DefaultArgTypeLoc = Node.getDefaultArgument().getTypeSourceInfo()->getTypeLoc(); return InnerMatcher.matches(DefaultArgTypeLoc, Finder, Builder); } @@ -51,7 +51,7 @@ void IncorrectEnableIfCheck::check(const MatchFinder::MatchResult &Result) { return; const SourceManager &SM = *Result.SourceManager; - SourceLocation RAngleLoc = + const SourceLocation RAngleLoc = SM.getExpansionLoc(EnableIfSpecializationLoc->getRAngleLoc()); auto Diag = diag(EnableIf->getBeginLoc(), diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp index 1e516c1573219..50280d22be0d8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -34,7 +34,7 @@ AST_MATCHER(FunctionType, typeHasNoReturnAttr) { } // namespace static Matcher loopEndingStmt(Matcher Internal) { - Matcher IsNoReturnFunType = + const Matcher IsNoReturnFunType = ignoringParens(functionType(typeHasNoReturnAttr())); Matcher IsNoReturnDecl = anyOf(declHasNoReturnAttr(), functionDecl(hasType(IsNoReturnFunType)), @@ -145,7 +145,7 @@ static std::string getCondVarNames(const Stmt *Cond) { if (!Child) continue; - std::string NewNames = getCondVarNames(Child); + const std::string NewNames = getCondVarNames(Child); if (!Result.empty() && !NewNames.empty()) Result += ", "; Result += NewNames; @@ -332,7 +332,7 @@ void InfiniteLoopCheck::check(const MatchFinder::MatchResult &Result) { Result.Context)) return; - std::string CondVarNames = getCondVarNames(Cond); + const std::string CondVarNames = getCondVarNames(Cond); if (ShouldHaveConditionVariables && CondVarNames.empty()) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h index 777e31868c961..acab7be7f33c6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H -#define 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class IntegerDivisionCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGER_DIVISION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_INTEGERDIVISIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp index 76df992f29fc1..f3e94b62f0dbd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp @@ -20,6 +20,8 @@ namespace clang::tidy::bugprone { namespace { +// Preserve same name as AST_MATCHER(isCompleteAndHasNoZeroValue) +// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace) bool isCompleteAndHasNoZeroValue(const EnumDecl *D) { const EnumDecl *Definition = D->getDefinition(); return Definition && Definition->isComplete() && @@ -149,7 +151,7 @@ void InvalidEnumDefaultInitializationCheck::check( SourceLocation Loc = InitExpr->getExprLoc(); if (Loc.isInvalid()) { if (isa(InitExpr)) { - DynTypedNodeList Parents = ACtx.getParents(*InitExpr); + const DynTypedNodeList Parents = ACtx.getParents(*InitExpr); if (Parents.empty()) return; @@ -168,7 +170,7 @@ void InvalidEnumDefaultInitializationCheck::check( // The expression may be implicitly generated for an initialization. // Search for a parent initialization list with valid source location. while (InitList->getExprLoc().isInvalid()) { - DynTypedNodeList Parents = ACtx.getParents(*InitList); + const DynTypedNodeList Parents = ACtx.getParents(*InitList); if (Parents.empty()) return; InitList = Parents[0].get(); diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp index fb73e896fdb13..1f666d2a4345f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp @@ -40,7 +40,7 @@ class MacroExpansionsWithFileAndLine : public PPCallbacks { bool HasLine = false; for (const Token &T : MD.getMacroInfo()->tokens()) { if (T.is(tok::identifier)) { - StringRef IdentName = T.getIdentifierInfo()->getName(); + const StringRef IdentName = T.getIdentifierInfo()->getName(); if (IdentName == "__FILE__") { HasFile = true; } else if (IdentName == "__LINE__") { diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp index 78a53d12bd312..c79320fbf3304 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp @@ -127,7 +127,7 @@ unsigned MacroRepeatedPPCallbacks::countArgumentExpansions( continue; } - IdentifierInfo *TII = T.getIdentifierInfo(); + const IdentifierInfo *TII = T.getIdentifierInfo(); // If not existent, skip it. 
if (TII == nullptr) continue; diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h index f650145203ce6..c40aef339e91a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class MisplacedOperatorInStrlenInAllocCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_STRLEN_IN_ALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDOPERATORINSTRLENINALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h index e78c30cbb644a..9f6504fe8a911 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class MisplacedPointerArithmeticInAllocCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACED_OPERATOR_IN_ALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MISPLACEDPOINTERARITHMETICINALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp index d508e2aaba53c..f040235322a4f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.cpp @@ -52,8 +52,8 @@ static unsigned getMaxCalculationWidth(const ASTContext &Context, E = E->IgnoreParenImpCasts(); if (const auto *Bop = dyn_cast(E)) { - unsigned LHSWidth = getMaxCalculationWidth(Context, Bop->getLHS()); - unsigned RHSWidth = getMaxCalculationWidth(Context, Bop->getRHS()); + const unsigned LHSWidth = getMaxCalculationWidth(Context, Bop->getLHS()); + const unsigned RHSWidth = getMaxCalculationWidth(Context, Bop->getRHS()); if (Bop->getOpcode() == BO_Mul) return LHSWidth + RHSWidth; if (Bop->getOpcode() == BO_Add) @@ -79,7 +79,7 @@ static unsigned getMaxCalculationWidth(const ASTContext &Context, if (Uop->getOpcode() == UO_Not) return 1024U; - QualType T = Uop->getType(); + const QualType T = Uop->getType(); return T->isIntegerType() ? 
Context.getIntWidth(T) : 1024U; } else if (const auto *I = dyn_cast(E)) { return I->getValue().getActiveBits(); @@ -190,10 +190,10 @@ void MisplacedWideningCastCheck::check(const MatchFinder::MatchResult &Result) { Calc->isTypeDependent() || Calc->isValueDependent()) return; - ASTContext &Context = *Result.Context; + const ASTContext &Context = *Result.Context; - QualType CastType = Cast->getType(); - QualType CalcType = Calc->getType(); + const QualType CastType = Cast->getType(); + const QualType CalcType = Calc->getType(); // Explicit truncation using cast. if (Context.getIntWidth(CastType) < Context.getIntWidth(CalcType)) diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 66559a0e5d7b5..e182df75b1d9a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -21,7 +21,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); - CharSourceRange CallRange = + const CharSourceRange CallRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange( Callee->getBeginLoc(), Callee->getEndLoc()), SM, LangOpts); @@ -39,7 +39,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, // std::move(). This will hopefully prevent erroneous replacements if the // code does unusual things (e.g. create an alias for std::move() in // another namespace). - NestedNameSpecifier NNS = Callee->getQualifier(); + const NestedNameSpecifier NNS = Callee->getQualifier(); switch (NNS.getKind()) { case NestedNameSpecifier::Kind::Null: // Called as "move" (i.e. presumably the code had a "using std::move;"). 
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp index 2eff013b2ab7d..78f2017984a96 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.cpp @@ -86,8 +86,9 @@ MultiLevelImplicitPointerConversionCheck::getCheckTraversalKind() const { void MultiLevelImplicitPointerConversionCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedExpr = Result.Nodes.getNodeAs("expr"); - QualType Target = MatchedExpr->getType().getDesugaredType(*Result.Context); - QualType Source = + const QualType Target = + MatchedExpr->getType().getDesugaredType(*Result.Context); + const QualType Source = MatchedExpr->getSubExpr()->getType().getDesugaredType(*Result.Context); diag(MatchedExpr->getExprLoc(), "multilevel pointer conversion from %0 to " diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp index 17aea9392bd26..b81d2b438d58d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp @@ -51,7 +51,8 @@ namespace { AST_MATCHER_P(CXXTryStmt, hasHandlerFor, ast_matchers::internal::Matcher, InnerMatcher) { - for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) { + const unsigned NH = Node.getNumHandlers(); + for (unsigned I = 0; I < NH; ++I) { const CXXCatchStmt *CatchS = Node.getHandler(I); // Check for generic catch handler (match anything). if (CatchS->getCaughtType().isNull()) @@ -66,7 +67,7 @@ AST_MATCHER_P(CXXTryStmt, hasHandlerFor, } AST_MATCHER(CXXNewExpr, mayThrow) { - FunctionDecl *OperatorNew = Node.getOperatorNew(); + const FunctionDecl *OperatorNew = Node.getOperatorNew(); if (!OperatorNew) return false; return !OperatorNew->getType()->castAs()->isNothrow(); diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h index 1a2d4a410b46e..1c3679a893ce5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class MultipleStatementMacroCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLE_STATEMENT_MACRO_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_MULTIPLESTATEMENTMACROCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp index 287ee95a4db55..501a82d67d558 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp @@ -29,15 +29,15 @@ AST_MATCHER_P(QualType, hasAnyType, 
std::vector, Names) { if (Names.empty()) return false; - std::string Name = Node.getLocalUnqualifiedType().getAsString(); + const std::string Name = Node.getLocalUnqualifiedType().getAsString(); return llvm::is_contained(Names, Name); } AST_MATCHER(FieldDecl, hasIntBitwidth) { assert(Node.isBitField()); const ASTContext &Ctx = Node.getASTContext(); - unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); - unsigned CurrentBitWidth = Node.getBitWidthValue(); + const unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); + const unsigned CurrentBitWidth = Node.getBitWidthValue(); return IntBitWidth == CurrentBitWidth; } @@ -79,7 +79,7 @@ void NarrowingConversionsCheck::registerMatchers(MatchFinder *Finder) { const auto IsCeilFloorCallExpr = expr(callExpr(callee(functionDecl( hasAnyName("::ceil", "::std::ceil", "::floor", "::std::floor"))))); - std::vector IgnoreConversionFromTypesVec = + const std::vector IgnoreConversionFromTypesVec = utils::options::parseStringList(IgnoreConversionFromTypes); // We may want to exclude other types from the checks, such as `size_type` @@ -243,7 +243,7 @@ struct IntegerRange { static IntegerRange createFromType(const ASTContext &Context, const BuiltinType &T) { if (T.isFloatingPoint()) { - unsigned PrecisionBits = llvm::APFloatBase::semanticsPrecision( + const unsigned PrecisionBits = llvm::APFloatBase::semanticsPrecision( Context.getFloatTypeSemantics(T.desugar())); // Contrary to two's complement integer, floating point values are // symmetric and have the same number of positive and negative values. @@ -262,8 +262,8 @@ static IntegerRange createFromType(const ASTContext &Context, return {LowerValue, UpperValue}; } assert(T.isInteger() && "Unexpected builtin type"); - uint64_t TypeSize = Context.getTypeSize(&T); - bool IsUnsignedInteger = T.isUnsignedInteger(); + const uint64_t TypeSize = Context.getTypeSize(&T); + const bool IsUnsignedInteger = T.isUnsignedInteger(); return {llvm::APSInt::getMinValue(TypeSize, IsUnsignedInteger), llvm::APSInt::getMaxValue(TypeSize, IsUnsignedInteger)}; } @@ -271,15 +271,15 @@ static IntegerRange createFromType(const ASTContext &Context, static bool isWideEnoughToHold(const ASTContext &Context, const BuiltinType &FromType, const BuiltinType &ToType) { - IntegerRange FromIntegerRange = createFromType(Context, FromType); - IntegerRange ToIntegerRange = createFromType(Context, ToType); + const IntegerRange FromIntegerRange = createFromType(Context, FromType); + const IntegerRange ToIntegerRange = createFromType(Context, ToType); return ToIntegerRange.contains(FromIntegerRange); } static bool isWideEnoughToHold(const ASTContext &Context, const llvm::APSInt &IntegerConstant, const BuiltinType &ToType) { - IntegerRange ToIntegerRange = createFromType(Context, ToType); + const IntegerRange ToIntegerRange = createFromType(Context, ToType); return ToIntegerRange.contains(IntegerConstant); } @@ -289,13 +289,13 @@ static bool isWideEnoughToHold(const ASTContext &Context, static bool isFloatExactlyRepresentable(const ASTContext &Context, const llvm::APFloat &FloatConstant, const QualType &DestType) { - unsigned DestWidth = Context.getIntWidth(DestType); - bool DestSigned = DestType->isSignedIntegerOrEnumerationType(); + const unsigned DestWidth = Context.getIntWidth(DestType); + const bool DestSigned = DestType->isSignedIntegerOrEnumerationType(); llvm::APSInt Result = llvm::APSInt(DestWidth, !DestSigned); bool IsExact = false; - bool Overflows = FloatConstant.convertToInteger( - Result, llvm::APFloat::rmTowardZero, &IsExact) & - 
llvm::APFloat::opInvalidOp; + const bool Overflows = FloatConstant.convertToInteger( + Result, llvm::APFloat::rmTowardZero, &IsExact) & + llvm::APFloat::opInvalidOp; return !Overflows && IsExact; } @@ -321,8 +321,8 @@ bool NarrowingConversionsCheck::isWarningInhibitedByEquivalentSize( // With this option, we don't warn on conversions that have equivalent width // in bits. eg. uint32 <-> int32. if (!WarnOnEquivalentBitWidth) { - uint64_t FromTypeSize = Context.getTypeSize(&FromType); - uint64_t ToTypeSize = Context.getTypeSize(&ToType); + const uint64_t FromTypeSize = Context.getTypeSize(&FromType); + const uint64_t ToTypeSize = Context.getTypeSize(&ToType); if (FromTypeSize == ToTypeSize) { return true; } @@ -406,8 +406,8 @@ void NarrowingConversionsCheck::handleIntegralCast(const ASTContext &Context, // With this option, we don't warn on conversions that have equivalent width // in bits. eg. uint32 <-> int32. if (!WarnOnEquivalentBitWidth) { - uint64_t FromTypeSize = Context.getTypeSize(FromType); - uint64_t ToTypeSize = Context.getTypeSize(ToType); + const uint64_t FromTypeSize = Context.getTypeSize(FromType); + const uint64_t ToTypeSize = Context.getTypeSize(ToType); if (FromTypeSize == ToTypeSize) return; } @@ -583,7 +583,7 @@ void NarrowingConversionsCheck::handleImplicitCast( return; if (handleConditionalOperator(Context, Lhs, Rhs)) return; - SourceLocation SourceLoc = Lhs.getExprLoc(); + const SourceLocation SourceLoc = Lhs.getExprLoc(); switch (Cast.getCastKind()) { case CK_BooleanToSignedIntegral: handleBooleanToSignedIntegral(Context, SourceLoc, Lhs, Rhs); diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h index 9631c71dee64e..e506e5b0315db 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H #include "../ClangTidyCheck.h" @@ -108,4 +108,4 @@ class NarrowingConversionsCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWINGCONVERSIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp index abde115d10a1b..40305cab81c7f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.cpp @@ -60,7 +60,7 @@ void NondeterministicPointerIterationOrderCheck::check( TemplateArgs[0].getAsType()->isPointerType(); if (IsAlgoArgPointer) { - SourceRange R = RangeInit->getSourceRange(); + const SourceRange R = RangeInit->getSourceRange(); diag(R.getBegin(), "iteration of pointers is nondeterministic") << R; } } @@ -69,7 +69,7 @@ void NondeterministicPointerIterationOrderCheck::check( const auto *SortPointers = Result.Nodes.getNodeAs("sortsemantic"); if ((SortPointers) && 
!(SortPointers->getBeginLoc().isMacroID())) { - SourceRange R = SortPointers->getSourceRange(); + const SourceRange R = SortPointers->getSourceRange(); diag(R.getBegin(), "sorting pointers is nondeterministic") << R; } } diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h index 054d5804745bc..46b4e12508629 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class NondeterministicPointerIterationOrderCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTICPOINTERITERATIONORDERCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp index 08fae7b59bae5..7198c1b1c8aaf 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp @@ -120,18 +120,18 @@ static int getGivenLength(const MatchFinder::MatchResult &Result) { if (Result.Nodes.getNodeAs(UnknownLengthName)) return 0; - if (int Length = + if (const int Length = getLength(Result.Nodes.getNodeAs(WrongLengthExprName), Result)) return Length; - if (int Length = + if (const int Length = getLength(Result.Nodes.getNodeAs(LengthExprName), Result)) return Length; // Special case, for example 'strlen("foo")'. if (const CallExpr *StrlenCE = getStrlenExpr(Result)) if (const Expr *Arg = StrlenCE->getArg(0)->IgnoreImpCasts()) - if (int ArgLength = getLength(Arg, Result)) + if (const int ArgLength = getLength(Arg, Result)) return ArgLength; return 0; @@ -174,9 +174,9 @@ static bool isKnownDest(const MatchFinder::MatchResult &Result) { // True if the capacity of the destination array is based on the given length, // therefore we assume that it cannot overflow (e.g. 
'malloc(given_length + 1)' static bool isDestBasedOnGivenLength(const MatchFinder::MatchResult &Result) { - StringRef DestCapacityExprStr = + const StringRef DestCapacityExprStr = exprToStr(getDestCapacityExpr(Result), Result).trim(); - StringRef LengthExprStr = + const StringRef LengthExprStr = exprToStr(Result.Nodes.getNodeAs(LengthExprName), Result).trim(); return !DestCapacityExprStr.empty() && !LengthExprStr.empty() && @@ -226,8 +226,9 @@ isGivenLengthEqualToSrcLength(const MatchFinder::MatchResult &Result) { if (isStringDataAndLength(Result)) return true; - int GivenLength = getGivenLength(Result); - int SrcLength = getLength(Result.Nodes.getNodeAs(SrcExprName), Result); + const int GivenLength = getGivenLength(Result); + const int SrcLength = + getLength(Result.Nodes.getNodeAs(SrcExprName), Result); if (GivenLength != 0 && SrcLength != 0 && GivenLength == SrcLength) return true; @@ -261,15 +262,15 @@ static bool isDestCapacityOverflows(const MatchFinder::MatchResult &Result) { return true; const Expr *DestCapacityExpr = getDestCapacityExpr(Result); - int DestCapacity = getLength(DestCapacityExpr, Result); - int GivenLength = getGivenLength(Result); + const int DestCapacity = getLength(DestCapacityExpr, Result); + const int GivenLength = getGivenLength(Result); if (GivenLength != 0 && DestCapacity != 0) return isGivenLengthEqualToSrcLength(Result) && DestCapacity == GivenLength; // Assume that the destination array's capacity cannot overflow if the // expression of the memory allocation contains '+ 1'. - StringRef DestCapacityExprStr = exprToStr(DestCapacityExpr, Result); + const StringRef DestCapacityExprStr = exprToStr(DestCapacityExpr, Result); if (DestCapacityExprStr.contains("+1") || DestCapacityExprStr.contains("+ 1")) return false; @@ -297,7 +298,7 @@ static void lengthExprHandle(const Expr *LengthExpr, // See whether we work with a macro. bool IsMacroDefinition = false; - StringRef LengthExprStr = exprToStr(LengthExpr, Result); + const StringRef LengthExprStr = exprToStr(LengthExpr, Result); Preprocessor::macro_iterator It = PP->macro_begin(); while (It != PP->macro_end() && !IsMacroDefinition) { if (It->first->getName() == LengthExprStr) @@ -309,7 +310,7 @@ static void lengthExprHandle(const Expr *LengthExpr, // Try to obtain an 'IntegerLiteral' and adjust it. if (!IsMacroDefinition) { if (const auto *LengthIL = dyn_cast(LengthExpr)) { - uint64_t NewLength = + const uint64_t NewLength = LengthIL->getValue().getZExtValue() + (LengthHandle == LengthHandleKind::Increase ? 1 : -1); @@ -347,7 +348,7 @@ static void lengthExprHandle(const Expr *LengthExpr, } // Try to inject the '+ 1'/'- 1' string. - bool NeedInnerParen = BO && BO->getOpcode() != BO_Add; + const bool NeedInnerParen = BO && BO->getOpcode() != BO_Add; if (NeedInnerParen) Diag << FixItHint::CreateInsertion(LengthExpr->getBeginLoc(), "("); @@ -384,8 +385,8 @@ static bool isDestExprFix(const MatchFinder::MatchResult &Result, if (!Dest) return false; - std::string TempTyStr = Dest->getType().getAsString(); - StringRef TyStr = TempTyStr; + const std::string TempTyStr = Dest->getType().getAsString(); + const StringRef TyStr = TempTyStr; if (TyStr.starts_with("char") || TyStr.starts_with("wchar_t")) return false; @@ -397,7 +398,7 @@ static bool isDestExprFix(const MatchFinder::MatchResult &Result, // increase the capacity by one to create space for the null terminator. 
static bool isDestCapacityFix(const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityOverflows(Result); + const bool IsOverflows = isDestCapacityOverflows(Result); if (IsOverflows) if (const Expr *CapacityExpr = getDestCapacityExpr(Result)) lengthExprHandle(CapacityExpr, LengthHandleKind::Increase, Result, Diag); @@ -424,9 +425,9 @@ static void renameFunc(StringRef NewFuncName, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); - int FuncNameLength = + const int FuncNameLength = FunctionExpr->getDirectCallee()->getIdentifier()->getLength(); - SourceRange FuncNameRange( + const SourceRange FuncNameRange( FunctionExpr->getBeginLoc(), FunctionExpr->getBeginLoc().getLocWithOffset(FuncNameLength - 1)); @@ -451,7 +452,7 @@ static void insertDestCapacityArg(bool IsOverflows, StringRef Name, const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); SmallString<64> NewSecondArg; - if (int DestLength = getDestCapacity(Result)) { + if (const int DestLength = getDestCapacity(Result)) { NewSecondArg = Twine(IsOverflows ? DestLength + 1 : DestLength).str(); } else { NewSecondArg = @@ -470,12 +471,12 @@ static void insertNullTerminatorExpr(StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { const auto *FunctionExpr = Result.Nodes.getNodeAs(FunctionExprName); - int FuncLocStartColumn = Result.SourceManager->getPresumedColumnNumber( + const int FuncLocStartColumn = Result.SourceManager->getPresumedColumnNumber( FunctionExpr->getBeginLoc()); - SourceRange SpaceRange( + const SourceRange SpaceRange( FunctionExpr->getBeginLoc().getLocWithOffset(-FuncLocStartColumn + 1), FunctionExpr->getBeginLoc()); - StringRef SpaceBeforeStmtStr = Lexer::getSourceText( + const StringRef SpaceBeforeStmtStr = Lexer::getSourceText( CharSourceRange::getCharRange(SpaceRange), *Result.SourceManager, Result.Context->getLangOpts(), nullptr); @@ -717,10 +718,10 @@ void NotNullTerminatedResultCheck::registerMatchers(MatchFinder *Finder) { }; auto MatchCall = [=](CallContext CC) { - std::string CharHandlerFuncName = "::" + CC.Name.str(); + const std::string CharHandlerFuncName = "::" + CC.Name.str(); // Try to match with 'wchar_t' based function calls. - std::string WcharHandlerFuncName = + const std::string WcharHandlerFuncName = "::" + (CC.Name.starts_with("mem") ? 
"w" + CC.Name.str() : "wcs" + CC.Name.substr(3).str()); @@ -804,7 +805,8 @@ void NotNullTerminatedResultCheck::check( if (MI) { const auto &T = MI->tokens().back(); if (T.isLiteral() && T.getLiteralData()) { - StringRef ValueStr = StringRef(T.getLiteralData(), T.getLength()); + const StringRef ValueStr = + StringRef(T.getLiteralData(), T.getLength()); llvm::APInt IntValue; ValueStr.getAsInteger(10, IntValue); AreSafeFunctionsWanted = IntValue.getZExtValue(); @@ -819,7 +821,7 @@ void NotNullTerminatedResultCheck::check( UseSafeFunctions = *AreSafeFunctionsWanted; } - StringRef Name = FunctionExpr->getDirectCallee()->getName(); + const StringRef Name = FunctionExpr->getDirectCallee()->getName(); if (Name.starts_with("mem") || Name.starts_with("wmem")) memoryHandlerFunctionFix(Name, Result); else if (Name == "strerror_s") @@ -864,16 +866,16 @@ void NotNullTerminatedResultCheck::memoryHandlerFunctionFix( void NotNullTerminatedResultCheck::memcpyFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityFix(Result, Diag); - bool IsDestFixed = isDestExprFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsDestFixed = isDestExprFix(Result, Diag); - bool IsCopy = + const bool IsCopy = isGivenLengthEqualToSrcLength(Result) || isDestBasedOnGivenLength(Result); - bool IsSafe = UseSafeFunctions && IsOverflows && isKnownDest(Result) && - !isDestBasedOnGivenLength(Result); + const bool IsSafe = UseSafeFunctions && IsOverflows && isKnownDest(Result) && + !isDestBasedOnGivenLength(Result); - bool IsDestLengthNotRequired = + const bool IsDestLengthNotRequired = IsSafe && getLangOpts().CPlusPlus && Result.Nodes.getNodeAs(DestArrayTyName) && !IsDestFixed; @@ -892,14 +894,14 @@ void NotNullTerminatedResultCheck::memcpyFix( void NotNullTerminatedResultCheck::memcpySFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) { - bool IsOverflows = isDestCapacityFix(Result, Diag); - bool IsDestFixed = isDestExprFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsDestFixed = isDestExprFix(Result, Diag); - bool RemoveDestLength = getLangOpts().CPlusPlus && - Result.Nodes.getNodeAs(DestArrayTyName) && - !IsDestFixed; - bool IsCopy = isGivenLengthEqualToSrcLength(Result); - bool IsSafe = IsOverflows; + const bool RemoveDestLength = + getLangOpts().CPlusPlus && + Result.Nodes.getNodeAs(DestArrayTyName) && !IsDestFixed; + const bool IsCopy = isGivenLengthEqualToSrcLength(Result); + const bool IsSafe = IsOverflows; renameMemcpy(Name, IsCopy, IsSafe, Result, Diag); @@ -932,7 +934,7 @@ void NotNullTerminatedResultCheck::memchrFix( Diag << CastRemoveFix; } - StringRef NewFuncName = (Name[0] != 'w') ? "strchr" : "wcschr"; + const StringRef NewFuncName = (Name[0] != 'w') ? "strchr" : "wcschr"; renameFunc(NewFuncName, Result, Diag); removeArg(2, Result, Diag); } @@ -940,7 +942,7 @@ void NotNullTerminatedResultCheck::memchrFix( void NotNullTerminatedResultCheck::memmoveFix( StringRef Name, const MatchFinder::MatchResult &Result, DiagnosticBuilder &Diag) const { - bool IsOverflows = isDestCapacityFix(Result, Diag); + const bool IsOverflows = isDestCapacityFix(Result, Diag); if (UseSafeFunctions && isKnownDest(Result)) { renameFunc((Name[0] != 'w') ? 
"memmove_s" : "wmemmove_s", Result, Diag); @@ -970,15 +972,15 @@ void NotNullTerminatedResultCheck::ncmpFix( if (const CallExpr *StrlenExpr = getStrlenExpr(Result)) { const Expr *LengthExprArg = StrlenExpr->getArg(0); - StringRef FirstExprStr = exprToStr(FirstArgExpr, Result).trim(); - StringRef SecondExprStr = exprToStr(SecondArgExpr, Result).trim(); - StringRef LengthArgStr = exprToStr(LengthExprArg, Result).trim(); + const StringRef FirstExprStr = exprToStr(FirstArgExpr, Result).trim(); + const StringRef SecondExprStr = exprToStr(SecondArgExpr, Result).trim(); + const StringRef LengthArgStr = exprToStr(LengthExprArg, Result).trim(); IsLengthTooLong = LengthArgStr == FirstExprStr || LengthArgStr == SecondExprStr; } else { - int SrcLength = + const int SrcLength = getLength(Result.Nodes.getNodeAs(SrcExprName), Result); - int GivenLength = getGivenLength(Result); + const int GivenLength = getGivenLength(Result); if (SrcLength != 0 && GivenLength != 0) IsLengthTooLong = GivenLength > SrcLength; } diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h index a8f4ca32a0b5b..cf61eb5c585f4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H #include "../ClangTidyCheck.h" @@ -60,4 +60,4 @@ class NotNullTerminatedResultCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOT_NULL_TERMINATED_RESULT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NOTNULLTERMINATEDRESULTCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp index 57196adf38fb6..0084ace7d0fcc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.cpp @@ -66,8 +66,8 @@ void PosixReturnCheck::registerMatchers(MatchFinder *Finder) { void PosixReturnCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *LessThanZeroOp = Result.Nodes.getNodeAs("ltzop")) { - SourceLocation OperatorLoc = LessThanZeroOp->getOperatorLoc(); - StringRef NewBinOp = + const SourceLocation OperatorLoc = LessThanZeroOp->getOperatorLoc(); + const StringRef NewBinOp = LessThanZeroOp->getOpcode() == BinaryOperator::Opcode::BO_LT ? 
">" : "<"; diag(OperatorLoc, "the comparison always evaluates to false because %0 " diff --git a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h index d72c86c060fb9..a9cb7a6e34477 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/PosixReturnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H #include "../ClangTidyCheck.h" @@ -23,4 +23,4 @@ class PosixReturnCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIX_RETURN_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_POSIXRETURNCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp similarity index 84% rename from clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp index b8bca7286ce69..3e32e9b8a704c 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.cpp @@ -6,29 +6,28 @@ // //===----------------------------------------------------------------------===// -#include "ProperlySeededRandomGeneratorCheck.h" +#include "RandomGeneratorSeedCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "llvm/ADT/STLExtras.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -ProperlySeededRandomGeneratorCheck::ProperlySeededRandomGeneratorCheck( - StringRef Name, ClangTidyContext *Context) +RandomGeneratorSeedCheck::RandomGeneratorSeedCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawDisallowedSeedTypes( Options.get("DisallowedSeedTypes", "time_t,std::time_t")) { RawDisallowedSeedTypes.split(DisallowedSeedTypes, ','); } -void ProperlySeededRandomGeneratorCheck::storeOptions( - ClangTidyOptions::OptionMap &Opts) { +void RandomGeneratorSeedCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "DisallowedSeedTypes", RawDisallowedSeedTypes); } -void ProperlySeededRandomGeneratorCheck::registerMatchers(MatchFinder *Finder) { +void RandomGeneratorSeedCheck::registerMatchers(MatchFinder *Finder) { auto RandomGeneratorEngineDecl = cxxRecordDecl(hasAnyName( "::std::linear_congruential_engine", "::std::mersenne_twister_engine", "::std::subtract_with_carry_engine", "::std::discard_block_engine", @@ -75,8 +74,7 @@ void ProperlySeededRandomGeneratorCheck::registerMatchers(MatchFinder *Finder) { this); } -void ProperlySeededRandomGeneratorCheck::check( - const MatchFinder::MatchResult &Result) { +void RandomGeneratorSeedCheck::check(const MatchFinder::MatchResult &Result) { const auto *Ctor = Result.Nodes.getNodeAs("ctor"); if (Ctor) checkSeed(Result, Ctor); @@ -91,8 +89,8 @@ void ProperlySeededRandomGeneratorCheck::check( } template -void ProperlySeededRandomGeneratorCheck::checkSeed( - const MatchFinder::MatchResult &Result, const T *Func) { +void 
RandomGeneratorSeedCheck::checkSeed(const MatchFinder::MatchResult &Result, + const T *Func) { if (Func->getNumArgs() == 0 || Func->getArg(0)->isDefaultArgument()) { diag(Func->getExprLoc(), "random number generator seeded with a default argument will generate " @@ -118,4 +116,4 @@ void ProperlySeededRandomGeneratorCheck::checkSeed( } } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h similarity index 67% rename from clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h rename to clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h index 7da01cc857187..c9c54eaa14000 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RandomGeneratorSeedCheck.h @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H #include "../ClangTidyCheck.h" #include -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Random number generator must be seeded properly. /// @@ -20,10 +20,10 @@ namespace clang::tidy::cert { /// constant expression is a security vulnerability. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/msc51-cpp.html -class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/random-generator-seed.html +class RandomGeneratorSeedCheck : public ClangTidyCheck { public: - ProperlySeededRandomGeneratorCheck(StringRef Name, ClangTidyContext *Context); + RandomGeneratorSeedCheck(StringRef Name, ClangTidyContext *Context); void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; @@ -37,6 +37,6 @@ class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck { SmallVector DisallowedSeedTypes; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_PROPERLY_SEEDED_RANDOM_GENERATOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RANDOMGENERATORSEEDCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp similarity index 87% rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp index e266cf995e8a7..e212301047ce2 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "NonTrivialTypesLibcMemoryCallsCheck.h" +#include "RawMemoryCallOnNonTrivialTypeCheck.h" #include "../utils/OptionsUtils.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" 
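Renaming a check across modules (here the cert implementation becoming bugprone-random-generator-seed) also implies re-registering it under its new name in the owning ClangTidyModule. That registration is not part of the hunks shown here; the following is a minimal sketch of the usual pattern, with the class and check name taken from this patch but the module excerpt assumed:

```cpp
// Sketch only: the module file is not shown in this patch.
#include "../ClangTidyModule.h"
#include "../ClangTidyModuleRegistry.h"
#include "RandomGeneratorSeedCheck.h"

namespace clang::tidy::bugprone {
class BugproneModule : public ClangTidyModule {
public:
  void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
    // The new bugprone name replaces the old cert-msc51-cpp registration.
    CheckFactories.registerCheck<RandomGeneratorSeedCheck>(
        "bugprone-random-generator-seed");
  }
};
} // namespace clang::tidy::bugprone
```

In-tree renames of this kind typically keep the old cert name available as an alias in the CERT module so existing configurations keep working.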
@@ -17,7 +17,7 @@ using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { namespace { AST_MATCHER(CXXRecordDecl, isTriviallyDefaultConstructible) { @@ -48,24 +48,24 @@ static constexpr llvm::StringRef ComparisonOperators[] = { "operator==", "operator!=", "operator<", "operator>", "operator<=", "operator>="}; -NonTrivialTypesLibcMemoryCallsCheck::NonTrivialTypesLibcMemoryCallsCheck( +RawMemoryCallOnNonTrivialTypeCheck::RawMemoryCallOnNonTrivialTypeCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), MemSetNames(Options.get("MemSetNames", "")), MemCpyNames(Options.get("MemCpyNames", "")), MemCmpNames(Options.get("MemCmpNames", "")) {} -void NonTrivialTypesLibcMemoryCallsCheck::storeOptions( +void RawMemoryCallOnNonTrivialTypeCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "MemSetNames", MemSetNames); Options.store(Opts, "MemCpyNames", MemCpyNames); Options.store(Opts, "MemCmpNames", MemCmpNames); } -void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( - MatchFinder *Finder) { +void RawMemoryCallOnNonTrivialTypeCheck::registerMatchers(MatchFinder *Finder) { using namespace ast_matchers::internal; - auto IsStructPointer = [](Matcher Constraint = anything(), + auto IsStructPointer = [](const Matcher &Constraint = + anything(), bool Bind = false) { return expr(unaryOperator( hasOperatorName("&"), @@ -75,8 +75,8 @@ void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( }; auto IsRecordSizeOf = expr(sizeOfExpr(hasArgumentOfType(equalsBoundNode("Record")))); - auto ArgChecker = [&](Matcher RecordConstraint, - BindableMatcher SecondArg = expr()) { + auto ArgChecker = [&](const Matcher &RecordConstraint, + const BindableMatcher &SecondArg = expr()) { return allOf(argumentCountIs(3), hasArgument(0, IsStructPointer(RecordConstraint, true)), hasArgument(1, SecondArg), hasArgument(2, IsRecordSizeOf)); @@ -103,7 +103,7 @@ void NonTrivialTypesLibcMemoryCallsCheck::registerMatchers( this); } -void NonTrivialTypesLibcMemoryCallsCheck::check( +void RawMemoryCallOnNonTrivialTypeCheck::check( const MatchFinder::MatchResult &Result) { if (const auto *Caller = Result.Nodes.getNodeAs("lazyConstruct")) { diag(Caller->getBeginLoc(), "calling %0 on a non-trivially default " @@ -122,4 +122,4 @@ void NonTrivialTypesLibcMemoryCallsCheck::check( } } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h similarity index 59% rename from clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h rename to clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h index 4589ce444c878..002aac6d37bfb 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RawMemoryCallOnNonTrivialTypeCheck.h @@ -6,22 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { 
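For context, the diagnostics wired up in this file fire on patterns like the following; a small illustration with made-up types (not from the tests):

```cpp
#include <cstring>
#include <string>

struct Pod { int A; int B; };         // trivial: raw memory calls are fine
struct Managed { std::string Name; }; // non-trivially default constructible

void Zero(Pod *P, Managed *M) {
  std::memset(P, 0, sizeof(Pod)); // OK: trivially default constructible
  std::memset(M, 0, sizeof(Managed)); // flagged: scribbles over the
                                      // std::string's internal state
}
```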
-/// Flags use of the `C` standard library functions 'memset', 'memcpy' and +/// Flags use of the C standard library functions 'memset', 'memcpy' and /// 'memcmp' and similar derivatives on non-trivial types. /// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop57-cpp.html -class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.html +class RawMemoryCallOnNonTrivialTypeCheck : public ClangTidyCheck { public: - NonTrivialTypesLibcMemoryCallsCheck(StringRef Name, - ClangTidyContext *Context); + RawMemoryCallOnNonTrivialTypeCheck(StringRef Name, ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus && !LangOpts.ObjC; } @@ -35,6 +34,6 @@ class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck { const StringRef MemCmpNames; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_NONTRIVIALTYPESLIBCMEMORYCALLSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_RAWMEMORYCALLONNONTRIVIALTYPECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp index 6abe53f47b8f9..528c254dbe17e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.cpp @@ -112,7 +112,7 @@ void RedundantBranchConditionCheck::check( if (isa(InnerIf->getCond()->IgnoreParenImpCasts()) || (BinOpCond && BinOpCond->getOpcode() == BO_LOr)) { - SourceLocation IfBegin = InnerIf->getBeginLoc(); + const SourceLocation IfBegin = InnerIf->getBeginLoc(); const Stmt *Body = InnerIf->getThen(); const Expr *OtherSide = nullptr; if (BinOpCond) { @@ -132,9 +132,9 @@ void RedundantBranchConditionCheck::check( // If the other side has side effects then keep it. if (OtherSide && OtherSide->HasSideEffects(*Result.Context)) { - SourceLocation BeforeOtherSide = + const SourceLocation BeforeOtherSide = OtherSide->getBeginLoc().getLocWithOffset(-1); - SourceLocation AfterOtherSide = + const SourceLocation AfterOtherSide = Lexer::findNextToken(OtherSide->getEndLoc(), *Result.SourceManager, getLangOpts()) ->getLocation(); @@ -161,12 +161,12 @@ void RedundantBranchConditionCheck::check( const auto *LeftDRE = dyn_cast(CondOp->getLHS()->IgnoreParenImpCasts()); if (LeftDRE && LeftDRE->getDecl() == CondVar) { - SourceLocation BeforeRHS = + const SourceLocation BeforeRHS = CondOp->getRHS()->getBeginLoc().getLocWithOffset(-1); Diag << FixItHint::CreateRemoval(CharSourceRange::getTokenRange( CondOp->getLHS()->getBeginLoc(), BeforeRHS)); } else { - SourceLocation AfterLHS = + const SourceLocation AfterLHS = Lexer::findNextToken(CondOp->getLHS()->getEndLoc(), *Result.SourceManager, getLangOpts()) ->getLocation(); diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp index a3265293bef58..1107cefe4d3c6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp @@ -83,7 +83,7 @@ static const Decl *findRVRefOverload(const FunctionDecl &FD, // FIXME: // 1. overload in anonymous namespace // 2. 
forward reference - DeclContext::lookup_result LookupResult = + const DeclContext::lookup_result LookupResult = FD.getParent()->lookup(FD.getNameInfo().getName()); if (LookupResult.isSingleResult()) { return nullptr; diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp index c262b1c05b047..282a3b2581b8b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp @@ -283,7 +283,7 @@ static bool isStandardFunction(const FunctionDecl *FD) { /// This includes all statements that have a class name with "CXX" prefix /// and every other statement that is declared in file ExprCXX.h. static bool isCXXOnlyStmt(const Stmt *S) { - StringRef Name = S->getStmtClassName(); + const StringRef Name = S->getStmtClassName(); if (Name.starts_with("CXX")) return true; // Check for all other class names in ExprCXX.h that have no 'CXX' prefix. @@ -317,7 +317,7 @@ static SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { ParentMapContext &PM = Ctx.getParentMapContext(); DynTypedNode P = DynTypedNode::create(*S); while (P.getSourceRange().isInvalid()) { - DynTypedNodeList PL = PM.getParents(P); + const DynTypedNodeList PL = PM.getParents(P); if (PL.size() != 1) return {}; P = PL[0]; @@ -401,14 +401,15 @@ void SignalHandlerCheck::check(const MatchFinder::MatchResult &Result) { } // FIXME: Update CallGraph::getNode to use canonical decl? - CallGraphNode *HandlerNode = CG.getNode(HandlerDecl->getCanonicalDecl()); + const CallGraphNode *HandlerNode = + CG.getNode(HandlerDecl->getCanonicalDecl()); assert(HandlerNode && "Handler with body should be present in the call graph."); // Start from signal handler and visit every function call. auto Itr = llvm::df_begin(HandlerNode), ItrE = llvm::df_end(HandlerNode); while (Itr != ItrE) { const auto *CallF = dyn_cast((*Itr)->getDecl()); - unsigned int PathL = Itr.getPathLength(); + const unsigned int PathL = Itr.getPathLength(); if (CallF) { // A signal handler or a function transitively reachable from the signal // handler was found to be unsafe. 
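In user-code terms, the call-graph traversal above guards against handlers like this minimal repro (not taken from the tests):

```cpp
#include <csignal>
#include <cstdio>

extern "C" void OnInt(int) {
  std::printf("interrupted\n"); // printf is not async-signal-safe: flagged
}

int main() {
  std::signal(SIGINT, OnInt); // the check walks the call graph rooted here
}
```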
@@ -434,8 +435,8 @@ void SignalHandlerCheck::check(const MatchFinder::MatchResult &Result) { bool SignalHandlerCheck::checkFunction( const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter) { - bool FunctionIsCalled = isa(CallOrRef); + llvm::function_ref ChainReporter) { + const bool FunctionIsCalled = isa(CallOrRef); if (isStandardFunction(FD)) { if (!isStandardFunctionAsyncSafe(FD)) { @@ -470,7 +471,7 @@ bool SignalHandlerCheck::checkFunction( bool SignalHandlerCheck::checkFunctionCPP14( const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter) { + llvm::function_ref ChainReporter) { if (!FD->isExternC()) { diag(CallOrRef->getBeginLoc(), "functions without C linkage are not allowed as signal " @@ -492,7 +493,7 @@ bool SignalHandlerCheck::checkFunctionCPP14( for (const auto &Match : Matches) { const auto *FoundS = Match.getNodeAs("stmt"); if (isCXXOnlyStmt(FoundS)) { - SourceRange R = getSourceRangeOfStmt(FoundS, Ctx); + const SourceRange R = getSourceRangeOfStmt(FoundS, Ctx); if (R.isInvalid()) continue; diag(R.getBegin(), @@ -531,7 +532,7 @@ bool SignalHandlerCheck::isStandardFunctionAsyncSafe( } void SignalHandlerCheck::reportHandlerChain( - const llvm::df_iterator &Itr, + const llvm::df_iterator &Itr, const DeclRefExpr *HandlerRef, bool SkipPathEnd) { int CallLevel = Itr.getPathLength() - 2; assert(CallLevel >= -1 && "Empty iterator?"); diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h index b5317793cbf45..324b2c88207fd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h @@ -48,10 +48,10 @@ class SignalHandlerCheck : public ClangTidyCheck { /// The bool parameter is used like \c SkipPathEnd in \c reportHandlerChain . /// \return Returns true if a diagnostic was emitted for this function. bool checkFunction(const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter); + llvm::function_ref ChainReporter); /// Similar as \c checkFunction but only check for C++14 rules. bool checkFunctionCPP14(const FunctionDecl *FD, const Expr *CallOrRef, - std::function ChainReporter); + llvm::function_ref ChainReporter); /// Returns true if a standard library function is considered /// asynchronous-safe. bool isStandardFunctionAsyncSafe(const FunctionDecl *FD) const; @@ -65,8 +65,9 @@ class SignalHandlerCheck : public ClangTidyCheck { /// registered as signal handler. /// @param SkipPathEnd If true the last item of the call chain (farthest away /// from the \c signal call) is omitted from note generation. 
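The std::function to llvm::function_ref switch above follows the usual LLVM guidance for non-owning, call-only callback parameters; a sketch of the difference (header path per current LLVM, the signature mirrors ChainReporter but is illustrative):

```cpp
#include "llvm/ADT/STLFunctionalExtras.h" // defines llvm::function_ref
#include <functional>

// std::function owns (and may heap-allocate) a copy of the callable.
static bool ReportStd(std::function<bool(bool)> Report) { return Report(true); }

// llvm::function_ref is two pointers, never allocates, and must not outlive
// the callable it refers to -- a good fit for a parameter that is only
// invoked during the call, as ChainReporter is here.
static bool ReportRef(llvm::function_ref<bool(bool)> Report) {
  return Report(/*SkipPathEnd=*/true);
}
```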
- void reportHandlerChain(const llvm::df_iterator &Itr, - const DeclRefExpr *HandlerRef, bool SkipPathEnd); + void + reportHandlerChain(const llvm::df_iterator &Itr, + const DeclRefExpr *HandlerRef, bool SkipPathEnd); clang::CallGraph CG; diff --git a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp index 742d85bb7bab9..31c5413b8aa4c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.cpp @@ -140,7 +140,7 @@ void SignedCharMisuseCheck::check(const MatchFinder::MatchResult &Result) { if (!SignedCastExpression->isValueDependent() && SignedCastExpression->getSubExpr()->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value = EVResult.Val.getInt(); + const llvm::APSInt Value = EVResult.Val.getInt(); if (Value.isNonNegative()) return; } @@ -154,7 +154,7 @@ void SignedCharMisuseCheck::check(const MatchFinder::MatchResult &Result) { if (!UnSignedCastExpression->isValueDependent() && UnSignedCastExpression->getSubExpr()->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value = EVResult.Val.getInt(); + const llvm::APSInt Value = EVResult.Val.getInt(); if (Value <= UnsignedASCIIUpperBound) return; } diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 2672dc74f82f7..49ba3b83795dd 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -407,9 +407,9 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *ElementTy = Result.Nodes.getNodeAs("elem-type"); const auto *PointedTy = Result.Nodes.getNodeAs("elem-ptr-type"); - CharUnits NumeratorSize = getSizeOfType(Ctx, NumTy); - CharUnits DenominatorSize = getSizeOfType(Ctx, DenomTy); - CharUnits ElementSize = getSizeOfType(Ctx, ElementTy); + const CharUnits NumeratorSize = getSizeOfType(Ctx, NumTy); + const CharUnits DenominatorSize = getSizeOfType(Ctx, DenomTy); + const CharUnits ElementSize = getSizeOfType(Ctx, ElementTy); if (DenominatorSize > CharUnits::Zero() && !NumeratorSize.isMultipleOf(DenominatorSize)) { diff --git a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp index ee797ecb694bd..af478b105fdd1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SmartPtrArrayMismatchCheck.cpp @@ -15,13 +15,11 @@ using namespace clang::ast_matchers; namespace clang::tidy::bugprone { -namespace { +static constexpr char ConstructExprN[] = "found_construct_expr"; +static constexpr char NewExprN[] = "found_new_expr"; +static constexpr char ConstructorN[] = "found_constructor"; -constexpr char ConstructExprN[] = "found_construct_expr"; -constexpr char NewExprN[] = "found_new_expr"; -constexpr char ConstructorN[] = "found_constructor"; - -bool isInSingleDeclStmt(const DeclaratorDecl *D) { +static bool isInSingleDeclStmt(const DeclaratorDecl *D) { const DynTypedNodeList Parents = D->getASTContext().getParentMapContext().getParents(*D); for (const DynTypedNode &PNode : Parents) @@ -30,8 +28,8 @@ bool isInSingleDeclStmt(const DeclaratorDecl *D) { return false; } -const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr, - ASTContext &Ctx) { +static const DeclaratorDecl * 
+getConstructedVarOrField(const Expr *FoundConstructExpr, ASTContext &Ctx) { const DynTypedNodeList ConstructParents = Ctx.getParentMapContext().getParents(*FoundConstructExpr); if (ConstructParents.size() != 1) @@ -43,8 +41,6 @@ const DeclaratorDecl *getConstructedVarOrField(const Expr *FoundConstructExpr, return nullptr; } -} // namespace - const char SmartPtrArrayMismatchCheck::PointerTypeN[] = "pointer_type"; SmartPtrArrayMismatchCheck::SmartPtrArrayMismatchCheck( @@ -97,10 +93,10 @@ void SmartPtrArrayMismatchCheck::check(const MatchFinder::MatchResult &Result) { assert(TSTypeLoc.getNumArgs() >= 1 && "Matched type should have at least 1 template argument."); - SourceRange TemplateArgumentRange = TSTypeLoc.getArgLoc(0) - .getTypeSourceInfo() - ->getTypeLoc() - .getSourceRange(); + const SourceRange TemplateArgumentRange = TSTypeLoc.getArgLoc(0) + .getTypeSourceInfo() + ->getTypeLoc() + .getSourceRange(); D << TemplateArgumentRange; if (isInSingleDeclStmt(VarOrField)) { @@ -108,7 +104,7 @@ void SmartPtrArrayMismatchCheck::check(const MatchFinder::MatchResult &Result) { if (!utils::rangeCanBeFixed(TemplateArgumentRange, &SM)) return; - SourceLocation InsertLoc = Lexer::getLocForEndOfToken( + const SourceLocation InsertLoc = Lexer::getLocForEndOfToken( TemplateArgumentRange.getEnd(), 0, SM, Ctx.getLangOpts()); D << FixItHint::CreateInsertion(InsertLoc, "[]"); } diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp index 1e8058bc4abc9..a093b094de4c7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp @@ -77,7 +77,7 @@ void SpuriouslyWakeUpFunctionsCheck::registerMatchers(MatchFinder *Finder) { void SpuriouslyWakeUpFunctionsCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedWait = Result.Nodes.getNodeAs("wait"); - StringRef WaitName = MatchedWait->getDirectCallee()->getName(); + const StringRef WaitName = MatchedWait->getDirectCallee()->getName(); diag(MatchedWait->getExprLoc(), "'%0' should be placed inside a while statement %select{|or used with a " "conditional parameter}1") diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp index a7958cc229ffe..056ae4b80f109 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp @@ -117,12 +117,13 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { if (ParentReturnStmt) return; - SourceLocation MemberLoc = MemberCall->getBeginLoc(); - SourceLocation ReplacementLoc = MemberCall->getExprLoc(); - SourceRange ReplacementRange = SourceRange(ReplacementLoc, ReplacementLoc); + const SourceLocation MemberLoc = MemberCall->getBeginLoc(); + const SourceLocation ReplacementLoc = MemberCall->getExprLoc(); + const SourceRange ReplacementRange = + SourceRange(ReplacementLoc, ReplacementLoc); ASTContext &Context = MemberCall->getRecordDecl()->getASTContext(); - DeclarationName Name = + const DeclarationName Name = Context.DeclarationNames.getIdentifier(&Context.Idents.get("clear")); auto Candidates = HeuristicResolver(Context).lookupDependentName( @@ -133,11 +134,12 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { !llvm::cast(ND)->isConst(); }); - bool HasClear = !Candidates.empty(); + const bool 
HasClear = !Candidates.empty(); if (HasClear) { const auto *Clear = llvm::cast(Candidates.at(0)); - QualType RangeType = MemberCall->getImplicitObjectArgument()->getType(); - bool QualifierIncompatible = + const QualType RangeType = + MemberCall->getImplicitObjectArgument()->getType(); + const bool QualifierIncompatible = (!Clear->isVolatile() && RangeType.isVolatileQualified()) || RangeType.isConstQualified(); if (!QualifierIncompatible) { @@ -162,8 +164,8 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { if (NonMemberCall->getNumArgs() != 1) return; - SourceLocation NonMemberLoc = NonMemberCall->getExprLoc(); - SourceLocation NonMemberEndLoc = NonMemberCall->getEndLoc(); + const SourceLocation NonMemberLoc = NonMemberCall->getExprLoc(); + const SourceLocation NonMemberEndLoc = NonMemberCall->getEndLoc(); const Expr *Arg = NonMemberCall->getArg(0); CXXRecordDecl *ArgRecordDecl = Arg->getType()->getAsCXXRecordDecl(); @@ -171,7 +173,7 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { return; ASTContext &Context = ArgRecordDecl->getASTContext(); - DeclarationName Name = + const DeclarationName Name = Context.DeclarationNames.getIdentifier(&Context.Idents.get("clear")); auto Candidates = HeuristicResolver(Context).lookupDependentName( @@ -182,20 +184,20 @@ void StandaloneEmptyCheck::check(const MatchFinder::MatchResult &Result) { !llvm::cast(ND)->isConst(); }); - bool HasClear = !Candidates.empty(); + const bool HasClear = !Candidates.empty(); if (HasClear) { const auto *Clear = llvm::cast(Candidates.at(0)); - bool QualifierIncompatible = + const bool QualifierIncompatible = (!Clear->isVolatile() && Arg->getType().isVolatileQualified()) || Arg->getType().isConstQualified(); if (!QualifierIncompatible) { - std::string ReplacementText = + const std::string ReplacementText = std::string(Lexer::getSourceText( CharSourceRange::getTokenRange(Arg->getSourceRange()), *Result.SourceManager, getLangOpts())) + ".clear()"; - SourceRange ReplacementRange = + const SourceRange ReplacementRange = SourceRange(NonMemberLoc, NonMemberEndLoc); diag(NonMemberLoc, "ignoring the result of '%0'; did you mean 'clear()'?") diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 93% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..1dff741be3c08 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -20,12 +20,12 @@ AST_POLYMORPHIC_MATCHER_P( AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl, TemplateSpecializationType, FunctionDecl), clang::ast_matchers::internal::Matcher, InnerMatcher) { - ArrayRef Args = + const ArrayRef Args = clang::ast_matchers::internal::getTemplateSpecializationArgs(Node); for (const auto &Arg : Args) { if (Arg.getKind() != TemplateArgument::Pack) continue; - ArrayRef PackArgs = Arg.getPackAsArray(); + const ArrayRef PackArgs = Arg.getPackAsArray(); if (matchesFirstInRange(InnerMatcher, PackArgs.begin(), 
PackArgs.end(), Finder, Builder) != PackArgs.end()) return true; @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); const auto *NS = Result.Nodes.getNodeAs("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
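A short illustration of the distinction this check draws (MyType is a placeholder):

```cpp
#include <cstddef>
#include <functional>

struct MyType { int Id; };

namespace std {
// Specializing a standard template for a program-defined type is permitted:
template <> struct hash<MyType> {
  size_t operator()(const MyType &V) const noexcept {
    return static_cast<size_t>(V.Id);
  }
};

// Adding a brand-new declaration to std is undefined behavior and is flagged:
void myHelper(); // flagged by bugprone-std-namespace-modification
} // namespace std
```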
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp index 832377e376feb..d2e631e539b78 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp @@ -29,8 +29,8 @@ static std::vector removeNamespaces(const std::vector &Names) { std::vector Result; Result.reserve(Names.size()); - for (StringRef Name : Names) { - std::string::size_type ColonPos = Name.rfind(':'); + for (const StringRef Name : Names) { + const std::string::size_type ColonPos = Name.rfind(':'); Result.push_back( Name.substr(ColonPos == std::string::npos ? 0 : ColonPos + 1)); } @@ -168,7 +168,7 @@ void StringConstructorCheck::check(const MatchFinder::MatchResult &Result) { const ASTContext &Ctx = *Result.Context; const auto *E = Result.Nodes.getNodeAs("constructor"); assert(E && "missing constructor expression"); - SourceLocation Loc = E->getBeginLoc(); + const SourceLocation Loc = E->getBeginLoc(); if (Result.Nodes.getNodeAs("swapped-parameter")) { const Expr *P0 = E->getArg(0); diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h index 0d7a203a52e12..9c08e4bc3f3f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class StringConstructorCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRING_CONSTRUCTOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STRINGCONSTRUCTORCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp index 93a55ef549896..8454fd1045673 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.cpp @@ -129,7 +129,7 @@ 
void StringIntegerAssignmentCheck::check( const auto *Argument = Result.Nodes.getNodeAs("expr"); const auto CharType = Result.Nodes.getNodeAs("type")->getCanonicalType(); - SourceLocation Loc = Argument->getBeginLoc(); + const SourceLocation Loc = Argument->getBeginLoc(); // Try to detect a few common expressions to reduce false positives. if (CharExpressionDetector(CharType, *Result.Context) @@ -145,7 +145,7 @@ void StringIntegerAssignmentCheck::check( if (Loc.isMacroID()) return; - bool IsWideCharType = CharType->isWideCharType(); + const bool IsWideCharType = CharType->isWideCharType(); if (!CharType->isCharType() && !IsWideCharType) return; bool IsOneDigit = false; @@ -155,7 +155,7 @@ void StringIntegerAssignmentCheck::check( IsLiteral = true; } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Argument->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (IsOneDigit) { Diag << FixItHint::CreateInsertion(Loc, IsWideCharType ? "L'" : "'") diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp index 8dbe1c0153f35..ef7f0b5b54eb3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp @@ -54,7 +54,7 @@ static int enumLength(const EnumDecl *EnumDec) { static bool hasDisjointValueRange(const EnumDecl *Enum1, const EnumDecl *Enum2) { - ValueRange Range1(Enum1), Range2(Enum2); + const ValueRange Range1(Enum1), Range2(Enum2); return llvm::APSInt::compareValues(Range1.MaxVal, Range2.MinVal) < 0 || llvm::APSInt::compareValues(Range2.MaxVal, Range1.MinVal) < 0; } @@ -94,9 +94,9 @@ static int countNonPowOfTwoLiteralNum(const EnumDecl *EnumDec) { /// last enumerator is the sum of the lesser values (and initialized by a /// literal) or when it could contain consecutive values. static bool isPossiblyBitMask(const EnumDecl *EnumDec) { - ValueRange VR(EnumDec); - int EnumLen = enumLength(EnumDec); - int NonPowOfTwoCounter = countNonPowOfTwoLiteralNum(EnumDec); + const ValueRange VR(EnumDec); + const int EnumLen = enumLength(EnumDec); + const int NonPowOfTwoCounter = countNonPowOfTwoLiteralNum(EnumDec); return NonPowOfTwoCounter >= 1 && NonPowOfTwoCounter <= 2 && NonPowOfTwoCounter < EnumLen / 2 && (VR.MaxVal - VR.MinVal != EnumLen - 1) && diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp index aaf0594a02dfc..5abbadafc0d63 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.cpp @@ -66,7 +66,7 @@ void SuspiciousIncludePPCallbacks::InclusionDirective( if (!Check.IgnoredRegexString.empty() && Check.IgnoredRegex.match(FileName)) return; - SourceLocation DiagLoc = FilenameRange.getBegin().getLocWithOffset(1); + const SourceLocation DiagLoc = FilenameRange.getBegin().getLocWithOffset(1); const std::optional IFE = utils::getFileExtension(FileName, Check.ImplementationFileExtensions); @@ -81,7 +81,7 @@ void SuspiciousIncludePPCallbacks::InclusionDirective( llvm::sys::path::replace_extension(GuessedFileName, (!HFE.empty() ? "." 
: "") + HFE); - OptionalFileEntryRef File = + const OptionalFileEntryRef File = PP->LookupFile(DiagLoc, GuessedFileName, IsAngled, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); if (File) { diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp index d1df2a8634035..7890afb41addb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.cpp @@ -44,10 +44,10 @@ void SuspiciousMemoryComparisonCheck::check( for (unsigned int ArgIndex = 0; ArgIndex < 2; ++ArgIndex) { const Expr *ArgExpr = CE->getArg(ArgIndex); - QualType ArgType = ArgExpr->IgnoreImplicit()->getType(); + const QualType ArgType = ArgExpr->IgnoreImplicit()->getType(); const Type *PointeeType = ArgType->getPointeeOrArrayElementType(); assert(PointeeType != nullptr && "PointeeType should always be available."); - QualType PointeeQualifiedType(PointeeType, 0); + const QualType PointeeQualifiedType(PointeeType, 0); if (PointeeType->isRecordType()) { if (const RecordDecl *RD = @@ -65,7 +65,7 @@ void SuspiciousMemoryComparisonCheck::check( } if (!PointeeType->isIncompleteType()) { - uint64_t PointeeSize = Ctx.getTypeSize(PointeeType); + const uint64_t PointeeSize = Ctx.getTypeSize(PointeeType); if (ComparedBits && *ComparedBits >= PointeeSize && !Ctx.hasUniqueObjectRepresentations(PointeeQualifiedType)) { diag(CE->getBeginLoc(), diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp index b1d12ba306814..51ae132ce38a6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.cpp @@ -60,7 +60,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { // Case 1: fill_char of memset() is a character '0'. Probably an // integer zero was intended. - SourceRange CharRange = CharZeroFill->getSourceRange(); + const SourceRange CharRange = CharZeroFill->getSourceRange(); auto Diag = diag(CharZeroFill->getBeginLoc(), "memset fill value is char '0', " "potentially mistaken for int 0"); @@ -82,7 +82,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { if (!NumFill->EvaluateAsInt(EVResult, *Result.Context)) return; - llvm::APSInt NumValue = EVResult.Val.getInt(); + const llvm::APSInt NumValue = EVResult.Val.getInt(); if (NumValue >= 0 && NumValue <= UCharMax) return; @@ -110,7 +110,7 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { Expr::EvalResult EVResult; if (!FillChar->isValueDependent() && FillChar->EvaluateAsInt(EVResult, *Result.Context)) { - llvm::APSInt Value1 = EVResult.Val.getInt(); + const llvm::APSInt Value1 = EVResult.Val.getInt(); if (Value1 == 0 || Value1.isNegative()) return; } @@ -120,8 +120,10 @@ void SuspiciousMemsetUsageCheck::check(const MatchFinder::MatchResult &Result) { // and fix-its to swap the arguments. 
auto D = diag(Call->getBeginLoc(), "memset of size zero, potentially swapped arguments"); - StringRef RHSString = tooling::fixit::getText(*ByteCount, *Result.Context); - StringRef LHSString = tooling::fixit::getText(*FillChar, *Result.Context); + const StringRef RHSString = + tooling::fixit::getText(*ByteCount, *Result.Context); + const StringRef LHSString = + tooling::fixit::getText(*FillChar, *Result.Context); if (LHSString.empty() || RHSString.empty()) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h index a1f5f2bfd1a3b..c45f3326733f8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class SuspiciousMemsetUsageCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUS_MEMSET_USAGE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSMEMSETUSAGECHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp index a41f65083653a..cf8bc9794d9ce 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.cpp @@ -19,7 +19,7 @@ static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx, // String literals surrounded by parentheses are assumed to be on purpose. // i.e.: const char* Array[] = { ("a" "b" "c"), "d", [...] 
}; - TraversalKindScope RAII(*Ctx, TK_AsIs); + const TraversalKindScope RAII(*Ctx, TK_AsIs); auto Parents = Ctx->getParents(*Lit); if (Parents.size() == 1 && Parents[0].get() != nullptr) return true; @@ -35,15 +35,15 @@ static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx, // }; const SourceManager &SM = Ctx->getSourceManager(); bool IndentedCorrectly = true; - SourceLocation FirstToken = Lit->getStrTokenLoc(0); - FileID BaseFID = SM.getFileID(FirstToken); - unsigned int BaseIndent = SM.getSpellingColumnNumber(FirstToken); - unsigned int BaseLine = SM.getSpellingLineNumber(FirstToken); + const SourceLocation FirstToken = Lit->getStrTokenLoc(0); + const FileID BaseFID = SM.getFileID(FirstToken); + const unsigned int BaseIndent = SM.getSpellingColumnNumber(FirstToken); + const unsigned int BaseLine = SM.getSpellingLineNumber(FirstToken); for (unsigned int TokNum = 1; TokNum < Lit->getNumConcatenated(); ++TokNum) { - SourceLocation Token = Lit->getStrTokenLoc(TokNum); - FileID FID = SM.getFileID(Token); - unsigned int Indent = SM.getSpellingColumnNumber(Token); - unsigned int Line = SM.getSpellingLineNumber(Token); + const SourceLocation Token = Lit->getStrTokenLoc(TokNum); + const FileID FID = SM.getFileID(Token); + const unsigned int Indent = SM.getSpellingColumnNumber(Token); + const unsigned int Line = SM.getSpellingLineNumber(Token); if (FID != BaseFID || Line != BaseLine + TokNum || Indent <= BaseIndent) { IndentedCorrectly = false; break; @@ -100,7 +100,7 @@ void SuspiciousMissingCommaCheck::check( assert(InitializerList && ConcatenatedLiteral); // Skip small arrays as they often generate false-positive. - unsigned int Size = InitializerList->getNumInits(); + const unsigned int Size = InitializerList->getNumInits(); if (Size < SizeThreshold) return; diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp index b5da8016f2cc8..7cc3630204e63 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.cpp @@ -44,7 +44,7 @@ class IsSamePtrExpr : public StmtVisitor { return false; if (!check(E1->getBase(), E2->getBase())) return false; - DeclAccessPair FD = E1->getFoundDecl(); + const DeclAccessPair FD = E1->getFoundDecl(); return isa(FD.getDecl()) && FD == E2->getFoundDecl(); } @@ -145,7 +145,7 @@ void SuspiciousReallocUsageCheck::check( if (FindAssignToVarBefore{Var, DeclRef, SM}.Visit(Func->getBody())) return; - StringRef CodeOfAssignedExpr = Lexer::getSourceText( + const StringRef CodeOfAssignedExpr = Lexer::getSourceText( CharSourceRange::getTokenRange(PtrResultExpr->getSourceRange()), SM, getLangOpts()); diag(Call->getBeginLoc(), "'%0' may be set to null if 'realloc' fails, which " diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp index 543d31285af8c..9d37fc1e8728e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp @@ -31,7 +31,7 @@ void SuspiciousSemicolonCheck::check(const MatchFinder::MatchResult &Result) { return; const auto *Semicolon = Result.Nodes.getNodeAs("semi"); - SourceLocation LocStart = Semicolon->getBeginLoc(); + const SourceLocation LocStart = Semicolon->getBeginLoc(); if (LocStart.isMacroID()) return; @@ -40,7 +40,7 @@ void SuspiciousSemicolonCheck::check(const 
MatchFinder::MatchResult &Result) { auto Token = utils::lexer::getPreviousToken(LocStart, Ctxt.getSourceManager(), Ctxt.getLangOpts()); auto &SM = *Result.SourceManager; - unsigned SemicolonLine = SM.getSpellingLineNumber(LocStart); + const unsigned SemicolonLine = SM.getSpellingLineNumber(LocStart); const auto *Statement = Result.Nodes.getNodeAs("stmt"); const bool IsIfStmt = isa(Statement); @@ -49,18 +49,20 @@ void SuspiciousSemicolonCheck::check(const MatchFinder::MatchResult &Result) { SM.getSpellingLineNumber(Token.getLocation()) != SemicolonLine) return; - SourceLocation LocEnd = Semicolon->getEndLoc(); - FileID FID = SM.getFileID(LocEnd); - llvm::MemoryBufferRef Buffer = SM.getBufferOrFake(FID, LocEnd); + const SourceLocation LocEnd = Semicolon->getEndLoc(); + const FileID FID = SM.getFileID(LocEnd); + const llvm::MemoryBufferRef Buffer = SM.getBufferOrFake(FID, LocEnd); Lexer Lexer(SM.getLocForStartOfFile(FID), Ctxt.getLangOpts(), Buffer.getBufferStart(), SM.getCharacterData(LocEnd) + 1, Buffer.getBufferEnd()); if (Lexer.LexFromRawLexer(Token)) return; - unsigned BaseIndent = SM.getSpellingColumnNumber(Statement->getBeginLoc()); - unsigned NewTokenIndent = SM.getSpellingColumnNumber(Token.getLocation()); - unsigned NewTokenLine = SM.getSpellingLineNumber(Token.getLocation()); + const unsigned BaseIndent = + SM.getSpellingColumnNumber(Statement->getBeginLoc()); + const unsigned NewTokenIndent = + SM.getSpellingColumnNumber(Token.getLocation()); + const unsigned NewTokenLine = SM.getSpellingLineNumber(Token.getLocation()); if (!IsIfStmt && NewTokenIndent <= BaseIndent && Token.getKind() != tok::l_brace && NewTokenLine != SemicolonLine) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp index 7519685418c8c..5da9240de74dc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.cpp @@ -88,7 +88,7 @@ void SuspiciousStringCompareCheck::registerMatchers(MatchFinder *Finder) { // Add the list of known string compare-like functions and add user-defined // functions. - std::vector FunctionNames = utils::options::parseListPair( + const std::vector FunctionNames = utils::options::parseListPair( KnownStringCompareFunctions, StringCompareLikeFunctions); // Match a call to a string compare functions. 
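The two diagnostic paths bound below ("missing-comparison" and "logical-not-comparison") correspond to patterns like:

```cpp
#include <cstring>

bool Same(const char *A, const char *B) {
  if (strcmp(A, B))         // implicit truth test of a 3-way result:
    return false;           // 'missing-comparison'
  if (!strcmp(A, B))        // '!' applied to a 3-way result:
    return true;            // 'logical-not-comparison'
  return strcmp(A, B) == 0; // explicit comparison, what the fix-its produce
}
```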
@@ -163,7 +163,7 @@ void SuspiciousStringCompareCheck::check( assert(Decl != nullptr && Call != nullptr); if (Result.Nodes.getNodeAs("missing-comparison")) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Call->getRParenLoc(), 0, Result.Context->getSourceManager(), getLangOpts()); @@ -173,10 +173,10 @@ void SuspiciousStringCompareCheck::check( } if (const auto *E = Result.Nodes.getNodeAs("logical-not-comparison")) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Call->getRParenLoc(), 0, Result.Context->getSourceManager(), getLangOpts()); - SourceLocation NotLoc = E->getBeginLoc(); + const SourceLocation NotLoc = E->getBeginLoc(); diag(Call->getBeginLoc(), "function %0 is compared using logical not operator") diff --git a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp index bcedff5ef5aa2..152c0cbd106f5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SwappedArgumentsCheck.cpp @@ -70,7 +70,7 @@ static bool areArgumentsPotentiallySwapped(const QualType LTo, if (LTo == RFrom && REq) return true; - bool LEq = areTypesSemiEqual(LTo, RFrom); + const bool LEq = areTypesSemiEqual(LTo, RFrom); if (RTo == LFrom && LEq) return true; diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp index 56ec5a5af182e..80905e260d5d4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.cpp @@ -44,7 +44,7 @@ void ThrowingStaticInitializationCheck::check( "duration may throw an exception that cannot be caught") << VD << (VD->getStorageDuration() == SD_Static ? 0 : 1); - SourceLocation FuncLocation = Func->getLocation(); + const SourceLocation FuncLocation = Func->getLocation(); if (FuncLocation.isValid()) { diag(FuncLocation, "possibly throwing %select{constructor|function}0 declared here", diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp index 536b6806c66e6..71b785f1c04f1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.cpp @@ -67,21 +67,21 @@ void TooSmallLoopVariableCheck::storeOptions( /// LoopName: The entire for loop (as a ForStmt) /// void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { - StatementMatcher LoopVarMatcher = + const StatementMatcher LoopVarMatcher = expr(ignoringParenImpCasts( anyOf(declRefExpr(to(varDecl(hasType(isInteger())))), memberExpr(member(fieldDecl(hasType(isInteger()))))))) .bind(LoopVarName); // We need to catch only those comparisons which contain any integer cast. - StatementMatcher LoopVarConversionMatcher = traverse( + const StatementMatcher LoopVarConversionMatcher = traverse( TK_AsIs, implicitCastExpr(hasImplicitDestinationType(isInteger()), has(ignoringParenImpCasts(LoopVarMatcher))) .bind(LoopVarCastName)); // We are interested in only those cases when the loop bound is a variable // value (not const, enum, etc.). 
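The bug pattern these matchers are being assembled to find, in its simplest form (names illustrative):

```cpp
extern long Count; // non-constant upper bound

void Walk() {
  for (short I = 0; I < Count; ++I) {
    // flagged: if Count exceeds 32767, 'I' wraps around before it can
    // reach the bound and the loop never terminates.
  }
}
```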
- StatementMatcher LoopBoundMatcher = + const StatementMatcher LoopBoundMatcher = expr(ignoringParenImpCasts(allOf( hasType(isInteger()), unless(integerLiteral()), unless(allOf( @@ -94,7 +94,7 @@ void TooSmallLoopVariableCheck::registerMatchers(MatchFinder *Finder) { // We use the loop increment expression only to make sure we found the right // loop variable. - StatementMatcher IncrementMatcher = + const StatementMatcher IncrementMatcher = expr(ignoringParenImpCasts(hasType(isInteger()))).bind(LoopIncrementName); Finder->addMatcher( @@ -121,14 +121,14 @@ static MagnitudeBits calcMagnitudeBits(const ASTContext &Context, const Expr *IntExpr) { assert(IntExprType->isIntegerType()); - unsigned SignedBits = IntExprType->isUnsignedIntegerType() ? 0U : 1U; + const unsigned SignedBits = IntExprType->isUnsignedIntegerType() ? 0U : 1U; if (const auto *BitField = IntExpr->getSourceBitField()) { - unsigned BitFieldWidth = BitField->getBitWidthValue(); + const unsigned BitFieldWidth = BitField->getBitWidthValue(); return {BitFieldWidth - SignedBits, BitFieldWidth}; } - unsigned IntWidth = Context.getIntWidth(IntExprType); + const unsigned IntWidth = Context.getIntWidth(IntExprType); return {IntWidth - SignedBits, 0U}; } @@ -143,18 +143,18 @@ calcUpperBoundMagnitudeBits(const ASTContext &Context, const Expr *UpperBound, const Expr *RHSE = BinOperator->getRHS()->IgnoreParenImpCasts(); const Expr *LHSE = BinOperator->getLHS()->IgnoreParenImpCasts(); - QualType RHSEType = RHSE->getType(); - QualType LHSEType = LHSE->getType(); + const QualType RHSEType = RHSE->getType(); + const QualType LHSEType = LHSE->getType(); if (!RHSEType->isIntegerType() || !LHSEType->isIntegerType()) return {}; - bool RHSEIsConstantValue = RHSEType->isEnumeralType() || - RHSEType.isConstQualified() || - isa(RHSE); - bool LHSEIsConstantValue = LHSEType->isEnumeralType() || - LHSEType.isConstQualified() || - isa(LHSE); + const bool RHSEIsConstantValue = RHSEType->isEnumeralType() || + RHSEType.isConstQualified() || + isa(RHSE); + const bool LHSEIsConstantValue = LHSEType->isEnumeralType() || + LHSEType.isConstQualified() || + isa(LHSE); // Avoid false positives produced by two constant values. 
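The magnitude-bit bookkeeping in calcMagnitudeBits above reduces to simple arithmetic; spelled out as a standalone sketch of the same computation:

```cpp
// A signed integer of width W can represent magnitudes up to 2^(W-1)-1,
// an unsigned one up to 2^W-1, so the sign bit costs one magnitude bit.
constexpr unsigned MagnitudeBits(unsigned Width, bool IsSigned) {
  return Width - (IsSigned ? 1U : 0U);
}
static_assert(MagnitudeBits(16, /*IsSigned=*/true) == 15);  // short
static_assert(MagnitudeBits(16, /*IsSigned=*/false) == 16); // unsigned short
```

A `short` loop variable therefore cannot safely reach an upper bound that needs 16 magnitude bits, which is exactly the comparison the check performs.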
if (RHSEIsConstantValue && LHSEIsConstantValue) @@ -193,7 +193,7 @@ void TooSmallLoopVariableCheck::check(const MatchFinder::MatchResult &Result) { if (LoopVar->getType() != LoopIncrement->getType()) return; - ASTContext &Context = *Result.Context; + const ASTContext &Context = *Result.Context; const QualType LoopVarType = LoopVar->getType(); const MagnitudeBits LoopVarMagnitudeBits = diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h index 11086fb4bfda1..62bf42da4f9f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h @@ -25,7 +25,8 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck { public: UncheckedOptionalAccessCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - ModelOptions{Options.get("IgnoreSmartPointerDereference", false)} {} + ModelOptions{Options.get("IgnoreSmartPointerDereference", false), + Options.get("IgnoreValueCalls", false)} {} void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { @@ -34,6 +35,7 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override { Options.store(Opts, "IgnoreSmartPointerDereference", ModelOptions.IgnoreSmartPointerDereference); + Options.store(Opts, "IgnoreValueCalls", ModelOptions.IgnoreValueCalls); } private: diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp index d1e7b895f9a35..b82c9d3ffc55b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.cpp @@ -51,7 +51,7 @@ enum class ConversionKind { static ConversionKind classifyConversionFunc(const FunctionDecl *FD) { return llvm::StringSwitch(FD->getName()) - .Cases("atoi", "atol", ConversionKind::ToInt) + .Cases({"atoi", "atol"}, ConversionKind::ToInt) .Case("atoll", ConversionKind::ToLongInt) .Case("atof", ConversionKind::ToDouble) .Default(ConversionKind::None); @@ -76,7 +76,8 @@ static ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO, // Get the conversion specifier and use it to determine the conversion // kind. - analyze_scanf::ScanfConversionSpecifier SCS = FS.getConversionSpecifier(); + const analyze_scanf::ScanfConversionSpecifier SCS = + FS.getConversionSpecifier(); if (SCS.isIntArg()) { switch (FS.getLengthModifier().getKind()) { case analyze_scanf::LengthModifier::AsLongLong: @@ -194,7 +195,7 @@ void UncheckedStringToNumberConversionCheck::check( // The format string comes from the call expression and depends on which // flavor of scanf is called. // Index 0: scanf, vscanf, Index 1: fscanf, sscanf, vfscanf, vsscanf. - unsigned Idx = + const unsigned Idx = (FFD->getName() == "scanf" || FFD->getName() == "vscanf") ? 
// Given the index, see if the call expression argument at that index is diff --git a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h index c9a232a1b177d..409122feefd01 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H #include "../ClangTidyCheck.h" @@ -29,4 +29,4 @@ class UndefinedMemoryManipulationCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINED_MEMORY_MANIPULATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDEFINEDMEMORYMANIPULATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h index 18465f7353b1d..c7cadbf6e1653 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndelegatedConstructorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UndelegatedConstructorCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNDELEGATEDCONSTRUCTOR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNDELEGATEDCONSTRUCTORCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp index bf30753f0e5ef..340b136700c5f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp @@ -16,7 +16,8 @@ namespace { AST_MATCHER_P(CXXTryStmt, hasHandlerFor, ast_matchers::internal::Matcher<QualType>, InnerMatcher) { - for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) { + const unsigned NH = Node.getNumHandlers(); + for (unsigned I = 0; I < NH; ++I) { const CXXCatchStmt *CatchS = Node.getHandler(I); // Check for generic catch handler (match anything).
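// Editor's note (illustrative, not part of the patch): a null caught type is
// the catch-all handler, which matches any exception thrown by operator new:
//   try { P = new int[N]; } catch (...) { /* getCaughtType().isNull() */ }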
if (CatchS->getCaughtType().isNull()) @@ -31,7 +32,7 @@ AST_MATCHER_P(CXXTryStmt, hasHandlerFor, } AST_MATCHER(CXXNewExpr, mayThrow) { - FunctionDecl *OperatorNew = Node.getOperatorNew(); + const FunctionDecl *OperatorNew = Node.getOperatorNew(); if (!OperatorNew) return false; return !OperatorNew->getType()->castAs<FunctionProtoType>()->isNothrow(); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp index bce46572bdeb9..e10b17ca20753 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.cpp @@ -80,17 +80,17 @@ void UnintendedCharOstreamOutputCheck::check( const Expr *Value = Call->getArg(1); const SourceRange SourceRange = Value->getSourceRange(); - DiagnosticBuilder Builder = + const DiagnosticBuilder Builder = diag(Call->getOperatorLoc(), "%0 passed to 'operator<<' outputs as character instead of integer. " "cast to 'unsigned int' to print numeric value or cast to 'char' to " "print as character") << Value->getType() << SourceRange; - QualType T = Value->getType(); + const QualType T = Value->getType(); const Type *UnqualifiedDesugaredType = T->getUnqualifiedDesugaredType(); - llvm::StringRef CastType = CastTypeName.value_or( + const llvm::StringRef CastType = CastTypeName.value_or( UnqualifiedDesugaredType->isSpecificBuiltinType(BuiltinType::SChar) ? "int" : "unsigned int"); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 61ccd26e48c1e..5524c4b484be1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -141,7 +141,7 @@ parseCheckedFunctions(StringRef Option, ClangTidyContext *Context) { std::vector Result; Result.reserve(Functions.size()); - for (StringRef Function : Functions) { if (Function.empty()) continue; @@ -301,7 +301,7 @@ void UnsafeFunctionsCheck::check(const MatchFinder::MatchResult &Result) { if (Custom) { for (const auto &Entry : CustomFunctions) { if (Entry.Pattern.match(*FuncDecl)) { - StringRef Reason = + const StringRef Reason = Entry.Reason.empty() ? "is marked as unsafe" : Entry.Reason.c_str();
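// Editor's note (hypothetical sketch, not part of the patch): a custom entry
// with an empty reason falls back to the generic "is marked as unsafe" text,
// e.g. a configuration value along the lines of
//   CustomFunctions: '::legacyAlloc,modernAlloc,uses a deprecated pool;::rawCopy'
// where the hypothetical '::rawCopy' entry carries no reason of its own; the
// exact option grammar is defined by the check's documentation.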
"is marked as unsafe" : Entry.Reason.c_str(); if (Entry.Replacement.empty()) { diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp index dae679baf14e5..6502fc9bfb89e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp @@ -37,8 +37,8 @@ void UnusedRaiiCheck::registerMatchers(MatchFinder *Finder) { } template -static void reportDiagnostic(DiagnosticBuilder D, const T *Node, SourceRange SR, - bool DefaultConstruction) { +static void reportDiagnostic(const DiagnosticBuilder &D, const T *Node, + SourceRange SR, bool DefaultConstruction) { const char *Replacement = " give_me_a_name"; // If this is a default ctor we have to remove the parens or we'll introduce a diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index c2fc4af86391d..6fbd3922b532d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -25,7 +25,8 @@ namespace { // member function are matched directly with InnerMatcher. AST_MATCHER_P(FunctionDecl, isInstantiatedFrom, Matcher, InnerMatcher) { - FunctionDecl *InstantiatedFrom = Node.getInstantiatedFromMemberFunction(); + const FunctionDecl *InstantiatedFrom = + Node.getInstantiatedFromMemberFunction(); return InnerMatcher.matches(InstantiatedFrom ? *InstantiatedFrom : Node, Finder, Builder); } diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp index efb5ec64689cf..6d134a0e896a0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp @@ -255,7 +255,7 @@ static bool isStandardSmartPointer(const ValueDecl *VD) { if (!ID) return false; - StringRef Name = ID->getName(); + const StringRef Name = ID->getName(); if (Name != "unique_ptr" && Name != "shared_ptr" && Name != "weak_ptr") return false; @@ -369,7 +369,7 @@ void UseAfterMoveFinder::getReinits( if (!S) continue; - SmallVector Matches = + const SmallVector Matches = match(findAll(ReinitMatcher), *S->getStmt(), *Context); for (const auto &Match : Matches) { @@ -506,7 +506,7 @@ void UseAfterMoveCheck::check(const MatchFinder::MatchResult &Result) { if (ContainingCtorInit) { // Collect the constructor initializer expressions. 
bool BeforeMove{true}; - for (CXXCtorInitializer *Init : ContainingCtor->inits()) { + for (const CXXCtorInitializer *Init : ContainingCtor->inits()) { if (BeforeMove && Init->getInit()->IgnoreImplicit() == ContainingCtorInit->IgnoreImplicit()) BeforeMove = false; diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp index cef8b4da7fc17..0d69b9fd88213 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp @@ -37,14 +37,14 @@ static bool isOverrideMethod(const CXXMethodDecl *MD) { static bool checkOverridingFunctionReturnType(const ASTContext *Context, const CXXMethodDecl *BaseMD, const CXXMethodDecl *DerivedMD) { - QualType BaseReturnTy = BaseMD->getType() - ->castAs<FunctionProtoType>() - ->getReturnType() - .getCanonicalType(); - QualType DerivedReturnTy = DerivedMD->getType() - ->castAs<FunctionProtoType>() - ->getReturnType() - .getCanonicalType(); + const QualType BaseReturnTy = BaseMD->getType() + ->castAs<FunctionProtoType>() + ->getReturnType() + .getCanonicalType(); + const QualType DerivedReturnTy = DerivedMD->getType() + ->castAs<FunctionProtoType>() + ->getReturnType() + .getCanonicalType(); if (DerivedReturnTy->isDependentType() || BaseReturnTy->isDependentType()) return false; @@ -63,8 +63,8 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, /// BTy is the class type in return type of BaseMD. For example, /// B* Base::md() /// While BRD is the declaration of B. - QualType DTy = DerivedReturnTy->getPointeeType().getCanonicalType(); - QualType BTy = BaseReturnTy->getPointeeType().getCanonicalType(); + const QualType DTy = DerivedReturnTy->getPointeeType().getCanonicalType(); + const QualType BTy = BaseReturnTy->getPointeeType().getCanonicalType(); const CXXRecordDecl *DRD = DTy->getAsCXXRecordDecl(); const CXXRecordDecl *BRD = BTy->getAsCXXRecordDecl(); @@ -94,7 +94,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, // Check accessibility. // FIXME: We currently only support checking if B is accessible base class // of D, or D is the same class which DerivedMD is in. - bool IsItself = + const bool IsItself = DRD->getCanonicalDecl() == DerivedMD->getParent()->getCanonicalDecl(); bool HasPublicAccess = false; for (const auto &Path : Paths) { @@ -129,8 +129,8 @@ static QualType getDecayedType(QualType Type) { /// \returns true if the param types are the same.
static bool checkParamTypes(const CXXMethodDecl *BaseMD, const CXXMethodDecl *DerivedMD) { - unsigned NumParamA = BaseMD->getNumParams(); - unsigned NumParamB = DerivedMD->getNumParams(); + const unsigned NumParamA = BaseMD->getNumParams(); + const unsigned NumParamB = DerivedMD->getNumParams(); if (NumParamA != NumParamB) return false; @@ -184,10 +184,10 @@ bool VirtualNearMissCheck::isPossibleToBeOverridden( if (!Inserted) return Iter->second; - bool IsPossible = !BaseMD->isImplicit() && !isa<CXXConstructorDecl>(BaseMD) && - !isa<CXXDestructorDecl>(BaseMD) && BaseMD->isVirtual() && - !BaseMD->isOverloadedOperator() && - !isa<CXXConversionDecl>(BaseMD); + const bool IsPossible = + !BaseMD->isImplicit() && !isa<CXXConstructorDecl>(BaseMD) && + !isa<CXXDestructorDecl>(BaseMD) && BaseMD->isVirtual() && + !BaseMD->isOverloadedOperator() && !isa<CXXConversionDecl>(BaseMD); Iter->second = IsPossible; return IsPossible; } @@ -241,7 +241,7 @@ void VirtualNearMissCheck::check(const MatchFinder::MatchResult &Result) { if (isOverriddenByDerivedClass(BaseMD, DerivedRD)) continue; - unsigned EditDistance = BaseMD->getName().edit_distance( + const unsigned EditDistance = BaseMD->getName().edit_distance( DerivedMD->getName(), EditDistanceThreshold); if (EditDistance > 0 && EditDistance <= EditDistanceThreshold) { if (checkOverrideWithoutName(Context, BaseMD, DerivedMD)) { @@ -249,8 +249,8 @@ auto Range = CharSourceRange::getTokenRange( SourceRange(DerivedMD->getLocation())); - bool ApplyFix = !BaseMD->isTemplateInstantiation() && - !DerivedMD->isTemplateInstantiation(); + const bool ApplyFix = !BaseMD->isTemplateInstantiation() && + !DerivedMD->isTemplateInstantiation(); auto Diag = diag(DerivedMD->getBeginLoc(), "method '%0' has a similar name and the same signature as " diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h index 71d302f49ff95..22788177c86bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -60,4 +60,4 @@ class VirtualNearMissCheck : public ClangTidyCheck { } // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUAL_NEAR_MISS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_VIRTUALNEARMISSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index c1ca2cec7a1eb..f46dd4cc6195a 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -11,12 +11,19 @@ #include "../ClangTidyModuleRegistry.h" #include "../bugprone/BadSignalToKillThreadCheck.h" #include "../bugprone/CommandProcessorCheck.h" +#include "../bugprone/CopyConstructorMutatesArgumentCheck.h" +#include "../bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h" +#include "../bugprone/ExceptionCopyConstructorThrowsCheck.h" +#include "../bugprone/FloatLoopCounterCheck.h" #include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h" +#include "../bugprone/RandomGeneratorSeedCheck.h"
"../bugprone/RandomGeneratorSeedCheck.h" +#include "../bugprone/RawMemoryCallOnNonTrivialTypeCheck.h" #include "../bugprone/ReservedIdentifierCheck.h" #include "../bugprone/SignalHandlerCheck.h" #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -26,7 +33,7 @@ #include "../concurrency/ThreadCanceltypeAsynchronousCheck.h" #include "../google/UnnamedNamespaceInHeaderCheck.h" #include "../misc/NewDeleteOverloadsCheck.h" -#include "../misc/NonCopyableObjects.h" +#include "../misc/NonCopyableObjectsCheck.h" #include "../misc/StaticAssertCheck.h" #include "../misc/ThrowByValueCatchByReferenceCheck.h" #include "../modernize/AvoidSetjmpLongjmpCheck.h" @@ -34,14 +41,7 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DefaultOperatorNewAlignmentCheck.h" -#include "DontModifyStdNamespaceCheck.h" -#include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" -#include "MutatingCopyCheck.h" -#include "NonTrivialTypesLibcMemoryCallsCheck.h" -#include "ProperlySeededRandomGeneratorCheck.h" -#include "ThrownExceptionTypeCheck.h" namespace { @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck( "cert-dcl54-cpp"); - CheckFactories.registerCheck("cert-dcl58-cpp"); + CheckFactories.registerCheck( + "cert-dcl58-cpp"); CheckFactories.registerCheck( "cert-dcl59-cpp"); // ERR @@ -261,15 +262,17 @@ class CERTModule : public ClangTidyModule { "cert-err52-cpp"); CheckFactories.registerCheck( "cert-err58-cpp"); - CheckFactories.registerCheck("cert-err60-cpp"); + CheckFactories.registerCheck( + "cert-err60-cpp"); CheckFactories.registerCheck( "cert-err61-cpp"); // MEM - CheckFactories.registerCheck( - "cert-mem57-cpp"); + CheckFactories + .registerCheck( + "cert-mem57-cpp"); // MSC CheckFactories.registerCheck("cert-msc50-cpp"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-msc51-cpp"); CheckFactories.registerCheck( "cert-msc54-cpp"); @@ -278,9 +281,10 @@ class CERTModule : public ClangTidyModule { "cert-oop11-cpp"); CheckFactories.registerCheck( "cert-oop54-cpp"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-oop57-cpp"); - CheckFactories.registerCheck("cert-oop58-cpp"); + CheckFactories.registerCheck( + "cert-oop58-cpp"); // C checkers // ARR @@ -308,7 +312,8 @@ class CERTModule : public ClangTidyModule { CheckFactories.registerCheck( "cert-exp42-c"); // FLP - CheckFactories.registerCheck("cert-flp30-c"); + CheckFactories.registerCheck( + "cert-flp30-c"); CheckFactories.registerCheck( "cert-flp37-c"); // FIO @@ -320,7 +325,7 @@ class CERTModule : public ClangTidyModule { CheckFactories.registerCheck( "cert-msc24-c"); CheckFactories.registerCheck("cert-msc30-c"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "cert-msc32-c"); CheckFactories.registerCheck( "cert-msc33-c"); diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index 453d1d30921e9..0ed903c4826a3 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ 
-5,14 +5,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DefaultOperatorNewAlignmentCheck.cpp - DontModifyStdNamespaceCheck.cpp - FloatLoopCounter.cpp LimitedRandomnessCheck.cpp - MutatingCopyCheck.cpp - NonTrivialTypesLibcMemoryCallsCheck.cpp - ProperlySeededRandomGeneratorCheck.cpp - ThrownExceptionTypeCheck.cpp LINK_LIBS clangTidy diff --git a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h index a9d607665adb3..a806cd344d217 100644 --- a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h +++ b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class LimitedRandomnessCheck : public ClangTidyCheck { } // namespace clang::tidy::cert -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITED_RANDOMNESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_LIMITEDRANDOMNESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt index 0abb000991859..4b3330d7e14c1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt @@ -21,7 +21,7 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC OwningMemoryCheck.cpp PreferMemberInitializerCheck.cpp ProBoundsArrayToPointerDecayCheck.cpp - ProBoundsAvoidUncheckedContainerAccess.cpp + ProBoundsAvoidUncheckedContainerAccessCheck.cpp ProBoundsConstantArrayIndexCheck.cpp ProBoundsPointerArithmeticCheck.cpp ProTypeConstCastCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp index 5f4c9b48e346a..66639552276a2 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp @@ -36,7 +36,7 @@ #include "OwningMemoryCheck.h" #include "PreferMemberInitializerCheck.h" #include "ProBoundsArrayToPointerDecayCheck.h" -#include "ProBoundsAvoidUncheckedContainerAccess.h" +#include "ProBoundsAvoidUncheckedContainerAccessCheck.h" #include "ProBoundsConstantArrayIndexCheck.h" #include "ProBoundsPointerArithmeticCheck.h" #include "ProTypeConstCastCheck.h" @@ -108,7 +108,7 @@ class CppCoreGuidelinesModule : public ClangTidyModule { "cppcoreguidelines-prefer-member-initializer"); CheckFactories.registerCheck<ProBoundsArrayToPointerDecayCheck>( "cppcoreguidelines-pro-bounds-array-to-pointer-decay"); - CheckFactories.registerCheck<ProBoundsAvoidUncheckedContainerAccess>( + CheckFactories.registerCheck<ProBoundsAvoidUncheckedContainerAccessCheck>( "cppcoreguidelines-pro-bounds-avoid-unchecked-container-access"); CheckFactories.registerCheck<ProBoundsConstantArrayIndexCheck>( "cppcoreguidelines-pro-bounds-constant-array-index"); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index 2545548df4f45..93b5b96926865 100644 ---
b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -37,7 +37,7 @@ void InitVariablesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } void InitVariablesCheck::registerMatchers(MatchFinder *Finder) { - std::string BadDecl = "badDecl"; + const std::string BadDecl = "badDecl"; Finder->addMatcher( varDecl(unless(hasInitializer(anything())), unless(isInstantiated()), isLocalVarDecl(), unless(isStaticLocal()), isDefinition(), @@ -82,7 +82,7 @@ void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { if (MatchedDecl->getEndLoc().isMacroID()) return; - QualType TypePtr = MatchedDecl->getType(); + const QualType TypePtr = MatchedDecl->getType(); std::optional InitializationString; bool AddMathInclude = false; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h index 780b4b39254a7..dc91854ee4971 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H #include "../ClangTidyCheck.h" @@ -27,4 +27,4 @@ class InterfacesGlobalInitCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACES_GLOBAL_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_INTERFACESGLOBALINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp index 0836a5c386dd8..b301a2bd848ac 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.cpp @@ -47,7 +47,7 @@ class MacroUsageCallbacks : public PPCallbacks { SM.isWrittenInCommandLineFile(MD->getLocation())) return; - StringRef MacroName = MacroNameTok.getIdentifierInfo()->getName(); + const StringRef MacroName = MacroNameTok.getIdentifierInfo()->getName(); if (MacroName == "__GCC_HAVE_DWARF2_CFI_ASM") return; if (!CheckCapsOnly && !RegExp.match(MacroName)) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp index 57d98ee1fd8b4..366bd1296bf9d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp @@ -81,7 +81,7 @@ void MisleadingCaptureDefaultByValueCheck::check( return; if (Lambda->getCaptureDefault() == LCD_ByCopy) { - bool IsThisImplicitlyCaptured = std::any_of( + const bool IsThisImplicitlyCaptured = std::any_of( Lambda->implicit_capture_begin(), Lambda->implicit_capture_end(), [](const LambdaCapture &Capture) { return Capture.capturesThis(); }); auto Diag = diag(Lambda->getCaptureDefaultLoc(), @@ -89,8 +89,8 @@ void MisleadingCaptureDefaultByValueCheck::check( "should not specify a by-value 
capture default") << IsThisImplicitlyCaptured; - std::string ReplacementText = createReplacementText(Lambda); - SourceLocation DefaultCaptureEnd = + const std::string ReplacementText = createReplacementText(Lambda); + const SourceLocation DefaultCaptureEnd = findDefaultCaptureEnd(Lambda, *Result.Context); Diag << FixItHint::CreateReplacement( CharSourceRange::getCharRange(Lambda->getCaptureDefaultLoc(), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 090ab2f0474c4..d1d81d510c8fb 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -26,11 +26,12 @@ AST_MATCHER_P(QualType, possiblyPackExpansionOf, } AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { - ast_matchers::internal::Matcher Inner = possiblyPackExpansionOf( - qualType(rValueReferenceType(), - references(templateTypeParmType( - hasDeclaration(templateTypeParmDecl()))), - unless(references(qualType(isConstQualified()))))); + const ast_matchers::internal::Matcher Inner = + possiblyPackExpansionOf( + qualType(rValueReferenceType(), + references(templateTypeParmType( + hasDeclaration(templateTypeParmDecl()))), + unless(references(qualType(isConstQualified()))))); if (!Inner.matches(Node.getType(), Finder, Builder)) return false; @@ -43,7 +44,7 @@ AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { if (!FuncTemplate) return false; - QualType ParamType = + const QualType ParamType = Node.getType().getNonPackExpansionType()->getPointeeType(); const auto *TemplateType = ParamType->getAsCanonical(); if (!TemplateType) @@ -54,10 +55,10 @@ AST_MATCHER(ParmVarDecl, isTemplateTypeParameter) { } AST_MATCHER_P(NamedDecl, hasSameNameAsBoundNode, std::string, BindingID) { - IdentifierInfo *II = Node.getIdentifier(); + const IdentifierInfo *II = Node.getIdentifier(); if (nullptr == II) return false; - StringRef Name = II->getName(); + const StringRef Name = II->getName(); return Builder->removeBindings( [this, Name](const ast_matchers::internal::BoundNodesMap &Nodes) { diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h index e4dece6a54c90..da35b530f5d3b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H #include "../ClangTidyCheck.h" @@ -56,4 +56,4 @@ class NoMallocCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NO_MALLOC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NOMALLOCCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp index 43df277927d8b..8ecbccda3c5f2 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.cpp @@ -27,7 +27,7 @@ void NoSuspendWithLockCheck::registerMatchers(MatchFinder *Finder) { hasDeclaration(namedDecl(matchers::matchesAnyListedName( utils::options::parseStringList(LockGuards))))); - StatementMatcher Lock = + const StatementMatcher Lock = declStmt(has(varDecl(hasType(LockType)).bind("lock-decl"))) .bind("lock-decl-stmt"); Finder->addMatcher( @@ -55,12 +55,12 @@ void NoSuspendWithLockCheck::check(const MatchFinder::MatchResult &Result) { Options.AddImplicitDtors = true; Options.AddTemporaryDtors = true; - std::unique_ptr<CFG> TheCFG = CFG::buildCFG( + const std::unique_ptr<CFG> TheCFG = CFG::buildCFG( nullptr, const_cast<CompoundStmt *>(Block), &Context, Options); if (!TheCFG) return; - utils::ExprSequence Sequence(TheCFG.get(), Block, &Context); + const utils::ExprSequence Sequence(TheCFG.get(), Block, &Context); const Stmt *LastBlockStmt = Block->body_back(); if (Sequence.inSequence(LockStmt, Suspend) && (Suspend == LastBlockStmt || diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h index 462e9864a3f5d..248b5c2190e03 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H #include "../ClangTidyCheck.h" @@ -61,4 +61,4 @@ class OwningMemoryCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNING_MEMORY_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_OWNINGMEMORYCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 9913671c6f74e..51a1468f49813 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -164,12 +164,12 @@ void PreferMemberInitializerCheck::check( llvm::DenseMap AssignedFields{}; for (const CXXCtorInitializer *Init : Ctor->inits()) - if (FieldDecl *Field = Init->getMember()) + if (const FieldDecl *Field = Init->getMember()) updateAssignmentLevel(Field, Init->getInit(), Ctor, AssignedFields); for (const Stmt *S : Body->body()) { if (S->getBeginLoc().isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName( + const StringRef MacroName = Lexer::getImmediateMacroName( S->getBeginLoc(), *Result.SourceManager, getLangOpts()); if (MacroName.contains_insensitive("assert")) return; @@ -206,7 +206,7 @@ void PreferMemberInitializerCheck::check( bool AddComma = false; bool AddBrace = false; bool InvalidFix = false; - unsigned Index = Field->getFieldIndex(); + const unsigned Index = Field->getFieldIndex(); const CXXCtorInitializer *LastInListInit = nullptr; for (const CXXCtorInitializer *Init : Ctor->inits()) { if (!Init->isWritten() || Init->isInClassMemberInitializer()) @@ -276,7 +276,7 @@ void PreferMemberInitializerCheck::check( << Field; if (InvalidFix)
continue; - StringRef NewInit = Lexer::getSourceText( + const StringRef NewInit = Lexer::getSourceText( Result.SourceManager->getExpansionRange(InitValue->getSourceRange()), *Result.SourceManager, getLangOpts()); if (HasInitAlready) { @@ -288,8 +288,8 @@ void PreferMemberInitializerCheck::check( else Diag << FixItHint::CreateReplacement(ReplaceRange, NewInit); } else { - SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", NewInit, - AddComma ? "), " : ")"}); + const SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", + NewInit, AddComma ? "), " : ")"}); Diag << FixItHint::CreateInsertion(InsertPos, Insertion, FirstToCtorInits); FirstToCtorInits = areDiagsSelfContained(); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp index f3237f4d7dae0..d0f86526d1a29 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.cpp @@ -35,7 +35,7 @@ AST_MATCHER_P(Expr, hasParentIgnoringImpCasts, ast_matchers::internal::Matcher<Expr>, InnerMatcher) { const Expr *E = &Node; do { - DynTypedNodeList Parents = Finder->getASTContext().getParents(*E); + const DynTypedNodeList Parents = Finder->getASTContext().getParents(*E); if (Parents.size() != 1) return false; E = Parents[0].get<Expr>(); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h index cea4bfacd6644..2d4b40b3bfb9e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class ProBoundsArrayToPointerDecayCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_ARRAY_TO_POINTER_DECAY_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSARRAYTOPOINTERDECAYCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp rename to clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp index dd7b2b553b7a1..83803a3e81937 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "ProBoundsAvoidUncheckedContainerAccess.h" +#include "ProBoundsAvoidUncheckedContainerAccessCheck.h"
"ProBoundsAvoidUncheckedContainerAccessCheck.h" #include "../utils/Matchers.h" #include "../utils/OptionsUtils.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -19,8 +19,9 @@ namespace clang::tidy::cppcoreguidelines { static constexpr llvm::StringRef DefaultExclusionStr = "::std::map;::std::unordered_map;::std::flat_map"; -ProBoundsAvoidUncheckedContainerAccess::ProBoundsAvoidUncheckedContainerAccess( - StringRef Name, ClangTidyContext *Context) +ProBoundsAvoidUncheckedContainerAccessCheck:: + ProBoundsAvoidUncheckedContainerAccessCheck(StringRef Name, + ClangTidyContext *Context) : ClangTidyCheck(Name, Context), ExcludedClasses(utils::options::parseStringList( Options.get("ExcludeClasses", DefaultExclusionStr))), @@ -28,7 +29,7 @@ ProBoundsAvoidUncheckedContainerAccess::ProBoundsAvoidUncheckedContainerAccess( FixFunction(Options.get("FixFunction", "gsl::at")), FixFunctionEmptyArgs(Options.get("FixFunctionEmptyArgs", FixFunction)) {} -void ProBoundsAvoidUncheckedContainerAccess::storeOptions( +void ProBoundsAvoidUncheckedContainerAccessCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "ExcludeClasses", utils::options::serializeStringList(ExcludedClasses)); @@ -86,7 +87,7 @@ findAlternativeAt(const CXXMethodDecl *MatchedOperator) { return nullptr; } -void ProBoundsAvoidUncheckedContainerAccess::registerMatchers( +void ProBoundsAvoidUncheckedContainerAccessCheck::registerMatchers( MatchFinder *Finder) { Finder->addMatcher( mapAnyOf(cxxOperatorCallExpr, cxxMemberCallExpr) @@ -100,7 +101,7 @@ void ProBoundsAvoidUncheckedContainerAccess::registerMatchers( this); } -void ProBoundsAvoidUncheckedContainerAccess::check( +void ProBoundsAvoidUncheckedContainerAccessCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedExpr = Result.Nodes.getNodeAs("caller"); @@ -251,7 +252,7 @@ void ProBoundsAvoidUncheckedContainerAccess::check( } // namespace clang::tidy::cppcoreguidelines namespace clang::tidy { -using P = cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccess; +using P = cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccessCheck; llvm::ArrayRef> OptionEnumMapping::getEnumMapping() { diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h similarity index 73% rename from clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h rename to clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h index 0755da7ce4409..85b5a93940562 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H #include "../ClangTidyCheck.h" @@ -20,10 +20,10 @@ namespace clang::tidy::cppcoreguidelines { /// https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#slcon3-avoid-bounds-errors /// For the 
user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.html -class ProBoundsAvoidUncheckedContainerAccess : public ClangTidyCheck { +class ProBoundsAvoidUncheckedContainerAccessCheck : public ClangTidyCheck { public: - ProBoundsAvoidUncheckedContainerAccess(StringRef Name, - ClangTidyContext *Context); + ProBoundsAvoidUncheckedContainerAccessCheck(StringRef Name, + ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; } @@ -46,11 +46,11 @@ class ProBoundsAvoidUncheckedContainerAccess : public ClangTidyCheck { namespace clang::tidy { template <> struct OptionEnumMapping< - cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccess::FixModes> { + cppcoreguidelines::ProBoundsAvoidUncheckedContainerAccessCheck::FixModes> { static ArrayRef> getEnumMapping(); }; } // namespace clang::tidy -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_AVOID_UNCHECKED_CONTAINER_ACCESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSAVOIDUNCHECKEDCONTAINERACCESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp index 634ec186616d5..82fc9f253ac1c 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp @@ -78,7 +78,7 @@ void ProBoundsConstantArrayIndexCheck::check( else BaseRange = cast<CXXOperatorCallExpr>(Matched)->getArg(0)->getSourceRange(); - SourceRange IndexRange = IndexExpr->getSourceRange(); + const SourceRange IndexRange = IndexExpr->getSourceRange(); auto Diag = diag(Matched->getExprLoc(), "do not use array subscript when the index is " @@ -115,7 +115,7 @@ void ProBoundsConstantArrayIndexCheck::check( const auto &SizeArg = TemplateArgs[1]; if (SizeArg.getKind() != TemplateArgument::Integral) return; - llvm::APInt ArraySize = SizeArg.getAsIntegral(); + const llvm::APInt ArraySize = SizeArg.getAsIntegral(); // Get uint64_t values, because different bitwidths would lead to an assertion // in APInt::uge.
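For orientation, a minimal sketch (editor's addition; function names are hypothetical) of the two cases the constant-array-index logic above distinguishes:

#include <array>
int readAt(const std::array<int, 4> &A, int I) {
  return A[I];  // diagnosed: the subscript is not a compile-time constant
}
int readSecond(const std::array<int, 4> &A) {
  return A[2];  // accepted: constant index, compared against the size via APInt
}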
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index 73f185529e1eb..7c8fec0d60c59 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -37,4 +37,4 @@ class ProBoundsConstantArrayIndexCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_CONSTANT_ARRAY_INDEX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSCONSTANTARRAYINDEXCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h index 45b798527ed4e..4f6b17f15c9fd 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class ProBoundsPointerArithmeticCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_BOUNDS_POINTER_ARITHMETIC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROBOUNDSPOINTERARITHMETICCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h index 0b8cfc830854f..a0a368cbc6a1f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class ProTypeConstCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CONST_CAST_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECONSTCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp index b9867c2393f0b..fcd9c6d37d99f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.cpp @@ -45,7 +45,7 @@ void ProTypeCstyleCastCheck::check(const MatchFinder::MatchResult &Result) { return; } - QualType SourceType = MatchedCast->getSubExpr()->getType(); + const QualType SourceType = MatchedCast->getSubExpr()->getType(); if (MatchedCast->getCastKind() == CK_BaseToDerived) { const auto *SourceDecl = SourceType->getPointeeCXXRecordDecl(); @@ -58,7 +58,7 @@ void ProTypeCstyleCastCheck::check(const MatchFinder::MatchResult &Result) { // Leave type spelling exactly as it was (unlike // getTypeAsWritten().getAsString() which would spell enum types 'enum // X'). - StringRef DestTypeString = Lexer::getSourceText( + const StringRef DestTypeString = Lexer::getSourceText( CharSourceRange::getTokenRange( MatchedCast->getLParenLoc().getLocWithOffset(1), MatchedCast->getRParenLoc().getLocWithOffset(-1)), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h index f8e1d5a893da0..5fd0208ea9918 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ProTypeCstyleCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_CSTYLE_CAST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPECSTYLECASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 1ac9b8bbdfedb..111c62153fe79 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -173,7 +173,7 @@ struct InitializerInsertion { assert(!Initializers.empty() && "No initializers to insert"); std::string Code; llvm::raw_string_ostream Stream(Code); - std::string Joined = + const std::string Joined = llvm::join(Initializers.begin(), Initializers.end(), "(), "); switch (Placement) { case InitializerPlacement::New: @@ -434,7 +434,7 @@ static llvm::StringLiteral getInitializer(QualType QT, bool UseAssignment) { void ProTypeMemberInitCheck::checkMissingMemberInitializer( ASTContext &Context, const CXXRecordDecl &ClassDecl, const CXXConstructorDecl *Ctor) { - bool IsUnion = ClassDecl.isUnion(); + const bool IsUnion = ClassDecl.isUnion(); if (IsUnion && ClassDecl.hasInClassInitializer()) return; @@ -583,7 +583,7 @@ void ProTypeMemberInitCheck::checkMissingBaseClassInitializer( void 
ProTypeMemberInitCheck::checkUninitializedTrivialType( const ASTContext &Context, const VarDecl *Var) { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Var->getBeginLoc(), "uninitialized record type: %0") << Var; Diag << FixItHint::CreateInsertion( diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index 8beaab394f04a..89d3074fb0a97 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseSet.h" @@ -79,4 +79,4 @@ class ProTypeMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h index 4948d0ac2d785..566944dfda60f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class ProTypeReinterpretCastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_REINTERPRETCAST_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEREINTERPRETCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp index c200a79cb8c49..e7b92fcf801c1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.cpp @@ -33,7 +33,7 @@ void ProTypeStaticCastDowncastCheck::check( const MatchFinder::MatchResult &Result) { const auto *MatchedCast = Result.Nodes.getNodeAs("cast"); - QualType SourceType = MatchedCast->getSubExpr()->getType(); + const QualType SourceType = MatchedCast->getSubExpr()->getType(); const auto *SourceDecl = SourceType->getPointeeCXXRecordDecl(); if (!SourceDecl) // The cast is from object to reference SourceDecl = SourceType->getAsCXXRecordDecl(); diff --git 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h index 3d01fb9e52809..02d54a5e25c21 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class ProTypeStaticCastDowncastCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_STATIC_CAST_DOWNCAST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPESTATICCASTDOWNCASTCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h index fe82ce9630589..41154e8eedce9 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ProTypeUnionAccessCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_UNION_ACCESS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEUNIONACCESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp index 431b2a76feeea..c223ed1565d90 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.cpp @@ -65,9 +65,9 @@ static constexpr StringRef VaArgWarningMessage = namespace { AST_MATCHER(QualType, isVAList) { - ASTContext &Context = Finder->getASTContext(); - QualType Desugar = Node.getDesugaredType(Context); - QualType NodeTy = Node.getUnqualifiedType(); + const ASTContext &Context = Finder->getASTContext(); + const QualType Desugar = Node.getDesugaredType(Context); + const QualType NodeTy = Node.getUnqualifiedType(); auto CheckVaList = [](QualType NodeTy, QualType Expected, const ASTContext &Context) { @@ -88,7 +88,8 @@ AST_MATCHER(QualType, isVAList) { // type. Some targets implements va_list as 'char *' or 'void *'. // In these cases we need to remove all typedefs one by one to check this. 
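// Editor's note (illustrative, not part of the patch): on such targets the
// typedef chain can look like
//   typedef char *va_list;        // the target's builtin definition
//   typedef va_list MyVaList;     // a user-level alias
// so the code below peels one typedef layer at a time and compares each
// desugared level against the builtin va_list type.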
using BuiltinVaListKind = TargetInfo::BuiltinVaListKind; - BuiltinVaListKind VaListKind = Context.getTargetInfo().getBuiltinVaListKind(); + const BuiltinVaListKind VaListKind = + Context.getTargetInfo().getBuiltinVaListKind(); if (VaListKind == BuiltinVaListKind::CharPtrBuiltinVaList || VaListKind == BuiltinVaListKind::VoidPtrBuiltinVaList) { if (CheckVaList(NodeTy, Context.getBuiltinVaListType(), Context)) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h index b28d3657703ba..5be6163f3b456 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class ProTypeVarargCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_VARARG_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PROTYPEVARARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp index c40ac7ab5102b..28bfe57622398 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.cpp @@ -39,7 +39,7 @@ AST_MATCHER_P2(Stmt, argumentOf, bool, AllowPartialMove, StatementMatcher, void RvalueReferenceParamNotMovedCheck::registerMatchers(MatchFinder *Finder) { auto ToParam = hasAnyParameter(parmVarDecl(equalsBoundNode("param"))); - StatementMatcher MoveCallMatcher = + const StatementMatcher MoveCallMatcher = callExpr( argumentCountIs(1), anyOf(callee(functionDecl(hasName(MoveFunction))), diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h index 61990e6b493db..520a763f5abf5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class SlicingCheck : public ClangTidyCheck { } // namespace clang::tidy::cppcoreguidelines -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICING_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SLICINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp index b38a0c66eb582..77a7b2b25aecb 100644 --- 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp @@ -109,7 +109,7 @@ join(ArrayRef SMFS, llvm::raw_string_ostream Stream(Buffer); Stream << toString(SMFS[0]); - size_t LastIndex = SMFS.size() - 1; + const size_t LastIndex = SMFS.size() - 1; for (size_t I = 1; I < LastIndex; ++I) { Stream << ", " << toString(SMFS[I]); } @@ -146,7 +146,7 @@ void SpecialMemberFunctionsCheck::check( StoreMember({DestructorType, Dtor->isDeleted()}); } - std::initializer_list> + const std::initializer_list> Matchers = {{"copy-ctor", SpecialMemberFunctionKind::CopyConstructor}, {"copy-assign", SpecialMemberFunctionKind::CopyAssignment}, {"move-ctor", SpecialMemberFunctionKind::MoveConstructor}, @@ -202,7 +202,7 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( MissingMembers.push_back(Kind2); }; - bool RequireThree = + const bool RequireThree = HasMember(SpecialMemberFunctionKind::NonDefaultDestructor) || (!AllowSoleDefaultDtor && (HasMember(SpecialMemberFunctionKind::Destructor) || @@ -212,10 +212,11 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( HasMember(SpecialMemberFunctionKind::MoveConstructor) || HasMember(SpecialMemberFunctionKind::MoveAssignment); - bool RequireFive = (!AllowMissingMoveFunctions && RequireThree && - getLangOpts().CPlusPlus11) || - HasMember(SpecialMemberFunctionKind::MoveConstructor) || - HasMember(SpecialMemberFunctionKind::MoveAssignment); + const bool RequireFive = + (!AllowMissingMoveFunctions && RequireThree && + getLangOpts().CPlusPlus11) || + HasMember(SpecialMemberFunctionKind::MoveConstructor) || + HasMember(SpecialMemberFunctionKind::MoveAssignment); if (RequireThree) { if (!HasMember(SpecialMemberFunctionKind::Destructor) && diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index 507aaa1cb9d79..6d76e07078f3b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H #include "../ClangTidyCheck.h" @@ -93,11 +93,11 @@ struct DenseMapInfo< "TOMBSTONE"}; } - static unsigned getHashValue(ClassDefId Val) { + static unsigned getHashValue(const ClassDefId &Val) { assert(Val != getEmptyKey() && "Cannot hash the empty key!"); assert(Val != getTombstoneKey() && "Cannot hash the tombstone key!"); - std::hash SecondHash; + const std::hash SecondHash; return Val.first.getHashValue() + SecondHash(Val.second); } @@ -112,4 +112,4 @@ struct DenseMapInfo< } // namespace llvm -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIAL_MEMBER_FUNCTIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_SPECIALMEMBERFUNCTIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp index 770088991419b..89f1bf1ddfd10 100644 --- 
a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp @@ -56,10 +56,10 @@ getVirtualKeywordRange(const CXXDestructorDecl &Destructor, if (Destructor.getLocation().isMacroID()) return std::nullopt; - SourceLocation VirtualBeginLoc = Destructor.getBeginLoc(); - SourceLocation VirtualBeginSpellingLoc = + const SourceLocation VirtualBeginLoc = Destructor.getBeginLoc(); + const SourceLocation VirtualBeginSpellingLoc = SM.getSpellingLoc(Destructor.getBeginLoc()); - SourceLocation VirtualEndLoc = VirtualBeginSpellingLoc.getLocWithOffset( + const SourceLocation VirtualEndLoc = VirtualBeginSpellingLoc.getLocWithOffset( Lexer::MeasureTokenLength(VirtualBeginSpellingLoc, SM, LangOpts)); /// Range ends with \c StartOfNextToken so that any whitespace after \c @@ -68,7 +68,7 @@ getVirtualKeywordRange(const CXXDestructorDecl &Destructor, Lexer::findNextToken(VirtualEndLoc, SM, LangOpts); if (!NextToken) return std::nullopt; - SourceLocation StartOfNextToken = NextToken->getLocation(); + const SourceLocation StartOfNextToken = NextToken->getLocation(); return CharSourceRange::getCharRange(VirtualBeginLoc, StartOfNextToken); } @@ -79,7 +79,7 @@ getPublicASDecl(const CXXRecordDecl &StructOrClass) { AS{StructOrClass.decls_begin()}, ASEnd{StructOrClass.decls_end()}; AS != ASEnd; ++AS) { - AccessSpecDecl *ASDecl = *AS; + const AccessSpecDecl *ASDecl = *AS; if (ASDecl->getAccess() == AccessSpecifier::AS_public) return ASDecl; } @@ -125,7 +125,7 @@ static std::string getSourceText(const CXXDestructorDecl &Destructor) { static std::string eraseKeyword(std::string &DestructorString, const std::string &Keyword) { - size_t KeywordIndex = DestructorString.find(Keyword); + const size_t KeywordIndex = DestructorString.find(Keyword); if (KeywordIndex != std::string::npos) DestructorString.erase(KeywordIndex, Keyword.length()); return DestructorString; diff --git a/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp b/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp index f83c138fbfaf5..315ce5840e5d4 100644 --- a/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp +++ b/clang-tools-extra/clang-tidy/custom/QueryCheck.cpp @@ -33,7 +33,7 @@ parseQuery(const ClangTidyOptions::CustomCheckValue &V, clang::query::QuerySession QS({}); llvm::StringRef QueryStringRef{V.Query}; while (!QueryStringRef.empty()) { - query::QueryRef Q = query::QueryParser::parse(QueryStringRef, QS); + const query::QueryRef Q = query::QueryParser::parse(QueryStringRef, QS); switch (Q->Kind) { case query::QK_Match: { const auto &MatchQuery = llvm::cast(*Q); @@ -126,11 +126,11 @@ void QueryCheck::registerMatchers(MatchFinder *Finder) { void QueryCheck::check(const MatchFinder::MatchResult &Result) { auto Emit = [this](const DiagMaps &DiagMaps, const std::string &BindName, const DynTypedNode &Node, DiagnosticIDs::Level Level) { - DiagMaps::const_iterator DiagMapIt = DiagMaps.find(Level); + const DiagMaps::const_iterator DiagMapIt = DiagMaps.find(Level); if (DiagMapIt == DiagMaps.end()) return; const BindNameMapToDiagMessage &BindNameMap = DiagMapIt->second; - BindNameMapToDiagMessage::const_iterator BindNameMapIt = + const BindNameMapToDiagMessage::const_iterator BindNameMapIt = BindNameMap.find(BindName); if (BindNameMapIt == BindNameMap.end()) return; diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h index 51bb15325c955..ee08b76b740cd 100644 --- 
a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class DefaultArgumentsCallsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_CALLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSCALLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp index d80511eb626f5..b22aff1f4381a 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.cpp @@ -24,7 +24,7 @@ void DefaultArgumentsDeclarationsCheck::check( if (!D) return; - SourceRange DefaultArgRange = D->getDefaultArgRange(); + const SourceRange DefaultArgRange = D->getDefaultArgRange(); if (DefaultArgRange.getEnd() != D->getEndLoc()) return; @@ -35,10 +35,10 @@ void DefaultArgumentsDeclarationsCheck::check( return; } - SourceLocation StartLocation = + const SourceLocation StartLocation = D->getName().empty() ? D->getBeginLoc() : D->getLocation(); - SourceRange RemovalRange( + const SourceRange RemovalRange( Lexer::getLocForEndOfToken(StartLocation, 0, *Result.SourceManager, Result.Context->getLangOpts()), DefaultArgRange.getEnd()); diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h index 1b0e3dd0a16f5..aa991f8a6adf2 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class DefaultArgumentsDeclarationsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULT_ARGUMENTS_DECLARATIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_DEFAULTARGUMENTSDECLARATIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp index 80de0282ee595..652dec9bcc2a9 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.cpp @@ -28,7 +28,7 @@ AST_MATCHER(CXXRecordDecl, hasBases) { void MultipleInheritanceCheck::addNodeToInterfaceMap(const CXXRecordDecl *Node, bool IsInterface) { assert(Node->getIdentifier()); - StringRef Name = 
Node->getIdentifier()->getName(); + const StringRef Name = Node->getIdentifier()->getName(); InterfaceMap.insert(std::make_pair(Name, IsInterface)); } @@ -38,7 +38,7 @@ void MultipleInheritanceCheck::addNodeToInterfaceMap(const CXXRecordDecl *Node, bool MultipleInheritanceCheck::getInterfaceStatus(const CXXRecordDecl *Node, bool &IsInterface) const { assert(Node->getIdentifier()); - StringRef Name = Node->getIdentifier()->getName(); + const StringRef Name = Node->getIdentifier()->getName(); auto Pair = InterfaceMap.find(Name); if (Pair == InterfaceMap.end()) return false; @@ -81,7 +81,7 @@ bool MultipleInheritanceCheck::isInterface(const CXXRecordDecl *Node) { } } - bool CurrentClassIsInterface = isCurrentClassInterface(Node); + const bool CurrentClassIsInterface = isCurrentClassInterface(Node); addNodeToInterfaceMap(Node, CurrentClassIsInterface); return CurrentClassIsInterface; } diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h index 66be18267ab8a..2e268432c17cf 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H #include "../ClangTidyCheck.h" @@ -43,4 +43,4 @@ class MultipleInheritanceCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLE_INHERITANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_MULTIPLEINHERITANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp index e202c288d6986..4a498fbc339f8 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.cpp @@ -34,7 +34,7 @@ void OverloadedOperatorCheck::check(const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); assert(D && "No FunctionDecl captured!"); - SourceLocation Loc = D->getBeginLoc(); + const SourceLocation Loc = D->getBeginLoc(); if (Loc.isValid()) diag(Loc, "overloading %0 is disallowed") << D; } diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h index d91ecf8e468d2..4945ad213037f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class OverloadedOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADED_OPERATOR_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_OVERLOADEDOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h index 42d643e62f28b..d2403b04a2066 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class StaticallyConstructedObjectsCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLY_CONSTRUCTED_OBJECTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_STATICALLYCONSTRUCTEDOBJECTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp b/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp index 7b910b1021979..3acd5fb555532 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/fuchsia/TemporaryObjectsCheck.cpp @@ -20,7 +20,7 @@ namespace clang::tidy::fuchsia { namespace { AST_MATCHER_P(CXXRecordDecl, matchesAnyName, ArrayRef, Names) { - std::string QualifiedName = Node.getQualifiedNameAsString(); + const std::string QualifiedName = Node.getQualifiedNameAsString(); return llvm::is_contained(Names, QualifiedName); } diff --git a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h index c644e875b3a38..ba1dbeb27ca5f 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class TrailingReturnCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILING_RETURN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_TRAILINGRETURNCHECK_H diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h index 45c6019f3abe4..e940602e144d1 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H #include "../ClangTidyCheck.h" @@ 
-30,4 +30,4 @@ class VirtualInheritanceCheck : public ClangTidyCheck { } // namespace clang::tidy::fuchsia -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUAL_INHERITANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_FUCHSIA_VIRTUALINHERITANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp index 174ecb0ed7b77..47e859d21e451 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp @@ -140,7 +140,7 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { CharSourceRange ReplaceRange = getReplaceRange(CastExpr); - bool FnToFnCast = + const bool FnToFnCast = IsFunction(SourceTypeAsWritten) && IsFunction(DestTypeAsWritten); const bool ConstructorCast = !CastExpr->getTypeAsWritten().hasQualifiers() && @@ -239,8 +239,8 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { return; } if (DestType->isReferenceType()) { - QualType Dest = DestType.getNonReferenceType(); - QualType Source = SourceType.getNonReferenceType(); + const QualType Dest = DestType.getNonReferenceType(); + const QualType Source = SourceType.getNonReferenceType(); if (Source == Dest.withConst() || SourceType.getNonReferenceType() == DestType.getNonReferenceType()) { ReplaceWithNamedCast("const_cast"); @@ -269,6 +269,12 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { return; } break; + case CK_BaseToDerived: + if (!needsConstCast(SourceType, DestType)) { + ReplaceWithNamedCast("static_cast"); + return; + } + break; default: break; } diff --git a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp index daf49481bf3b0..5221e4ba5d821 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.cpp @@ -22,11 +22,11 @@ using namespace clang::ast_matchers; namespace clang::tidy::google::objc { static bool isMessageExpressionInsideMacro(const ObjCMessageExpr *Expr) { - SourceLocation ReceiverLocation = Expr->getReceiverRange().getBegin(); + const SourceLocation ReceiverLocation = Expr->getReceiverRange().getBegin(); if (ReceiverLocation.isMacroID()) return true; - SourceLocation SelectorLocation = Expr->getSelectorStartLoc(); + const SourceLocation SelectorLocation = Expr->getSelectorStartLoc(); if (SelectorLocation.isMacroID()) return true; @@ -58,7 +58,7 @@ static bool isInitMethodAvailable(const ObjCInterfaceDecl *ClassDecl) { static StringRef getReceiverString(SourceRange ReceiverRange, const SourceManager &SM, const LangOptions &LangOpts) { - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(ReceiverRange), SM, LangOpts); return Lexer::getSourceText(CharRange, SM, LangOpts); } @@ -77,13 +77,13 @@ static FixItHint getCallFixItHint(const ObjCMessageExpr *Expr, if (FoundClassFactory != ClassToFactoryMethodMap.end()) { StringRef ClassName = FoundClassFactory->first; StringRef FactorySelector = FoundClassFactory->second; - std::string NewCall = + const std::string NewCall = std::string(llvm::formatv("[{0} {1}]", ClassName, FactorySelector)); return FixItHint::CreateReplacement(Expr->getSourceRange(), NewCall); } if (isInitMethodAvailable(Expr->getReceiverInterface())) { - std::string NewCall = + const std::string NewCall = 
std::string(llvm::formatv("[[{0} alloc] init]", Receiver)); return FixItHint::CreateReplacement(Expr->getSourceRange(), NewCall); } diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp index 73476571c252f..6b96f71f8e7e9 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.cpp @@ -40,7 +40,7 @@ void AvoidThrowingObjCExceptionCheck::check( // If the match location was in a macro, check if the macro was in a system // header. if (SourceLoc.isMacroID()) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; auto MacroLoc = SM.getImmediateMacroCallerLoc(SourceLoc); // Matches in system header macros should be ignored. diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h index 26a0465bc197f..417bb8ffad184 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class AvoidThrowingObjCExceptionCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_AVOID_THROWING_EXCEPTION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_AVOIDTHROWINGOBJCEXCEPTIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp index b335463bc78bd..b156d7552419f 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.cpp @@ -39,10 +39,11 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { void MacroExpands(const Token &MacroNameToken, const MacroDefinition &MacroDefinition, SourceRange Range, const MacroArgs *Args) override { - IdentifierInfo *NameIdentifierInfo = MacroNameToken.getIdentifierInfo(); + const IdentifierInfo *NameIdentifierInfo = + MacroNameToken.getIdentifierInfo(); if (!NameIdentifierInfo) return; - StringRef MacroName = NameIdentifierInfo->getName(); + const StringRef MacroName = NameIdentifierInfo->getName(); if (!isGoogletestTestMacro(MacroName) || !Args || Args->getNumMacroArguments() < 2) return; @@ -50,7 +51,7 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { const Token *TestNameToken = Args->getUnexpArgument(1); if (!TestSuiteNameToken || !TestNameToken) return; - std::string TestSuiteNameMaybeDisabled = + const std::string TestSuiteNameMaybeDisabled = PP->getSpelling(*TestSuiteNameToken); StringRef TestSuiteName = TestSuiteNameMaybeDisabled; TestSuiteName.consume_front(KDisabledTestPrefix); @@ -60,7 +61,7 @@ class AvoidUnderscoreInGoogletestNameCallback : public PPCallbacks { "Googletest FAQ") << TestSuiteName; - std::string 
TestNameMaybeDisabled = PP->getSpelling(*TestNameToken); + const std::string TestNameMaybeDisabled = PP->getSpelling(*TestNameToken); StringRef TestName = TestNameMaybeDisabled; TestName.consume_front(KDisabledTestPrefix); if (TestName.contains('_')) diff --git a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h index 1d1e4e31f0c6c..0f397b46122d9 100644 --- a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h +++ b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class DefaultArgumentsCheck : public ClangTidyCheck { } // namespace clang::tidy::google -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULT_ARGUMENTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_DEFAULTARGUMENTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp index 6d5182d1e9787..ac604b7b9f1b4 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp @@ -39,8 +39,8 @@ static SourceRange findToken(const SourceManager &Sources, bool (*Pred)(const Token &)) { if (StartLoc.isMacroID() || EndLoc.isMacroID()) return {}; - FileID File = Sources.getFileID(Sources.getSpellingLoc(StartLoc)); - StringRef Buf = Sources.getBufferData(File); + const FileID File = Sources.getFileID(Sources.getSpellingLoc(StartLoc)); + const StringRef Buf = Sources.getBufferData(File); const char *StartChar = Sources.getCharacterData(StartLoc); Lexer Lex(StartLoc, LangOpts, StartChar, StartChar, Buf.end()); Lex.SetCommentRetentionState(true); @@ -88,7 +88,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs("conversion")) { if (Conversion->isOutOfLine()) return; - SourceLocation Loc = Conversion->getLocation(); + const SourceLocation Loc = Conversion->getLocation(); // Ignore all macros until we learn to ignore specific ones (e.g. used in // gmock to define matchers). 
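This macro early return is easiest to see on a macro-generated constructor. The input below is purely illustrative (gmock's real matcher macros are more involved): the constructor's location is a macro ID, so the check bails out rather than suggest adding 'explicit'.

#define DECLARE_WRAPPER(Name) \
  struct Name {               \
    Name(int Value);          \
  };

DECLARE_WRAPPER(FakeMatcher) // skipped: Loc.isMacroID() is true here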
if (Loc.isMacroID()) @@ -105,7 +105,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { const ExplicitSpecifier ExplicitSpec = Ctor->getExplicitSpecifier(); - bool TakesInitializerList = isStdInitializerList( + const bool TakesInitializerList = isStdInitializerList( Ctor->getParamDecl(0)->getType().getNonReferenceType()); if (ExplicitSpec.isExplicit() && (Ctor->isCopyOrMoveConstructor() || TakesInitializerList)) { @@ -113,7 +113,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { return Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() == "explicit"; }; - SourceRange ExplicitTokenRange = + const SourceRange ExplicitTokenRange = findToken(*Result.SourceManager, getLangOpts(), Ctor->getOuterLocStart(), Ctor->getEndLoc(), IsKwExplicit); StringRef ConstructorDescription; @@ -149,7 +149,7 @@ void ExplicitConstructorCheck::check(const MatchFinder::MatchResult &Result) { const bool SingleArgument = Ctor->getNumParams() == 1 && !Ctor->getParamDecl(0)->isParameterPack(); - SourceLocation Loc = Ctor->getLocation(); + const SourceLocation Loc = Ctor->getLocation(); auto Diag = diag(Loc, ExplicitExpr ? WithExpressionWarningMessage : NoExpressionWarningMessage) diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp index 3d75f4dd25bd1..2b9183cd7b8a1 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.cpp @@ -14,9 +14,7 @@ using namespace clang::ast_matchers; namespace clang::tidy::google::objc { -namespace { - -std::string validFunctionNameRegex(bool RequirePrefix) { +static std::string validFunctionNameRegex(bool RequirePrefix) { // Allow the following name patterns for all functions: // • ABFoo (prefix + UpperCamelCase) // • ABURL (prefix + capitalized acronym/initialism) @@ -35,7 +33,7 @@ std::string validFunctionNameRegex(bool RequirePrefix) { // If a prefix is required, the regex checks for a capital letter followed by // another capital letter or number that is part of the prefix and another // capital letter or number that begins the name following the prefix. - std::string FunctionNameMatcher = + const std::string FunctionNameMatcher = std::string(RequirePrefix ? "[A-Z][A-Z0-9]+" : "") + "[A-Z][a-zA-Z0-9]*"; return std::string("::(") + FunctionNameMatcher + ")$"; } @@ -43,20 +41,20 @@ std::string validFunctionNameRegex(bool RequirePrefix) { /// For now we will only fix functions of static storage class with names like /// 'functionName' or 'function_name' and convert them to 'FunctionName'. For /// other cases the user must determine an appropriate name on their own. -FixItHint generateFixItHint(const FunctionDecl *Decl) { +static FixItHint generateFixItHint(const FunctionDecl *Decl) { // A fixit can be generated for functions of static storage class but // otherwise the check cannot determine the appropriate function name prefix // to use. if (Decl->getStorageClass() != SC_Static) return {}; - StringRef Name = Decl->getName(); + const StringRef Name = Decl->getName(); std::string NewName = Decl->getName().str(); size_t Index = 0; bool AtWordBoundary = true; while (Index < NewName.size()) { - char Ch = NewName[Index]; + const char Ch = NewName[Index]; if (isalnum(Ch)) { // Capitalize the first letter after every word boundary. 
if (AtWordBoundary) { @@ -82,8 +80,6 @@ FixItHint generateFixItHint(const FunctionDecl *Decl) { return {}; } -} // namespace - void FunctionNamingCheck::registerMatchers(MatchFinder *Finder) { // Enforce Objective-C function naming conventions on all functions except: // • Functions defined in system headers. @@ -105,7 +101,7 @@ void FunctionNamingCheck::registerMatchers(MatchFinder *Finder) { void FunctionNamingCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("function"); - bool IsGlobal = MatchedDecl->getStorageClass() != SC_Static; + const bool IsGlobal = MatchedDecl->getStorageClass() != SC_Static; diag(MatchedDecl->getLocation(), "%select{static function|function in global namespace}1 named %0 must " "%select{be in|have an appropriate prefix followed by}1 Pascal case as " diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h index e4efadfd217a6..6acc184f9f050 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/StringRef.h" @@ -36,4 +36,4 @@ class FunctionNamingCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_FUNCTION_NAMING_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_FUNCTIONNAMINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp index a4c76be92192e..7470b1eb206bb 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp @@ -30,7 +30,7 @@ static FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { return {}; } - char FC = Decl->getName()[0]; + const char FC = Decl->getName()[0]; if (!llvm::isAlpha(FC) || Decl->getName().size() == 1) { // No fix available if first character is not alphabetical character, or it // is a single-character variable, since it is difficult to determine the @@ -38,7 +38,7 @@ static FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { // their own. 
return {}; } - char SC = Decl->getName()[1]; + const char SC = Decl->getName()[1]; if ((FC == 'k' || FC == 'g') && !llvm::isAlpha(SC)) { // No fix available if the prefix is correct but the second character is // not alphabetical, since it is difficult to determine the proper fix in diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h index 9b55855b1fc86..e0693d4bb38d4 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -32,4 +32,4 @@ class GlobalVariableDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::google::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_OBJC_GLOBAL_VARIABLE_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GOOGLE_GLOBALVARIABLEDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp index 52777fa5c4fd6..52bcf1b1719a4 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -103,7 +103,7 @@ void IntegerTypesCheck::registerMatchers(MatchFinder *Finder) { void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { auto TL = *Result.Nodes.getNodeAs("tl"); - SourceLocation Loc = TL.getBeginLoc(); + const SourceLocation Loc = TL.getBeginLoc(); // Look through qualification. if (auto QualLoc = TL.getAs()) @@ -113,7 +113,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { if (!BuiltinLoc) return; - Token Tok = getTokenAtLoc(Loc, Result, *IdentTable); + const Token Tok = getTokenAtLoc(Loc, Result, *IdentTable); // Ensure the location actually points to one of the builting integral type // names we're interested in. Otherwise, we might be getting this match from // implicit code (e.g. an implicit assignment operator of a class containing @@ -164,7 +164,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) { !isAsciiIdentifierContinue(Data[Port.size()])) return; - std::string Replacement = + const std::string Replacement = ((IsSigned ? 
SignedTypePrefix : UnsignedTypePrefix) + Twine(Width) + TypeSuffix) .str(); diff --git a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp index 8554870287c81..7331b3644b2b7 100644 --- a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.cpp @@ -20,7 +20,7 @@ class TodoCommentCheck::TodoCommentHandler : public CommentHandler { TodoMatch("^// *TODO *(\\(.*\\))?:?( )?(.*)$") {} bool HandleComment(Preprocessor &PP, SourceRange Range) override { - StringRef Text = + const StringRef Text = Lexer::getSourceText(CharSourceRange::getCharRange(Range), PP.getSourceManager(), PP.getLangOpts()); @@ -28,13 +28,14 @@ class TodoCommentCheck::TodoCommentHandler : public CommentHandler { if (!TodoMatch.match(Text, &Matches)) return false; - StringRef Username = Matches[1]; - StringRef Comment = Matches[3]; + const StringRef Username = Matches[1]; + const StringRef Comment = Matches[3]; if (!Username.empty()) return false; - std::string NewText = ("// TODO(" + Twine(User) + "): " + Comment).str(); + const std::string NewText = + ("// TODO(" + Twine(User) + "): " + Comment).str(); Check.diag(Range.getBegin(), "missing username/bug in TODO") << FixItHint::CreateReplacement(CharSourceRange::getCharRange(Range), diff --git a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp index 3066dd0ff4595..054bdc8d1230e 100644 --- a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.cpp @@ -28,7 +28,7 @@ void UnnamedNamespaceInHeaderCheck::registerMatchers( void UnnamedNamespaceInHeaderCheck::check( const MatchFinder::MatchResult &Result) { const auto *N = Result.Nodes.getNodeAs("anonymousNamespace"); - SourceLocation Loc = N->getBeginLoc(); + const SourceLocation Loc = N->getBeginLoc(); if (!Loc.isValid()) return; diff --git a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp index 9da1915affd91..87fd0468ba9c6 100644 --- a/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UpgradeGoogletestCaseCheck.cpp @@ -64,7 +64,7 @@ class UpgradeGoogletestCasePPCallback : public PPCallbacks { // We check if the newly defined macro is one of the target replacements. // This ensures that the check creates warnings only if it is including a // recent enough version of Google Test. 
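The end state this machinery drives, shown on a hypothetical typed-test file; as the comment above notes, the warning only fires once gtest-typed-test.h already provides the new macro.

// Before the fix-it (deprecated spelling, diagnosed):
//   TYPED_TEST_CASE(MyFixture, MyTypes);
// After the rename:
//   TYPED_TEST_SUITE(MyFixture, MyTypes);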
- llvm::StringRef FileName = PP->getSourceManager().getFilename( + const llvm::StringRef FileName = PP->getSourceManager().getFilename( MD->getMacroInfo()->getDefinitionLoc()); ReplacementFound = FileName.ends_with("gtest/gtest-typed-test.h") && PP->getSpelling(MacroNameTok) == "TYPED_TEST_SUITE"; @@ -94,18 +94,18 @@ class UpgradeGoogletestCasePPCallback : public PPCallbacks { if (!ReplacementFound) return; - std::string Name = PP->getSpelling(MacroNameTok); + const std::string Name = PP->getSpelling(MacroNameTok); std::optional Replacement = getNewMacroName(Name); if (!Replacement) return; - llvm::StringRef FileName = PP->getSourceManager().getFilename( + const llvm::StringRef FileName = PP->getSourceManager().getFilename( MD.getMacroInfo()->getDefinitionLoc()); if (!FileName.ends_with("gtest/gtest-typed-test.h")) return; - DiagnosticBuilder Diag = Check->diag(Loc, RenameCaseToSuiteMessage); + const DiagnosticBuilder Diag = Check->diag(Loc, RenameCaseToSuiteMessage); if (Action == CheckAction::Rename) Diag << FixItHint::CreateReplacement( @@ -234,7 +234,7 @@ static bool isInInstantiation(const NodeType &Node, template static bool isInTemplate(const NodeType &Node, const MatchFinder::MatchResult &Result) { - internal::Matcher IsInsideTemplate = + const internal::Matcher IsInsideTemplate = hasAncestor(decl(anyOf(classTemplateDecl(), functionTemplateDecl()))); return !match(IsInsideTemplate, Node, *Result.Context).empty(); } @@ -340,7 +340,7 @@ void UpgradeGoogletestCaseCheck::check(const MatchFinder::MatchResult &Result) { // will only be instantiated with the true type name, `TestSuite`. } - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(ReplacementRange.getBegin(), RenameCaseToSuiteMessage); ReplacementRange = Lexer::makeFileCharRange( diff --git a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp index fbfd5d3430519..00446dc62d0d5 100644 --- a/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/UsingNamespaceDirectiveCheck.cpp @@ -22,7 +22,7 @@ void UsingNamespaceDirectiveCheck::registerMatchers( void UsingNamespaceDirectiveCheck::check( const MatchFinder::MatchResult &Result) { const auto *U = Result.Nodes.getNodeAs("usingNamespace"); - SourceLocation Loc = U->getBeginLoc(); + const SourceLocation Loc = U->getBeginLoc(); if (U->isImplicit() || !Loc.isValid()) return; diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h index 12fe7f7eb340d..800e7ac9663d5 100644 --- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class ExceptionBaseclassCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTION_BASECLASS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_EXCEPTIONBASECLASSCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp 
b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp index e610d99007d4e..e9a5819a939f9 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp @@ -152,7 +152,7 @@ void MultiwayPathsCoveredCheck::handleSwitchWithoutDefault( assert(CaseCount > 0 && "Switch statement without any case found. This case " "should be excluded by the matcher and is handled " "separately."); - std::size_t MaxPathsPossible = [&]() { + const std::size_t MaxPathsPossible = [&]() { if (const auto *GeneralCondition = Result.Nodes.getNodeAs("non-enum-condition")) { return getNumberOfPossibleValues(GeneralCondition->getType(), diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h index 902be2d9d324d..e22e31ac7b05a 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class MultiwayPathsCoveredCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAY_PATHS_COVERED_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_MULTIWAYPATHSCOVEREDCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp index a89a896b32981..e7d97b2a26b2f 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp @@ -13,17 +13,10 @@ using namespace clang::ast_matchers; namespace clang::tidy::hicpp { -namespace { -AST_MATCHER(VarDecl, isAsm) { return Node.hasAttr(); } -const ast_matchers::internal::VariadicDynCastAllOfMatcher - fileScopeAsmDecl; // NOLINT(readability-identifier-*) preserve clang style -} // namespace - void NoAssemblerCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(asmStmt().bind("asm-stmt"), this); Finder->addMatcher(fileScopeAsmDecl().bind("asm-file-scope"), this); - Finder->addMatcher(varDecl(isAsm()).bind("asm-var"), this); + Finder->addMatcher(varDecl(hasAttr(attr::AsmLabel)).bind("asm-var"), this); } void NoAssemblerCheck::check(const MatchFinder::MatchResult &Result) { diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h index 1ff40eae4622b..15d646fd97af3 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H #include "../ClangTidyCheck.h" @@ -27,4 +27,4 @@ class NoAssemblerCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NO_ASSEMBLER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_NOASSEMBLERCHECK_H diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h index 499a4e7bebc14..ef92a4d13f43e 100644 --- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class SignedBitwiseCheck : public ClangTidyCheck { } // namespace clang::tidy::hicpp -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNED_BITWISE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_HICPP_SIGNEDBITWISECHECK_H diff --git a/clang-tools-extra/clang-tidy/llvm/.clang-tidy b/clang-tools-extra/clang-tidy/llvm/.clang-tidy new file mode 100644 index 0000000000000..08c2c1d2328e9 --- /dev/null +++ b/clang-tools-extra/clang-tidy/llvm/.clang-tidy @@ -0,0 +1,5 @@ +InheritParentConfig: true +# FIXME(vbvictor) enable this check when https://github.com/llvm/llvm-project/issues/166750 is fixed +# and github runners are updated to include the fix +Checks: > + -llvm-header-guard diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp index 8737c1e5f4b05..ef8b6b1dfb8f7 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp @@ -28,24 +28,24 @@ std::string LLVMHeaderGuardCheck::getHeaderGuard(StringRef Filename, // style in include/llvm and include/clang which we want to preserve. // We don't want _INCLUDE_ in our guards. - size_t PosInclude = Guard.rfind("include/"); + const size_t PosInclude = Guard.rfind("include/"); if (PosInclude != StringRef::npos) Guard = Guard.substr(PosInclude + std::strlen("include/")); // For clang we drop the _TOOLS_. - size_t PosToolsClang = Guard.rfind("tools/clang/"); + const size_t PosToolsClang = Guard.rfind("tools/clang/"); if (PosToolsClang != StringRef::npos) Guard = Guard.substr(PosToolsClang + std::strlen("tools/")); // Unlike LLVM svn, LLVM git monorepo is named llvm-project, so we replace // "/llvm-project/" with the canonical "/llvm/". 
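A worked example of the path trimming above, using a hypothetical checkout path; the final uppercasing and '_' substitution are done by the shared header-guard machinery rather than by this function, so that last step is assumed here.

// some/checkout/llvm-project/llvm/include/llvm/ADT/Foo.h
//   after the rfind("include/") trim -> llvm/ADT/Foo.h
//   after guard sanitization         -> LLVM_ADT_FOO_H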
const static StringRef LLVMProject = "/llvm-project/"; - size_t PosLLVMProject = Guard.rfind(LLVMProject); + const size_t PosLLVMProject = Guard.rfind(LLVMProject); if (PosLLVMProject != StringRef::npos) Guard = Guard.replace(PosLLVMProject, LLVMProject.size(), "/llvm/"); // The remainder is LLVM_FULL_PATH_TO_HEADER_H - size_t PosLLVM = Guard.rfind("llvm/"); + const size_t PosLLVM = Guard.rfind("llvm/"); if (PosLLVM != StringRef::npos) Guard = Guard.substr(PosLLVM); diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp index f34e3a67c03ab..416aca188e01c 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.cpp @@ -162,15 +162,15 @@ void IncludeOrderPPCallbacks::EndOfMainFile() { continue; const IncludeDirective &CopyFrom = FileDirectives[IncludeIndices[I]]; - SourceLocation FromLoc = CopyFrom.Range.getBegin(); + const SourceLocation FromLoc = CopyFrom.Range.getBegin(); const char *FromData = SM.getCharacterData(FromLoc); - unsigned FromLen = std::strcspn(FromData, "\n"); + const unsigned FromLen = std::strcspn(FromData, "\n"); - StringRef FixedName(FromData, FromLen); + const StringRef FixedName(FromData, FromLen); - SourceLocation ToLoc = FileDirectives[I].Range.getBegin(); + const SourceLocation ToLoc = FileDirectives[I].Range.getBegin(); const char *ToData = SM.getCharacterData(ToLoc); - unsigned ToLen = std::strcspn(ToData, "\n"); + const unsigned ToLen = std::strcspn(ToData, "\n"); auto ToRange = CharSourceRange::getCharRange(ToLoc, ToLoc.getLocWithOffset(ToLen)); diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp index f4f3543b56e5c..8966745eb44a7 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp @@ -68,8 +68,8 @@ void PreferIsaOrDynCastInConditionalsCheck::check( // llvm::cast(x) // ^ ^ // StartLoc EndLoc - SourceLocation StartLoc = Callee->getLocation(); - SourceLocation EndLoc = Callee->getNameInfo().getEndLoc(); + const SourceLocation StartLoc = Callee->getLocation(); + const SourceLocation EndLoc = Callee->getNameInfo().getEndLoc(); if (Result.Nodes.getNodeAs("var")) { diag(StartLoc, diff --git a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp index b8b7c41e970bb..7dea84516502b 100644 --- a/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/TwineLocalCheck.cpp @@ -40,13 +40,13 @@ void TwineLocalCheck::check(const MatchFinder::MatchResult &Result) { C = cast(C)->getArg(0)->IgnoreParenImpCasts(); } - SourceRange TypeRange = + const SourceRange TypeRange = VD->getTypeSourceInfo()->getTypeLoc().getSourceRange(); // A real Twine, turn it into a std::string. 
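Concretely, for a hypothetical local where Count is an int in scope, the fix-it turns the commented declaration into the one below it: the type becomes std::string and the initializer is wrapped so the Twine is materialized immediately.

#include "llvm/ADT/Twine.h"
#include <string>

// Before: const llvm::Twine Message = "count = " + llvm::Twine(Count);
const std::string Message = ("count = " + llvm::Twine(Count)).str();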
if (VD->getType()->getCanonicalTypeUnqualified() == C->getType()->getCanonicalTypeUnqualified()) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( VD->getInit()->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); Diag << FixItHint::CreateReplacement(TypeRange, "std::string") << FixItHint::CreateInsertion(VD->getInit()->getBeginLoc(), "(") diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp index bd51cc5037dca..55b383eca4456 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp @@ -18,18 +18,15 @@ #include "llvm/Support/FormatVariadic.h" namespace clang::tidy::llvm_check { -namespace { using namespace ::clang::ast_matchers; using namespace ::clang::transformer; -EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, - RangeSelector CallArgs) { +static EditGenerator rewrite(RangeSelector Call, RangeSelector Builder) { // This is using an EditGenerator rather than ASTEdit as we want to warn even // if in macro. - return [Call = std::move(Call), Builder = std::move(Builder), - CallArgs = - std::move(CallArgs)](const MatchFinder::MatchResult &Result) + return [Call = std::move(Call), + Builder = std::move(Builder)](const MatchFinder::MatchResult &Result) -> Expected> { Expected CallRange = Call(Result); if (!CallRange) @@ -39,7 +36,7 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, SourceLocation Begin = CallRange->getBegin(); // This will result in just a warning and no edit. - bool InMacro = CallRange->getBegin().isMacroID(); + const bool InMacro = CallRange->getBegin().isMacroID(); if (InMacro) { while (SM.isMacroArgExpansion(Begin)) Begin = SM.getImmediateExpansionRange(Begin).getBegin(); @@ -54,7 +51,7 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, auto NextToken = [&](std::optional CurrentToken) { if (!CurrentToken) return CurrentToken; - if (CurrentToken->getEndLoc() >= CallRange->getEnd()) + if (CurrentToken->is(clang::tok::eof)) return std::optional(); return clang::Lexer::findNextToken(CurrentToken->getLocation(), SM, LangOpts); @@ -68,9 +65,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, return llvm::make_error(llvm::errc::invalid_argument, "missing '<' token"); } + std::optional EndToken = NextToken(LessToken); - for (std::optional GreaterToken = NextToken(EndToken); - GreaterToken && GreaterToken->getKind() != clang::tok::greater; + std::optional GreaterToken = NextToken(EndToken); + for (; GreaterToken && GreaterToken->getKind() != clang::tok::greater; GreaterToken = NextToken(GreaterToken)) { EndToken = GreaterToken; } @@ -79,12 +77,21 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, "missing '>' token"); } + std::optional ArgStart = NextToken(GreaterToken); + if (!ArgStart || ArgStart->getKind() != clang::tok::l_paren) { + return llvm::make_error(llvm::errc::invalid_argument, + "missing '(' token"); + } + std::optional Arg = NextToken(ArgStart); + if (!Arg) { + return llvm::make_error(llvm::errc::invalid_argument, + "unexpected end of file"); + } + const bool HasArgs = Arg->getKind() != clang::tok::r_paren; + Expected BuilderRange = Builder(Result); if (!BuilderRange) return BuilderRange.takeError(); - Expected CallArgsRange = CallArgs(Result); - if (!CallArgsRange) - return CallArgsRange.takeError(); // Helper for concatting below. 
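The token iteration above, isolated into a sketch: stopping on tok::eof replaces the earlier end-offset comparison against the call range. SM and LangOpts are assumed to be the enclosing SourceManager and LangOptions; clang/Lex/Lexer.h and <optional> are needed.

// Advance one token at a time; returning std::nullopt ends the walk.
auto NextToken =
    [&](std::optional<clang::Token> Tok) -> std::optional<clang::Token> {
  if (!Tok || Tok->is(clang::tok::eof))
    return std::nullopt;
  return clang::Lexer::findNextToken(Tok->getLocation(), SM, LangOpts);
};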
auto GetText = [&](const CharSourceRange &Range) { @@ -93,43 +100,42 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, Edit Replace; Replace.Kind = EditKind::Range; - Replace.Range = *CallRange; - std::string CallArgsStr; - // Only emit args if there are any. - if (auto CallArgsText = GetText(*CallArgsRange).ltrim(); - !CallArgsText.rtrim().empty()) { - CallArgsStr = llvm::formatv(", {}", CallArgsText); + Replace.Range.setBegin(CallRange->getBegin()); + Replace.Range.setEnd(ArgStart->getEndLoc()); + const Expr *BuilderExpr = Result.Nodes.getNodeAs("builder"); + std::string BuilderText = GetText(*BuilderRange).str(); + if (BuilderExpr->getType()->isPointerType()) { + BuilderText = BuilderExpr->isImplicitCXXThis() + ? "*this" + : llvm::formatv("*{}", BuilderText).str(); } - Replace.Replacement = - llvm::formatv("{}::create({}{})", - GetText(CharSourceRange::getTokenRange( - LessToken->getEndLoc(), EndToken->getLastLoc())), - GetText(*BuilderRange), CallArgsStr); + const StringRef OpType = GetText(CharSourceRange::getTokenRange( + LessToken->getEndLoc(), EndToken->getLastLoc())); + Replace.Replacement = llvm::formatv("{}::create({}{}", OpType, BuilderText, + HasArgs ? ", " : ""); return SmallVector({Replace}); }; } -RewriteRuleWith useNewMlirOpBuilderCheckRule() { - Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " - "'builder.create(...)'"); +static RewriteRuleWith useNewMlirOpBuilderCheckRule() { + const Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " + "'builder.create(...)'"); // Match a create call on an OpBuilder. - ast_matchers::internal::Matcher Base = + auto BuilderType = cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")); + const ast_matchers::internal::Matcher Base = cxxMemberCallExpr( - on(expr(hasType( - cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")))) + on(expr(anyOf(hasType(BuilderType), hasType(pointsTo(BuilderType)))) .bind("builder")), - callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()))), - callee(cxxMethodDecl(hasName("create")))) + callee(cxxMethodDecl(hasTemplateArgument(0, templateArgument()), + hasName("create")))) .bind("call"); return applyFirst( // Attempt rewrite given an lvalue builder, else just warn. 
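Net effect at a call site, with a hypothetical op and arguments; the new pointer branch also covers builders held by pointer, including an implicit 'this'.

// builder.create<arith::ConstantOp>(loc, attr)
//   -> arith::ConstantOp::create(builder, loc, attr)
// builderPtr->create<arith::ConstantOp>(loc, attr)
//   -> arith::ConstantOp::create(*builderPtr, loc, attr)
// inside an OpBuilder member function, 'this' is printed as '*this'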
{makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base), - rewrite(node("call"), node("builder"), callArgs("call")), - Message), + rewrite(node("call"), node("builder")), Message), makeRule(Base, noopEdit(node("call")), Message)}); } -} // namespace UseNewMlirOpBuilderCheck::UseNewMlirOpBuilderCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp index 091e4fe84b36a..9f09947c4da29 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp @@ -78,7 +78,8 @@ void InlineFunctionDeclCheck::check(const MatchFinder::MatchResult &Result) { // Check if decl starts with LIBC_INLINE auto Loc = FullSourceLoc(Result.SourceManager->getFileLoc(SrcBegin), *Result.SourceManager); - llvm::StringRef SrcText = Loc.getBufferData().drop_front(Loc.getFileOffset()); + const llvm::StringRef SrcText = + Loc.getBufferData().drop_front(Loc.getFileOffset()); if (SrcText.starts_with("LIBC_INLINE")) return; diff --git a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h index 50669dc073291..8b8b719df62f4 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/NamespaceConstants.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H + #include "llvm/ADT/StringRef.h" namespace clang::tidy::llvm_libc { @@ -18,3 +21,5 @@ const static llvm::StringRef RequiredNamespaceDeclMacroName = "LIBC_NAMESPACE_DECL"; } // namespace clang::tidy::llvm_libc + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_NAMESPACECONSTANTS_H diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 129b8a9a30a59..ecd8e19b8b2c6 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -22,11 +22,11 @@ namespace { class RestrictedIncludesPPCallbacks : public portability::RestrictedIncludesPPCallbacks { public: - explicit RestrictedIncludesPPCallbacks( - RestrictSystemLibcHeadersCheck &Check, const SourceManager &SM, - const SmallString<128> CompilerIncudeDir) + explicit RestrictedIncludesPPCallbacks(RestrictSystemLibcHeadersCheck &Check, + const SourceManager &SM, + SmallString<128> CompilerIncudeDir) : portability::RestrictedIncludesPPCallbacks(Check, SM), - CompilerIncudeDir(CompilerIncudeDir) {} + CompilerIncudeDir(std::move(CompilerIncudeDir)) {} void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, @@ -61,7 +61,7 @@ void RestrictSystemLibcHeadersCheck::registerPPCallbacks( StringRef(PP->getHeaderSearchInfo().getHeaderSearchOpts().ResourceDir); llvm::sys::path::append(CompilerIncudeDir, "include"); PP->addPPCallbacks(std::make_unique( - *this, SM, CompilerIncudeDir)); + *this, SM, std::move(CompilerIncudeDir))); } } // namespace clang::tidy::llvm_libc diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index 2cfee5fd10713..6214ee92927f2 100644 --- 
a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -25,12 +25,12 @@ add_clang_library(clangTidyMiscModule STATIC HeaderIncludeCycleCheck.cpp IncludeCleanerCheck.cpp MiscTidyModule.cpp - MisleadingBidirectional.cpp - MisleadingIdentifier.cpp + MisleadingBidirectionalCheck.cpp + MisleadingIdentifierCheck.cpp MisplacedConstCheck.cpp NewDeleteOverloadsCheck.cpp NoRecursionCheck.cpp - NonCopyableObjects.cpp + NonCopyableObjectsCheck.cpp NonPrivateMemberVariablesInClassesCheck.cpp OverrideWithDifferentVisibilityCheck.cpp RedundantExpressionCheck.cpp diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp index b05fd049cef74..61d5477583b80 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp @@ -56,7 +56,7 @@ static llvm::SmallString<64U> skeleton(StringRef Name) { const char *Prev = Curr; UTF32 CodePoint = 0; - ConversionResult Result = convertUTF8Sequence( + const ConversionResult Result = convertUTF8Sequence( reinterpret_cast<const UTF8 **>(&Curr), reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion); if (Result != conversionOK) { @@ -64,7 +64,7 @@ static llvm::SmallString<64U> skeleton(StringRef Name) { break; } - StringRef Key(Prev, Curr - Prev); + const StringRef Key(Prev, Curr - Prev); auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint, [](decltype(ConfusableEntries[0]) X, UTF32 Y) { return X.codepoint < Y; }); @@ -183,7 +183,7 @@ void ConfusableIdentifierCheck::addDeclToCheck(const NamedDecl *ND, if (!NDII) return; - StringRef NDName = NDII->getName(); + const StringRef NDName = NDII->getName(); if (NDName.empty()) return; @@ -216,7 +216,7 @@ void ConfusableIdentifierCheck::onEndOfTranslationUnit() { // the same skeleton. for (const IdentifierInfo *II : Idents) { for (auto [OuterND, OuterParent] : NameToDecls[II]) { - for (Entry Inner : DeclsWithinContext[OuterParent]) { + for (const Entry Inner : DeclsWithinContext[OuterParent]) { // Don't complain if the identifiers are the same.
if (OuterND->getIdentifier() == Inner.ND->getIdentifier()) continue; diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h index b341d03083c92..5b98d48780eb5 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -41,4 +41,4 @@ class ConfusableIdentifierCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLEIDENTIFIERCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp index 6a079024cfe1c..f5dcc7f4edcb6 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp @@ -26,7 +26,7 @@ int main(int argc, char *argv[]) { std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries; SmallVector<StringRef> Values; - for (StringRef Line : Lines) { + for (const StringRef Line : Lines) { if (Line.starts_with("#")) continue; @@ -37,14 +37,14 @@ int main(int argc, char *argv[]) { return 2; } - llvm::StringRef From = Values[0].trim(); + const llvm::StringRef From = Values[0].trim(); llvm::UTF32 CodePoint = 0; From.getAsInteger(16, CodePoint); SmallVector<llvm::UTF32> To; SmallVector<StringRef> ToN; Values[1].split(ToN, ' ', -1, false); - for (StringRef ToI : ToN) { + for (const StringRef ToI : ToN) { llvm::UTF32 ToCodePoint = 0; ToI.trim().getAsInteger(16, ToCodePoint); To.push_back(ToCodePoint); @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) { } llvm::sort(Entries); - unsigned LargestValue = + const unsigned LargestValue = llvm::max_element(Entries, [](const auto &Entry0, const auto &Entry1) { return Entry0.second.size() < Entry1.second.size(); })->second.size(); diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp index 3b9b8e0daa62a..a7b74944690b4 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp @@ -56,7 +56,7 @@ AST_MATCHER_P(Stmt, forEachPrevStmt, ast_matchers::internal::Matcher<Stmt>, InnerMatcher) { // Matches the expression awaited by the `co_await`.
AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher, InnerMatcher) { - if (Expr *E = Node.getOperand()) + if (const Expr *E = Node.getOperand()) return InnerMatcher.matches(*E, Finder, Builder); return false; } @@ -73,7 +73,9 @@ CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name, RAIITypesList(utils::options::parseStringList( Options.get("RAIITypesList", "std::lock_guard;std::scoped_lock"))), AllowedAwaitablesList(utils::options::parseStringList( - Options.get("AllowedAwaitablesList", ""))) {} + Options.get("AllowedAwaitablesList", ""))), + AllowedCallees( + utils::options::parseStringList(Options.get("AllowedCallees", ""))) {} void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) { // A suspension happens with co_await or co_yield. @@ -81,7 +83,9 @@ void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) { hasAttr(attr::Kind::ScopedLockable))))) .bind("scoped-lockable"); auto OtherRAII = varDecl(typeWithNameIn(RAIITypesList)).bind("raii"); - auto AllowedSuspend = awaitable(typeWithNameIn(AllowedAwaitablesList)); + auto AllowedSuspend = awaitable( + anyOf(typeWithNameIn(AllowedAwaitablesList), + callExpr(callee(functionDecl(hasAnyName(AllowedCallees)))))); Finder->addMatcher( expr(anyOf(coawaitExpr(unless(AllowedSuspend)), coyieldExpr()), forEachPrevStmt( @@ -111,5 +115,7 @@ void CoroutineHostileRAIICheck::storeOptions( utils::options::serializeStringList(RAIITypesList)); Options.store(Opts, "SafeAwaitableList", utils::options::serializeStringList(AllowedAwaitablesList)); + Options.store(Opts, "SafeCallees", + utils::options::serializeStringList(AllowedCallees)); } } // namespace clang::tidy::misc diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h index e100509ea261d..12ad1b1e0e220 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H #include "../ClangTidyCheck.h" #include "clang/AST/ASTTypeTraits.h" @@ -46,8 +46,11 @@ class CoroutineHostileRAIICheck : public ClangTidyCheck { // List of fully qualified awaitable types which are considered safe to // co_await. std::vector AllowedAwaitablesList; + // List of callees whose return values are considered safe to directly + // co_await. 
+ std::vector AllowedCallees; }; } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINESHOSTILERAIICHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_COROUTINEHOSTILERAIICHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp index 714af111e7f7a..c10ee1d92cd59 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.cpp @@ -93,7 +93,8 @@ void DefinitionsInHeadersCheck::check(const MatchFinder::MatchResult &Result) { } } - bool IsFullSpec = FD->getTemplateSpecializationKind() != TSK_Undeclared; + const bool IsFullSpec = + FD->getTemplateSpecializationKind() != TSK_Undeclared; diag(FD->getLocation(), "%select{function|full function template specialization}0 %1 defined " "in a header file; function definitions in header files can lead to " diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h index 0c162cc53ff5f..e52fa20460c9d 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H #include "../ClangTidyCheck.h" #include "../utils/FileExtensionsUtils.h" @@ -38,4 +38,4 @@ class DefinitionsInHeadersCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONS_IN_HEADERS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_DEFINITIONSINHEADERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp index 1a5aa4b0758a6..558c368901f1c 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp @@ -200,7 +200,7 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { Unused.push_back(&I); } - llvm::StringRef Code = SM->getBufferData(SM->getMainFileID()); + const llvm::StringRef Code = SM->getBufferData(SM->getMainFileID()); auto FileStyle = format::getStyle(format::DefaultFormatStyle, getCurrentMainFile(), format::DefaultFallbackStyle, Code, @@ -220,14 +220,14 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { } if (MissingIncludes) { - tooling::HeaderIncludes HeaderIncludes(getCurrentMainFile(), Code, - FileStyle->IncludeStyle); + const tooling::HeaderIncludes HeaderIncludes(getCurrentMainFile(), Code, + FileStyle->IncludeStyle); // Deduplicate insertions when running in bulk fix mode. 
llvm::StringSet<> InsertedHeaders{}; for (const auto &Inc : Missing) { - std::string Spelling = include_cleaner::spellHeader( + const std::string Spelling = include_cleaner::spellHeader( {Inc.Missing, PP->getHeaderSearchInfo(), MainFile}); - bool Angled = llvm::StringRef{Spelling}.starts_with("<"); + const bool Angled = llvm::StringRef{Spelling}.starts_with("<"); // We might suggest insertion of an existing include in edge cases, e.g., // include is present in a PP-disabled region, or spelling of the header // turns out to be the same as one of the unresolved includes in the @@ -235,7 +235,7 @@ void IncludeCleanerCheck::check(const MatchFinder::MatchResult &Result) { if (auto Replacement = HeaderIncludes.insert( llvm::StringRef{Spelling}.trim("\"<>"), Angled, tooling::IncludeDirective::Include)) { - DiagnosticBuilder DB = + const DiagnosticBuilder DB = diag(SM->getSpellingLoc(Inc.SymRef.RefLocation), "no header providing \"%0\" is directly included") << Inc.SymRef.Target.name(); diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h index 43e1ed894a16c..619d8191ab41f 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H #include "../ClangTidyCheck.h" #include "../ClangTidyDiagnosticConsumer.h" @@ -57,4 +57,4 @@ class IncludeCleanerCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_INCLUDECLEANERCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index 6f4af6c44dcb4..347fa2a82e43c 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -15,12 +15,12 @@ #include "DefinitionsInHeadersCheck.h" #include "HeaderIncludeCycleCheck.h" #include "IncludeCleanerCheck.h" -#include "MisleadingBidirectional.h" -#include "MisleadingIdentifier.h" +#include "MisleadingBidirectionalCheck.h" +#include "MisleadingIdentifierCheck.h" #include "MisplacedConstCheck.h" #include "NewDeleteOverloadsCheck.h" #include "NoRecursionCheck.h" -#include "NonCopyableObjects.h" +#include "NonCopyableObjectsCheck.h" #include "NonPrivateMemberVariablesInClassesCheck.h" #include "OverrideWithDifferentVisibilityCheck.h" #include "RedundantExpressionCheck.h" diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp similarity index 92% rename from clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp rename to clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp index f89c539423507..4807567710f2d 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "MisleadingBidirectional.h" +#include "MisleadingBidirectionalCheck.h" #include 
"clang/Frontend/CompilerInstance.h" #include "clang/Lex/Preprocessor.h" @@ -40,18 +40,18 @@ static bool containsMisleadingBidi(StringRef Buffer, // // Warn if we end up with an unclosed context. while (CurPtr < Buffer.end()) { - unsigned char C = *CurPtr; + const unsigned char C = *CurPtr; if (isASCII(C)) { ++CurPtr; - bool IsParagrapSep = + const bool IsParagrapSep = (C == 0xA || C == 0xD || (0x1C <= C && C <= 0x1E) || C == 0x85); - bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F); + const bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F); if (IsParagrapSep || IsSegmentSep) BidiContexts.clear(); continue; } llvm::UTF32 CodePoint = 0; - llvm::ConversionResult Result = llvm::convertUTF8Sequence( + const llvm::ConversionResult Result = llvm::convertUTF8Sequence( (const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)Buffer.end(), &CodePoint, llvm::strictConversion); @@ -94,7 +94,7 @@ class MisleadingBidirectionalCheck::MisleadingBidirectionalHandler bool HandleComment(Preprocessor &PP, SourceRange Range) override { // FIXME: check that we are in a /* */ comment - StringRef Text = + const StringRef Text = Lexer::getSourceText(CharSourceRange::getCharRange(Range), PP.getSourceManager(), PP.getLangOpts()); @@ -124,7 +124,7 @@ void MisleadingBidirectionalCheck::registerPPCallbacks( void MisleadingBidirectionalCheck::check( const ast_matchers::MatchFinder::MatchResult &Result) { if (const auto *SL = Result.Nodes.getNodeAs("strlit")) { - StringRef Literal = SL->getBytes(); + const StringRef Literal = SL->getBytes(); if (containsMisleadingBidi(Literal, false)) diag(SL->getBeginLoc(), "string literal contains misleading " "bidirectional Unicode characters"); diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h similarity index 86% rename from clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h rename to clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h index ba895b95b9a25..7c7577f129f36 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingBidirectionalCheck.h @@ -13,6 +13,10 @@ namespace clang::tidy::misc { +/// Warns about unterminated bidirectional unicode sequence. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/misleading-bidirectional.html class MisleadingBidirectionalCheck : public ClangTidyCheck { public: MisleadingBidirectionalCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp similarity index 97% rename from clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp rename to clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp index ce04fb6fa4096..335fffc5d47af 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "MisleadingIdentifier.h" +#include "MisleadingIdentifierCheck.h" #include "llvm/Support/ConvertUTF.h" @@ -124,7 +124,7 @@ static bool hasRTLCharacters(StringRef Buffer) { const char *EndPtr = Buffer.end(); while (CurPtr < EndPtr) { llvm::UTF32 CodePoint = 0; - llvm::ConversionResult Result = llvm::convertUTF8Sequence( + const llvm::ConversionResult Result = llvm::convertUTF8Sequence( (const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)EndPtr, &CodePoint, llvm::strictConversion); if (Result != llvm::conversionOK) @@ -144,9 +144,9 @@ MisleadingIdentifierCheck::~MisleadingIdentifierCheck() = default; void MisleadingIdentifierCheck::check( const ast_matchers::MatchFinder::MatchResult &Result) { if (const auto *ND = Result.Nodes.getNodeAs("nameddecl")) { - IdentifierInfo *II = ND->getIdentifier(); + const IdentifierInfo *II = ND->getIdentifier(); if (II) { - StringRef NDName = II->getName(); + const StringRef NDName = II->getName(); if (hasRTLCharacters(NDName)) diag(ND->getBeginLoc(), "identifier has right-to-left codepoints"); } diff --git a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h similarity index 76% rename from clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h rename to clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h index 06b83d567a9d2..9f8eb4039fa44 100644 --- a/clang-tools-extra/clang-tidy/misc/MisleadingIdentifier.h +++ b/clang-tools-extra/clang-tidy/misc/MisleadingIdentifierCheck.h @@ -13,6 +13,12 @@ namespace clang::tidy::misc { +/// Finds identifiers that contain Unicode characters with right-to-left +/// direction, which can be confusing as they may change the understanding of a +/// whole statement line. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/misleading-identifier.html class MisleadingIdentifierCheck : public ClangTidyCheck { public: MisleadingIdentifierCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp index afa59f31d7259..c8c0cfd1c6ad5 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.cpp @@ -40,7 +40,7 @@ static QualType guessAlternateQualification(ASTContext &Context, QualType QT) { Qualifiers Quals = QT.getLocalQualifiers(); Quals.removeConst(); - QualType NewQT = Context.getPointerType( + const QualType NewQT = Context.getPointerType( QualType(QT->getPointeeType().getTypePtr(), Qualifiers::Const)); return NewQT.withCVRQualifiers(Quals.getCVRQualifiers()); } @@ -48,7 +48,7 @@ static QualType guessAlternateQualification(ASTContext &Context, QualType QT) { void MisplacedConstCheck::check(const MatchFinder::MatchResult &Result) { const auto *Var = Result.Nodes.getNodeAs("decl"); ASTContext &Ctx = *Result.Context; - QualType CanQT = Var->getType().getCanonicalType(); + const QualType CanQT = Var->getType().getCanonicalType(); SourceLocation AliasLoc; const char *AliasType = nullptr; diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h index 2b8a05d003fad..5f5a4cfdc6752 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class MisplacedConstCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACED_CONST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MISPLACEDCONSTCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index 9801c9ea04d2d..a44e9b381d982 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -51,7 +51,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) { return true; const auto *FPT = Node.getType()->castAs(); - ASTContext &Ctx = Node.getASTContext(); + const ASTContext &Ctx = Node.getASTContext(); if (Ctx.getLangOpts().SizedDeallocation && ASTContext::hasSameType(FPT->getParamType(1), Ctx.getSizeType())) return false; diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h index 93c39fc7005cf..9c7aff082f8cd 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H +#ifndef 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/SmallVector.h" @@ -33,4 +33,4 @@ class NewDeleteOverloadsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NEWDELETEOVERLOADSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp index 035598d354503..8bcbb61961fa6 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp @@ -122,7 +122,7 @@ template class SmartSmallSetVector { } // Set time! // Note that this must be after `populateSet()` might have been called. - bool SetInsertionSucceeded = Set.insert(V).second; + const bool SetInsertionSucceeded = Set.insert(V).second; (void)SetInsertionSucceeded; assert(SetInsertionSucceeded && "We did check that no such value existed"); return true; @@ -132,7 +132,7 @@ template class SmartSmallSetVector { /// Insert a new element into the SmartSmallSetVector. /// \returns true if the element was inserted into the SmartSmallSetVector. bool insert(const T &X) { - bool Result = setInsert(X); + const bool Result = setInsert(X); if (Result) Vector.push_back(X); return Result; @@ -200,8 +200,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { assert(!SCC.empty() && "Empty SCC does not make sense."); // First of all, call out every strongly connected function. - for (CallGraphNode *N : SCC) { - FunctionDecl *D = N->getDefinition(); + for (const CallGraphNode *N : SCC) { + const FunctionDecl *D = N->getDefinition(); diag(D->getLocation(), "function %0 is within a recursive call chain") << D; } @@ -224,7 +224,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { assert(CyclicCallStack.size() >= 2 && "Cycle requires at least 2 frames"); // Which function we decided to be the entry point that lead to the recursion? - FunctionDecl *CycleEntryFn = CyclicCallStack.front().Callee->getDefinition(); + const FunctionDecl *CycleEntryFn = + CyclicCallStack.front().Callee->getDefinition(); // And now, for ease of understanding, let's print the call sequence that // forms the cycle in question. 
diag(CycleEntryFn->getLocation(), @@ -233,8 +234,8 @@ void NoRecursionCheck::handleSCC(ArrayRef SCC) { << CycleEntryFn; for (int CurFrame = 1, NumFrames = CyclicCallStack.size(); CurFrame != NumFrames; ++CurFrame) { - CallGraphNode::CallRecord PrevNode = CyclicCallStack[CurFrame - 1]; - CallGraphNode::CallRecord CurrNode = CyclicCallStack[CurFrame]; + const CallGraphNode::CallRecord PrevNode = CyclicCallStack[CurFrame - 1]; + const CallGraphNode::CallRecord CurrNode = CyclicCallStack[CurFrame]; Decl *PrevDecl = PrevNode.Callee->getDecl(); Decl *CurrDecl = CurrNode.Callee->getDecl(); diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp similarity index 98% rename from clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp rename to clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp index b33e2667ef660..bfeb5fa855af6 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.cpp +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "NonCopyableObjects.h" +#include "NonCopyableObjectsCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" diff --git a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h similarity index 82% rename from clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h rename to clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h index 2fcbf41dcf5e1..608e07833d000 100644 --- a/clang-tools-extra/clang-tidy/misc/NonCopyableObjects.h +++ b/clang-tools-extra/clang-tidy/misc/NonCopyableObjectsCheck.h @@ -6,15 +6,18 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H #include "../ClangTidyCheck.h" namespace clang::tidy::misc { -/// The check flags dereferences and non-pointer declarations of objects that +/// Flags dereferences and non-pointer declarations of objects that /// are not meant to be passed by value, such as C FILE objects. 
+/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/non-copyable-objects.html class NonCopyableObjectsCheck : public ClangTidyCheck { public: NonCopyableObjectsCheck(StringRef Name, ClangTidyContext *Context) @@ -25,4 +28,4 @@ class NonCopyableObjectsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_NONCOPYABLEOBJECTSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index 6baa12a8bcedf..ea8405364df4c 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ -77,8 +77,8 @@ static bool areEquivalentExpr(const Expr *Left, const Expr *Right) { return cast(Left)->getValue() == cast(Right)->getValue(); case Stmt::IntegerLiteralClass: { - llvm::APInt LeftLit = cast(Left)->getValue(); - llvm::APInt RightLit = cast(Right)->getValue(); + const llvm::APInt LeftLit = cast(Left)->getValue(); + const llvm::APInt RightLit = cast(Right)->getValue(); return LeftLit.getBitWidth() == RightLit.getBitWidth() && LeftLit == RightLit; } @@ -256,7 +256,7 @@ static bool rangeSubsumesRange(BinaryOperatorKind OpcodeLHS, const APSInt &ValueLHS, BinaryOperatorKind OpcodeRHS, const APSInt &ValueRHS) { - int Comparison = APSInt::compareValues(ValueLHS, ValueRHS); + const int Comparison = APSInt::compareValues(ValueLHS, ValueRHS); switch (OpcodeLHS) { case BO_EQ: return OpcodeRHS == BO_EQ && Comparison == 0; @@ -352,11 +352,11 @@ static bool hasSameOperatorParent(const Expr *TheExpr, ASTContext &Context) { // IgnoreParenImpCasts logic in reverse: skip surrounding uninteresting nodes const DynTypedNodeList Parents = Context.getParents(*TheExpr); - for (DynTypedNode DynParent : Parents) { + for (const DynTypedNode DynParent : Parents) { if (const auto *Parent = DynParent.get()) { - bool Skip = isa(Parent) || isa(Parent) || - isa(Parent) || - isa(Parent); + const bool Skip = + isa(Parent) || isa(Parent) || + isa(Parent) || isa(Parent); if (Skip && hasSameOperatorParent(Parent, OpKind, Context)) return true; if (checkOpKind(Parent, OpKind)) @@ -392,7 +392,7 @@ markDuplicateOperands(const TExpr *TheExpr, return false; if (collectOperands(Operands.second, AllOperands, OpKind)) return false; - size_t NumOperands = AllOperands.size(); + const size_t NumOperands = AllOperands.size(); llvm::SmallBitVector Duplicates(NumOperands); for (size_t I = 0; I < NumOperands; I++) { if (Duplicates[I]) @@ -463,7 +463,7 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef, Names) { const LangOptions &LO = Finder->getASTContext().getLangOpts(); SourceLocation Loc = Node.getExprLoc(); while (Loc.isMacroID()) { - StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LO); + const StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LO); if (llvm::is_contained(Names, MacroName)) return true; Loc = SM.getImmediateMacroCallerLoc(Loc); @@ -476,7 +476,7 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef, Names) { // Returns a matcher for integer constant expressions. 
static ast_matchers::internal::Matcher matchIntegerConstantExpr(StringRef Id) { - std::string CstId = (Id + "-const").str(); + const std::string CstId = (Id + "-const").str(); return expr(isIntegerConstantExpr()).bind(CstId); } @@ -486,7 +486,7 @@ matchIntegerConstantExpr(StringRef Id) { static bool retrieveIntegerConstantExpr(const MatchFinder::MatchResult &Result, StringRef Id, APSInt &Value, const Expr *&ConstExpr) { - std::string CstId = (Id + "-const").str(); + const std::string CstId = (Id + "-const").str(); ConstExpr = Result.Nodes.getNodeAs(CstId); if (!ConstExpr) return false; @@ -508,7 +508,7 @@ static bool retrieveIntegerConstantExpr(const MatchFinder::MatchResult &Result, // Returns a matcher for symbolic expressions (matches every expression except // ingeter constant expressions). static ast_matchers::internal::Matcher matchSymbolicExpr(StringRef Id) { - std::string SymId = (Id + "-sym").str(); + const std::string SymId = (Id + "-sym").str(); return ignoringParenImpCasts( expr(unless(isIntegerConstantExpr())).bind(SymId)); } @@ -517,7 +517,7 @@ static ast_matchers::internal::Matcher matchSymbolicExpr(StringRef Id) { // stores it into 'SymExpr'. static bool retrieveSymbolicExpr(const MatchFinder::MatchResult &Result, StringRef Id, const Expr *&SymExpr) { - std::string SymId = (Id + "-sym").str(); + const std::string SymId = (Id + "-sym").str(); if (const auto *Node = Result.Nodes.getNodeAs(SymId)) { SymExpr = Node; return true; @@ -557,11 +557,11 @@ retrieveBinOpIntegerConstantExpr(const MatchFinder::MatchResult &Result, // Matches relational expressions: 'Expr k' (i.e. x < 2, x != 3, 12 <= x). static ast_matchers::internal::Matcher matchRelationalIntegerConstantExpr(StringRef Id) { - std::string CastId = (Id + "-cast").str(); - std::string SwapId = (Id + "-swap").str(); - std::string NegateId = (Id + "-negate").str(); - std::string OverloadId = (Id + "-overload").str(); - std::string ConstId = (Id + "-const").str(); + const std::string CastId = (Id + "-cast").str(); + const std::string SwapId = (Id + "-swap").str(); + const std::string NegateId = (Id + "-negate").str(); + const std::string OverloadId = (Id + "-overload").str(); + const std::string ConstId = (Id + "-const").str(); const auto RelationalExpr = ignoringParenImpCasts(binaryOperator( isComparisonOperator(), expr().bind(Id), @@ -625,7 +625,7 @@ canOverloadedOperatorArgsBeModified(const CXXOperatorCallExpr *OperatorCall, if (!OperatorDecl) return true; - unsigned ParamCount = OperatorDecl->getNumParams(); + const unsigned ParamCount = OperatorDecl->getNumParams(); // Overloaded operators declared inside a class have only one param. // These functions must be declared const in order to not be able to modify @@ -647,10 +647,10 @@ static bool retrieveRelationalIntegerConstantExpr( const MatchFinder::MatchResult &Result, StringRef Id, const Expr *&OperandExpr, BinaryOperatorKind &Opcode, const Expr *&Symbol, APSInt &Value, const Expr *&ConstExpr) { - std::string CastId = (Id + "-cast").str(); - std::string SwapId = (Id + "-swap").str(); - std::string NegateId = (Id + "-negate").str(); - std::string OverloadId = (Id + "-overload").str(); + const std::string CastId = (Id + "-cast").str(); + const std::string SwapId = (Id + "-swap").str(); + const std::string NegateId = (Id + "-negate").str(); + const std::string OverloadId = (Id + "-overload").str(); if (const auto *Bin = Result.Nodes.getNodeAs(Id)) { // Operand received with explicit comparator. 
@@ -829,11 +829,11 @@ static bool areExprsFromDifferentMacros(const Expr *LhsExpr, const SourceManager &SM = AstCtx->getSourceManager(); const LangOptions &LO = AstCtx->getLangOpts(); - std::pair LsrLocInfo = + const std::pair LsrLocInfo = SM.getDecomposedLoc(SM.getExpansionLoc(Lsr.getBegin())); - std::pair RsrLocInfo = + const std::pair RsrLocInfo = SM.getDecomposedLoc(SM.getExpansionLoc(Rsr.getBegin())); - llvm::MemoryBufferRef MB = SM.getBufferOrFake(LsrLocInfo.first); + const llvm::MemoryBufferRef MB = SM.getBufferOrFake(LsrLocInfo.first); const char *LTokenPos = MB.getBufferStart() + LsrLocInfo.second; const char *RTokenPos = MB.getBufferStart() + RsrLocInfo.second; @@ -1097,7 +1097,7 @@ void RedundantExpressionCheck::checkArithmeticExpr( if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-sym")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); if (!retrieveBinOpIntegerConstantExpr(Result, "lhs", LhsOpcode, LhsSymbol, LhsValue) || !retrieveSymbolicExpr(Result, "rhs", RhsSymbol) || @@ -1118,7 +1118,7 @@ void RedundantExpressionCheck::checkArithmeticExpr( } else if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-binop-const")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); if (!retrieveBinOpIntegerConstantExpr(Result, "lhs", LhsOpcode, LhsSymbol, LhsValue) || @@ -1147,16 +1147,18 @@ void RedundantExpressionCheck::checkArithmeticExpr( } } -static bool exprEvaluatesToZero(BinaryOperatorKind Opcode, APSInt Value) { +static bool exprEvaluatesToZero(BinaryOperatorKind Opcode, + const APSInt &Value) { return (Opcode == BO_And || Opcode == BO_AndAssign) && Value == 0; } static bool exprEvaluatesToBitwiseNegatedZero(BinaryOperatorKind Opcode, - APSInt Value) { + const APSInt &Value) { return (Opcode == BO_Or || Opcode == BO_OrAssign) && ~Value == 0; } -static bool exprEvaluatesToSymbolic(BinaryOperatorKind Opcode, APSInt Value) { +static bool exprEvaluatesToSymbolic(BinaryOperatorKind Opcode, + const APSInt &Value) { return ((Opcode == BO_Or || Opcode == BO_OrAssign) && Value == 0) || ((Opcode == BO_And || Opcode == BO_AndAssign) && ~Value == 0); } @@ -1165,7 +1167,7 @@ void RedundantExpressionCheck::checkBitwiseExpr( const MatchFinder::MatchResult &Result) { if (const auto *ComparisonOperator = Result.Nodes.getNodeAs( "binop-const-compare-to-const")) { - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); APSInt LhsValue, RhsValue; const Expr *LhsSymbol = nullptr; @@ -1175,9 +1177,9 @@ void RedundantExpressionCheck::checkBitwiseExpr( !retrieveIntegerConstantExpr(Result, "rhs", RhsValue)) return; - uint64_t LhsConstant = LhsValue.getZExtValue(); - uint64_t RhsConstant = RhsValue.getZExtValue(); - SourceLocation Loc = ComparisonOperator->getOperatorLoc(); + const uint64_t LhsConstant = LhsValue.getZExtValue(); + const uint64_t RhsConstant = RhsValue.getZExtValue(); + const SourceLocation Loc = ComparisonOperator->getOperatorLoc(); // Check expression: x & k1 == k2 (i.e. 
x & 0xFF == 0xF00) if (LhsOpcode == BO_And && (LhsConstant & RhsConstant) != RhsConstant) { @@ -1208,24 +1210,24 @@ void RedundantExpressionCheck::checkBitwiseExpr( if ((Value != 0 && ~Value != 0) || Sym->getExprLoc().isMacroID()) return; - SourceLocation Loc = IneffectiveOperator->getOperatorLoc(); + const SourceLocation Loc = IneffectiveOperator->getOperatorLoc(); - BinaryOperatorKind Opcode = IneffectiveOperator->getOpcode(); + const BinaryOperatorKind Opcode = IneffectiveOperator->getOpcode(); if (exprEvaluatesToZero(Opcode, Value)) { diag(Loc, "expression always evaluates to 0"); } else if (exprEvaluatesToBitwiseNegatedZero(Opcode, Value)) { - SourceRange ConstExprRange(ConstExpr->getBeginLoc(), - ConstExpr->getEndLoc()); - StringRef ConstExprText = Lexer::getSourceText( + const SourceRange ConstExprRange(ConstExpr->getBeginLoc(), + ConstExpr->getEndLoc()); + const StringRef ConstExprText = Lexer::getSourceText( CharSourceRange::getTokenRange(ConstExprRange), *Result.SourceManager, Result.Context->getLangOpts()); diag(Loc, "expression always evaluates to '%0'") << ConstExprText; } else if (exprEvaluatesToSymbolic(Opcode, Value)) { - SourceRange SymExprRange(Sym->getBeginLoc(), Sym->getEndLoc()); + const SourceRange SymExprRange(Sym->getBeginLoc(), Sym->getEndLoc()); - StringRef ExprText = Lexer::getSourceText( + const StringRef ExprText = Lexer::getSourceText( CharSourceRange::getTokenRange(SymExprRange), *Result.SourceManager, Result.Context->getLangOpts()); @@ -1240,7 +1242,7 @@ void RedundantExpressionCheck::checkRelationalExpr( "comparisons-of-symbol-and-const")) { // Matched expressions are: (x k1) (x k2). // E.g.: (X < 2) && (X > 4) - BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); + const BinaryOperatorKind Opcode = ComparisonOperator->getOpcode(); const Expr *LhsExpr = nullptr, *RhsExpr = nullptr; const Expr *LhsSymbol = nullptr, *RhsSymbol = nullptr; @@ -1392,7 +1394,7 @@ void RedundantExpressionCheck::check(const MatchFinder::MatchResult &Result) { if (Call && canOverloadedOperatorArgsBeModified(Call, true)) return; - StringRef Message = + const StringRef Message = Call ? 
"overloaded operator has equivalent nested operands" : "operator has equivalent nested operands"; @@ -1405,12 +1407,12 @@ void RedundantExpressionCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *NegateOperator = Result.Nodes.getNodeAs("logical-bitwise-confusion")) { - SourceLocation OperatorLoc = NegateOperator->getOperatorLoc(); + const SourceLocation OperatorLoc = NegateOperator->getOperatorLoc(); auto Diag = diag(OperatorLoc, "ineffective logical negation operator used; did you mean '~'?"); - SourceLocation LogicalNotLocation = OperatorLoc.getLocWithOffset(1); + const SourceLocation LogicalNotLocation = OperatorLoc.getLocWithOffset(1); if (!LogicalNotLocation.isMacroID()) Diag << FixItHint::CreateReplacement( diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h index 57289c39df22d..f1270076ced15 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class RedundantExpressionCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANT_EXPRESSION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_REDUNDANTEXPRESSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp index 5ac53005ad0fa..ff866e9ea7a81 100644 --- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp @@ -84,12 +84,12 @@ void StaticAssertCheck::check(const MatchFinder::MatchResult &Result) { const auto *AssertExprRoot = Result.Nodes.getNodeAs("assertExprRoot"); const auto *CastExpr = Result.Nodes.getNodeAs("castExpr"); - SourceLocation AssertExpansionLoc = CondStmt->getBeginLoc(); + const SourceLocation AssertExpansionLoc = CondStmt->getBeginLoc(); if (!AssertExpansionLoc.isValid() || !AssertExpansionLoc.isMacroID()) return; - StringRef MacroName = + const StringRef MacroName = Lexer::getImmediateMacroName(AssertExpansionLoc, SM, Opts); if (MacroName != "assert" || Condition->isValueDependent() || @@ -99,19 +99,20 @@ void StaticAssertCheck::check(const MatchFinder::MatchResult &Result) { // False literal is not the result of macro expansion. 
if (IsAlwaysFalse && (!CastExpr || CastExpr->getType()->isPointerType())) { - SourceLocation FalseLiteralLoc = + const SourceLocation FalseLiteralLoc = SM.getImmediateSpellingLoc(IsAlwaysFalse->getExprLoc()); if (!FalseLiteralLoc.isMacroID()) return; - StringRef FalseMacroName = + const StringRef FalseMacroName = Lexer::getImmediateMacroName(FalseLiteralLoc, SM, Opts); if (FalseMacroName.compare_insensitive("false") == 0 || FalseMacroName.compare_insensitive("null") == 0) return; } - SourceLocation AssertLoc = SM.getImmediateMacroCallerLoc(AssertExpansionLoc); + const SourceLocation AssertLoc = + SM.getImmediateMacroCallerLoc(AssertExpansionLoc); SmallVector FixItHints; SourceLocation LastParenLoc; diff --git a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h index 15c17e7fa8f65..56e4c12e97ed5 100644 --- a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H #include "../ClangTidyCheck.h" @@ -49,4 +49,4 @@ class ThrowByValueCatchByReferenceCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROW_BY_VALUE_CATCH_BY_REFERENCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_THROWBYVALUECATCHBYREFERENCECHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp index 8a85e79f5aa21..a14a559798789 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp @@ -18,7 +18,7 @@ namespace { AST_MATCHER_P(CXXMethodDecl, firstParameter, ast_matchers::internal::Matcher, InnerMatcher) { - unsigned N = Node.isExplicitObjectMemberFunction() ? 1 : 0; + const unsigned N = Node.isExplicitObjectMemberFunction() ? 
1 : 0; return (N < Node.parameters().size() && InnerMatcher.matches(*Node.parameters()[N], Finder, Builder)); } diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h index be9e7b971256c..941fe7208814c 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class UnconventionalAssignOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_ASSIGNOPERATORSIGNATURECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNCONVENTIONALASSIGNOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h index ffe82ca989d17..b9d85c139fc47 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseMap.h" @@ -32,4 +32,4 @@ class UnusedAliasDeclsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_ALIAS_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDALIASDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index f2189f546cf55..ae080960b95bc 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -165,7 +165,7 @@ void UnusedParametersCheck::warnOnUnusedParameter( if (!Result.Context->getLangOpts().CPlusPlus) return; - SourceRange RemovalRange(Param->getLocation()); + const SourceRange RemovalRange(Param->getLocation()); // Note: We always add a space before the '/*' to not accidentally create // a '*/*' for pointer types, which doesn't start a comment. clang-format // will clean this up afterwards. 
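The note closing the UnusedParametersCheck.cpp hunk above is easier to follow with a concrete before/after. The snippet below is only an illustration of the fix-it behavior that comment describes; the function and parameter names are hypothetical and not part of this patch:

// Hypothetical input: 'Tag' is never used in the body.
static int addOffset(int Value, char *Tag) { return Value + 1; }

// After the check's fix-it: the parameter name is commented out, and the
// space inserted before '/*' keeps the pointer declarator and the comment
// from fusing into the '*/*' sequence the comment above warns about.
static int addOffset(int Value, char * /*Tag*/) { return Value + 1; }

As the comment notes, clang-format is expected to tidy the remaining spacing afterwards.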
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h index 877fc4d6503d6..fe2cc6e46c34e 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H #include "../ClangTidyCheck.h" @@ -36,4 +36,4 @@ class UnusedParametersCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_PARAMETERS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDPARAMETERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h index 96d8d9da3ceb2..986bf37e259e8 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H #include "../ClangTidyCheck.h" #include "../utils/FileExtensionsUtils.h" @@ -56,4 +56,4 @@ class UnusedUsingDeclsCheck : public ClangTidyCheck { } // namespace clang::tidy::misc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSED_USING_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_UNUSEDUSINGDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp index aa0cc1ecd5761..ed36d486a1734 100644 --- a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.cpp @@ -52,7 +52,8 @@ void UseAnonymousNamespaceCheck::registerMatchers(MatchFinder *Finder) { void UseAnonymousNamespaceCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *MatchedDecl = Result.Nodes.getNodeAs("x")) { - StringRef Type = llvm::isa(MatchedDecl) ? "variable" : "function"; + const StringRef Type = + llvm::isa(MatchedDecl) ? 
"variable" : "function"; diag(MatchedDecl->getLocation(), "%0 %1 declared 'static', move to anonymous namespace instead") << Type << MatchedDecl; diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index 415852d6f14e9..bad51c600f1cb 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -43,12 +43,6 @@ struct OptionEnumMapping { namespace clang::tidy::misc { -namespace { - -AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } - -AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); } - static bool isInMainFile(SourceLocation L, SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { for (;;) { @@ -65,6 +59,12 @@ static bool isInMainFile(SourceLocation L, SourceManager &SM, } } +namespace { + +AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } + +AST_MATCHER(FunctionDecl, hasBody) { return Node.hasBody(); } + AST_MATCHER_P(Decl, isAllRedeclsInMainFile, FileExtensionsSet, HeaderFileExtensions) { return llvm::all_of(Node.redecls(), [&](const Decl *D) { @@ -142,7 +142,8 @@ static constexpr StringRef Message = void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *FD = Result.Nodes.getNodeAs("fn")) { - DiagnosticBuilder DB = diag(FD->getLocation(), Message) << "function" << FD; + const DiagnosticBuilder DB = diag(FD->getLocation(), Message) + << "function" << FD; const SourceLocation FixLoc = FD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; @@ -157,7 +158,8 @@ void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) { if (getLangOpts().CPlusPlus && VD->getType().isConstQualified()) return; - DiagnosticBuilder DB = diag(VD->getLocation(), Message) << "variable" << VD; + const DiagnosticBuilder DB = diag(VD->getLocation(), Message) + << "variable" << VD; const SourceLocation FixLoc = VD->getInnerLocStart(); if (FixLoc.isInvalid() || FixLoc.isMacroID()) return; diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp index 1c0043b423361..531311e732290 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp @@ -252,7 +252,7 @@ static SmallVector buildBindArguments(const MatchFinder::MatchResult &Result, const CallableInfo &Callable) { SmallVector BindArguments; - static llvm::Regex MatchPlaceholder("^_([0-9]+)$"); + static const llvm::Regex MatchPlaceholder("^_([0-9]+)$"); const auto *BindCall = Result.Nodes.getNodeAs("bind"); @@ -267,7 +267,7 @@ buildBindArguments(const MatchFinder::MatchResult &Result, if (Callable.Type == CT_MemberFunction) --ArgIndex; - bool IsObjectPtr = (I == 1 && Callable.Type == CT_MemberFunction); + const bool IsObjectPtr = (I == 1 && Callable.Type == CT_MemberFunction); B.E = E; B.SourceTokens = getSourceTextForExpr(Result, E); @@ -340,13 +340,13 @@ static void addPlaceholderArgs(const LambdaProperties &LP, MaxPlaceholderIt->PlaceHolderIndex == 0)) return; - size_t PlaceholderCount = MaxPlaceholderIt->PlaceHolderIndex; + const size_t PlaceholderCount = MaxPlaceholderIt->PlaceHolderIndex; Stream << "("; StringRef Delimiter = ""; for (size_t I = 1; I <= PlaceholderCount; ++I) { Stream << Delimiter << "auto &&"; - int ArgIndex = findPositionOfPlaceholderUse(Args, I); + const int ArgIndex = findPositionOfPlaceholderUse(Args, I); 
if (ArgIndex != -1 && Args[ArgIndex].IsUsed) Stream << " " << Args[ArgIndex].UsageIdentifier; @@ -392,7 +392,7 @@ findCandidateCallOperators(const CXXRecordDecl *RecordDecl, size_t NumArgs) { std::vector Candidates; for (const clang::CXXMethodDecl *Method : RecordDecl->methods()) { - OverloadedOperatorKind OOK = Method->getOverloadedOperator(); + const OverloadedOperatorKind OOK = Method->getOverloadedOperator(); if (OOK != OverloadedOperatorKind::OO_Call) continue; @@ -410,7 +410,7 @@ findCandidateCallOperators(const CXXRecordDecl *RecordDecl, size_t NumArgs) { continue; const FunctionDecl *FD = FTD->getTemplatedDecl(); - OverloadedOperatorKind OOK = FD->getOverloadedOperator(); + const OverloadedOperatorKind OOK = FD->getOverloadedOperator(); if (OOK != OverloadedOperatorKind::OO_Call) continue; @@ -471,7 +471,7 @@ getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type, if (Type == CT_Object) { const auto *BindCall = Result.Nodes.getNodeAs("bind"); - size_t NumArgs = BindCall->getNumArgs() - 1; + const size_t NumArgs = BindCall->getNumArgs() - 1; return getCallOperator(Callee->getType()->getAsCXXRecordDecl(), NumArgs); } @@ -488,7 +488,7 @@ getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type, static CallableType getCallableType(const MatchFinder::MatchResult &Result) { const auto *CallableExpr = Result.Nodes.getNodeAs("ref"); - QualType QT = CallableExpr->getType(); + const QualType QT = CallableExpr->getType(); if (QT->isMemberFunctionPointerType()) return CT_MemberFunction; @@ -614,7 +614,7 @@ static void emitCaptureList(const LambdaProperties &LP, if (B.CM == CM_None || !B.IsUsed) continue; - StringRef Delimiter = AnyCapturesEmitted ? ", " : ""; + const StringRef Delimiter = AnyCapturesEmitted ? ", " : ""; if (emitCapture(CaptureSet, Delimiter, B.CM, B.CE, B.CaptureIdentifier, B.SourceTokens, Stream)) @@ -669,7 +669,7 @@ void AvoidBindCheck::check(const MatchFinder::MatchResult &Result) { emitCaptureList(LP, Result, Stream); Stream << "]"; - ArrayRef FunctionCallArgs = ArrayRef(LP.BindArguments); + const ArrayRef FunctionCallArgs = ArrayRef(LP.BindArguments); addPlaceholderArgs(LP, Stream, PermissiveParameterList); diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h index 94838cb1b5a78..22e629f3826cd 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class AvoidBindCheck : public ClangTidyCheck { }; } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOID_BIND_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_AVOIDBINDCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp index 92900192957e5..71d89d3ab6098 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.cpp @@ -15,6 +15,14 @@ using namespace clang::ast_matchers; namespace clang::tidy::modernize { 
+template <typename TargetType, typename NodeType> +static const TargetType *getAs(const NodeType *Node) { + if constexpr (std::is_same_v<NodeType, DynTypedNode>) + return Node->template get<TargetType>(); + else + return llvm::dyn_cast<TargetType>(Node); +} + namespace { AST_MATCHER(clang::TypeLoc, hasValidBeginLoc) { @@ -39,14 +47,6 @@ AST_MATCHER(clang::ParmVarDecl, isArgvOfMain) { return FD ? FD->isMain() : false; } -template <typename TargetType, typename NodeType> -const TargetType *getAs(const NodeType *Node) { - if constexpr (std::is_same_v<NodeType, DynTypedNode>) - return Node->template get<TargetType>(); - else - return llvm::dyn_cast<TargetType>(Node); -} - AST_MATCHER(clang::TypeLoc, isWithinImplicitTemplateInstantiation) { const auto IsImplicitTemplateInstantiation = [](const auto *Node) { const auto IsImplicitInstantiation = [](const auto *Node) { diff --git a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp index 6e28cb223370a..7c82e9ef029ba 100644 --- a/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ConcatNestedNamespacesCheck.cpp @@ -25,7 +25,8 @@ static bool locationsInSameFile(const SourceManager &Sources, static StringRef getRawStringRef(const SourceRange &Range, const SourceManager &Sources, const LangOptions &LangOpts) { - CharSourceRange TextRange = Lexer::getAsCharRange(Range, Sources, LangOpts); + const CharSourceRange TextRange = + Lexer::getAsCharRange(Range, Sources, LangOpts); return Lexer::getSourceText(TextRange, Sources, LangOpts); } @@ -56,15 +57,16 @@ SourceRange NS::getDefaultNamespaceBackRange() const { SourceRange NS::getNamespaceBackRange(const SourceManager &SM, const LangOptions &LangOpts) const { // Back from '}' to conditional '// namespace xxx' - SourceLocation Loc = front()->getRBraceLoc(); + const SourceLocation Loc = front()->getRBraceLoc(); std::optional<Token> Tok = utils::lexer::findNextTokenIncludingComments(Loc, SM, LangOpts); if (!Tok) return getDefaultNamespaceBackRange(); if (Tok->getKind() != tok::TokenKind::comment) return getDefaultNamespaceBackRange(); - SourceRange TokRange = SourceRange{Tok->getLocation(), Tok->getEndLoc()}; - StringRef TokText = getRawStringRef(TokRange, SM, LangOpts); + const SourceRange TokRange = + SourceRange{Tok->getLocation(), Tok->getEndLoc()}; + const StringRef TokText = getRawStringRef(TokRange, SM, LangOpts); NamespaceName CloseComment{"namespace "}; appendCloseComment(CloseComment); // the current fix hint in readability/NamespaceCommentCheck.cpp uses a single line @@ -98,7 +100,7 @@ bool ConcatNestedNamespacesCheck::unsupportedNamespace(const NamespaceDecl &ND, return true; if (getLangOpts().CPlusPlus20) { // C++20 supports inline nested namespaces - bool IsFirstNS = IsChild || !Namespaces.empty(); + const bool IsFirstNS = IsChild || !Namespaces.empty(); return ND.isInlineNamespace() && !IsFirstNS; } return ND.isInlineNamespace(); @@ -106,7 +108,7 @@ bool ConcatNestedNamespacesCheck::singleNamedNamespaceChild( const NamespaceDecl &ND) const { - NamespaceDecl::decl_range Decls = ND.decls(); + const NamespaceDecl::decl_range Decls = ND.decls(); if (std::distance(Decls.begin(), Decls.end()) != 1) return false; @@ -121,7 +123,7 @@ void ConcatNestedNamespacesCheck::registerMatchers( void ConcatNestedNamespacesCheck::reportDiagnostic( const SourceManager &SM, const LangOptions &LangOpts) { - DiagnosticBuilder DB = + const DiagnosticBuilder DB = diag(Namespaces.front().front()->getBeginLoc(), "nested namespaces can be concatenated", DiagnosticIDs::Warning);
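The diagnostic built here carries fix-its that collapse the namespace chain into C++17 nested-namespace syntax. Schematically (a hand-written illustration, not generated by the check; `storage`, `detail`, `compact` are invented names):

    // before: flagged as "nested namespaces can be concatenated"
    namespace storage { namespace detail { void compact(); } }
    // after the suggested fix
    namespace storage::detail { void compact(); }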
@@ -143,7 +145,7 @@ void ConcatNestedNamespacesCheck::reportDiagnostic( // the last one should be handled specially Fronts.pop_back(); - SourceRange LastRBrace = Backs.pop_back_val(); + const SourceRange LastRBrace = Backs.pop_back_val(); NamespaceName ConcatNameSpace{"namespace "}; for (const NS &NS : Namespaces) { diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp index 1de9e136c5719..21eefab843af9 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp @@ -184,7 +184,7 @@ void IncludeModernizePPCallbacks::InclusionDirective( // 1. Insert std prefix for every such symbol occurrence. // 2. Insert `using namespace std;` to the beginning of TU. // 3. Do nothing and let the user deal with the migration himself. - SourceLocation DiagLoc = FilenameRange.getBegin(); + const SourceLocation DiagLoc = FilenameRange.getBegin(); if (auto It = CStyledHeaderToCxx.find(FileName); It != CStyledHeaderToCxx.end()) { IncludesToBeProcessed.emplace_back(IncludeMarker{ diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h index badb2b41f164f..015404ee9503c 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H #include "../ClangTidyCheck.h" @@ -57,4 +57,4 @@ class DeprecatedHeadersCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_C_HEADERS_TO_CXX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_DEPRECATEDHEADERSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp index 5e254376c9796..7e43165fb09f1 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.cpp @@ -36,10 +36,10 @@ void DeprecatedIosBaseAliasesCheck::registerMatchers(MatchFinder *Finder) { void DeprecatedIosBaseAliasesCheck::check( const MatchFinder::MatchResult &Result) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; const auto *Typedef = Result.Nodes.getNodeAs("TypeDecl"); - StringRef TypeName = Typedef->getName(); + const StringRef TypeName = Typedef->getName(); auto Replacement = getReplacementType(TypeName); TypeLoc TL = *Result.Nodes.getNodeAs("TypeLoc"); @@ -55,7 +55,8 @@ void DeprecatedIosBaseAliasesCheck::check( Fix = false; } - SourceLocation EndLoc = IoStateLoc.getLocWithOffset(TypeName.size() - 1); + const SourceLocation EndLoc = + IoStateLoc.getLocWithOffset(TypeName.size() - 1); if (Replacement) { const char *FixName = *Replacement; diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp index 05cf51a430f3f..862ca184ecd97 
100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.cpp @@ -95,7 +95,7 @@ bool IntegralLiteralExpressionMatcher::unaryOperator() { } static LiteralSize literalTokenSize(const Token &Tok) { - unsigned int Length = Tok.getLength(); + const unsigned int Length = Tok.getLength(); if (Length <= 1) return LiteralSize::Int; diff --git a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h index d495087f49491..ce0d7c04107aa 100644 --- a/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h +++ b/clang-tools-extra/clang-tidy/modernize/IntegralLiteralExpressionMatcher.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRAL_LITERAL_EXPRESSION_MATCHER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRAL_LITERAL_EXPRESSION_MATCHER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H #include #include @@ -73,4 +73,4 @@ class IntegralLiteralExpressionMatcher { } // namespace clang::tidy::modernize -#endif +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_INTEGRALLITERALEXPRESSIONMATCHER_H diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index fea5ac6f29eff..65c17223bae92 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -92,7 +92,7 @@ static StatementMatcher incrementVarMatcher() { } static StatementMatcher -arrayConditionMatcher(internal::Matcher LimitExpr) { +arrayConditionMatcher(const internal::Matcher &LimitExpr) { return binaryOperator( anyOf(allOf(hasOperatorName("<"), hasLHS(integerComparisonMatcher()), hasRHS(LimitExpr)), @@ -115,7 +115,7 @@ arrayConditionMatcher(internal::Matcher LimitExpr) { /// - The index variable is only used as an array index. /// - All arrays indexed by the loop are the same. static StatementMatcher makeArrayLoopMatcher() { - StatementMatcher ArrayBoundMatcher = + const StatementMatcher ArrayBoundMatcher = expr(hasType(isInteger())).bind(ConditionBoundName); return forStmt(unless(isInTemplateInstantiation()), @@ -168,7 +168,7 @@ static StatementMatcher makeIteratorLoopMatcher(bool IsReverse) { auto EndNameMatcherStd = IsReverse ? 
hasAnyName("::std::rend", "::std::crend") : hasAnyName("::std::end", "::std::cend"); - StatementMatcher BeginCallMatcher = + const StatementMatcher BeginCallMatcher = expr(anyOf(cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(BeginNameMatcher))), callExpr(argumentCountIs(1), @@ -177,37 +177,37 @@ static StatementMatcher makeIteratorLoopMatcher(bool IsReverse) { callee(functionDecl(BeginNameMatcherStd))))) .bind(BeginCallName); - DeclarationMatcher InitDeclMatcher = + const DeclarationMatcher InitDeclMatcher = varDecl(hasInitializer(anyOf(ignoringParenImpCasts(BeginCallMatcher), materializeTemporaryExpr( ignoringParenImpCasts(BeginCallMatcher)), hasDescendant(BeginCallMatcher)))) .bind(InitVarName); - DeclarationMatcher EndDeclMatcher = + const DeclarationMatcher EndDeclMatcher = varDecl(hasInitializer(anything())).bind(EndVarName); - StatementMatcher EndCallMatcher = expr(anyOf( + const StatementMatcher EndCallMatcher = expr(anyOf( cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(EndNameMatcher))), callExpr(argumentCountIs(1), callee(functionDecl(EndNameMatcher)), usesADL()), callExpr(argumentCountIs(1), callee(functionDecl(EndNameMatcherStd))))); - StatementMatcher IteratorBoundMatcher = + const StatementMatcher IteratorBoundMatcher = expr(anyOf(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(EndVarName))))), ignoringParenImpCasts(expr(EndCallMatcher).bind(EndCallName)), materializeTemporaryExpr(ignoringParenImpCasts( expr(EndCallMatcher).bind(EndCallName))))); - StatementMatcher IteratorComparisonMatcher = expr(ignoringParenImpCasts( + const StatementMatcher IteratorComparisonMatcher = expr(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(InitVarName)))))); // This matcher tests that a declaration is a CXXRecordDecl that has an // overloaded operator*(). If the operator*() returns by value instead of by // reference then the return type is tagged with DerefByValueResultName. - internal::Matcher TestDerefReturnsByValue = + const internal::Matcher TestDerefReturnsByValue = hasType(hasUnqualifiedDesugaredType( recordType(hasDeclaration(cxxRecordDecl(hasMethod(cxxMethodDecl( hasOverloadedOperatorName("*"), @@ -280,7 +280,7 @@ static StatementMatcher makePseudoArrayLoopMatcher() { // FIXME: Also, a record doesn't necessarily need begin() and end(). Free // functions called begin() and end() taking the container as an argument // are also allowed. 
- TypeMatcher RecordWithBeginEnd = qualType(anyOf( + const TypeMatcher RecordWithBeginEnd = qualType(anyOf( qualType(isConstQualified(), hasUnqualifiedDesugaredType(recordType(hasDeclaration( cxxRecordDecl(isSameOrDerivedFrom(cxxRecordDecl( @@ -295,7 +295,7 @@ static StatementMatcher makePseudoArrayLoopMatcher() { hasMethod(hasName("end"))))))))) // qualType )); - StatementMatcher SizeCallMatcher = expr(anyOf( + const StatementMatcher SizeCallMatcher = expr(anyOf( cxxMemberCallExpr(argumentCountIs(0), callee(cxxMethodDecl(hasAnyName("size", "length"))), on(anyOf(hasType(pointsTo(RecordWithBeginEnd)), @@ -310,10 +310,10 @@ static StatementMatcher makePseudoArrayLoopMatcher() { explicitCastExpr(hasSourceExpression(ignoringParenImpCasts( expr(SizeCallMatcher).bind(EndCallName)))))); - DeclarationMatcher EndDeclMatcher = + const DeclarationMatcher EndDeclMatcher = varDecl(hasInitializer(EndInitMatcher)).bind(EndVarName); - StatementMatcher IndexBoundMatcher = + const StatementMatcher IndexBoundMatcher = expr(anyOf(ignoringParenImpCasts( declRefExpr(to(varDecl(equalsBoundNode(EndVarName))))), EndInitMatcher)); @@ -620,7 +620,7 @@ void LoopConvertCheck::getAliasRange(SourceManager &SM, SourceRange &Range) { SM.getCharacterData(Range.getEnd().getLocWithOffset(1), &Invalid); if (Invalid) return; - unsigned Offset = std::strspn(TextAfter, " \t\r\n"); + const unsigned Offset = std::strspn(TextAfter, " \t\r\n"); Range = SourceRange(Range.getBegin(), Range.getEnd().getLocWithOffset(Offset)); } @@ -633,7 +633,7 @@ void LoopConvertCheck::doConversion( const DeclStmt *AliasDecl, bool AliasUseRequired, bool AliasFromForInit, const ForStmt *Loop, RangeDescriptor Descriptor) { std::string VarNameOrStructuredBinding; - bool VarNameFromAlias = (Usages.size() == 1) && AliasDecl; + const bool VarNameFromAlias = (Usages.size() == 1) && AliasDecl; bool AliasVarIsRef = false; bool CanCopy = true; std::vector FixIts; @@ -743,7 +743,7 @@ void LoopConvertCheck::doConversion( } // Now, we need to construct the new range expression. - SourceRange ParenRange(Loop->getLParenLoc(), Loop->getRParenLoc()); + const SourceRange ParenRange(Loop->getLParenLoc(), Loop->getRParenLoc()); QualType Type = Context->getAutoDeductType(); if (!Descriptor.ElemType.isNull() && Descriptor.ElemType->isFundamentalType()) @@ -753,14 +753,15 @@ void LoopConvertCheck::doConversion( // If the new variable name is from the aliased variable, then the reference // type for the new variable should only be used if the aliased variable was // declared as a reference. - bool IsCheapToCopy = + const bool IsCheapToCopy = !Descriptor.ElemType.isNull() && Descriptor.ElemType.isTriviallyCopyableType(*Context) && !Descriptor.ElemType->isDependentSizedArrayType() && // TypeInfo::Width is in bits. Context->getTypeInfo(Descriptor.ElemType).Width <= 8 * MaxCopySize; - bool UseCopy = CanCopy && ((VarNameFromAlias && !AliasVarIsRef) || - (Descriptor.DerefByConstRef && IsCheapToCopy)); + const bool UseCopy = + CanCopy && ((VarNameFromAlias && !AliasVarIsRef) || + (Descriptor.DerefByConstRef && IsCheapToCopy)); if (!UseCopy) { if (Descriptor.DerefByConstRef) { @@ -866,7 +867,7 @@ void LoopConvertCheck::getIteratorLoopQualifiers(ASTContext *Context, // The matchers for iterator loops provide bound nodes to obtain this // information. 
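To make the IsCheapToCopy/UseCopy decision above concrete: elements that are trivially copyable and no wider than MaxCopySize bytes (TypeInfo::Width is in bits, hence the factor of 8) may be bound by value; otherwise the fix binds by reference to const. A sketch of the two resulting forms (invented names):

    #include <string>
    #include <vector>
    void use(const std::vector<int> &Ints,
             const std::vector<std::string> &Names) {
      for (int I : Ints)             // trivially copyable and small: by value
        (void)I;
      for (const auto &Name : Names) // otherwise: by reference to const
        (void)Name;
    }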
const auto *InitVar = Nodes.getNodeAs(InitVarName); - QualType CanonicalInitVarType = InitVar->getType().getCanonicalType(); + const QualType CanonicalInitVarType = InitVar->getType().getCanonicalType(); const auto *DerefByValueType = Nodes.getNodeAs(DerefByValueResultName); Descriptor.DerefByValue = DerefByValueType; @@ -934,12 +935,12 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, // FIXME: Try to put most of this logic inside a matcher. if (FixerKind == LFK_Iterator || FixerKind == LFK_ReverseIterator) { - QualType InitVarType = InitVar->getType(); - QualType CanonicalInitVarType = InitVarType.getCanonicalType(); + const QualType InitVarType = InitVar->getType(); + const QualType CanonicalInitVarType = InitVarType.getCanonicalType(); const auto *BeginCall = Nodes.getNodeAs(BeginCallName); assert(BeginCall && "Bad Callback. No begin call expression"); - QualType CanonicalBeginType = + const QualType CanonicalBeginType = BeginCall->getDirectCallee()->getReturnType().getCanonicalType(); if (CanonicalBeginType->isPointerType() && CanonicalInitVarType->isPointerType()) { @@ -1054,7 +1055,7 @@ void LoopConvertCheck::check(const MatchFinder::MatchResult &Result) { } // Find out which qualifiers we have to use in the loop range. - TraversalKindScope RAII(*Context, TK_AsIs); + const TraversalKindScope RAII(*Context, TK_AsIs); const UsageResult &Usages = Finder.getUsages(); determineRangeDescriptor(Context, Nodes, Loop, FixerKind, ContainerExpr, Usages, Descriptor); diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h index 55487828ca69e..958b4eb4ea2a5 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -85,4 +85,4 @@ class LoopConvertCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 6fb780844f2b6..170a4f6d8731f 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -101,7 +101,8 @@ bool DependencyFinderASTVisitor::VisitVarDecl(VarDecl *V) { /// If we already created a variable for TheLoop, check to make sure /// that the name was not already taken. bool DeclFinderASTVisitor::VisitForStmt(ForStmt *TheLoop) { - StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(TheLoop); + const StmtGeneratedVarNameMap::const_iterator I = + GeneratedDecls->find(TheLoop); if (I != GeneratedDecls->end() && I->second == Name) { Found = true; return false; @@ -131,7 +132,7 @@ bool DeclFinderASTVisitor::VisitDeclRefExpr(DeclRefExpr *DeclRef) { /// If the new variable name conflicts with any type used in the loop, /// then we mark that variable name as taken. 
bool DeclFinderASTVisitor::VisitTypeLoc(TypeLoc TL) { - QualType QType = TL.getType(); + const QualType QType = TL.getType(); // Check if our name conflicts with a type, to handle for typedefs. if (QType.getAsString() == Name) { @@ -364,7 +365,7 @@ static bool isAliasDecl(ASTContext *Context, const Decl *TheDecl, // Check that the declared type is the same as (or a reference to) the // container type. if (!OnlyCasts) { - QualType InitType = Init->getType(); + const QualType InitType = Init->getType(); QualType DeclarationType = VDecl->getType(); if (!DeclarationType.isNull() && DeclarationType->isReferenceType()) DeclarationType = DeclarationType.getNonReferenceType(); @@ -440,7 +441,7 @@ static bool arrayMatchesBoundExpr(ASTContext *Context, ConditionExpr->getIntegerConstantExpr(*Context); if (!ConditionSize) return false; - llvm::APSInt ArraySize(ConstType->getSize()); + const llvm::APSInt ArraySize(ConstType->getSize()); return llvm::APSInt::isSameValue(*ConditionSize, ArraySize); } @@ -571,7 +572,7 @@ bool ForLoopIndexUseVisitor::TraverseMemberExpr(MemberExpr *Member) { // FIXME: This works around not having the location of the arrow operator. // Consider adding OperatorLoc to MemberExpr? - SourceLocation ArrowLoc = Lexer::getLocForEndOfToken( + const SourceLocation ArrowLoc = Lexer::getLocForEndOfToken( Base->getExprLoc(), 0, Context->getSourceManager(), Context->getLangOpts()); // If something complicated is happening (i.e. the next token isn't an @@ -821,7 +822,7 @@ bool ForLoopIndexUseVisitor::traverseStmtImpl(Stmt *S) { const Stmt *OldNextParent = NextStmtParent; CurrStmtParent = NextStmtParent; NextStmtParent = S; - bool Result = VisitorBase::TraverseStmt(S); + const bool Result = VisitorBase::TraverseStmt(S); NextStmtParent = OldNextParent; return Result; } @@ -850,7 +851,7 @@ std::string VariableNamer::createIndexName() { if (TheContainer) ContainerName = TheContainer->getName(); - size_t Len = ContainerName.size(); + const size_t Len = ContainerName.size(); if (Len > 1 && ContainerName.ends_with(Style == NS_UpperCase ? "S" : "s")) { IteratorName = std::string(ContainerName.substr(0, Len - 1)); // E.g.: (auto thing : things) @@ -876,7 +877,7 @@ std::string VariableNamer::createIndexName() { /// converter in a loop nested within SourceStmt. bool VariableNamer::declarationExists(StringRef Symbol) { assert(Context != nullptr && "Expected an ASTContext"); - IdentifierInfo &Ident = Context->Idents.get(Symbol); + const IdentifierInfo &Ident = Context->Idents.get(Symbol); // Check if the symbol is not an identifier (ie. is a keyword or alias). if (!isAnyIdentifier(Ident.getTokenID())) @@ -888,7 +889,7 @@ bool VariableNamer::declarationExists(StringRef Symbol) { // Determine if the symbol was generated in a parent context. 
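createIndexName's heuristic above derives the element name from the container name by dropping a trailing 's' (or 'S' in the upper-case style). For instance (invented names):

    #include <vector>
    void drain(const std::vector<int> &Items) {
      // modernize-loop-convert would name the element "Item" here
      for (int Item : Items)
        (void)Item;
    }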
for (const Stmt *S = SourceStmt; S != nullptr; S = ReverseAST->lookup(S)) { - StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(S); + const StmtGeneratedVarNameMap::const_iterator I = GeneratedDecls->find(S); if (I != GeneratedDecls->end() && I->second == Symbol) return true; } diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h index 0a0db5e6c633f..5d0800d8e7880 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" @@ -466,4 +466,4 @@ class VariableNamer { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOP_CONVERT_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_LOOPCONVERTUTILS_H diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp index 2669aa2361ea1..098d46cae5df4 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp @@ -23,7 +23,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options, StringRef Text) { // Use a lexer to look for tokens; if we find something other than a single // hash, then there were intervening tokens between macro definitions. 
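In other words, a run of object-like macros still counts as consecutive when only comments sit between them. The grouping this enables looks roughly like this (hand-written sketch, names invented; the real fix-its are the CreateInsertion/CreateRemoval calls further below):

    // before: grouped by modernize-macro-to-enum despite the comment
    #define RED   0xFF0000
    /* middle of the palette */
    #define GREEN 0x00FF00
    #define BLUE  0x0000FF
    // after the suggested fix
    enum {
      RED = 0xFF0000,
      GREEN = 0x00FF00,
      BLUE = 0x0000FF
    };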
- std::string Buffer{Text}; + const std::string Buffer{Text}; Lexer Lex(Loc, Options, Buffer.c_str(), Buffer.c_str(), Buffer.c_str() + Buffer.size()); Token Tok; @@ -47,7 +47,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options, }; WhiteSpace State = WhiteSpace::Nothing; - for (char C : Text) { + for (const char C : Text) { switch (C) { case '\r': if (State == WhiteSpace::CR) @@ -227,17 +227,17 @@ bool MacroToEnumCallbacks::isConsecutiveMacro(const MacroDirective *MD) const { if (CurrentFile->LastMacroLocation.isInvalid()) return false; - SourceLocation Loc = MD->getLocation(); + const SourceLocation Loc = MD->getLocation(); if (CurrentFile->LastLine + 1 == SM.getSpellingLineNumber(Loc)) return true; - SourceLocation Define = + const SourceLocation Define = SM.translateLineCol(SM.getFileID(Loc), SM.getSpellingLineNumber(Loc), 1); - CharSourceRange BetweenMacros{ + const CharSourceRange BetweenMacros{ SourceRange{CurrentFile->LastMacroLocation, Define}, true}; - CharSourceRange CharRange = + const CharSourceRange CharRange = Lexer::makeFileCharRange(BetweenMacros, SM, LangOpts); - StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts); + const StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts); return hasOnlyComments(Define, LangOpts, BetweenText); } @@ -258,7 +258,7 @@ void MacroToEnumCallbacks::conditionStart(const SourceLocation &Loc) { } void MacroToEnumCallbacks::checkCondition(SourceRange Range) { - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Range), SM, LangOpts); std::string Text = Lexer::getSourceText(CharRange, SM, LangOpts).str(); Lexer Lex(CharRange.getBegin(), LangOpts, Text.data(), Text.data(), @@ -285,7 +285,7 @@ void MacroToEnumCallbacks::checkName(const Token &MacroNameTok) { } void MacroToEnumCallbacks::rememberExpressionName(const Token &Tok) { - std::string Id = getTokenName(Tok).str(); + const std::string Id = getTokenName(Tok).str(); auto Pos = llvm::lower_bound(ExpressionNames, Id); if (Pos == ExpressionNames.end() || *Pos != Id) { ExpressionNames.insert(Pos, Id); @@ -294,7 +294,7 @@ void MacroToEnumCallbacks::rememberExpressionName(const Token &Tok) { void MacroToEnumCallbacks::rememberExpressionTokens( ArrayRef MacroTokens) { - for (Token Tok : MacroTokens) { + for (const Token Tok : MacroTokens) { if (Tok.isAnyIdentifier()) rememberExpressionName(Tok); } @@ -318,8 +318,8 @@ void MacroToEnumCallbacks::FileChanged(SourceLocation Loc, bool MacroToEnumCallbacks::isInitializer(ArrayRef MacroTokens) { IntegralLiteralExpressionMatcher Matcher(MacroTokens, LangOpts.C99 == 0); - bool Matched = Matcher.match(); - bool IsC = !LangOpts.CPlusPlus; + const bool Matched = Matcher.match(); + const bool IsC = !LangOpts.CPlusPlus; if (IsC && (Matcher.largestLiteralSize() != LiteralSize::Int && Matcher.largestLiteralSize() != LiteralSize::UnsignedInt)) return false; @@ -344,7 +344,7 @@ void MacroToEnumCallbacks::MacroDefined(const Token &MacroNameTok, return; const MacroInfo *Info = MD->getMacroInfo(); - ArrayRef MacroTokens = Info->tokens(); + const ArrayRef MacroTokens = Info->tokens(); if (Info->isBuiltinMacro() || MacroTokens.empty()) return; if (Info->isFunctionLike()) { @@ -474,26 +474,26 @@ void MacroToEnumCallbacks::fixEnumMacro(const MacroList &MacroList) const { MacroList.front().Directive->getMacroInfo()->getDefinitionLoc(); Begin = SM.translateLineCol(SM.getFileID(Begin), SM.getSpellingLineNumber(Begin), 1); - 
DiagnosticBuilder Diagnostic = + const DiagnosticBuilder Diagnostic = Check->diag(Begin, "replace macro with enum") << FixItHint::CreateInsertion(Begin, "enum {\n"); for (size_t I = 0U; I < MacroList.size(); ++I) { const EnumMacro &Macro = MacroList[I]; - SourceLocation DefineEnd = + const SourceLocation DefineEnd = Macro.Directive->getMacroInfo()->getDefinitionLoc(); - SourceLocation DefineBegin = SM.translateLineCol( + const SourceLocation DefineBegin = SM.translateLineCol( SM.getFileID(DefineEnd), SM.getSpellingLineNumber(DefineEnd), 1); CharSourceRange DefineRange; DefineRange.setBegin(DefineBegin); DefineRange.setEnd(DefineEnd); Diagnostic << FixItHint::CreateRemoval(DefineRange); - SourceLocation NameEnd = Lexer::getLocForEndOfToken( + const SourceLocation NameEnd = Lexer::getLocForEndOfToken( Macro.Directive->getMacroInfo()->getDefinitionLoc(), 0, SM, LangOpts); Diagnostic << FixItHint::CreateInsertion(NameEnd, " ="); - SourceLocation ValueEnd = Lexer::getLocForEndOfToken( + const SourceLocation ValueEnd = Lexer::getLocForEndOfToken( Macro.Directive->getMacroInfo()->getDefinitionEndLoc(), 0, SM, LangOpts); if (I < MacroList.size() - 1) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h index 063b35fc46d4f..4b7f6250a3194 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H #include "MakeSmartPtrCheck.h" @@ -35,4 +35,4 @@ class MakeSharedCheck : public MakeSmartPtrCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SHARED_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESHAREDCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index 9d01e27fbab9c..7940939eb21a5 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -24,7 +24,7 @@ static constexpr char NewExpression[] = "newExpression"; static std::string getNewExprName(const CXXNewExpr *NewExpr, const SourceManager &SM, const LangOptions &Lang) { - StringRef WrittenName = Lexer::getSourceText( + const StringRef WrittenName = Lexer::getSourceText( CharSourceRange::getTokenRange( NewExpr->getAllocatedTypeSourceInfo()->getTypeLoc().getSourceRange()), SM, Lang); @@ -134,9 +134,9 @@ void MakeSmartPtrCheck::check(const MatchFinder::MatchResult &Result) { // // The fix of the check has a side effect: it introduces value initialization, // which may be unexpected and cause a performance regression.
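The guard above matters because `new T` default-initializes while std::make_unique<T>() value-initializes, so for trivially default-constructible payloads the fix can add zeroing work. A minimal sketch (`Buffer` is an invented type):

    #include <memory>
    struct Buffer { char Data[4096]; }; // trivially default-constructible
    auto A = std::unique_ptr<Buffer>(new Buffer); // Data left indeterminate
    auto B = std::make_unique<Buffer>();          // Data value-initialized (zeroed)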
- bool Initializes = New->hasInitializer() || - !utils::type_traits::isTriviallyDefaultConstructible( - New->getAllocatedType(), *Result.Context); + const bool Initializes = New->hasInitializer() || + !utils::type_traits::isTriviallyDefaultConstructible( + New->getAllocatedType(), *Result.Context); if (!Initializes && IgnoreDefaultInitialization) return; if (Construct) @@ -150,15 +150,15 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, const VarDecl *DVar, const QualType *Type, const CXXNewExpr *New) { - SourceLocation ConstructCallStart = Construct->getExprLoc(); - bool InMacro = ConstructCallStart.isMacroID(); + const SourceLocation ConstructCallStart = Construct->getExprLoc(); + const bool InMacro = ConstructCallStart.isMacroID(); if (InMacro && IgnoreMacros) { return; } bool Invalid = false; - StringRef ExprStr = Lexer::getSourceText( + const StringRef ExprStr = Lexer::getSourceText( CharSourceRange::getCharRange( ConstructCallStart, Construct->getParenOrBraceRange().getBegin()), SM, getLangOpts(), &Invalid); @@ -178,7 +178,7 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, } // Find the location of the template's left angle. - size_t LAngle = ExprStr.find('<'); + const size_t LAngle = ExprStr.find('<'); SourceLocation ConstructCallEnd; if (LAngle == StringRef::npos) { // If the template argument is missing (because it is part of the alias) @@ -202,7 +202,7 @@ void MakeSmartPtrCheck::checkConstruct(SourceManager &SM, ASTContext *Ctx, // If the smart_ptr is built with brace enclosed direct initialization, use // parenthesis instead. if (Construct->isListInitialization()) { - SourceRange BraceRange = Construct->getParenOrBraceRange(); + const SourceRange BraceRange = Construct->getParenOrBraceRange(); Diag << FixItHint::CreateReplacement( CharSourceRange::getCharRange( BraceRange.getBegin(), BraceRange.getBegin().getLocWithOffset(1)), @@ -220,13 +220,13 @@ void MakeSmartPtrCheck::checkReset(SourceManager &SM, ASTContext *Ctx, const CXXMemberCallExpr *Reset, const CXXNewExpr *New) { const auto *Expr = cast(Reset->getCallee()); - SourceLocation OperatorLoc = Expr->getOperatorLoc(); - SourceLocation ResetCallStart = Reset->getExprLoc(); - SourceLocation ExprStart = Expr->getBeginLoc(); - SourceLocation ExprEnd = + const SourceLocation OperatorLoc = Expr->getOperatorLoc(); + const SourceLocation ResetCallStart = Reset->getExprLoc(); + const SourceLocation ExprStart = Expr->getBeginLoc(); + const SourceLocation ExprEnd = Lexer::getLocForEndOfToken(Expr->getEndLoc(), 0, SM, getLangOpts()); - bool InMacro = ExprStart.isMacroID(); + const bool InMacro = ExprStart.isMacroID(); if (InMacro && IgnoreMacros) { return; @@ -267,7 +267,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, const CXXNewExpr *New, SourceManager &SM, ASTContext *Ctx) { auto SkipParensParents = [&](const Expr *E) { - TraversalKindScope RAII(*Ctx, TK_AsIs); + const TraversalKindScope RAII(*Ctx, TK_AsIs); for (const Expr *OldE = nullptr; E != OldE;) { OldE = E; @@ -281,9 +281,9 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return E; }; - SourceRange NewRange = SkipParensParents(New)->getSourceRange(); - SourceLocation NewStart = NewRange.getBegin(); - SourceLocation NewEnd = NewRange.getEnd(); + const SourceRange NewRange = SkipParensParents(New)->getSourceRange(); + const SourceLocation NewStart = NewRange.getBegin(); + const SourceLocation NewEnd = NewRange.getEnd(); // Skip when the source location of the new expression is invalid. 
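checkReset and replaceNew together produce rewrites of this shape (illustrative, for modernize-make-unique with default options; `Widget`, `refresh` invented):

    #include <memory>
    struct Widget {
      explicit Widget(int V) : Value(V) {}
      int Value;
    };
    void refresh(std::unique_ptr<Widget> &P) {
      P.reset(new Widget(42)); // before
      // after: P = std::make_unique<Widget>(42);
    }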
if (NewStart.isInvalid() || NewEnd.isInvalid()) @@ -362,7 +362,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return false; } if (ArraySizeExpr.empty()) { - SourceRange InitRange = New->getDirectInitRange(); + const SourceRange InitRange = New->getDirectInitRange(); Diag << FixItHint::CreateRemoval( SourceRange(NewStart, InitRange.getBegin())); Diag << FixItHint::CreateRemoval(SourceRange(InitRange.getEnd(), NewEnd)); diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h index 28d5b459dd914..1d70f62d4be4e 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -64,4 +64,4 @@ class MakeSmartPtrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_SMART_PTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKESMARTPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h index 9c4f6bc746392..170343b9fca23 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeUniqueCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H #include "MakeSmartPtrCheck.h" @@ -37,4 +37,4 @@ class MakeUniqueCheck : public MakeSmartPtrCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKE_UNIQUE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MAKEUNIQUECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index d5ccbb73735ec..a257f5325f780 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -24,7 +24,8 @@ static bool isFirstFriendOfSecond(const CXXRecordDecl *Friend, const CXXRecordDecl *Class) { return llvm::any_of( Class->friends(), [Friend](FriendDecl *FriendDecl) -> bool { - if (TypeSourceInfo *FriendTypeSource = FriendDecl->getFriendType()) { + if (const TypeSourceInfo *FriendTypeSource = + FriendDecl->getFriendType()) { const QualType FriendType = FriendTypeSource->getType(); return FriendType->getAsCXXRecordDecl() == Friend; } @@ -208,7 +209,7 @@ static SmallVector collectParamDecls(const CXXConstructorDecl *Ctor, const ParmVarDecl *ParamDecl) { SmallVector Results; - unsigned ParamIdx = ParamDecl->getFunctionScopeIndex(); + const unsigned ParamIdx = ParamDecl->getFunctionScopeIndex(); for (const FunctionDecl *Redecl : Ctor->redecls()) Results.push_back(Redecl->getParamDecl(ParamIdx)); @@ -275,7 +276,7 
@@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { const auto *ParamDecl = Result.Nodes.getNodeAs<ParmVarDecl>("Param"); const auto *Initializer = Result.Nodes.getNodeAs<CXXCtorInitializer>("Initializer"); - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; // If the parameter is used for anything other than the copy, do not apply // the changes. @@ -299,7 +300,7 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { if (ParamDecl->getType()->isLValueReferenceType()) { // Check if we can successfully rewrite all declarations of the constructor. for (const ParmVarDecl *ParmDecl : collectParamDecls(Ctor, ParamDecl)) { - TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); + const TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); auto RefTL = ParamTL.getAs<ReferenceTypeLoc>(); if (RefTL.isNull()) { // We cannot rewrite this instance. The type is probably hidden behind @@ -309,11 +310,11 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { } // Rewrite all declarations. for (const ParmVarDecl *ParmDecl : collectParamDecls(Ctor, ParamDecl)) { - TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); + const TypeLoc ParamTL = ParmDecl->getTypeSourceInfo()->getTypeLoc(); auto RefTL = ParamTL.getAs<ReferenceTypeLoc>(); - TypeLoc ValueTL = RefTL.getPointeeLoc(); - CharSourceRange TypeRange = CharSourceRange::getTokenRange( + const TypeLoc ValueTL = RefTL.getPointeeLoc(); + const CharSourceRange TypeRange = CharSourceRange::getTokenRange( ParmDecl->getBeginLoc(), ParamTL.getEndLoc()); std::string ValueStr = Lexer::getSourceText( diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h index f27871c1a98b7..eb51f4a4c46ac 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -33,4 +33,4 @@ class PassByValueCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASS_BY_VALUE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_PASSBYVALUECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp index 8e514e4bc9893..2c4bddf262721 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.cpp @@ -51,11 +51,11 @@ static bool containsEscapedCharacters(const MatchFinder::MatchResult &Result, if (DisallowedChars.test(C)) return false; - CharSourceRange CharRange = Lexer::makeFileCharRange( + const CharSourceRange CharRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Literal->getSourceRange()), *Result.SourceManager, Result.Context->getLangOpts()); - StringRef Text = Lexer::getSourceText(CharRange, *Result.SourceManager, - Result.Context->getLangOpts()); + const StringRef Text = Lexer::getSourceText(CharRange, *Result.SourceManager, + Result.Context->getLangOpts()); if (Text.empty()
|| isRawStringLiteral(Text)) return false; @@ -116,7 +116,7 @@ createUserDefinedSuffix(const StringLiteral *Literal, const SourceManager &SM, const CharSourceRange CharRange = Lexer::makeFileCharRange(TokenRange, SM, LangOpts); if (T.hasUDSuffix()) { - StringRef Text = Lexer::getSourceText(CharRange, SM, LangOpts); + const StringRef Text = Lexer::getSourceText(CharRange, SM, LangOpts); const size_t UDSuffixPos = Text.find_last_of('"'); if (UDSuffixPos == StringRef::npos) return std::nullopt; @@ -135,7 +135,7 @@ static std::string createRawStringLiteral(const StringLiteral *Literal, Delimiter = (I == 0) ? DelimiterStem : DelimiterStem + std::to_string(I); } - std::optional UserDefinedSuffix = + const std::optional UserDefinedSuffix = createUserDefinedSuffix(Literal, SM, LangOpts); if (Delimiter.empty()) diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h index 8ce6ec0bef636..5be38dd9dc5be 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H #include "../ClangTidyCheck.h" #include @@ -40,4 +40,4 @@ class RawStringLiteralCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAW_STRING_LITERAL_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RAWSTRINGLITERALCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp index 38b30f7994ff3..aa2db2146475b 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp @@ -89,9 +89,9 @@ void RedundantVoidArgCheck::check(const MatchFinder::MatchResult &Result) { void RedundantVoidArgCheck::processFunctionDecl( const MatchFinder::MatchResult &Result, const FunctionDecl *Function) { const auto *Method = dyn_cast(Function); - SourceLocation Start = Method && Method->getParent()->isLambda() - ? Method->getBeginLoc() - : Function->getLocation(); + const SourceLocation Start = Method && Method->getParent()->isLambda() + ? 
Method->getBeginLoc() + : Function->getLocation(); SourceLocation End = Function->getEndLoc(); if (Function->isThisDeclarationADefinition()) { if (const Stmt *Body = Function->getBody()) { @@ -113,7 +113,8 @@ static bool isMacroIdentifier(const IdentifierTable &Idents, if (!ProtoToken.is(tok::TokenKind::raw_identifier)) return false; - IdentifierTable::iterator It = Idents.find(ProtoToken.getRawIdentifier()); + const IdentifierTable::iterator It = + Idents.find(ProtoToken.getRawIdentifier()); if (It == Idents.end()) return false; @@ -123,7 +124,7 @@ static bool isMacroIdentifier(const IdentifierTable &Idents, void RedundantVoidArgCheck::removeVoidArgumentTokens( const ast_matchers::MatchFinder::MatchResult &Result, SourceRange Range, StringRef GrammarLocation) { - CharSourceRange CharRange = + const CharSourceRange CharRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range), *Result.SourceManager, getLangOpts()); @@ -145,7 +146,7 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens( Token ProtoToken; const IdentifierTable &Idents = Result.Context->Idents; int MacroLevel = 0; - std::string Diagnostic = + const std::string Diagnostic = ("redundant void argument list in " + GrammarLocation).str(); while (!PrototypeLexer.LexFromRawLexer(ProtoToken)) { @@ -216,7 +217,7 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens( void RedundantVoidArgCheck::removeVoidToken(Token VoidToken, StringRef Diagnostic) { - SourceLocation VoidLoc = VoidToken.getLocation(); + const SourceLocation VoidLoc = VoidToken.getLocation(); diag(VoidLoc, Diagnostic) << FixItHint::CreateRemoval(VoidLoc); } @@ -239,9 +240,9 @@ void RedundantVoidArgCheck::processFieldDecl( void RedundantVoidArgCheck::processVarDecl( const MatchFinder::MatchResult &Result, const VarDecl *Var) { if (protoTypeHasNoParms(Var->getType())) { - SourceLocation Begin = Var->getBeginLoc(); + const SourceLocation Begin = Var->getBeginLoc(); if (Var->hasInit()) { - SourceLocation InitStart = + const SourceLocation InitStart = Result.SourceManager->getExpansionLoc(Var->getInit()->getBeginLoc()) .getLocWithOffset(-1); removeVoidArgumentTokens(Result, SourceRange(Begin, InitStart), @@ -273,8 +274,9 @@ void RedundantVoidArgCheck::processLambdaExpr( const MatchFinder::MatchResult &Result, const LambdaExpr *Lambda) { if (Lambda->getLambdaClass()->getLambdaCallOperator()->getNumParams() == 0 && Lambda->hasExplicitParameters()) { - SourceManager *SM = Result.SourceManager; - TypeLoc TL = Lambda->getLambdaClass()->getLambdaTypeInfo()->getTypeLoc(); + const SourceManager *SM = Result.SourceManager; + const TypeLoc TL = + Lambda->getLambdaClass()->getLambdaTypeInfo()->getTypeLoc(); removeVoidArgumentTokens(Result, {SM->getSpellingLoc(TL.getBeginLoc()), SM->getSpellingLoc(TL.getEndLoc())}, diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h index 53de74b68ff26..d6edd9950ddae 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H #include "../ClangTidyCheck.h" 
#include "clang/Lex/Token.h" @@ -73,4 +73,4 @@ class RedundantVoidArgCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANT_VOID_ARG_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REDUNDANTVOIDARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp index b562ae85aa266..d0577aeccd2f1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp @@ -96,10 +96,10 @@ void ReplaceAutoPtrCheck::registerPPCallbacks(const SourceManager &SM, } void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (const auto *E = Result.Nodes.getNodeAs(AutoPtrOwnershipTransferId)) { - CharSourceRange Range = Lexer::makeFileCharRange( + const CharSourceRange Range = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(E->getSourceRange()), SM, LangOptions()); if (Range.isInvalid()) @@ -140,7 +140,8 @@ void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { "auto_ptr") return; - SourceLocation EndLoc = AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1); + const SourceLocation EndLoc = + AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1); diag(AutoPtrLoc, "auto_ptr is deprecated, use unique_ptr instead") << FixItHint::CreateReplacement(SourceRange(AutoPtrLoc, EndLoc), "unique_ptr"); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h index 9a6e2bb0e074d..18f4740567d53 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -56,4 +56,4 @@ class ReplaceAutoPtrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_AUTO_PTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACEAUTOPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp index 64b0029fc0e37..be5e21dce3ba1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.cpp @@ -26,7 +26,7 @@ class ReplaceDisallowCopyAndAssignMacroCallbacks : public PPCallbacks { void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD, SourceRange Range, const MacroArgs *Args) override { - IdentifierInfo *Info = MacroNameTok.getIdentifierInfo(); + const IdentifierInfo *Info = MacroNameTok.getIdentifierInfo(); if (!Info || !Args || Args->getNumMacroArguments() != 1) return; if (Info->getName() != Check.getMacroName()) @@ -38,11 +38,11 @@ class ReplaceDisallowCopyAndAssignMacroCallbacks : public 
PPCallbacks { // For now we only support simple arguments that don't need to be // pre-expanded. return; - clang::IdentifierInfo *ClassIdent = ClassNameTok->getIdentifierInfo(); + const clang::IdentifierInfo *ClassIdent = ClassNameTok->getIdentifierInfo(); if (!ClassIdent) return; - std::string Replacement = llvm::formatv( + const std::string Replacement = llvm::formatv( R"cpp({0}(const {0} &) = delete; const {0} &operator=(const {0} &) = delete{1})cpp", ClassIdent->getName(), shouldAppendSemi(Range) ? ";" : ""); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp index 3d7b3eae544b6..cfc546a06b40c 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp @@ -78,7 +78,7 @@ void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { }(); std::string NewName = "shuffle"; - StringRef ContainerText = Lexer::getSourceText( + const StringRef ContainerText = Lexer::getSourceText( CharSourceRange::getTokenRange(MatchedDecl->getSourceRange()), *Result.SourceManager, getLangOpts()); if (ContainerText.starts_with("std::")) diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index 5f2be10ca66bb..3ffa3878bc429 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -37,4 +37,4 @@ class ReplaceRandomShuffleCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACE_RANDOM_SHUFFLE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_REPLACERANDOMSHUFFLECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp index eba2445c0aaea..15b64bc413be8 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.cpp @@ -54,7 +54,7 @@ void ReturnBracedInitListCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs<CXXConstructExpr>("ctor"); // Don't make replacements in macros. - SourceLocation Loc = MatchedConstructExpr->getExprLoc(); + const SourceLocation Loc = MatchedConstructExpr->getExprLoc(); if (Loc.isMacroID()) return; @@ -88,7 +88,7 @@ void ReturnBracedInitListCheck::check(const MatchFinder::MatchResult &Result) { } // Range for constructor name and opening brace.
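Removing the constructor-name range yields the check's usual transformation (sketch with an invented type):

    struct Point {
      Point(int X, int Y) : X(X), Y(Y) {}
      int X, Y;
    };
    Point origin() {
      return Point(0, 0); // before
      // after the fix-it: return {0, 0};
    }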
- CharSourceRange CtorCallSourceRange = CharSourceRange::getTokenRange( + const CharSourceRange CtorCallSourceRange = CharSourceRange::getTokenRange( Loc, CallParensRange.getBegin().getLocWithOffset(-1)); Diag << FixItHint::CreateRemoval(CtorCallSourceRange) diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h index ef465ea5e189d..be785716611a1 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class ReturnBracedInitListCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURN_BRACED_INIT_LIST_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_RETURNBRACEDINITLISTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp index 6078013166d46..06982b8698e0c 100644 --- a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.cpp @@ -286,7 +286,7 @@ void TypeTraitsCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *TL = Result.Nodes.getNodeAs(Bind)) { const NestedNameSpecifierLoc QualLoc = TL->getQualifierLoc(); - NestedNameSpecifier NNS = QualLoc.getNestedNameSpecifier(); + const NestedNameSpecifier NNS = QualLoc.getNestedNameSpecifier(); if (const auto *CTSD = dyn_cast_if_present<ClassTemplateSpecializationDecl>( NNS.getAsRecordDecl())) { if (isNamedDeclInStdTraitsSet(CTSD, TypeTraits)) @@ -304,7 +304,7 @@ void TypeTraitsCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *DNTL = Result.Nodes.getNodeAs<DependentNameTypeLoc>(Bind)) { - NestedNameSpecifierLoc QualLoc = DNTL->getQualifierLoc(); + const NestedNameSpecifierLoc QualLoc = DNTL->getQualifierLoc(); if (checkTemplatedDecl(QualLoc.getNestedNameSpecifier(), TypeTraits)) EmitTypeWarning(QualLoc, DNTL->getEndLoc(), DNTL->getElaboratedKeywordLoc()); diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp index 4e4817f2ec2e6..28d8f7572d32b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.cpp @@ -23,7 +23,7 @@ void UnaryStaticAssertCheck::check(const MatchFinder::MatchResult &Result) { const auto *AssertMessage = dyn_cast_if_present<StringLiteral>(MatchedDecl->getMessage()); - SourceLocation Loc = MatchedDecl->getLocation(); + const SourceLocation Loc = MatchedDecl->getLocation(); if (!AssertMessage || AssertMessage->getLength() || AssertMessage->getBeginLoc().isMacroID() || Loc.isMacroID()) diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h index 95611c9b13e77..ebe77b986d8a2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h
@@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UnaryStaticAssertCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARY_STATIC_ASSERT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_UNARYSTATICASSERTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index 01796a6f4af2d..977ade12e2c3a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -20,14 +20,13 @@ using namespace clang::ast_matchers; using namespace clang::ast_matchers::internal; namespace clang::tidy::modernize { -namespace { -const char IteratorDeclStmtId[] = "iterator_decl"; -const char DeclWithNewId[] = "decl_new"; -const char DeclWithCastId[] = "decl_cast"; -const char DeclWithTemplateCastId[] = "decl_template"; +static const char IteratorDeclStmtId[] = "iterator_decl"; +static const char DeclWithNewId[] = "decl_new"; +static const char DeclWithCastId[] = "decl_cast"; +static const char DeclWithTemplateCastId[] = "decl_template"; -size_t getTypeNameLength(bool RemoveStars, StringRef Text) { +static size_t getTypeNameLength(bool RemoveStars, StringRef Text) { enum CharType { Space, Alpha, Punctuation }; CharType LastChar = Space, BeforeSpace = Punctuation; size_t NumChars = 0; @@ -54,6 +53,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) { return NumChars; } +namespace { /// Matches variable declarations that have explicit initializers that /// are not initializer lists. /// @@ -65,7 +65,7 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) { /// MyType C; /// \endcode /// -/// varDecl(hasWrittenNonListInitializer()) maches \c I and \c A but not \c B +/// varDecl(hasWrittenNonListInitializer()) matches \c I and \c A but not \c B /// or \c C. AST_MATCHER(VarDecl, hasWrittenNonListInitializer) { const Expr *Init = Node.getAnyInitializer(); @@ -101,13 +101,23 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher, SugarMatcher) { if (SugarMatcher.matches(QT, Finder, Builder)) return true; - QualType NewQT = QT.getSingleStepDesugaredType(Finder->getASTContext()); + const QualType NewQT = + QT.getSingleStepDesugaredType(Finder->getASTContext()); if (NewQT == QT) return false; QT = NewQT; } } +/// Matches declaration reference or member expressions with explicit template +/// arguments. +AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs, + AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr, + MemberExpr)) { + return Node.hasExplicitTemplateArgs(); +} +} // namespace + /// Matches named declarations that have one of the standard iterator /// names: iterator, reverse_iterator, const_iterator, const_reverse_iterator. /// @@ -118,7 +128,7 @@ AST_MATCHER_P(QualType, isSugarFor, Matcher, SugarMatcher) { /// \endcode /// /// namedDecl(hasStdIteratorName()) matches \c I and \c CI. 
-Matcher<NamedDecl> hasStdIteratorName() { +static Matcher<NamedDecl> hasStdIteratorName() { static const StringRef IteratorNames[] = {"iterator", "reverse_iterator", "const_iterator", "const_reverse_iterator"}; @@ -137,34 +147,27 @@ Matcher<NamedDecl> hasStdIteratorName() { /// /// recordDecl(hasStdContainerName()) matches \c vector and \c forward_list /// but not \c my_vec. -Matcher<NamedDecl> hasStdContainerName() { - static StringRef ContainerNames[] = {"array", "deque", - "forward_list", "list", - "vector", +static Matcher<NamedDecl> hasStdContainerName() { + static const StringRef ContainerNames[] = { + "array", "deque", + "forward_list", "list", + "vector", - "map", "multimap", - "set", "multiset", + "map", "multimap", + "set", "multiset", - "unordered_map", "unordered_multimap", - "unordered_set", "unordered_multiset", + "unordered_map", "unordered_multimap", + "unordered_set", "unordered_multiset", - "queue", "priority_queue", - "stack"}; + "queue", "priority_queue", + "stack"}; return hasAnyName(ContainerNames); } -/// Matches declaration reference or member expressions with explicit template -/// arguments. -AST_POLYMORPHIC_MATCHER(hasExplicitTemplateArgs, - AST_POLYMORPHIC_SUPPORTED_TYPES(DeclRefExpr, - MemberExpr)) { - return Node.hasExplicitTemplateArgs(); -} - /// Returns a DeclarationMatcher that matches standard iterators nested /// inside records with a standard container name. -DeclarationMatcher standardIterator() { +static DeclarationMatcher standardIterator() { return decl( namedDecl(hasStdIteratorName()), hasDeclContext(recordDecl(hasStdContainerName(), isInStdNamespace()))); @@ -172,19 +175,19 @@ DeclarationMatcher standardIterator() { /// Returns a TypeMatcher that matches typedefs for standard iterators /// inside records with a standard container name. -TypeMatcher typedefIterator() { +static TypeMatcher typedefIterator() { return typedefType(hasDeclaration(standardIterator())); } /// Returns a TypeMatcher that matches records named for standard /// iterators nested inside records named for standard containers. -TypeMatcher nestedIterator() { +static TypeMatcher nestedIterator() { return recordType(hasDeclaration(standardIterator())); } /// Returns a TypeMatcher that matches types declared with using /// declarations and which name standard iterators for standard containers. -TypeMatcher iteratorFromUsingDeclaration() { +static TypeMatcher iteratorFromUsingDeclaration() { auto HasIteratorDecl = hasDeclaration(namedDecl(hasStdIteratorName())); // Unwrap the nested name specifier to test for one of the standard // containers. @@ -198,7 +201,7 @@ TypeMatcher iteratorFromUsingDeclaration() { /// This matcher returns declaration statements that contain variable /// declarations with written non-list initializer for standard iterators.
-StatementMatcher makeIteratorDeclMatcher() { +static StatementMatcher makeIteratorDeclMatcher() { return declStmt(unless(has( varDecl(anyOf(unless(hasWrittenNonListInitializer()), unless(hasType(isSugarFor(anyOf( @@ -207,7 +210,7 @@ StatementMatcher makeIteratorDeclMatcher() { .bind(IteratorDeclStmtId); } -StatementMatcher makeDeclWithNewMatcher() { +static StatementMatcher makeDeclWithNewMatcher() { return declStmt( unless(has(varDecl(anyOf( unless(hasInitializer(ignoringParenImpCasts(cxxNewExpr()))), @@ -225,13 +228,13 @@ StatementMatcher makeDeclWithNewMatcher() { .bind(DeclWithNewId); } -StatementMatcher makeDeclWithCastMatcher() { +static StatementMatcher makeDeclWithCastMatcher() { return declStmt( unless(has(varDecl(unless(hasInitializer(explicitCastExpr())))))) .bind(DeclWithCastId); } -StatementMatcher makeDeclWithTemplateCastMatcher() { +static StatementMatcher makeDeclWithTemplateCastMatcher() { auto ST = substTemplateTypeParmType(hasReplacementType(equalsBoundNode("arg"))); @@ -252,7 +255,7 @@ StatementMatcher makeDeclWithTemplateCastMatcher() { .bind(DeclWithTemplateCastId); } -StatementMatcher makeCombinedMatcher() { +static StatementMatcher makeCombinedMatcher() { return declStmt( // At least one varDecl should be a child of the declStmt to ensure // it's a declaration list and avoid matching other declarations, @@ -265,8 +268,6 @@ StatementMatcher makeCombinedMatcher() { makeDeclWithCastMatcher(), makeDeclWithTemplateCastMatcher())); } -} // namespace - UseAutoCheck::UseAutoCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), MinTypeNameLength(Options.get("MinTypeNameLength", 5)), @@ -327,7 +328,8 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) { // like function pointers. Not a concern since this action only works with // iterators but something to keep in mind in the future. 
- SourceRange Range(V->getTypeSourceInfo()->getTypeLoc().getSourceRange()); + const SourceRange Range( + V->getTypeSourceInfo()->getTypeLoc().getSourceRange()); diag(Range.getBegin(), "use auto when declaring iterators") << FixItHint::CreateReplacement(Range, "auto"); } @@ -343,7 +345,7 @@ static bool isMultiLevelPointerToTypeLocClasses( TypeLoc Loc, const std::initializer_list &LocClasses) { ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified}); - TypeLoc::TypeLocClass TLC = Loc.getTypeLocClass(); + const TypeLoc::TypeLocClass TLC = Loc.getTypeLocClass(); if (TLC != TypeLoc::Pointer && TLC != TypeLoc::MemberPointer) return false; ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified, @@ -360,7 +362,7 @@ void UseAutoCheck::replaceExpr( return; const QualType FirstDeclType = FirstDecl->getType().getCanonicalType(); - TypeSourceInfo *TSI = FirstDecl->getTypeSourceInfo(); + const TypeSourceInfo *TSI = FirstDecl->getTypeSourceInfo(); if (TSI == nullptr) return; @@ -410,7 +412,7 @@ void UseAutoCheck::replaceExpr( ignoreTypeLocClasses(Loc, {TypeLoc::Pointer, TypeLoc::Qualified}); ignoreTypeLocClasses(Loc, {TypeLoc::LValueReference, TypeLoc::RValueReference, TypeLoc::Qualified}); - SourceRange Range(Loc.getSourceRange()); + const SourceRange Range(Loc.getSourceRange()); if (MinTypeNameLength != 0 && getTypeNameLength(RemoveStars, @@ -421,17 +423,17 @@ void UseAutoCheck::replaceExpr( auto Diag = diag(Range.getBegin(), Message); - bool ShouldReplenishVariableName = isMultiLevelPointerToTypeLocClasses( + const bool ShouldReplenishVariableName = isMultiLevelPointerToTypeLocClasses( TSI->getTypeLoc(), {TypeLoc::FunctionProto, TypeLoc::ConstantArray}); // Space after 'auto' to handle cases where the '*' in the pointer type is // next to the identifier. This avoids changing 'int *p' into 'autop'. - llvm::StringRef Auto = ShouldReplenishVariableName - ? (RemoveStars ? "auto " : "auto *") - : (RemoveStars ? "auto " : "auto"); - std::string ReplenishedVariableName = + const llvm::StringRef Auto = ShouldReplenishVariableName + ? (RemoveStars ? "auto " : "auto *") + : (RemoveStars ? "auto " : "auto"); + const std::string ReplenishedVariableName = ShouldReplenishVariableName ? 
FirstDecl->getQualifiedNameAsString() : ""; - std::string Replacement = + const std::string Replacement = (Auto + llvm::StringRef{ReplenishedVariableName}).str(); Diag << FixItHint::CreateReplacement(Range, Replacement) << StarRemovals; } diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h index dc39077d5ac99..85e87fe918e34 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class UseAutoCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_AUTO_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEAUTOCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp index 8b5ffe86b1839..6e2118787f9b4 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp @@ -50,14 +50,14 @@ void UseBoolLiteralsCheck::registerMatchers(MatchFinder *Finder) { void UseBoolLiteralsCheck::check(const MatchFinder::MatchResult &Result) { const auto *Literal = Result.Nodes.getNodeAs<IntegerLiteral>("literal"); const auto *Cast = Result.Nodes.getNodeAs<Expr>("cast"); - bool LiteralBooleanValue = Literal->getValue().getBoolValue(); + const bool LiteralBooleanValue = Literal->getValue().getBoolValue(); if (Literal->isInstantiationDependent()) return; const Expr *Expression = Cast ?
Cast : Literal; - bool InMacro = Expression->getBeginLoc().isMacroID(); + const bool InMacro = Expression->getBeginLoc().isMacroID(); if (InMacro && IgnoreMacros) return; diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h index 64aff84b1be64..95bce0791258b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseBoolLiteralsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_BOOL_LITERALS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEBOOLLITERALSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index d5342a1664153..fdb088fe44be2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -55,7 +55,7 @@ static std::optional<TemplateSpecializationTypeLoc> matchEnableIfSpecializationImplTypename(TypeLoc TheType) { if (const auto Dep = TheType.getAs<DependentNameTypeLoc>()) { const IdentifierInfo *Identifier = Dep.getTypePtr()->getIdentifier(); - ElaboratedTypeKeyword Keyword = Dep.getTypePtr()->getKeyword(); + const ElaboratedTypeKeyword Keyword = Dep.getTypePtr()->getKeyword(); if (!Identifier || Identifier->getName() != "type" || (Keyword != ElaboratedTypeKeyword::Typename && Keyword != ElaboratedTypeKeyword::None)) { @@ -88,7 +88,7 @@ matchEnableIfSpecializationImplTypename(TypeLoc TheType) { if (!FirstParam || !FirstParam->getType()->isBooleanType()) return std::nullopt; - int NumArgs = SpecializationLoc.getNumArgs(); + const int NumArgs = SpecializationLoc.getNumArgs(); if (NumArgs != 1 && NumArgs != 2) return std::nullopt; @@ -124,7 +124,7 @@ matchEnableIfSpecializationImplTrait(TypeLoc TheType) { if (const auto *AliasedType = dyn_cast<DependentNameType>(Specialization->getAliasedType())) { - ElaboratedTypeKeyword Keyword = AliasedType->getKeyword(); + const ElaboratedTypeKeyword Keyword = AliasedType->getKeyword(); if (AliasedType->getIdentifier()->getName() != "type" || (Keyword != ElaboratedTypeKeyword::Typename && Keyword != ElaboratedTypeKeyword::None)) { @@ -133,7 +133,7 @@ matchEnableIfSpecializationImplTrait(TypeLoc TheType) { } else { return std::nullopt; } - int NumArgs = SpecializationLoc.getNumArgs(); + const int NumArgs = SpecializationLoc.getNumArgs(); if (NumArgs != 1 && NumArgs != 2) return std::nullopt; @@ -223,7 +223,7 @@ getConditionRange(ASTContext &Context, const LangOptions &LangOpts = Context.getLangOpts(); const SourceManager &SM = Context.getSourceManager(); if (EnableIf.getNumArgs() > 1) { - TemplateArgumentLoc NextArg = EnableIf.getArgLoc(1); + const TemplateArgumentLoc NextArg = EnableIf.getArgLoc(1); return {EnableIf.getLAngleLoc().getLocWithOffset(1), utils::lexer::findPreviousTokenKind( NextArg.getSourceRange().getBegin(), SM, LangOpts, tok::comma)}; @@ -235,7 +235,7 @@ static SourceRange
getTypeRange(ASTContext &Context, const TemplateSpecializationTypeLoc &EnableIf) { - TemplateArgumentLoc Arg = EnableIf.getArgLoc(1); + const TemplateArgumentLoc Arg = EnableIf.getArgLoc(1); const LangOptions &LangOpts = Context.getLangOpts(); const SourceManager &SM = Context.getSourceManager(); return {utils::lexer::findPreviousTokenKind(Arg.getSourceRange().getBegin(), @@ -269,7 +269,7 @@ getTypeText(ASTContext &Context, static std::optional<SourceLocation> findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); if (const auto *Constructor = dyn_cast<CXXConstructorDecl>(Function)) { @@ -282,7 +282,7 @@ findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) { return std::nullopt; } if (Function->isDeleted()) { - SourceLocation FunctionEnd = Function->getSourceRange().getEnd(); + const SourceLocation FunctionEnd = Function->getSourceRange().getEnd(); return utils::lexer::findNextAnyTokenKind(FunctionEnd, SM, LangOpts, tok::equal, tok::equal); } @@ -314,7 +314,7 @@ static bool isPrimaryExpression(const Expr *Expression) { static std::optional<std::string> getConditionText(const Expr *ConditionExpr, SourceRange ConditionRange, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); SourceLocation PrevTokenLoc = ConditionRange.getEnd(); @@ -325,14 +325,14 @@ static std::optional<std::string> getConditionText(const Expr *ConditionExpr, Token PrevToken; std::tie(PrevToken, PrevTokenLoc) = utils::lexer::getPreviousTokenAndStart( PrevTokenLoc, SM, LangOpts, SkipComments); - bool EndsWithDoubleSlash = + const bool EndsWithDoubleSlash = PrevToken.is(tok::comment) && Lexer::getSourceText(CharSourceRange::getCharRange( PrevTokenLoc, PrevTokenLoc.getLocWithOffset(2)), SM, LangOpts) == "//"; bool Invalid = false; - llvm::StringRef ConditionText = Lexer::getSourceText( + const llvm::StringRef ConditionText = Lexer::getSourceText( CharSourceRange::getCharRange(ConditionRange), SM, LangOpts, &Invalid); if (Invalid) return std::nullopt; @@ -361,9 +361,9 @@ static std::vector<FixItHint> handleReturnType(const FunctionDecl *Function, const TypeLoc &ReturnType, const EnableIfData &EnableIf, ASTContext &Context) { - TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); + const TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); - SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); + const SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); std::optional<std::string> ConditionText = getConditionText( EnableCondition.getSourceExpression(), ConditionRange, Context); @@ -410,12 +410,12 @@ handleTrailingTemplateType(const FunctionTemplateDecl *FunctionTemplate, const FunctionDecl *Function, const Decl *LastTemplateParam, const EnableIfData &EnableIf, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); - TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); + const TemplateArgumentLoc EnableCondition = EnableIf.Loc.getArgLoc(0); - SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); + const SourceRange ConditionRange = getConditionRange(Context, EnableIf.Loc); std::optional<std::string> ConditionText = getConditionText( EnableCondition.getSourceExpression(),
ConditionRange, Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp index 0d2c3a79b9ece..cc6b7bfd4fd5b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp @@ -163,7 +163,7 @@ static bool isZero(const Expr *E) { case Stmt::IntegerLiteralClass: return !cast<IntegerLiteral>(E)->getValue(); case Stmt::FloatingLiteralClass: { - llvm::APFloat Value = cast<FloatingLiteral>(E)->getValue(); + const llvm::APFloat Value = cast<FloatingLiteral>(E)->getValue(); return Value.isZero() && !Value.isNegative(); } default: @@ -297,16 +297,16 @@ void UseDefaultMemberInitCheck::checkDefaultInit( }) > 1) return; - SourceLocation StartLoc = Field->getBeginLoc(); + const SourceLocation StartLoc = Field->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) return; - SourceLocation FieldEnd = + const SourceLocation FieldEnd = Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0, *Result.SourceManager, getLangOpts()); - SourceLocation LParenEnd = Lexer::getLocForEndOfToken( + const SourceLocation LParenEnd = Lexer::getLocForEndOfToken( Init->getLParenLoc(), 0, *Result.SourceManager, getLangOpts()); - CharSourceRange InitRange = + const CharSourceRange InitRange = CharSourceRange::getCharRange(LParenEnd, Init->getRParenLoc()); const Expr *InitExpression = Init->getInit(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h index be6a18ad66d99..f379214308715 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class UseDefaultMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_DEFAULT_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEDEFAULTMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp index cc7c2d1e1dff5..2cc3ce1f7a686 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.cpp @@ -40,6 +40,12 @@ static constexpr char StrictCppStandardComplianceName[] = "StrictCppStandardCompliance"; static constexpr bool StrictCppStandardComplianceDefault = true; +static unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) { + return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) { + return isa<DesignatedInitExpr>(InitExpr); + }); +} + namespace { struct Designators { @@ -74,12 +80,6 @@ struct Designators { } }; -unsigned getNumberOfDesignated(const InitListExpr *SyntacticInitList) { - return llvm::count_if(*SyntacticInitList, [](auto *InitExpr) { - return isa<DesignatedInitExpr>(InitExpr); - }); -} - AST_MATCHER(CXXRecordDecl, isAggregate) { return Node.hasDefinition() &&
Node.isAggregate(); } @@ -152,7 +152,7 @@ void UseDesignatedInitializersCheck::check( if (IgnoreMacros && InitList->getBeginLoc().isMacroID()) return; { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(InitList->getLBraceLoc(), "use designated initializer list to initialize %0"); Diag << InitList->getType() << InitList->getSourceRange(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp index ade0085267db3..e585dd1d40002 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp @@ -81,41 +81,44 @@ AST_MATCHER(CXXMemberCallExpr, hasSameNumArgsAsDeclNumParams) { AST_MATCHER(DeclRefExpr, hasExplicitTemplateArgs) { return Node.hasExplicitTemplateArgs(); } +} // namespace // Helper Matcher which applies the given QualType Matcher either directly or by // resolving a pointer type to its pointee. Used to match v.push_back() as well // as p->push_back(). -auto hasTypeOrPointeeType( +static auto hasTypeOrPointeeType( const ast_matchers::internal::Matcher<QualType> &TypeMatcher) { return anyOf(hasType(TypeMatcher), hasType(pointerType(pointee(TypeMatcher)))); } // Matches if the node has canonical type matching any of the given names. -auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) { +static auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) { return hasCanonicalType(hasDeclaration(cxxRecordDecl(hasAnyName(TypeNames)))); } // Matches member call expressions of the named method on the listed container // types. -auto cxxMemberCallExprOnContainer(StringRef MethodName, - llvm::ArrayRef<StringRef> ContainerNames) { +static auto +cxxMemberCallExprOnContainer(StringRef MethodName, + llvm::ArrayRef<StringRef> ContainerNames) { return cxxMemberCallExpr( hasDeclaration(functionDecl(hasName(MethodName))), on(hasTypeOrPointeeType(hasWantedType(ContainerNames)))); } -const auto DefaultContainersWithPushBack = +static const auto DefaultContainersWithPushBack = "::std::vector; ::std::list; ::std::deque"; -const auto DefaultContainersWithPush = +static const auto DefaultContainersWithPush = "::std::stack; ::std::queue; ::std::priority_queue"; -const auto DefaultContainersWithPushFront = +static const auto DefaultContainersWithPushFront = "::std::forward_list; ::std::list; ::std::deque"; -const auto DefaultSmartPointers = +static const auto DefaultSmartPointers = "::std::shared_ptr; ::std::unique_ptr; ::std::auto_ptr; ::std::weak_ptr"; -const auto DefaultTupleTypes = "::std::pair; ::std::tuple"; -const auto DefaultTupleMakeFunctions = "::std::make_pair; ::std::make_tuple"; -const auto DefaultEmplacyFunctions = +static const auto DefaultTupleTypes = "::std::pair; ::std::tuple"; +static const auto DefaultTupleMakeFunctions = + "::std::make_pair; ::std::make_tuple"; +static const auto DefaultEmplacyFunctions = "vector::emplace_back; vector::emplace;" "deque::emplace; deque::emplace_front; deque::emplace_back;" "forward_list::emplace_after; forward_list::emplace_front;" @@ -129,7 +132,6 @@ const auto DefaultEmplacyFunctions = "unordered_multiset::emplace; unordered_multiset::emplace_hint;" "unordered_multimap::emplace; unordered_multimap::emplace_hint;" "stack::emplace; queue::emplace; priority_queue::emplace"; -} // namespace UseEmplaceCheck::UseEmplaceCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), IgnoreImplicitConstructors(Options.get( diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h
b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h index 87ebf6ff98c2b..a7ad5bb166b6b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H #include "../ClangTidyCheck.h" #include <vector> @@ -45,4 +45,4 @@ class UseEmplaceCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EMPLACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEMPLACECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp index d6ddbb69f7b0d..fde9c7323ce3c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp @@ -200,7 +200,7 @@ static bool isCopyAssignmentAndCanBeDefaulted(ASTContext *Context, /// Returns false if the body has any non-whitespace character. static bool bodyEmpty(const ASTContext *Context, const CompoundStmt *Body) { bool Invalid = false; - StringRef Text = Lexer::getSourceText( + const StringRef Text = Lexer::getSourceText( CharSourceRange::getCharRange(Body->getLBracLoc().getLocWithOffset(1), Body->getRBracLoc()), Context->getSourceManager(), Context->getLangOpts(), &Invalid); @@ -306,8 +306,8 @@ void UseEqualsDefaultCheck::check(const MatchFinder::MatchResult &Result) { return; // If there are comments inside the body, don't do the change. - bool ApplyFix = SpecialFunctionDecl->isCopyAssignmentOperator() || - bodyEmpty(Result.Context, Body); + const bool ApplyFix = SpecialFunctionDecl->isCopyAssignmentOperator() || + bodyEmpty(Result.Context, Body); std::vector<FixItHint> RemoveInitializers; unsigned MemberType = 0; @@ -345,14 +345,14 @@ void UseEqualsDefaultCheck::check(const MatchFinder::MatchResult &Result) { Diag << MemberType; if (ApplyFix) { - SourceLocation UnifiedEnd = utils::lexer::getUnifiedEndLoc( + const SourceLocation UnifiedEnd = utils::lexer::getUnifiedEndLoc( *Body, Result.Context->getSourceManager(), Result.Context->getLangOpts()); // Skipping comments, check for a semicolon after Body->getSourceRange() std::optional<Token> Token = utils::lexer::findNextTokenSkippingComments( UnifiedEnd, Result.Context->getSourceManager(), Result.Context->getLangOpts()); - StringRef Replacement = + const StringRef Replacement = Token && Token->is(tok::semi) ?
"= default" : "= default;"; Diag << FixItHint::CreateReplacement(Body->getSourceRange(), Replacement) << RemoveInitializers; diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h index 519f1899170cc..a17d3d894e3cd 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H #include "../ClangTidyCheck.h" @@ -48,4 +48,4 @@ class UseEqualsDefaultCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DEFAULT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDEFAULTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp index ab2d41a52040e..a19d2ecdad88d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.cpp @@ -74,7 +74,7 @@ void UseEqualsDeleteCheck::registerMatchers(MatchFinder *Finder) { void UseEqualsDeleteCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *Func = Result.Nodes.getNodeAs(SpecialFunction)) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Func->getEndLoc(), 0, *Result.SourceManager, getLangOpts()); if (IgnoreMacros && Func->getLocation().isMacroID()) diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h index 31a956bc49c5f..17155febbd374 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class UseEqualsDeleteCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_EQUALS_DELETE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEEQUALSDELETECHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp index 35320e83c5d4b..574cbea46124b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp @@ -146,7 +146,7 @@ void UseIntegerSignComparisonCheck::check( R3.setBegin(Lexer::getLocForEndOfToken( SubExprRHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); } - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(BinaryOp->getBeginLoc(), 
"comparison between 'signed' and 'unsigned' integers"); StringRef CmpNamespace; diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp index d22c99335d9bb..7e8d98241118a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.cpp @@ -110,11 +110,11 @@ void UseNodiscardCheck::registerMatchers(MatchFinder *Finder) { void UseNodiscardCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("no_discard"); // Don't make replacements if the location is invalid or in a macro. - SourceLocation Loc = MatchedDecl->getLocation(); + const SourceLocation Loc = MatchedDecl->getLocation(); if (Loc.isInvalid() || Loc.isMacroID()) return; - SourceLocation RetLoc = MatchedDecl->getInnerLocStart(); + const SourceLocation RetLoc = MatchedDecl->getInnerLocStart(); ASTContext &Context = *Result.Context; diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp index d1388dc6298e4..6bd5485abbac9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.cpp @@ -81,12 +81,12 @@ void UseNoexceptCheck::check(const MatchFinder::MatchResult &Result) { assert(Range.isValid() && "Exception Source Range is invalid."); - CharSourceRange CRange = Lexer::makeFileCharRange( + const CharSourceRange CRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Range), *Result.SourceManager, Result.Context->getLangOpts()); - bool IsNoThrow = FnTy->isNothrow(); - StringRef ReplacementStr = + const bool IsNoThrow = FnTy->isNothrow(); + const StringRef ReplacementStr = IsNoThrow ? NoexceptMacro.empty() ? "noexcept" : NoexceptMacro : NoexceptMacro.empty() ? (DtorOrOperatorDel || UseNoexceptFalse) ? "noexcept(false)" : "" diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h index 30b5d4ecd1cf2..a97b39bf54c4d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class UseNoexceptCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NOEXCEPT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENOEXCEPTCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index b6834c69204c2..928a00775fe12 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -92,12 +92,13 @@ static bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, /// Returns true if and only if a replacement was made. 
static void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, SourceLocation StartLoc, SourceLocation EndLoc) { - CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); + const CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); // Add a space if nullptr follows an alphanumeric character. This happens // whenever there is an c-style explicit cast to nullptr not surrounded by // parentheses and right beside a return statement. - SourceLocation PreviousLocation = StartLoc.getLocWithOffset(-1); - bool NeedsSpace = isAlphanumeric(*SM.getCharacterData(PreviousLocation)); + const SourceLocation PreviousLocation = StartLoc.getLocWithOffset(-1); + const bool NeedsSpace = + isAlphanumeric(*SM.getCharacterData(PreviousLocation)); Check.diag(Range.getBegin(), "use nullptr") << FixItHint::CreateReplacement( Range, NeedsSpace ? " nullptr" : "nullptr"); } @@ -136,7 +137,7 @@ class MacroArgUsageVisitor : public RecursiveASTVisitor<MacroArgUsageVisitor> { } bool TraverseStmt(Stmt *S) { - bool VisitedPreviously = Visited; + const bool VisitedPreviously = Visited; if (!RecursiveASTVisitor::TraverseStmt(S)) return false; @@ -258,8 +259,8 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // If the location comes from a macro body expansion, check to see if its // coming from one of the allowed 'NULL' macros. if (SM.isMacroArgExpansion(StartLoc) && SM.isMacroArgExpansion(EndLoc)) { - SourceLocation FileLocStart = SM.getFileLoc(StartLoc), - FileLocEnd = SM.getFileLoc(EndLoc); + const SourceLocation FileLocStart = SM.getFileLoc(StartLoc), + FileLocEnd = SM.getFileLoc(EndLoc); SourceLocation ImmediateMacroArgLoc, MacroLoc; // Skip NULL macros used in macro. if (!getMacroAndArgLocations(StartLoc, ImmediateMacroArgLoc, MacroLoc) || @@ -274,7 +275,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { } if (SM.isMacroBodyExpansion(StartLoc) && SM.isMacroBodyExpansion(EndLoc)) { - StringRef OutermostMacroName = + const StringRef OutermostMacroName = getOutermostMacroName(StartLoc, SM, Context.getLangOpts()); // Check to see if the user wants to replace the macro being expanded. @@ -302,7 +303,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { /// Tests that all expansions of a macro arg, one of which expands to /// result in \p CE, yield NullTo(Member)Pointer casts. bool allArgUsesValid(const CastExpr *CE) { - SourceLocation CastLoc = CE->getBeginLoc(); + const SourceLocation CastLoc = CE->getBeginLoc(); // Step 1: Get location of macro arg and location of the macro the arg was // provided to. @@ -348,17 +349,17 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // Find the location of the immediate macro expansion. while (true) { - std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(ArgLoc); + const std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(ArgLoc); const SrcMgr::SLocEntry *E = &SM.getSLocEntry(LocInfo.first); const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); - SourceLocation OldArgLoc = ArgLoc; + const SourceLocation OldArgLoc = ArgLoc; ArgLoc = Expansion.getExpansionLocStart(); if (!Expansion.isMacroArgExpansion()) { if (!MacroLoc.isFileID()) return false; - StringRef Name = + const StringRef Name = Lexer::getImmediateMacroName(OldArgLoc, SM, Context.getLangOpts()); return llvm::is_contained(NullMacros, Name); } @@ -371,7 +372,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { // If spelling location resides in the same FileID as macro expansion // location, it means there is no inner macro.
- FileID MacroFID = SM.getFileID(MacroLoc); + const FileID MacroFID = SM.getFileID(MacroLoc); if (SM.isInFileID(ArgLoc, MacroFID)) { // Don't transform this case. If the characters that caused the // null-conversion come from within a macro, they can't be changed. @@ -401,7 +402,7 @@ class CastSequenceVisitor : public RecursiveASTVisitor<CastSequenceVisitor> { SourceLocation Loc = TestLoc, MacroLoc; while (true) { - std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + const std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); const SrcMgr::SLocEntry *E = &SM.getSLocEntry(LocInfo.first); const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h index 7c7b5ae02f1cd..1caa07afe352a 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H #include "../ClangTidyCheck.h" @@ -31,4 +31,4 @@ class UseNullptrCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_NULLPTR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USENULLPTRCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp index 6a19183737119..62a2de23147a7 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp @@ -55,9 +55,9 @@ void UseOverrideCheck::registerMatchers(MatchFinder *Finder) { static SmallVector<Token, 16> parseTokens(CharSourceRange Range, const MatchFinder::MatchResult &Result) { const SourceManager &Sources = *Result.SourceManager; - std::pair<FileID, unsigned> LocInfo = + const std::pair<FileID, unsigned> LocInfo = Sources.getDecomposedLoc(Range.getBegin()); - StringRef File = Sources.getBufferData(LocInfo.first); + const StringRef File = Sources.getBufferData(LocInfo.first); const char *TokenBegin = File.data() + LocInfo.second; Lexer RawLexer(Sources.getLocForStartOfFile(LocInfo.first), Result.Context->getLangOpts(), File.begin(), TokenBegin, @@ -103,12 +103,12 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { Method->isOutOfLine()) return; - bool HasVirtual = Method->isVirtualAsWritten(); - bool HasOverride = Method->getAttr<OverrideAttr>(); - bool HasFinal = Method->getAttr<FinalAttr>(); + const bool HasVirtual = Method->isVirtualAsWritten(); + const bool HasOverride = Method->getAttr<OverrideAttr>(); + const bool HasFinal = Method->getAttr<FinalAttr>(); - bool OnlyVirtualSpecified = HasVirtual && !HasOverride && !HasFinal; - unsigned KeywordCount = HasVirtual + HasOverride + HasFinal; + const bool OnlyVirtualSpecified = HasVirtual && !HasOverride && !HasFinal; + const unsigned KeywordCount = HasVirtual + HasOverride + HasFinal; if ((!OnlyVirtualSpecified && KeywordCount == 1) || (!HasVirtual && HasOverride && HasFinal && AllowOverrideAndFinal)) @@ -120,12 +120,12 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { } else if (KeywordCount == 0) { Message = "annotate this function with '%0' or (rarely) '%1'"; } else { - StringRef Redundant = + const StringRef Redundant = HasVirtual
? (HasOverride && HasFinal && !AllowOverrideAndFinal ? "'virtual' and '%0' are" : "'virtual' is") : "'%0' is"; - StringRef Correct = HasFinal ? "'%1'" : "'%0'"; + const StringRef Correct = HasFinal ? "'%1'" : "'%0'"; Message = (llvm::Twine(Redundant) + " redundant since the function is already declared " + Correct) @@ -135,7 +135,7 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { auto Diag = diag(Method->getLocation(), Message) << OverrideSpelling << FinalSpelling; - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(Method->getSourceRange()), Sources, getLangOpts()); @@ -151,9 +151,9 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { if (!HasFinal && !HasOverride) { SourceLocation InsertLoc; std::string ReplacementText = (OverrideSpelling + " ").str(); - SourceLocation MethodLoc = Method->getLocation(); + const SourceLocation MethodLoc = Method->getLocation(); - for (Token T : Tokens) { + for (const Token T : Tokens) { if (T.is(tok::kw___attribute) && !Sources.isBeforeInTranslationUnit(T.getLocation(), MethodLoc)) { InsertLoc = T.getLocation(); @@ -164,7 +164,7 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { if (Method->hasAttrs()) { for (const clang::Attr *A : Method->getAttrs()) { if (!A->isImplicit() && !A->isInherited()) { - SourceLocation Loc = + const SourceLocation Loc = Sources.getExpansionLoc(A->getRange().getBegin()); if ((!InsertLoc.isValid() || Sources.isBeforeInTranslationUnit(Loc, InsertLoc)) && @@ -221,13 +221,14 @@ void UseOverrideCheck::check(const MatchFinder::MatchResult &Result) { } if (HasFinal && HasOverride && !AllowOverrideAndFinal) { - SourceLocation OverrideLoc = Method->getAttr()->getLocation(); + const SourceLocation OverrideLoc = + Method->getAttr()->getLocation(); Diag << FixItHint::CreateRemoval( CharSourceRange::getTokenRange(OverrideLoc, OverrideLoc)); } if (HasVirtual) { - for (Token Tok : Tokens) { + for (const Token Tok : Tokens) { if (Tok.is(tok::kw_virtual)) { std::optional NextToken = utils::lexer::findNextTokenIncludingComments( diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp index 9bf316939e2d0..8849c331608f9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp @@ -74,7 +74,8 @@ findLocksInCompoundStmt(const CompoundStmt *Block, for (const Stmt *Stmt : Block->body()) { if (const auto *DS = dyn_cast(Stmt)) { - llvm::SmallVector LockGuards = getLockGuardsFromDecl(DS); + const llvm::SmallVector LockGuards = + getLockGuardsFromDecl(DS); if (!LockGuards.empty()) { CurrentLockGuardGroup.append(LockGuards); @@ -176,7 +177,7 @@ void UseScopedLockCheck::registerMatchers(MatchFinder *Finder) { void UseScopedLockCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *DS = Result.Nodes.getNodeAs("lock-decl-single")) { - llvm::SmallVector Decls = getLockGuardsFromDecl(DS); + const llvm::SmallVector Decls = getLockGuardsFromDecl(DS); diagOnMultipleLocks({Decls}, Result); return; } diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp index 414aa86c5fbd2..0315728f851d1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp @@ -81,16 
+81,17 @@ AST_MATCHER_P(clang::Expr, anyOfExhaustive, std::vector<Matcher<clang::Stmt>>, // literals. struct MatchBuilder { auto - ignoreParenAndArithmeticCasting(const Matcher<clang::Expr> Matcher) const { + ignoreParenAndArithmeticCasting(const Matcher<clang::Expr> &Matcher) const { return expr(hasType(qualType(isArithmetic())), ignoringParenCasts(Matcher)); } - auto ignoreParenAndFloatingCasting(const Matcher<clang::Expr> Matcher) const { + auto + ignoreParenAndFloatingCasting(const Matcher<clang::Expr> &Matcher) const { return expr(hasType(qualType(isFloating())), ignoringParenCasts(Matcher)); } auto matchMathCall(const StringRef FunctionName, - const Matcher<clang::Expr> ArgumentMatcher) const { + const Matcher<clang::Expr> &ArgumentMatcher) const { auto HasAnyPrecisionName = hasAnyName( FunctionName, (FunctionName + "l").str(), (FunctionName + "f").str()); // Support long double(l) and float(f). @@ -100,7 +101,7 @@ struct MatchBuilder { hasArgument(0, ArgumentMatcher)))); } - auto matchSqrt(const Matcher<clang::Expr> ArgumentMatcher) const { + auto matchSqrt(const Matcher<clang::Expr> &ArgumentMatcher) const { return matchMathCall("sqrt", ArgumentMatcher); } @@ -148,7 +149,7 @@ struct MatchBuilder { return expr(anyOf(Int, Float, Dref)); } - auto match1Div(const Matcher<clang::Expr> Match) const { + auto match1Div(const Matcher<clang::Expr> &Match) const { return binaryOperator(hasOperatorName("/"), hasLHS(matchValue(1)), hasRHS(Match)); } @@ -307,7 +308,7 @@ UseStdNumbersCheck::UseStdNumbersCheck(const StringRef Name, void UseStdNumbersCheck::registerMatchers(MatchFinder *const Finder) { const auto Matches = MatchBuilder{DiffThreshold}; - std::vector<Matcher<clang::Stmt>> ConstantMatchers = { + const std::vector<Matcher<clang::Stmt>> ConstantMatchers = { Matches.matchLog2Euler(), Matches.matchLog10Euler(), Matches.matchEulerTopLevel(), Matches.matchEgamma(), Matches.matchInvSqrtPi(), Matches.matchInvPi(), diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp index 99ade046305c1..22dc0683ac348 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.cpp @@ -70,8 +70,8 @@ void UseStdPrintCheck::registerPPCallbacks(const SourceManager &SM, this->PP = PP; } -static clang::ast_matchers::StatementMatcher -unusedReturnValue(clang::ast_matchers::StatementMatcher MatchedCallExpr) { +static clang::ast_matchers::StatementMatcher unusedReturnValue( + const clang::ast_matchers::StatementMatcher &MatchedCallExpr) { auto UnusedInCompoundStmt = compoundStmt(forEach(MatchedCallExpr), // The checker can't currently differentiate between the diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h index f5b3f719c56ce..18cff9aa962b5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -48,4 +48,4 @@ class UseStdPrintCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USESTDPRINTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USESTDPRINTCHECK_H diff --git
a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index d623ec402179b..3c828c4c37fe1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -55,7 +55,7 @@ struct UnqualNameVisitor : public RecursiveASTVisitor<UnqualNameVisitor> { bool visitUnqualName(StringRef UnqualName) { // Check for collisions with function arguments. - for (ParmVarDecl *Param : F.parameters()) + for (const ParmVarDecl *Param : F.parameters()) if (const IdentifierInfo *Ident = Param->getIdentifier()) if (Ident->getName() == UnqualName) { Collision = true; @@ -126,7 +126,7 @@ struct UnqualNameVisitor : public RecursiveASTVisitor<UnqualNameVisitor> { } bool VisitDeclRefExpr(DeclRefExpr *S) { - DeclarationName Name = S->getNameInfo().getName(); + const DeclarationName Name = S->getNameInfo().getName(); return S->getQualifierLoc() || Name.isEmpty() || !Name.isIdentifier() || !visitUnqualName(Name.getAsIdentifierInfo()->getName()); } @@ -159,14 +159,14 @@ static SourceLocation findTrailingReturnTypeSourceLocation( const FunctionDecl &F, const FunctionTypeLoc &FTL, const ASTContext &Ctx, const SourceManager &SM, const LangOptions &LangOpts) { // We start with the location of the closing parenthesis. - SourceRange ExceptionSpecRange = F.getExceptionSpecSourceRange(); + const SourceRange ExceptionSpecRange = F.getExceptionSpecSourceRange(); if (ExceptionSpecRange.isValid()) return Lexer::getLocForEndOfToken(ExceptionSpecRange.getEnd(), 0, SM, LangOpts); // If the function argument list ends inside of a macro, it is dangerous to // start lexing from here - bail out. - SourceLocation ClosingParen = FTL.getRParenLoc(); + const SourceLocation ClosingParen = FTL.getRParenLoc(); if (ClosingParen.isMacroID()) return {}; @@ -174,8 +174,8 @@ static SourceLocation findTrailingReturnTypeSourceLocation( Lexer::getLocForEndOfToken(ClosingParen, 0, SM, LangOpts); // Skip subsequent CV and ref qualifiers. - std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result); - StringRef File = SM.getBufferData(Loc.first); + const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result); + const StringRef File = SM.getBufferData(Loc.first); const char *TokenBegin = File.data() + Loc.second; Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(), TokenBegin, File.end()); @@ -220,7 +220,7 @@ classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { Token End; End.startToken(); End.setKind(tok::eof); - SmallVector<Token, 2> Stream{Tok, End}; + const SmallVector<Token, 2> Stream{Tok, End}; // FIXME: do not report these token to Preprocessor.TokenWatcher.
PP.EnterTokenStream(Stream, false, /*IsReinject=*/false); @@ -230,8 +230,8 @@ classifyToken(const FunctionDecl &F, Preprocessor &PP, Token Tok) { if (T.is(tok::eof)) break; - bool Qual = isCvr(T); - bool Spec = isSpecifier(T); + const bool Qual = isCvr(T); + const bool Spec = isSpecifier(T); CT.IsQualifier &= Qual; CT.IsSpecifier &= Spec; ContainsQualifiers |= Qual; @@ -252,12 +252,12 @@ classifyTokensBeforeFunctionName(const FunctionDecl &F, const ASTContext &Ctx, const SourceManager &SM, const LangOptions &LangOpts, Preprocessor *PP) { - SourceLocation BeginF = expandIfMacroId(F.getBeginLoc(), SM); - SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM); + const SourceLocation BeginF = expandIfMacroId(F.getBeginLoc(), SM); + const SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM); // Create tokens for everything before the name of the function. - std::pair Loc = SM.getDecomposedLoc(BeginF); - StringRef File = SM.getBufferData(Loc.first); + const std::pair Loc = SM.getDecomposedLoc(BeginF); + const StringRef File = SM.getBufferData(Loc.first); const char *TokenBegin = File.data() + Loc.second; Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(), TokenBegin, File.end()); @@ -369,9 +369,9 @@ static SourceLocation findLambdaTrailingReturnInsertLoc( else ParamEndLoc = Method->getParametersSourceRange().getEnd(); - std::pair ParamEndLocInfo = + const std::pair ParamEndLocInfo = SM.getDecomposedLoc(ParamEndLoc); - StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first); + const StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first); Lexer Lexer(SM.getLocForStartOfFile(ParamEndLocInfo.first), LangOpts, Buffer.begin(), Buffer.data() + ParamEndLocInfo.second, @@ -421,11 +421,11 @@ static void keepSpecifiers(std::string &ReturnType, std::string &Auto, return; // Find specifiers, remove them from the return type, add them to 'auto'. - unsigned int ReturnTypeBeginOffset = + const unsigned int ReturnTypeBeginOffset = SM.getDecomposedLoc(ReturnTypeCVRange.getBegin()).second; - size_t InitialAutoLength = Auto.size(); + const size_t InitialAutoLength = Auto.size(); unsigned int DeletedChars = 0; - for (ClassifiedToken CT : *MaybeTokens) { + for (const ClassifiedToken CT : *MaybeTokens) { if (SM.isBeforeInTranslationUnit(CT.T.getLocation(), ReturnTypeCVRange.getBegin()) || SM.isBeforeInTranslationUnit(ReturnTypeCVRange.getEnd(), @@ -436,10 +436,11 @@ static void keepSpecifiers(std::string &ReturnType, std::string &Auto, // Add the token to 'auto' and remove it from the return type, including // any whitespace following the token. 
- unsigned int TOffset = SM.getDecomposedLoc(CT.T.getLocation()).second; + const unsigned int TOffset = SM.getDecomposedLoc(CT.T.getLocation()).second; assert(TOffset >= ReturnTypeBeginOffset && "Token location must be after the beginning of the return type"); - unsigned int TOffsetInRT = TOffset - ReturnTypeBeginOffset - DeletedChars; + const unsigned int TOffsetInRT = + TOffset - ReturnTypeBeginOffset - DeletedChars; unsigned int TLengthWithWS = CT.T.getLength(); while (TOffsetInRT + TLengthWithWS < ReturnType.size() && llvm::isSpace(ReturnType[TOffsetInRT + TLengthWithWS])) @@ -548,7 +549,7 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { return; } - SourceLocation InsertionLoc = + const SourceLocation InsertionLoc = findTrailingReturnTypeSourceLocation(*F, FTL, Ctx, SM, LangOpts); if (InsertionLoc.isInvalid()) { diag(F->getLocation(), ErrorMessageOnFunction); @@ -558,7 +559,7 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { // Using the declared return type via F->getDeclaredReturnType().getAsString() // discards user formatting and order of const, volatile, type, whitespace, // space before & ... . - SourceRange ReturnTypeCVRange = findReturnTypeAndCVSourceRange( + const SourceRange ReturnTypeCVRange = findReturnTypeAndCVSourceRange( *F, FTL.getReturnLoc(), Ctx, SM, LangOpts, PP); if (ReturnTypeCVRange.isInvalid()) { diag(F->getLocation(), ErrorMessageOnFunction); @@ -580,13 +581,13 @@ void UseTrailingReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { return; } - SourceLocation ReturnTypeEnd = + const SourceLocation ReturnTypeEnd = Lexer::getLocForEndOfToken(ReturnTypeCVRange.getEnd(), 0, SM, LangOpts); - StringRef CharAfterReturnType = Lexer::getSourceText( + const StringRef CharAfterReturnType = Lexer::getSourceText( CharSourceRange::getCharRange(ReturnTypeEnd, ReturnTypeEnd.getLocWithOffset(1)), SM, LangOpts); - bool NeedSpaceAfterAuto = + const bool NeedSpaceAfterAuto = CharAfterReturnType.empty() || !llvm::isSpace(CharAfterReturnType[0]); std::string Auto = NeedSpaceAfterAuto ? "auto " : "auto"; diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp index 03ecec9bd175b..e3672f84a3a5c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.cpp @@ -96,7 +96,7 @@ void UseTransparentFunctorsCheck::check( FunctorParentType->template_arguments()[ArgNum]; if (Arg.getKind() != TemplateArgument::Type) continue; - QualType ParentArgType = Arg.getAsType(); + const QualType ParentArgType = Arg.getAsType(); if (ParentArgType->isRecordType() && ParentArgType->getAsCXXRecordDecl() == Functor->getAsType()->getAsCXXRecordDecl()) @@ -105,13 +105,13 @@ void UseTransparentFunctorsCheck::check( // Functor is a default template argument. 
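For context on the UseTransparentFunctorsCheck changes around this hunk: a transparent functor such as std::less<> lets an associative container compare heterogeneous keys directly, which is what the check steers users toward. A self-contained illustration, not part of the patch:

    #include <set>
    #include <string>

    std::set<std::string, std::less<std::string>> A; // flagged: specialized functor
    std::set<std::string, std::less<>> B;            // transparent alternative

    bool Contains(const std::set<std::string, std::less<>> &S, const char *Key) {
      // With std::less<>, find() compares const char* against std::string
      // in place instead of constructing a temporary std::string key.
      return S.find(Key) != S.end();
    }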
if (ArgNum == FunctorParentType->template_arguments().size()) return; - TemplateArgumentLoc FunctorLoc = FunctorParentLoc.getArgLoc(ArgNum); + const TemplateArgumentLoc FunctorLoc = FunctorParentLoc.getArgLoc(ArgNum); auto FunctorTypeLoc = getInnerTypeLocAs<TemplateSpecializationTypeLoc>( FunctorLoc.getTypeSourceInfo()->getTypeLoc()); if (FunctorTypeLoc.isNull()) return; - SourceLocation ReportLoc = FunctorLoc.getLocation(); + const SourceLocation ReportLoc = FunctorLoc.getLocation(); if (ReportLoc.isInvalid()) return; diag(ReportLoc, Message) << FuncClass->getName() diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h index 0af729b54cfce..936eaf1b23060 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseTransparentFunctorsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_TRANSPARENT_FUNCTORS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USETRANSPARENTFUNCTORSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp index eef9d39800360..08c40d4554488 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp @@ -15,7 +15,7 @@ using namespace clang::ast_matchers; namespace clang::tidy::modernize { void UseUncaughtExceptionsCheck::registerMatchers(MatchFinder *Finder) { - std::string MatchText = "::std::uncaught_exception"; + const std::string MatchText = "::std::uncaught_exception"; // Using declaration: warning and fix-it.
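The matcher registered below targets ::std::uncaught_exception, which was deprecated in C++17 and removed in C++20 in favor of std::uncaught_exceptions(). The rewrite the check performs amounts to:

    #include <exception>

    bool ExceptionInFlight() {
      // before: return std::uncaught_exception();   // flagged
      // after the fix-it:
      return std::uncaught_exceptions() > 0;
    }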
Finder->addMatcher( @@ -78,7 +78,7 @@ void UseUncaughtExceptionsCheck::check(const MatchFinder::MatchResult &Result) { *Result.SourceManager, getLangOpts()); Text.consume_back("()"); - int TextLength = Text.size(); + const int TextLength = Text.size(); if (WarnOnly) { return; diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h index 772133d492a9f..09b0ba517caa0 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class UseUncaughtExceptionsCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_UNCAUGHT_EXCEPTIONS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUNCAUGHTEXCEPTIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index 72673753e6c60..38b368957c5d5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -114,7 +114,7 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { if (ExternCDecl && IgnoreExternC) return; - SourceLocation StartLoc = MatchedDecl->getBeginLoc(); + const SourceLocation StartLoc = MatchedDecl->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) return; @@ -172,7 +172,7 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { .str(), ExtraReference.str()}; }(); - StringRef Name = MatchedDecl->getName(); + const StringRef Name = MatchedDecl->getName(); SourceRange ReplaceRange = MatchedDecl->getSourceRange(); // typedefs with multiple comma-separated definitions produce multiple @@ -223,7 +223,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { return; } - std::string Replacement = (Using + Name + " = " + Type + QualifierStr).str(); + const std::string Replacement = + (Using + Name + " = " + Type + QualifierStr).str(); Diag << FixItHint::CreateReplacement(ReplaceRange, Replacement); } } // namespace clang::tidy::modernize diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 5ecabc7a17a45..60813cd04c66e 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H #include "../ClangTidyCheck.h" @@ -40,4 +40,4 @@ class UseUsingCheck : public ClangTidyCheck { } // namespace clang::tidy::modernize -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USE_USING_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_USEUSINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h index 07ee68a55b6b4..a44ef31a683aa 100644 --- a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H #include "../ClangTidyCheck.h" #include "clang/StaticAnalyzer/Checkers/MPIFunctionClassifier.h" @@ -48,4 +48,4 @@ class BufferDerefCheck : public ClangTidyCheck { } // namespace clang::tidy::mpi -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFER_DEREF_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_BUFFERDEREFCHECK_H diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp index 17c1283b4d414..370a54d892809 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.cpp @@ -41,39 +41,39 @@ isMPITypeMatching(const std::multimap<BuiltinType::Kind, std::string> &MultiMap, /// /// \returns true if the type is a standard type static bool isStandardMPIDatatype(StringRef MPIDatatype) { - static llvm::StringSet<> AllTypes = {"MPI_C_BOOL", - "MPI_CHAR", - "MPI_SIGNED_CHAR", - "MPI_UNSIGNED_CHAR", - "MPI_WCHAR", - "MPI_INT", - "MPI_LONG", - "MPI_SHORT", - "MPI_LONG_LONG", - "MPI_LONG_LONG_INT", - "MPI_UNSIGNED", - "MPI_UNSIGNED_SHORT", - "MPI_UNSIGNED_LONG", - "MPI_UNSIGNED_LONG_LONG", - "MPI_FLOAT", - "MPI_DOUBLE", - "MPI_LONG_DOUBLE", - "MPI_C_COMPLEX", - "MPI_C_FLOAT_COMPLEX", - "MPI_C_DOUBLE_COMPLEX", - "MPI_C_LONG_DOUBLE_COMPLEX", - "MPI_INT8_T", - "MPI_INT16_T", - "MPI_INT32_T", - "MPI_INT64_T", - "MPI_UINT8_T", - "MPI_UINT16_T", - "MPI_UINT32_T", - "MPI_UINT64_T", - "MPI_CXX_BOOL", - "MPI_CXX_FLOAT_COMPLEX", - "MPI_CXX_DOUBLE_COMPLEX", - "MPI_CXX_LONG_DOUBLE_COMPLEX"}; + static const llvm::StringSet<> AllTypes = {"MPI_C_BOOL", + "MPI_CHAR", + "MPI_SIGNED_CHAR", + "MPI_UNSIGNED_CHAR", + "MPI_WCHAR", + "MPI_INT", + "MPI_LONG", + "MPI_SHORT", + "MPI_LONG_LONG", + "MPI_LONG_LONG_INT", + "MPI_UNSIGNED", + "MPI_UNSIGNED_SHORT", + "MPI_UNSIGNED_LONG", + "MPI_UNSIGNED_LONG_LONG", + "MPI_FLOAT", + "MPI_DOUBLE", + "MPI_LONG_DOUBLE", + "MPI_C_COMPLEX", + "MPI_C_FLOAT_COMPLEX", + "MPI_C_DOUBLE_COMPLEX", + "MPI_C_LONG_DOUBLE_COMPLEX", + "MPI_INT8_T", + "MPI_INT16_T", + "MPI_INT32_T", + "MPI_INT64_T", + "MPI_UINT8_T", + "MPI_UINT16_T", + "MPI_UINT32_T", + "MPI_UINT64_T", + "MPI_CXX_BOOL", + "MPI_CXX_FLOAT_COMPLEX", + "MPI_CXX_DOUBLE_COMPLEX", + "MPI_CXX_LONG_DOUBLE_COMPLEX"}; return AllTypes.contains(MPIDatatype); } @@ -90,7 +90,7 @@ static bool isBuiltinTypeMatching(const BuiltinType *Builtin, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> BuiltinMatches = { + static const std::multimap<BuiltinType::Kind, std::string> BuiltinMatches = { // On some systems like PPC or ARM, 'char' is unsigned by default which is // why distinct signedness for the buffer and MPI type is tolerated.
{BuiltinType::SChar, "MPI_CHAR"}, @@ -143,7 +143,7 @@ static bool isCComplexTypeMatching(const ComplexType *const Complex, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> ComplexCMatches = { + static const std::multimap<BuiltinType::Kind, std::string> ComplexCMatches = { {BuiltinType::Float, "MPI_C_COMPLEX"}, {BuiltinType::Float, "MPI_C_FLOAT_COMPLEX"}, {BuiltinType::Double, "MPI_C_DOUBLE_COMPLEX"}, @@ -173,7 +173,7 @@ static bool isCXXComplexTypeMatching(const TemplateSpecializationType *const Template, std::string &BufferTypeName, StringRef MPIDatatype, const LangOptions &LO) { - static std::multimap<BuiltinType::Kind, std::string> ComplexCXXMatches = { + static const std::multimap<BuiltinType::Kind, std::string> ComplexCXXMatches = { {BuiltinType::Float, "MPI_CXX_FLOAT_COMPLEX"}, {BuiltinType::Double, "MPI_CXX_DOUBLE_COMPLEX"}, {BuiltinType::LongDouble, "MPI_CXX_LONG_DOUBLE_COMPLEX"}}; @@ -264,7 +264,7 @@ void TypeMismatchCheck::check(const MatchFinder::MatchResult &Result) { "MPI_IN_PLACE") return; - StringRef MPIDatatype = + const StringRef MPIDatatype = tooling::fixit::getText(*CE->getArg(DatatypeIdx), *Result.Context); const Type *ArgType = argumentType(CE, BufferIdx); diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h index 5a7db17819967..043f99ee84bdd 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H #include "../ClangTidyCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -49,4 +49,4 @@ class TypeMismatchCheck : public ClangTidyCheck { } // namespace clang::tidy::mpi -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPE_MISMATCH_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MPI_TYPEMISMATCHCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp similarity index 60% rename from clang-tools-extra/clang-tidy/objc/AssertEquals.cpp rename to clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp index 3f1bc17926ba2..a9e6a4b9ea9bb 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.cpp +++ b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "AssertEquals.h" +#include "AssertEqualsCheck.h" +#include "llvm/ADT/StringMap.h" -#include <map> #include <string> using namespace clang::ast_matchers; @@ -16,44 +16,40 @@ using namespace clang::ast_matchers; namespace clang::tidy::objc { // Mapping from `XCTAssert*Equal` to `XCTAssert*EqualObjects` name.
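The hunk that follows swaps a function-local std::map for a file-scope llvm::StringMap. For readers unfamiliar with the ADT, a minimal sketch of the lookup semantics, illustrative only and not from the patch:

    #include "llvm/ADT/StringMap.h"
    #include "llvm/ADT/StringRef.h"

    static const llvm::StringMap<llvm::StringRef> Renames = {
        {"XCTAssertEqual", "XCTAssertEqualObjects"},
        {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"},
    };

    llvm::StringRef RenameFor(llvm::StringRef Macro) {
      // lookup() hashes the key once and returns a default-constructed
      // value (an empty StringRef) when the key is absent.
      return Renames.lookup(Macro);
    }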
-static const std::map<std::string, std::string> &nameMap() { - static std::map<std::string, std::string> Map{ - {"XCTAssertEqual", "XCTAssertEqualObjects"}, - {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"}, +static const llvm::StringMap<StringRef> NameMap{ + {"XCTAssertEqual", "XCTAssertEqualObjects"}, + {"XCTAssertNotEqual", "XCTAssertNotEqualObjects"}, +}; - }; - return Map; -} - -void AssertEquals::registerMatchers(MatchFinder *Finder) { - for (const auto &Pair : nameMap()) { +void AssertEqualsCheck::registerMatchers(MatchFinder *Finder) { + for (const auto &[CurrName, _] : NameMap) { Finder->addMatcher( binaryOperator(anyOf(hasOperatorName("!="), hasOperatorName("==")), - isExpandedFromMacro(Pair.first), + isExpandedFromMacro(std::string(CurrName)), anyOf(hasLHS(hasType(qualType( hasCanonicalType(asString("NSString *"))))), hasRHS(hasType(qualType( - hasCanonicalType(asString("NSString *")))))) - - ) - .bind(Pair.first), + hasCanonicalType(asString("NSString *"))))))) + .bind(CurrName), this); } } -void AssertEquals::check(const ast_matchers::MatchFinder::MatchResult &Result) { - for (const auto &Pair : nameMap()) { - if (const auto *Root = Result.Nodes.getNodeAs<BinaryOperator>(Pair.first)) { - SourceManager *Sm = Result.SourceManager; +void AssertEqualsCheck::check( + const ast_matchers::MatchFinder::MatchResult &Result) { + for (const auto &[CurrName, TargetName] : NameMap) { + if (const auto *Root = Result.Nodes.getNodeAs<BinaryOperator>(CurrName)) { + const SourceManager *Sm = Result.SourceManager; // The macros are nested two levels, so going up twice. auto MacroCallsite = Sm->getImmediateMacroCallerLoc( Sm->getImmediateMacroCallerLoc(Root->getBeginLoc())); - diag(MacroCallsite, "use " + Pair.second + " for comparing objects") + diag(MacroCallsite, + (Twine("use ") + TargetName + " for comparing objects").str()) << FixItHint::CreateReplacement( clang::CharSourceRange::getCharRange( MacroCallsite, - MacroCallsite.getLocWithOffset(Pair.first.length())), - Pair.second); + MacroCallsite.getLocWithOffset(CurrName.size())), + TargetName); } } } diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.h b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h similarity index 67% rename from clang-tools-extra/clang-tidy/objc/AssertEquals.h rename to clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h index 8c21f9bd3a75e..237dabac7114c 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.h +++ b/clang-tools-extra/clang-tidy/objc/AssertEqualsCheck.h @@ -6,22 +6,22 @@ // //===----------------------------------------------------------------------===// -#ifndef THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ -#define THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H #include "../ClangTidyCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" namespace clang::tidy::objc { -/// Warn if XCTAssertEqual() or XCTAssertNotEqual() is used with at least one +/// Warns if XCTAssertEqual() or XCTAssertNotEqual() is used with at least one /// operands of type NSString*.
/// /// For the user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/objc/assert-equals.html -class AssertEquals final : public ClangTidyCheck { +class AssertEqualsCheck final : public ClangTidyCheck { public: - AssertEquals(StringRef Name, ClangTidyContext *Context) + AssertEqualsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.ObjC; @@ -32,4 +32,4 @@ class AssertEquals final : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // THIRD_PARTY_LLVM_LLVM_PROJECT_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_OBJCASSERTEQUALS_H_ +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_ASSERTEQUALSCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/CMakeLists.txt b/clang-tools-extra/clang-tidy/objc/CMakeLists.txt index e28d25deee84c..2908d11e2a018 100644 --- a/clang-tools-extra/clang-tidy/objc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/objc/CMakeLists.txt @@ -4,7 +4,7 @@ set(LLVM_LINK_COMPONENTS ) add_clang_library(clangTidyObjCModule STATIC - AssertEquals.cpp + AssertEqualsCheck.cpp AvoidNSErrorInitCheck.cpp DeallocInCategoryCheck.cpp ForbiddenSubclassingCheck.cpp diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h index 2d238690d627a..6e0a12a00dd56 100644 --- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h +++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/StringRef.h" @@ -36,4 +36,4 @@ class ForbiddenSubclassingCheck : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDEN_SUBCLASSING_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_FORBIDDENSUBCLASSINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp index 1481b2bb24e95..31d098e5034e5 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp @@ -49,7 +49,7 @@ void NSDateFormatterCheck::check(const MatchFinder::MatchResult &Result) { // Callback implementation. 
const auto *StrExpr = Result.Nodes.getNodeAs<Expr>("str_lit"); const StringLiteral *SL = cast<ObjCStringLiteral>(StrExpr)->getString(); - StringRef SR = SL->getString(); + const StringRef SR = SL->getString(); if (!isValidDatePattern(SR)) { diag(StrExpr->getExprLoc(), "invalid date format specifier"); diff --git a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp index 8a32c38a04695..69caaed2b8542 100644 --- a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.cpp @@ -43,7 +43,7 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime, AST_POLYMORPHIC_SUPPORTED_TYPES(ObjCIvarRefExpr, DeclRefExpr, MemberExpr)) { - QualType QT = Node.getType(); + const QualType QT = Node.getType(); return QT->isScalarType() && (QT->getScalarTypeKind() == Type::STK_ObjCObjectPointer || QT->getScalarTypeKind() == Type::STK_BlockPointer) && @@ -55,12 +55,12 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime, static std::optional<FixItHint> fixItHintReplacementForOwnershipString(StringRef Text, CharSourceRange Range, StringRef Ownership) { - size_t Index = Text.find(Ownership); + const size_t Index = Text.find(Ownership); if (Index == StringRef::npos) return std::nullopt; - SourceLocation Begin = Range.getBegin().getLocWithOffset(Index); - SourceLocation End = Begin.getLocWithOffset(Ownership.size()); + const SourceLocation Begin = Range.getBegin().getLocWithOffset(Index); + const SourceLocation End = Begin.getLocWithOffset(Ownership.size()); return FixItHint::CreateReplacement(SourceRange(Begin, End), UnsafeUnretainedText); } @@ -76,7 +76,7 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM, // Currently there is no way to directly get the source range for the // __weak/__strong ObjC lifetime qualifiers, so it's necessary to string // search in the source code.
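fixItHintReplacementForOwnershipString above shows the resulting pattern: find the qualifier as plain text, then convert the string offset back into a source range. A reduced sketch, assuming Text was produced from Range by Lexer::getSourceText and with an illustrative replacement string:

    const size_t Index = Text.find("__weak");
    if (Index != StringRef::npos) {
      const SourceLocation Begin = Range.getBegin().getLocWithOffset(Index);
      const SourceLocation End = Begin.getLocWithOffset(strlen("__weak"));
      // Replace exactly the characters of the found qualifier.
      FixItHint Hint = FixItHint::CreateReplacement(SourceRange(Begin, End),
                                                    "__unsafe_unretained");
    }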
- CharSourceRange Range = Lexer::makeFileCharRange( + const CharSourceRange Range = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(VD->getSourceRange()), SM, LangOpts); if (Range.isInvalid()) { // An invalid range likely means inside a macro, in which case don't supply @@ -84,7 +84,7 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM, return std::nullopt; } - StringRef VarDeclText = Lexer::getSourceText(Range, SM, LangOpts); + const StringRef VarDeclText = Lexer::getSourceText(Range, SM, LangOpts); if (std::optional<FixItHint> Hint = fixItHintReplacementForOwnershipString(VarDeclText, Range, WeakText)) return Hint; diff --git a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp index c21b459964692..411d252e3a4b7 100644 --- a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp @@ -9,7 +9,7 @@ #include "../ClangTidy.h" #include "../ClangTidyModule.h" #include "../ClangTidyModuleRegistry.h" -#include "AssertEquals.h" +#include "AssertEqualsCheck.h" #include "AvoidNSErrorInitCheck.h" #include "DeallocInCategoryCheck.h" #include "ForbiddenSubclassingCheck.h" @@ -29,7 +29,7 @@ class ObjCModule : public ClangTidyModule { void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { CheckFactories.registerCheck<AvoidNSErrorInitCheck>( "objc-avoid-nserror-init"); - CheckFactories.registerCheck<AssertEquals>("objc-assert-equals"); + CheckFactories.registerCheck<AssertEqualsCheck>("objc-assert-equals"); CheckFactories.registerCheck<DeallocInCategoryCheck>( "objc-dealloc-in-category"); diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp index 4a586c8ff0ac9..690572ffbf93a 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp @@ -39,7 +39,7 @@ static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl, auto NewName = Decl->getName().str(); size_t Index = 0; if (Style == CategoryProperty) { - size_t UnderScorePos = Name.find_first_of('_'); + const size_t UnderScorePos = Name.find_first_of('_'); if (UnderScorePos != llvm::StringRef::npos) { Index = UnderScorePos + 1; NewName.replace(0, Index - 1, Name.substr(0, Index - 1).lower()); @@ -74,7 +74,7 @@ static std::string validPropertyNameRegex(bool UsedInMatcher) { // // aRbITRaRyCapS is allowed to avoid generating false positives for names // like isVitaminBSupplement, CProgrammingLanguage, and isBeforeM. - std::string StartMatcher = UsedInMatcher ? "::" : "^"; + const std::string StartMatcher = UsedInMatcher ? "::" : "^"; return StartMatcher + "([a-z]|[A-Z][A-Z0-9])[a-z0-9A-Z]*$"; } @@ -85,7 +85,7 @@ static bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) { } static bool prefixedPropertyNameValid(llvm::StringRef PropertyName) { - size_t Start = PropertyName.find_first_of('_'); + const size_t Start = PropertyName.find_first_of('_'); assert(Start != llvm::StringRef::npos && Start + 1 < PropertyName.size()); auto Prefix = PropertyName.substr(0, Start); if (Prefix.lower() != Prefix) { @@ -115,8 +115,9 @@ void PropertyDeclarationCheck::check(const MatchFinder::MatchResult &Result) { hasCategoryPropertyPrefix(MatchedDecl->getName())) { if (!prefixedPropertyNameValid(MatchedDecl->getName()) || CategoryDecl->IsClassExtension()) { - NamingStyle Style = CategoryDecl->IsClassExtension() ? StandardProperty : CategoryProperty; + const NamingStyle Style = CategoryDecl->IsClassExtension() + ?
StandardProperty + : CategoryProperty; diag(MatchedDecl->getLocation(), "property name '%0' not using lowerCamelCase style or not prefixed " "in a category, according to the Apple Coding Guidelines") diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h index daaebb11673a8..1e185b910cd05 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class PropertyDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::objc -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTY_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_PROPERTYDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp index 3c133ad7dd96b..3887afe703389 100644 --- a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.cpp @@ -90,11 +90,11 @@ void SuperSelfCheck::check(const MatchFinder::MatchResult &Result) { "invoke a superclass initializer?") << Message->getMethodDecl(); - SourceLocation ReceiverLoc = Message->getReceiverRange().getBegin(); + const SourceLocation ReceiverLoc = Message->getReceiverRange().getBegin(); if (ReceiverLoc.isMacroID() || ReceiverLoc.isInvalid()) return; - SourceLocation SelectorLoc = Message->getSelectorStartLoc(); + const SourceLocation SelectorLoc = Message->getSelectorStartLoc(); if (SelectorLoc.isMacroID() || SelectorLoc.isInvalid()) return; diff --git a/clang-tools-extra/clang-tidy/performance/CMakeLists.txt b/clang-tools-extra/clang-tidy/performance/CMakeLists.txt index c6e547c5089fb..9a2f90069edbf 100644 --- a/clang-tools-extra/clang-tidy/performance/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/performance/CMakeLists.txt @@ -23,7 +23,7 @@ add_clang_library(clangTidyPerformanceModule STATIC PerformanceTidyModule.cpp TriviallyDestructibleCheck.cpp TypePromotionInMathFnCheck.cpp - UnnecessaryCopyInitialization.cpp + UnnecessaryCopyInitializationCheck.cpp UnnecessaryValueParamCheck.cpp LINK_LIBS diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h index 2452d2e66ecd4..74067c1f5792d 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class FasterStringFindCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTER_STRING_FIND_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_FASTERSTRINGFINDCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp index a558954b3fe1d..74a76fadffa1c 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp @@ -87,8 +87,8 @@ void ImplicitConversionInLoopCheck::reportAndFix(const ASTContext *Context, const Expr *OperatorCall) { // We only match on const ref, so we should print a const ref version of the // type. - QualType ConstType = OperatorCall->getType().withConst(); - QualType ConstRefType = Context->getLValueReferenceType(ConstType); + const QualType ConstType = OperatorCall->getType().withConst(); + const QualType ConstRefType = Context->getLValueReferenceType(ConstType); const char Message[] = "the type of the loop variable %0 is different from the one returned " "by the iterator and generates an implicit conversion; you can either " diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h index 786081a351070..4690caa8b5238 100644 --- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h +++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H #include "../ClangTidyCheck.h" @@ -33,4 +33,4 @@ class ImplicitConversionInLoopCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICIT_CONVERSION_IN_LOOP_CHECK_H_ +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_IMPLICITCONVERSIONINLOOPCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp index cd128c3556725..b57fdb2b3ffee 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp @@ -71,8 +71,8 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { // Store if the key type of the container is compatible with the value // that is searched for. 
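For context on InefficientAlgorithmCheck, whose fix-it is assembled in the hunks around here: calling a generic algorithm on an associative container scans linearly even though the container offers logarithmic lookup. The transformation the check suggests, on a minimal example:

    #include <algorithm>
    #include <set>

    bool Contains(const std::set<int> &S, int V) {
      // flagged: std::find(S.begin(), S.end(), V) walks every element, O(n)
      // suggested replacement: the member function uses the tree, O(log n)
      return S.find(V) != S.end();
    }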
- QualType ValueType = AlgCall->getArg(2)->getType(); - QualType KeyType = + const QualType ValueType = AlgCall->getArg(2)->getType(); + const QualType KeyType = IneffCont->getTemplateArgs()[0].getAsType().getCanonicalType(); const bool CompatibleTypes = areTypesCompatible(KeyType, ValueType); @@ -104,8 +104,8 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { const auto *IneffContExpr = Result.Nodes.getNodeAs<Expr>("IneffContExpr"); FixItHint Hint; - SourceManager &SM = *Result.SourceManager; - LangOptions LangOpts = getLangOpts(); + const SourceManager &SM = *Result.SourceManager; + const LangOptions LangOpts = getLangOpts(); CharSourceRange CallRange = CharSourceRange::getTokenRange(AlgCall->getSourceRange()); @@ -128,13 +128,13 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { } if (!CallRange.getBegin().isMacroID() && !Maplike && CompatibleTypes) { - StringRef ContainerText = Lexer::getSourceText( + const StringRef ContainerText = Lexer::getSourceText( CharSourceRange::getTokenRange(IneffContExpr->getSourceRange()), SM, LangOpts); - StringRef ParamText = Lexer::getSourceText( + const StringRef ParamText = Lexer::getSourceText( CharSourceRange::getTokenRange(AlgParam->getSourceRange()), SM, LangOpts); - std::string ReplacementText = + const std::string ReplacementText = (llvm::Twine(ContainerText) + (PtrToContainer ? "->" : ".") + AlgDecl->getName() + "(" + ParamText + ")") .str(); diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h index b82a838e737d8..0fae10f2a5f17 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H #include "../ClangTidyCheck.h" @@ -35,4 +35,4 @@ class InefficientStringConcatenationCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTSTRINGCONCATENATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp index 4a8f292b726ee..a59ab333e6f10 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp @@ -208,7 +208,7 @@ void InefficientVectorOperationCheck::check( if (!TargetVarDecl) TargetVarDecl = ProtoVarDecl; - llvm::SmallPtrSet<const DeclRefExpr *, 16> AllVarRefs = + const llvm::SmallPtrSet<const DeclRefExpr *, 16> AllVarRefs = utils::decl_ref_expr::allDeclRefExprs(*TargetVarDecl, *LoopParent, *Context); for (const auto *Ref : AllVarRefs) { @@ -231,12 +231,12 @@ void InefficientVectorOperationCheck::check( } else { llvm::StringRef FieldName = ProtoAddFieldCall->getMethodDecl()->getName(); FieldName.consume_front("add_"); - std::string MutableFieldName =
("mutable_" + FieldName).str(); + const std::string MutableFieldName = ("mutable_" + FieldName).str(); PartialReserveStmt = "." + MutableFieldName + "()->Reserve"; // e.g., ".mutable_xxx()->Reserve" } - llvm::StringRef VarName = Lexer::getSourceText( + const llvm::StringRef VarName = Lexer::getSourceText( CharSourceRange::getTokenRange( AppendCall->getImplicitObjectArgument()->getSourceRange()), SM, Context->getLangOpts()); @@ -246,14 +246,14 @@ void InefficientVectorOperationCheck::check( if (RangeLoop) { // Get the range-expression in a for-range statement represented as // `for (range-declarator: range-expression)`. - StringRef RangeInitExpName = + const StringRef RangeInitExpName = Lexer::getSourceText(CharSourceRange::getTokenRange( RangeLoop->getRangeInit()->getSourceRange()), SM, Context->getLangOpts()); ReserveSize = (RangeInitExpName + ".size()").str(); } else if (ForLoop) { // Handle counter-based loop cases. - StringRef LoopEndSource = Lexer::getSourceText( + const StringRef LoopEndSource = Lexer::getSourceText( CharSourceRange::getTokenRange(LoopEndExpr->getSourceRange()), SM, Context->getLangOpts()); ReserveSize = std::string(LoopEndSource); @@ -264,7 +264,7 @@ void InefficientVectorOperationCheck::check( "container capacity before the loop") << AppendCall->getMethodDecl()->getDeclName(); if (!ReserveSize.empty()) { - std::string ReserveStmt = + const std::string ReserveStmt = (VarName + PartialReserveStmt + "(" + ReserveSize + ");\n").str(); Diag << FixItHint::CreateInsertion(LoopStmt->getBeginLoc(), ReserveStmt); } diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h index 18f7c1937edf7..5f3b88f51d626 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H #include "../ClangTidyCheck.h" @@ -44,4 +44,4 @@ class InefficientVectorOperationCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENT_VECTOR_OPERATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_INEFFICIENTVECTOROPERATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp index 854f09aeb0b51..4d26c39fcbd18 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp @@ -19,10 +19,10 @@ static void replaceCallWithArg(const CallExpr *Call, DiagnosticBuilder &Diag, const LangOptions &LangOpts) { const Expr *Arg = Call->getArg(0); - CharSourceRange BeforeArgumentsRange = Lexer::makeFileCharRange( + const CharSourceRange BeforeArgumentsRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(Call->getBeginLoc(), Arg->getBeginLoc()), SM, LangOpts); - CharSourceRange AfterArgumentsRange = Lexer::makeFileCharRange( + const CharSourceRange AfterArgumentsRange = 
Lexer::makeFileCharRange( CharSourceRange::getCharRange(Call->getEndLoc(), Call->getEndLoc().getLocWithOffset(1)), SM, LangOpts); @@ -114,17 +114,18 @@ void MoveConstArgCheck::check(const MatchFinder::MatchResult &Result) { const Expr *Arg = CallMove->getArg(0); const QualType ArgType = Arg->getType().getCanonicalType(); - SourceManager &SM = Result.Context->getSourceManager(); + const SourceManager &SM = Result.Context->getSourceManager(); - CharSourceRange MoveRange = + const CharSourceRange MoveRange = CharSourceRange::getCharRange(CallMove->getSourceRange()); - CharSourceRange FileMoveRange = + const CharSourceRange FileMoveRange = Lexer::makeFileCharRange(MoveRange, SM, getLangOpts()); if (!FileMoveRange.isValid()) return; - bool IsConstArg = ArgType.isConstQualified(); - bool IsTriviallyCopyable = ArgType.isTriviallyCopyableType(*Result.Context); + const bool IsConstArg = ArgType.isConstQualified(); + const bool IsTriviallyCopyable = + ArgType.isTriviallyCopyableType(*Result.Context); if (IsConstArg || IsTriviallyCopyable) { if (const CXXRecordDecl *R = ArgType->getAsCXXRecordDecl()) { @@ -143,10 +144,10 @@ void MoveConstArgCheck::check(const MatchFinder::MatchResult &Result) { if (!IsConstArg && IsTriviallyCopyable && !CheckTriviallyCopyableMove) return; - bool IsVariable = isa<DeclRefExpr>(Arg); + const bool IsVariable = isa<DeclRefExpr>(Arg); // std::move shouldn't be removed when an lvalue wrapped by std::move is // passed to the function with an rvalue reference parameter. - bool IsRVRefParam = + const bool IsRVRefParam = isRValueReferenceParam(ReceivingExpr, InvocationParmType, Arg); const auto *Var = IsVariable ? dyn_cast<DeclRefExpr>(Arg)->getDecl() : nullptr; diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h index 9f67f64857168..ff1d67ba77a9e 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H #include "../ClangTidyCheck.h" #include "llvm/ADT/DenseSet.h" @@ -43,4 +43,4 @@ class MoveConstArgCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_MOVECONSTANTARGUMENTCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_MOVECONSTARGCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp index 44f6d20ac2be3..d8cdc0d101d81 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp @@ -39,7 +39,7 @@ void MoveConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { // Do not diagnose if the expression used to perform the initialization is a // trivially-copyable type.
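The MoveConstArgCheck hunks above rest on a subtle rule worth spelling out: std::move on a const object silently degrades to a copy, because T&& cannot bind to const T and overload resolution falls back to the copy constructor. A minimal illustration:

    #include <string>
    #include <utility>

    std::string Sink(std::string S) { return S; }

    void Caller() {
      const std::string Name = "example";
      // performance-move-const-arg: this move is a no-op; the const
      // argument makes Sink receive a copy despite the std::move.
      std::string Copy = Sink(std::move(Name));
      (void)Copy;
    }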
- QualType QT = Initializer->getInit()->getType(); + const QualType QT = Initializer->getInit()->getType(); if (QT.isTriviallyCopyableType(*Result.Context)) return; diff --git a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h index 56a1e4af010a2..6ed30255e0f96 100644 --- a/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoexceptFunctionBaseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H #include "../ClangTidyCheck.h" #include "../utils/ExceptionSpecAnalyzer.h" @@ -46,4 +46,4 @@ class NoexceptFunctionBaseCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_NOEXCEPTFUNCTIONBASECHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp index ae15208ae3dc5..3497ea7316c6b 100644 --- a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp @@ -26,7 +26,7 @@ #include "NoexceptSwapCheck.h" #include "TriviallyDestructibleCheck.h" #include "TypePromotionInMathFnCheck.h" -#include "UnnecessaryCopyInitialization.h" +#include "UnnecessaryCopyInitializationCheck.h" #include "UnnecessaryValueParamCheck.h" namespace clang::tidy { @@ -66,7 +66,7 @@ class PerformanceModule : public ClangTidyModule { "performance-trivially-destructible"); CheckFactories.registerCheck<TypePromotionInMathFnCheck>( "performance-type-promotion-in-math-fn"); - CheckFactories.registerCheck<UnnecessaryCopyInitialization>( + CheckFactories.registerCheck<UnnecessaryCopyInitializationCheck>( "performance-unnecessary-copy-initialization"); CheckFactories.registerCheck<UnnecessaryValueParamCheck>( "performance-unnecessary-value-param"); diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp index 0db66c0d5803d..2f54b17367b06 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.cpp @@ -24,7 +24,7 @@ AST_MATCHER(Decl, isFirstDecl) { return Node.isFirstDecl(); } AST_MATCHER_P(CXXRecordDecl, hasBase, Matcher<QualType>, InnerMatcher) { for (const CXXBaseSpecifier &BaseSpec : Node.bases()) { - QualType BaseType = BaseSpec.getType(); + const QualType BaseType = BaseSpec.getType(); if (InnerMatcher.matches(BaseType, Finder, Builder)) return true; } @@ -50,7 +50,7 @@ void TriviallyDestructibleCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs<CXXRecordDecl>("decl"); // Get locations of both first and out-of-line declarations.
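What performance-trivially-destructible flags, shown on a hypothetical class: a destructor defaulted out of line keeps the type non-trivially destructible in every other translation unit, while defaulting it at the declaration restores triviality:

    struct Widget {        // flagged
      ~Widget();           // in the .cpp file: Widget::~Widget() = default;
      int X;
    };

    struct WidgetFixed {   // after the fix-it
      ~WidgetFixed() = default; // trivially destructible again
      int X;
    };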
- SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; const auto *FirstDecl = cast<CXXRecordDecl>(MatchedDecl->getFirstDecl()); const SourceLocation FirstDeclEnd = utils::lexer::findNextTerminator( FirstDecl->getEndLoc(), SM, getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp index 096ca57ee8e22..a462c55888c73 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp @@ -155,18 +155,18 @@ void TypePromotionInMathFnCheck::check(const MatchFinder::MatchResult &Result) { const auto *Call = Result.Nodes.getNodeAs<CallExpr>("call"); assert(Call != nullptr); - StringRef OldFnName = Call->getDirectCallee()->getName(); + const StringRef OldFnName = Call->getDirectCallee()->getName(); // In C++ mode, we prefer std::foo to ::foof. But some of these suggestions // are only valid in C++11 and newer. - static llvm::StringSet<> Cpp11OnlyFns = { + static const llvm::StringSet<> Cpp11OnlyFns = { "acosh", "asinh", "atanh", "cbrt", "copysign", "erf", "erfc", "exp2", "expm1", "fdim", "fma", "fmax", "fmin", "hypot", "ilogb", "lgamma", "llrint", "llround", "log1p", "log2", "logb", "lrint", "lround", "nearbyint", "nextafter", "nexttoward", "remainder", "remquo", "rint", "round", "scalbln", "scalbn", "tgamma", "trunc"}; - bool StdFnRequiresCpp11 = Cpp11OnlyFns.contains(OldFnName); + const bool StdFnRequiresCpp11 = Cpp11OnlyFns.contains(OldFnName); std::string NewFnName; bool FnInCmath = false; diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index cf74f80006274..21a7f4d040cd8 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -39,4 +39,4 @@ class TypePromotionInMathFnCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPE_PROMOTION_IN_MATH_FN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_TYPEPROMOTIONINMATHFNCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp similarity index 94% rename from clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp rename to clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp index 591836667a2ba..e6fe857779458 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "UnnecessaryCopyInitialization.h" +#include "UnnecessaryCopyInitializationCheck.h"
#include "../utils/DeclRefExprUtils.h" #include "../utils/FixItHintUtils.h" #include "../utils/LexerUtils.h" @@ -46,7 +46,7 @@ static std::optional firstLocAfterNewLine(SourceLocation Loc, if (Invalid) { return std::nullopt; } - size_t Offset = std::strcspn(TextAfter, "\n"); + const size_t Offset = std::strcspn(TextAfter, "\n"); return Loc.getLocWithOffset(TextAfter[Offset] == '\0' ? Offset : Offset + 1); } @@ -148,7 +148,7 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst, static bool isInitializingVariableImmutable( const VarDecl &InitializingVar, const Stmt &BlockStmt, ASTContext &Context, const std::vector &ExcludedContainerTypes) { - QualType T = InitializingVar.getType().getCanonicalType(); + const QualType T = InitializingVar.getType().getCanonicalType(); if (!isOnlyUsedAsConst(InitializingVar, BlockStmt, Context, T->isPointerType() ? 1 : 0)) return false; @@ -227,7 +227,7 @@ static QualType constructorArgumentType(const VarDecl *OldVar, return MethodDecl->getReturnType(); } -UnnecessaryCopyInitialization::UnnecessaryCopyInitialization( +UnnecessaryCopyInitializationCheck::UnnecessaryCopyInitializationCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), AllowedTypes( @@ -235,7 +235,7 @@ UnnecessaryCopyInitialization::UnnecessaryCopyInitialization( ExcludedContainerTypes(utils::options::parseStringList( Options.get("ExcludedContainerTypes", ""))) {} -void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) { +void UnnecessaryCopyInitializationCheck::registerMatchers(MatchFinder *Finder) { auto LocalVarCopiedFrom = [this](const ast_matchers::internal::Matcher &CopyCtorArg) { return compoundStmt( @@ -276,7 +276,7 @@ void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) { this); } -void UnnecessaryCopyInitialization::check( +void UnnecessaryCopyInitializationCheck::check( const MatchFinder::MatchResult &Result) { const auto &NewVar = *Result.Nodes.getNodeAs("newVarDecl"); const auto &BlockStmt = *Result.Nodes.getNodeAs("blockStmt"); @@ -297,7 +297,7 @@ void UnnecessaryCopyInitialization::check( const auto *ObjectArg = Result.Nodes.getNodeAs(ObjectArgId); const auto *CtorCall = Result.Nodes.getNodeAs("ctorCall"); - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); // A constructor that looks like T(const T& t, bool arg = false) counts as a // copy only when it is called with default arguments for the arguments after @@ -325,9 +325,9 @@ void UnnecessaryCopyInitialization::check( } } -void UnnecessaryCopyInitialization::handleCopyFromMethodReturn( +void UnnecessaryCopyInitializationCheck::handleCopyFromMethodReturn( const CheckContext &Ctx, const VarDecl *ObjectArg) { - bool IsConstQualified = Ctx.Var.getType().isConstQualified(); + const bool IsConstQualified = Ctx.Var.getType().isConstQualified(); if (!IsConstQualified && !Ctx.IsVarOnlyUsedAsConst) return; if (ObjectArg != nullptr && @@ -337,7 +337,7 @@ void UnnecessaryCopyInitialization::handleCopyFromMethodReturn( diagnoseCopyFromMethodReturn(Ctx); } -void UnnecessaryCopyInitialization::handleCopyFromLocalVar( +void UnnecessaryCopyInitializationCheck::handleCopyFromLocalVar( const CheckContext &Ctx, const VarDecl &OldVar) { if (!Ctx.IsVarOnlyUsedAsConst || !isInitializingVariableImmutable(OldVar, Ctx.BlockStmt, Ctx.ASTCtx, @@ -346,7 +346,7 @@ void UnnecessaryCopyInitialization::handleCopyFromLocalVar( diagnoseCopyFromLocalVar(Ctx, OldVar); } -void 
UnnecessaryCopyInitialization::diagnoseCopyFromMethodReturn( +void UnnecessaryCopyInitializationCheck::diagnoseCopyFromMethodReturn( const CheckContext &Ctx) { auto Diagnostic = diag(Ctx.Var.getLocation(), @@ -360,7 +360,7 @@ void UnnecessaryCopyInitialization::diagnoseCopyFromMethodReturn( maybeIssueFixes(Ctx, Diagnostic); } -void UnnecessaryCopyInitialization::diagnoseCopyFromLocalVar( +void UnnecessaryCopyInitializationCheck::diagnoseCopyFromLocalVar( const CheckContext &Ctx, const VarDecl &OldVar) { auto Diagnostic = diag(Ctx.Var.getLocation(), @@ -372,7 +372,7 @@ void UnnecessaryCopyInitialization::diagnoseCopyFromLocalVar( maybeIssueFixes(Ctx, Diagnostic); } -void UnnecessaryCopyInitialization::maybeIssueFixes( +void UnnecessaryCopyInitializationCheck::maybeIssueFixes( const CheckContext &Ctx, DiagnosticBuilder &Diagnostic) { if (Ctx.IssueFix) { if (Ctx.IsVarUnused) @@ -382,7 +382,7 @@ void UnnecessaryCopyInitialization::maybeIssueFixes( } } -void UnnecessaryCopyInitialization::storeOptions( +void UnnecessaryCopyInitializationCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "AllowedTypes", utils::options::serializeStringList(AllowedTypes)); diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h similarity index 87% rename from clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h rename to clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h index 66231889b8014..89957a5ed09e7 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitializationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H #include "../ClangTidyCheck.h" #include "clang/AST/Decl.h" @@ -22,9 +22,12 @@ namespace clang::tidy::performance { // The check currently only understands a subset of variables that are // guaranteed to outlive the const reference returned, namely: const variables, // const references, and const pointers to const. 
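The situation described in the comment above, reduced to a hypothetical example of what performance-unnecessary-copy-initialization reports and how the suggested fix reads:

    #include <string>
    #include <vector>

    void Use(const std::vector<std::string> &Source) {
      const std::vector<std::string> Copy = Source; // flagged: only read below
      const std::vector<std::string> &Ref = Source; // suggested: bind a const ref
      (void)Copy;
      (void)Ref;
    }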
-class UnnecessaryCopyInitialization : public ClangTidyCheck { +// +// For the user-facing documentation see: +// https://clang.llvm.org/extra/clang-tidy/checks/performance/unnecessary-copy-initialization.html +class UnnecessaryCopyInitializationCheck : public ClangTidyCheck { public: - UnnecessaryCopyInitialization(StringRef Name, ClangTidyContext *Context); + UnnecessaryCopyInitializationCheck(StringRef Name, ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; } @@ -64,4 +67,4 @@ class UnnecessaryCopyInitialization : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_COPY_INITIALIZATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYCOPYINITIALIZATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 3f5b43feca1ad..d62629713cb41 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -73,7 +73,7 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { const auto *Param = Result.Nodes.getNodeAs<ParmVarDecl>("param"); const auto *Function = Result.Nodes.getNodeAs<FunctionDecl>("functionDecl"); - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); FunctionParmMutationAnalyzer *Analyzer = FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer( diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index d59fb4105381e..22df689298fbb 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -51,4 +51,4 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { } // namespace clang::tidy::performance -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARY_VALUE_PARAM_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PERFORMANCE_UNNECESSARYVALUEPARAMCHECK_H diff --git a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp index 195418d2e2ca2..172b9185519c5 100644 --- a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp +++ b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp @@ -55,13 +55,13 @@ class ClangTidyPluginAction : public PluginASTAction { bool ParseArgs(const CompilerInstance &, const std::vector<std::string> &Args) override { - ClangTidyGlobalOptions GlobalOptions; - ClangTidyOptions DefaultOptions; + const ClangTidyGlobalOptions GlobalOptions; + const ClangTidyOptions DefaultOptions; ClangTidyOptions OverrideOptions; // Parse the extra command line args. // FIXME: This is very limited at the moment.
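The argument loop that follows strips the '-checks=' prefix with strlen-based substr arithmetic; the same parse written via StringRef::consume_front, as a hypothetical alternative rather than what the patch does:

    for (llvm::StringRef Arg : Args) {
      // consume_front() drops the prefix and returns true on a match.
      if (Arg.consume_front("-checks="))
        OverrideOptions.Checks = Arg.str();
    }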
- for (StringRef Arg : Args) + for (const StringRef Arg : Args) if (Arg.starts_with("-checks=")) OverrideOptions.Checks = std::string(Arg.substr(strlen("-checks="))); diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index 5174f56207b54..4225c3e15af98 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -43,8 +43,9 @@ void RestrictedIncludesPPCallbacks::EndOfMainFile() { for (const auto &Include : FileDirectives) { // Fetch the length of the include statement from the start to just after // the newline, for finding the end (including the newline). - unsigned ToLen = std::strcspn(SM.getCharacterData(Include.Loc), "\n") + 1; - CharSourceRange ToRange = CharSourceRange::getCharRange( + const unsigned ToLen = + std::strcspn(SM.getCharacterData(Include.Loc), "\n") + 1; + const CharSourceRange ToRange = CharSourceRange::getCharRange( Include.Loc, Include.Loc.getLocWithOffset(ToLen)); if (!Include.IsInMainFile) { diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index e37f89336bc92..d66149aceae91 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H #include "../ClangTidyCheck.h" #include "../GlobList.h" @@ -23,7 +23,7 @@ namespace clang::tidy::portability { class RestrictSystemIncludesCheck : public ClangTidyCheck { public: RestrictSystemIncludesCheck(StringRef Name, ClangTidyContext *Context, - std::string DefaultAllowedIncludes = "*") + StringRef DefaultAllowedIncludes = "*") : ClangTidyCheck(Name, Context), AllowedIncludes(Options.get("Includes", DefaultAllowedIncludes)), AllowedIncludesGlobList(AllowedIncludes) {} @@ -36,7 +36,7 @@ class RestrictSystemIncludesCheck : public ClangTidyCheck { } private: - std::string AllowedIncludes; + StringRef AllowedIncludes; GlobList AllowedIncludesGlobList; }; @@ -79,4 +79,4 @@ class RestrictedIncludesPPCallbacks : public PPCallbacks { } // namespace clang::tidy::portability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTSYSTEMINCLUDESCHECK_H diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp index d90b09abb1be8..fb4b22c63971e 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.cpp @@ -104,9 +104,9 @@ void SIMDIntrinsicsCheck::check(const MatchFinder::MatchResult &Result) { if (!Callee) return; - StringRef Old = Callee->getName(); + const StringRef Old = Callee->getName(); StringRef New; - llvm::Triple::ArchType Arch = + const llvm::Triple::ArchType Arch = Result.Context->getTargetInfo().getTriple().getArch(); // We warn 
or suggest if this SIMD intrinsic function has a std::simd diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h index db2d2307b1943..addcecbcb9370 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class SIMDIntrinsicsCheck : public ClangTidyCheck { } // namespace clang::tidy::portability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_SIMD_INTRINSICS_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_SIMDINTRINSICSCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp similarity index 89% rename from clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp rename to clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp index 02fe913ee7918..affcea441ada7 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "AvoidConstParamsInDecls.h" +#include "AvoidConstParamsInDeclsCheck.h" #include "../utils/LexerUtils.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -27,7 +27,7 @@ static std::optional findConstToRemove(const ParmVarDecl &Param, const MatchFinder::MatchResult &Result) { - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getTokenRange(getTypeRange(Param)), *Result.SourceManager, Result.Context->getLangOpts()); @@ -38,11 +38,12 @@ findConstToRemove(const ParmVarDecl &Param, tok::kw_const, FileRange, *Result.Context, *Result.SourceManager); } -void AvoidConstParamsInDecls::storeOptions(ClangTidyOptions::OptionMap &Opts) { +void AvoidConstParamsInDeclsCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); } -void AvoidConstParamsInDecls::registerMatchers(MatchFinder *Finder) { +void AvoidConstParamsInDeclsCheck::registerMatchers(MatchFinder *Finder) { const auto ConstParamDecl = parmVarDecl(hasType(qualType(isConstQualified()))).bind("param"); Finder->addMatcher(functionDecl(unless(isDefinition()), @@ -51,7 +52,8 @@ void AvoidConstParamsInDecls::registerMatchers(MatchFinder *Finder) { this); } -void AvoidConstParamsInDecls::check(const MatchFinder::MatchResult &Result) { +void AvoidConstParamsInDeclsCheck::check( + const MatchFinder::MatchResult &Result) { const auto *Func = Result.Nodes.getNodeAs("func"); const auto *Param = Result.Nodes.getNodeAs("param"); diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h similarity index 66% rename from clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h rename to 
clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h index 1dd28fde217ed..467a9a48ef0b7 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDeclsCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H #include "../ClangTidyCheck.h" @@ -15,9 +15,12 @@ namespace clang::tidy::readability { // Detect function declarations that have const value parameters and discourage // them. -class AvoidConstParamsInDecls : public ClangTidyCheck { +// +// For the user-facing documentation see: +// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-const-params-in-decls.html +class AvoidConstParamsInDeclsCheck : public ClangTidyCheck { public: - AvoidConstParamsInDecls(StringRef Name, ClangTidyContext *Context) + AvoidConstParamsInDeclsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), IgnoreMacros(Options.get("IgnoreMacros", true)) {} @@ -34,4 +37,4 @@ class AvoidConstParamsInDecls : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_CONST_PARAMS_IN_DECLS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDCONSTPARAMSINDECLSCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h index 260c84304e138..0e729ecb0134f 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class AvoidNestedConditionalOperatorCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOID_NESTED_CONDITIONAL_OPERATOR_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDNESTEDCONDITIONALOPERATORCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp index 40a4fa114681e..2b31281bb4a63 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp @@ -47,9 +47,9 @@ void AvoidReturnWithVoidValueCheck::check( Result.Nodes.getNodeAs("compound_parent"); if (!StrictMode && !SurroundingBlock) return; - DiagnosticBuilder Diag = diag(VoidReturn->getBeginLoc(), - "return statement within a void function " - "should not have a specified return value"); 
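Context for the AvoidReturnWithVoidValueCheck hunk in flight here: the diagnostic wording concerns code like the following invented example.

#include <iostream>

void logFailure() { std::cout << "failed\n"; }

void run(bool Fail) {
  if (Fail) {
    return logFailure(); // flagged: a void function returning a (void) value
  }
}

void runFixed(bool Fail) {
  if (Fail) {
    logFailure(); // preferred: call the function, then return plainly
    return;
  }
}

int main() {
  run(true);
  runFixed(true);
}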
+ const DiagnosticBuilder Diag = diag( + VoidReturn->getBeginLoc(), "return statement within a void function " + "should not have a specified return value"); const SourceLocation SemicolonPos = utils::lexer::findNextTerminator( VoidReturn->getEndLoc(), *Result.SourceManager, getLangOpts()); if (SemicolonPos.isInvalid()) diff --git a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp index c53c70667dbbc..631bb14753163 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.cpp @@ -40,7 +40,7 @@ struct AvoidUnconditionalPreprocessorIfPPCallbacks : public PPCallbacks { bool isImmutable(SourceManager &SM, const LangOptions &LangOpts, SourceRange ConditionRange) { - SourceLocation Loc = ConditionRange.getBegin(); + const SourceLocation Loc = ConditionRange.getBegin(); if (Loc.isMacroID()) return false; diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp index 1952e14d1fc3d..2b55bb819da9c 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp @@ -20,7 +20,8 @@ namespace clang::tidy::readability { static tok::TokenKind getTokenKind(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { Token Tok; - SourceLocation Beginning = Lexer::GetBeginningOfToken(Loc, SM, LangOpts); + const SourceLocation Beginning = + Lexer::GetBeginningOfToken(Loc, SM, LangOpts); const bool Invalid = Lexer::getRawToken(Beginning, Tok, SM, LangOpts); assert(!Invalid && "Expected a valid token."); @@ -38,7 +39,7 @@ forwardSkipWhitespaceAndComments(SourceLocation Loc, const SourceManager &SM, while (isWhitespace(*SM.getCharacterData(Loc))) Loc = Loc.getLocWithOffset(1); - tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); if (TokKind != tok::comment) return Loc; @@ -80,7 +81,8 @@ void BracesAroundStatementsCheck::check( } else if (const auto *S = Result.Nodes.getNodeAs("do")) { checkStmt(Result, S->getBody(), S->getDoLoc(), S->getWhileLoc()); } else if (const auto *S = Result.Nodes.getNodeAs("while")) { - SourceLocation StartLoc = findRParenLoc(S, SM, Context->getLangOpts()); + const SourceLocation StartLoc = + findRParenLoc(S, SM, Context->getLangOpts()); if (StartLoc.isInvalid()) return; checkStmt(Result, S->getBody(), StartLoc); @@ -89,12 +91,14 @@ void BracesAroundStatementsCheck::check( if (S->isConsteval()) return; - SourceLocation StartLoc = findRParenLoc(S, SM, Context->getLangOpts()); + const SourceLocation StartLoc = + findRParenLoc(S, SM, Context->getLangOpts()); if (StartLoc.isInvalid()) return; if (ForceBracesStmts.erase(S)) ForceBracesStmts.insert(S->getThen()); - bool BracedIf = checkStmt(Result, S->getThen(), StartLoc, S->getElseLoc()); + const bool BracedIf = + checkStmt(Result, S->getThen(), StartLoc, S->getElseLoc()); const Stmt *Else = S->getElse(); if (Else && BracedIf) ForceBracesStmts.insert(Else); @@ -125,7 +129,7 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, return {}; } - SourceLocation PastCondEndLoc = + const SourceLocation PastCondEndLoc = Lexer::getLocForEndOfToken(CondEndLoc, 0, SM, LangOpts); if (PastCondEndLoc.isInvalid()) 
return {}; @@ -133,7 +137,7 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, forwardSkipWhitespaceAndComments(PastCondEndLoc, SM, LangOpts); if (RParenLoc.isInvalid()) return {}; - tok::TokenKind TokKind = getTokenKind(RParenLoc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(RParenLoc, SM, LangOpts); if (TokKind != tok::r_paren) return {}; return RParenLoc; diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt index 91e9354d454d2..161a0d96caf41 100644 --- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt @@ -5,7 +5,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyReadabilityModule STATIC AmbiguousSmartptrResetCallCheck.cpp - AvoidConstParamsInDecls.cpp + AvoidConstParamsInDeclsCheck.cpp AvoidNestedConditionalOperatorCheck.cpp AvoidReturnWithVoidValueCheck.cpp AvoidUnconditionalPreprocessorIfCheck.cpp @@ -14,7 +14,7 @@ add_clang_library(clangTidyReadabilityModule STATIC ContainerContainsCheck.cpp ContainerDataPointerCheck.cpp ContainerSizeEmptyCheck.cpp - ConvertMemberFunctionsToStatic.cpp + ConvertMemberFunctionsToStaticCheck.cpp DeleteNullPointerCheck.cpp DuplicateIncludeCheck.cpp ElseAfterReturnCheck.cpp diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp index 6ccd933ff4c21..cfdf0e9c4a331 100644 --- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.cpp @@ -32,13 +32,13 @@ findConstToRemove(const FunctionDecl *Def, // written in the source (for out-of-line declarations). A FunctionDecl's // "location" is the start of its name, so, when the name is unqualified, we // use `getLocation()`. - SourceLocation NameBeginLoc = Def->getQualifier() - ? Def->getQualifierLoc().getBeginLoc() - : Def->getLocation(); + const SourceLocation NameBeginLoc = Def->getQualifier() + ? Def->getQualifierLoc().getBeginLoc() + : Def->getLocation(); // Since either of the locs can be in a macro, use `makeFileCharRange` to be // sure that we have a consistent `CharSourceRange`, located entirely in the // source file. - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(Def->getBeginLoc(), NameBeginLoc), *Result.SourceManager, Result.Context->getLangOpts()); @@ -118,12 +118,12 @@ void ConstReturnTypeCheck::check(const MatchFinder::MatchResult &Result) { (Def->getBeginLoc().isMacroID() || Def->getEndLoc().isMacroID())) return; - CheckResult CR = checkDef(Def, Result); + const CheckResult CR = checkDef(Def, Result); { // Clang only supports one in-flight diagnostic at a time. So, delimit the // scope of `Diagnostic` to allow further diagnostics after the scope. We // use `getInnerLocStart` to get the start of the return type. 
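The diagnostic assembled just below concerns top-level const-qualified return types. A self-contained illustration with invented names: the qualifier does not constrain callers, and it blocks moving from the returned temporary.

#include <string>

// Flagged: top-level 'const' on a by-value return type.
const std::string makeGreeting() { return "hello"; }

// Preferred spelling.
std::string makeGreetingFixed() { return "hello"; }

int main() {
  std::string S = makeGreeting();      // copies: cannot move from a const temporary
  std::string T = makeGreetingFixed(); // moves
  return S == T ? 0 : 1;
}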
- DiagnosticBuilder Diagnostic = + const DiagnosticBuilder Diagnostic = diag(Def->getInnerLocStart(), "return type %0 is 'const'-qualified at the top level, which may " "reduce code readability without improving const correctness") diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp index 850ef86c85b17..efcf13d63b4ff 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.cpp @@ -47,9 +47,8 @@ void ContainerContainsCheck::registerMatchers(MatchFinder *Finder) { const auto StringNpos = anyOf(declRefExpr(to(varDecl(hasName("npos")))), memberExpr(member(hasName("npos")))); - auto AddSimpleMatcher = [&](auto Matcher) { - Finder->addMatcher( - traverse(TK_IgnoreUnlessSpelledInSource, std::move(Matcher)), this); + auto AddSimpleMatcher = [&](const auto &Matcher) { + Finder->addMatcher(traverse(TK_IgnoreUnlessSpelledInSource, Matcher), this); }; // Find membership tests which use `count()`. @@ -110,7 +109,7 @@ void ContainerContainsCheck::check(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs("negativeComparison"); assert((!PositiveComparison || !NegativeComparison) && "only one of PositiveComparison or NegativeComparison should be set"); - bool Negated = NegativeComparison != nullptr; + const bool Negated = NegativeComparison != nullptr; const auto *Comparison = Negated ? NegativeComparison : PositiveComparison; const StringRef ContainsFunName = Result.Nodes.getNodeAs("contains_fun")->getName(); @@ -121,7 +120,7 @@ void ContainerContainsCheck::check(const MatchFinder::MatchResult &Result) { << ContainsFunName; // Don't fix it if it's in a macro invocation. Leave fixing it to the user. - SourceLocation FuncCallLoc = Comparison->getEndLoc(); + const SourceLocation FuncCallLoc = Comparison->getEndLoc(); if (!FuncCallLoc.isValid() || FuncCallLoc.isMacroID()) return; diff --git a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp index 11756d10a8221..e308aefcf156a 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerDataPointerCheck.cpp @@ -101,14 +101,17 @@ void ContainerDataPointerCheck::check(const MatchFinder::MatchResult &Result) { else if (ACE) CE = ACE; - SourceRange SrcRange = CE->getSourceRange(); + const SourceRange SrcRange = CE->getSourceRange(); std::string ReplacementText{ Lexer::getSourceText(CharSourceRange::getTokenRange(SrcRange), *Result.SourceManager, getLangOpts())}; - if (!isa(CE)) + const auto *OpCall = dyn_cast(CE); + const bool NeedsParens = + OpCall ? 
(OpCall->getOperator() != OO_Subscript) + : !isa(CE); + if (NeedsParens) ReplacementText = "(" + ReplacementText + ")"; if (CE->getType()->isPointerType()) @@ -116,7 +119,7 @@ void ContainerDataPointerCheck::check(const MatchFinder::MatchResult &Result) { else ReplacementText += ".data()"; - FixItHint Hint = + const FixItHint Hint = FixItHint::CreateReplacement(UO->getSourceRange(), ReplacementText); diag(UO->getBeginLoc(), "'data' should be used for accessing the data pointer instead of taking " diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp similarity index 88% rename from clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp rename to clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp index 6da4cf7c6bf94..e6276e317b3ff 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "ConvertMemberFunctionsToStatic.h" +#include "ConvertMemberFunctionsToStaticCheck.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/RecursiveASTVisitor.h" @@ -78,7 +78,8 @@ AST_MATCHER(CXXMethodDecl, usesThis) { } // namespace -void ConvertMemberFunctionsToStatic::registerMatchers(MatchFinder *Finder) { +void ConvertMemberFunctionsToStaticCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( cxxMethodDecl( isDefinition(), isUserProvided(), @@ -118,26 +119,26 @@ static SourceRange getLocationOfConst(const TypeSourceInfo *TSI, const auto FTL = TSI->getTypeLoc().IgnoreParens().getAs(); assert(FTL); - SourceRange Range{FTL.getRParenLoc().getLocWithOffset(1), - FTL.getLocalRangeEnd()}; + const SourceRange Range{FTL.getRParenLoc().getLocWithOffset(1), + FTL.getLocalRangeEnd()}; // Inside Range, there might be other keywords and trailing return types. // Find the exact position of "const". - StringRef Text = getStringFromRange(SourceMgr, LangOpts, Range); - size_t Offset = Text.find("const"); + const StringRef Text = getStringFromRange(SourceMgr, LangOpts, Range); + const size_t Offset = Text.find("const"); if (Offset == StringRef::npos) return {}; - SourceLocation Start = Range.getBegin().getLocWithOffset(Offset); + const SourceLocation Start = Range.getBegin().getLocWithOffset(Offset); return {Start, Start.getLocWithOffset(strlen("const") - 1)}; } -void ConvertMemberFunctionsToStatic::check( +void ConvertMemberFunctionsToStaticCheck::check( const MatchFinder::MatchResult &Result) { const auto *Definition = Result.Nodes.getNodeAs("x"); // TODO: For out-of-line declarations, don't modify the source if the header // is excluded by the -header-filter option. - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(Definition->getLocation(), "method %0 can be made static") << Definition; @@ -152,15 +153,15 @@ void ConvertMemberFunctionsToStatic::check( if (Definition->isConst()) { // Make sure that we either remove 'const' on both declaration and // definition or emit no fix-it at all. 
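The comment above is the heart of this hunk: the fix-it must drop 'const' in both places or not at all. The scenario it guards against, in a standalone sketch with invented names:

struct Counter {
  // Flagged: uses no member state, so it can become 'static'; the 'const'
  // then has to be removed from this declaration and the definition below
  // together, which is what the surrounding fix-it logic arranges.
  int scale(int X) const;
};

int Counter::scale(int X) const { return 2 * X; }

// After the suggested fix:
//   struct Counter { static int scale(int X); };
//   int Counter::scale(int X) { return 2 * X; }

int main() { return Counter{}.scale(2) == 4 ? 0 : 1; }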
- SourceRange DefConst = getLocationOfConst(Definition->getTypeSourceInfo(), - *Result.SourceManager, - Result.Context->getLangOpts()); + const SourceRange DefConst = getLocationOfConst( + Definition->getTypeSourceInfo(), *Result.SourceManager, + Result.Context->getLangOpts()); if (DefConst.isInvalid()) return; if (Declaration != Definition) { - SourceRange DeclConst = getLocationOfConst( + const SourceRange DeclConst = getLocationOfConst( Declaration->getTypeSourceInfo(), *Result.SourceManager, Result.Context->getLangOpts()); diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h similarity index 80% rename from clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h rename to clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h index 2aab03f1a896f..4f8a1a974bd50 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStaticCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H #include "../ClangTidyCheck.h" @@ -18,10 +18,10 @@ namespace clang::tidy::readability { /// /// For the user-facing documentation see: /// https://clang.llvm.org/extra/clang-tidy/checks/ -/// readability-convert-member-functions-to-static.html -class ConvertMemberFunctionsToStatic : public ClangTidyCheck { +/// readability/convert-member-functions-to-static.html +class ConvertMemberFunctionsToStaticCheck : public ClangTidyCheck { public: - ConvertMemberFunctionsToStatic(StringRef Name, ClangTidyContext *Context) + ConvertMemberFunctionsToStaticCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -32,4 +32,4 @@ class ConvertMemberFunctionsToStatic : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMFUNCTOSTATIC_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_CONVERTMEMBERFUNCTIONSTOSTATICCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h index 52b1b2625e403..b346f6856277d 100644 --- a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class DeleteNullPointerCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETE_NULL_POINTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DELETENULLPOINTERCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index 0237c057afed5..cc9ae471a926d 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -88,9 +88,9 @@ void DuplicateIncludeCallbacks::InclusionDirective( if (llvm::is_contained(Files.back(), FileName)) { // We want to delete the entire line, so make sure that [Start,End] covers // everything. - SourceLocation Start = + const SourceLocation Start = advanceBeyondCurrentLine(SM, HashLoc, -1).getLocWithOffset(-1); - SourceLocation End = + const SourceLocation End = advanceBeyondCurrentLine(SM, FilenameRange.getEnd(), 1); Check.diag(HashLoc, "duplicate include") << FixItHint::CreateRemoval(SourceRange{Start, End}); diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h index 297999cf4f921..ca3679108e60b 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class DuplicateIncludeCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATE_INCLUDE_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_DUPLICATEINCLUDECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp index 6399e7d99a9c7..a420c5653cfe8 100644 --- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp @@ -124,21 +124,21 @@ static void removeElseAndBrackets(DiagnosticBuilder &Diag, ASTContext &Context, if (const auto *CS = dyn_cast(Else)) { Diag << tooling::fixit::createRemoval(ElseLoc); - SourceLocation LBrace = CS->getLBracLoc(); - SourceLocation RBrace = CS->getRBracLoc(); - SourceLocation RangeStart = + const SourceLocation LBrace = CS->getLBracLoc(); + const SourceLocation RBrace = CS->getRBracLoc(); + const SourceLocation RangeStart = Remap(LBrace).getLocWithOffset(TokLen(LBrace) + 1); - SourceLocation RangeEnd = Remap(RBrace).getLocWithOffset(-1); + const SourceLocation RangeEnd = Remap(RBrace).getLocWithOffset(-1); - llvm::StringRef Repl = Lexer::getSourceText( + const llvm::StringRef Repl = Lexer::getSourceText( CharSourceRange::getTokenRange(RangeStart, RangeEnd), Context.getSourceManager(), Context.getLangOpts()); Diag << tooling::fixit::createReplacement(CS->getSourceRange(), Repl); } else { - SourceLocation ElseExpandedLoc = Remap(ElseLoc); - SourceLocation EndLoc = Remap(Else->getEndLoc()); + const SourceLocation ElseExpandedLoc = Remap(ElseLoc); + const SourceLocation EndLoc = Remap(Else->getEndLoc()); - 
llvm::StringRef Repl = Lexer::getSourceText( + const llvm::StringRef Repl = Lexer::getSourceText( CharSourceRange::getTokenRange( ElseExpandedLoc.getLocWithOffset(TokLen(ElseLoc) + 1), EndLoc), Context.getSourceManager(), Context.getLangOpts()); @@ -186,8 +186,8 @@ static bool hasPreprocessorBranchEndBetweenLocations( const ElseAfterReturnCheck::ConditionalBranchMap &ConditionalBranchMap, const SourceManager &SM, SourceLocation StartLoc, SourceLocation EndLoc) { - SourceLocation ExpandedStartLoc = SM.getExpansionLoc(StartLoc); - SourceLocation ExpandedEndLoc = SM.getExpansionLoc(EndLoc); + const SourceLocation ExpandedStartLoc = SM.getExpansionLoc(StartLoc); + const SourceLocation ExpandedEndLoc = SM.getExpansionLoc(EndLoc); if (!SM.isWrittenInSameFile(ExpandedStartLoc, ExpandedEndLoc)) return false; @@ -239,14 +239,14 @@ void ElseAfterReturnCheck::check(const MatchFinder::MatchResult &Result) { const auto *Else = Result.Nodes.getNodeAs("else"); const auto *OuterScope = Result.Nodes.getNodeAs("cs"); const auto *Interrupt = Result.Nodes.getNodeAs(InterruptingStr); - SourceLocation ElseLoc = If->getElseLoc(); + const SourceLocation ElseLoc = If->getElseLoc(); if (hasPreprocessorBranchEndBetweenLocations( PPConditionals, *Result.SourceManager, Interrupt->getBeginLoc(), ElseLoc)) return; - bool IsLastInScope = OuterScope->body_back() == If; + const bool IsLastInScope = OuterScope->body_back() == If; const StringRef ControlFlowInterrupter = getControlFlowString(*Interrupt); if (!IsLastInScope && containsDeclInScope(Else)) { @@ -276,7 +276,7 @@ void ElseAfterReturnCheck::check(const MatchFinder::MatchResult &Result) { } const DeclStmt *VDeclStmt = If->getConditionVariableDeclStmt(); const VarDecl *VDecl = If->getConditionVariable(); - std::string Repl = + const std::string Repl = (tooling::fixit::getText(*VDeclStmt, *Result.Context) + llvm::StringRef(";\n") + tooling::fixit::getText(If->getIfLoc(), *Result.Context)) diff --git a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp index a2a5c3e10ee07..049ad759b834c 100644 --- a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.cpp @@ -75,7 +75,7 @@ static void cleanInitialValue(DiagnosticBuilder &Diag, namespace { AST_MATCHER(EnumDecl, isMacro) { - SourceLocation Loc = Node.getBeginLoc(); + const SourceLocation Loc = Node.getBeginLoc(); return Loc.isMacroID(); } @@ -165,7 +165,7 @@ void EnumInitialValueCheck::registerMatchers(MatchFinder *Finder) { void EnumInitialValueCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *Enum = Result.Nodes.getNodeAs("inconsistent")) { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag( Enum->getBeginLoc(), "initial values in enum '%0' are not consistent, consider explicit " diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp index 4791df037d77d..ccac645892948 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp @@ -229,14 +229,14 @@ class FunctionASTVisitor final bool traverseStmtWithIncreasedNestingLevel(Stmt *Node) { ++CurrentNestingLevel; - bool ShouldContinue = Base::TraverseStmt(Node); + const bool ShouldContinue = Base::TraverseStmt(Node); --CurrentNestingLevel; return 
ShouldContinue; } bool traverseDeclWithIncreasedNestingLevel(Decl *Node) { ++CurrentNestingLevel; - bool ShouldContinue = Base::TraverseDecl(Node); + const bool ShouldContinue = Base::TraverseDecl(Node); --CurrentNestingLevel; return ShouldContinue; } @@ -336,7 +336,7 @@ class FunctionASTVisitor final // Record the operator that we are currently processing and traverse it. CurrentBinaryOperator = Op->getOpcode(); - bool ShouldContinue = Base::TraverseBinaryOperator(Op); + const bool ShouldContinue = Base::TraverseBinaryOperator(Op); // And restore the previous binary operator, which might be nonexistent. CurrentBinaryOperator = BinOpCopy; @@ -354,7 +354,7 @@ class FunctionASTVisitor final // Else, do add [uninitialized] frame to the stack, and traverse call. BinaryOperatorsStack.emplace(); - bool ShouldContinue = Base::TraverseCallExpr(Node); + const bool ShouldContinue = Base::TraverseCallExpr(Node); // And remove the top frame. BinaryOperatorsStack.pop(); diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp index 8c58346ede3fa..2f0949c231844 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp @@ -181,14 +181,14 @@ void FunctionSizeCheck::check(const MatchFinder::MatchResult &Result) { // Count the lines including whitespace and comments. Really simple. if (const Stmt *Body = Func->getBody()) { - SourceManager *SM = Result.SourceManager; + const SourceManager *SM = Result.SourceManager; if (SM->isWrittenInSameFile(Body->getBeginLoc(), Body->getEndLoc())) { FI.Lines = SM->getSpellingLineNumber(Body->getEndLoc()) - SM->getSpellingLineNumber(Body->getBeginLoc()); } } - unsigned ActualNumberParameters = Func->getNumParams(); + const unsigned ActualNumberParameters = Func->getNumParams(); if ((LineThreshold && FI.Lines > LineThreshold) || (StatementThreshold && FI.Statements > StatementThreshold) || diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp index 877f0a45f9ea7..a6204de16224d 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.cpp @@ -91,7 +91,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!StandaloneVar->getIdentifier()) return; - StringRef VarName = StandaloneVar->getName(); + const StringRef VarName = StandaloneVar->getName(); if (VarName.size() >= MinimumVariableNameLength || IgnoredVariableNames.match(VarName)) @@ -106,7 +106,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!ExceptionVarName->getIdentifier()) return; - StringRef VarName = ExceptionVarName->getName(); + const StringRef VarName = ExceptionVarName->getName(); if (VarName.size() >= MinimumExceptionNameLength || IgnoredExceptionVariableNames.match(VarName)) return; @@ -120,7 +120,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!LoopVar->getIdentifier()) return; - StringRef VarName = LoopVar->getName(); + const StringRef VarName = LoopVar->getName(); if (VarName.size() >= MinimumLoopCounterNameLength || IgnoredLoopCounterNames.match(VarName)) @@ -135,7 +135,7 @@ void IdentifierLengthCheck::check(const MatchFinder::MatchResult &Result) { if (!ParamVar->getIdentifier()) return; - StringRef VarName = ParamVar->getName(); + const StringRef 
VarName = ParamVar->getName(); if (VarName.size() >= MinimumParameterNameLength || IgnoredParameterNames.match(VarName)) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index ef3eac80301d3..890ce4074345d 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -261,7 +261,7 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( Styles.resize(SK_Count); SmallString<64> StyleString; for (unsigned I = 0; I < SK_Count; ++I) { - size_t StyleSize = StyleNames[I].size(); + const size_t StyleSize = StyleNames[I].size(); StyleString.assign({StyleNames[I], "HungarianPrefix"}); auto HPTOpt = @@ -271,13 +271,13 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( memcpy(&StyleString[StyleSize], "IgnoredRegexp", 13); StyleString.truncate(StyleSize + 13); - std::optional IgnoredRegexpStr = Options.get(StyleString); + const std::optional IgnoredRegexpStr = Options.get(StyleString); memcpy(&StyleString[StyleSize], "Prefix", 6); StyleString.truncate(StyleSize + 6); - std::optional Prefix(Options.get(StyleString)); + const std::optional Prefix(Options.get(StyleString)); // Fast replacement of [Pre]fix -> [Suf]fix. memcpy(&StyleString[StyleSize], "Suf", 3); - std::optional Postfix(Options.get(StyleString)); + const std::optional Postfix(Options.get(StyleString)); memcpy(&StyleString[StyleSize], "Case", 4); StyleString.pop_back_n(2); std::optional CaseOptional = @@ -288,8 +288,9 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( Postfix.value_or(""), IgnoredRegexpStr.value_or(""), HPTOpt.value_or(IdentifierNamingCheck::HPT_Off)); } - bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false); - bool CheckAnonFieldInParent = Options.get("CheckAnonFieldInParent", false); + const bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false); + const bool CheckAnonFieldInParent = + Options.get("CheckAnonFieldInParent", false); return {std::move(Styles), std::move(HNOption), IgnoreMainLike, CheckAnonFieldInParent}; } @@ -340,7 +341,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( "virtual"}; // Remove keywords - for (StringRef Kw : Keywords) { + for (const StringRef Kw : Keywords) { for (size_t Pos = 0; (Pos = Type.find(Kw, Pos)) != std::string::npos;) { Type.replace(Pos, Kw.size(), ""); } @@ -376,7 +377,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( " int", " char", " double", " long", " short"}; bool RedundantRemoved = false; for (auto Kw : TailsOfMultiWordType) { - size_t Pos = Type.rfind(Kw); + const size_t Pos = Type.rfind(Kw); if (Pos != std::string::npos) { const size_t PtrCount = getAsteriskCount(Type, ND); Type = Type.substr(0, Pos + Kw.size() + PtrCount); @@ -387,14 +388,14 @@ std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( TypeName = Type.erase(0, Type.find_first_not_of(' ')); if (!RedundantRemoved) { - std::size_t FoundSpace = Type.find(' '); + const std::size_t FoundSpace = Type.find(' '); if (FoundSpace != std::string::npos) Type = Type.substr(0, FoundSpace); } TypeName = Type.erase(0, Type.find_first_not_of(' ')); - QualType QT = VD->getType(); + const QualType QT = VD->getType(); if (!QT.isNull() && QT->isArrayType()) TypeName.append("[]"); } @@ -451,14 +452,14 @@ void 
IdentifierNamingCheck::HungarianNotation::loadFileConfig( static constexpr StringRef HNDerivedTypes[] = {"Array", "Pointer", "FunctionPointer"}; - StringRef Section = "HungarianNotation."; + const StringRef Section = "HungarianNotation."; SmallString<128> Buffer = {Section, "General."}; size_t DefSize = Buffer.size(); for (const auto &Opt : HNOpts) { Buffer.truncate(DefSize); Buffer.append(Opt); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.General[Opt] = Val.str(); } @@ -468,7 +469,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &Type : HNDerivedTypes) { Buffer.truncate(DefSize); Buffer.append(Type); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.DerivedType[Type] = Val.str(); } @@ -484,7 +485,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &CStr : HNCStrings) { Buffer.truncate(DefSize); Buffer.append(CStr.first); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.CString[CStr.second] = Val.str(); } @@ -494,7 +495,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &PrimType : HungarianNotationPrimitiveTypes) { Buffer.truncate(DefSize); Buffer.append(PrimType); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) { std::string Type = PrimType.str(); llvm::replace(Type, '-', ' '); @@ -507,7 +508,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( for (const auto &Type : HungarianNotationUserDefinedTypes) { Buffer.truncate(DefSize); Buffer.append(Type); - StringRef Val = Options.get(Buffer, ""); + const StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) HNOption.UserDefinedType[Type] = Val.str(); } @@ -528,7 +529,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getPrefix( } else if (const auto *CRD = dyn_cast(ND)) { Prefix = getClassPrefix(CRD, HNOption); } else if (isa(ND)) { - std::string TypeName = getDeclTypeName(ND); + const std::string TypeName = getDeclTypeName(ND); if (!TypeName.empty()) Prefix = getDataTypePrefix(TypeName, ND, HNOption); } @@ -542,8 +543,8 @@ bool IdentifierNamingCheck::HungarianNotation::removeDuplicatedPrefix( if (Words.size() <= 1) return true; - std::string CorrectName = Words[0].str(); - std::vector> MapList = { + const std::string CorrectName = Words[0].str(); + const std::vector> MapList = { HNOption.CString, HNOption.DerivedType, HNOption.PrimitiveType, HNOption.UserDefinedType}; @@ -570,12 +571,12 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( // Derived types std::string PrefixStr; if (const auto *TD = dyn_cast(ND)) { - QualType QT = TD->getType(); + const QualType QT = TD->getType(); if (QT->isFunctionPointerType()) { PrefixStr = HNOption.DerivedType.lookup("FunctionPointer"); } else if (QT->isPointerType()) { for (const auto &CStr : HNOption.CString) { - std::string Key = CStr.getKey().str(); + const std::string Key = CStr.getKey().str(); if (ModifiedTypeName.find(Key) == 0) { PrefixStr = CStr.getValue(); ModifiedTypeName = ModifiedTypeName.substr( @@ -585,7 +586,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( } } else if (QT->isArrayType()) { for (const auto &CStr : HNOption.CString) { - std::string Key = CStr.getKey().str(); + const std::string Key = CStr.getKey().str(); if 
(ModifiedTypeName.find(Key) == 0) { PrefixStr = CStr.getValue(); break; @@ -594,14 +595,14 @@ std::string IdentifierNamingCheck::HungarianNotation::getDataTypePrefix( if (PrefixStr.empty()) PrefixStr = HNOption.DerivedType.lookup("Array"); } else if (QT->isReferenceType()) { - size_t Pos = ModifiedTypeName.find_last_of('&'); + const size_t Pos = ModifiedTypeName.find_last_of('&'); if (Pos != std::string::npos) ModifiedTypeName = ModifiedTypeName.substr(0, Pos); } } // Pointers - size_t PtrCount = getAsteriskCount(ModifiedTypeName); + const size_t PtrCount = getAsteriskCount(ModifiedTypeName); if (PtrCount > 0) { ModifiedTypeName = [&](std::string Str, StringRef From, StringRef To) { size_t StartPos = 0; @@ -663,10 +664,10 @@ std::string IdentifierNamingCheck::HungarianNotation::getEnumPrefix( Name = Name.erase(0, Name.find_first_not_of(' ')); } - static llvm::Regex Splitter( + static const llvm::Regex Splitter( "([a-z0-9A-Z]*)(_+)|([A-Z]?[a-z0-9]+)([A-Z]|$)|([A-Z]+)([A-Z]|$)"); - StringRef EnumName(Name); + const StringRef EnumName(Name); SmallVector Substrs; EnumName.split(Substrs, "_", -1, false); @@ -692,7 +693,7 @@ std::string IdentifierNamingCheck::HungarianNotation::getEnumPrefix( } std::string Initial; - for (StringRef Word : Words) + for (const StringRef Word : Words) Initial += tolower(Word[0]); return Initial; @@ -713,7 +714,7 @@ size_t IdentifierNamingCheck::HungarianNotation::getAsteriskCount( const std::string &TypeName, const NamedDecl *ND) const { size_t PtrCount = 0; if (const auto *TD = dyn_cast(ND)) { - QualType QT = TD->getType(); + const QualType QT = TD->getType(); if (QT->isPointerType()) PtrCount = getAsteriskCount(TypeName); } @@ -834,11 +835,12 @@ void IdentifierNamingCheck::HungarianNotation::loadDefaultConfig( void IdentifierNamingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { RenamerClangTidyCheck::storeOptions(Opts); SmallString<64> StyleString; - ArrayRef> Styles = MainFileStyle->getStyles(); + const ArrayRef> Styles = + MainFileStyle->getStyles(); for (size_t I = 0; I < SK_Count; ++I) { if (!Styles[I]) continue; - size_t StyleSize = StyleNames[I].size(); + const size_t StyleSize = StyleNames[I].size(); StyleString.assign({StyleNames[I], "HungarianPrefix"}); Options.store(Opts, StyleString, Styles[I]->HPType); @@ -871,7 +873,7 @@ bool IdentifierNamingCheck::matchesStyle( const IdentifierNamingCheck::NamingStyle &Style, const IdentifierNamingCheck::HungarianNotationOption &HNOption, const NamedDecl *Decl) const { - static llvm::Regex Matchers[] = { + static const llvm::Regex Matchers[] = { llvm::Regex("^.*$"), llvm::Regex("^[a-z][a-z0-9_]*$"), llvm::Regex("^[a-z][a-zA-Z0-9]*$"), @@ -887,7 +889,7 @@ bool IdentifierNamingCheck::matchesStyle( if (!Name.consume_back(Style.Suffix)) return false; if (IdentifierNamingCheck::HungarianPrefixType::HPT_Off != Style.HPType) { - std::string HNPrefix = HungarianNotation.getPrefix(Decl, HNOption); + const std::string HNPrefix = HungarianNotation.getPrefix(Decl, HNOption); if (!HNPrefix.empty()) { if (!Name.consume_front(HNPrefix)) return false; @@ -914,7 +916,7 @@ std::string IdentifierNamingCheck::fixupWithCase( const IdentifierNamingCheck::NamingStyle &Style, const IdentifierNamingCheck::HungarianNotationOption &HNOption, IdentifierNamingCheck::CaseType Case) const { - static llvm::Regex Splitter( + static const llvm::Regex Splitter( "([a-z0-9A-Z]*)(_+)|([A-Z]?[a-z0-9]+)([A-Z]|$)|([A-Z]+)([A-Z]|$)"); SmallVector Substrs; @@ -1070,7 +1072,7 @@ bool IdentifierNamingCheck::isParamInMainLikeFunction( return false; if 
(!IsIntType(FDecl->parameters()[0]->getType())) return false; - MainType Type = IsCharPtrPtr(FDecl->parameters()[1]->getType()); + const MainType Type = IsCharPtrPtr(FDecl->parameters()[1]->getType()); if (Type == None) return false; if (FDecl->getNumParams() == 3 && @@ -1078,13 +1080,14 @@ bool IdentifierNamingCheck::isParamInMainLikeFunction( return false; if (Type == Main) { - static llvm::Regex Matcher( + static const llvm::Regex Matcher( "(^[Mm]ain([_A-Z]|$))|([a-z0-9_]Main([_A-Z]|$))|(_main(_|$))"); assert(Matcher.isValid() && "Invalid Matcher for main like functions."); return Matcher.match(FDecl->getName()); } - static llvm::Regex Matcher("(^((W[Mm])|(wm))ain([_A-Z]|$))|([a-z0-9_]W[Mm]" - "ain([_A-Z]|$))|(_wmain(_|$))"); + static const llvm::Regex Matcher( + "(^((W[Mm])|(wm))ain([_A-Z]|$))|([a-z0-9_]W[Mm]" + "ain([_A-Z]|$))|(_wmain(_|$))"); assert(Matcher.isValid() && "Invalid Matcher for wmain like functions."); return Matcher.match(FDecl->getName()); } @@ -1212,7 +1215,7 @@ StyleKind IdentifierNamingCheck::findStyleKind( if (const auto *Decl = dyn_cast(D)) { if (isParamInMainLikeFunction(*Decl, IgnoreMainLikeFunctions)) return SK_Invalid; - QualType Type = Decl->getType(); + const QualType Type = Decl->getType(); if (Decl->isConstexpr() && NamingStyles[SK_ConstexprVariable]) return SK_ConstexprVariable; @@ -1381,7 +1384,7 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl, if (Decl->isImplicit()) return std::nullopt; - SourceLocation Loc = Decl->getLocation(); + const SourceLocation Loc = Decl->getLocation(); const FileStyle &FileStyle = getStyleForFile(SM.getFilename(Loc)); if (!FileStyle.isActive()) return std::nullopt; @@ -1398,7 +1401,7 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl, std::optional IdentifierNamingCheck::getMacroFailureInfo(const Token &MacroNameTok, const SourceManager &SM) const { - SourceLocation Loc = MacroNameTok.getLocation(); + const SourceLocation Loc = MacroNameTok.getLocation(); const FileStyle &Style = getStyleForFile(SM.getFilename(Loc)); if (!Style.isActive()) return std::nullopt; @@ -1431,13 +1434,13 @@ IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { if (!GetConfigPerFile) return *MainFileStyle; - StringRef RealFileName = getRealFileName(FileName); - StringRef Parent = llvm::sys::path::parent_path(RealFileName); + const StringRef RealFileName = getRealFileName(FileName); + const StringRef Parent = llvm::sys::path::parent_path(RealFileName); auto Iter = NamingStylesCache.find(Parent); if (Iter != NamingStylesCache.end()) return Iter->getValue(); - llvm::StringRef CheckName = getID(); + const llvm::StringRef CheckName = getID(); ClangTidyOptions Options = Context->getOptionsForFile(RealFileName); if (Options.Checks && GlobList(*Options.Checks).contains(CheckName)) { auto It = NamingStylesCache.try_emplace( @@ -1459,7 +1462,7 @@ StyleKind IdentifierNamingCheck::findStyleKindForAnonField( utils::findOutermostIndirectFieldDeclForField(AnonField); assert(IFD && "Found an anonymous record field without an IndirectFieldDecl"); - QualType Type = AnonField->getType(); + const QualType Type = AnonField->getType(); if (const auto *F = dyn_cast(IFD->chain().front())) { return findStyleKindForField(F, Type, NamingStyles); diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index 6f6da57d7822b..77150fd3ac9b4 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -22,17 +22,19 @@ namespace clang::tidy::readability { namespace { AST_MATCHER(Stmt, isMacroExpansion) { - SourceManager &SM = Finder->getASTContext().getSourceManager(); - SourceLocation Loc = Node.getBeginLoc(); + const SourceManager &SM = Finder->getASTContext().getSourceManager(); + const SourceLocation Loc = Node.getBeginLoc(); return SM.isMacroBodyExpansion(Loc) || SM.isMacroArgExpansion(Loc); } AST_MATCHER(Stmt, isC23) { return Finder->getASTContext().getLangOpts().C23; } +// Preserve same name as AST_MATCHER(isNULLMacroExpansion) +// NOLINTNEXTLINE(llvm-prefer-static-over-anonymous-namespace) bool isNULLMacroExpansion(const Stmt *Statement, ASTContext &Context) { - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); const LangOptions &LO = Context.getLangOpts(); - SourceLocation Loc = Statement->getBeginLoc(); + const SourceLocation Loc = Statement->getBeginLoc(); return SM.isMacroBodyExpansion(Loc) && Lexer::getImmediateMacroName(Loc, SM, LO) == "NULL"; } @@ -75,11 +77,11 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, bool UseUpperCaseLiteralSuffix) { // In case of expressions like (! integer), we should remove the redundant not // operator and use inverted comparison (integer == 0). - bool InvertComparison = + const bool InvertComparison = Parent != nullptr && isUnaryLogicalNotOperator(Parent); if (InvertComparison) { - SourceLocation ParentStartLoc = Parent->getBeginLoc(); - SourceLocation ParentEndLoc = + const SourceLocation ParentStartLoc = Parent->getBeginLoc(); + const SourceLocation ParentEndLoc = cast(Parent)->getSubExpr()->getBeginLoc(); Diag << FixItHint::CreateRemoval( CharSourceRange::getCharRange(ParentStartLoc, ParentEndLoc)); @@ -89,9 +91,9 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, const Expr *SubExpr = Cast->getSubExpr(); - bool NeedInnerParens = + const bool NeedInnerParens = utils::fixit::areParensNeededForStatement(*SubExpr->IgnoreImpCasts()); - bool NeedOuterParens = + const bool NeedOuterParens = Parent != nullptr && utils::fixit::areParensNeededForStatement(*Parent); std::string StartLocInsertion; @@ -131,7 +133,7 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag, EndLocInsertion += ")"; } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Cast->getEndLoc(), 0, Context.getSourceManager(), Context.getLangOpts()); Diag << FixItHint::CreateInsertion(EndLoc, EndLocInsertion); } @@ -165,8 +167,8 @@ static StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression, } static bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) { - SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc); - StringRef SpaceBeforeStmtStr = Lexer::getSourceText( + const SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc); + const StringRef SpaceBeforeStmtStr = Lexer::getSourceText( CharSourceRange::getCharRange(PrefixRange), Context.getSourceManager(), Context.getLangOpts(), nullptr); if (SpaceBeforeStmtStr.empty()) @@ -196,7 +198,7 @@ static void fixGenericExprCastFromBool(DiagnosticBuilder &Diag, .str()); if (NeedParens) { - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( Cast->getEndLoc(), 0, Context.getSourceManager(), Context.getLangOpts()); @@ -232,7 +234,7 @@ static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast, std::queue Q; Q.push(Cast); - 
TraversalKindScope RAII(Context, TK_AsIs); + const TraversalKindScope RAII(Context, TK_AsIs); while (!Q.empty()) { for (const auto &N : Context.getParents(*Q.front())) { @@ -394,7 +396,7 @@ void ImplicitBoolConversionCheck::handleCastToBool(const ImplicitCastExpr *Cast, auto Diag = diag(Cast->getBeginLoc(), "implicit conversion %0 -> 'bool'") << Cast->getSubExpr()->getType(); - StringRef EquivalentLiteral = + const StringRef EquivalentLiteral = getEquivalentBoolLiteralForExpr(Cast->getSubExpr(), Context); if (!EquivalentLiteral.empty()) { Diag << tooling::fixit::createReplacement(*Cast, EquivalentLiteral); @@ -407,7 +409,7 @@ void ImplicitBoolConversionCheck::handleCastToBool(const ImplicitCastExpr *Cast, void ImplicitBoolConversionCheck::handleCastFromBool( const ImplicitCastExpr *Cast, const ImplicitCastExpr *NextImplicitCast, ASTContext &Context) { - QualType DestType = + const QualType DestType = NextImplicitCast ? NextImplicitCast->getType() : Cast->getType(); auto Diag = diag(Cast->getBeginLoc(), "implicit conversion 'bool' -> %0") << DestType; diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h index f88ceb1dd5a0c..101089ccfb2e9 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class ImplicitBoolConversionCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICIT_BOOL_CONVERSION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IMPLICITBOOLCONVERSIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index 93580a7e67c4a..c49684112a5d4 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -113,12 +113,12 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, // FIXME: Provide a way to extract commented out parameter name from comment // next to it. 
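A minimal repro of what the surrounding redeclaration walk reports, with invented identifiers: the declaration and the definition disagree on a parameter name, and the fix-it can align the declaration with the definition.

int accumulate(int Value, int B);                              // reported: 'B' vs 'Total'
int accumulate(int Value, int Total) { return Value + Total; } // parameter-name source

int main() { return accumulate(2, 2) == 4 ? 0 : 1; }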
if (!nameMatch(SourceParamName, OtherParamName, Strict)) { - SourceRange OtherParamNameRange = + const SourceRange OtherParamNameRange = DeclarationNameInfo((*OtherParamIt)->getDeclName(), (*OtherParamIt)->getLocation()) .getSourceRange(); - bool GenerateFixItHint = checkIfFixItHintIsApplicable( + const bool GenerateFixItHint = checkIfFixItHintIsApplicable( ParameterSourceDeclaration, *SourceParamIt, OriginalDeclaration); DifferingParams.emplace_back(SourceParamName, OtherParamName, @@ -137,11 +137,11 @@ findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, const FunctionDecl *ParameterSourceDeclaration, SourceManager &SM, bool Strict) { InconsistentDeclarationsContainer InconsistentDeclarations; - SourceLocation ParameterSourceLocation = + const SourceLocation ParameterSourceLocation = ParameterSourceDeclaration->getLocation(); for (const FunctionDecl *OtherDeclaration : OriginalDeclaration->redecls()) { - SourceLocation OtherLocation = OtherDeclaration->getLocation(); + const SourceLocation OtherLocation = OtherDeclaration->getLocation(); if (OtherLocation != ParameterSourceLocation) { // Skip self. DifferingParamsContainer DifferingParams = findDifferingParamsInDeclaration(ParameterSourceDeclaration, @@ -305,7 +305,7 @@ void InconsistentDeclarationParameterNameCheck::check( const FunctionDecl *ParameterSourceDeclaration = getParameterSourceDeclaration(OriginalDeclaration); - InconsistentDeclarationsContainer InconsistentDeclarations = + const InconsistentDeclarationsContainer InconsistentDeclarations = findInconsistentDeclarations(OriginalDeclaration, ParameterSourceDeclaration, *Result.SourceManager, Strict); @@ -315,7 +315,7 @@ void InconsistentDeclarationParameterNameCheck::check( return; } - SourceLocation StartLoc = OriginalDeclaration->getBeginLoc(); + const SourceLocation StartLoc = OriginalDeclaration->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) { markRedeclarationsAsVisited(OriginalDeclaration); return; diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h index 289e131d0d97a..32218e1ffc1c3 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H #include "../ClangTidyCheck.h" @@ -45,4 +45,4 @@ class InconsistentDeclarationParameterNameCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENT_DECLARATION_PARAMETER_NAME_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_INCONSISTENTDECLARATIONPARAMETERNAMECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp index bc5edecb8a65b..fa5a0b7cd3647 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.cpp @@ -107,7 +107,7 @@ static bool typeIsMemberPointer(const Type *T) { static std::optional<std::vector<SourceRange>> declRanges(const DeclStmt *DS, const SourceManager &SM, const LangOptions &LangOpts) { - std::size_t DeclCount = std::distance(DS->decl_begin(), DS->decl_end()); + const std::size_t DeclCount = std::distance(DS->decl_begin(), DS->decl_end()); if (DeclCount < 2) return std::nullopt; @@ -157,7 +157,7 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, if (Start.isInvalid() || Start.isMacroID()) break; - Token T = getPreviousToken(Start, SM, LangOpts); + const Token T = getPreviousToken(Start, SM, LangOpts); if (T.is(tok::l_paren)) { Start = findPreviousTokenStart(Start, SM, LangOpts); continue; @@ -165,7 +165,7 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, break; } - SourceRange DeclRange(DS->getBeginLoc(), Start); + const SourceRange DeclRange(DS->getBeginLoc(), Start); if (DeclRange.isInvalid() || isMacroID(DeclRange)) return std::nullopt; @@ -183,13 +183,13 @@ declRanges(const DeclStmt *DS, const SourceManager &SM, if (typeIsMemberPointer(CurrentDecl->getType().IgnoreParens().getTypePtr())) return std::nullopt; - SourceLocation DeclEnd = + const SourceLocation DeclEnd = CurrentDecl->hasInit() ? findNextTerminator(CurrentDecl->getInit()->getEndLoc(), SM, LangOpts) : findNextTerminator(CurrentDecl->getEndLoc(), SM, LangOpts); - SourceRange VarNameRange(DeclBegin, DeclEnd); + const SourceRange VarNameRange(DeclBegin, DeclEnd); if (VarNameRange.isInvalid() || isMacroID(VarNameRange)) return std::nullopt; @@ -206,7 +206,7 @@ collectSourceRanges(llvm::ArrayRef<SourceRange> Ranges, const SourceManager &SM, Snippets.reserve(Ranges.size()); for (const auto &Range : Ranges) { - CharSourceRange CharRange = Lexer::getAsCharRange( + const CharSourceRange CharRange = Lexer::getAsCharRange( CharSourceRange::getCharRange(Range.getBegin(), Range.getEnd()), SM, LangOpts); @@ -214,7 +214,7 @@ collectSourceRanges(llvm::ArrayRef<SourceRange> Ranges, const SourceManager &SM, return std::nullopt; bool InvalidText = false; - StringRef Snippet = + const StringRef Snippet = Lexer::getSourceText(CharRange, SM, LangOpts, &InvalidText); if (InvalidText) @@ -262,7 +262,7 @@ void IsolateDeclarationCheck::check(const MatchFinder::MatchResult &Result) { return; std::vector<std::string> NewDecls = createIsolatedDecls(*PotentialSnippets); - std::string Replacement = llvm::join( + const std::string Replacement = llvm::join( NewDecls, (Twine("\n") + Lexer::getIndentationForLine(WholeDecl->getBeginLoc(), *Result.SourceManager)) diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h index 750b4d887de58..0bf22e5c518f4 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class IsolateDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLCHECK_H +#endif
// LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_ISOLATEDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index a38f7bc029e8b..01abf513ce9b9 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -145,7 +145,7 @@ void MagicNumbersCheck::registerMatchers(MatchFinder *Finder) { void MagicNumbersCheck::check(const MatchFinder::MatchResult &Result) { - TraversalKindScope RAII(*Result.Context, TK_AsIs); + const TraversalKindScope RAII(*Result.Context, TK_AsIs); checkBoundMatch(Result, "integer"); checkBoundMatch(Result, "float"); @@ -248,7 +248,7 @@ bool MagicNumbersCheck::isBitFieldWidth( bool MagicNumbersCheck::isUserDefinedLiteral( const clang::ast_matchers::MatchFinder::MatchResult &Result, const clang::Expr &Literal) const { - DynTypedNodeList Parents = Result.Context->getParents(Literal); + const DynTypedNodeList Parents = Result.Context->getParents(Literal); if (Parents.empty()) return false; return Parents[0].get<UserDefinedLiteral>() != nullptr; diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp index bea68884e3bda..ddc92ef312446 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp @@ -59,7 +59,7 @@ class FindUsageOfThis : public RecursiveASTVisitor<FindUsageOfThis> { UsageKind Usage = Unused; template <class T> const T *getParent(const Expr *E) { - DynTypedNodeList Parents = Ctxt.getParents(*E); + const DynTypedNodeList Parents = Ctxt.getParents(*E); if (Parents.size() != 1) return nullptr; @@ -241,7 +241,7 @@ void MakeMemberFunctionConstCheck::registerMatchers(MatchFinder *Finder) { } static SourceLocation getConstInsertionPoint(const CXXMethodDecl *M) { - TypeSourceInfo *TSI = M->getTypeSourceInfo(); + const TypeSourceInfo *TSI = M->getTypeSourceInfo(); if (!TSI) return {}; diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp index e15b2ecd8f5c0..69bc554778379 100644 --- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp @@ -56,8 +56,8 @@ static void addParentheses(const BinaryOperator *BinOp, if (!BinOp) return; - int Precedence1 = getPrecedence(BinOp); - int Precedence2 = getPrecedence(ParentBinOp); + const int Precedence1 = getPrecedence(BinOp); + const int Precedence2 = getPrecedence(ParentBinOp); if (ParentBinOp != nullptr && Precedence1 != Precedence2 && Precedence1 > 0 && Precedence2 > 0) { diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp index 0765d8d82ee04..450961c8b4fee 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp @@ -21,7 +21,7 @@ static const IfStmt *getPrecedingIf(const SourceManager &SM, if (Parents.size() != 1) return nullptr; if (const auto *PrecedingIf = Parents[0].get<IfStmt>()) { - SourceLocation PreviousElseLoc = PrecedingIf->getElseLoc(); + const SourceLocation PreviousElseLoc = PrecedingIf->getElseLoc(); if (SM.getExpansionLineNumber(PreviousElseLoc) ==
SM.getExpansionLineNumber(If->getIfLoc())) return PrecedingIf; @@ -33,7 +33,7 @@ void MisleadingIndentationCheck::danglingElseCheck(const SourceManager &SM, ASTContext *Context, const IfStmt *If) { SourceLocation IfLoc = If->getIfLoc(); - SourceLocation ElseLoc = If->getElseLoc(); + const SourceLocation ElseLoc = If->getElseLoc(); if (IfLoc.isMacroID() || ElseLoc.isMacroID()) return; @@ -89,8 +89,8 @@ void MisleadingIndentationCheck::missingBracesCheck( if (isa<CompoundStmt>(Inner)) continue; - SourceLocation InnerLoc = Inner->getBeginLoc(); - SourceLocation OuterLoc = CurrentStmt->getBeginLoc(); + const SourceLocation InnerLoc = Inner->getBeginLoc(); + const SourceLocation OuterLoc = CurrentStmt->getBeginLoc(); if (InnerLoc.isInvalid() || InnerLoc.isMacroID() || OuterLoc.isInvalid() || OuterLoc.isMacroID()) @@ -101,7 +101,7 @@ void MisleadingIndentationCheck::missingBracesCheck( continue; const Stmt *NextStmt = CStmt->body_begin()[I + 1]; - SourceLocation NextLoc = NextStmt->getBeginLoc(); + const SourceLocation NextLoc = NextStmt->getBeginLoc(); if (NextLoc.isInvalid() || NextLoc.isMacroID()) continue; diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h index 8347f1a3611d9..edd2b1a1ff73e 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H #include "../ClangTidyCheck.h" @@ -38,4 +38,4 @@ class MisleadingIndentationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADING_INDENTATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISLEADINGINDENTATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h index 0a6e0c8fb25a0..f0c565b1d7377 100644 --- a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class MisplacedArrayIndexCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACED_ARRAY_INDEX_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MISPLACEDARRAYINDEXCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp index 7251d63edfd89..1283632a91bb1 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp +++
b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp @@ -79,7 +79,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { // void foo(int /*unused*/) const char *Begin = SM.getCharacterData(Parm->getBeginLoc()); const char *End = SM.getCharacterData(Parm->getLocation()); - StringRef Data(Begin, End - Begin); + const StringRef Data(Begin, End - Begin); if (Data.contains("/*")) continue; @@ -104,7 +104,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { if (M && M->size_overridden_methods() > 0) { const ParmVarDecl *OtherParm = (*M->begin_overridden_methods())->getParamDecl(P.second); - StringRef Name = OtherParm->getName(); + const StringRef Name = OtherParm->getName(); if (!Name.empty()) NewName = Name; } @@ -112,7 +112,7 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { // If the definition has a named parameter use that name. if (Definition) { const ParmVarDecl *DefParm = Definition->getParamDecl(P.second); - StringRef Name = DefParm->getName(); + const StringRef Name = DefParm->getName(); if (!Name.empty()) NewName = Name; } diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp index 744d23a6fdbcd..dffd7fdcc1beb 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp @@ -70,7 +70,7 @@ getNamespaceNameAsWritten(SourceLocation &Loc, const SourceManager &Sources, --Nesting; } else if (Nesting == 0) { if (T->is(tok::raw_identifier)) { - StringRef ID = T->getRawIdentifier(); + const StringRef ID = T->getRawIdentifier(); if (ID != "namespace") Result.append(std::string(ID)); if (ID == "inline") @@ -96,13 +96,13 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // Don't require closing comments for namespaces spanning less than certain // number of lines. - unsigned StartLine = Sources.getSpellingLineNumber(ND->getBeginLoc()); - unsigned EndLine = Sources.getSpellingLineNumber(ND->getRBraceLoc()); + const unsigned StartLine = Sources.getSpellingLineNumber(ND->getBeginLoc()); + const unsigned EndLine = Sources.getSpellingLineNumber(ND->getRBraceLoc()); if (EndLine - StartLine + 1 <= ShortNamespaceLines) return; // Find next token after the namespace closing brace. - SourceLocation AfterRBrace = Lexer::getLocForEndOfToken( + const SourceLocation AfterRBrace = Lexer::getLocForEndOfToken( ND->getRBraceLoc(), /*Offset=*/0, Sources, getLangOpts()); SourceLocation Loc = AfterRBrace; SourceLocation LBraceLoc = ND->getBeginLoc(); @@ -137,7 +137,8 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { if (!locationsInSameFile(Sources, ND->getRBraceLoc(), Loc)) return; - bool NextTokenIsOnSameLine = Sources.getSpellingLineNumber(Loc) == EndLine; + const bool NextTokenIsOnSameLine = + Sources.getSpellingLineNumber(Loc) == EndLine; // If we insert a line comment before the token in the same line, we need // to insert a line break. bool NeedLineBreak = NextTokenIsOnSameLine && Tok.isNot(tok::eof); @@ -148,11 +149,12 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // Try to find existing namespace closing comment on the same line. 
if (Tok.is(tok::comment) && NextTokenIsOnSameLine) { - StringRef Comment(Sources.getCharacterData(Loc), Tok.getLength()); + const StringRef Comment(Sources.getCharacterData(Loc), Tok.getLength()); SmallVector<StringRef, 7> Groups; if (NamespaceCommentPattern.match(Comment, &Groups)) { - StringRef NamespaceNameInComment = Groups.size() > 5 ? Groups[5] : ""; - StringRef Anonymous = Groups.size() > 3 ? Groups[3] : ""; + const StringRef NamespaceNameInComment = + Groups.size() > 5 ? Groups[5] : ""; + const StringRef Anonymous = Groups.size() > 3 ? Groups[3] : ""; if ((ND->isAnonymousNamespace() && NamespaceNameInComment.empty()) || (*NamespaceNameAsWritten == NamespaceNameInComment && @@ -186,7 +188,7 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // multi-line or there may be other tokens behind it. } - std::string NamespaceNameForDiag = + const std::string NamespaceNameForDiag = ND->isAnonymousNamespace() ? "anonymous namespace" : ("namespace '" + *NamespaceNameAsWritten + "'"); @@ -203,7 +205,7 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { Fix.append("\n"); // Place diagnostic at an old comment, or closing brace if we did not have it. - SourceLocation DiagLoc = + const SourceLocation DiagLoc = OldCommentRange.getBegin() != OldCommentRange.getEnd() ? OldCommentRange.getBegin() : ND->getRBraceLoc(); diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp index 29fff3971599e..9fbe3badc864b 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.cpp @@ -155,7 +155,7 @@ void NonConstParameterCheck::diagnoseNonConstParameters() { dyn_cast_or_null<FunctionDecl>(Par->getParentFunctionOrMethod()); if (!Function) continue; - unsigned Index = Par->getFunctionScopeIndex(); + const unsigned Index = Par->getFunctionScopeIndex(); for (FunctionDecl *FnDecl : Function->redecls()) { if (FnDecl->getNumParams() <= Index) continue; diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h index b0156183c0b88..7dcb16e4253b8 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H #include "../ClangTidyCheck.h" @@ -59,4 +59,4 @@ class NonConstParameterCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NON_CONST_PARAMETER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_NONCONSTPARAMETERCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp index 196fb31bd4b7a..4260e0fc41754 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.cpp @@ -23,7 +23,7 @@ static StringRef
getOperatorSpelling(SourceLocation Loc, ASTContext &Context) { if (Loc.isInvalid()) return {}; - SourceManager &SM = Context.getSourceManager(); + const SourceManager &SM = Context.getSourceManager(); Loc = SM.getSpellingLoc(Loc); if (Loc.isInvalid()) @@ -41,7 +41,7 @@ AST_MATCHER_P2(BinaryOperator, hasInvalidBinaryOperatorRepresentation, if (Node.getOpcode() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -52,7 +52,7 @@ AST_MATCHER_P2(UnaryOperator, hasInvalidUnaryOperatorRepresentation, if (Node.getOpcode() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -63,7 +63,7 @@ AST_MATCHER_P2(CXXOperatorCallExpr, hasInvalidOverloadedOperatorRepresentation, if (Node.getOperator() != Kind || ExpectedRepresentation.empty()) return false; - StringRef Spelling = + const StringRef Spelling = getOperatorSpelling(Node.getOperatorLoc(), Finder->getASTContext()); return !Spelling.empty() && Spelling != ExpectedRepresentation; } @@ -297,9 +297,9 @@ void OperatorsRepresentationCheck::check( if (TokenRange.isInvalid()) return; - StringRef Spelling = Lexer::getSourceText(TokenRange, *Result.SourceManager, - Result.Context->getLangOpts()); - StringRef TranslatedSpelling = translate(Spelling); + const StringRef Spelling = Lexer::getSourceText( + TokenRange, *Result.SourceManager, Result.Context->getLangOpts()); + const StringRef TranslatedSpelling = translate(Spelling); if (TranslatedSpelling.empty()) return; @@ -312,7 +312,7 @@ void OperatorsRepresentationCheck::check( SourceRepresentation = "a traditional"; TargetRepresentation = "an alternative"; - StringRef SpellingEx = Lexer::getSourceText( + const StringRef SpellingEx = Lexer::getSourceText( CharSourceRange::getCharRange( TokenRange.getBegin().getLocWithOffset(-1), TokenRange.getBegin().getLocWithOffset(Spelling.size() + 1U)), diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp index 942a0a8a4469f..556f7fe7a7eb9 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp @@ -44,18 +44,18 @@ findQualToken(const VarDecl *Decl, Qualifier Qual, SourceLocation BeginLoc = Decl->getQualifierLoc().getBeginLoc(); if (BeginLoc.isInvalid()) BeginLoc = Decl->getBeginLoc(); - SourceLocation EndLoc = Decl->getLocation(); + const SourceLocation EndLoc = Decl->getLocation(); - CharSourceRange FileRange = Lexer::makeFileCharRange( + const CharSourceRange FileRange = Lexer::makeFileCharRange( CharSourceRange::getCharRange(BeginLoc, EndLoc), *Result.SourceManager, Result.Context->getLangOpts()); if (FileRange.isInvalid()) return std::nullopt; - tok::TokenKind Tok = Qual == Qualifier::Const ? tok::kw_const - : Qual == Qualifier::Volatile ? tok::kw_volatile - : tok::kw_restrict; + const tok::TokenKind Tok = Qual == Qualifier::Const ? tok::kw_const + : Qual == Qualifier::Volatile ? 
tok::kw_volatile + : tok::kw_restrict; return utils::lexer::getQualifyingToken(Tok, FileRange, *Result.Context, *Result.SourceManager); @@ -90,13 +90,13 @@ mergeReplacementRange(SourceRange &TypeSpecifier, const Token &ConstToken) { } static bool isPointerConst(QualType QType) { - QualType Pointee = QType->getPointeeType(); + const QualType Pointee = QType->getPointeeType(); assert(!Pointee.isNull() && "can't have a null Pointee"); return Pointee.isConstQualified(); } static bool isAutoPointerConst(QualType QType) { - QualType Pointee = + const QualType Pointee = cast<AutoType>(QType->getPointeeType().getTypePtr())->desugar(); assert(!Pointee.isNull() && "can't have a null Pointee"); return Pointee.isConstQualified(); @@ -146,7 +146,6 @@ void QualifiedAutoCheck::registerMatchers(MatchFinder *Finder) { return qualType(anyOf(qualType(pointerType(pointee(InnerMatchers...))), qualType(substTemplateTypeParmType(hasReplacementType( pointerType(pointee(InnerMatchers...))))))); - }; auto IsAutoDeducedToPointer = @@ -223,33 +222,34 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (Var->getLocation() == TypeSpecifier.getEnd().getLocWithOffset(1)) TypeSpecifier.setEnd(TypeSpecifier.getEnd().getLocWithOffset(1)); - CharSourceRange FixItRange = CharSourceRange::getCharRange(TypeSpecifier); + const CharSourceRange FixItRange = + CharSourceRange::getCharRange(TypeSpecifier); if (FixItRange.isInvalid()) return; SourceLocation FixitLoc = FixItRange.getBegin(); - for (SourceRange &Range : RemoveQualifiersRange) { + for (const SourceRange &Range : RemoveQualifiersRange) { if (Range.getBegin() < FixitLoc) FixitLoc = Range.getBegin(); } - std::string ReplStr = [&] { - llvm::StringRef PtrConst = isPointerConst(Var->getType()) ? "const " : ""; - llvm::StringRef LocalConst = IsLocalConst ? "const " : ""; - llvm::StringRef LocalVol = IsLocalVolatile ? "volatile " : ""; - llvm::StringRef LocalRestrict = IsLocalRestrict ? "__restrict " : ""; + const std::string ReplStr = [&] { + const StringRef PtrConst = isPointerConst(Var->getType()) ? "const " : ""; + const StringRef LocalConst = IsLocalConst ? "const " : ""; + const StringRef LocalVol = IsLocalVolatile ? "volatile " : ""; + const StringRef LocalRestrict = IsLocalRestrict ?
"__restrict " : ""; return (PtrConst + "auto *" + LocalConst + LocalVol + LocalRestrict) .str(); }(); - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag(FixitLoc, "'%select{|const }0%select{|volatile }1%select{|__restrict }2auto " "%3' can be declared as '%4%3'") << IsLocalConst << IsLocalVolatile << IsLocalRestrict << Var->getName() << ReplStr; - for (SourceRange &Range : RemoveQualifiersRange) { + for (const SourceRange &Range : RemoveQualifiersRange) { Diag << FixItHint::CreateRemoval(CharSourceRange::getCharRange(Range)); } @@ -286,7 +286,7 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (TypeSpec->isInvalid() || TypeSpec->getBegin().isMacroID() || TypeSpec->getEnd().isMacroID()) return; - SourceLocation InsertPos = TypeSpec->getBegin(); + const SourceLocation InsertPos = TypeSpec->getBegin(); diag(InsertPos, "'auto *%select{|const }0%select{|volatile }1%2' can be declared as " "'const auto *%select{|const }0%select{|volatile }1%2'") @@ -308,7 +308,7 @@ void QualifiedAutoCheck::check(const MatchFinder::MatchResult &Result) { if (TypeSpec->isInvalid() || TypeSpec->getBegin().isMacroID() || TypeSpec->getEnd().isMacroID()) return; - SourceLocation InsertPos = TypeSpec->getBegin(); + const SourceLocation InsertPos = TypeSpec->getBegin(); diag(InsertPos, "'auto &%0' can be declared as 'const auto &%0'") << Var->getName() << FixItHint::CreateInsertion(InsertPos, "const "); } diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index 569302e6065f2..afb63571de583 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -10,7 +10,7 @@ #include "../ClangTidyModule.h" #include "../ClangTidyModuleRegistry.h" #include "AmbiguousSmartptrResetCallCheck.h" -#include "AvoidConstParamsInDecls.h" +#include "AvoidConstParamsInDeclsCheck.h" #include "AvoidNestedConditionalOperatorCheck.h" #include "AvoidReturnWithVoidValueCheck.h" #include "AvoidUnconditionalPreprocessorIfCheck.h" @@ -19,7 +19,7 @@ #include "ContainerContainsCheck.h" #include "ContainerDataPointerCheck.h" #include "ContainerSizeEmptyCheck.h" -#include "ConvertMemberFunctionsToStatic.h" +#include "ConvertMemberFunctionsToStaticCheck.h" #include "DeleteNullPointerCheck.h" #include "DuplicateIncludeCheck.h" #include "ElseAfterReturnCheck.h" @@ -74,7 +74,7 @@ class ReadabilityModule : public ClangTidyModule { void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { CheckFactories.registerCheck( "readability-ambiguous-smartptr-reset-call"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "readability-avoid-const-params-in-decls"); CheckFactories.registerCheck( "readability-avoid-nested-conditional-operator"); @@ -92,7 +92,7 @@ class ReadabilityModule : public ClangTidyModule { "readability-container-data-pointer"); CheckFactories.registerCheck( "readability-container-size-empty"); - CheckFactories.registerCheck( + CheckFactories.registerCheck( "readability-convert-member-functions-to-static"); CheckFactories.registerCheck( "readability-delete-null-pointer"); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp index e93aa16ebdb13..14580a6a26809 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.cpp @@ -43,7 +43,7 @@ void RedundantAccessSpecifiersCheck::check( LastASDecl = ASDecl; if (CheckFirstDeclaration) { - AccessSpecifier DefaultSpecifier = + const AccessSpecifier DefaultSpecifier = MatchedDecl->isClass() ? AS_private : AS_public; if (ASDecl->getAccess() == DefaultSpecifier) { diag(ASDecl->getLocation(), diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp index 1ee75220b1c4e..d11c41c33d2be 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp @@ -25,8 +25,8 @@ static bool areTypesEqual(QualType S, QualType D) { if (TS != TD) return false; - QualType PtrS = S->getPointeeType(); - QualType PtrD = D->getPointeeType(); + const QualType PtrS = S->getPointeeType(); + const QualType PtrD = D->getPointeeType(); if (!PtrS.isNull() && !PtrD.isNull()) return areTypesEqual(PtrS, PtrD); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp index b3b84e2cc0ccd..132b7ddc4311b 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.cpp @@ -50,7 +50,7 @@ void RedundantControlFlowCheck::check(const MatchFinder::MatchResult &Result) { void RedundantControlFlowCheck::checkRedundantReturn( const MatchFinder::MatchResult &Result, const CompoundStmt *Block) { - CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); if (const auto *Return = dyn_cast<ReturnStmt>(*Last)) issueDiagnostic(Result, Block, Return->getSourceRange(), RedundantReturnDiag); @@ -58,7 +58,7 @@ void RedundantControlFlowCheck::checkRedundantContinue( const MatchFinder::MatchResult &Result, const CompoundStmt *Block) { - CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Last = Block->body_rbegin(); if (const auto *Continue = dyn_cast<ContinueStmt>(*Last)) issueDiagnostic(Result, Block, Continue->getSourceRange(), RedundantContinueDiag); @@ -67,11 +67,12 @@ void RedundantControlFlowCheck::issueDiagnostic( const MatchFinder::MatchResult &Result, const CompoundStmt *const Block, const SourceRange &StmtRange, const char *const Diag) { - SourceManager &SM = *Result.SourceManager; + const SourceManager &SM = *Result.SourceManager; if (isLocationInMacroExpansion(SM, StmtRange.getBegin())) return; - CompoundStmt::const_reverse_body_iterator Previous = ++Block->body_rbegin(); + const CompoundStmt::const_reverse_body_iterator Previous = + ++Block->body_rbegin(); SourceLocation Start; if (Previous != Block->body_rend()) Start = Lexer::findLocationAfterToken( diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h index 3018b1f8d14e6..fde305039d4c9 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H #include "../ClangTidyCheck.h" @@ -47,4 +47,4 @@ class RedundantControlFlowCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_CONTROL_FLOW_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCONTROLFLOWCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp index cf6e92d84e92a..0f12b8bcea6fb 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp @@ -79,7 +79,7 @@ void RedundantDeclarationCheck::check(const MatchFinder::MatchResult &Result) { } } - SourceLocation EndLoc = Lexer::getLocForEndOfToken( + const SourceLocation EndLoc = Lexer::getLocForEndOfToken( D->getSourceRange().getEnd(), 0, SM, Result.Context->getLangOpts()); { auto Diag = diag(D->getLocation(), "redundant %0 declaration") << D; diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h index b22cef9a2b776..9b1b09f914a00 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H #include "../ClangTidyCheck.h" @@ -30,4 +30,4 @@ class RedundantDeclarationCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_DECLARATION_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTDECLARATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h index 5c82a5e02645f..49cbf69c06f3f 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H #include "../ClangTidyCheck.h" @@ -28,4 +28,4 @@ class RedundantFunctionPtrDereferenceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_FUNCTION_PTR_DEREFERENCE_H +#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTFUNCTIONPTRDEREFERENCECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp index 2053b89ada7e2..76adaa80207da 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp @@ -52,7 +52,7 @@ AST_POLYMORPHIC_MATCHER_P(isInternalLinkage, static SourceLocation getInlineTokenLocation(SourceRange RangeLocation, const SourceManager &Sources, const LangOptions &LangOpts) { - SourceLocation Loc = RangeLocation.getBegin(); + const SourceLocation Loc = RangeLocation.getBegin(); if (Loc.isMacroID()) return {}; @@ -106,7 +106,7 @@ template <typename T> void RedundantInlineSpecifierCheck::handleMatchedDecl( const T *MatchedDecl, const SourceManager &Sources, const MatchFinder::MatchResult &Result, StringRef Message) { - SourceLocation Loc = getInlineTokenLocation( + const SourceLocation Loc = getInlineTokenLocation( MatchedDecl->getSourceRange(), Sources, Result.Context->getLangOpts()); if (Loc.isValid()) diag(Loc, Message) << MatchedDecl << FixItHint::CreateRemoval(Loc); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h index 64d365d1e3f45..ff8b02d141a46 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H #include "../ClangTidyCheck.h" @@ -40,4 +40,4 @@ class RedundantMemberInitCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_MEMBER_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTMEMBERINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp index 0ab59fff39d88..874b9618bd882 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "RedundantParenthesesCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" #include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -32,15 +34,30 @@ AST_MATCHER(ParenExpr, isInMacro) { } // namespace +RedundantParenthesesCheck::RedundantParenthesesCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + AllowedDecls(utils::options::parseStringList( + Options.get("AllowedDecls", "std::max;std::min"))) {} + +void RedundantParenthesesCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "AllowedDecls", + utils::options::serializeStringList(AllowedDecls)); +} + void RedundantParenthesesCheck::registerMatchers(MatchFinder *Finder)
{ const auto ConstantExpr = expr(anyOf(integerLiteral(), floatLiteral(), characterLiteral(), cxxBoolLiteral(), stringLiteral(), cxxNullPtrLiteralExpr())); Finder->addMatcher( - parenExpr(subExpr(anyOf(parenExpr(), ConstantExpr, declRefExpr())), - unless(anyOf(isInMacro(), - // sizeof(...) is common used. - hasParent(unaryExprOrTypeTraitExpr())))) + parenExpr( + subExpr(anyOf(parenExpr(), ConstantExpr, + declRefExpr(to(namedDecl(unless( + matchers::matchesAnyListedName(AllowedDecls))))))), + unless(anyOf(isInMacro(), + // sizeof(...) is common used. + hasParent(unaryExprOrTypeTraitExpr())))) .bind("dup"), this); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h index 9a0409b83fff3..2638a09730f7e 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h @@ -20,13 +20,16 @@ namespace clang::tidy::readability { /// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-parentheses.html class RedundantParenthesesCheck : public ClangTidyCheck { public: - RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context); + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus | LangOpts.C99; } + +private: + const std::vector<StringRef> AllowedDecls; }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp index 931126a154d1e..4c503714346f8 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.cpp @@ -36,7 +36,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void If(SourceLocation Loc, SourceRange ConditionRange, ConditionValueKind ConditionValue) override { - StringRef Condition = + const StringRef Condition = Lexer::getSourceText(CharSourceRange::getTokenRange(ConditionRange), PP.getSourceManager(), PP.getLangOpts()); checkMacroRedundancy(Loc, Condition, IfStack, DK_If, DK_If, true); @@ -44,7 +44,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void Ifdef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MacroDefinition) override { - std::string MacroName = PP.getSpelling(MacroNameTok); + const std::string MacroName = PP.getSpelling(MacroNameTok); checkMacroRedundancy(Loc, MacroName, IfdefStack, DK_Ifdef, DK_Ifdef, true); checkMacroRedundancy(Loc, MacroName, IfndefStack, DK_Ifdef, DK_Ifndef, false); @@ -52,7 +52,7 @@ class RedundantPreprocessorCallbacks : public PPCallbacks { void Ifndef(SourceLocation Loc, const Token &MacroNameTok, const MacroDefinition &MacroDefinition) override { - std::string MacroName = PP.getSpelling(MacroNameTok); + const std::string MacroName = PP.getSpelling(MacroNameTok); checkMacroRedundancy(Loc, MacroName, IfndefStack, DK_Ifndef, DK_Ifndef, true); checkMacroRedundancy(Loc, MacroName, IfdefStack, DK_Ifndef, DK_Ifdef, diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp
b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 11065230edc60..a458ae3ebc20d 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -149,8 +149,9 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { if (!allReturnTypesMatch(Result)) return; - bool IsPtrToPtr = Result.Nodes.getNodeAs<Decl>("ptr_to_ptr") != nullptr; - bool IsMemberExpr = Result.Nodes.getNodeAs<Expr>("memberExpr") != nullptr; + const bool IsPtrToPtr = Result.Nodes.getNodeAs<Decl>("ptr_to_ptr") != nullptr; + const bool IsMemberExpr = + Result.Nodes.getNodeAs<Expr>("memberExpr") != nullptr; const auto *GetCall = Result.Nodes.getNodeAs<Expr>("redundant_get"); if (GetCall->getBeginLoc().isMacroID() && IgnoreMacros) return; @@ -178,7 +179,8 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { SmartptrText = SmartptrText.drop_back(2); } // Replace foo->get() with *foo, and foo.get() with foo. - std::string Replacement = Twine(IsPtrToPtr ? "*" : "", SmartptrText).str(); + const std::string Replacement = + Twine(IsPtrToPtr ? "*" : "", SmartptrText).str(); diag(GetCall->getBeginLoc(), "redundant get() call on smart pointer") << FixItHint::CreateReplacement(SR, Replacement); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp index c90d1521e6b8d..e4d08cbf0d282 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp @@ -171,10 +171,10 @@ void RedundantStringCStrCheck::check(const MatchFinder::MatchResult &Result) { const auto *Call = Result.Nodes.getNodeAs<CallExpr>("call"); const auto *Arg = Result.Nodes.getNodeAs<Expr>("arg"); const auto *Member = Result.Nodes.getNodeAs<MemberExpr>("member"); - bool Arrow = Member->isArrow(); + const bool Arrow = Member->isArrow(); // Replace the "call" node with the "arg" node, prefixed with '*' // if the call was using '->' rather than '.'. - std::string ArgText = + const std::string ArgText = Arrow ? utils::fixit::formatDereference(*Arg, *Result.Context) : tooling::fixit::getText(*Arg, *Result.Context).str(); if (ArgText.empty()) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp index b579aafe8ea43..756fe437b3e10 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.cpp @@ -23,8 +23,8 @@ const char DefaultStringNames[] = static std::vector<StringRef> removeNamespaces(ArrayRef<StringRef> Names) { std::vector<StringRef> Result; Result.reserve(Names.size()); - for (StringRef Name : Names) { - StringRef::size_type ColonPos = Name.rfind(':'); + for (const StringRef Name : Names) { + const StringRef::size_type ColonPos = Name.rfind(':'); Result.push_back( Name.drop_front(ColonPos == StringRef::npos ? 0 : ColonPos + 1)); } @@ -125,14 +125,14 @@ void RedundantStringInitCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *VDecl = Result.Nodes.getNodeAs<VarDecl>("vardecl")) { // VarDecl's getSourceRange() spans 'string foo = ""' or 'string bar("")'. // So start at getLocation() to span just 'foo = ""' or 'bar("")'.
- SourceRange ReplaceRange(VDecl->getLocation(), VDecl->getEndLoc()); + const SourceRange ReplaceRange(VDecl->getLocation(), VDecl->getEndLoc()); diag(VDecl->getLocation(), "redundant string initialization") << FixItHint::CreateReplacement(ReplaceRange, VDecl->getName()); } if (const auto *FDecl = Result.Nodes.getNodeAs<FieldDecl>("fieldDecl")) { // FieldDecl's getSourceRange() spans 'string foo = ""'. // So start at getLocation() to span just 'foo = ""'. - SourceRange ReplaceRange(FDecl->getLocation(), FDecl->getEndLoc()); + const SourceRange ReplaceRange(FDecl->getLocation(), FDecl->getEndLoc()); diag(FDecl->getLocation(), "redundant string initialization") << FixItHint::CreateReplacement(ReplaceRange, FDecl->getName()); } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h index 853ea2fcd0313..5c4b744c64459 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringInitCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H #include "../ClangTidyCheck.h" #include <vector> @@ -32,4 +32,4 @@ class RedundantStringInitCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANT_STRING_INIT_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTSTRINGINITCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp index 5d3fd14b92471..398bee1d40923 100644 --- a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.cpp @@ -37,7 +37,7 @@ struct NotExtendedByDeclBoundToPredicate { AST_MATCHER_P(MaterializeTemporaryExpr, isExtendedByDeclBoundTo, StringRef, ID) { - NotExtendedByDeclBoundToPredicate Predicate{ + const NotExtendedByDeclBoundToPredicate Predicate{ ID, ::clang::DynTypedNode::create(Node)}; return Builder->removeBindings(Predicate); } diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp index 9f3f26b775c9a..1a9c161068030 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp @@ -203,7 +203,7 @@ static std::string replacementExpression(const ASTContext &Context, .str(), NeedsStaticCast)); - StringRef Text = getText(Context, *E); + const StringRef Text = getText(Context, *E); if (!NeedsStaticCast && needsParensAfterUnaryNegation(E)) return ("!(" + Text + ")").str(); @@ -366,7 +366,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { * if (false) ThenStmt(); -> ; * if (false) ThenStmt(); else ElseStmt() -> ElseStmt(); */ - Expr *Cond = If->getCond()->IgnoreImplicit(); + const Expr *Cond = If->getCond()->IgnoreImplicit(); if (std::optional<bool> Bool = getAsBoolLiteral(Cond, true)) { if (*Bool)
Check->replaceWithThenStatement(Context, If, Cond); @@ -379,9 +379,9 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { * if (Cond) return true; else return false; -> return Cond; * if (Cond) return false; else return true; -> return !Cond; */ - if (ExprAndBool ThenReturnBool = + if (const ExprAndBool ThenReturnBool = checkSingleStatement(If->getThen(), parseReturnLiteralBool)) { - ExprAndBool ElseReturnBool = + const ExprAndBool ElseReturnBool = checkSingleStatement(If->getElse(), parseReturnLiteralBool); if (ElseReturnBool && ThenReturnBool.Bool != ElseReturnBool.Bool) { if (Check->ChainedConditionalReturn || @@ -418,9 +418,9 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { return {ME->getMemberDecl(), *RightasBool}; return {}; }; - if (DeclAndBool ThenAssignment = + if (const DeclAndBool ThenAssignment = checkSingleStatement(If->getThen(), VarBoolAssignmentMatcher)) { - DeclAndBool ElseAssignment = + const DeclAndBool ElseAssignment = checkSingleStatement(If->getElse(), VarBoolAssignmentMatcher); if (ElseAssignment.Item == ThenAssignment.Item && ElseAssignment.Bool != ThenAssignment.Bool) { @@ -461,7 +461,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { Second != End; ++Second, ++First) { PrevIf = CurIf; CurIf = isa<IfStmt>(*First); - ExprAndBool TrailingReturnBool = parseReturnLiteralBool(*Second); + const ExprAndBool TrailingReturnBool = parseReturnLiteralBool(*Second); if (!TrailingReturnBool) continue; @@ -473,7 +473,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { auto *If = cast<IfStmt>(*First); if (!If->hasInitStorage() && !If->hasVarStorage() && !If->isConsteval()) { - ExprAndBool ThenReturnBool = + const ExprAndBool ThenReturnBool = checkSingleStatement(If->getThen(), parseReturnLiteralBool); if (ThenReturnBool && ThenReturnBool.Bool != TrailingReturnBool.Bool) { @@ -497,7 +497,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { auto *SubIf = dyn_cast<IfStmt>(SubStmt); if (SubIf && !SubIf->getElse() && !SubIf->hasInitStorage() && !SubIf->hasVarStorage() && !SubIf->isConsteval()) { - ExprAndBool ThenReturnBool = + const ExprAndBool ThenReturnBool = checkSingleStatement(SubIf->getThen(), parseReturnLiteralBool); if (ThenReturnBool && ThenReturnBool.Bool != TrailingReturnBool.Bool) { @@ -574,7 +574,7 @@ class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> { if (Check->reportDeMorgan(Context, Op, BinaryOp, !IsProcessing, parent(), Parens) && !Check->areDiagsSelfContained()) { - llvm::SaveAndRestore RAII(IsProcessing, true); + const llvm::SaveAndRestore RAII(IsProcessing, true); return Base::TraverseUnaryOperator(Op); } } @@ -638,13 +638,13 @@ void SimplifyBooleanExprCheck::reportBinOp(const ASTContext &Context, if (!isa<CXXBoolLiteralExpr>(Other) && containsBoolLiteral(Other)) return; - bool BoolValue = Bool->getValue(); + const bool BoolValue = Bool->getValue(); auto ReplaceWithExpression = [this, &Context, LHS, RHS, Bool](const Expr *ReplaceWith, bool Negated) { - std::string Replacement = + const std::string Replacement = replacementExpression(Context, Negated, ReplaceWith); - SourceRange Range(LHS->getBeginLoc(), RHS->getEndLoc()); + const SourceRange Range(LHS->getBeginLoc(), RHS->getEndLoc()); issueDiag(Context, Bool->getBeginLoc(), SimplifyOperatorDiagnostic, Range, Replacement); }; @@ -706,11 +706,11 @@ bool SimplifyBooleanExprCheck::issueDiag(const ASTContext &Context, StringRef Description, SourceRange ReplacementRange, StringRef Replacement) { - CharSourceRange CharRange = +
const CharSourceRange CharRange = Lexer::makeFileCharRange(CharSourceRange::getTokenRange(ReplacementRange), Context.getSourceManager(), getLangOpts()); - DiagnosticBuilder Diag = diag(Loc, Description); + const DiagnosticBuilder Diag = diag(Loc, Description); const bool HasReplacement = !containsDiscardedTokens(Context, CharRange); if (HasReplacement) Diag << FixItHint::CreateReplacement(CharRange, Replacement); @@ -737,7 +737,7 @@ void SimplifyBooleanExprCheck::replaceWithElseStatement( void SimplifyBooleanExprCheck::replaceWithCondition( const ASTContext &Context, const ConditionalOperator *Ternary, bool Negated) { - std::string Replacement = + const std::string Replacement = replacementExpression(Context, Negated, Ternary->getCond()); issueDiag(Context, Ternary->getTrueExpr()->getBeginLoc(), "redundant boolean literal in ternary expression result", @@ -747,11 +747,11 @@ void SimplifyBooleanExprCheck::replaceWithReturnCondition( const ASTContext &Context, const IfStmt *If, const Expr *BoolLiteral, bool Negated) { - StringRef Terminator = isa<CompoundStmt>(If->getElse()) ? ";" : ""; - std::string Condition = + const StringRef Terminator = isa<CompoundStmt>(If->getElse()) ? ";" : ""; + const std::string Condition = replacementExpression(Context, Negated, If->getCond()); - std::string Replacement = ("return " + Condition + Terminator).str(); - SourceLocation Start = BoolLiteral->getBeginLoc(); + const std::string Replacement = ("return " + Condition + Terminator).str(); + const SourceLocation Start = BoolLiteral->getBeginLoc(); const bool HasReplacement = issueDiag(Context, Start, SimplifyConditionalReturnDiagnostic, @@ -795,12 +795,13 @@ void SimplifyBooleanExprCheck::replaceWithAssignment(const ASTContext &Context, const Expr *Var, SourceLocation Loc, bool Negated) { - SourceRange Range = IfAssign->getSourceRange(); - StringRef VariableName = getText(Context, *Var); - StringRef Terminator = isa<CompoundStmt>(IfAssign->getElse()) ? ";" : ""; - std::string Condition = + const SourceRange Range = IfAssign->getSourceRange(); + const StringRef VariableName = getText(Context, *Var); + const StringRef Terminator = + isa<CompoundStmt>(IfAssign->getElse()) ?
";" : ""; + const std::string Condition = replacementExpression(Context, Negated, IfAssign->getCond()); - std::string Replacement = + const std::string Replacement = (VariableName + " = " + Condition + Terminator).str(); issueDiag(Context, Loc, "redundant boolean literal in conditional assignment", Range, Replacement); diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h index 466bc411bf800..99520d76c6c6f 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H #include "../ClangTidyCheck.h" @@ -75,4 +75,4 @@ class SimplifyBooleanExprCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFY_BOOLEAN_EXPR_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_SIMPLIFYBOOLEANEXPRCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h index c376806d00098..38a2ea6419755 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H #include "../ClangTidyCheck.h" @@ -41,4 +41,4 @@ class StaticAccessedThroughInstanceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_ACCESSED_THROUGH_INSTANCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICACCESSEDTHROUGHINSTANCECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp index e9a2eae11bfde..abc9f6709125b 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.cpp @@ -45,8 +45,8 @@ void StaticDefinitionInAnonymousNamespaceCheck::check( while (Loc < Def->getSourceRange().getEnd() && !Lexer::getRawToken(Loc, Tok, *Result.SourceManager, getLangOpts(), true)) { - SourceRange TokenRange(Tok.getLocation(), Tok.getEndLoc()); - StringRef SourceText = + const SourceRange TokenRange(Tok.getLocation(), Tok.getEndLoc()); + const StringRef SourceText = Lexer::getSourceText(CharSourceRange::getTokenRange(TokenRange), *Result.SourceManager, getLangOpts()); if (SourceText == "static") { diff --git 
a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h index 55306556fb0a6..e096682ad031f 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H #include "../ClangTidyCheck.h" @@ -34,4 +34,4 @@ class StaticDefinitionInAnonymousNamespaceCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATIC_DEFINITION_IN_ANONYMOUS_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_STATICDEFINITIONINANONYMOUSNAMESPACECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index feb248dd62411..19b47263c0089 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -150,8 +150,8 @@ static bool applyAbbreviationHeuristic( /// Check whether the shorter String is a prefix of the longer String. static bool applyPrefixHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; - StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; + const StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; + const StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; if (Longer.starts_with_insensitive(Shorter)) return percentage(Shorter.size(), Longer.size()) > Threshold; @@ -162,8 +162,8 @@ static bool applyPrefixHeuristic(StringRef Arg, StringRef Param, /// Check whether the shorter String is a suffix of the longer String. static bool applySuffixHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; - StringRef Longer = Arg.size() >= Param.size() ? Arg : Param; + const StringRef Shorter = Arg.size() < Param.size() ? Arg : Param; + const StringRef Longer = Arg.size() >= Param.size() ? 
Arg : Param; if (Longer.ends_with_insensitive(Shorter)) return percentage(Shorter.size(), Longer.size()) > Threshold; @@ -196,13 +196,13 @@ static bool applySubstringHeuristic(StringRef Arg, StringRef Param, Current.swap(Previous); } - size_t LongerLength = std::max(Arg.size(), Param.size()); + const size_t LongerLength = std::max(Arg.size(), Param.size()); return percentage(MaxLength, LongerLength) > Threshold; } static bool applyLevenshteinHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { - std::size_t LongerLength = std::max(Arg.size(), Param.size()); + const std::size_t LongerLength = std::max(Arg.size(), Param.size()); double Dist = Arg.edit_distance(Param); Dist = (1.0 - Dist / LongerLength) * 100.0; return Dist > Threshold; @@ -212,11 +212,11 @@ static bool applyLevenshteinHeuristic(StringRef Arg, StringRef Param, static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { std::size_t Match = 0, Transpos = 0; - std::ptrdiff_t ArgLen = Arg.size(); - std::ptrdiff_t ParamLen = Param.size(); + const std::ptrdiff_t ArgLen = Arg.size(); + const std::ptrdiff_t ParamLen = Param.size(); SmallVector ArgFlags(ArgLen); SmallVector ParamFlags(ParamLen); - std::ptrdiff_t Range = + const std::ptrdiff_t Range = std::max(std::ptrdiff_t{0}, (std::max(ArgLen, ParamLen) / 2) - 1); // Calculate matching characters. @@ -252,7 +252,7 @@ static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, Transpos /= 2; // Jaro distance. - double MatchD = Match; + const double MatchD = Match; double Dist = ((MatchD / ArgLen) + (MatchD / ParamLen) + ((MatchD - Transpos) / Match)) / 3.0; @@ -347,7 +347,7 @@ static bool arePointersStillQualCompatible(QualType ArgType, QualType ParamType, // The types are compatible, if the parameter is at least as qualified as the // argument, and if it is more qualified, it has to be const on upper pointer // levels. - bool AreTypesQualCompatible = + const bool AreTypesQualCompatible = ParamType.isAtLeastAsQualifiedAs(ArgType, Ctx) && (!ParamType.hasQualifiers() || IsParamContinuouslyConst); // Check whether the parameter's constness continues at the current pointer @@ -401,7 +401,7 @@ static bool areTypesCompatible(QualType ArgType, QualType ParamType, if (!areRefAndQualCompatible(ArgType, ParamType, Ctx)) return false; - bool IsParamReference = ParamType->isReferenceType(); + const bool IsParamReference = ParamType->isReferenceType(); // Reference-ness has already been checked and should be removed // before further checking. @@ -438,7 +438,7 @@ static bool areTypesCompatible(QualType ArgType, QualType ParamType, if (IsParamReference && ParamType->isArrayType()) return isCompatibleWithArrayReference(ArgType, ParamType, Ctx); - bool IsParamContinuouslyConst = + const bool IsParamContinuouslyConst = !IsParamReference || ParamType.getNonReferenceType().isConstQualified(); // Remove the first level of indirection. @@ -513,9 +513,9 @@ SuspiciousCallArgumentCheck::SuspiciousCallArgumentCheck( SmallString<32> Key = HeuristicToString[Idx]; Key.append(BK == BoundKind::DissimilarBelow ? "DissimilarBelow" : "SimilarAbove"); - int8_t Default = BK == BoundKind::DissimilarBelow - ? Defaults[Idx].DissimilarBelow - : Defaults[Idx].SimilarAbove; + const int8_t Default = BK == BoundKind::DissimilarBelow + ? 
Defaults[Idx].DissimilarBelow + : Defaults[Idx].SimilarAbove; return Options.get(Key, Default); }; for (std::size_t Idx = 0; Idx < HeuristicCount; ++Idx) { @@ -527,7 +527,7 @@ SuspiciousCallArgumentCheck::SuspiciousCallArgumentCheck( GetBoundOpt(H, BoundKind::SimilarAbove))); } - for (StringRef Abbreviation : optutils::parseStringList( + for (const StringRef Abbreviation : optutils::parseStringList( Options.get("Abbreviations", DefaultAbbreviations))) { auto KeyAndValue = Abbreviation.split("="); assert(!KeyAndValue.first.empty() && !KeyAndValue.second.empty()); @@ -652,7 +652,7 @@ void SuspiciousCallArgumentCheck::check( if (ArgNames.empty()) return; - std::size_t ParamCount = ParamNames.size(); + const std::size_t ParamCount = ParamNames.size(); // Check similarity. for (std::size_t I = 0; I < ParamCount; ++I) { @@ -673,9 +673,9 @@ void SuspiciousCallArgumentCheck::check( << MatchedCallExpr->getArg(J)->getSourceRange(); // Note at the functions declaration. - SourceLocation IParNameLoc = + const SourceLocation IParNameLoc = CalleeFuncDecl->getParamDecl(I)->getLocation(); - SourceLocation JParNameLoc = + const SourceLocation JParNameLoc = CalleeFuncDecl->getParamDecl(J)->getLocation(); diag(CalleeFuncDecl->getLocation(), "in the call to %0, declared here", @@ -697,7 +697,7 @@ void SuspiciousCallArgumentCheck::setParamNamesAndTypes( for (const ParmVarDecl *Param : CalleeFuncDecl->parameters()) { ParamTypes.push_back(Param->getType()); - if (IdentifierInfo *II = Param->getIdentifier()) + if (const IdentifierInfo *II = Param->getIdentifier()) ParamNames.push_back(II->getName()); else ParamNames.push_back(StringRef()); @@ -759,16 +759,16 @@ bool SuspiciousCallArgumentCheck::areParamAndArgComparable( bool SuspiciousCallArgumentCheck::areArgsSwapped(std::size_t Position1, std::size_t Position2) const { - for (Heuristic H : AppliedHeuristics) { - bool A1ToP2Similar = areNamesSimilar( + for (const Heuristic H : AppliedHeuristics) { + const bool A1ToP2Similar = areNamesSimilar( ArgNames[Position2], ParamNames[Position1], H, BoundKind::SimilarAbove); - bool A2ToP1Similar = areNamesSimilar( + const bool A2ToP1Similar = areNamesSimilar( ArgNames[Position1], ParamNames[Position2], H, BoundKind::SimilarAbove); - bool A1ToP1Dissimilar = + const bool A1ToP1Dissimilar = !areNamesSimilar(ArgNames[Position1], ParamNames[Position1], H, BoundKind::DissimilarBelow); - bool A2ToP2Dissimilar = + const bool A2ToP2Dissimilar = !areNamesSimilar(ArgNames[Position2], ParamNames[Position2], H, BoundKind::DissimilarBelow); diff --git a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h index ab6449e3fd416..28742087bad52 100644 --- a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H #include "../ClangTidyCheck.h" @@ -37,4 +37,4 @@ class UniqueptrDeleteReleaseCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTR_DELETE_RELEASE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_UNIQUEPTRDELETERELEASECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp index 740a68d852c9e..db226f97818c5 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp @@ -110,7 +110,7 @@ shouldReplaceLiteralSuffix(const Expr &Literal, ReplacementDsc.LiteralLocation = L.getSourceRange(); // Was this literal fully spelled or is it a product of macro expansion? - bool RangeCanBeFixed = + const bool RangeCanBeFixed = utils::rangeCanBeFixed(ReplacementDsc.LiteralLocation, &SM); // The literal may have macro expansion, we need the final expanded src range. diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp index 82eb6de8fa3dc..fe03f2194e1b8 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.cpp @@ -20,7 +20,7 @@ namespace { /// followed by a Stmt matching the inner matcher. AST_MATCHER_P(Stmt, nextStmt, ast_matchers::internal::Matcher<Stmt>, InnerMatcher) { - DynTypedNodeList Parents = Finder->getASTContext().getParents(Node); + const DynTypedNodeList Parents = Finder->getASTContext().getParents(Node); if (Parents.size() != 1) return false; diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h index f431311b4282a..32983e48450f1 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H #include "../ClangTidyCheck.h" #include "../utils/IncludeInserter.h" @@ -33,4 +33,4 @@ class UseAnyOfAllOfCheck : public ClangTidyCheck { } // namespace clang::tidy::readability -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEALGORITHMCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USEANYOFALLOFCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp index 40aaff4cb3893..ef495d3bf0f6e 100644 --- a/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseConcisePreprocessorDirectivesCheck.cpp @@ -91,7 +91,10 @@ class IfPreprocessorCallbacks final : public PPCallbacks { Check.diag( DirectiveLoc, "preprocessor condition can be written more concisely using '#%0'") - << FixItHint::CreateReplacement(DirectiveLoc, Replacements[Inverted]) + << FixItHint::CreateReplacement( CharSourceRange::getCharRange(DirectiveLoc, ConditionRange.getBegin()), (Replacements[Inverted].str() + " ")) << FixItHint::CreateReplacement(ConditionRange, Macro) << Replacements[Inverted]; } diff --git 
a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp index 8052e04c99f43..5a7add88d6eeb 100644 --- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.cpp @@ -77,11 +77,11 @@ static QualType getNonTemplateAlias(QualType QT) { static QualType getReplacementCastType(const Expr *CondLhs, const Expr *CondRhs, QualType ComparedType) { - QualType LhsType = CondLhs->getType(); - QualType RhsType = CondRhs->getType(); - QualType LhsCanonicalType = + const QualType LhsType = CondLhs->getType(); + const QualType RhsType = CondRhs->getType(); + const QualType LhsCanonicalType = LhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType(); - QualType RhsCanonicalType = + const QualType RhsCanonicalType = RhsType.getCanonicalType().getNonReferenceType().getUnqualifiedType(); QualType GlobalImplicitCastType; if (LhsCanonicalType != RhsCanonicalType) { @@ -109,7 +109,7 @@ static std::string createReplacement(const Expr *CondLhs, const Expr *CondRhs, const llvm::StringRef AssignLhsStr = Lexer::getSourceText( Source.getExpansionRange(AssignLhs->getSourceRange()), Source, LO); - QualType GlobalImplicitCastType = + const QualType GlobalImplicitCastType = getReplacementCastType(CondLhs, CondRhs, BO->getLHS()->getType()); return (AssignLhsStr + " = " + FunctionName + diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index 1ae8756c339e7..bc6bd164e24f8 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -390,7 +390,7 @@ static void printStats(const ClangTidyStats &Stats) { static std::unique_ptr<ClangTidyOptionsProvider> createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { ClangTidyGlobalOptions GlobalOptions; - if (std::error_code Err = parseLineFilter(LineFilter, GlobalOptions)) { + if (const std::error_code Err = parseLineFilter(LineFilter, GlobalOptions)) { llvm::errs() << "Invalid LineFilter: " << Err.message() << "\n\nUsage:\n"; llvm::cl::PrintHelpMessage(/*Hidden=*/false, /*Categorized=*/true); return nullptr; } @@ -448,7 +448,7 @@ createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text = llvm::MemoryBuffer::getFile(ConfigFile); - if (std::error_code EC = Text.getError()) { + if (const std::error_code EC = Text.getError()) { llvm::errs() << "Error: can't read config-file '" << ConfigFile << "': " << EC.message() << "\n"; return nullptr; } @@ -466,10 +466,9 @@ createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) { } static llvm::IntrusiveRefCntPtr<vfs::FileSystem> -getVfsFromFile(const std::string &OverlayFile, - llvm::IntrusiveRefCntPtr<vfs::FileSystem> BaseFS) { +getVfsFromFile(const std::string &OverlayFile, vfs::FileSystem &BaseFS) { llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer = - BaseFS->getBufferForFile(OverlayFile); + BaseFS.getBufferForFile(OverlayFile); if (!Buffer) { llvm::errs() << "Can't load virtual filesystem overlay file '" << OverlayFile << "': " << Buffer.getError().message() @@ -491,7 +490,7 @@ static StringRef closest(StringRef Value, const StringSet<> &Allowed) { unsigned MaxEdit = 5U; StringRef Closest; for (auto Item : Allowed.keys()) { - unsigned Cur = Value.edit_distance_insensitive(Item, true, MaxEdit); + const unsigned Cur = Value.edit_distance_insensitive(Item, true, MaxEdit); if (Cur < MaxEdit) { Closest = Item; MaxEdit = Cur; } @@ -504,7 +503,7 @@ static constexpr StringLiteral VerifyConfigWarningEnd = " [-verify-config]\n"; static bool 
verifyChecks(const StringSet<> &AllChecks, StringRef CheckGlob, StringRef Source) { - GlobList Globs(CheckGlob); + const GlobList Globs(CheckGlob); bool AnyInvalid = false; for (const auto &Item : Globs.getItems()) { if (Item.Text.starts_with("clang-diagnostic")) @@ -520,7 +519,7 @@ static bool verifyChecks(const StringSet<> &AllChecks, StringRef CheckGlob, llvm::raw_ostream &Output = llvm::WithColor::warning(llvm::errs(), Source) << "unknown check '" << Item.Text << '\''; - llvm::StringRef Closest = closest(Item.Text, AllChecks); + const llvm::StringRef Closest = closest(Item.Text, AllChecks); if (!Closest.empty()) Output << "; did you mean '" << Closest << '\''; Output << VerifyConfigWarningEnd; @@ -560,7 +559,7 @@ static bool verifyOptions(const llvm::StringSet<> &ValidOptions, AnyInvalid = true; auto &Output = llvm::WithColor::warning(llvm::errs(), Source) << "unknown check option '" << Key << '\''; - llvm::StringRef Closest = closest(Key, ValidOptions); + const llvm::StringRef Closest = closest(Key, ValidOptions); if (!Closest.empty()) Output << "; did you mean '" << Closest << '\''; Output << VerifyConfigWarningEnd; @@ -572,7 +571,7 @@ static SmallString<256> makeAbsolute(llvm::StringRef Input) { if (Input.empty()) return {}; SmallString<256> AbsolutePath(Input); - if (std::error_code EC = llvm::sys::fs::make_absolute(AbsolutePath)) { + if (const std::error_code EC = llvm::sys::fs::make_absolute(AbsolutePath)) { llvm::errs() << "Can't make absolute path from " << Input << ": " << EC.message() << "\n"; } @@ -585,7 +584,7 @@ static llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> createBaseFS() { if (!VfsOverlay.empty()) { IntrusiveRefCntPtr<vfs::FileSystem> VfsFromFile = - getVfsFromFile(VfsOverlay, BaseFS); + getVfsFromFile(VfsOverlay, *BaseFS); if (!VfsFromFile) return nullptr; BaseFS->pushOverlay(std::move(VfsFromFile)); @@ -594,7 +593,7 @@ int clangTidyMain(int argc, const char **argv) { - llvm::InitLLVM X(argc, argv); + const llvm::InitLLVM X(argc, argv); SmallVector<const char *> Args{argv, argv + argc}; // expand parameters file to argc and argv. 
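The getVfsFromFile hunks above change the parameter from a ref-counted handle to a plain vfs::FileSystem reference: the function only reads through the filesystem and never shares ownership, so callers dereference the owning pointer instead of copying it and bumping the reference count. A minimal sketch of the pattern, using std::shared_ptr and a stand-in type rather than the LLVM ones:

    #include <memory>

    struct FileSystem {
      int OpenFiles = 0; // stand-in for the real VFS state
    };

    // Non-owning read access: take a reference, not the smart pointer.
    static int countOpenFiles(const FileSystem &FS) { return FS.OpenFiles; }

    int main() {
      // The owner keeps the ref-counted handle (shared_ptr here stands in
      // for llvm::IntrusiveRefCntPtr)...
      auto BaseFS = std::make_shared<FileSystem>();
      // ...and call sites pass *BaseFS, as the patch does with
      // getVfsFromFile(VfsOverlay, *BaseFS).
      return countOpenFiles(*BaseFS);
    }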
@@ -623,7 +622,8 @@ int clangTidyMain(int argc, const char **argv) { return 1; } - llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> BaseFS = createBaseFS(); + const llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> BaseFS = + createBaseFS(); if (!BaseFS) return 1; @@ -632,7 +632,7 @@ int clangTidyMain(int argc, const char **argv) { if (!OptionsProvider) return 1; - SmallString<256> ProfilePrefix = makeAbsolute(StoreCheckProfile); + const SmallString<256> ProfilePrefix = makeAbsolute(StoreCheckProfile); StringRef FileName("dummy"); auto PathList = OptionsParser->getSourcePathList(); @@ -640,10 +640,10 @@ int clangTidyMain(int argc, const char **argv) { FileName = PathList.front(); } - SmallString<256> FilePath = makeAbsolute(FileName); + const SmallString<256> FilePath = makeAbsolute(FileName); ClangTidyOptions EffectiveOptions = OptionsProvider->getOptions(FilePath); - std::vector<std::string> EnabledChecks = + const std::vector<std::string> EnabledChecks = getCheckNames(EffectiveOptions, AllowEnablingAnalyzerAlphaCheckers, ExperimentalCustomChecks); @@ -687,9 +687,9 @@ int clangTidyMain(int argc, const char **argv) { } if (VerifyConfig) { - std::vector<ClangTidyOptionsProvider::OptionsSource> RawOptions = + const std::vector<ClangTidyOptionsProvider::OptionsSource> RawOptions = OptionsProvider->getRawOptions(FileName); - ChecksAndOptions Valid = getAllChecksAndOptions( + const ChecksAndOptions Valid = getAllChecksAndOptions( AllowEnablingAnalyzerAlphaCheckers, ExperimentalCustomChecks); bool AnyInvalid = false; for (const auto &[Opts, Source] : RawOptions) { @@ -733,14 +733,14 @@ int clangTidyMain(int argc, const char **argv) { std::vector<ClangTidyError> Errors = runClangTidy(Context, OptionsParser->getCompilations(), PathList, BaseFS, FixNotes, EnableCheckProfile, ProfilePrefix, Quiet); - bool FoundErrors = llvm::any_of(Errors, [](const ClangTidyError &E) { + const bool FoundErrors = llvm::any_of(Errors, [](const ClangTidyError &E) { return E.DiagLevel == ClangTidyError::Error; }); // --fix-errors and --fix-notes imply --fix. - FixBehaviour Behaviour = FixNotes ? FB_FixNotes - : (Fix || FixErrors) ? FB_Fix - : FB_NoFix; + const FixBehaviour Behaviour = FixNotes ? FB_FixNotes + : (Fix || FixErrors) ? FB_Fix + : FB_NoFix; const bool DisableFixes = FoundErrors && !FixErrors; @@ -769,7 +769,7 @@ int clangTidyMain(int argc, const char **argv) { if (WErrorCount) { if (!Quiet) { - StringRef Plural = WErrorCount == 1 ? "" : "s"; + const StringRef Plural = WErrorCount == 1 ? "" : "s"; llvm::errs() << WErrorCount << " warning" << Plural << " treated as error" << Plural << "\n"; } diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h index f86828e8c46e9..44b7a379e5277 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h @@ -14,8 +14,13 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H + namespace clang::tidy { int clangTidyMain(int argc, const char **argv); } // namespace clang::tidy + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TOOL_CLANGTIDYMAIN_H diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp index d5deb99a8442d..26278139c04c3 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp @@ -67,7 +67,7 @@ bool rangeIsEntirelyWithinMacroArgument(SourceRange Range, // Check if the range is entirely contained within a macro argument. 
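// Illustrative case, not part of the patch: given
//   #define PLUS(x) ((x) + (x))
//   PLUS(a * b)
// a range covering `a * b` is spelled entirely inside the macro argument,
// so both of its ends are macro-argument expansions that start at the same
// spot, which is what the checks below establish.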
SourceLocation MacroArgExpansionStartForRangeBegin; SourceLocation MacroArgExpansionStartForRangeEnd; - bool RangeIsEntirelyWithinMacroArgument = + const bool RangeIsEntirelyWithinMacroArgument = SM && SM->isMacroArgExpansion(Range.getBegin(), &MacroArgExpansionStartForRangeBegin) && diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.h b/clang-tools-extra/clang-tidy/utils/ASTUtils.h index c2127f0746986..808cd4a54fd1e 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.h +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H #include "clang/AST/AST.h" @@ -47,4 +47,4 @@ findOutermostIndirectFieldDeclForField(const FieldDecl *FD); } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp index 14770c49c2e25..d0659ad94b86a 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.cpp @@ -48,7 +48,8 @@ FixItHint BraceInsertionHints::closingBraceFixIt() const { static tok::TokenKind getTokenKind(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { Token Tok; - SourceLocation Beginning = Lexer::GetBeginningOfToken(Loc, SM, LangOpts); + const SourceLocation Beginning = + Lexer::GetBeginningOfToken(Loc, SM, LangOpts); const bool Invalid = Lexer::getRawToken(Beginning, Tok, SM, LangOpts); assert(!Invalid && "Expected a valid token."); @@ -77,15 +78,16 @@ static SourceLocation findEndLocation(const Stmt &S, const SourceManager &SM, // EOL, insert brace before. break; } - tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); + const tok::TokenKind TokKind = getTokenKind(Loc, SM, LangOpts); if (TokKind != tok::comment) { // Non-comment token, insert brace before. break; } - SourceLocation TokEndLoc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); - SourceRange TokRange(Loc, TokEndLoc); - StringRef Comment = Lexer::getSourceText( + const SourceLocation TokEndLoc = + Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); + const SourceRange TokRange(Loc, TokEndLoc); + const StringRef Comment = Lexer::getSourceText( CharSourceRange::getTokenRange(TokRange), SM, LangOpts); if (Comment.starts_with("/*") && Comment.contains('\n')) { // Multi-line block comment, insert brace before. @@ -139,7 +141,7 @@ BraceInsertionHints getBraceInsertionsHints(const Stmt *const S, // StartLoc points at the location of the opening brace to be inserted. 
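// Illustrative effect of the hints built here, not part of the patch:
//   if (Cond) return;   ->   if (Cond) { return; }
// OpeningBracePos marks where the '{' goes; EndLoc below is where the
// matching '}' is inserted.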
SourceLocation EndLoc; - std::string ClosingInsertion; + StringRef ClosingInsertion; if (EndLocHint.isValid()) { EndLoc = EndLocHint; ClosingInsertion = "} "; diff --git a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h index 53ce2e0ea859c..2b2d71f3cf7b2 100644 --- a/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h +++ b/clang-tools-extra/clang-tidy/utils/BracesAroundStatement.h @@ -11,6 +11,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H + #include "clang/AST/Stmt.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/SourceLocation.h" @@ -36,7 +39,7 @@ struct BraceInsertionHints { /// Constructor for a hint offering fix-its for brace insertion. Both /// positions must be valid. BraceInsertionHints(SourceLocation OpeningBracePos, - SourceLocation ClosingBracePos, std::string ClosingBrace) + SourceLocation ClosingBracePos, StringRef ClosingBrace) : DiagnosticPos(OpeningBracePos), OpeningBracePos(OpeningBracePos), ClosingBracePos(ClosingBracePos), ClosingBrace(ClosingBrace) { assert(offersFixIts()); @@ -61,7 +64,7 @@ struct BraceInsertionHints { private: SourceLocation OpeningBracePos; SourceLocation ClosingBracePos; - std::string ClosingBrace; + StringRef ClosingBrace; }; /// Create fix-it hints for braces that wrap the given statement when applied. @@ -73,3 +76,5 @@ getBraceInsertionsHints(const Stmt *S, const LangOptions &LangOpts, SourceLocation EndLocHint = SourceLocation()); } // namespace clang::tidy::utils + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_BRACESAROUNDSTATEMENT_H diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index a5b08836db2c8..75a6dafed3c5e 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -63,7 +63,7 @@ static bool hasSameParameterTypes(const CXXMethodDecl &D, static const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { assert(!D.isConst()); - DeclContext::lookup_result LookupResult = + const DeclContext::lookup_result LookupResult = D.getParent()->lookup(D.getNameInfo().getName()); if (LookupResult.isSingleResult()) { // No overload. 
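The BracesAroundStatement change just above can store a StringRef because ClosingBrace only ever refers to string literals (such as "}" or "} "), which have static storage duration; a StringRef into a temporary std::string would dangle. A minimal sketch of the distinction, independent of the patch:

    #include "llvm/ADT/StringRef.h"
    #include <string>

    // Fine: string literals outlive every call, so the returned view never
    // dangles.
    static llvm::StringRef closingBrace(bool WithSpace) {
      return WithSpace ? "} " : "}";
    }

    // Dangerous counterexample (do not do this): returns a view into a
    // std::string that dies at the end of the full expression.
    // static llvm::StringRef dangling() { return std::string("}"); }

    int main() { return static_cast<int>(closingBrace(true).size()); }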
diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h index 910960137ddbb..1960eabf074c5 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.h @@ -11,6 +11,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H + #include "clang/AST/Expr.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" @@ -40,3 +43,5 @@ llvm::DenseMap<clang::SourceLocation, std::string> getUnwrittenDesignators(const clang::InitListExpr *Syn); } // namespace clang::tidy::utils + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_DESIGNATEDINITIALIZERS_H diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp index fd4320eb8144b..c774f54b1da5a 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp @@ -44,10 +44,9 @@ ExceptionAnalyzer::ExceptionInfo &ExceptionAnalyzer::ExceptionInfo::merge( } // FIXME: This could be ported to clang later. -namespace { -bool isUnambiguousPublicBaseClass(const Type *DerivedType, - const Type *BaseType) { +static bool isUnambiguousPublicBaseClass(const Type *DerivedType, + const Type *BaseType) { const auto *DerivedClass = DerivedType->getCanonicalTypeUnqualified()->getAsCXXRecordDecl(); const auto *BaseClass = @@ -78,11 +77,11 @@ bool isUnambiguousPublicBaseClass(const Type *DerivedType, IsPublicBaseClass; } -inline bool isPointerOrPointerToMember(const Type *T) { +static bool isPointerOrPointerToMember(const Type *T) { return T->isPointerType() || T->isMemberPointerType(); } -std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { +static std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { if (T->isAnyPointerType() || T->isMemberPointerType()) return T->getPointeeType(); @@ -92,7 +91,7 @@ std::optional<QualType> getPointeeOrArrayElementQualType(QualType T) { return std::nullopt; } -bool isBaseOf(const Type *DerivedType, const Type *BaseType) { +static bool isBaseOf(const Type *DerivedType, const Type *BaseType) { const auto *DerivedClass = DerivedType->getAsCXXRecordDecl(); const auto *BaseClass = BaseType->getAsCXXRecordDecl(); if (!DerivedClass || !BaseClass) @@ -103,12 +102,12 @@ bool isBaseOf(const Type *DerivedType, const Type *BaseType) { } // Check if T1 is more or Equally qualified than T2. 
-bool moreOrEquallyQualified(QualType T1, QualType T2) { +static bool moreOrEquallyQualified(QualType T1, QualType T2) { return T1.getQualifiers().isStrictSupersetOf(T2.getQualifiers()) || T1.getQualifiers() == T2.getQualifiers(); } -bool isStandardPointerConvertible(QualType From, QualType To) { +static bool isStandardPointerConvertible(QualType From, QualType To) { assert((From->isPointerType() || From->isMemberPointerType()) && (To->isPointerType() || To->isMemberPointerType()) && "Pointer conversion should be performed on pointer types only."); @@ -150,7 +149,7 @@ bool isStandardPointerConvertible(QualType From, QualType To) { return false; } -bool isFunctionPointerConvertible(QualType From, QualType To) { +static bool isFunctionPointerConvertible(QualType From, QualType To) { if (!From->isFunctionPointerType() && !From->isFunctionType() && !From->isMemberFunctionPointerType()) return false; @@ -192,8 +191,8 @@ bool isFunctionPointerConvertible(QualType From, QualType To) { // from the C rules. // // The function should only be called in C++ mode. -bool isQualificationConvertiblePointer(QualType From, QualType To, - LangOptions LangOpts) { +static bool isQualificationConvertiblePointer(QualType From, QualType To, + const LangOptions &LangOpts) { // [N4659 7.5 (1)] // A cv-decomposition of a type T is a sequence of cv_i and P_i such that T is @@ -320,7 +319,6 @@ bool isQualificationConvertiblePointer(QualType From, QualType To, return From.getTypePtr() == To.getTypePtr(); } -} // namespace static bool canThrow(const FunctionDecl *Func) { // consteval specifies that every call to the function must produce a @@ -362,8 +360,9 @@ ExceptionAnalyzer::ExceptionInfo::filterByCatch(const Type *HandlerTy, llvm::SmallVector TypesToDelete; for (const auto &ThrownException : ThrownExceptions) { const Type *ExceptionTy = ThrownException.getFirst(); - CanQualType ExceptionCanTy = ExceptionTy->getCanonicalTypeUnqualified(); - CanQualType HandlerCanTy = HandlerTy->getCanonicalTypeUnqualified(); + const CanQualType ExceptionCanTy = + ExceptionTy->getCanonicalTypeUnqualified(); + const CanQualType HandlerCanTy = HandlerTy->getCanonicalTypeUnqualified(); // The handler is of type cv T or cv T& and E and T are the same type // (ignoring the top-level cv-qualifiers) ... @@ -478,7 +477,7 @@ ExceptionAnalyzer::ExceptionInfo ExceptionAnalyzer::throwsException( // For a constructor, we also have to check the initializers. if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Func)) { for (const CXXCtorInitializer *Init : Ctor->inits()) { - ExceptionInfo Excs = + const ExceptionInfo Excs = throwsException(Init->getInit(), Caught, CallStack); Result.merge(Excs); } @@ -535,7 +534,7 @@ ExceptionAnalyzer::throwsException(const Stmt *St, // Everything is caught through 'catch(...)'. 
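// Illustrative pattern for the catch-all handling below, not part of the
// patch:
//   try { mayThrow(); }      // every exception lands in the handler...
//   catch (...) { throw; }   // ...but a rethrow escapes again,
// which is why the handler block itself is analyzed and its results merged
// before the set of uncaught exceptions is cleared.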
if (!Catch->getExceptionDecl()) { - ExceptionInfo Rethrown = throwsException( + const ExceptionInfo Rethrown = throwsException( Catch->getHandlerBlock(), Uncaught.getExceptions(), CallStack); Results.merge(Rethrown); Uncaught.clear(); @@ -556,53 +555,59 @@ ExceptionAnalyzer::throwsException(const Stmt *St, Uncaught.filterByCatch(CaughtType, Catch->getExceptionDecl()->getASTContext()); if (!FilteredExceptions.empty()) { - ExceptionInfo Rethrown = throwsException( + const ExceptionInfo Rethrown = throwsException( Catch->getHandlerBlock(), FilteredExceptions, CallStack); Results.merge(Rethrown); } } } Results.merge(Uncaught); - } else if (const auto *Call = dyn_cast<CallExpr>(St)) { - if (const FunctionDecl *Func = Call->getDirectCallee()) { - ExceptionInfo Excs = - throwsException(Func, Caught, CallStack, Call->getBeginLoc()); - Results.merge(Excs); - } - } else if (const auto *Construct = dyn_cast<CXXConstructExpr>(St)) { - ExceptionInfo Excs = throwsException(Construct->getConstructor(), Caught, - CallStack, Construct->getBeginLoc()); - Results.merge(Excs); } else if (const auto *DefaultInit = dyn_cast<CXXDefaultInitExpr>(St)) { - ExceptionInfo Excs = + const ExceptionInfo Excs = throwsException(DefaultInit->getExpr(), Caught, CallStack); Results.merge(Excs); } else if (const auto *Coro = dyn_cast<CoroutineBodyStmt>(St)) { for (const Stmt *Child : Coro->childrenExclBody()) { if (Child != Coro->getExceptionHandler()) { - ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Child, Caught, CallStack); Results.merge(Excs); } } - ExceptionInfo Excs = throwsException(Coro->getBody(), Caught, CallStack); + const ExceptionInfo Excs = + throwsException(Coro->getBody(), Caught, CallStack); Results.merge(throwsException(Coro->getExceptionHandler(), Excs.getExceptions(), CallStack)); for (const auto &Exception : Excs.getExceptions()) { const Type *ExcType = Exception.getFirst(); if (const CXXRecordDecl *ThrowableRec = ExcType->getAsCXXRecordDecl()) { - ExceptionInfo DestructorExcs = throwsException( + const ExceptionInfo DestructorExcs = throwsException( ThrowableRec->getDestructor(), Caught, CallStack, SourceLocation{}); Results.merge(DestructorExcs); } } } else if (const auto *Lambda = dyn_cast<LambdaExpr>(St)) { for (const Stmt *Init : Lambda->capture_inits()) { - ExceptionInfo Excs = throwsException(Init, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Init, Caught, CallStack); Results.merge(Excs); } } else { + // Check whether any of this node's subexpressions throws. for (const Stmt *Child : St->children()) { - ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + const ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + Results.merge(Excs); + } + + // If this node is a call to a function or constructor, also check + // whether the call itself throws. 
+ if (const auto *Call = dyn_cast<CallExpr>(St)) { + if (const FunctionDecl *Func = Call->getDirectCallee()) { + ExceptionInfo Excs = + throwsException(Func, Caught, CallStack, Call->getBeginLoc()); + Results.merge(Excs); + } + } else if (const auto *Construct = dyn_cast<CXXConstructExpr>(St)) { + ExceptionInfo Excs = throwsException(Construct->getConstructor(), Caught, + CallStack, Construct->getBeginLoc()); Results.merge(Excs); } } diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h index 1ab6dcb2eb255..1a277c8a6d3b2 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -158,4 +158,4 @@ class ExceptionAnalyzer { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_ANALYZER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONANALYZER_H diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp index b1d6b195f9470..2da09669dd7f8 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.cpp @@ -20,7 +20,7 @@ ExceptionSpecAnalyzer::analyze(const FunctionDecl *FuncDecl) { const auto [CacheEntry, NotFound] = FunctionCache.try_emplace(FuncDecl, State::NotThrowing); if (NotFound) { - ExceptionSpecAnalyzer::State State = analyzeImpl(FuncDecl); + const ExceptionSpecAnalyzer::State State = analyzeImpl(FuncDecl); // Update result with calculated value FunctionCache[FuncDecl] = State; return State; } @@ -87,20 +87,20 @@ ExceptionSpecAnalyzer::analyzeRecord(const CXXRecordDecl *RecordDecl, return analyze(MethodDecl); for (const auto &BaseSpec : RecordDecl->bases()) { - State Result = analyzeBase(BaseSpec, Kind); + const State Result = analyzeBase(BaseSpec, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } for (const auto &BaseSpec : RecordDecl->vbases()) { - State Result = analyzeBase(BaseSpec, Kind); + const State Result = analyzeBase(BaseSpec, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } for (const auto *FDecl : RecordDecl->fields()) if (!FDecl->isInvalidDecl() && !FDecl->isUnnamedBitField()) { - State Result = analyzeFieldDecl(FDecl, Kind); + const State Result = analyzeFieldDecl(FDecl, Kind); if (Result == State::Throwing || Result == State::Unknown) return Result; } diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h index 3fd6fe170c734..06d11c888a0c1 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h +++ b/clang-tools-extra/clang-tidy/utils/ExceptionSpecAnalyzer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef 
LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H #include "clang/AST/DeclCXX.h" #include "llvm/ADT/DenseMap.h" @@ -86,4 +86,4 @@ class ExceptionSpecAnalyzer { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTION_SPEC_ANALYZER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXCEPTIONSPECANALYZER_H diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index 46eebf4e7a86e..0375d0f6c740f 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -29,13 +29,13 @@ static SmallVector getParentStmts(const Stmt *S, ASTContext *Context) { SmallVector Result; - TraversalKindScope RAII(*Context, TK_AsIs); + const TraversalKindScope RAII(*Context, TK_AsIs); DynTypedNodeList Parents = Context->getParents(*S); SmallVector NodesToProcess(Parents.begin(), Parents.end()); while (!NodesToProcess.empty()) { - DynTypedNode Node = NodesToProcess.back(); + const DynTypedNode Node = NodesToProcess.back(); NodesToProcess.pop_back(); if (const auto *S = Node.get<Stmt>()) { @@ -95,7 +95,8 @@ bool ExprSequence::inSequence(const Stmt *Before, const Stmt *After) const { return true; } - SmallVector BeforeParents = getParentStmts(Before, Context); + const SmallVector BeforeParents = + getParentStmts(Before, Context); // Since C++17, the callee of a call expression is guaranteed to be sequenced // before all of the arguments. diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.h b/clang-tools-extra/clang-tidy/utils/ExprSequence.h index 9ef94e0e3bcde..2aea99e1440c1 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.h +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H #include "clang/Analysis/CFG.h" #include "clang/Lex/Lexer.h" @@ -117,4 +117,4 @@ class StmtToBlockMap { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_EXPRSEQUENCE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_EXPRSEQUENCE_H diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp index 41d5131599ce6..97be36a06a89d 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.cpp @@ -15,19 +15,19 @@ namespace clang::tidy::utils { bool isExpansionLocInHeaderFile(SourceLocation Loc, const SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { - SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc); + const SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc); return isFileExtension(SM.getFilename(ExpansionLoc), HeaderFileExtensions); } bool isPresumedLocInHeaderFile(SourceLocation Loc, SourceManager &SM, const FileExtensionsSet &HeaderFileExtensions) { - PresumedLoc PresumedLocation = SM.getPresumedLoc(Loc); + const PresumedLoc PresumedLocation = SM.getPresumedLoc(Loc); return isFileExtension(PresumedLocation.getFilename(), HeaderFileExtensions); } bool isSpellingLocInHeaderFile(SourceLocation Loc, SourceManager &SM, const FileExtensionsSet 
&HeaderFileExtensions) { - SourceLocation SpellingLoc = SM.getSpellingLoc(Loc); + const SourceLocation SpellingLoc = SM.getSpellingLoc(Loc); return isFileExtension(SM.getFilename(SpellingLoc), HeaderFileExtensions); } @@ -35,7 +35,7 @@ bool parseFileExtensions(StringRef AllFileExtensions, FileExtensionsSet &FileExtensions, StringRef Delimiters) { SmallVector Suffixes; - for (char Delimiter : Delimiters) { + for (const char Delimiter : Delimiters) { if (AllFileExtensions.contains(Delimiter)) { AllFileExtensions.split(Suffixes, Delimiter); break; @@ -43,7 +43,7 @@ bool parseFileExtensions(StringRef AllFileExtensions, } FileExtensions.clear(); - for (StringRef Suffix : Suffixes) { + for (const StringRef Suffix : Suffixes) { StringRef Extension = Suffix.trim(); if (!llvm::all_of(Extension, isAlphanumeric)) return false; diff --git a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h index dfab141e32417..425c29c3b3b19 100644 --- a/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/FileExtensionsUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H #include "../FileExtensionsSet.h" #include "clang/Basic/SourceLocation.h" @@ -60,4 +60,4 @@ bool isFileExtension(StringRef FileName, } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILE_EXTENSIONS_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FILEEXTENSIONSUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp index b30c83e3aeb35..c4cdf0d63af4c 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.cpp @@ -140,7 +140,7 @@ changePointer(const VarDecl &Var, Qualifiers::TQ Qualifier, const Type *Pointee, // the `*` token and placing the `const` left of it. // (`int const* p = nullptr;`) if (QualPolicy == QualifierPolicy::Right) { - SourceLocation BeforeStar = lexer::findPreviousTokenKind( + const SourceLocation BeforeStar = lexer::findPreviousTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::star); if (locDangerous(BeforeStar)) @@ -161,7 +161,7 @@ changePointer(const VarDecl &Var, Qualifiers::TQ Qualifier, const Type *Pointee, // is the same as 'QualPolicy == Right && isValueType(Pointee)'. // The `const` must be left of the last `*` token. 
// (`int const* p = nullptr;`) - SourceLocation BeforeStar = lexer::findPreviousTokenKind( + const SourceLocation BeforeStar = lexer::findPreviousTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::star); return fixIfNotDangerous(BeforeStar, buildQualifier(Qualifier, true)); @@ -178,7 +178,7 @@ changeReferencee(const VarDecl &Var, Qualifiers::TQ Qualifier, QualType Pointee, return fixIfNotDangerous(Var.getTypeSpecStartLoc(), buildQualifier(Qualifier)); - SourceLocation BeforeRef = lexer::findPreviousAnyTokenKind( + const SourceLocation BeforeRef = lexer::findPreviousAnyTokenKind( Var.getLocation(), Context.getSourceManager(), Context.getLangOpts(), tok::amp, tok::ampamp); std::optional<SourceLocation> IgnoredParens = @@ -201,7 +201,7 @@ std::optional<FixItHint> addQualifierToVarDecl(const VarDecl &Var, QualTarget == QualifierTarget::Value) && "Unexpected Target"); - QualType ParenStrippedType = Var.getType().IgnoreParens(); + const QualType ParenStrippedType = Var.getType().IgnoreParens(); if (isValueType(ParenStrippedType)) return changeValue(Var, Qualifier, QualTarget, QualPolicy, Context); diff --git a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp index f4945b2113c69..127de30cf6e42 100644 --- a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp +++ b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp @@ -245,7 +245,7 @@ FormatStringConverter::formatStringContainsUnreplaceableMacro( // inhibit conversion. The whole format string will appear to come from that // macro, as will the function call. std::optional<StringRef> MaybeSurroundingMacroName; - if (SourceLocation BeginCallLoc = Call->getBeginLoc(); + if (const SourceLocation BeginCallLoc = Call->getBeginLoc(); BeginCallLoc.isMacroID()) MaybeSurroundingMacroName = Lexer::getImmediateMacroName(BeginCallLoc, SM, PP.getLangOpts()); @@ -283,7 +283,8 @@ FormatStringConverter::formatStringContainsUnreplaceableMacro( void FormatStringConverter::emitAlignment(const PrintfSpecifier &FS, std::string &FormatSpec) { - ConversionSpecifier::Kind ArgKind = FS.getConversionSpecifier().getKind(); + const ConversionSpecifier::Kind ArgKind = + FS.getConversionSpecifier().getKind(); // We only care about alignment if a field width is specified if (FS.getFieldWidth().getHowSpecified() != OptionalAmount::NotSpecified) { @@ -499,7 +500,8 @@ bool FormatStringConverter::emitIntegerArgument( /// @returns true on success, false on failure bool FormatStringConverter::emitType(const PrintfSpecifier &FS, const Expr *Arg, std::string &FormatSpec) { - ConversionSpecifier::Kind ArgKind = FS.getConversionSpecifier().getKind(); + const ConversionSpecifier::Kind ArgKind = + FS.getConversionSpecifier().getKind(); switch (ArgKind) { case ConversionSpecifier::Kind::sArg: emitStringArgument(FS.getArgIndex() + ArgsOffset, Arg); @@ -798,7 +800,7 @@ void FormatStringConverter::applyFixes(DiagnosticBuilder &Diag, } for (const auto &[ArgIndex, Replacement] : ArgFixes) { - SourceLocation AfterOtherSide = + const SourceLocation AfterOtherSide = Lexer::findNextToken(Args[ArgIndex]->getEndLoc(), SM, LangOpts) ->getLocation(); diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index e1d13876d64a9..d36b187b1da14 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -32,11 +32,11 @@ class HeaderGuardPPCallbacks : public PPCallbacks { FileID PrevFID) override 
{ // Record all files we enter. We'll need them to diagnose headers without // guards. - SourceManager &SM = PP->getSourceManager(); + const SourceManager &SM = PP->getSourceManager(); if (Reason == EnterFile && FileType == SrcMgr::C_User) { if (OptionalFileEntryRef FE = SM.getFileEntryRefForID(SM.getFileID(Loc))) { - std::string FileName = cleanPath(FE->getName()); + const std::string FileName = cleanPath(FE->getName()); Files[FileName] = *FE; } } @@ -66,7 +66,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { void EndOfMainFile() override { // Now that we have all this information from the preprocessor, use it! - SourceManager &SM = PP->getSourceManager(); + const SourceManager &SM = PP->getSourceManager(); for (const auto &MacroEntry : Macros) { const MacroInfo *MI = MacroEntry.second; @@ -79,7 +79,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { OptionalFileEntryRef FE = SM.getFileEntryRefForID(SM.getFileID(MI->getDefinitionLoc())); - std::string FileName = cleanPath(FE->getName()); + const std::string FileName = cleanPath(FE->getName()); Files.erase(FileName); // See if we should check and fix this header guard. @@ -88,16 +88,16 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // Look up Locations for this guard. const auto &Locs = Ifndefs[MacroEntry.first.getIdentifierInfo()]; - SourceLocation Ifndef = Locs.second; - SourceLocation Define = MacroEntry.first.getLocation(); - SourceLocation EndIf = EndIfs[Locs.first]; + const SourceLocation Ifndef = Locs.second; + const SourceLocation Define = MacroEntry.first.getLocation(); + const SourceLocation EndIf = EndIfs[Locs.first]; // If the macro Name is not equal to what we can compute, correct it in // the #ifndef and #define. - StringRef CurHeaderGuard = + const StringRef CurHeaderGuard = MacroEntry.first.getIdentifierInfo()->getName(); std::vector FixIts; - std::string NewGuard = checkHeaderGuardDefinition( + const std::string NewGuard = checkHeaderGuardDefinition( Ifndef, Define, EndIf, FileName, CurHeaderGuard, FixIts); // Now look at the #endif. We want a comment with the header guard. Fix it @@ -129,7 +129,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { if (!EndIf.isValid()) return false; const char *EndIfData = PP->getSourceManager().getCharacterData(EndIf); - size_t EndIfLen = std::strcspn(EndIfData, "\r\n"); + const size_t EndIfLen = std::strcspn(EndIfData, "\r\n"); if (EndIfLenPtr) *EndIfLenPtr = EndIfLen; @@ -137,12 +137,12 @@ class HeaderGuardPPCallbacks : public PPCallbacks { EndIfStr = EndIfStr.substr(EndIfStr.find_first_not_of("#endif \t")); // Give up if there's an escaped newline. - size_t FindEscapedNewline = EndIfStr.find_last_not_of(' '); + const size_t FindEscapedNewline = EndIfStr.find_last_not_of(' '); if (FindEscapedNewline != StringRef::npos && EndIfStr[FindEscapedNewline] == '\\') return false; - bool IsLineComment = + const bool IsLineComment = EndIfStr.consume_front("//") || (EndIfStr.consume_front("/*") && EndIfStr.consume_back("*/")); if (!IsLineComment) @@ -162,7 +162,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { std::vector &FixIts) { std::string CPPVar = Check->getHeaderGuard(FileName, CurHeaderGuard); CPPVar = Check->sanitizeHeaderGuard(CPPVar); - std::string CPPVarUnder = CPPVar + '_'; + const std::string CPPVarUnder = CPPVar + '_'; // Allow a trailing underscore if and only if we don't have to change the // endif comment too. @@ -203,19 +203,20 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // fix-its to add the guard. 
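// Illustrative guard computation, not part of the patch: a header at
//   clang-tools-extra/clang-tidy/utils/ASTUtils.h
// gets the guard LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_ASTUTILS_H, the
// same convention the guard renames elsewhere in this patch normalize to.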
// TODO: Insert the guard after top comments. for (const auto &FE : Files) { - StringRef FileName = FE.getKey(); + const StringRef FileName = FE.getKey(); if (!Check->shouldSuggestToAddHeaderGuard(FileName)) continue; - SourceManager &SM = PP->getSourceManager(); - FileID FID = SM.translateFile(FE.getValue()); - SourceLocation StartLoc = SM.getLocForStartOfFile(FID); + const SourceManager &SM = PP->getSourceManager(); + const FileID FID = SM.translateFile(FE.getValue()); + const SourceLocation StartLoc = SM.getLocForStartOfFile(FID); if (StartLoc.isInvalid()) continue; std::string CPPVar = Check->getHeaderGuard(FileName); CPPVar = Check->sanitizeHeaderGuard(CPPVar); - std::string CPPVarUnder = CPPVar + '_'; // Allow a trailing underscore. + const std::string CPPVarUnder = + CPPVar + '_'; // Allow a trailing underscore. // If there's a macro with a name that follows the header guard convention // but was not recognized by the preprocessor as a header guard there must // be code outside of the guarded area. Emit a plain warning without @@ -223,8 +224,8 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // FIXME: Can we move it into the right spot? bool SeenMacro = false; for (const auto &MacroEntry : Macros) { - StringRef Name = MacroEntry.first.getIdentifierInfo()->getName(); - SourceLocation DefineLoc = MacroEntry.first.getLocation(); + const StringRef Name = MacroEntry.first.getIdentifierInfo()->getName(); + const SourceLocation DefineLoc = MacroEntry.first.getLocation(); if ((Name == CPPVar || Name == CPPVarUnder) && SM.isWrittenInSameFile(StartLoc, DefineLoc)) { Check->diag(DefineLoc, "code/includes outside of area guarded by " diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index 0b67cba6ffb0a..81bb8dec57a74 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -69,7 +69,7 @@ IncludeSorter &IncludeInserter::getOrCreate(FileID FileID) { std::optional<FixItHint> IncludeInserter::createIncludeInsertion(FileID FileID, llvm::StringRef Header) { - bool IsAngled = Header.consume_front("<"); + const bool IsAngled = Header.consume_front("<"); if (IsAngled != Header.consume_back(">")) return std::nullopt; // We assume the same Header will never be included both angled and not @@ -94,7 +94,7 @@ void IncludeInserter::addInclude(StringRef FileName, bool IsAngled, SourceLocation EndLocation) { assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " "registerPreprocessor()?"); - FileID FileID = SourceMgr->getFileID(HashLocation); + const FileID FileID = SourceMgr->getFileID(HashLocation); getOrCreate(FileID).addInclude(FileName, IsAngled, HashLocation, EndLocation); } diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h index f6ca7d63632de..9dbf2a76369cd 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H #include "IncludeSorter.h" #include "clang/Basic/Diagnostic.h" @@ -100,4 +100,4 @@ class IncludeInserter { } // namespace tidy::utils } // 
namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDEINSERTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDEINSERTER_H diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp index 7e2aad9a97c8e..f113f8cabea87 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp @@ -17,7 +17,7 @@ namespace utils { static StringRef removeFirstSuffix(StringRef Str, ArrayRef<StringRef> Suffixes) { - for (StringRef Suffix : Suffixes) { + for (const StringRef Suffix : Suffixes) { if (Str.consume_back(Suffix)) return Str; } @@ -37,7 +37,7 @@ static StringRef makeCanonicalName(StringRef Str, removeFirstSuffix(Str, {".cc", ".cpp", ".c", ".h", ".hpp"}), {"Test"}); } if (Style == IncludeSorter::IS_Google_ObjC) { - StringRef Canonical = + const StringRef Canonical = removeFirstSuffix(removeFirstSuffix(Str, {".cc", ".cpp", ".c", ".h", ".hpp", ".mm", ".m"}), {"_unittest", "_regtest", "_test", "Test"}); @@ -57,7 +57,7 @@ static StringRef makeCanonicalName(StringRef Str, // Scan to the end of the line and return the offset of the next line. static size_t findNextLine(const char *Text) { - size_t EOLIndex = std::strcspn(Text, "\n"); + const size_t EOLIndex = std::strcspn(Text, "\n"); return Text[EOLIndex] == '\0' ? EOLIndex : EOLIndex + 1; } @@ -74,14 +74,15 @@ determineIncludeKind(StringRef CanonicalFile, StringRef IncludeFile, return IncludeFile.ends_with(".h") ? IncludeSorter::IK_CSystemInclude : IncludeSorter::IK_CXXSystemInclude; } - StringRef CanonicalInclude = makeCanonicalName(IncludeFile, Style); + const StringRef CanonicalInclude = makeCanonicalName(IncludeFile, Style); if (CanonicalFile.ends_with(CanonicalInclude) || CanonicalInclude.ends_with(CanonicalFile)) { return IncludeSorter::IK_MainTUInclude; } if ((Style == IncludeSorter::IS_Google) || (Style == IncludeSorter::IS_Google_ObjC)) { - std::pair<StringRef, StringRef> Parts = CanonicalInclude.split("/public/"); + const std::pair<StringRef, StringRef> Parts = + CanonicalInclude.split("/public/"); StringRef FileCopy = CanonicalFile; if (FileCopy.consume_front(Parts.first) && FileCopy.consume_back(Parts.second)) { @@ -126,7 +127,7 @@ IncludeSorter::IncludeSorter(const SourceManager *SourceMgr, FileID FileID, void IncludeSorter::addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation) { - int Offset = findNextLine(SourceMgr->getCharacterData(EndLocation)); + const int Offset = findNextLine(SourceMgr->getCharacterData(EndLocation)); // Record the relevant location information for this inclusion directive. auto &IncludeLocation = IncludeLocations[FileName]; @@ -139,7 +140,7 @@ void IncludeSorter::addInclude(StringRef FileName, bool IsAngled, return; // Add the included file's name to the appropriate bucket. - IncludeKinds Kind = + const IncludeKinds Kind = determineIncludeKind(CanonicalFile, FileName, IsAngled, Style); if (Kind != IK_InvalidInclude) IncludeBucket[Kind].push_back(FileName.str()); @@ -181,7 +182,8 @@ IncludeSorter::createIncludeInsertion(StringRef FileName, bool IsAngled) { // FileName comes after all include entries in bucket, insert it after // last. 
const std::string &LastInclude = IncludeBucket[IncludeKind].back(); - SourceRange LastIncludeLocation = IncludeLocations[LastInclude].back(); + const SourceRange LastIncludeLocation = + IncludeLocations[LastInclude].back(); return FixItHint::CreateInsertion(LastIncludeLocation.getEnd(), IncludeStmt); } @@ -205,14 +207,16 @@ IncludeSorter::createIncludeInsertion(StringRef FileName, bool IsAngled) { if (NonEmptyKind < IncludeKind) { // Create a block after. const std::string &LastInclude = IncludeBucket[NonEmptyKind].back(); - SourceRange LastIncludeLocation = IncludeLocations[LastInclude].back(); + const SourceRange LastIncludeLocation = + IncludeLocations[LastInclude].back(); IncludeStmt = '\n' + IncludeStmt; return FixItHint::CreateInsertion(LastIncludeLocation.getEnd(), IncludeStmt); } // Create a block before. const std::string &FirstInclude = IncludeBucket[NonEmptyKind][0]; - SourceRange FirstIncludeLocation = IncludeLocations[FirstInclude].back(); + const SourceRange FirstIncludeLocation = + IncludeLocations[FirstInclude].back(); IncludeStmt.append("\n"); return FixItHint::CreateInsertion(FirstIncludeLocation.getBegin(), IncludeStmt); diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h index 66830ee7f1ef3..6efec976847b5 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H #include "../ClangTidyCheck.h" #include @@ -73,4 +73,4 @@ template <> struct OptionEnumMapping { getEnumMapping(); }; } // namespace clang::tidy -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_INCLUDESORTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_INCLUDESORTER_H diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp index 7222f64804f63..0f20fdec6c29b 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp @@ -42,7 +42,7 @@ SourceLocation findPreviousTokenStart(SourceLocation Start, if (Start.isInvalid() || Start.isMacroID()) return {}; - SourceLocation BeforeStart = Start.getLocWithOffset(-1); + const SourceLocation BeforeStart = Start.getLocWithOffset(-1); if (BeforeStart.isInvalid() || BeforeStart.isMacroID()) return {}; @@ -57,7 +57,7 @@ SourceLocation findPreviousTokenKind(SourceLocation Start, return {}; while (true) { - SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); + const SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); if (L.isInvalid() || L.isMacroID()) return {}; @@ -123,8 +123,9 @@ std::optional getQualifyingToken(tok::TokenKind TK, assert((TK == tok::kw_const || TK == tok::kw_volatile || TK == tok::kw_restrict) && "TK is not a qualifier keyword"); - std::pair LocInfo = SM.getDecomposedLoc(Range.getBegin()); - StringRef File = SM.getBufferData(LocInfo.first); + const std::pair LocInfo = + SM.getDecomposedLoc(Range.getBegin()); + const StringRef File = SM.getBufferData(LocInfo.first); Lexer RawLexer(SM.getLocForStartOfFile(LocInfo.first), Context.getLangOpts(), File.begin(), File.data() + LocInfo.second, File.end()); std::optional LastMatchBeforeTemplate; diff --git 
a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h index b76a37874b514..c5fb646c0efd9 100644 --- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h +++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H #include "clang/AST/ASTContext.h" #include "clang/Basic/TokenKinds.h" @@ -48,7 +48,7 @@ SourceLocation findPreviousAnyTokenKind(SourceLocation Start, if (Start.isInvalid() || Start.isMacroID()) return {}; while (true) { - SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); + const SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts); if (L.isInvalid() || L.isMacroID()) return {}; @@ -76,7 +76,7 @@ SourceLocation findNextAnyTokenKind(SourceLocation Start, if (!CurrentToken) return {}; - Token PotentialMatch = *CurrentToken; + const Token PotentialMatch = *CurrentToken; if (PotentialMatch.isOneOf(TK, TKs...)) return PotentialMatch.getLocation(); @@ -130,4 +130,4 @@ SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl, } // namespace tidy::utils::lexer } // namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXER_UTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_LEXERUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/Matchers.h b/clang-tools-extra/clang-tidy/utils/Matchers.h index 4eac0655e3922..a444ab623a56a 100644 --- a/clang-tools-extra/clang-tidy/utils/Matchers.h +++ b/clang-tools-extra/clang-tidy/utils/Matchers.h @@ -162,7 +162,7 @@ struct NotIdenticalStatementsPredicate { // Checks if statement is identical (utils::areStatementsIdentical) to one bound // to ID node. 
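// Example use (illustrative): binaryOperator(hasLHS(expr().bind("lhs")), // hasRHS(expr(isStatementIdenticalToBoundNode("lhs")))) would match binary // operators whose right-hand side repeats the left-hand side.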
AST_MATCHER_P(Stmt, isStatementIdenticalToBoundNode, std::string, ID) { - NotIdenticalStatementsPredicate Predicate{ + const NotIdenticalStatementsPredicate Predicate{ ID, ::clang::DynTypedNode::create(Node), &(Finder->getASTContext())}; return Builder->removeBindings(Predicate); } diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp index 3af7f8dcf2ee5..c862364c886dd 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp @@ -55,7 +55,7 @@ NamespaceAliaser::createAlias(ASTContext &Context, const Stmt &Statement, } for (const auto &Abbreviation : Abbreviations) { - DeclarationMatcher ConflictMatcher = namedDecl(hasName(Abbreviation)); + const DeclarationMatcher ConflictMatcher = namedDecl(hasName(Abbreviation)); const auto HasConflictingChildren = !match(findAll(ConflictMatcher), *Function, Context).empty(); const auto HasConflictingAncestors = @@ -65,10 +65,10 @@ NamespaceAliaser::createAlias(ASTContext &Context, const Stmt &Statement, if (HasConflictingAncestors || HasConflictingChildren) continue; - std::string Declaration = + const std::string Declaration = (llvm::Twine("\nnamespace ") + Abbreviation + " = " + Namespace + ";") .str(); - SourceLocation Loc = + const SourceLocation Loc = Lexer::getLocForEndOfToken(Function->getBody()->getBeginLoc(), 0, SourceMgr, Context.getLangOpts()); AddedAliases[Function][Namespace.str()] = Abbreviation; diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h index 497b67e82a900..fbf0ade3b0cc7 100644 --- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h +++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H #include "clang/AST/ASTContext.h" #include "clang/AST/Stmt.h" @@ -45,4 +45,4 @@ class NamespaceAliaser { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_NAMESPACEALIASER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_NAMESPACEALIASER_H diff --git a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h index aec24ab0a84b3..3a123484fae61 100644 --- a/clang-tools-extra/clang-tidy/utils/OptionsUtils.h +++ b/clang-tools-extra/clang-tidy/utils/OptionsUtils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H #include "clang/Basic/LLVM.h" #include @@ -26,4 +26,4 @@ std::string serializeStringList(ArrayRef Strings); } // namespace clang::tidy::utils::options -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OPTIONUTILS_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_OPTIONSUTILS_H diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 6bd6d981858cb..bc6de97b8b74e 100644 --- 
a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -103,7 +103,7 @@ static const CXXMethodDecl *getOverrideMethod(const CXXMethodDecl *Method) { while (true) { Method = *Method->begin_overridden_methods(); assert(Method && "Overridden method shouldn't be null"); - unsigned NumOverrides = Method->size_overridden_methods(); + const unsigned NumOverrides = Method->size_overridden_methods(); if (NumOverrides == 0) return Method; if (NumOverrides > 1) @@ -148,7 +148,7 @@ static NameLookup findDeclInBases(const CXXRecordDecl &Parent, return NameLookup(InClassRef); const NamedDecl *Found = nullptr; - for (CXXBaseSpecifier Base : Parent.bases()) { + for (const CXXBaseSpecifier Base : Parent.bases()) { const auto *Record = Base.getType()->getAsCXXRecordDecl(); if (!Record && AggressiveTemplateLookup) { if (const auto *TST = @@ -269,7 +269,7 @@ class RenamerClangTidyVisitor } bool VisitNamedDecl(NamedDecl *Decl) { - SourceRange UsageRange = + const SourceRange UsageRange = DeclarationNameInfo(Decl->getDeclName(), Decl->getLocation()) .getSourceRange(); Check->addUsage(Decl, UsageRange, SM); @@ -277,13 +277,13 @@ class RenamerClangTidyVisitor } bool VisitDeclRefExpr(DeclRefExpr *DeclRef) { - SourceRange Range = DeclRef->getNameInfo().getSourceRange(); + const SourceRange Range = DeclRef->getNameInfo().getSourceRange(); Check->addUsage(DeclRef->getDecl(), Range, SM); return true; } bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc Loc) { - if (NestedNameSpecifier Spec = Loc.getNestedNameSpecifier(); + if (const NestedNameSpecifier Spec = Loc.getNestedNameSpecifier(); Spec.getKind() == NestedNameSpecifier::Kind::Namespace) { if (const auto *Decl = dyn_cast(Spec.getAsNamespaceAndPrefix().Namespace)) @@ -295,27 +295,28 @@ class RenamerClangTidyVisitor } bool VisitMemberExpr(MemberExpr *MemberRef) { - SourceRange Range = MemberRef->getMemberNameInfo().getSourceRange(); + const SourceRange Range = MemberRef->getMemberNameInfo().getSourceRange(); Check->addUsage(MemberRef->getMemberDecl(), Range, SM); return true; } bool VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *DepMemberRef) { - QualType BaseType = DepMemberRef->isArrow() - ? DepMemberRef->getBaseType()->getPointeeType() - : DepMemberRef->getBaseType(); + const QualType BaseType = + DepMemberRef->isArrow() ? 
DepMemberRef->getBaseType()->getPointeeType() + : DepMemberRef->getBaseType(); if (BaseType.isNull()) return true; const CXXRecordDecl *Base = BaseType.getTypePtr()->getAsCXXRecordDecl(); if (!Base) return true; - DeclarationName DeclName = DepMemberRef->getMemberNameInfo().getName(); + const DeclarationName DeclName = + DepMemberRef->getMemberNameInfo().getName(); if (!DeclName.isIdentifier()) return true; - StringRef DependentName = DeclName.getAsIdentifierInfo()->getName(); + const StringRef DependentName = DeclName.getAsIdentifierInfo()->getName(); - if (NameLookup Resolved = findDeclInBases( + if (const NameLookup Resolved = findDeclInBases( *Base, DependentName, AggressiveDependentMemberLookup)) { if (*Resolved) Check->addUsage(*Resolved, @@ -370,7 +371,7 @@ class RenamerClangTidyVisitor const IdentifierInfo *II = FD->getIdentifier(); if (!II) continue; - SourceRange FixLocation{D.getFieldLoc(), D.getFieldLoc()}; + const SourceRange FixLocation{D.getFieldLoc(), D.getFieldLoc()}; Check->addUsage(FD, FixLocation, SM); } @@ -473,7 +474,8 @@ void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, if (!MaybeFailure) return; - NamingCheckId FailureId(FailureDecl->getLocation(), FailureDecl->getName()); + const NamingCheckId FailureId(FailureDecl->getLocation(), + FailureDecl->getName()); auto [FailureIter, NewFailure] = addUsage(FailureId, UsageRange, SourceMgr); @@ -527,10 +529,10 @@ void RenamerClangTidyCheck::checkMacro(const Token &MacroNameTok, if (!MaybeFailure) return; FailureInfo &Info = *MaybeFailure; - StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); - NamingCheckId ID(MI->getDefinitionLoc(), Name); + const StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); + const NamingCheckId ID(MI->getDefinitionLoc(), Name); NamingCheckFailure &Failure = NamingCheckFailures[ID]; - SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); + const SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); if (!isValidAsciiIdentifier(Info.Fixup)) Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier; @@ -542,14 +544,14 @@ void RenamerClangTidyCheck::checkMacro(const Token &MacroNameTok, void RenamerClangTidyCheck::expandMacro(const Token &MacroNameTok, const MacroInfo *MI, const SourceManager &SourceMgr) { - StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); - NamingCheckId ID(MI->getDefinitionLoc(), Name); + const StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); + const NamingCheckId ID(MI->getDefinitionLoc(), Name); auto Failure = NamingCheckFailures.find(ID); if (Failure == NamingCheckFailures.end()) return; - SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); + const SourceRange Range(MacroNameTok.getLocation(), MacroNameTok.getEndLoc()); addUsage(ID, Range, SourceMgr); } diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h index b38bc082644cb..fe1059d5e5b81 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H #include "../ClangTidyCheck.h" #include 
"llvm/ADT/DenseMap.h" @@ -167,4 +167,4 @@ class RenamerClangTidyCheck : public ClangTidyCheck { } // namespace tidy } // namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_RENAMERCLANGTIDYCHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_RENAMERCLANGTIDYCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp index 87602d1187d59..b58e716f7103f 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp @@ -66,7 +66,7 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(StringRef Name, // we would be accessing `getLangOpts` and `Options` before the underlying // `ClangTidyCheck` instance was properly initialized. TransformerClangTidyCheck::TransformerClangTidyCheck( - std::function>( + llvm::function_ref>( const LangOptions &, const OptionsView &)> MakeRule, StringRef Name, ClangTidyContext *Context) @@ -105,7 +105,7 @@ void TransformerClangTidyCheck::check( if (Result.Context->getDiagnostics().hasErrorOccurred()) return; - size_t I = transformer::detail::findSelectedCase(Result, Rule); + const size_t I = transformer::detail::findSelectedCase(Result, Rule); Expected> Edits = Rule.Cases[I].Edits(Result); if (!Edits) { @@ -127,7 +127,7 @@ void TransformerClangTidyCheck::check( // Associate the diagnostic with the location of the first change. { - DiagnosticBuilder Diag = + const DiagnosticBuilder Diag = diag((*Edits)[0].Range.getBegin(), escapeForDiagnostic(*Explanation)); for (const auto &T : *Edits) { switch (T.Kind) { diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h index ad20fbd475759..da8606f993b6e 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H #include "../ClangTidyCheck.h" #include "IncludeInserter.h" @@ -48,8 +48,9 @@ class TransformerClangTidyCheck : public ClangTidyCheck { /// /// See \c setRule for constraints on the rule. 
TransformerClangTidyCheck( - std::function>( - const LangOptions &, const OptionsView &)> + llvm::function_ref< + std::optional>( + const LangOptions &, const OptionsView &)> MakeRule, StringRef Name, ClangTidyContext *Context); @@ -83,4 +84,4 @@ class TransformerClangTidyCheck : public ClangTidyCheck { } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_TRANSFORMER_CLANG_TIDY_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_TRANSFORMERCLANGTIDYCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp index d4e079f1cf4c2..98a5d40d49313 100644 --- a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp +++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp @@ -111,7 +111,7 @@ bool isTriviallyDefaultConstructible(QualType Type, const ASTContext &Context) { } } - QualType CanonicalType = Type.getCanonicalType(); + const QualType CanonicalType = Type.getCanonicalType(); if (CanonicalType->isDependentType()) return false; diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp index cb1495163a2f9..09adbf1155e62 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp @@ -55,7 +55,7 @@ AST_MATCHER(Expr, hasSideEffects) { } // namespace static auto -makeExprMatcher(ast_matchers::internal::Matcher ArgumentMatcher, +makeExprMatcher(const ast_matchers::internal::Matcher &ArgumentMatcher, ArrayRef MethodNames, ArrayRef FreeNames) { return expr( @@ -73,7 +73,7 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes, const std::optional &ReverseDescriptor) { std::string ArgBound = (ArgName + llvm::Twine(Indexes.BeginArg)).str(); - SmallString<64> ID = {BoundCall, State}; + const SmallString<64> ID = {BoundCall, State}; ast_matchers::internal::Matcher ArgumentMatcher = allOf( hasArgument(Indexes.BeginArg, makeExprMatcher(expr(unless(hasSideEffects())).bind(ArgBound), @@ -84,9 +84,9 @@ makeMatcherPair(StringRef State, const UseRangesCheck::Indexes &Indexes, {"end", "cend"}, EndFreeNames))); if (ReverseDescriptor) { ArgBound.push_back('R'); - SmallVector RBegin{ + const SmallVector RBegin{ llvm::make_first_range(ReverseDescriptor->FreeReverseNames)}; - SmallVector REnd{ + const SmallVector REnd{ llvm::make_second_range(ReverseDescriptor->FreeReverseNames)}; ArgumentMatcher = anyOf( ArgumentMatcher, @@ -110,9 +110,9 @@ void UseRangesCheck::registerMatchers(MatchFinder *Finder) { auto Replaces = getReplacerMap(); ReverseDescriptor = getReverseDescriptor(); auto BeginEndNames = getFreeBeginEndMethods(); - llvm::SmallVector BeginNames{ + const llvm::SmallVector BeginNames{ llvm::make_first_range(BeginEndNames)}; - llvm::SmallVector EndNames{ + const llvm::SmallVector EndNames{ llvm::make_second_range(BeginEndNames)}; Replacers.clear(); llvm::DenseSet SeenRepl; @@ -169,7 +169,7 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call, llvm::SmallBitVector Commas(Call.getNumArgs()); // The first comma is actually the '(' which we can't remove Commas[0] = true; - for (unsigned Index : Sorted) { + for (const unsigned Index : Sorted) { const Expr *Arg = Call.getArg(Index); if (Commas[Index]) { if (Index >= Commas.size()) { @@ -192,7 +192,7 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call, } void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { - Replacer *Replacer = nullptr; + const Replacer 
*Replacer = nullptr; const FunctionDecl *Function = nullptr; for (const auto &[Node, Value] : Result.Nodes.getMap()) { StringRef NodeStr(Node); @@ -254,7 +254,7 @@ void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { Diag << Inserter.createIncludeInsertion( Result.SourceManager->getFileID(Call->getBeginLoc()), *ReverseDescriptor->ReverseHeader); - StringRef ArgText = Lexer::getSourceText( + const StringRef ArgText = Lexer::getSourceText( CharSourceRange::getTokenRange(ArgExpr->getSourceRange()), Result.Context->getSourceManager(), Result.Context->getLangOpts()); SmallString<128> ReplaceText; diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp index e4c71aa60a7a2..6a591c1a84a47 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.cpp @@ -19,7 +19,7 @@ namespace clang::tidy::utils { using namespace ast_matchers; static StringRef getUnqualifiedName(StringRef QualifiedName) { - size_t LastSeparatorPos = QualifiedName.rfind("::"); + const size_t LastSeparatorPos = QualifiedName.rfind("::"); if (LastSeparatorPos == StringRef::npos) return QualifiedName; return QualifiedName.drop_front(LastSeparatorPos + 2); @@ -30,7 +30,7 @@ UsingInserter::UsingInserter(const SourceManager &SourceMgr) std::optional UsingInserter::createUsingDeclaration( ASTContext &Context, const Stmt &Statement, StringRef QualifiedName) { - StringRef UnqualifiedName = getUnqualifiedName(QualifiedName); + const StringRef UnqualifiedName = getUnqualifiedName(QualifiedName); const FunctionDecl *Function = getSurroundingFunction(Context, Statement); if (!Function) return std::nullopt; @@ -38,7 +38,7 @@ std::optional UsingInserter::createUsingDeclaration( if (AddedUsing.count(std::make_pair(Function, QualifiedName.str())) != 0) return std::nullopt; - SourceLocation InsertLoc = Lexer::getLocForEndOfToken( + const SourceLocation InsertLoc = Lexer::getLocForEndOfToken( Function->getBody()->getBeginLoc(), 0, SourceMgr, Context.getLangOpts()); // Only use using declarations in the main file, not in includes. @@ -47,7 +47,7 @@ std::optional UsingInserter::createUsingDeclaration( // FIXME: This declaration could be masked. Investigate if // there is a way to avoid using Sema. - bool AlreadyHasUsingDecl = + const bool AlreadyHasUsingDecl = !match(stmt(hasAncestor(decl(has(usingDecl(hasAnyUsingShadowDecl( hasTargetDecl(hasName(QualifiedName.str())))))))), Statement, Context) @@ -58,15 +58,15 @@ std::optional UsingInserter::createUsingDeclaration( } // Find conflicting declarations and references. 
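// (Illustrative: for QualifiedName "llvm::Twine", any declaration of, or // reference to, a plain "Twine" inside the function would make the new // using-declaration unsafe, so we give up below.)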
auto ConflictingDecl = namedDecl(hasName(UnqualifiedName)); - bool HasConflictingDeclaration = + const bool HasConflictingDeclaration = !match(findAll(ConflictingDecl), *Function, Context).empty(); - bool HasConflictingDeclRef = + const bool HasConflictingDeclRef = !match(findAll(declRefExpr(to(ConflictingDecl))), *Function, Context) .empty(); if (HasConflictingDeclaration || HasConflictingDeclRef) return std::nullopt; - std::string Declaration = + const std::string Declaration = (llvm::Twine("\nusing ") + QualifiedName + ";").str(); AddedUsing.emplace(Function, QualifiedName.str()); diff --git a/clang-tools-extra/clang-tidy/utils/UsingInserter.h b/clang-tools-extra/clang-tidy/utils/UsingInserter.h index 23c317581c191..3e943569047ae 100644 --- a/clang-tools-extra/clang-tidy/utils/UsingInserter.h +++ b/clang-tools-extra/clang-tidy/utils/UsingInserter.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H #include "clang/AST/Decl.h" #include "clang/AST/Stmt.h" @@ -43,4 +43,4 @@ class UsingInserter { }; } // namespace clang::tidy::utils -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_USINGINSERTER_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_USINGINSERTER_H diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index fb3f05329be21..d7ec853af862f 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -165,6 +165,7 @@ clang_target_link_libraries(clangDaemon clangBasic clangDependencyScanning clangDriver + clangOptions clangFormat clangFrontend clangIndex diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 0f765e96fb152..f8e6da73bbb1f 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -456,7 +456,6 @@ class ClangdLSPServer::MessageHandler : public Transport::MessageHandler { ClangdLSPServer &Server; }; -constexpr int ClangdLSPServer::MessageHandler::MaxReplayCallbacks; // call(), notify(), and reply() wrap the Transport, adding logging and locking. void ClangdLSPServer::callMethod(StringRef Method, llvm::json::Value Params, diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index c1be93730129a..7990f2719e9a0 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -11,8 +11,8 @@ #include "support/Logger.h" #include "support/Trace.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" @@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (Cmd.empty()) return; - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); // OriginalArgs needs to outlive ArgList. 
llvm::SmallVector OriginalArgs; OriginalArgs.reserve(Cmd.size()); @@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, llvm::opt::InputArgList ArgList; ArgList = OptTable.ParseArgs( llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount, - llvm::opt::Visibility(IsCLMode ? driver::options::CLOption - : driver::options::ClangOption)); + llvm::opt::Visibility(IsCLMode ? options::CLOption + : options::ClangOption)); llvm::SmallVector IndicesToDrop; // Having multiple architecture options (e.g. when building fat binaries) @@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // As there are no signals to figure out which one user actually wants. They // can explicitly specify one through `CompileFlags.Add` if need be. unsigned ArchOptCount = 0; - for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) { + for (auto *Input : ArgList.filtered(options::OPT_arch)) { ++ArchOptCount; for (auto I = 0U; I <= Input->getNumValues(); ++I) IndicesToDrop.push_back(Input->getIndex() + I); @@ -262,13 +262,12 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // explicitly at the end of the flags. This ensures modifications done in the // following steps apply in more cases (like setting -x, which only affects // inputs that come after it). - for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) { + for (auto *Input : ArgList.filtered(options::OPT_INPUT)) { SawInput(Input->getValue(0)); IndicesToDrop.push_back(Input->getIndex()); } // Anything after `--` is also treated as input, drop them as well. - if (auto *DashDash = - ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { + if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] // Another +1 so we don't treat the `--` itself as an input. for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) @@ -424,11 +423,11 @@ DriverMode getDriverMode(const std::vector &Args) { // Returns the set of DriverModes where an option may be used. unsigned char getModes(const llvm::opt::Option &Opt) { unsigned char Result = DM_None; - if (Opt.hasVisibilityFlag(driver::options::ClangOption)) + if (Opt.hasVisibilityFlag(options::ClangOption)) Result |= DM_GCC; - if (Opt.hasVisibilityFlag(driver::options::CC1Option)) + if (Opt.hasVisibilityFlag(options::CC1Option)) Result |= DM_CC1; - if (Opt.hasVisibilityFlag(driver::options::CLOption)) + if (Opt.hasVisibilityFlag(options::CLOption)) Result |= DM_CL; return Result; } @@ -442,8 +441,8 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { using TableTy = llvm::StringMap, llvm::BumpPtrAllocator>; static TableTy *Table = [] { - auto &DriverTable = driver::getDriverOptTable(); - using DriverID = clang::driver::options::ID; + auto &DriverTable = getDriverOptTable(); + using DriverID = clang::options::ID; // Collect sets of aliases, so we can treat -foo and -foo= as synonyms. // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I]. 
@@ -468,7 +467,7 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS, \ METAVAR, VALUES, SUBCOMMANDIDS_OFFSET) \ {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS}, -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; for (auto &E : AliasTable) diff --git a/clang-tools-extra/clangd/FileDistance.cpp b/clang-tools-extra/clangd/FileDistance.cpp index 06c1a8bc92a86..d587c26a82145 100644 --- a/clang-tools-extra/clangd/FileDistance.cpp +++ b/clang-tools-extra/clangd/FileDistance.cpp @@ -54,7 +54,6 @@ static llvm::SmallString<128> canonicalize(llvm::StringRef Path) { return Result; } -constexpr const unsigned FileDistance::Unreachable; const llvm::hash_code FileDistance::RootHash = llvm::hash_value(llvm::StringRef("/")); diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp index de7280d80361d..cf5182bc1b2d7 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.cpp +++ b/clang-tools-extra/clangd/FuzzyMatch.cpp @@ -62,9 +62,6 @@ namespace clang { namespace clangd { -constexpr int FuzzyMatcher::MaxPat; -constexpr int FuzzyMatcher::MaxWord; - static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } // A "negative infinity" score that won't overflow. // We use this to mark unreachable states and forbidden solutions. diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp index 06165dfbbcdd2..faa00d20497fa 100644 --- a/clang-tools-extra/clangd/Selection.cpp +++ b/clang-tools-extra/clangd/Selection.cpp @@ -958,6 +958,18 @@ class SelectionVisitor : public RecursiveASTVisitor { claimRange(SourceRange(FTL.getLParenLoc(), FTL.getEndLoc()), Result); return; } + if (auto ATL = TL->getAs()) { + // For attributed function types like `int foo() [[attr]]`, the + // AttributedTypeLoc's range includes the function name. We want to + // allow the function name to be associated with the FunctionDecl + // rather than the AttributedTypeLoc, so we only claim the attribute + // range itself. + if (ATL.getModifiedLoc().getAs()) { + // Only claim the attribute's source range, not the whole type. 
+ claimRange(ATL.getLocalSourceRange(), Result); + return; + } + } } claimRange(getSourceRange(N), Result); } diff --git a/clang-tools-extra/clangd/index/SymbolLocation.cpp b/clang-tools-extra/clangd/index/SymbolLocation.cpp index 61da267b93ce5..058cb1e0945f2 100644 --- a/clang-tools-extra/clangd/index/SymbolLocation.cpp +++ b/clang-tools-extra/clangd/index/SymbolLocation.cpp @@ -11,9 +11,6 @@ namespace clang { namespace clangd { -constexpr uint32_t SymbolLocation::Position::MaxLine; -constexpr uint32_t SymbolLocation::Position::MaxColumn; - void SymbolLocation::Position::setLine(uint32_t L) { if (L > MaxLine) L = MaxLine; diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. diff --git a/clang-tools-extra/clangd/support/DirectiveTree.cpp b/clang-tools-extra/clangd/support/DirectiveTree.cpp index 97b0598e82c58..16d12f332a0be 100644 --- a/clang-tools-extra/clangd/support/DirectiveTree.cpp +++ b/clang-tools-extra/clangd/support/DirectiveTree.cpp @@ -305,8 +305,8 @@ class BranchChooser { if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end()) return std::nullopt; return llvm::StringSwitch>(Value.text()) - .Cases("true", "1", true) - .Cases("false", "0", false) + .Cases({"true", "1"}, true) + .Cases({"false", "0"}, false) .Default(std::nullopt); } diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp index 3df19d8fc174d..63c0403ab2e70 100644 --- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp @@ -311,6 +311,19 @@ TEST(SelectionTest, CommonAncestor) { {"[[void foo^()]];", "FunctionProtoTypeLoc"}, {"[[^void foo^()]];", "FunctionDecl"}, {"[[void ^foo()]];", "FunctionDecl"}, + // Tricky case: with function attributes, the AttributedTypeLoc's range + // includes the function name, but we want the name to be associated with + // the CXXMethodDecl. + {"struct X { [[const int* ^Get() const <:[clang::lifetimebound]:> " + "{return nullptr;}]]; };", + "CXXMethodDecl"}, + // When the cursor is on the attribute itself, we should select the + // AttributedTypeLoc. Note: Due to a bug or deliberate quirk in the AST + // modeling of AttributedTypeLoc, its range ends at the attribute name + // token, not including the closing brackets ":>:>". + {"struct X { const [[int* Foo() const <:<:clang::life^timebound]]:>:> " + "{return nullptr;}; };", + "AttributedTypeLoc"}, // Tricky case: two VarDecls share a specifier. 
{"[[int ^a]], b;", "VarDecl"}, {"[[int a, ^b]];", "VarDecl"}, diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6701bf25df166..f25c4cacdacb7 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -184,17 +184,17 @@ Improvements to clang-tidy New checks ^^^^^^^^^^ +- New :doc:`bugprone-derived-method-shadowing-base-method + ` check. + + Finds derived class methods that shadow a (non-virtual) base class method. + - New :doc:`bugprone-invalid-enum-default-initialization ` check. Detects default initialization (to 0) of variables with ``enum`` type where the enum has no enumerator with value of 0. -- New :doc:`bugprone-derived-method-shadowing-base-method - ` check. - - Finds derived class methods that shadow a (non-virtual) base class method. - - New :doc:`cppcoreguidelines-pro-bounds-avoid-unchecked-container-access ` check. @@ -244,6 +244,11 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp ` to + :doc:`bugprone-std-namespace-modification + ` + keeping initial check as an alias to the new one. + - Renamed :doc:`cert-env33-c ` to :doc:`bugprone-command-processor ` @@ -264,6 +269,40 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-err60-cpp ` to + :doc:`bugprone-exception-copy-constructor-throws + ` + +- Renamed :doc:`cert-flp30-c ` to + :doc:`bugprone-float-loop-counter + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-mem57-cpp ` to + :doc:`bugprone-default-operator-new-on-overaligned-type + ` + keeping initial check as an alias to the new one. 
+ +- Renamed :doc:`cert-msc32-c ` to + :doc:`bugprone-random-generator-seed + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-msc51-cpp ` to + :doc:`bugprone-random-generator-seed + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-oop57-cpp ` to + :doc:`bugprone-raw-memory-call-on-non-trivial-type + ` + keeping initial check as an alias to the new one. + +- Renamed :doc:`cert-oop58-cpp ` to + :doc:`bugprone-copy-constructor-mutates-argument + ` + keeping initial check as an alias to the new one. + Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -275,7 +314,11 @@ Changes in existing checks - Improved :doc:`bugprone-exception-escape ` check's handling of lambdas: exceptions from captures are now diagnosed, exceptions in the bodies of - lambdas that aren't actually invoked are not. + lambdas that aren't actually invoked are not. Additionally, fixed an issue + where the check wouldn't diagnose throws in arguments to functions or + constructors. Added fine-grained configuration via the options + `CheckDestructors`, `CheckMoveMemberFunctions`, `CheckMain`, + `CheckedSwapFunctions`, and `CheckNothrowFunctions`. - Improved :doc:`bugprone-infinite-loop ` check by adding detection for @@ -324,7 +367,11 @@ Changes in existing checks - Improved :doc:`bugprone-unchecked-optional-access ` check by supporting ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to - prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type. + prevent false positives for the ``BloombergLP::bdlb::NullableValue`` type, + and by adding the `IgnoreValueCalls` option to suppress diagnostics for + ``optional::value()`` and the `IgnoreSmartPointerDereference` option to + ignore optionals reached via smart-pointer-like dereference, while still + diagnosing UB-prone dereferences via ``operator*`` and ``operator->``. - Improved :doc:`bugprone-unhandled-self-assignment ` check by adding @@ -355,6 +402,10 @@ Changes in existing checks adding an option to allow pointer arithmetic via prefix/postfix increment or decrement operators. +- Improved :doc:`google-readability-casting + ` check by adding fix-it + notes for downcasts. + - Improved :doc:`llvm-prefer-isa-or-dyn-cast-in-conditionals ` check: @@ -367,9 +418,15 @@ Changes in existing checks - Improved :doc:`misc-const-correctness ` check to avoid false - positives when pointers is transferred to non-const references + positives when a pointer is transferred to a non-const reference, avoid false positives on function pointers, and fix false - positives on return of non-const pointer. + positives on the return of a non-const pointer and false positives on + the pointer-to-member operator. + +- Improved :doc:`misc-coroutine-hostile-raii + ` check by adding the option + `AllowedCallees`, which allows exempting safely awaitable callees from the + check. - Improved :doc:`misc-header-include-cycle ` check performance. @@ -434,6 +491,10 @@ Changes in existing checks comparisons to ``npos``. Internal changes may cause new rare false positives in non-standard containers. +- Improved :doc:`readability-container-data-pointer + ` check by correctly + adding parentheses when the container expression is a dereference. + - Improved :doc:`readability-container-size-empty ` check by correctly generating fix-it hints when size method is called from implicit ``this``, @@ -459,6 +520,10 @@ Changes in existing checks ` check to recognize literal suffixes added in C++23 and C23. 
+- Improved :doc:`readability-use-concise-preprocessor-directives + ` check to + generate correct fix-its for forms without a space after the directive. + Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst index 01b0024684919..6cf54347ad613 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/boost/use-ranges.rst @@ -146,7 +146,7 @@ If calls are made using reverse iterators on containers, The code will be fixed using the ``boost::adaptors::reverse`` adaptor. .. code-block:: c++ - + auto AreSame = std::equal(Items1.rbegin(), Items1.rend(), std::crbegin(Items2), std::crend(Items2)); @@ -166,7 +166,7 @@ Options is `llvm`. .. option:: IncludeBoostSystem - + If `true` (default value) the boost headers are included as system headers with angle brackets (`#include `), otherwise quotes are used (`#include "boost.hpp"`). diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst index 9fa37c0593815..691b6e4db096b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst @@ -8,7 +8,7 @@ Such assignments are bug-prone because they may have been intended as equality t This check finds all assignments within `if` conditions, including ones that are not flagged by `-Wparentheses` due to an extra set of parentheses, and including assignments that call -an overloaded `operator=()`. The identified assignments violate +an overloaded `operator=()`. The identified assignments violate `BARR group "Rule 8.2.c" `_. .. code-block:: c++ diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst index dfc2ca1bbc7dd..1017462b8806b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst @@ -30,7 +30,7 @@ Possible fixes: - marking copy and move constructors and assignment operators deleted. - using class member method instead of class member variable with function object types. - - passing ``this`` pointer as parameter + - passing ``this`` pointer as parameter. Options ------- diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst new file mode 100644 index 0000000000000..28e5015beeaad --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst @@ -0,0 +1,11 @@ +.. title:: clang-tidy - bugprone-copy-constructor-mutates-argument + +bugprone-copy-constructor-mutates-argument +========================================== + +Finds assignments to the copied object and its direct or indirect members +in copy constructors and copy assignment operators. + +This check corresponds to the CERT C Coding Standard rule +`OOP58-CPP. Copy operations must not mutate the source object +`_. 
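+ +A minimal sketch (the ``S`` class is illustrative, not taken from the CERT +rule text) of the kind of copy constructor this check flags: + +.. code-block:: c++ + + class S { + int M; + public: + // Move-like "copy" constructor: steals from its source. + S(S &Other) : M(Other.M) { + Other.M = 0; // warning: assignment mutates the copied-from object + } + };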
\ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst index f24abfd1b5f5f..53082f44638b6 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/crtp-constructor-accessibility.rst @@ -6,7 +6,7 @@ bugprone-crtp-constructor-accessibility Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP can be constructed outside itself and the derived class. -The CRTP is an idiom, in which a class derives from a template class, where +The CRTP is an idiom, in which a class derives from a template class, where itself is the template argument. It should be ensured that if a class is intended to be a base class in this idiom, it can only be instantiated if the derived class is its template argument. @@ -23,7 +23,7 @@ Example: class Derived : CRTP {}; -Below can be seen some common mistakes that will allow the breaking of the +Below can be seen some common mistakes that will allow the breaking of the idiom. If the constructor of a class intended to be used in a CRTP is public, then @@ -62,7 +62,7 @@ Example: class Bad : CRTP {}; Bad BadInstance; -To ensure that no accidental instantiation happens, the best practice is to +To ensure that no accidental instantiation happens, the best practice is to make the constructor private and declare the derived class as friend. Note that as a tradeoff, this also gives the derived class access to every other private members of the CRTP. However, constructors can still be public or diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst new file mode 100644 index 0000000000000..c9918120f0770 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type.rst @@ -0,0 +1,20 @@ +.. title:: clang-tidy - bugprone-default-operator-new-on-overaligned-type + +bugprone-default-operator-new-on-overaligned-type +================================================= + +Flags uses of the default ``operator new`` where the type has extended +alignment (an alignment greater than the fundamental alignment). + +The default ``operator new`` is guaranteed to provide the correct alignment +if the requested alignment is less than or equal to the fundamental alignment. +By design, the check only detects cases where the ``operator new`` is not +user-defined and is not a placement new; when a user-defined or placement +``operator new`` is used, the user is assumed to provide correctly aligned +memory. + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`MEM57-CPP. Avoid using default operator new for over-aligned types +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst index f544abc14ffbf..aff3e1e6b6fb0 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst @@ -7,7 +7,7 @@ Finds derived class methods that shadow a (non-virtual) base class method. In order to be considered "shadowing", methods must have the same signature (i.e. 
the same name, same number of parameters, same parameter types, etc). -Only checks public, non-templated methods. +Only checks public, non-templated methods. The below example is bugprone because consumers of the ``Derived`` class will expect the ``reset`` method to do the work of ``Base::reset()`` in addition to extra @@ -27,4 +27,4 @@ This is also a violation of the Liskov Substitution Principle. struct Derived : public Base { void reset() {/* reset the derived class, but not the base class */}; - }; \ No newline at end of file + }; diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst new file mode 100644 index 0000000000000..8c3becf80a541 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst @@ -0,0 +1,31 @@ +.. title:: clang-tidy - bugprone-exception-copy-constructor-throws + +bugprone-exception-copy-constructor-throws +========================================== + +Checks whether a thrown object's copy constructor can throw. + +Exception objects are required to be copy constructible in C++. However, an +exception's copy constructor should not throw to avoid potential issues when +unwinding the stack. If an exception is thrown during stack unwinding (such +as from a copy constructor of an exception object), the program will +terminate via ``std::terminate``. + +.. code-block:: c++ + + class SomeException { + public: + SomeException() = default; + SomeException(const SomeException&) { /* may throw */ } + }; + + void f() { + throw SomeException(); // warning: thrown exception type's copy constructor can throw + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`ERR60-CPP. Exception objects must be nothrow copy constructible +`_. \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst index 182fade7f47a0..7eaa333d5403a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst @@ -35,6 +35,31 @@ WARNING! This check may be expensive on large source files. Options ------- +.. option:: CheckDestructors + + When `true`, destructors are analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckMoveMemberFunctions + + When `true`, move constructors and move assignment operators are analyzed + to not throw exceptions. Default value is `true`. + +.. option:: CheckMain + + When `true`, the ``main()`` function is analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckNothrowFunctions + + When `true`, functions marked with ``noexcept`` or ``throw()`` exception + specifications are analyzed to not throw exceptions. Default value is `true`. + +.. option:: CheckedSwapFunctions + + Comma-separated list of swap function names which should not throw exceptions. + Default value is `swap,iter_swap,iter_move`. + .. option:: FunctionsThatShouldNotThrow Comma separated list containing function names which should not throw. 
An diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst new file mode 100644 index 0000000000000..e663b40ab5a5d --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst @@ -0,0 +1,13 @@ +.. title:: clang-tidy - bugprone-float-loop-counter + +bugprone-float-loop-counter +=========================== + +Flags ``for`` loops where the induction expression has a floating-point type. + +References +---------- + +This check corresponds to the CERT C Coding Standard rule +`FLP30-C. Do not use floating-point variables as loop counters +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst index cc9e7be70f6ea..968340a6e8f98 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.rst @@ -3,8 +3,8 @@ bugprone-incorrect-enable-shared-from-this ========================================== -Detect classes or structs that do not publicly inherit from -``std::enable_shared_from_this``, because unintended behavior will +Detects classes or structs that do not publicly inherit from +``std::enable_shared_from_this``, because unintended behavior will otherwise occur when calling ``shared_from_this``. Consider the following code: @@ -15,7 +15,7 @@ Consider the following code: // private inheritance class BadExample : std::enable_shared_from_this { - + // ``shared_from_this``` unintended behaviour // `libstdc++` implementation returns uninitialized ``weak_ptr`` public: @@ -29,6 +29,6 @@ Consider the following code: b_ex->bar(); } -Using `libstdc++` implementation, ``shared_from_this`` will throw -``std::bad_weak_ptr``. When ``using_not_public()`` is called, this code will +Using the `libstdc++` implementation, ``shared_from_this`` will throw +``std::bad_weak_ptr``. When ``using_not_public()`` is called, this code will crash without exception handling. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst index 95509ef3c724d..2641cfe72e18c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.rst @@ -54,7 +54,7 @@ Options Default: `false`. .. code-block:: c++ - + void bar(Base b[], Derived d[]) { b += 1; // warning, as Base declares a virtual destructor d += 1; // warning only if IgnoreVirtualDeclarationsOnly is set to false diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst new file mode 100644 index 0000000000000..25712447f7897 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst @@ -0,0 +1,44 @@ +.. title:: clang-tidy - bugprone-random-generator-seed + +bugprone-random-generator-seed +============================== + +Flags all pseudo-random number engines, engine adaptor +instantiations, and ``srand()`` when initialized or seeded with a default +argument, a constant expression, or a value of a user-configurable type. 
Pseudo-random number +engines seeded with a predictable value may cause vulnerabilities, e.g. in +security protocols. + +Examples: + +.. code-block:: c++ + + void foo() { + std::mt19937 engine1; // Diagnose, always generate the same sequence + std::mt19937 engine2(1); // Diagnose + engine1.seed(); // Diagnose + engine2.seed(1); // Diagnose + + std::time_t t; + engine1.seed(std::time(&t)); // Diagnose, system time might be controlled by user + + int x = atoi(argv[1]); + std::mt19937 engine3(x); // Will not warn + } + +Options +------- + +.. option:: DisallowedSeedTypes + + A comma-separated list of the type names which are disallowed. + Default value is `time_t,std::time_t`. + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rules +`MSC51-CPP. Ensure your random number generator is properly seeded +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_ and +`MSC32-C. Properly seed pseudorandom number generators +<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_. \ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst new file mode 100644 index 0000000000000..db3844447b3fd --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/raw-memory-call-on-non-trivial-type.rst @@ -0,0 +1,35 @@ +.. title:: clang-tidy - bugprone-raw-memory-call-on-non-trivial-type + +bugprone-raw-memory-call-on-non-trivial-type +============================================ + +Flags use of the C standard library functions ``memset``, ``memcpy`` and +``memcmp`` and similar derivatives on non-trivial types. + +The check will detect the following functions: ``memset``, ``std::memset``, +``std::memcpy``, ``memcpy``, ``std::memmove``, ``memmove``, ``std::strcpy``, +``strcpy``, ``memccpy``, ``stpncpy``, ``strncpy``, ``std::memcmp``, ``memcmp``, +``std::strcmp``, ``strcmp``, ``strncmp``. + +Options +------- + +.. option:: MemSetNames + + Specify extra functions to flag that act similarly to ``memset``. Specify + names in a semicolon-delimited list. Default is an empty string. + +.. option:: MemCpyNames + + Specify extra functions to flag that act similarly to ``memcpy``. Specify + names in a semicolon-delimited list. Default is an empty string. + +.. option:: MemCmpNames + + Specify extra functions to flag that act similarly to ``memcmp``. Specify + names in a semicolon-delimited list. Default is an empty string. + +This check corresponds to the CERT C++ Coding Standard rule +`OOP57-CPP. Prefer special member functions and overloaded operators to C +Standard Library functions +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst index 325a0a2aa9cc2..00759a2ca003b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst @@ -22,7 +22,7 @@ Example S(int); ~S(); }; - + const S &fn(const S &a) { return a; } @@ -35,7 +35,7 @@ This issue can be resolved by declaring an overload of the problematic function where the ``const &`` parameter is instead declared as ``&&``. The developer has to ensure that the implementation of that function does not produce a use-after-free, the exact error that this check is warning against.
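For illustration, a minimal sketch of that overload set, reusing the ``S`` type from the example above; the exact signatures are an assumption, not the check's mandated fix:

.. code-block:: c++

    // Lvalue arguments still bind here; returning the reference is fine for them.
    const S &fn(const S &a) { return a; }

    // Rvalue arguments now select this overload, so the result is moved out by
    // value instead of referencing a temporary that is about to be destroyed.
    S fn(S &&a) { return std::move(a); }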
-Marking such an ``&&`` overload as ``deleted``, will silence the warning as +Marking such an ``&&`` overload as ``deleted`` will silence the warning as well. In the case of different ``const &`` parameters being returned depending on the control flow of the function, an overload where all problematic ``const &`` parameters have been declared as ``&&`` will resolve the issue.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst index 658b6555f1a1c..848fb667e1823 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst @@ -44,7 +44,7 @@ Options Selects which set of functions is considered as asynchronous-safe (and therefore allowed in signal handlers). It can be set to the following values: - + ``minimal`` Selects a minimal set that is defined in the CERT SIG30-C rule. and includes functions ``abort()``, ``_Exit()``, ``quick_exit()`` and
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. + +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on a user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +..
code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-std-namespace-modification] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash<MyData> { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst index a0bd1d7c5bc15..ad4ed895bf012 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-constructor.rst @@ -36,7 +36,7 @@ Examples: std::string_view("test", 0); Passing an invalid first character position parameter to constructor will -cause ``std::out_of_range`` exception at runtime. +cause ``std::out_of_range`` exception at runtime. Examples:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst index 67e416b711b64..25a0d8885689b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst @@ -10,7 +10,7 @@ The problem with this construct is that if ``realloc`` fails it returns a null pointer but does not deallocate the original memory. If no other variable is pointing to it, the original memory block is not available any more for the program to use or free. In either case ``p = realloc(p, size);`` indicates bad -coding style and can be replaced by ``q = realloc(p, size);``. +coding style and can be replaced by ``q = realloc(p, size);``. The pointer expression (used at ``realloc``) can be a variable or a field member of a data structure, but can not contain function calls or unresolved types.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst index 072b5a3eee20f..a3469dc451562 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst @@ -9,7 +9,7 @@ different from the number of data members inside the union. A struct or a class is considered to be a tagged union if it has exactly one union data member and exactly one enum data member and any number of other data members that are neither unions or enums.
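For illustration, a minimal sketch of a tagged union this check would inspect; the names are hypothetical, and the enum carries one more constant than the union has data members, which is the mismatch that gets flagged:

.. code-block:: c++

    enum class Tag { Int, Float, Unknown }; // three tag constants

    struct TaggedUnion { // flagged: 3 tags vs. 2 union data members
      Tag Kind;
      union {
        int I;
        float F;
      } Data;
    };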
-Furthermore, the types of the union and the enum members must +Furthermore, the types of the union and the enum members must not come from system header files nor the ``std`` namespace. Example: diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst index 552e6db699696..ebed79e339d4b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst @@ -308,3 +308,22 @@ advantages: * Performance. A single check can cover many or even all accesses within scope. This gives the user the best of both worlds -- the safety of a dynamic check, but without incurring redundant costs. + +Options +------- + +.. option:: IgnoreSmartPointerDereference + + If set to `true`, the check ignores optionals that + are reached through overloaded smart-pointer-like dereference (``operator*``, + ``operator->``) on classes other than the optional type itself. This helps + avoid false positives where the analysis cannot equate results across such + calls. This does not cover access through ``operator[]``. Default is `false`. + +.. option:: IgnoreValueCalls + + If set to `true`, the check does not diagnose calls + to ``optional::value()``. Diagnostics for ``operator*()`` and + ``operator->()`` remain enabled. This is useful for codebases that + intentionally rely on ``value()`` for defined, guarded access while still + flagging UB-prone operator dereferences. Default is `false`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst index 317db9c5564e2..6937c5177b6c2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst @@ -86,7 +86,7 @@ checked. The format is the following, without newlines: .. code:: bugprone-unsafe-functions.CustomFunctions=" - functionRegex1[, replacement1[, reason1]]; + functionRegex1[, replacement1[, reason1]]; functionRegex2[, replacement2[, reason2]]; ... " @@ -104,7 +104,7 @@ As an example, the configuration `^original$, replacement, is deprecated;` will produce the following diagnostic message. .. code:: c - + original(); // warning: function 'original' is deprecated; 'replacement' should be used instead. ::std::original(); // no-warning original_function(); // no-warning diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications. -The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. 
-- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash<MyData> { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` check is an alias, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP. Do not modify the standard namespaces <https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst index 9fcb840fc06f8..8d6dd1bf4b9b7 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst @@ -1,11 +1,14 @@ .. title:: clang-tidy - cert-err60-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/exception-copy-constructor-throws.html cert-err60-cpp ============== -This check flags all throw expressions where the exception object is not nothrow -copy constructible. +The `cert-err60-cpp` check is an alias, please see +`bugprone-exception-copy-constructor-throws <../bugprone/exception-copy-constructor-throws.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `ERR60-CPP. Exception objects must be nothrow copy constructible -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_. +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst index aeaaad793ca5c..9eaa12fc223aa 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/fio38-c.rst @@ -8,3 +8,6 @@ cert-fio38-c The `cert-fio38-c` check is an alias, please see :doc:`misc-non-copyable-objects <../misc/non-copyable-objects>` for more information. + +This check corresponds to the CERT C Coding Standard rule `FIO38-C. Do not copy a FILE object +<https://wiki.sei.cmu.edu/confluence/display/c/FIO38-C.+Do+not+copy+a+FILE+object>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst index c37b63980be76..5f6eff447cc2f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst @@ -3,8 +3,9 @@ cert-flp30-c ============ -This check flags ``for`` loops where the induction expression has a -floating-point type.
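For context, a minimal sketch of the loop shape this (now aliased) check flags; ``0.1f`` has no exact binary representation, so the trip count depends on accumulated rounding error:

.. code-block:: c++

    // May iterate 9 or 10 times depending on rounding; flagged by the check.
    for (float x = 0.1f; x <= 1.0f; x += 0.1f) {
      // ...
    }

    // A safer equivalent drives the loop with an integer counter.
    for (int i = 1; i <= 10; ++i) {
      float x = i / 10.0f;
      // ...
    }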
+The `cert-flp30-c` check is an alias, please see +`bugprone-float-loop-counter <../bugprone/float-loop-counter.html>`_ +for more information. This check corresponds to the CERT C Coding Standard rule `FLP30-C. Do not use floating-point variables as loop counters <https://wiki.sei.cmu.edu/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst index 135cfb86f3d50..b359d85ad0cdc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst @@ -3,13 +3,9 @@ cert-mem57-cpp ============== -This check flags uses of default ``operator new`` where the type has extended -alignment (an alignment greater than the fundamental alignment). (The default -``operator new`` is guaranteed to provide the correct alignment if the -requested alignment is less or equal to the fundamental alignment). -Only cases are detected (by design) where the ``operator new`` is not -user-defined and is not a placement new (the reason is that in these cases we -assume that the user provided the correct memory allocation). +The `cert-mem57-cpp` check is an alias, please see +`bugprone-default-operator-new-on-overaligned-type <../bugprone/default-operator-new-on-overaligned-type.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `MEM57-CPP. Avoid using default operator new for over-aligned types <https://wiki.sei.cmu.edu/confluence/display/cplusplus/MEM57-CPP.+Avoid+using+default+operator+new+for+over-aligned+types>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst index 6e453edefa76e..e0ed8074185ca 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/msc32-c.rst @@ -1,9 +1,14 @@ .. title:: clang-tidy - cert-msc32-c .. meta:: - :http-equiv=refresh: 5;URL=../cert/msc51-cpp.html + :http-equiv=refresh: 5;URL=../bugprone/random-generator-seed.html cert-msc32-c ============ The `cert-msc32-c` check is an alias, please see -:doc:`cert-msc51-cpp <../cert/msc51-cpp>` for more information. +:doc:`bugprone-random-generator-seed <../bugprone/random-generator-seed>` +for more information. + +This check corresponds to the CERT C Coding Standard rule +`MSC32-C. Properly seed pseudorandom number generators +<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst index 99e550aef0e7a..a9b8672091bc6 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst @@ -1,40 +1,14 @@ .. title:: clang-tidy - cert-msc51-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/random-generator-seed.html cert-msc51-cpp ============== -This check flags all pseudo-random number engines, engine adaptor -instantiations and ``srand()`` when initialized or seeded with default argument, -constant expression or any user-configurable type. Pseudo-random number -engines seeded with a predictable value may cause vulnerabilities e.g. in -security protocols. -This is a CERT security rule, see -`MSC51-CPP. Ensure your random number generator is properly seeded -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_ and -`MSC32-C. Properly seed pseudorandom number generators -<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_. - -Examples: - -..
code-block:: c++ - - void foo() { - std::mt19937 engine1; // Diagnose, always generate the same sequence - std::mt19937 engine2(1); // Diagnose - engine1.seed(); // Diagnose - engine2.seed(1); // Diagnose - - std::time_t t; - engine1.seed(std::time(&t)); // Diagnose, system time might be controlled by user +The `cert-msc51-cpp` check is an alias, please see +:doc:`bugprone-random-generator-seed <../bugprone/random-generator-seed>` +for more information. - int x = atoi(argv[1]); - std::mt19937 engine3(x); // Will not warn - } - -Options ------- - -.. option:: DisallowedSeedTypes - - A comma-separated list of the type names which are disallowed. - Default value is `time_t,std::time_t`. +This check corresponds to the CERT C++ Coding Standard rule +`MSC51-CPP. Ensure your random number generator is properly seeded +<https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_. \ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst index 4787abf1554ab..414f788bf2500 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/oop57-cpp.rst @@ -1,38 +1,13 @@ .. title:: clang-tidy - cert-oop57-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/raw-memory-call-on-non-trivial-type.html cert-oop57-cpp ============== - Flags use of the `C` standard library functions ``memset``, ``memcpy`` and - ``memcmp`` and similar derivatives on non-trivial types. - -Options ------- - -.. option:: MemSetNames - - Specify extra functions to flag that act similarly to ``memset``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `memset`, `std::memset`. - -.. option:: MemCpyNames - - Specify extra functions to flag that act similarly to ``memcpy``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `std::memcpy`, `memcpy`, `std::memmove`, `memmove`, `std::strcpy`, - `strcpy`, `memccpy`, `stpncpy`, `strncpy`. - -.. option:: MemCmpNames - - Specify extra functions to flag that act similarly to ``memcmp``. - Specify names in a semicolon delimited list. - Default is an empty string. - The check will detect the following functions: - `std::memcmp`, `memcmp`, `std::strcmp`, `strcmp`, `strncmp`. +The `cert-oop57-cpp` check is an alias, please see +`bugprone-raw-memory-call-on-non-trivial-type <../bugprone/raw-memory-call-on-non-trivial-type.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `OOP57-CPP. Prefer special member functions and overloaded operators to C Standard Library functions <https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst index 399fb1b7e9279..e435490f0711a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/oop58-cpp.rst @@ -1,11 +1,10 @@ -.. title:: clang-tidy - cert-mutating-copy +.. title:: clang-tidy - cert-oop58-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/copy-constructor-mutates-argument.html cert-oop58-cpp ============== -Finds assignments to the copied object and its direct or indirect members -in copy constructors and copy assignment operators. - -This check corresponds to the CERT C Coding Standard rule -`OOP58-CPP. Copy operations must not mutate the source object -<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object>`_.
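For reference, a minimal sketch of the pattern behind the renamed check; the type is hypothetical, and the copy constructor takes a non-``const`` reference so it can mutate its source, which is exactly what gets flagged:

.. code-block:: c++

    class P {
      int Data;
    public:
      P(P &Other) : Data(Other.Data) {
        Other.Data = 0; // flagged: the copy operation mutates the source object
      }
    };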
+The `cert-oop58-cpp` check is an alias, please see +:doc:`bugprone-copy-constructor-mutates-argument <../bugprone/copy-constructor-mutates-argument>` +for more information. \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst index 556d90213b216..f45bca684d492 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.rst @@ -39,14 +39,14 @@ Options .. option:: ExcludeClasses - Semicolon-delimited list of class names for overwriting the default - exclusion list. The default is: + Semicolon-separated list of regular expressions matching class names that + overwrites the default exclusion list. The default is: `::std::map;::std::unordered_map;::std::flat_map`. - + .. option:: FixMode - Determines what fixes are suggested. Either `none`, `at` (use - ``a.at(index)`` if a fitting function exists) or `function` (use a + Determines what fixes are suggested. Either `none`, `at` (use + ``a.at(index)`` if a fitting function exists) or `function` (use a function ``f(a, index)``). The default is `none`. .. option:: FixFunction @@ -54,7 +54,7 @@ Options The function to use in the `function` mode. For C++23 and beyond, the passed function must support the empty subscript operator, i.e., the case where ``a[]`` becomes ``f(a)``. :option:`FixFunctionEmptyArgs` can be - used to override the suggested function in that case. The default is `gsl::at`. + used to override the suggested function in that case. The default is `gsl::at`. .. 
option:: FixFunctionEmptyArgs
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index d3c89e469188d..a4014b5f15f0b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -90,13 +90,17 @@ Clang-Tidy Checks :doc:`bugprone-command-processor <bugprone/command-processor>`, :doc:`bugprone-compare-pointer-to-member-virtual-function <bugprone/compare-pointer-to-member-virtual-function>`, :doc:`bugprone-copy-constructor-init <bugprone/copy-constructor-init>`, "Yes" + :doc:`bugprone-copy-constructor-mutates-argument <bugprone/copy-constructor-mutates-argument>`, :doc:`bugprone-crtp-constructor-accessibility <bugprone/crtp-constructor-accessibility>`, "Yes" :doc:`bugprone-dangling-handle <bugprone/dangling-handle>`, + :doc:`bugprone-default-operator-new-on-overaligned-type <bugprone/default-operator-new-on-overaligned-type>`, :doc:`bugprone-derived-method-shadowing-base-method <bugprone/derived-method-shadowing-base-method>`, :doc:`bugprone-dynamic-static-initializers <bugprone/dynamic-static-initializers>`, :doc:`bugprone-easily-swappable-parameters <bugprone/easily-swappable-parameters>`, :doc:`bugprone-empty-catch <bugprone/empty-catch>`, + :doc:`bugprone-exception-copy-constructor-throws <bugprone/exception-copy-constructor-throws>`, :doc:`bugprone-exception-escape <bugprone/exception-escape>`, + :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`, :doc:`bugprone-fold-init-type <bugprone/fold-init-type>`, :doc:`bugprone-forward-declaration-namespace <bugprone/forward-declaration-namespace>`, :doc:`bugprone-forwarding-reference-overload <bugprone/forwarding-reference-overload>`, @@ -129,6 +133,8 @@ Clang-Tidy Checks :doc:`bugprone-parent-virtual-call <bugprone/parent-virtual-call>`, "Yes" :doc:`bugprone-pointer-arithmetic-on-polymorphic-object <bugprone/pointer-arithmetic-on-polymorphic-object>`, :doc:`bugprone-posix-return <bugprone/posix-return>`, "Yes" + :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, + :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`, :doc:`bugprone-redundant-branch-condition <bugprone/redundant-branch-condition>`, "Yes" :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`bugprone-return-const-ref-from-parameter <bugprone/return-const-ref-from-parameter>`, @@ -139,6 +145,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression <bugprone/sizeof-expression>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`, :doc:`bugprone-standalone-empty <bugprone/standalone-empty>`, "Yes" + :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`bugprone-string-constructor <bugprone/string-constructor>`, "Yes" :doc:`bugprone-string-integer-assignment <bugprone/string-integer-assignment>`, "Yes" :doc:`bugprone-string-literal-with-embedded-nul <bugprone/string-literal-with-embedded-nul>`, @@ -173,14 +180,10 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`, :doc:`bugprone-use-after-move <bugprone/use-after-move>`, :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes" - :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`cert-err33-c <cert/err33-c>`, :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`cert-flp30-c <cert/flp30-c>`, - :doc:`cert-mem57-cpp <cert/mem57-cpp>`, :doc:`cert-msc50-cpp <cert/msc50-cpp>`, - :doc:`cert-msc51-cpp <cert/msc51-cpp>`, - :doc:`cert-oop57-cpp <cert/oop57-cpp>`, :doc:`cert-oop58-cpp <cert/oop58-cpp>`, :doc:`concurrency-mt-unsafe <concurrency/mt-unsafe>`, :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`, @@ -441,24 +444,31 @@ Check aliases :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`, :doc:`modernize-avoid-variadic-functions <modernize/avoid-variadic-functions>`, :doc:`cert-dcl51-cpp <cert/dcl51-cpp>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`cert-dcl54-cpp <cert/dcl54-cpp>`, :doc:`misc-new-delete-overloads <misc/new-delete-overloads>`, + :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`cert-dcl59-cpp <cert/dcl59-cpp>`, :doc:`google-build-namespaces <google/build-namespaces>`, - :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-env33-c <cert/env33-c>`, :doc:`bugprone-command-processor <bugprone/command-processor>`, + :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-err34-c <cert/err34-c>`, :doc:`bugprone-unchecked-string-to-number-conversion <bugprone/unchecked-string-to-number-conversion>`, :doc:`cert-err52-cpp <cert/err52-cpp>`, :doc:`modernize-avoid-setjmp-longjmp <modernize/avoid-setjmp-longjmp>`, :doc:`cert-err58-cpp <cert/err58-cpp>`, :doc:`bugprone-throwing-static-initialization <bugprone/throwing-static-initialization>`, + :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`bugprone-exception-copy-constructor-throws <bugprone/exception-copy-constructor-throws>`, :doc:`cert-err61-cpp <cert/err61-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, :doc:`cert-exp42-c <cert/exp42-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`, :doc:`cert-fio38-c <cert/fio38-c>`, :doc:`misc-non-copyable-objects <misc/non-copyable-objects>`, + :doc:`cert-flp30-c <cert/flp30-c>`, :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`, :doc:`cert-flp37-c <cert/flp37-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`, :doc:`cert-int09-c <cert/int09-c>
`, :doc:`readability-enum-initial-value <readability/enum-initial-value>`, "Yes" + :doc:`cert-mem57-cpp <cert/mem57-cpp>`, :doc:`bugprone-default-operator-new-on-overaligned-type <bugprone/default-operator-new-on-overaligned-type>`, :doc:`cert-msc24-c <cert/msc24-c>`, :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`, :doc:`cert-msc30-c <cert/msc30-c>`, :doc:`cert-msc50-cpp <cert/msc50-cpp>`, - :doc:`cert-msc32-c <cert/msc32-c>`, :doc:`cert-msc51-cpp <cert/msc51-cpp>`, + :doc:`cert-msc32-c <cert/msc32-c>`, :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, :doc:`cert-msc33-c <cert/msc33-c>`, :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`, + :doc:`cert-msc51-cpp <cert/msc51-cpp>`, :doc:`bugprone-random-generator-seed <bugprone/random-generator-seed>`, :doc:`cert-msc54-cpp <cert/msc54-cpp>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`, :doc:`cert-oop11-cpp <cert/oop11-cpp>`, :doc:`performance-move-constructor-init <performance/move-constructor-init>`, :doc:`cert-oop54-cpp <cert/oop54-cpp>`, :doc:`bugprone-unhandled-self-assignment <bugprone/unhandled-self-assignment>`, + :doc:`cert-oop57-cpp <cert/oop57-cpp>`, :doc:`bugprone-raw-memory-call-on-non-trivial-type <bugprone/raw-memory-call-on-non-trivial-type>`, + :doc:`cert-oop58-cpp <cert/oop58-cpp>`, :doc:`bugprone-copy-constructor-mutates-argument <bugprone/copy-constructor-mutates-argument>`, :doc:`cert-pos44-c <cert/pos44-c>`, :doc:`bugprone-bad-signal-to-kill-thread <bugprone/bad-signal-to-kill-thread>`, :doc:`cert-pos47-c <cert/pos47-c>`, :doc:`concurrency-thread-canceltype-asynchronous <concurrency/thread-canceltype-asynchronous>`, :doc:`cert-sig30-c <cert/sig30-c>`, :doc:`bugprone-signal-handler <bugprone/signal-handler>`,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst index 85579ca676a68..79179ce808302 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.rst @@ -37,7 +37,7 @@ For example non-compliant code: Should become: .. code-block:: c++ - + // Small anonymous namespace for class declaration namespace { @@ -48,7 +48,7 @@ Should become: }; } - + // placed method definition outside of the anonymous namespace bool StringSort::operator<(const char *RHS) const {} @@ -70,4 +70,4 @@ Options .. option:: AllowMemberFunctionsInClass When `true`, only methods defined in anonymous namespace outside of the - corresponding class will be warned. Default value is `true`. \ No newline at end of file + corresponding class will be warned. Default value is `true`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst index ec9ef1c60913c..6c994a48d83de 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst @@ -14,3 +14,21 @@ should be generally avoided. // becomes static std::string Moo = (Twine("bark") + "bah").str(); + +``Twine`` does not own the memory of its contents, so it is not +recommended to use a ``Twine`` created from temporary strings or string literals. + +.. code-block:: c++ + + static Twine getModuleIdentifier(StringRef moduleName) { + return moduleName + "_module"; + } + void foo() { + Twine result = getModuleIdentifier(std::string{"abc"} + "def"); + // temporary std::string is destroyed here, result is dangling + } + +After applying the fix-it hints, the code will use ``std::string`` instead of +``Twine`` for local variables. However, ``Twine`` has many methods that +are incompatible with ``std::string``, so the user may need to adjust the code +manually after applying the fix-it hints.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst index fffa2ff342a36..1a0454703822e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/use-ranges.rst @@ -12,7 +12,7 @@ Example ..
code-block:: c++ auto it = std::find(vec.begin(), vec.end(), value); - bool all = std::all_of(vec.begin(), vec.end(), + bool all = std::all_of(vec.begin(), vec.end(), [](int x) { return x > 0; }); Transforms to:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst index 93a5762be189a..18ec10be347dc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/const-correctness.rst @@ -99,7 +99,7 @@ Options .. option:: AnalyzePointers Enable or disable the analysis of pointers variables, like - ``int *ptr = &i;``. For specific checks, see + ``int *ptr = &i;``. For specific checks, see :option:`WarnPointersAsValues` and :option:`WarnPointersAsPointers`. Default is `true`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst index 0b054e4e20bd6..be80d39e4abf9 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst @@ -81,3 +81,23 @@ Options Eg: `my::safe::awaitable;other::awaitable` Default is an empty string. +.. option:: AllowedCallees + + A semicolon-separated list of callee function names which can + be safely awaited while having hostile RAII objects in scope. + Example usage: + + .. code-block:: c++ + + // Consider option AllowedCallees = "noop" + task noop() { co_return; } + + task coro() { + // This persists across the co_await but is not flagged + // because the awaitable is considered safe to await on. + const std::lock_guard<std::mutex> l(mu_); + co_await noop(); + } + + Eg: `my::safe::await;other::await` + Default is an empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst index 34833a3dd1aea..4364610787058 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst @@ -10,7 +10,7 @@ Findings correspond to https://clangd.llvm.org/design/include-cleaner. Example: .. code-block:: c++ - + // foo.h class Foo{}; // bar.h
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst index 16ffc97d56ab4..e9bc19a4b10e1 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/misleading-bidirectional.rst @@ -3,7 +3,7 @@ misc-misleading-bidirectional ============================= -Warn about unterminated bidirectional unicode sequence, detecting potential attack +Warns about unterminated bidirectional unicode sequences, detecting a potential attack as described in the `Trojan Source <https://trojansource.codes>`_ attack. Example:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst index d1f7bba39b37c..d3190e1155f08 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/non-copyable-objects.rst @@ -5,9 +5,12 @@ misc-non-copyable-objects `cert-fio38-c` redirects here as an alias for this check.
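For illustration, a minimal sketch of the kind of code flagged by the description that follows (assumes only ``<cstdio>``):

.. code-block:: c++

    #include <cstdio>

    void f(FILE *File) {
      FILE Copy = *File; // flagged: FILE objects are not meant to be copied
      (void)Copy;
    }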
-The check flags dereferences and non-pointer declarations of objects that are +Flags dereferences and non-pointer declarations of objects that are not meant to be passed by value, such as C FILE objects or POSIX ``pthread_mutex_t`` objects. +References +---------- + This check corresponds to CERT C++ Coding Standard rule `FIO38-C. Do not copy a FILE object <https://wiki.sei.cmu.edu/confluence/display/c/FIO38-C.+Do+not+copy+a+FILE+object>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst index 310bfe2b01080..24be51b53c9c4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst @@ -20,7 +20,7 @@ the ``using`` keyword is not considered as visibility change by this check. private: virtual void f_priv(); }; - + class B: public A { public: void f_priv(); // warning: changed visibility from private to public
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst index d6721a25629b0..157c447ee4d98 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst @@ -3,7 +3,7 @@ modernize-min-max-use-initializer-list ====================================== -Replaces nested ``std::min`` and ``std::max`` calls with an initializer list +Replaces nested ``std::min`` and ``std::max`` calls with an initializer list where applicable. For instance, consider the following code: @@ -21,8 +21,8 @@ The check will transform the above code to: Performance Considerations ========================== -While this check simplifies the code and makes it more readable, it may cause -performance degradation for non-trivial types due to the need to copy objects +While this check simplifies the code and makes it more readable, it may cause +performance degradation for non-trivial types due to the need to copy objects into the initializer list. To avoid this, it is recommended to use `std::ref` or `std::cref` for @@ -47,4 +47,4 @@ Options .. option:: IgnoreTrivialTypesOfSizeAbove An integer specifying the size (in bytes) above which trivial types are - ignored. Default is `32`. \ No newline at end of file + ignored. Default is `32`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst index 91be4fb05a0a9..c0cffde820e84 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/type-traits.rst @@ -37,7 +37,7 @@ Options #define IS_SIGNED(T) std::is_signed<T>::value - Defaults to `false`. + Defaults to `false`. Limitations
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst index 912b42b33f919..98779d8687348 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-ranges.rst @@ -114,7 +114,7 @@ If calls are made using reverse iterators on containers, The code will be fixed using the ``std::views::reverse`` adaptor. ..
code-block:: c++ - + auto AreSame = std::equal(Items1.rbegin(), Items1.rend(), std::crbegin(Items2), std::crend(Items2));
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst index 7cf24b43d2e7b..9235d4246782e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-scoped-lock.rst @@ -93,9 +93,9 @@ Options template <typename T> using Lock = std::lock_guard<T>; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - + using LockMutex = std::lock_guard<std::mutex>; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - + typedef std::lock_guard<std::mutex> LockDef; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' - using std::lock_guard; // warning: use 'std::scoped_lock' instead of 'std::lock_guard' \ No newline at end of file + using std::lock_guard; // warning: use 'std::scoped_lock' instead of 'std::lock_guard'
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst index 1babc2d1660ec..fd89b780f7519 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-starts-ends-with.rst @@ -4,7 +4,7 @@ modernize-use-starts-ends-with ============================== Checks for common roundabout ways to express ``starts_with`` and ``ends_with`` -and suggests replacing with the simpler method when it is available. Notably, +and suggests replacing with the simpler method when it is available. Notably, this will work with ``std::string`` and ``std::string_view``. Covered scenarios:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst index cfa11d3cac8bf..21bb254217910 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-format.rst @@ -62,12 +62,12 @@ Options .. option:: StrFormatLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter contains the - printf-style format string and the arguments to be formatted follow - immediately afterwards. Qualified member function names are supported, - but the replacement function name must be unqualified. The default value - for this option is `absl::StrFormat`. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement that + the first parameter contains the printf-style format string and the + arguments to be formatted follow immediately afterwards. Qualified member + function names are supported, but the replacement function name must be + unqualified. The default value is `absl::StrFormat`. .. option:: ReplacementFormatFunction
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst index 0cf51e3961a05..3005708c6f8a8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-std-print.rst @@ -122,25 +122,27 @@ Options ..
option:: PrintfLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter contains the - printf-style format string and the arguments to be formatted follow - immediately afterwards. Qualified member function names are supported, - but the replacement function name must be unqualified. If neither this - option nor `FprintfLikeFunctions` are set then the default value for - this option is `printf; absl::PrintF`, otherwise it is empty. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement + that the first parameter contains the printf-style format string and the + arguments to be formatted follow immediately afterwards. Qualified member + function names are supported, but the replacement function name must be + unqualified. If neither this option nor `FprintfLikeFunctions` are set then + the default value is `printf; absl::PrintF`, otherwise it is the empty + string. .. option:: FprintfLikeFunctions - A semicolon-separated list of (fully qualified) function names to - replace, with the requirement that the first parameter is retained, the - second parameter contains the printf-style format string and the - arguments to be formatted follow immediately afterwards. Qualified - member function names are supported, but the replacement function name - must be unqualified. If neither this option nor `PrintfLikeFunctions` - are set then the default value for this option is `fprintf; - absl::FPrintF`, otherwise it is empty. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions to replace, with the requirement + that the first parameter is retained, the second parameter contains the + printf-style format string and the arguments to be formatted follow + immediately afterwards. Qualified member function names are supported, + but the replacement function name must be unqualified. If neither this + option nor `PrintfLikeFunctions` are set then the default value is + `fprintf;absl::FPrintF`, otherwise it is the empty string. + .. option:: ReplacementPrintFunction diff --git a/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst b/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst index cff493b52913f..b5a1386d2166e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/objc/nsdate-formatter.rst @@ -10,64 +10,64 @@ despite being legal. See http://www.unicode.org/reports/tr35/tr35-dates.html#Dat This checker reports as warnings the following string patterns in a date format specifier: #. yyyy + ww : Calendar year specified with week of a week year (unless YYYY is also specified). - - * | **Example 1:** Input Date: `29 December 2014` ; Format String: `yyyy-ww`; + + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `yyyy-ww`; | Output string: `2014-01` (Wrong because it’s not the first week of 2014) - - * | **Example 2:** Input Date: `29 December 2014` ; Format String: `dd-MM-yyyy (ww-YYYY)`; + + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `dd-MM-yyyy (ww-YYYY)`; | Output string: `29-12-2014 (01-2015)` (This is correct) - + #. F without ee/EE : Numeric day of week in a month without actual day. 
- - * | **Example:** Input Date: `29 December 2014` ; Format String: `F-MM`; + + * | **Example:** Input Date: `29 December 2014` ; Format String: `F-MM`; | Output string: `5-12` (Wrong because it reads as *5th ___ of Dec* in English) - + #. F without MM : Numeric day of week in a month without month. - + * | **Example:** Input Date: `29 December 2014` ; Format String: `F-EE` | Output string: `5-Mon` (Wrong because it reads as *5th Mon of ___* in English) - + #. WW without MM : Week of the month without the month. - + * | **Example:** Input Date: `29 December 2014` ; Format String: `WW-yyyy` | Output string: `05-2014` (Wrong because it reads as *5th Week of ___* in English) - + #. YYYY + QQ : Week year specified with quarter of normal year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-QQ` | Output string: `2015-04` (Wrong because it’s not the 4th quarter of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (QQ-yyyy)` | Output string: `01-2015 (04-2014)` (This is correct) - + #. YYYY + MM : Week year specified with Month of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-MM` | Output string: `2015-12` (Wrong because it’s not the 12th month of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (MM-yyyy)` | Output string: `01-2015 (12-2014)` (This is correct) - + #. YYYY + DD : Week year with day of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-DD` | Output string: `2015-363` (Wrong because it’s not the 363rd day of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (DD-yyyy)` | Output string: `01-2015 (363-2014)` (This is correct) - + #. YYYY + WW : Week year with week of a calendar year (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-WW` | Output string: `2015-05` (Wrong because it’s not the 5th week of 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (WW-MM-yyyy)` | Output string: `01-2015 (05-12-2014)` (This is correct) - + #. YYYY + F : Week year with day of week in a calendar month (unless yyyy is also specified). - + * | **Example 1:** Input Date: `29 December 2014` ; Format String: `YYYY-ww-F-EE` | Output string: `2015-01-5-Mon` (Wrong because it’s not the 5th Monday of January in 2015) - + * | **Example 2:** Input Date: `29 December 2014` ; Format String: `ww-YYYY (F-EE-MM-yyyy)` | Output string: `01-2015 (5-Mon-12-2014)` (This is correct) diff --git a/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst b/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst index aa3ed6653b475..913b20f93b438 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/portability/template-virtual-member-function.rst @@ -3,11 +3,11 @@ portability-template-virtual-member-function ============================================ -Finds cases when an uninstantiated virtual member function in a template class causes +Finds cases when an uninstantiated virtual member function in a template class causes cross-compiler incompatibility. 
-Upon instantiating a template class, non-virtual member functions don't have to be -instantiated unless they are used. Virtual member function instantiation on the other hand +Upon instantiating a template class, non-virtual member functions don't have to be +instantiated unless they are used. Virtual member function instantiation on the other hand is unspecified and depends on the implementation of the compiler. In the following snippets the virtual member function is not instantiated by GCC and Clang, @@ -19,7 +19,7 @@ it is rejected by the latter. template <typename T> struct CrossPlatformError { virtual ~CrossPlatformError() = default; - + static void used() {} virtual void unused() { @@ -33,5 +33,5 @@ it is rejected by the latter. } Cross-platform projects that need to support MSVC on Windows might see compiler errors -because certain virtual member functions are instantiated, which are not instantiated +because certain virtual member functions are instantiated, which are not instantiated by other compilers on other platforms. This check highlights such virtual member functions.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst index cf73839a46cfb..f8df02dd4460e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.rst @@ -40,14 +40,14 @@ other smart pointers or other classes use the :option:`SmartPointers` option. .. note:: - + The check may emit invalid fix-its and misleading warning messages when specifying custom smart pointers or other classes in the :option:`SmartPointers` option. For example, ``boost::scoped_ptr`` does not have an ``operator=`` which makes fix-its invalid. .. note:: - + Automatic fix-its are enabled only if :program:`clang-tidy` is invoked with the `--fix-notes` option.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst index da6f770b3d74b..cc012fdcd7649 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst @@ -30,8 +30,8 @@ Options .. option:: ExcludedComparisonTypes - A semicolon-separated list of class names for which the check will ignore - comparisons of objects with default-constructed objects of the same type. - If a class is listed here, the check will not suggest using ``empty()`` - instead of such comparisons for objects of that class. - Default value is: `::std::array`. + A semicolon-separated list of regular expressions matching class names for + which the check will ignore comparisons of objects with default-constructed + objects of the same type. If a class is listed here, the check will not + suggest using ``empty()`` instead of such comparisons for objects of that + class. Default value is: `::std::array`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst index 21d66daab334c..59f17ebc2d08b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst @@ -9,7 +9,7 @@ of different priorities.
Parentheses in mathematical expressions clarify the order of operations, especially with different-priority operators. Lengthy or multiline expressions can obscure this order, leading to coding errors. IDEs can aid clarity -by highlighting parentheses. Explicitly using parentheses also clarifies what the +by highlighting parentheses. Explicitly using parentheses also clarifies what the developer had in mind when writing the expression. Ensuring their presence reduces ambiguity and errors, promoting clearer and more maintainable code. @@ -24,4 +24,4 @@ After: .. code-block:: c++ - int x = 1 + (2 * 3) - (4 / 5); \ No newline at end of file + int x = 1 + (2 * 3) - (4 / 5);
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst index c33c05b42e500..5ae80d54e9154 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst @@ -15,7 +15,7 @@ In the example above the keyword ``inline`` is redundant since constexpr functions are implicitly inlined .. code-block:: c++ - + class MyClass { inline void myMethod() {} };
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst index 23d975e646490..b9c50c5b59889 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst @@ -27,3 +27,16 @@ affect the semantics. .. code-block:: c++ int a = (1 * 2) + 3; // no warning + +Options +------- + +.. option:: AllowedDecls + + Semicolon-separated list of regular expressions matching names of declarations + whose surrounding parentheses are ignored. Declarations can include variables + or functions. The default is `std::max;std::min`. + + Some standard library functions have the same name as a widely used + function-like macro, for example ``std::max`` and the ``max`` macro. A workaround + to distinguish them is to add parentheses around the function name, which + prevents function-like macro expansion.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst index 2789f9c096ccf..7b507771d6799 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-string-cstr.rst @@ -11,10 +11,10 @@ Options .. option:: StringParameterFunctions - A semicolon-separated list of (fully qualified) function/method/operator - names, with the requirement that any parameter currently accepting a - ``const char*`` input should also be able to accept ``std::string`` - inputs, or proper overload candidates that can do so should exist. This - can be used to configure functions such as ``fmt::format``, - ``spdlog::logger::info``, or wrappers around these and similar - functions. The default value is the empty string. + A semicolon-separated list of regular expressions matching the + (fully qualified) names of functions/methods/operators, with the requirement + that any parameter currently accepting a ``const char*`` input should also + be able to accept ``std::string`` inputs, or proper overload candidates that + can do so should exist.
This can be used to configure functions such as + ``fmt::format``, ``spdlog::logger::info``, or wrappers around these and + similar functions. The default value is the empty string. diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd clang-doc + Maintainers Doxygen Documentation diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt index eb5383c3ad44e..a775b790a3147 100644 --- a/clang-tools-extra/modularize/CMakeLists.txt +++ b/clang-tools-extra/modularize/CMakeLists.txt @@ -20,6 +20,7 @@ clang_target_link_libraries(modularize clangAST clangBasic clangDriver + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1345a6ef8f489..d80d78c64c6e2 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -50,18 +50,18 @@ // //===----------------------------------------------------------------------===// +#include "CoverageChecker.h" #include "ModularizeUtilities.h" #include "clang/AST/ASTConsumer.h" -#include "CoverageChecker.h" #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Option.h" @@ -73,7 +73,7 @@ using namespace Modularize; using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; namespace cl = llvm::cl; namespace sys = llvm::sys; diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 376ad0c7875bf..33966b44f719a 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -231,11 +231,11 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" @@ -254,7 +254,7 @@ using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; using namespace llvm; using namespace llvm::opt; diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 4dd84feac5df4..6978a6b2fe1b7 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -12,17 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "ModularizeUtilities.h" +#include "CoverageChecker.h" #include 
"clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" -#include "CoverageChecker.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "ModularizeUtilities.h" using namespace clang; using namespace llvm; diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt index 1323adbc35269..da36582ee0234 100644 --- a/clang-tools-extra/pp-trace/CMakeLists.txt +++ b/clang-tools-extra/pp-trace/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(pp-trace PRIVATE clangAST clangBasic + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp index 0b078c49a55b7..ba5a06a26830d 100644 --- a/clang-tools-extra/pp-trace/PPTrace.cpp +++ b/clang-tools-extra/pp-trace/PPTrace.cpp @@ -28,11 +28,11 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/Execution.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt index a70d2ef2d92f2..78447e7a00db8 100644 --- a/clang-tools-extra/test/CMakeLists.txt +++ b/clang-tools-extra/test/CMakeLists.txt @@ -87,4 +87,7 @@ add_lit_testsuite(check-clang-extra "Running clang-tools-extra/test" add_lit_testsuites(CLANG-EXTRA ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CLANG_TOOLS_TEST_DEPS} + SKIP "^clang-doc" ) + +add_subdirectory(clang-doc) diff --git a/clang-tools-extra/test/clang-doc/CMakeLists.txt b/clang-tools-extra/test/clang-doc/CMakeLists.txt new file mode 100644 index 0000000000000..4446b2a3c897f --- /dev/null +++ b/clang-tools-extra/test/clang-doc/CMakeLists.txt @@ -0,0 +1,7 @@ +# Specialize the clang-doc target to avoid building other projects +add_lit_testsuite(check-clang-extra-clang-doc "Running clang-doc tests" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS clang-doc + DEPENDS ${LLVM_UTILS_DEPS} +) diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test index 55099517101f2..5a40a6b7c9799 100644 --- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test +++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test @@ -2,17 +2,17 @@ // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json // RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV5Shape.html -check-prefix=HTML-SHAPE -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV10Calculator.html -check-prefix=HTML-CALC -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE -// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV6Circle.html -check-prefix=HTML-CIRCLE +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV5Shape.html -check-prefix=HTML-SHAPE +// RUN: FileCheck %s 
-input-file=%t/docs/html/GlobalNamespace/_ZTV10Calculator.html -check-prefix=HTML-CALC +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE +// RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV6Circle.html -check-prefix=HTML-CIRCLE HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: Shape -HTML-SHAPE: -HTML-SHAPE: +HTML-SHAPE: +HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: HTML-SHAPE: @@ -151,8 +151,8 @@ HTML-CALC: HTML-CALC: HTML-CALC: HTML-CALC: Calculator -HTML-CALC: -HTML-CALC: +HTML-CALC: +HTML-CALC: HTML-CALC: HTML-CALC: HTML-CALC: @@ -440,8 +440,8 @@ HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: Rectangle -HTML-RECTANGLE: -HTML-RECTANGLE: +HTML-RECTANGLE: +HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: HTML-RECTANGLE: @@ -597,8 +597,8 @@ HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: Circle -HTML-CIRCLE: -HTML-CIRCLE: +HTML-CIRCLE: +HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: HTML-CIRCLE: diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp index 513961723990e..4e5ec3a5729cd 100644 --- a/clang-tools-extra/test/clang-doc/json/class-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json template concept Addable = requires(T a, T b) { diff --git a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp index d3ad6957e7851..60f3b44eb69b0 100644 --- a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json --check-prefix=BASE -// RUN: FileCheck %s < %t/json/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json --check-prefix=BASE +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION template struct MyClass {}; diff --git a/clang-tools-extra/test/clang-doc/json/class-template.cpp b/clang-tools-extra/test/clang-doc/json/class-template.cpp index 5ef78f54854dd..de52064466140 100644 --- a/clang-tools-extra/test/clang-doc/json/class-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json template struct MyClass { T MemberTemplate; diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp index 20a9f218b3d79..91160585bef1a 100644 --- a/clang-tools-extra/test/clang-doc/json/class.cpp +++ b/clang-tools-extra/test/clang-doc/json/class.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json struct Foo; @@ -124,8 +124,6 @@ struct MyClass { // CHECK-NEXT: } // CHECK-NEXT: } // 
CHECK-NEXT: ], -// COM: FIXME: FullName is not emitted correctly. -// CHECK-NEXT: "FullName": "", // CHECK-NEXT: "HasEnums": true, // CHECK-NEXT: "HasPublicFunctions": true, // CHECK-NEXT: "HasPublicMembers": true, diff --git a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp index 1a73a0ddb722f..5b15a88d562de 100644 --- a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp +++ b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json template concept Incrementable = requires (T a) { a++; diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp index e96ec14d7dde4..5d8c47eff0a16 100644 --- a/clang-tools-extra/test/clang-doc/json/concept.cpp +++ b/clang-tools-extra/test/clang-doc/json/concept.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json // Requires that T supports post and pre-incrementing. template diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp index 94271467cba63..8ba6adc66a54b 100644 --- a/clang-tools-extra/test/clang-doc/json/function-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json template concept Incrementable = requires(T x) { diff --git a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp index faaccb7d4f63f..6630d9e873dcf 100644 --- a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json static void myFunction() {} diff --git a/clang-tools-extra/test/clang-doc/json/method-template.cpp b/clang-tools-extra/test/clang-doc/json/method-template.cpp index 87977f891a223..f4885d956ad9b 100644 --- a/clang-tools-extra/test/clang-doc/json/method-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/method-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/_ZTV7MyClass.json struct MyClass { template T methodTemplate(T param) { diff --git a/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp index 04fcfc1dc0a85..69269989e03d4 100644 --- a/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp +++ b/clang-tools-extra/test/clang-doc/json/multiple-namespaces.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc
--output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/foo_tools.json --check-prefix=CHECK-FOO -// RUN: FileCheck %s < %t/json/bar_tools.json --check-prefix=CHECK-BAR +// RUN: FileCheck %s < %t/json/foo/tools/index.json --check-prefix=CHECK-FOO +// RUN: FileCheck %s < %t/json/bar/tools/index.json --check-prefix=CHECK-BAR namespace foo { namespace tools { diff --git a/clang-tools-extra/test/clang-doc/json/namespace.cpp b/clang-tools-extra/test/clang-doc/json/namespace.cpp index dcf83236bae28..dd7a9af9c82a0 100644 --- a/clang-tools-extra/test/clang-doc/json/namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/namespace.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/index.json +// RUN: FileCheck %s < %t/json/GlobalNamespace/index.json class MyClass {}; diff --git a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp index cf19e1e34a818..5baca7f39b783 100644 --- a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/json/nested.json --check-prefix=NESTED -// RUN: FileCheck %s < %t/json/nested_inner.json --check-prefix=INNER +// RUN: FileCheck %s < %t/json/nested/index.json --check-prefix=NESTED +// RUN: FileCheck %s < %t/json/nested/inner/index.json --check-prefix=INNER namespace nested { int Global; diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp index e29c468ecc4da..77e50b1553ad5 100644 --- a/clang-tools-extra/test/clang-doc/long-name.cpp +++ b/clang-tools-extra/test/clang-doc/long-name.cpp @@ -2,8 +2,8 @@ // UNSUPPORTED: system-windows // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=mustache --executor=standalone %s -// RUN: ls %t/json | FileCheck %s -check-prefix=CHECK-JSON -// RUN: ls %t/html | FileCheck %s -check-prefix=CHECK-HTML +// RUN: ls %t/json/GlobalNamespace | FileCheck %s -check-prefix=CHECK-JSON +// RUN: ls %t/html/GlobalNamespace | FileCheck %s -check-prefix=CHECK-HTML struct ThisStructHasANameThatResultsInAMangledNameThatIsExactly250CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheLengthIsTooLongThenClangDocWillCrashAnd12 {}; diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp index f9aad193799b3..7b98c6b7c9880 100644 --- a/clang-tools-extra/test/clang-doc/mustache-index.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --format=mustache --output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/html/index.html +// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html enum Color { RED, @@ -70,9 +70,7 @@ class Foo; // CHECK-NEXT: diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp index a73a5ab6a843b..ee8b844d1f7f1 100644 --- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --format=mustache 
--output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/html/MyNamespace.html +// RUN: FileCheck %s < %t/html/MyNamespace/index.html namespace MyNamespace { class Foo; @@ -9,9 +9,7 @@ namespace MyNamespace { // CHECK: diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). namespace std { template <> void swap(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp index 223248cb8847f..9fdbb7af90f90 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/oop58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/copy-constructor-mutates-argument.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cert-oop58-cpp %t +// RUN: %check_clang_tidy %s bugprone-copy-constructor-mutates-argument %t // Example test cases from CERT rule // https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp new file mode 100644 index 0000000000000..b05108c1e9775 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type-cpp17.cpp @@ -0,0 +1,12 @@ +// RUN: %check_clang_tidy %s -std=c++14 bugprone-default-operator-new-on-overaligned-type %t +// RUN: clang-tidy -checks='-*,bugprone-default-operator-new-on-overaligned-type' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation +// RUN: clang-tidy -checks='-*,bugprone-default-operator-new-on-overaligned-type' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation + +struct alignas(128) Vector { + char Elems[128]; +}; + +void f() { + auto *V1 = new Vector; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] + auto *V1_Arr = new Vector[2]; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp similarity index 75% rename from clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp rename to 
clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp index e0300e35183dc..379d8a2ff0f3c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/default-operator-new-on-overaligned-type.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s -std=c++14 cert-mem57-cpp %t +// RUN: %check_clang_tidy %s -std=c++14 bugprone-default-operator-new-on-overaligned-type %t namespace std { typedef __typeof(sizeof(int)) size_t; @@ -30,10 +30,10 @@ struct alignas(8) Vector4 { void f() { auto *V1 = new Vector1; - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] auto *V2 = new Vector2; auto *V3 = new Vector3; auto *V4 = new Vector4; auto *V1_Arr = new Vector1[2]; - // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [bugprone-default-operator-new-on-overaligned-type] } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp similarity index 93% rename from clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp index 34ca83795c397..7e2d586175c1b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11,c++14 %s cert-err60-cpp %t -- -- -fcxx-exceptions +// RUN: %check_clang_tidy -std=c++11,c++14 %s bugprone-exception-copy-constructor-throws %t -- -- -fcxx-exceptions // FIXME: Split off parts of this test that rely on dynamic exception // specifications, and run this test in all language modes. // FIXME: Fix the checker to work in C++17 or later mode. 
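What the renamed check enforces can be summed up in a minimal sketch with hypothetical types (not taken from the test below): a thrown object may be copied while the exception propagates, so its copy constructor should be nothrow:

.. code-block:: c++

  struct ThrowingCopy {
    ThrowingCopy() = default;
    ThrowingCopy(const ThrowingCopy &) noexcept(false) {} // copy may throw
  };

  struct SafeCopy {
    SafeCopy() = default;
    SafeCopy(const SafeCopy &) noexcept = default;
  };

  void raise(bool bad) {
    if (bad)
      throw ThrowingCopy(); // would be flagged: not nothrow copy constructible
    throw SafeCopy();       // fine: the copy constructor cannot throw
  }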
@@ -92,7 +92,7 @@ void f() { throw U(); // ok throw V(); // ok throw W(); // match, noexcept(false) - // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [cert-err60-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [bugprone-exception-copy-constructor-throws] throw X(); // match, no noexcept clause, nontrivial // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible throw Y(); // ok diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp new file mode 100644 index 0000000000000..48c9bacd1b2e5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp @@ -0,0 +1,47 @@ +// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-exception-escape %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-exception-escape.CheckDestructors: false, \ +// RUN: bugprone-exception-escape.CheckMoveMemberFunctions: false, \ +// RUN: bugprone-exception-escape.CheckMain: false, \ +// RUN: bugprone-exception-escape.CheckedSwapFunctions: '', \ +// RUN: bugprone-exception-escape.CheckNothrowFunctions: false \ +// RUN: }}" \ +// RUN: -- -fexceptions + +// CHECK-MESSAGES-NOT: warning: + +struct destructor { + ~destructor() { + throw 1; + } +}; + +struct move { + move(move&&) { throw 42; } + move& operator=(move&&) { throw 42; } +}; + +void swap(int&, int&) { + throw 1; +} + +void iter_swap(int&, int&) { + throw 1; +} + +void iter_move(int&) { + throw 1; +} + +void nothrow_func() throw() { + throw 1; +} + +void noexcept_func() noexcept { + throw 1; +} + +int main() { + throw 1; + return 0; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp index a52bbe2246d1e..140c93f5c2536 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp @@ -948,7 +948,7 @@ const auto throw_in_noexcept_lambda = [] () noexcept { throw 42; }; // CHECK-MESSAGES: :[[@LINE-1]]:39: warning: an exception may be thrown in function 'operator()' which should not throw exceptions // CHECK-MESSAGES: :[[@LINE-2]]:56: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here -void thrower() { +int thrower() { throw 42; } @@ -956,3 +956,54 @@ const auto indirect_throw_in_noexcept_lambda = [] () noexcept { thrower(); }; // CHECK-MESSAGES: :[[@LINE-1]]:48: warning: an exception may be thrown in function 'operator()' which should not throw exceptions // CHECK-MESSAGES: :[[@LINE-5]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here // CHECK-MESSAGES: :[[@LINE-3]]:65: note: frame #1: function 'operator()' calls function 'thrower' here + +int f(int); +void throw_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_function_arg' which should not throw exceptions + f(false ? 
0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:17: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_function_arg' here + +int g(int, int, int); +void throw_in_last_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_last_function_arg' which should not throw exceptions + g(42, 67, false ? 0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:25: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_last_function_arg' here + +void indirect_throw_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_in_function_arg' which should not throw exceptions + f(thrower()); +} +// CHECK-MESSAGES: :[[@LINE-26]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here +// CHECK-MESSAGES: :[[@LINE-3]]:5: note: frame #1: function 'indirect_throw_in_function_arg' calls function 'thrower' here + +void indirect_throw_from_lambda_in_function_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_from_lambda_in_function_arg' which should not throw exceptions + f([] { throw 1; return 0; }()); +} +// CHECK-MESSAGES: :[[@LINE-2]]:10: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here +// CHECK-MESSAGES: :[[@LINE-3]]:30: note: frame #1: function 'indirect_throw_from_lambda_in_function_arg' calls function 'operator()' here + +struct S { + S(int) noexcept {} +}; + +void throw_in_constructor_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_constructor_arg' which should not throw exceptions + S s(false ? 0 : throw 1); +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: note: frame #0: unhandled exception of type 'int' may be thrown in function 'throw_in_constructor_arg' here + +void indirect_throw_in_constructor_arg() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'indirect_throw_in_constructor_arg' which should not throw exceptions + S s = thrower(); +} +// CHECK-MESSAGES: :[[@LINE-50]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here +// CHECK-MESSAGES: :[[@LINE-3]]:9: note: frame #1: function 'indirect_throw_in_constructor_arg' calls function 'thrower' here + +void weird_throw_in_call_subexpression() noexcept { +// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'weird_throw_in_call_subexpression' which should not throw exceptions + (false ? 
[]{} : throw 1)(); +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: note: frame #0: unhandled exception of type 'int' may be thrown in function 'weird_throw_in_call_subexpression' here diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c similarity index 67% rename from clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c index b9985887a81c5..77812f01eace5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cert-flp30-c %t +// RUN: %check_clang_tidy %s bugprone-float-loop-counter %t float g(void); int c(float); @@ -7,16 +7,16 @@ float f = 1.0f; void match(void) { for (float x = 0.1f; x <= 1.0f; x += 0.1f) {} - // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (; f > 0; --f) {} - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (float x = 0.0f; c(x); x = g()) {} - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] for (int i=0; i < 10 && f < 2.0f; f++, i++) {} - // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [cert-flp30-c] + // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter] // CHECK-MESSAGES: :5:1: note: floating-point type loop induction variable } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c similarity index 66% rename from clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c index 0a1d79b4d916b..7f2a0685e4a2e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/msc32-c.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.c @@ -1,4 +1,5 @@ -// RUN: %check_clang_tidy %s cert-msc32-c %t -- -config="{CheckOptions: {cert-msc32-c.DisallowedSeedTypes: 'some_type,time_t'}}" -- -std=c99 +// RUN: %check_clang_tidy %s bugprone-random-generator-seed %t -- \ +// RUN: -config="{CheckOptions: {bugprone-random-generator-seed.DisallowedSeedTypes: 'some_type,time_t'}}" void srand(int seed); typedef int time_t; @@ -6,15 +7,15 @@ time_t time(time_t *t); void f(void) { srand(1); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] const int a = 1; srand(a); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number 
generator seeded with a constant value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] time_t t; srand(time(&t)); // Disallowed seed type - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc32-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] } void g(void) { diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp similarity index 77% rename from clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp index 637ba58688abe..c8818d6770799 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/msc51-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/random-generator-seed.cpp @@ -1,5 +1,5 @@ -// RUN: %check_clang_tidy %s cert-msc51-cpp %t -- \ -// RUN: -config="{CheckOptions: {cert-msc51-cpp.DisallowedSeedTypes: 'some_type,time_t'}}" +// RUN: %check_clang_tidy %s bugprone-random-generator-seed %t -- \ +// RUN: -config="{CheckOptions: {bugprone-random-generator-seed.DisallowedSeedTypes: 'some_type,time_t'}}" namespace std { @@ -71,114 +71,114 @@ void f() { time_t t; std::srand(0); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::srand(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::srand(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] // One instantiation for every engine std::default_random_engine engine1; - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::default_random_engine engine2(1); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values 
[bugprone-random-generator-seed] std::default_random_engine engine3(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::default_random_engine engine4(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine1.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine5; - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine6(1); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine7(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::mt19937 engine8(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded 
with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine5.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine9; - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine10(1); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine11(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24_base engine12(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: 
:[[@LINE-1]]:11: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine9.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine13; - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine14(1); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine15(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::ranlux24 engine16(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] 
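  // A compliant seeding pattern, as a sketch (hypothetical code, not part of
  // this test, and assuming the real <random> header rather than the stubs
  // above): seed engines from std::random_device so the sequence is not
  // predictable:
  //
  //   std::random_device dev;     // non-deterministic seed source
  //   std::mt19937 engine(dev()); // should not be flagged: seed is unpredictable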
engine13.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine13.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine17; - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine18(1); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine19(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::independent_bits engine20(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine17.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of 
seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine21; - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine22(1); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine23(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] std::shuffle_order engine24(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a default argument will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(1); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(seed); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a constant value will generate a predictable sequence of values [bugprone-random-generator-seed] engine21.seed(time(&t)); - // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [cert-msc51-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: random number generator seeded with a disallowed source of seed value will generate a predictable sequence of values [bugprone-random-generator-seed] } struct A { diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp similarity index 93% rename from 
clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp index e34315fc98d25..41a86ff385dbf 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/oop57-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/raw-memory-call-on-non-trivial-type.cpp @@ -1,8 +1,8 @@ -// RUN: %check_clang_tidy %s cert-oop57-cpp %t -- \ +// RUN: %check_clang_tidy %s bugprone-raw-memory-call-on-non-trivial-type %t -- \ // RUN: -config='{CheckOptions: \ -// RUN: {cert-oop57-cpp.MemSetNames: mymemset, \ -// RUN: cert-oop57-cpp.MemCpyNames: mymemcpy, \ -// RUN: cert-oop57-cpp.MemCmpNames: mymemcmp}}' \ +// RUN: {bugprone-raw-memory-call-on-non-trivial-type.MemSetNames: mymemset, \ +// RUN: bugprone-raw-memory-call-on-non-trivial-type.MemCpyNames: mymemcpy, \ +// RUN: bugprone-raw-memory-call-on-non-trivial-type.MemCmpNames: mymemcmp}}' \ // RUN: -- void mymemset(void *, unsigned char, decltype(sizeof(int))); diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp new file mode 100644 index 0000000000000..f54621269f8c0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp @@ -0,0 +1,25 @@ +// RUN: %check_clang_tidy %s bugprone-unchecked-optional-access %t -- \ +// RUN: -config="{CheckOptions: \ +// RUN: {bugprone-unchecked-optional-access.IgnoreValueCalls: true}}" -- \ +// RUN: -I %S/Inputs/unchecked-optional-access + +#include "absl/types/optional.h" + +struct Foo { + void foo() const {} +}; + +void unchecked_value_access(const absl::optional &opt) { + opt.value(); // no-warning +} + +void unchecked_deref_operator_access(const absl::optional &opt) { + *opt; + // CHECK-MESSAGES: :[[@LINE-1]]:4: warning: unchecked access to optional value +} + +void unchecked_arrow_operator_access(const absl::optional &opt) { + opt->foo(); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value +} + diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp b/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp deleted 
file mode 100644 index 38ffcbd7e805d..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/mem57-cpp-cpp17.cpp +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %check_clang_tidy %s -std=c++14 cert-mem57-cpp %t -// RUN: clang-tidy -checks='-*,cert-mem57-cpp' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation -// RUN: clang-tidy -checks='-*,cert-mem57-cpp' --extra-arg=-Wno-unused-variable --warnings-as-errors='*' %s -- -std=c++17 -faligned-allocation - -struct alignas(128) Vector { - char Elems[128]; -}; - -void f() { - auto *V1 = new Vector; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] - auto *V1_Arr = new Vector[2]; // CHECK-MESSAGES: warning: allocation function returns a pointer with alignment {{[0-9]+}} but the over-aligned type being allocated requires alignment 128 [cert-mem57-cpp] -} diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp index 7ccdf705e8399..f9feb8854249b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp @@ -102,9 +102,11 @@ void f(int a, double b, const char *cpc, const void *cpv, X *pX) { // CHECK-FIXES: b1 = static_cast(b); Y *pB = (Y*)pX; - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast/const_cast/reinterpret_cast [ + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast {{.*}} + // CHECK-FIXES: Y *pB = static_cast(pX); Y &rB = (Y&)*pX; - // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast/const_cast/reinterpret_cast [ + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast {{.*}} + // CHECK-FIXES: Y &rB = static_cast(*pX); const char *pc3 = (const char*)cpv; // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: {{.*}}; use static_cast [ diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp index b57eab089c748..c4a1d8d66cdeb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/use-new-mlir-op-builder.cpp @@ -2,6 +2,7 @@ namespace mlir { class Location {}; +class Value {}; class OpBuilder { public: template @@ -28,6 +29,13 @@ struct NamedOp { static NamedOp create(OpBuilder &builder, Location location, const char* name) { return NamedOp(name); } + Value getResult() { return Value(); } +}; +struct OperandOp { + OperandOp(Value val) {} + static OperandOp create(OpBuilder &builder, Location location, Value val) { + return OperandOp(val); + } }; } // namespace mlir @@ -40,6 +48,22 @@ void g(mlir::OpBuilder &b) { b.create(b.getUnknownLoc(), "gaz"); } +class CustomBuilder : public mlir::ImplicitLocOpBuilder { +public: + mlir::NamedOp f(const char *name) { + // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: return mlir::NamedOp::create(*this, name); + return create(name); + } + + mlir::NamedOp g(const char *name) { + using mlir::NamedOp; + // CHECK-MESSAGES: :[[@LINE+2]]:12: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: return NamedOp::create(*this, name); + return create(name); + } +}; + void f() { mlir::OpBuilder builder; // CHECK-MESSAGES: 
:[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] @@ -47,15 +71,18 @@ void f() { builder.create(builder.getUnknownLoc()); using mlir::NamedOp; + using mlir::OperandOp; + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(), "baz"); builder.create(builder.getUnknownLoc(), "baz"); - // CHECK-MESSAGES: :[[@LINE+3]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] - // CHECK-FIXES: NamedOp::create(builder, builder.getUnknownLoc(), - // CHECK-FIXES: "caz"); + // CHECK-MESSAGES: :[[@LINE+4]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + // CHECK-FIXES: NamedOp::create(builder, + // CHECK-FIXES: builder.getUnknownLoc(), + // CHECK-FIXES: "caz"); builder. - create( + create ( builder.getUnknownLoc(), "caz"); @@ -66,10 +93,26 @@ void f() { mlir::ImplicitLocOpBuilder ib; // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] - // CHECK-FIXES: mlir::ModuleOp::create(ib); + // CHECK-FIXES: mlir::ModuleOp::create(ib ); ib.create( ); // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] // CHECK-FIXES: mlir::OpBuilder().create(builder.getUnknownLoc()); mlir::OpBuilder().create(builder.getUnknownLoc()); + + auto *p = &builder; + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' + // CHECK-FIXES: NamedOp::create(*p, builder.getUnknownLoc(), "eaz"); + p->create(builder.getUnknownLoc(), "eaz"); + + CustomBuilder cb; + cb.f("faz"); + cb.g("gaz"); + + // CHECK-FIXES: OperandOp::create(builder, builder.getUnknownLoc(), + // CHECK-FIXES-NEXT: NamedOp::create(builder, builder.getUnknownLoc(), "haz").getResult()); + // CHECK-MESSAGES: :[[@LINE+2]]:3: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + // CHECK-MESSAGES: :[[@LINE+2]]:5: warning: use 'OpType::create(builder, ...)' instead of 'builder.create(...)' [llvm-use-new-mlir-op-builder] + builder.create(builder.getUnknownLoc(), + builder.create(builder.getUnknownLoc(), "haz").getResult()); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp index c23c355dac1b2..ec6ddec56e1f2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp @@ -1,7 +1,8 @@ // RUN: %check_clang_tidy -std=c++20 %s misc-coroutine-hostile-raii %t \ // RUN: -config="{CheckOptions: {\ // RUN: misc-coroutine-hostile-raii.RAIITypesList: 'my::Mutex; ::my::other::Mutex', \ -// RUN: misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable' \ +// RUN: misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable', \ +// RUN: misc-coroutine-hostile-raii.AllowedCallees: 'safe::AwaitFunc; ::safe::Obj::AwaitMethod' \ // RUN: }}" namespace std { @@ -145,12 +146,18 @@ namespace safe { void await_suspend(std::coroutine_handle<>) noexcept {} void await_resume() noexcept {} }; + std::suspend_always 
AwaitFunc(); + struct Obj { + std::suspend_always AwaitMethod(); + }; } // namespace safe ReturnObject RAIISafeSuspendTest() { absl::Mutex a; co_await safe::awaitable{}; using other = safe::awaitable; co_await other{}; + co_await safe::AwaitFunc(); + co_await safe::Obj().AwaitMethod(); } // ================================================================================ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp index a8e0eb6d262e6..2ed1e939d71d4 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp @@ -35,6 +35,12 @@ template struct enable_if { typedef T type; }; + +template +struct unique_ptr { + T &operator*() const; + T *operator->() const; +}; } template @@ -144,3 +150,20 @@ int *r() { // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: 'data' should be used for accessing the data pointer instead of taking the address of the 0-th element [readability-container-data-pointer] // CHECK-FIXES: return holder.v.data(); } + +void s(std::unique_ptr> p) { + f(&(*p)[0]); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'data' should be used for accessing the data pointer instead of taking the address of the 0-th element [readability-container-data-pointer] + // CHECK-FIXES: f((*p).data()); +} + +void t(std::unique_ptr> p) { + // p has no "data" member function, so no warning + f(&(*p)[0]); +} + +template +void u(std::unique_ptr p) { + // we don't know if 'T' will always have "data" member function, so no warning + f(&(*p)[0]); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp index 926cb118c77cf..c77608c66469c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp @@ -62,3 +62,12 @@ void exceptions() { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant parentheses around expression [readability-redundant-parentheses] // CHECK-FIXES: alignof(3); } + +namespace std { + template T max(T, T); + template T min(T, T); +} // namespace std +void ignoreStdMaxMin() { + (std::max)(1,2); + (std::min)(1,2); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp index 53e079bcca40f..b8a4953161d86 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/use-concise-preprocessor-directives.cpp @@ -28,6 +28,14 @@ # elif (defined(BAR)) #endif +// CHECK-MESSAGES: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#ifdef' [readability-use-concise-preprocessor-directives] +// CHECK-FIXES: #ifdef FOO +#if(defined(FOO)) +// CHECK-MESSAGES-23: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#elifdef' [readability-use-concise-preprocessor-directives] +// CHECK-FIXES-23: #elifdef BAR +#elif(defined(BAR)) +#endif + // CHECK-MESSAGES: :[[@LINE+2]]:2: warning: preprocessor condition can be written more concisely using '#ifdef' [readability-use-concise-preprocessor-directives] // 
CHECK-FIXES: #ifdef FOO #if (defined FOO) diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: ":1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp index 07c761fcd0685..2706a5145ebfd 100644 --- a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp @@ -16,8 +16,6 @@ static std::unique_ptr getJSONGenerator() { TEST(JSONGeneratorTest, emitRecordJSON) { RecordInfo I; I.Name = "Foo"; - // FIXME: FullName is not emitted correctly. - I.FullName = ""; I.IsTypeDef = false; I.Namespace.emplace_back(EmptySID, "GlobalNamespace", InfoType::IT_namespace); I.Path = "GlobalNamespace"; @@ -64,7 +62,6 @@ TEST(JSONGeneratorTest, emitRecordJSON) { { "Access": "public", "End": true, - "FullName": "", "HasPublicFunctions": true, "HasPublicMembers": true, "InfoType": "record", @@ -115,7 +112,6 @@ TEST(JSONGeneratorTest, emitRecordJSON) { "USR": "0000000000000000000000000000000000000000" } ], - "FullName": "", "HasEnums": true, "HasPublicFunctions": true, "HasRecords": true, diff --git a/clang/AreaTeamMembers.txt b/clang/AreaTeamMembers.txt index 964d11e79f694..2928943f47533 100644 --- a/clang/AreaTeamMembers.txt +++ b/clang/AreaTeamMembers.txt @@ -13,5 +13,5 @@ rnk@google.com (email), rnk (Discourse), rnk (GitHub), rnk (Discord) Other Members ------------- Eli Friedman -efriedma@quicinc.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) +efriedma@qti.qualcomm.com> (email), efriedma-quic (Discourse), efriedma-quic (GitHub) diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst index 8fb2201aae16c..847d37d124083 100644 --- a/clang/Maintainers.rst +++ b/clang/Maintainers.rst @@ -46,7 +46,7 @@ Clang LLVM IR generation | rjmccall\@apple.com (email), rjmccall (Phabricator), rjmccall (GitHub) | Eli Friedman -| efriedma\@quicinc.com (email), efriedma (Phabricator), efriedma-quic (GitHub) +| efriedma\@qti.qualcomm.com (email), efriedma (Phabricator), efriedma-quic (GitHub) | Anton Korobeynikov | anton\@korobeynikov.info (email), asl (Phabricator), asl (GitHub) @@ -242,7 +242,7 @@ ARM EABI Compiler-Wide Topics -------------------- The following people are responsible for functionality that does not fit into -a single part of the compiler, but instead span multiple components within the +a single part of the compiler, but instead spans multiple components within the compiler. 
Attributes diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 2786add27f5e8..d352373e85c60 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -333,18 +333,18 @@ def offset(self): @property def is_in_system_header(self): """Returns true if the given source location is in a system header.""" - return conf.lib.clang_Location_isInSystemHeader(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Location_isInSystemHeader(self)) def __eq__(self, other): - return isinstance(other, SourceLocation) and conf.lib.clang_equalLocations( - self, other + return isinstance(other, SourceLocation) and bool( + conf.lib.clang_equalLocations(self, other) ) def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other: SourceLocation) -> bool: - return conf.lib.clang_isBeforeInTranslationUnit(self, other) # type: ignore [no-any-return] + return bool(conf.lib.clang_isBeforeInTranslationUnit(self, other)) def __le__(self, other: SourceLocation) -> bool: return self < other or self == other @@ -396,8 +396,8 @@ def end(self): return conf.lib.clang_getRangeEnd(self) # type: ignore [no-any-return] def __eq__(self, other): - return isinstance(other, SourceRange) and conf.lib.clang_equalRanges( - self, other + return isinstance(other, SourceRange) and bool( + conf.lib.clang_equalRanges(self, other) ) def __ne__(self, other): @@ -674,39 +674,39 @@ def get_all_kinds(): def is_declaration(self): """Test if this is a declaration kind.""" - return conf.lib.clang_isDeclaration(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isDeclaration(self)) def is_reference(self): """Test if this is a reference kind.""" - return conf.lib.clang_isReference(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isReference(self)) def is_expression(self): """Test if this is an expression kind.""" - return conf.lib.clang_isExpression(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isExpression(self)) def is_statement(self): """Test if this is a statement kind.""" - return conf.lib.clang_isStatement(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isStatement(self)) def is_attribute(self): """Test if this is an attribute kind.""" - return conf.lib.clang_isAttribute(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isAttribute(self)) def is_invalid(self): """Test if this is an invalid kind.""" - return conf.lib.clang_isInvalid(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isInvalid(self)) def is_translation_unit(self): """Test if this is a translation unit kind.""" - return conf.lib.clang_isTranslationUnit(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isTranslationUnit(self)) def is_preprocessing(self): """Test if this is a preprocessing kind.""" - return conf.lib.clang_isPreprocessing(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isPreprocessing(self)) def is_unexposed(self): """Test if this is an unexposed kind.""" - return conf.lib.clang_isUnexposed(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isUnexposed(self)) ### # Declaration Kinds @@ -1650,7 +1650,9 @@ def from_location(tu: TranslationUnit, location: SourceLocation) -> Cursor | Non # This function is not null-guarded because it is used in cursor_null_guard itself def __eq__(self, other: object) -> bool: - return isinstance(other, Cursor) and conf.lib.clang_equalCursors(self, other) + return isinstance(other, 
Cursor) and bool( + conf.lib.clang_equalCursors(self, other) + ) # Not null-guarded for consistency with __eq__ def __ne__(self, other: object) -> bool: @@ -1670,48 +1672,48 @@ def is_definition(self) -> bool: Returns true if the declaration pointed at by the cursor is also a definition of that entity. """ - return conf.lib.clang_isCursorDefinition(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isCursorDefinition(self)) @cursor_null_guard def is_const_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'const'. """ - return conf.lib.clang_CXXMethod_isConst(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isConst(self)) @cursor_null_guard def is_converting_constructor(self) -> bool: """Returns True if the cursor refers to a C++ converting constructor.""" - return conf.lib.clang_CXXConstructor_isConvertingConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isConvertingConstructor(self)) @cursor_null_guard def is_copy_constructor(self) -> bool: """Returns True if the cursor refers to a C++ copy constructor.""" - return conf.lib.clang_CXXConstructor_isCopyConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isCopyConstructor(self)) @cursor_null_guard def is_default_constructor(self) -> bool: """Returns True if the cursor refers to a C++ default constructor.""" - return conf.lib.clang_CXXConstructor_isDefaultConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isDefaultConstructor(self)) @cursor_null_guard def is_move_constructor(self) -> bool: """Returns True if the cursor refers to a C++ move constructor.""" - return conf.lib.clang_CXXConstructor_isMoveConstructor(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXConstructor_isMoveConstructor(self)) @cursor_null_guard def is_default_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= default'. """ - return conf.lib.clang_CXXMethod_isDefaulted(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isDefaulted(self)) @cursor_null_guard def is_deleted_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= delete'. """ - return conf.lib.clang_CXXMethod_isDeleted(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isDeleted(self)) @cursor_null_guard def is_copy_assignment_operator_method(self) -> bool: @@ -1737,7 +1739,7 @@ class Bar { Is not. """ - return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self)) @cursor_null_guard def is_move_assignment_operator_method(self) -> bool: @@ -1763,7 +1765,7 @@ class Bar { Is not. """ - return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self)) @cursor_null_guard def is_explicit_method(self) -> bool: @@ -1809,47 +1811,47 @@ class Foo { This method will return 0 for the constructor and 1 for the conversion function. 
""" - return conf.lib.clang_CXXMethod_isExplicit(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isExplicit(self)) @cursor_null_guard def is_mutable_field(self) -> bool: """Returns True if the cursor refers to a C++ field that is declared 'mutable'. """ - return conf.lib.clang_CXXField_isMutable(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXField_isMutable(self)) @cursor_null_guard def is_pure_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared pure virtual. """ - return conf.lib.clang_CXXMethod_isPureVirtual(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isPureVirtual(self)) @cursor_null_guard def is_static_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'static'. """ - return conf.lib.clang_CXXMethod_isStatic(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isStatic(self)) @cursor_null_guard def is_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'virtual'. """ - return conf.lib.clang_CXXMethod_isVirtual(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXMethod_isVirtual(self)) @cursor_null_guard def is_abstract_record(self) -> bool: """Returns True if the cursor refers to a C++ record declaration that has pure virtual member functions. """ - return conf.lib.clang_CXXRecord_isAbstract(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_CXXRecord_isAbstract(self)) @cursor_null_guard def is_scoped_enum(self) -> bool: """Returns True if the cursor refers to a scoped enum declaration.""" - return conf.lib.clang_EnumDecl_isScoped(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_EnumDecl_isScoped(self)) @cursor_null_guard def get_definition(self) -> Cursor | None: @@ -2322,7 +2324,7 @@ def get_base_offsetof(self, parent: Cursor) -> int: @cursor_null_guard def is_virtual_base(self) -> bool: """Returns whether the CXX_BASE_SPECIFIER pointed by this Cursor is virtual.""" - return conf.lib.clang_isVirtualBase(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isVirtualBase(self)) @cursor_null_guard def is_anonymous(self) -> bool: @@ -2335,7 +2337,7 @@ def is_anonymous(self) -> bool: """ if self.kind == CursorKind.FIELD_DECL: return self.type.get_declaration().is_anonymous() - return conf.lib.clang_Cursor_isAnonymous(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isAnonymous(self)) @cursor_null_guard def is_anonymous_record_decl(self) -> bool: @@ -2346,14 +2348,14 @@ def is_anonymous_record_decl(self) -> bool: """ if self.kind == CursorKind.FIELD_DECL: return self.type.get_declaration().is_anonymous_record_decl() - return conf.lib.clang_Cursor_isAnonymousRecordDecl(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isAnonymousRecordDecl(self)) @cursor_null_guard def is_bitfield(self) -> bool: """ Check if the field is a bitfield. 
""" - return conf.lib.clang_Cursor_isBitField(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_Cursor_isBitField(self)) @cursor_null_guard def get_bitfield_width(self) -> int: @@ -2362,6 +2364,13 @@ def get_bitfield_width(self) -> int: """ return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] + @cursor_null_guard + def is_function_inlined(self) -> bool: + """ + Check if the function is inlined. + """ + return bool(conf.lib.clang_Cursor_isFunctionInlined(self)) + @cursor_null_guard def has_attrs(self) -> bool: """ @@ -2815,7 +2824,7 @@ def is_const_qualified(self) -> bool: This does not look through typedefs that may have added "const" at a different level. """ - return conf.lib.clang_isConstQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isConstQualifiedType(self)) def is_volatile_qualified(self) -> bool: """Determine whether a Type has the "volatile" qualifier set. @@ -2823,7 +2832,7 @@ def is_volatile_qualified(self) -> bool: This does not look through typedefs that may have added "volatile" at a different level. """ - return conf.lib.clang_isVolatileQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isVolatileQualifiedType(self)) def is_restrict_qualified(self) -> bool: """Determine whether a Type has the "restrict" qualifier set. @@ -2831,13 +2840,13 @@ def is_restrict_qualified(self) -> bool: This does not look through typedefs that may have added "restrict" at a different level. """ - return conf.lib.clang_isRestrictQualifiedType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isRestrictQualifiedType(self)) def is_function_variadic(self) -> bool: """Determine whether this function Type is a variadic function type.""" assert self.kind == TypeKind.FUNCTIONPROTO - return conf.lib.clang_isFunctionTypeVariadic(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isFunctionTypeVariadic(self)) def get_address_space(self) -> int: return conf.lib.clang_getAddressSpace(self) # type: ignore [no-any-return] @@ -2847,7 +2856,7 @@ def get_typedef_name(self) -> str: def is_pod(self) -> bool: """Determine whether this Type represents plain old data (POD).""" - return conf.lib.clang_isPODType(self) # type: ignore [no-any-return] + return bool(conf.lib.clang_isPODType(self)) def get_pointee(self) -> Type: """ @@ -2981,7 +2990,7 @@ def pretty_printed(self, policy: PrintingPolicy) -> str: return _CXString.from_result(conf.lib.clang_getTypePrettyPrinted(self, policy)) def __eq__(self, other: object) -> bool: - return isinstance(other, Type) and conf.lib.clang_equalTypes(self, other) + return isinstance(other, Type) and bool(conf.lib.clang_equalTypes(self, other)) def __ne__(self, other: object) -> bool: return not self.__eq__(other) @@ -4120,22 +4129,22 @@ def set_property(self, property, value): ("clang_CXRewriter_removeText", [Rewriter, SourceRange]), ("clang_CXRewriter_replaceText", [Rewriter, SourceRange, c_interop_string]), ("clang_CXRewriter_writeMainFileToStdOut", [Rewriter]), - ("clang_CXXConstructor_isConvertingConstructor", [Cursor], bool), - ("clang_CXXConstructor_isCopyConstructor", [Cursor], bool), - ("clang_CXXConstructor_isDefaultConstructor", [Cursor], bool), - ("clang_CXXConstructor_isMoveConstructor", [Cursor], bool), - ("clang_CXXField_isMutable", [Cursor], bool), - ("clang_CXXMethod_isConst", [Cursor], bool), - ("clang_CXXMethod_isDefaulted", [Cursor], bool), - ("clang_CXXMethod_isDeleted", [Cursor], bool), - 
("clang_CXXMethod_isCopyAssignmentOperator", [Cursor], bool), - ("clang_CXXMethod_isMoveAssignmentOperator", [Cursor], bool), - ("clang_CXXMethod_isExplicit", [Cursor], bool), - ("clang_CXXMethod_isPureVirtual", [Cursor], bool), - ("clang_CXXMethod_isStatic", [Cursor], bool), - ("clang_CXXMethod_isVirtual", [Cursor], bool), - ("clang_CXXRecord_isAbstract", [Cursor], bool), - ("clang_EnumDecl_isScoped", [Cursor], bool), + ("clang_CXXConstructor_isConvertingConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isCopyConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isDefaultConstructor", [Cursor], c_uint), + ("clang_CXXConstructor_isMoveConstructor", [Cursor], c_uint), + ("clang_CXXField_isMutable", [Cursor], c_uint), + ("clang_CXXMethod_isConst", [Cursor], c_uint), + ("clang_CXXMethod_isDefaulted", [Cursor], c_uint), + ("clang_CXXMethod_isDeleted", [Cursor], c_uint), + ("clang_CXXMethod_isCopyAssignmentOperator", [Cursor], c_uint), + ("clang_CXXMethod_isMoveAssignmentOperator", [Cursor], c_uint), + ("clang_CXXMethod_isExplicit", [Cursor], c_uint), + ("clang_CXXMethod_isPureVirtual", [Cursor], c_uint), + ("clang_CXXMethod_isStatic", [Cursor], c_uint), + ("clang_CXXMethod_isVirtual", [Cursor], c_uint), + ("clang_CXXRecord_isAbstract", [Cursor], c_uint), + ("clang_EnumDecl_isScoped", [Cursor], c_uint), ("clang_defaultDiagnosticDisplayOptions", [], c_uint), ("clang_defaultSaveOptions", [TranslationUnit], c_uint), ("clang_disposeCodeCompleteResults", [CodeCompletionResults]), @@ -4146,10 +4155,10 @@ def set_property(self, property, value): ("clang_disposeString", [_CXString]), ("clang_disposeTokens", [TranslationUnit, POINTER(Token), c_uint]), ("clang_disposeTranslationUnit", [TranslationUnit]), - ("clang_equalCursors", [Cursor, Cursor], bool), - ("clang_equalLocations", [SourceLocation, SourceLocation], bool), - ("clang_equalRanges", [SourceRange, SourceRange], bool), - ("clang_equalTypes", [Type, Type], bool), + ("clang_equalCursors", [Cursor, Cursor], c_uint), + ("clang_equalLocations", [SourceLocation, SourceLocation], c_uint), + ("clang_equalRanges", [SourceRange, SourceRange], c_uint), + ("clang_equalTypes", [Type, Type], c_uint), ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString), ("clang_getAddressSpace", [Type], c_uint), ("clang_getArgType", [Type, c_uint], Type), @@ -4213,7 +4222,7 @@ def set_property(self, property, value): ("clang_getFile", [TranslationUnit, c_interop_string], c_object_p), ("clang_getFileName", [File], _CXString), ("clang_getFileTime", [File], c_uint), - ("clang_File_isEqual", [File, File], bool), + ("clang_File_isEqual", [File, File], c_int), ("clang_getIBOutletCollectionType", [Cursor], Type), ("clang_getIncludedFile", [Cursor], c_object_p), ( @@ -4262,25 +4271,25 @@ def set_property(self, property, value): ("clang_getTypePrettyPrinted", [Type, PrintingPolicy], _CXString), ("clang_getTypeSpelling", [Type], _CXString), ("clang_hashCursor", [Cursor], c_uint), - ("clang_isAttribute", [CursorKind], bool), + ("clang_isAttribute", [CursorKind], c_uint), ("clang_getFullyQualifiedName", [Type, PrintingPolicy, c_uint], _CXString), - ("clang_isConstQualifiedType", [Type], bool), - ("clang_isCursorDefinition", [Cursor], bool), - ("clang_isDeclaration", [CursorKind], bool), - ("clang_isExpression", [CursorKind], bool), - ("clang_isFileMultipleIncludeGuarded", [TranslationUnit, File], bool), - ("clang_isFunctionTypeVariadic", [Type], bool), - ("clang_isInvalid", [CursorKind], bool), - ("clang_isPODType", [Type], bool), - ("clang_isPreprocessing", 
[CursorKind], bool), - ("clang_isReference", [CursorKind], bool), - ("clang_isRestrictQualifiedType", [Type], bool), - ("clang_isStatement", [CursorKind], bool), - ("clang_isTranslationUnit", [CursorKind], bool), - ("clang_isUnexposed", [CursorKind], bool), - ("clang_isVirtualBase", [Cursor], bool), - ("clang_isVolatileQualifiedType", [Type], bool), - ("clang_isBeforeInTranslationUnit", [SourceLocation, SourceLocation], bool), + ("clang_isConstQualifiedType", [Type], c_uint), + ("clang_isCursorDefinition", [Cursor], c_uint), + ("clang_isDeclaration", [CursorKind], c_uint), + ("clang_isExpression", [CursorKind], c_uint), + ("clang_isFileMultipleIncludeGuarded", [TranslationUnit, File], c_uint), + ("clang_isFunctionTypeVariadic", [Type], c_uint), + ("clang_isInvalid", [CursorKind], c_uint), + ("clang_isPODType", [Type], c_uint), + ("clang_isPreprocessing", [CursorKind], c_uint), + ("clang_isReference", [CursorKind], c_uint), + ("clang_isRestrictQualifiedType", [Type], c_uint), + ("clang_isStatement", [CursorKind], c_uint), + ("clang_isTranslationUnit", [CursorKind], c_uint), + ("clang_isUnexposed", [CursorKind], c_uint), + ("clang_isVirtualBase", [Cursor], c_uint), + ("clang_isVolatileQualifiedType", [Type], c_uint), + ("clang_isBeforeInTranslationUnit", [SourceLocation, SourceLocation], c_uint), ( "clang_parseTranslationUnit", [Index, c_interop_string, c_void_p, c_int, c_void_p, c_int, c_int], @@ -4307,10 +4316,11 @@ def set_property(self, property, value): ("clang_Cursor_getRawCommentText", [Cursor], _CXString), ("clang_Cursor_getOffsetOfField", [Cursor], c_longlong), ("clang_Cursor_getStorageClass", [Cursor], c_int), - ("clang_Cursor_isAnonymous", [Cursor], bool), - ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool), - ("clang_Cursor_isBitField", [Cursor], bool), - ("clang_Location_isInSystemHeader", [SourceLocation], bool), + ("clang_Cursor_isAnonymous", [Cursor], c_uint), + ("clang_Cursor_isAnonymousRecordDecl", [Cursor], c_uint), + ("clang_Cursor_isBitField", [Cursor], c_uint), + ("clang_Cursor_isFunctionInlined", [Cursor], c_uint), + ("clang_Location_isInSystemHeader", [SourceLocation], c_int), ("clang_PrintingPolicy_dispose", [PrintingPolicy]), ("clang_PrintingPolicy_getProperty", [PrintingPolicy, c_int], c_uint), ("clang_PrintingPolicy_setProperty", [PrintingPolicy, c_int, c_uint]), diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index eb0d1d50601a6..7cb616a7ef148 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -784,6 +784,21 @@ def test_storage_class(self): cursor = get_cursor(tu, "reg") self.assertEqual(cursor.storage_class, StorageClass.REGISTER) + def test_function_inlined(self): + tu = get_tu( + """ +inline void f_inline(void); +void f_noninline(void); +int d_noninline; +""" + ) + cursor = get_cursor(tu, "f_inline") + self.assertEqual(cursor.is_function_inlined(), True) + cursor = get_cursor(tu, "f_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + cursor = get_cursor(tu, "d_noninline") + self.assertEqual(cursor.is_function_inlined(), False) + def test_availability(self): tu = get_tu("class A { A(A const&) = delete; };", lang="cpp") diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 3d4d71a680d96..be3d0cfa2e657 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -200,16 +200,17 @@ endforeach() if(FUCHSIA_SDK) 
set(FUCHSIA_aarch64-unknown-fuchsia_NAME arm64) + set(FUCHSIA_arm-unknown-fuchsia_NAME arm) set(FUCHSIA_i386-unknown-fuchsia_NAME x64) set(FUCHSIA_x86_64-unknown-fuchsia_NAME x64) set(FUCHSIA_riscv64-unknown-fuchsia_NAME riscv64) - foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia) + foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia) set(FUCHSIA_${target}_COMPILER_FLAGS "--target=${target} -I${FUCHSIA_SDK}/pkg/sync/include -I${FUCHSIA_SDK}/pkg/fdio/include") set(FUCHSIA_${target}_LINKER_FLAGS "-L${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/lib") set(FUCHSIA_${target}_SYSROOT "${FUCHSIA_SDK}/arch/${FUCHSIA_${target}_NAME}/sysroot") endforeach() - foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;riscv64-unknown-fuchsia) + foreach(target i386-unknown-fuchsia;x86_64-unknown-fuchsia;aarch64-unknown-fuchsia;arm-unknown-fuchsia;riscv64-unknown-fuchsia) # Set the per-target builtins options. list(APPEND BUILTIN_TARGETS "${target}") set(BUILTINS_${target}_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") diff --git a/clang/docs/AddressSanitizer.rst b/clang/docs/AddressSanitizer.rst index 21e1a3652192e..2c2131b01d361 100644 --- a/clang/docs/AddressSanitizer.rst +++ b/clang/docs/AddressSanitizer.rst @@ -159,7 +159,7 @@ eliminating this check (``-fsanitize-address-use-after-return=never``). To summarize: ``-fsanitize-address-use-after-return=`` * ``never``: Completely disables detection of UAR errors (reduces code size). - * ``runtime``: Adds the code for detection, but it can be disable via the + * ``runtime``: Adds the code for detection, but it can be disabled via the runtime environment (``ASAN_OPTIONS=detect_stack_use_after_return=0``). * ``always``: Enables detection of UAR errors in all cases. (reduces code size, but not as much as ``never``). @@ -239,7 +239,7 @@ from adding redzones around it and detecting out of bounds accesses. AddressSanitizer also supports ``__attribute__((disable_sanitizer_instrumentation))``. This attribute -works similar to ``__attribute__((no_sanitize("address")))``, but it also +works similarly to ``__attribute__((no_sanitize("address")))``, but it also prevents instrumentation performed by other sanitizers. Suppressing Errors in Recompiled Code (Ignorelist) @@ -305,7 +305,7 @@ Limitations =========== * AddressSanitizer uses more real memory than a native run. Exact overhead - depends on the allocations sizes. The smaller the allocations you make the + depends on the allocation sizes. The smaller the allocations you make the bigger the overhead is. * AddressSanitizer uses more stack memory. We have seen up to 3x increase. * On 64-bit platforms AddressSanitizer maps (but not reserves) 16+ Terabytes of diff --git a/clang/docs/BlockLanguageSpec.rst b/clang/docs/BlockLanguageSpec.rst index 3632d566838a6..0c3a000be5c88 100644 --- a/clang/docs/BlockLanguageSpec.rst +++ b/clang/docs/BlockLanguageSpec.rst @@ -279,7 +279,7 @@ copy. The net effect is that instance variables can be mutated. The :block-term:`Block_copy` operator retains all objects held in variables of automatic storage referenced within the Block expression -(or form strong references if running under garbage collection). +(or forms strong references if running under garbage collection). Object variables of ``__block`` storage type are assumed to hold normal pointers with no provision for retain and release messages. 
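Note on the ``cindex.py`` changes above: the ctypes function table now registers the result types that ``Index.h`` actually declares (``c_uint``/``c_int``) and converts to a Python ``bool`` explicitly at each call site. A minimal sketch of that pattern, assuming libclang can be loaded directly as ``libclang.so`` (the real bindings resolve the library through their ``Config`` object):

.. code-block:: python

   from ctypes import CDLL, c_uint

   # Illustrative only: the library name/path here is an assumption.
   lib = CDLL("libclang.so")

   # Register the C result type exactly as Index.h declares it (unsigned),
   # rather than asking ctypes to coerce the result for us.
   lib.clang_isDeclaration.argtypes = [c_uint]  # enum CXCursorKind
   lib.clang_isDeclaration.restype = c_uint

   def is_declaration(kind_value: int) -> bool:
       # Convert the raw unsigned result at the call site, mirroring the
       # bool(...) wrappers introduced in this patch.
       return bool(lib.clang_isDeclaration(kind_value))

The new ``is_function_inlined()`` method follows the same shape: ``clang_Cursor_isFunctionInlined`` is registered with a ``c_uint`` result, and the wrapper returns ``bool(...)``.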
diff --git a/clang/docs/BoundsSafety.rst b/clang/docs/BoundsSafety.rst index 519c7b685e60d..b0f77c38b28af 100644 --- a/clang/docs/BoundsSafety.rst +++ b/clang/docs/BoundsSafety.rst @@ -58,7 +58,7 @@ adopt, offering these properties that make it widely adoptable in practice: * It has a relatively low adoption cost. This document discusses the key designs of ``-fbounds-safety``. The document is -subject to be actively updated with a more detailed specification. +subject to active updates with a more detailed specification. Programming Model ================= @@ -574,7 +574,7 @@ When ``sizeof()`` takes a type name, the compiler doesn't apply an implicit bounds annotation on the named pointer types. This means if a bounds annotation is not specified, the evaluated pointer type is treated identically to a plain C pointer type. Therefore, ``sizeof(int*)`` remains the same with or without -``-fbounds-safety``. That said, programmers can explicitly add attribute to the +``-fbounds-safety``. That said, programmers can explicitly add attributes to the types, e.g., ``sizeof(int *__bidi_indexable)``, in which case the sizeof evaluates to the size of type ``int *__bidi_indexable`` (the value equivalent to ``3 * sizeof(int*)``). diff --git a/clang/docs/BoundsSafetyImplPlans.rst b/clang/docs/BoundsSafetyImplPlans.rst index 34276c920f31e..b374b0a0c68a4 100644 --- a/clang/docs/BoundsSafetyImplPlans.rst +++ b/clang/docs/BoundsSafetyImplPlans.rst @@ -154,7 +154,7 @@ verify its bounds safety. The implementation relies on LLVM optimizations to remove redundant run-time checks. Using this optimization strategy, if the original source code already has bounds checks, the fewer additional checks ``-fbounds-safety`` will introduce. The LLVM ``ConstraintElimination`` pass is -design to remove provable redundant checks (please check Florian Hahn’s +designed to remove provable redundant checks (please check Florian Hahn’s presentation in 2021 LLVM Dev Meeting and the implementation to learn more). In the following example, ``-fbounds-safety`` implicitly adds the redundant bounds checks that the optimizer can remove: diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index 1f06c040c96cb..9469a832adb62 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX) # Generated files gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}") gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") - gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") + gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}") # Another generated file from a different source set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 570cab262c115..94d6f0d27619f 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -197,57 +197,29 @@ the configuration (without a prefix: ``Auto``). .. _AlignAfterOpenBracket: -**AlignAfterOpenBracket** (``BracketAlignmentStyle``) :versionbadge:`clang-format 3.8` :ref:`¶ ` +**AlignAfterOpenBracket** (``Boolean``) :versionbadge:`clang-format 3.8` :ref:`¶ ` If ``true``, horizontally aligns arguments after an open bracket. 
- This applies to round brackets (parentheses), angle brackets and square - brackets. - - Possible values: - - * ``BAS_Align`` (in configuration: ``Align``) - Align parameters on the open bracket, e.g.: - - .. code-block:: c++ - - someLongFunction(argument1, - argument2); - - * ``BAS_DontAlign`` (in configuration: ``DontAlign``) - Don't align, instead use ``ContinuationIndentWidth``, e.g.: - - .. code-block:: c++ - someLongFunction(argument1, - argument2); - - * ``BAS_AlwaysBreak`` (in configuration: ``AlwaysBreak``) - Always break after an open bracket, if the parameters don't fit - on a single line, e.g.: - - .. code-block:: c++ - - someLongFunction( - argument1, argument2); - - * ``BAS_BlockIndent`` (in configuration: ``BlockIndent``) - Always break after an open bracket, if the parameters don't fit - on a single line. Closing brackets will be placed on a new line. - E.g.: - - .. code-block:: c++ - - someLongFunction( - argument1, argument2 - ) + .. code-block:: c++ + true: vs. false + someLongFunction(argument1, someLongFunction(argument1, + argument2); argument2); - .. note:: - This currently only applies to braced initializer lists (when - ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. + .. note:: + As of clang-format 22 this option is a bool with the previous + option of ``Align`` replaced with ``true``, ``DontAlign`` replaced + with ``false``, and the options of ``AlwaysBreak`` and ``BlockIndent`` + replaced with ``true`` and with setting of new style options using + ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``, + ``BreakAfterOpenBracketIf``, ``BreakBeforeCloseBracketBracedList``, + ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``. + This applies to round brackets (parentheses), angle brackets and square + brackets. .. _AlignArrayOfStructures: @@ -1701,6 +1673,16 @@ the configuration (without a prefix: ``Auto``). int abcdef; // but this isn't + * ``bool AlignPPAndNotPP`` If comments following preprocessor directive should be aligned with + comments that don't. + + .. code-block:: c++ + + true: false: + #define A // Comment vs. #define A // Comment + #define AB // Aligned #define AB // Aligned + int i; // Aligned int i; // Not aligned + .. _AllowAllArgumentsOnNextLine: @@ -2746,6 +2728,67 @@ the configuration (without a prefix: ``Auto``). @Mock DataLoad loader; +.. _BreakAfterOpenBracketBracedList: + +**BreakAfterOpenBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x {1, + 1, 2, 3} 2, 3} + +.. _BreakAfterOpenBracketFunction: + +**BreakAfterOpenBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo (a, + a , b) b) + +.. _BreakAfterOpenBracketIf: + +**BreakAfterOpenBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of an if control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr (a || + a || b) b) + +.. 
_BreakAfterOpenBracketLoop: + +**BreakAfterOpenBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a loop control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + while ( vs. while (a && + a && b) { b) { + +.. _BreakAfterOpenBracketSwitch: + +**BreakAfterOpenBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break after the left parenthesis of a switch control statement + when the expression exceeds the column limit. + + .. code-block:: c++ + + true: false: + switch ( vs. switch (a + + a + b) { b) { + .. _BreakAfterReturnType: **BreakAfterReturnType** (``ReturnTypeBreakingStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` @@ -3383,6 +3426,79 @@ the configuration (without a prefix: ``Auto``). +.. _BreakBeforeCloseBracketBracedList: + +**BreakBeforeCloseBracketBracedList** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right bracket of a braced initializer list (when + ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column + limit. The break before the right bracket is only made if there is a + break after the opening bracket. + + .. code-block:: c++ + + true: false: + vector x { vs. vector x { + 1, 2, 3 1, 2, 3} + } + +.. _BreakBeforeCloseBracketFunction: + +**BreakBeforeCloseBracketFunction** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a function (declaration, + definition, call) when the parameters exceed the column limit. + + .. code-block:: c++ + + true: false: + foo ( vs. foo ( + a , b a , b) + ) + +.. _BreakBeforeCloseBracketIf: + +**BreakBeforeCloseBracketIf** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of an if control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + if constexpr ( vs. if constexpr ( + a || b a || b ) + ) + +.. _BreakBeforeCloseBracketLoop: + +**BreakBeforeCloseBracketLoop** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a loop control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + while ( vs. while ( + a && b a && b) { + ) { + +.. _BreakBeforeCloseBracketSwitch: + +**BreakBeforeCloseBracketSwitch** (``Boolean``) :versionbadge:`clang-format 22` :ref:`¶ ` + Force break before the right parenthesis of a switch control statement + when the expression exceeds the column limit. The break before the + closing parenthesis is only made if there is a break after the opening + parenthesis. + + .. code-block:: c++ + + true: false: + switch ( vs. switch ( + a + b a + b) { + ) { + .. _BreakBeforeConceptDeclarations: **BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 12` :ref:`¶ ` diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst index 28f48fce6fe36..3637bdb848273 100644 --- a/clang/docs/ClangLinkerWrapper.rst +++ b/clang/docs/ClangLinkerWrapper.rst @@ -27,7 +27,7 @@ only for the linker wrapper will be forwarded to the wrapped linker job. .. 
code-block:: console - USAGE: clang-linker-wrapper [options] -- + USAGE: clang-linker-wrapper [options] -- OPTIONS: --cuda-path= Set the system CUDA path diff --git a/clang/docs/ClangNVLinkWrapper.rst b/clang/docs/ClangNVLinkWrapper.rst index 2acdb054572f8..28763b3891f57 100644 --- a/clang/docs/ClangNVLinkWrapper.rst +++ b/clang/docs/ClangNVLinkWrapper.rst @@ -10,7 +10,7 @@ Clang nvlink Wrapper Introduction ============ -This tools works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose +This tool works as a wrapper around the NVIDIA ``nvlink`` linker. The purpose of this wrapper is to provide an interface similar to the ``ld.lld`` linker while still relying on NVIDIA's proprietary linker to produce the final output. @@ -37,7 +37,7 @@ only for the linker wrapper will be forwarded to ``nvlink``. --arch Specify the 'sm_' name of the target architecture. --cuda-path= Set the system CUDA path --dry-run Print generated commands without running. - --feature Specify the '+ptx' freature to use for LTO. + --feature Specify the '+ptx' feature to use for LTO. -g Specify that this was a debug compile. -help-hidden Display all available options -help Display available options (--help-hidden for more) diff --git a/clang/docs/ClangPlugins.rst b/clang/docs/ClangPlugins.rst index 92e41fb5877fe..3bd9e963d48ab 100644 --- a/clang/docs/ClangPlugins.rst +++ b/clang/docs/ClangPlugins.rst @@ -150,7 +150,7 @@ passed to the plugin can. -fplugin-arg-call_super_plugin-help \ test.cpp -If your plugin name contains dashes, either rename the plugin or used the +If your plugin name contains dashes, either rename the plugin or use the cc1 command line options listed below. diff --git a/clang/docs/ClangTools.rst b/clang/docs/ClangTools.rst index 3216328bbb6a6..b53c125f5b42e 100644 --- a/clang/docs/ClangTools.rst +++ b/clang/docs/ClangTools.rst @@ -66,7 +66,7 @@ in a fast, command line interface. It can also accept flags to re-display the diagnostics in different formats with different flags, suitable for use driving an IDE or editor. Furthermore, it can be used in fixit-mode to directly apply fixit-hints offered by clang. See :doc:`HowToSetupToolingForLLVM` for -instructions on how to setup and used `clang-check`. +instructions on how to setup and use `clang-check`. ``clang-format`` ---------------- diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index a71ee4b430a6e..3b1bd4b3bda18 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -140,7 +140,7 @@ pointer goes out of scope, dead blocks are also deallocated. The lifetime of blocks is managed through 3 methods stored in the descriptor of the block: -* **CtorFn**: initializes the metadata which is store in the block, +* **CtorFn**: initializes the metadata which is stored in the block, alongside actual data. Invokes the default constructors of objects which are not trivial (``Pointer``, ``RealFP``, etc.) diff --git a/clang/docs/ControlFlowIntegrity.rst b/clang/docs/ControlFlowIntegrity.rst index baff9ab54ff26..cfe5bd836cacf 100644 --- a/clang/docs/ControlFlowIntegrity.rst +++ b/clang/docs/ControlFlowIntegrity.rst @@ -135,7 +135,7 @@ Bad Cast Checking This scheme checks that pointer casts are made to an object of the correct dynamic type; that is, the dynamic type of the object must be a derived class of the pointee type of the cast. The checks are currently only introduced -where the class being casted to is a polymorphic class. +where the class being cast to is a polymorphic class. 
Bad casts are not in themselves control flow integrity violations, but they can also create security vulnerabilities, and the implementation uses many diff --git a/clang/docs/DataFlowSanitizer.rst b/clang/docs/DataFlowSanitizer.rst index 5ff50b85dcdcf..154229f9780b1 100644 --- a/clang/docs/DataFlowSanitizer.rst +++ b/clang/docs/DataFlowSanitizer.rst @@ -243,7 +243,7 @@ labels of just ``v1`` and ``v2``. This signature is the same when origin tracking is disabled - in this case the dfsan_origin passed in it will always be 0. - The callback will be called when a tained value reach stack/registers + The callback will be called when a tainted value reaches stack/registers in the context of a function. Tainted values can reach a function: * via the arguments of the function * via the return value of a call that occurs in the function diff --git a/clang/docs/HardwareAssistedAddressSanitizerDesign.rst b/clang/docs/HardwareAssistedAddressSanitizerDesign.rst index 014d10382e725..f2e76d6faa400 100644 --- a/clang/docs/HardwareAssistedAddressSanitizerDesign.rst +++ b/clang/docs/HardwareAssistedAddressSanitizerDesign.rst @@ -15,7 +15,7 @@ Introduction tags every 8 bytes of the application memory with a 1 byte tag (using *shadow memory*), uses *redzones* to find buffer-overflows and *quarantine* to find use-after-free. -The redzones, the quarantine, and, to a less extent, the shadow, are the +The redzones, the quarantine, and, to a lesser extent, the shadow, are the sources of AddressSanitizer's memory overhead. See the `AddressSanitizer paper`_ for details. diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index eff46ab46e1ca..a849d05eb7ae9 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -667,7 +667,7 @@ Command Line Interface ---------------------- The command line interface of the Clang ``-cc1`` frontend is defined alongside -the driver options in ``clang/Driver/Options.td``. The information making up an +the driver options in ``clang/Options/Options.td``. The information making up an option definition includes its prefix and name (for example ``-std=``), form and position of the option value, help text, aliases and more. Each option may belong to a certain group and can be marked with zero or more flags. Options @@ -712,7 +712,7 @@ variable for the option value: } Next, declare the command line interface of the option in the tablegen file -``clang/include/clang/Driver/Options.td``. This is done by instantiating the +``clang/include/clang/Options/Options.td``. This is done by instantiating the ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The instance is typically created through one of the helper classes that encode the acceptable ways to specify the option value on the command line: @@ -906,7 +906,7 @@ command line: SHOULD_PARSE, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER, \ MERGER, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... @@ -925,7 +925,7 @@ command line: GENERATE_OPTION_WITH_MARSHALLING( \ Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... 
diff --git a/clang/docs/JSONCompilationDatabase.rst b/clang/docs/JSONCompilationDatabase.rst index f5432278bd4d4..936ba11b087bd 100644 --- a/clang/docs/JSONCompilationDatabase.rst +++ b/clang/docs/JSONCompilationDatabase.rst @@ -54,7 +54,7 @@ python bindings also support this (since clang 3.2); see Format ====== -A compilation database is a JSON file, which consist of an array of +A compilation database is a JSON file, which consists of an array of "command objects", where each command object specifies one way a translation unit is compiled in the project. diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 495f2ab3926ce..a3db3e5d356b3 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -385,7 +385,9 @@ Builtin Macros ``__COUNTER__`` Defined to an integer value that starts at zero and is incremented each time - the ``__COUNTER__`` macro is expanded. + the ``__COUNTER__`` macro is expanded. This is a standard feature in C2y but + is an extension in earlier language modes and in C++. This macro can only be + expanded 2147483647 times at most. ``__INCLUDE_LEVEL__`` Defined to an integral value that is the include depth of the file currently @@ -805,6 +807,8 @@ of different sizes and signs is forbidden in binary and ternary builtins. T __builtin_elementwise_exp(T x) returns the base-e exponential, e^x, of the specified value floating point types T __builtin_elementwise_exp2(T x) returns the base-2 exponential, 2^x, of the specified value floating point types T __builtin_elementwise_exp10(T x) returns the base-10 exponential, 10^x, of the specified value floating point types + T __builtin_elementwise_ldexp(T x, IntT y) returns the product of x and 2 raised to the power y. T: floating point types, + y must be an integer type matching the shape of x. IntT: integer types T __builtin_elementwise_sqrt(T x) return the square root of a floating-point number floating point types T __builtin_elementwise_roundeven(T x) round x to the nearest integer value in floating point format, floating point types @@ -1821,6 +1825,7 @@ Octal literals prefixed with ``0o`` or ``0O`` C ``_Countof`` (N3369, N3469) C2y C89 ``_Generic`` with a type operand (N3260) C2y C89, C++ ``++``/``--`` on ``_Complex`` value (N3259) C2y C89, C++ +``__COUNTER__`` (N3457) C2y C89, C++ ============================================= ================================ ============= ============= Builtin type aliases @@ -2406,6 +2411,16 @@ those modes. Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed underlying types is available in C23 and later. +Enumerations with no enumerators +-------------------------------- + +Clang provides support for Microsoft extensions to support enumerations with no enumerators. + +.. code-block:: c++ + + typedef enum empty { } A; + + Interoperability with C++11 lambdas ----------------------------------- diff --git a/clang/docs/LibASTImporter.rst b/clang/docs/LibASTImporter.rst index f5d40928d01e8..e438de6624fd7 100644 --- a/clang/docs/LibASTImporter.rst +++ b/clang/docs/LibASTImporter.rst @@ -35,12 +35,12 @@ Importing one AST node copies that node into the destination ``ASTContext``. Why do we have to copy the node? Isn't enough to insert the pointer to that node into the destination context? One reason is that the "from" context may outlive the "to" context. -Also, the Clang AST consider nodes (or certain properties of nodes) equivalent if they have the same address! 
+Also, the Clang AST considers nodes (or certain properties of nodes) equivalent if they have the same address! The import algorithm has to ensure that the structurally equivalent nodes in the different translation units are not getting duplicated in the merged AST. E.g. if we include the definition of the vector template (``#include ``) in two translation units, then their merged AST should have only one node which represents the template. Also, we have to discover *one definition rule* (ODR) violations. -For instance, if there is a class definition with the same name in both translation units, but one of the definition contains a different number of fields. +For instance, if there is a class definition with the same name in both translation units, but one of the definitions contains a different number of fields. So, we look up existing definitions, and then we check the structural equivalency on those nodes. The following pseudo-code demonstrates the basics of the import mechanism: diff --git a/clang/docs/LibASTMatchers.rst b/clang/docs/LibASTMatchers.rst index 3b9f0a66db139..0aa7923fda9aa 100644 --- a/clang/docs/LibASTMatchers.rst +++ b/clang/docs/LibASTMatchers.rst @@ -95,7 +95,7 @@ and flexibility. ``VariadicDynCastAllOfMatcher`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Those match all nodes of type *Base* if they can be dynamically casted to +Those match all nodes of type *Base* if they can be dynamically cast to *Derived*. The names of those matchers are nouns, which closely resemble *Derived*. ``VariadicDynCastAllOfMatchers`` are the backbone of the matcher hierarchy. Most often, your match expression will start with one of them, and diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index 5b2a96d00d592..ac1abb4d9f381 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -825,6 +825,20 @@

    Node Matchers

    +Matcher<Decl>fileScopeAsmDeclMatcher<FileScopeAsmDecl>... +
    Matches top level asm declarations.
    +
    +Given
    +   __asm("nop");
    +   void f() {
    +     __asm("mov al, 2");
    +   }
    +fileScopeAsmDecl()
    +  matches '__asm("nop")',
    +  but not '__asm("mov al, 2")'.
    +
    + + Matcher<Decl>friendDeclMatcher<FriendDecl>...
    Matches friend declarations.
     
    diff --git a/clang/docs/LibASTMatchersTutorial.rst b/clang/docs/LibASTMatchersTutorial.rst
    index d2883688ebfac..e901eb9481fd6 100644
    --- a/clang/docs/LibASTMatchersTutorial.rst
    +++ b/clang/docs/LibASTMatchersTutorial.rst
    @@ -209,7 +209,7 @@ and traversal matchers to get from one kind of AST node to another. For
     a complete list of AST matchers, take a look at the `AST Matcher
References <https://clang.llvm.org/docs/LibASTMatchersReference.html>`_
     
    -All matcher that are nouns describe entities in the AST and can be
    +All matchers that are nouns describe entities in the AST and can be
     bound, so that they can be referred to whenever a match is found. To do
     so, simply call the method ``bind`` on these matchers, e.g.:
     
    diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst
    index e747022b9c173..6c62bcb5f8c29 100644
    --- a/clang/docs/LibClang.rst
    +++ b/clang/docs/LibClang.rst
    @@ -38,6 +38,7 @@ Code example
     
     .. code-block:: cpp
     
    +  // main.cpp
  #include <clang-c/Index.h>
  #include <iostream>
     
    @@ -57,6 +58,22 @@ Code example
         CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit
       }
     
    +.. code-block:: cmake
    +
    +  # CMakeLists.txt
    +  cmake_minimum_required(VERSION 3.20)
    +  project(my_clang_tool VERSION 0.1.0)
    +
    +  # This will find the default system installation of Clang; if you want to
    +  # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang
    +  # to the CMake configure command, where /foobar is the build directory where
    +  # you built Clang.
    +  find_package(Clang CONFIG REQUIRED)
    +
    +  add_executable(my_clang_tool main.cpp)
    +  target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS})
    +  target_link_libraries(my_clang_tool PRIVATE libclang)
    +
     Visiting elements of an AST
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~
     The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``.
    @@ -283,6 +300,7 @@ Complete example code
     
     .. code-block:: cpp
     
    +  // main.cpp
  #include <clang-c/Index.h>
  #include <iostream>
     
    @@ -356,6 +374,21 @@ Complete example code
         );
       }
     
    +.. code-block:: cmake
    +
    +  # CMakeLists.txt
    +  cmake_minimum_required(VERSION 3.20)
    +  project(my_clang_tool VERSION 0.1.0)
    +
    +  # This will find the default system installation of Clang; if you want to
    +  # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang
    +  # to the CMake configure command, where /foobar is the build directory where
    +  # you built Clang.
    +  find_package(Clang CONFIG REQUIRED)
    +
    +  add_executable(my_clang_tool main.cpp)
    +  target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS})
    +  target_link_libraries(my_clang_tool PRIVATE libclang)
     
     .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h
     
    diff --git a/clang/docs/LibFormat.rst b/clang/docs/LibFormat.rst
    index 833f768c54a64..9450073b4841c 100644
    --- a/clang/docs/LibFormat.rst
    +++ b/clang/docs/LibFormat.rst
    @@ -3,7 +3,7 @@ LibFormat
     =========
     
     LibFormat is a library that implements automatic source code formatting based
    -on Clang. This documents describes the LibFormat interface and design as well
    +on Clang. This document describes the LibFormat interface and design as well
     as some basic style discussions.
     
     If you just want to use `clang-format` as a tool or integrated into an editor,
    diff --git a/clang/docs/MatrixTypes.rst b/clang/docs/MatrixTypes.rst
    index 32949c6c43529..b3a2c8cf53670 100644
    --- a/clang/docs/MatrixTypes.rst
    +++ b/clang/docs/MatrixTypes.rst
    @@ -53,7 +53,7 @@ type of the *typedef* becomes a matrix type with the given dimensions and an
     element type of the former underlying type.
     
     If a declaration of a *typedef-name* has a ``matrix_type`` attribute, then all
    -declaration of that *typedef-name* shall have a matrix_type attribute with the
    +declarations of that *typedef-name* shall have a matrix_type attribute with the
     same element type, number of rows, and number of columns.
     
     Standard Conversions
    diff --git a/clang/docs/MemorySanitizer.rst b/clang/docs/MemorySanitizer.rst
    index 9f0d3f13a9d62..4f581427c36af 100644
    --- a/clang/docs/MemorySanitizer.rst
    +++ b/clang/docs/MemorySanitizer.rst
    @@ -176,7 +176,7 @@ for `lifetime `_ definition.
     
     This feature can be disabled with either:
     
    -#. Pass addition Clang option ``-fno-sanitize-memory-use-after-dtor`` during
    +#. Pass additional Clang option ``-fno-sanitize-memory-use-after-dtor`` during
        compilation.
     #. Set environment variable `MSAN_OPTIONS=poison_in_dtor=0` before running
        the program.
    diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst
    index e45ee9ff9eac2..0abb85c1d6563 100644
    --- a/clang/docs/Modules.rst
    +++ b/clang/docs/Modules.rst
    @@ -115,7 +115,7 @@ Objective-C provides syntax for importing a module via an *@import declaration*,
     
       @import std;
     
    -The ``@import`` declaration above imports the entire contents of the ``std`` module (which would contain, e.g., the entire C or C++ standard library) and make its API available within the current translation unit. To import only part of a module, one may use dot syntax to specific a particular submodule, e.g.,
+The ``@import`` declaration above imports the entire contents of the ``std`` module (which would contain, e.g., the entire C or C++ standard library) and makes its API available within the current translation unit. To import only part of a module, one may use dot syntax to specify a particular submodule, e.g.,
     
     .. parsed-literal::
     
    diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
    index 10a8d095fede3..f7e6061044c6d 100644
    --- a/clang/docs/OpenMPSupport.rst
    +++ b/clang/docs/OpenMPSupport.rst
    @@ -559,7 +559,7 @@ implementation.
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
     | Clarifications to Fortran map semantics                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
    -| default clause at target construct                          | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
    +| default clause at target construct                          | :good:`done`              | :none:`unclaimed`         | https://github.com/llvm/llvm-project/pull/162910                         |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
     | ref count update use_device_{ptr, addr}                     | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
     +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
    diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
    index 8435f367029a5..88a05affebf9e 100644
    --- a/clang/docs/ReleaseNotes.rst
    +++ b/clang/docs/ReleaseNotes.rst
    @@ -69,6 +69,19 @@ Potentially Breaking Changes
       call the member ``operator delete`` instead of the expected global
       delete operator. The old behavior is retained under ``-fclang-abi-compat=21``
       flag.
+- Trailing null statements in GNU statement expressions are no longer
+  ignored by Clang; the statement expression now has type ``void``. Clang
+  previously matched GCC's behavior, which was recently clarified to be
+  incorrect.
    +
    +  .. code-block:: c++
    +
    +    // The resulting type is 'void', not 'int'
    +    void foo(void) {
    +      return ({ 1;; });
    +    }
+
+- Downstream projects that previously linked only against ``clangDriver`` may
    +  now (also) need to link against the new ``clangOptions`` library, since
    +  options-related code has been moved out of the Driver into a separate library.
     
     C/C++ Language Potentially Breaking Changes
     -------------------------------------------
    @@ -109,6 +122,7 @@ C++ Specific Potentially Breaking Changes
     
     ABI Changes in This Version
     ---------------------------
    +- Fix AArch64 argument passing for C++ empty classes with large explicitly specified alignment.
     
     AST Dumping Potentially Breaking Changes
     ----------------------------------------
    @@ -196,6 +210,11 @@ C2y Feature Support
       function or variable within an extern inline function is no longer a
       constraint per `WG14 N3622 `_.
     - Clang now supports `N3355 `_ Named Loops.
    +- Clang's implementation of ``__COUNTER__`` was updated to conform to
    +  `WG14 N3457 `_.
    +  This includes adding pedantic warnings for the feature being an extension in
    +  other language modes as well as an error when the counter is expanded more
    +  than 2147483647 times.
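+
+  A minimal sketch of the behavior described above (diagnostic wording is
+  illustrative):
+
+  .. code-block:: c++
+
+    // Each expansion yields the next integer: 0, then 1.
+    int a = __COUNTER__;
+    int b = __COUNTER__;
+    // In pre-C2y modes and in C++, -pedantic diagnoses each expansion as an
+    // extension.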
     
     C23 Feature Support
     ^^^^^^^^^^^^^^^^^^^
    @@ -204,9 +223,13 @@ C23 Feature Support
       `WG14 N2710 `_.
     - Fixed accepting as compatible unnamed tag types with the same fields within
       the same translation unit but from different types.
    +- ``-MG`` now silences the "file not found" errors with ``#embed`` when
    +  scanning for dependencies and encountering an unknown file. #GH165632
     
     Non-comprehensive list of changes in this release
     -------------------------------------------------
    +- Added ``__builtin_elementwise_ldexp``.
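+
+  A minimal usage sketch (type names are illustrative):
+
+  .. code-block:: c++
+
+    typedef float float4 __attribute__((ext_vector_type(4)));
+    typedef int int4 __attribute__((ext_vector_type(4)));
+
+    // Computes x * 2^e elementwise; e must match the shape of x.
+    float4 scale(float4 x, int4 e) {
+      return __builtin_elementwise_ldexp(x, e);
+    }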
    +
     - Added ``__builtin_elementwise_fshl`` and ``__builtin_elementwise_fshr``.
     
     - ``__builtin_elementwise_abs`` can now be used in constant expression.
    @@ -322,6 +345,9 @@ Improvements to Clang's diagnostics
     -----------------------------------
     - Diagnostics messages now refer to ``structured binding`` instead of ``decomposition``,
       to align with `P0615R0 `_ changing the term. (#GH157880)
    +- Clang now suppresses runtime behavior warnings for unreachable code in file-scope
    +  variable initializers, matching the behavior for functions. This prevents false
    +  positives for operations in unreachable branches of constant expressions.
     - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic
       diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``).
       Moved the warning for a missing (though implied) attribute on a redeclaration into this group.
    @@ -350,7 +376,7 @@ Improvements to Clang's diagnostics
       potential misaligned members get processed before they can get discarded.
       (#GH144729)
     
    -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. (#GH105647)
+- Clang now emits a diagnostic with the correct message when assigning to a const reference captured in a lambda. (#GH105647)
     
     - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of
       ``[[noreturn]]`` on lambdas before C++23 (#GH154493).
    @@ -390,6 +416,11 @@ Improvements to Clang's diagnostics
       that were previously incorrectly accepted in case of other irrelevant
       conditions are now consistently diagnosed, identical to C++ mode.
     
    +- Fix false-positive unused label diagnostic when a label is used in a named break
    +  or continue (#GH166013)
+- Clang now emits a diagnostic when the ``vector_size`` or ``ext_vector_type``
+  attributes are used with a negative size (#GH165463).
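+
+  For instance (illustrative):
+
+  .. code-block:: c++
+
+    // Now rejected: the vector size must be positive.
+    typedef int bad_vec __attribute__((vector_size(-8)));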
    +
     Improvements to Clang's time-trace
     ----------------------------------
     
    @@ -432,6 +463,8 @@ Bug Fixes in This Version
     - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898)
     - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951)
     - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953)
    +- Accept empty enumerations in MSVC-compatible C mode. (#GH114402)
    +- Fixed false-positive shadow diagnostics for lambdas in explicit object member functions. (#GH163731)
     
     Bug Fixes to Compiler Builtins
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -451,6 +484,8 @@ Bug Fixes to Attribute Support
       ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520)
     - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075)
     - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905)
    +- Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function.
    +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110)
     
     Bug Fixes to C++ Support
     ^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -465,7 +500,7 @@ Bug Fixes to C++ Support
       casts that are guaranteed to fail (#GH137518).
     - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190).
     - Fix a crash if errors "member of anonymous [...] redeclares" and
    -  "intializing multiple members of union" coincide (#GH149985).
    +  "initializing multiple members of union" coincide (#GH149985).
     - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729)
- Fix the parsing of variadic member functions when the ellipsis immediately follows a default argument. (#GH153445)
     - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be
    @@ -495,6 +530,8 @@ Bug Fixes to C++ Support
       nontrivial member when another member has an initializer. (#GH81774)
     - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092)
     - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753)
    +- Fix a crash when extracting unavailable member type from alias in template deduction. (#GH165560)
    +- Fix incorrect diagnostics for lambdas with init-captures inside braced initializers. (#GH163498)
     
     Bug Fixes to AST Handling
     ^^^^^^^^^^^^^^^^^^^^^^^^^
    @@ -542,6 +579,8 @@ X86 Support
     
     Arm and AArch64 Support
     ^^^^^^^^^^^^^^^^^^^^^^^
+- Added more intrinsics for the following AArch64 instructions:
    +  FCVTZ[US], FCVTN[US], FCVTM[US], FCVTP[US], FCVTA[US]
     
     Android Support
     ^^^^^^^^^^^^^^^
    @@ -562,6 +601,9 @@ RISC-V Support
     - Add `-march=unset` to clear any previous `-march=` value. This ISA string will
       be computed from `-mcpu` or the platform default.
     
+- `__GCC_CONSTRUCTIVE_SIZE` and `__GCC_DESTRUCTIVE_SIZE` have been changed to 64. These
+  values are unstable according to `Clang's documentation `_.
    +
     CUDA/HIP Language Changes
     ^^^^^^^^^^^^^^^^^^^^^^^^^
     
    @@ -579,6 +621,8 @@ NetBSD Support
     WebAssembly Support
     ^^^^^^^^^^^^^^^^^^^
     
+- Fixed a bug where ``__has_attribute(musttail)`` was true even when
+  WebAssembly's tail-call feature is not enabled. (#GH163256)
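+
+  A guarded-use sketch (function names are illustrative):
+
+  .. code-block:: c++
+
+    int callee(int);
+    int caller(int x) {
+    #if __has_attribute(musttail)
+      __attribute__((musttail)) return callee(x);
+    #else
+      return callee(x);
+    #endif
+    }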
    +
     AVR Support
     ^^^^^^^^^^^
     
    @@ -613,6 +657,15 @@ clang-format
       literals.
     - Add ``Leave`` suboption to ``IndentPPDirectives``.
     - Add ``AllowBreakBeforeQtProperty`` option.
+- Add ``BreakAfterOpenBracketBracedList``, ``BreakAfterOpenBracketFunction``,
+  ``BreakAfterOpenBracketIf``, ``BreakAfterOpenBracketLoop``,
+  ``BreakAfterOpenBracketSwitch``, ``BreakBeforeCloseBracketBracedList``,
    +  ``BreakBeforeCloseBracketFunction``, ``BreakBeforeCloseBracketIf``,
    +  ``BreakBeforeCloseBracketLoop``, ``BreakBeforeCloseBracketSwitch`` options.
    +- Deprecate ``AlwaysBreak`` and ``BlockIndent`` suboptions from the
    +  ``AlignAfterOpenBracket`` option, and make ``AlignAfterOpenBracket`` a
    +  ``bool`` type.
    +- Add ``AlignPPAndNotPP`` suboption to ``AlignTrailingComments``.
     
     libclang
     --------
    @@ -651,6 +704,7 @@ Sanitizers
     
     Python Binding Changes
     ----------------------
    +- Exposed ``clang_Cursor_isFunctionInlined``.
     - Exposed ``clang_getCursorLanguage`` via ``Cursor.language``.
     - Add all missing ``CursorKind``s, ``TypeKind``s and
       ``ExceptionSpecificationKind``s from ``Index.h``
    @@ -671,6 +725,7 @@ OpenMP Support
     - Added support for 'omp fuse' directive.
     - Updated parsing and semantic analysis support for ``nowait`` clause to accept
       optional argument in OpenMP >= 60.
    +- Added support for ``default`` clause on ``target`` directive.
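+
+  For example (illustrative):
+
+  .. code-block:: c++
+
+    void f() {
+      int sum = 0;
+      // With default(none), every variable referenced in the region needs an
+      // explicit data-sharing or mapping attribute.
+      #pragma omp target default(none) map(tofrom: sum)
+      { sum += 1; }
+    }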
     
     Improvements
     ^^^^^^^^^^^^
    diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
    index 46b7bb718ba08..5db6826070934 100755
    --- a/clang/docs/tools/dump_ast_matchers.py
    +++ b/clang/docs/tools/dump_ast_matchers.py
    @@ -6,11 +6,8 @@
     import collections
     import re
     import os
    +from urllib.request import urlopen
     
    -try:
    -    from urllib.request import urlopen
    -except ImportError:
    -    from urllib2 import urlopen
     
     CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
     try:
    diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h
    index e1948a552bf7e..04424086b98cf 100644
    --- a/clang/include/clang/AST/APNumericStorage.h
    +++ b/clang/include/clang/AST/APNumericStorage.h
    @@ -41,9 +41,8 @@ class APNumericStorage {
       llvm::APInt getIntValue() const {
         unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
         if (NumWords > 1)
    -      return llvm::APInt(BitWidth, NumWords, pVal);
    -    else
    -      return llvm::APInt(BitWidth, VAL);
    +      return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords));
    +    return llvm::APInt(BitWidth, VAL);
       }
       void setIntValue(const ASTContext &C, const llvm::APInt &Val);
     };
    diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
    index 0d187eb49d6ca..064a342aa0684 100644
    --- a/clang/include/clang/AST/AbstractBasicReader.h
    +++ b/clang/include/clang/AST/AbstractBasicReader.h
    @@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase {
    llvm::SmallVector<uint64_t, 4> data;
         for (uint32_t i = 0; i != numWords; ++i)
           data.push_back(asImpl().readUInt64());
    -    return llvm::APInt(bitWidth, numWords, &data[0]);
    +    return llvm::APInt(bitWidth, data);
       }
     
       llvm::FixedPointSemantics readFixedPointSemantics() {
    diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h
    index ce273c167aa22..14d7caa0e16d7 100644
    --- a/clang/include/clang/AST/Attr.h
    +++ b/clang/include/clang/AST/Attr.h
    @@ -16,6 +16,7 @@
     #include "clang/AST/ASTFwd.h"
     #include "clang/AST/AttrIterator.h"
     #include "clang/AST/Decl.h"
    +#include "clang/AST/DeclCXX.h"
     #include "clang/AST/Type.h"
     #include "clang/Basic/AttrKinds.h"
     #include "clang/Basic/AttributeCommonInfo.h"
    @@ -327,8 +328,8 @@ class ParamIdx {
       ParamIdx(unsigned Idx, const Decl *D)
           : Idx(Idx), HasThis(false), IsValid(true) {
         assert(Idx >= 1 && "Idx must be one-origin");
-    if (const auto *FD = dyn_cast<FunctionDecl>(D))
-      HasThis = FD->isCXXInstanceMember();
+    if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
+      HasThis = MethodDecl->isImplicitObjectMemberFunction();
       }
     
       /// A type into which \c ParamIdx can be serialized.
    diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
    index 406d79ebd6641..ee2321dd158d4 100644
    --- a/clang/include/clang/AST/Decl.h
    +++ b/clang/include/clang/AST/Decl.h
    @@ -2335,7 +2335,7 @@ class FunctionDecl : public DeclaratorDecl,
       }
     
       void setDefaultedOrDeletedInfo(DefaultedOrDeletedFunctionInfo *Info);
    -  DefaultedOrDeletedFunctionInfo *getDefalutedOrDeletedInfo() const;
    +  DefaultedOrDeletedFunctionInfo *getDefaultedOrDeletedInfo() const;
     
       /// Whether this function is variadic.
       bool isVariadic() const;
    diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
    index c6326a8ba506d..5519787d71f88 100644
    --- a/clang/include/clang/AST/DeclBase.h
    +++ b/clang/include/clang/AST/DeclBase.h
    @@ -2642,7 +2642,7 @@ class DeclContext {
     
       using udir_iterator_base =
           llvm::iterator_adaptor_base;
     
       struct udir_iterator : udir_iterator_base {
    diff --git a/clang/include/clang/AST/JSONNodeDumper.h b/clang/include/clang/AST/JSONNodeDumper.h
    index 427a9c51ece1b..d364795a05811 100644
    --- a/clang/include/clang/AST/JSONNodeDumper.h
    +++ b/clang/include/clang/AST/JSONNodeDumper.h
    @@ -149,7 +149,7 @@ class JSONNodeDumper
       void writeIncludeStack(PresumedLoc Loc, bool JustFirst = false);
     
       // Writes the attributes of a SourceLocation object without.
    -  void writeBareSourceLocation(SourceLocation Loc, bool IsSpelling);
    +  void writeBareSourceLocation(SourceLocation Loc);
     
       // Writes the attributes of a SourceLocation to JSON based on its presumed
       // spelling location. If the given location represents a macro invocation,
    diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
    index 4f507485968cd..3296fbf409552 100644
    --- a/clang/include/clang/AST/OpenMPClause.h
    +++ b/clang/include/clang/AST/OpenMPClause.h
    @@ -10068,6 +10068,152 @@ class OMPXDynCGroupMemClause
  Expr *getSize() const { return getStmtAs<Expr>(); }
     };
     
    +/// This represents 'dyn_groupprivate' clause in '#pragma omp target ...'
    +/// and '#pragma omp teams ...' directives.
    +///
    +/// \code
    +/// #pragma omp target [...] dyn_groupprivate(a,b: N)
    +/// \endcode
    +class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
    +  friend class OMPClauseReader;
    +
    +  /// Location of '('.
    +  SourceLocation LParenLoc;
    +
    +  /// Modifiers for 'dyn_groupprivate' clause.
    +  enum { SIMPLE, FALLBACK, NUM_MODIFIERS };
    +  unsigned Modifiers[NUM_MODIFIERS];
    +
    +  /// Locations of modifiers.
    +  SourceLocation ModifiersLoc[NUM_MODIFIERS];
    +
+  /// The size expression of the 'dyn_groupprivate' clause.
    +  Expr *Size = nullptr;
    +
    +  /// Set the first dyn_groupprivate modifier.
    +  ///
    +  /// \param M The modifier.
    +  void setDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
    +    Modifiers[SIMPLE] = M;
    +  }
    +
    +  /// Set the second dyn_groupprivate modifier.
    +  ///
    +  /// \param M The modifier.
    +  void setDynGroupprivateFallbackModifier(
    +      OpenMPDynGroupprivateClauseFallbackModifier M) {
    +    Modifiers[FALLBACK] = M;
    +  }
    +
    +  /// Set location of the first dyn_groupprivate modifier.
    +  void setDynGroupprivateModifierLoc(SourceLocation Loc) {
    +    ModifiersLoc[SIMPLE] = Loc;
    +  }
    +
    +  /// Set location of the second dyn_groupprivate modifier.
    +  void setDynGroupprivateFallbackModifierLoc(SourceLocation Loc) {
    +    ModifiersLoc[FALLBACK] = Loc;
    +  }
    +
    +  /// Sets the location of '('.
    +  ///
    +  /// \param Loc Location of '('.
    +  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
    +
    +  /// Set size.
    +  ///
    +  /// \param E Size.
    +  void setSize(Expr *E) { Size = E; }
    +
    +public:
    +  /// Build 'dyn_groupprivate' clause with a size expression \a Size.
    +  ///
    +  /// \param StartLoc Starting location of the clause.
    +  /// \param LParenLoc Location of '('.
    +  /// \param EndLoc Ending location of the clause.
    +  /// \param Size Size.
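+  /// \param HelperSize Pre-initialization statement that captures the size
+  ///        expression (may be null).
+  /// \param CaptureRegion The OpenMP region in which the size expression is
+  ///        captured.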
    +  /// \param M1 The first modifier applied to 'dyn_groupprivate' clause.
    +  /// \param M1Loc Location of the first modifier.
    +  /// \param M2 The second modifier applied to 'dyn_groupprivate' clause.
    +  /// \param M2Loc Location of the second modifier.
    +  OMPDynGroupprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
    +                           SourceLocation EndLoc, Expr *Size, Stmt *HelperSize,
    +                           OpenMPDirectiveKind CaptureRegion,
    +                           OpenMPDynGroupprivateClauseModifier M1,
    +                           SourceLocation M1Loc,
    +                           OpenMPDynGroupprivateClauseFallbackModifier M2,
    +                           SourceLocation M2Loc)
    +      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, StartLoc, EndLoc),
    +        OMPClauseWithPreInit(this), LParenLoc(LParenLoc), Size(Size) {
    +    setPreInitStmt(HelperSize, CaptureRegion);
    +    Modifiers[SIMPLE] = M1;
    +    Modifiers[FALLBACK] = M2;
    +    ModifiersLoc[SIMPLE] = M1Loc;
    +    ModifiersLoc[FALLBACK] = M2Loc;
    +  }
    +
    +  /// Build an empty clause.
    +  explicit OMPDynGroupprivateClause()
    +      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, SourceLocation(),
    +                  SourceLocation()),
    +        OMPClauseWithPreInit(this) {
    +    Modifiers[SIMPLE] = OMPC_DYN_GROUPPRIVATE_unknown;
    +    Modifiers[FALLBACK] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
    +  }
    +
    +  /// Get the first modifier of the clause.
    +  OpenMPDynGroupprivateClauseModifier getDynGroupprivateModifier() const {
+    return static_cast<OpenMPDynGroupprivateClauseModifier>(Modifiers[SIMPLE]);
    +  }
    +
    +  /// Get the second modifier of the clause.
    +  OpenMPDynGroupprivateClauseFallbackModifier
    +  getDynGroupprivateFallbackModifier() const {
+    return static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
+        Modifiers[FALLBACK]);
    +  }
    +
    +  /// Get location of '('.
    +  SourceLocation getLParenLoc() { return LParenLoc; }
    +
    +  /// Get the first modifier location.
    +  SourceLocation getDynGroupprivateModifierLoc() const {
    +    return ModifiersLoc[SIMPLE];
    +  }
    +
    +  /// Get the second modifier location.
    +  SourceLocation getDynGroupprivateFallbackModifierLoc() const {
    +    return ModifiersLoc[FALLBACK];
    +  }
    +
    +  /// Get size.
    +  Expr *getSize() { return Size; }
    +
    +  /// Get size.
    +  const Expr *getSize() const { return Size; }
    +
    +  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(&Size),
+                       reinterpret_cast<Stmt **>(&Size) + 1);
    +  }
    +
    +  const_child_range children() const {
+    auto Children = const_cast<OMPDynGroupprivateClause *>(this)->children();
    +    return const_child_range(Children.begin(), Children.end());
    +  }
    +
    +  child_range used_children() {
    +    return child_range(child_iterator(), child_iterator());
    +  }
    +  const_child_range used_children() const {
    +    return const_child_range(const_child_iterator(), const_child_iterator());
    +  }
    +
    +  static bool classof(const OMPClause *T) {
    +    return T->getClauseKind() == llvm::omp::OMPC_dyn_groupprivate;
    +  }
    +};
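+
+// Illustrative accessor usage (not part of the class; `C` is assumed to be
+// an `OMPClause *`):
+//
+//   if (const auto *DGP = dyn_cast<OMPDynGroupprivateClause>(C)) {
+//     OpenMPDynGroupprivateClauseModifier M = DGP->getDynGroupprivateModifier();
+//     const Expr *SizeExpr = DGP->getSize();
+//     (void)M; (void)SizeExpr;
+//   }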
    +
     /// This represents the 'doacross' clause for the '#pragma omp ordered'
     /// directive.
     ///
    diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
    index 8cb0a657023b4..8f427427d71ed 100644
    --- a/clang/include/clang/AST/RecursiveASTVisitor.h
    +++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4165,6 +4165,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXDynCGroupMemClause(
       return true;
     }
     
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  TRY_TO(VisitOMPClauseWithPreInit(C));
    +  TRY_TO(TraverseStmt(C->getSize()));
    +  return true;
    +}
    +
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPDoacrossClause(
         OMPDoacrossClause *C) {
    diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
    index 76942f1a84f9a..bec4066cc16eb 100644
    --- a/clang/include/clang/AST/Stmt.h
    +++ b/clang/include/clang/AST/Stmt.h
    @@ -1831,26 +1831,6 @@ class CompoundStmt final
         return const_reverse_body_iterator(body_begin());
       }
     
    -  // Get the Stmt that StmtExpr would consider to be the result of this
    -  // compound statement. This is used by StmtExpr to properly emulate the GCC
    -  // compound expression extension, which ignores trailing NullStmts when
    -  // getting the result of the expression.
    -  // i.e. ({ 5;;; })
    -  //           ^^ ignored
    -  // If we don't find something that isn't a NullStmt, just return the last
    -  // Stmt.
    -  Stmt *getStmtExprResult() {
    -    for (auto *B : llvm::reverse(body())) {
    -      if (!isa(B))
    -        return B;
    -    }
    -    return body_back();
    -  }
    -
    -  const Stmt *getStmtExprResult() const {
    -    return const_cast(this)->getStmtExprResult();
    -  }
    -
       SourceLocation getBeginLoc() const { return LBraceLoc; }
       SourceLocation getEndLoc() const { return RBraceLoc; }
     
    diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
    index 98e62de2a9bfb..bca2d8425b3f5 100644
    --- a/clang/include/clang/ASTMatchers/ASTMatchers.h
    +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -2478,6 +2478,21 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
     ///   matches '__asm("mov al, 2")'
extern const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
     
    +/// Matches top level asm declarations.
    +///
    +/// Given
    +/// \code
    +///    __asm("nop");
    +///    void f() {
    +///      __asm("mov al, 2");
    +///    }
    +/// \endcode
    +/// fileScopeAsmDecl()
    +///   matches '__asm("nop")',
    +///   but not '__asm("mov al, 2")'.
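+///
+/// It can be used with a MatchFinder like any other node matcher, e.g.
+/// (callback name is illustrative):
+/// \code
+///   Finder.addMatcher(fileScopeAsmDecl().bind("asm"), &Callback);
+/// \endcode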
+extern const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
    +    fileScopeAsmDecl;
    +
     /// Matches bool literals.
     ///
     /// Example matches true
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    index 6a90aeb01e638..b9cad5340c940 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
    @@ -16,6 +16,7 @@
     
     #include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
     #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
     #include "clang/Analysis/AnalysisDeclContext.h"
     #include "clang/Analysis/CFG.h"
     #include "llvm/ADT/SmallVector.h"
    @@ -23,6 +24,9 @@
     #include 
     
     namespace clang::lifetimes::internal {
    +
    +using FactID = utils::ID;
    +
     /// An abstract base class for a single, atomic lifetime-relevant event.
     class Fact {
     
    @@ -48,6 +52,7 @@ class Fact {
     
     private:
       Kind K;
    +  FactID ID;
     
     protected:
       Fact(Kind K) : K(K) {}
    @@ -56,6 +61,9 @@ class Fact {
       virtual ~Fact() = default;
       Kind getKind() const { return K; }
     
    +  void setID(FactID ID) { this->ID = ID; }
    +  FactID getID() const { return ID; }
    +
  template <typename T> const T *getAs() const {
         if (T::classof(this))
           return static_cast(this);
    @@ -144,6 +152,7 @@ class ReturnOfOriginFact : public Fact {
     
     class UseFact : public Fact {
       const Expr *UseExpr;
    +  OriginID OID;
       // True if this use is a write operation (e.g., left-hand side of assignment).
       // Write operations are exempted from use-after-free checks.
       bool IsWritten = false;
    @@ -151,12 +160,10 @@ class UseFact : public Fact {
     public:
       static bool classof(const Fact *F) { return F->getKind() == Kind::Use; }
     
    -  UseFact(const Expr *UseExpr) : Fact(Kind::Use), UseExpr(UseExpr) {}
    +  UseFact(const Expr *UseExpr, OriginManager &OM)
    +      : Fact(Kind::Use), UseExpr(UseExpr), OID(OM.get(*UseExpr)) {}
     
    -  OriginID getUsedOrigin(const OriginManager &OM) const {
    -    // TODO: Remove const cast and make OriginManager::get as const.
    -    return const_cast(OM).get(*UseExpr);
    -  }
    +  OriginID getUsedOrigin() const { return OID; }
       const Expr *getUseExpr() const { return UseExpr; }
       void markAsWritten() { IsWritten = true; }
       bool isWritten() const { return IsWritten; }
    @@ -184,22 +191,26 @@ class TestPointFact : public Fact {
     
     class FactManager {
     public:
    +  void init(const CFG &Cfg) {
    +    assert(BlockToFacts.empty() && "FactManager already initialized");
    +    BlockToFacts.resize(Cfg.getNumBlockIDs());
    +  }
    +
  llvm::ArrayRef<const Fact *> getFacts(const CFGBlock *B) const {
    -    auto It = BlockToFactsMap.find(B);
    -    if (It != BlockToFactsMap.end())
    -      return It->second;
    -    return {};
    +    return BlockToFacts[B->getBlockID()];
       }
     
  void addBlockFacts(const CFGBlock *B, llvm::ArrayRef<const Fact *> NewFacts) {
         if (!NewFacts.empty())
    -      BlockToFactsMap[B].assign(NewFacts.begin(), NewFacts.end());
    +      BlockToFacts[B->getBlockID()].assign(NewFacts.begin(), NewFacts.end());
       }
     
  template <typename FactType, typename... Args>
  FactType *createFact(Args &&...args) {
    void *Mem = FactAllocator.Allocate<FactType>();
-    return new (Mem) FactType(std::forward<Args>(args)...);
+    FactType *Res = new (Mem) FactType(std::forward<Args>(args)...);
    +    Res->setID(NextFactID++);
    +    return Res;
       }
     
       void dump(const CFG &Cfg, AnalysisDeclContext &AC) const;
    @@ -215,16 +226,19 @@ class FactManager {
       /// \note This is intended for testing only.
       llvm::StringMap getTestPoints() const;
     
    +  unsigned getNumFacts() const { return NextFactID.Value; }
    +
       LoanManager &getLoanMgr() { return LoanMgr; }
       const LoanManager &getLoanMgr() const { return LoanMgr; }
       OriginManager &getOriginMgr() { return OriginMgr; }
       const OriginManager &getOriginMgr() const { return OriginMgr; }
     
     private:
    +  FactID NextFactID{0};
       LoanManager LoanMgr;
       OriginManager OriginMgr;
-  llvm::DenseMap<const CFGBlock *, llvm::SmallVector<const Fact *>>
-      BlockToFactsMap;
+  /// Facts for each CFG block, indexed by block ID.
+  llvm::SmallVector<llvm::SmallVector<const Fact *>> BlockToFacts;
       llvm::BumpPtrAllocator FactAllocator;
     };
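
// Illustrative lifecycle of the block-indexed fact storage (usage sketch
// based on the members above):
//
//   FactManager FM;
//   FM.init(Cfg);                    // one slot per CFG block ID
//   FM.addBlockFacts(B, FactsForB);  // stored at B->getBlockID()
//   for (const Fact *F : FM.getFacts(B))
//     visit(F);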
     } // namespace clang::lifetimes::internal
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
    index 5e58abee2bbb3..4c8ab3f859a49 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h
@@ -43,6 +43,7 @@ class FactsGenerator : public ConstStmtVisitor<FactsGenerator> {
       void VisitUnaryOperator(const UnaryOperator *UO);
       void VisitReturnStmt(const ReturnStmt *RS);
       void VisitBinaryOperator(const BinaryOperator *BO);
    +  void VisitConditionalOperator(const ConditionalOperator *CO);
       void VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE);
       void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *FCE);
       void VisitInitListExpr(const InitListExpr *ILE);
    diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    index ba138b078b379..56b9010f41fa2 100644
    --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h
    @@ -74,6 +74,8 @@ class OriginManager {
     
       OriginID getOrCreate(const ValueDecl &D);
     
    +  unsigned getNumOrigins() const { return NextOriginID.Value; }
    +
       void dump(OriginID OID, llvm::raw_ostream &OS) const;
     
     private:
    diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    index 696c9f4a6cf5c..c547d6ce2e387 100644
    --- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    +++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
    @@ -46,6 +46,9 @@ struct UncheckedOptionalAccessModelOptions {
       /// are confident in this const accessor caching, we shouldn't need the
       /// IgnoreSmartPointerDereference option anymore.
       bool IgnoreSmartPointerDereference = false;
    +
    +  /// In generating diagnostics, ignore calls to `optional::value()`.
    +  bool IgnoreValueCalls = false;
     };
     
 using UncheckedOptionalAccessLattice = CachedConstAccessorsLattice<NoopLattice>;
    diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
    index 749f531ec9ab1..aac8c1f550cb2 100644
    --- a/clang/include/clang/Basic/Attr.td
    +++ b/clang/include/clang/Basic/Attr.td
    @@ -508,6 +508,10 @@ def TargetMicrosoftRecordLayout : TargetArch<["x86", "x86_64", "arm", "thumb",
       let CustomCode = [{ Target.hasMicrosoftRecordLayout() }];
     }
     
+def TargetMustTailAvaiable : TargetSpec {
    +  let CustomCode = [{ Target.hasMustTail() }];
    +}
    +
     def TargetELF : TargetSpec {
       let ObjectFormats = ["ELF"];
     }
@@ -1069,7 +1073,7 @@ def AVRSignal : InheritableAttr, TargetSpecificAttr<TargetAVR> {
     }
     
     def AsmLabel : InheritableAttr {
    -  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm__">];
    +  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm">, CustomKeyword<"__asm__">];
       let Args = [
           // Label specifies the mangled name for the decl.
           StringArgument<"Label">, ];
    @@ -1913,7 +1917,7 @@ def NoMerge : DeclOrStmtAttr {
                                  "functions, statements and variables">;
     }
     
    -def MustTail : StmtAttr {
+def MustTail : StmtAttr, TargetSpecificAttr<TargetMustTailAvaiable> {
       let Spellings = [Clang<"musttail">];
       let Documentation = [MustTailDocs];
       let Subjects = SubjectList<[ReturnStmt], ErrorDiag, "return statements">;
    @@ -5017,6 +5021,10 @@ def HLSLUnparsedSemantic : HLSLAnnotationAttr {
       let Documentation = [InternalOnly];
     }
     
    +def HLSLUserSemantic : HLSLSemanticAttr {
    +  let Documentation = [InternalOnly];
    +}
    +
     def HLSLSV_Position : HLSLSemanticAttr {
       let Documentation = [HLSLSV_PositionDocs];
     }
    diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
    index 2fdd041c1b46e..f1dbd8af6093a 100644
    --- a/clang/include/clang/Basic/AttrDocs.td
    +++ b/clang/include/clang/Basic/AttrDocs.td
    @@ -3450,9 +3450,9 @@ Mac, and BSD. This attribute has no effect on other targets.
     def MSABIDocs : Documentation {
       let Category = DocCatCallingConvs;
       let Content = [{
    -On non-Windows x86_64 targets, this attribute changes the calling convention of
    -a function to match the default convention used on Windows x86_64. This
    -attribute has no effect on Windows targets or non-x86_64 targets.
+On non-Windows x86_64 and AArch64 targets, this attribute changes the calling
+convention of a function to match the default convention used on Windows. This
+attribute has no effect on Windows targets, or on targets other than x86_64
+and AArch64.
       }];
     }
     
    @@ -4295,17 +4295,17 @@ used by other languages. (This prefix is also added to the standard Itanium
     C++ ABI prefix on "mangled" symbol names, so that e.g. on such targets the true
     symbol name for a C++ variable declared as ``int cppvar;`` would be
     ``__Z6cppvar``; note the two underscores.)  This prefix is *not* added to the
    -symbol names specified by the ``asm`` attribute; programmers wishing to match a
    -C symbol name must compensate for this.
    +symbol names specified by the ``__asm`` attribute; programmers wishing to match
    +a C symbol name must compensate for this.
     
     For example, consider the following C code:
     
     .. code-block:: c
     
    -  int var1 asm("altvar") = 1;  // "altvar" in symbol table.
    +  int var1 __asm("altvar") = 1;  // "altvar" in symbol table.
       int var2 = 1; // "_var2" in symbol table.
     
    -  void func1(void) asm("altfunc");
    +  void func1(void) __asm("altfunc");
       void func1(void) {} // "altfunc" in symbol table.
       void func2(void) {} // "_func2" in symbol table.
     
    diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
    index 2b400b012d6ed..d4a3e34a43c53 100644
    --- a/clang/include/clang/Basic/Builtins.td
    +++ b/clang/include/clang/Basic/Builtins.td
    @@ -1418,6 +1418,12 @@ def ElementwiseExp10 : Builtin {
       let Prototype = "void(...)";
     }
     
    +def ElementwiseLdexp : Builtin {
    +  let Spellings = ["__builtin_elementwise_ldexp"];
    +  let Attributes = [NoThrow, Const, CustomTypeChecking];
    +  let Prototype = "void(...)";
    +}
    +
     def ElementwiseFloor : Builtin {
       let Spellings = ["__builtin_elementwise_floor"];
       let Attributes = [NoThrow, Const, CustomTypeChecking];
    @@ -5235,6 +5241,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate {
       let Prototype = "T(unsigned int, T)";
     }
     
    +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> {
    +  let Spellings = ["__builtin_hlsl_elementwise_f16tof32"];
    +  let Attributes = [NoThrow, Const, CustomTypeChecking];
    +  let Prototype = "void(...)";
    +}
    +
     // Builtins for XRay.
     def XRayCustomEvent : Builtin {
       let Spellings = ["__xray_customevent"];
    diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
    index 36cb527a9c806..2b6fcb1fd479b 100644
    --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
    +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
    @@ -180,7 +180,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
     BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32, "iiQbiiIi", "")
     
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32, "ffQbiiIi", "", "atomic-fadd-rtn-insts")
    -TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "t", "atomic-buffer-global-pk-add-f16-insts")
    +TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
     
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
     TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
    diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
    index 9e877b92eac68..cb08e2107f072 100644
    --- a/clang/include/clang/Basic/BuiltinsX86.td
    +++ b/clang/include/clang/Basic/BuiltinsX86.td
    @@ -93,22 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
       }
     
     
    -  let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    -    def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
    -    def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    -    def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
    -    def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    -    def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    -    def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
    -    def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    -
    -    def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
    -    def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
    -    def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
    -    def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
    -    def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
    -  }
    -
       let Features = "sse3" in {
         foreach Op = ["addsub"] in {
           def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
    @@ -219,15 +203,6 @@ let Features = "sse2", Attributes = [NoThrow] in {
       def movnti : X86Builtin<"void(int *, int)">;
     }
     
    -let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    -  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    -  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
    -  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    -  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
    -  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
    -  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
    -}
    -
     let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
       def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
       def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
    @@ -285,12 +260,27 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
       def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
     }
     
    -let Features = "sse2",
    -    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    +let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
    +  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
    +  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
    +
    +  def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
    +  def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
    +
       def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
    -  
    +  def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
    +  def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
       def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
     
    +  def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    +  def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
    +  def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
    +
    +  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    +  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
    +  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
    +  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
    +
       def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
       def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
       def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
    @@ -304,6 +294,12 @@ let Features = "sse2",
     
       def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
       def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
    +
    +  def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
    +  def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
    +  def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
    +  def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
    +  def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
     }
     
     let Features = "sse3", Attributes = [NoThrow] in {
    @@ -315,7 +311,7 @@ let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
       def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
     }
     
    -let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
     }
     
    @@ -609,8 +605,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
     
     let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
       def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
    -  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, "
    -                              "_Vector<32, char>, _Constant int)">;
    +
       def psadbw256
           : X86Builtin<
                 "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
    @@ -634,6 +629,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
       def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
       def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
       def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
    +  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
     
       def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
       def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
    @@ -1765,75 +1761,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
       def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
     }
     
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    -  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    -  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    -  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
    -}
    -
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    -  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
    -}
    -
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
       def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
    +  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
    +  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
     }
     
    -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
    +  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
       def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
    +  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
    +  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
     }
     
    -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
    +  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
       def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
    +  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
    +  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
     }
     
    -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
     }
     
    -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
       def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
     }
     
    -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
     }
     
    -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
    +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
       def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
     }
     
    -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
    +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
       def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
     }
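
With `Constexpr` added to the vpermi2var builtins above, the two-source permutes can fold during constant evaluation. A minimal sketch, assuming this patch and an AVX512VL-enabled target (variable names are illustrative):

```c++
// Each index selects from the 16-lane table formed by (a, b): bit 3 picks the
// source vector, bits [2:0] pick the lane. Sketch only.
typedef int v8si __attribute__((vector_size(32)));
constexpr v8si a = {0, 1, 2, 3, 4, 5, 6, 7};
constexpr v8si b = {10, 11, 12, 13, 14, 15, 16, 17};
constexpr v8si idx = {8, 0, 9, 1, 10, 2, 11, 3};
constexpr v8si r = __builtin_ia32_vpermi2vard256(a, idx, b);
static_assert(r[0] == 10 && r[1] == 0 && r[7] == 3, "two-source permute");
```
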
     
    @@ -3194,31 +3163,31 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
     }
     
    -let Features = "avx512dq", Attributes = [NoThrow, Const] in {
    +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
     }
     
    -let Features = "avx512f", Attributes = [NoThrow, Const] in {
    +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestchi : X86Builtin<"int(unsigned short, unsigned short)">;
       def kortestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def kortestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def kortestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def kortestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
       def kortestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
     }
     
    -let Features = "avx512dq", Attributes = [NoThrow, Const] in {
    +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in {
       def ktestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def ktestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
       def ktestchi : X86Builtin<"int(unsigned short, unsigned short)">;
       def ktestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
       def ktestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def ktestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
       def ktestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
    @@ -3294,7 +3263,7 @@ let Features = "avx512bw", Attributes = [NoThrow, Const] in {
       def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
     }
     
    -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
    +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
     }
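
Marking the mask OR-test builtins `Constexpr` likewise makes them usable at compile time. A hedged sketch (assumes an AVX-512 target with this patch applied):

```c++
// kortestz* returns 1 iff (a | b) == 0; kortestc* returns 1 iff (a | b) has
// every mask bit set. Sketch only.
static_assert(__builtin_ia32_kortestzhi(0x0000, 0x0000) == 1, "OR is zero");
static_assert(__builtin_ia32_kortestchi(0xFF00, 0x00FF) == 1, "OR is all ones");
static_assert(__builtin_ia32_kortestzhi(0x0001, 0x0000) == 0, "OR is non-zero");
```
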
     
    diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
    index 275278c5ac089..062060e6afbbe 100644
    --- a/clang/include/clang/Basic/BuiltinsX86_64.td
    +++ b/clang/include/clang/Basic/BuiltinsX86_64.td
    @@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
       def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
     }
     
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
    -}
    -
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
    -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
    -  def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -  def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
    -}
    -
     let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
       def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
       def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
    @@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
       def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
     }
     
    -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
    -  def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    -}
    -
     let Features = "amx-fp8", Attributes = [NoThrow] in {
       def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
       def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
    @@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in {
       def tilezero : X86Builtin<"void(unsigned char)">;
     }
     
    -let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -}
    -
     let Features = "amx-movrs", Attributes = [NoThrow] in {
       def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
       def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    @@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
       def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
     }
     
    -let Features = "amx-transpose", Attributes = [NoThrow] in {
    -  def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
    -  def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
    -  def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
    -let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
    -  def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -  def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
    -}
    -
     let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
       def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
       def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
    @@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
       def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
     }
     
    -let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
    -  def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
    -}
    -
     let Features = "prefetchi", Attributes = [NoThrow, Const] in {
       def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
     }
    diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
    index 90e1f8d1eb5e9..52360b67b306c 100644
    --- a/clang/include/clang/Basic/CodeGenOptions.def
    +++ b/clang/include/clang/Basic/CodeGenOptions.def
    @@ -54,7 +54,7 @@ CODEGENOPT(SeparateNamedSections, 1, 0, Benign) ///< Set for -fseparate-named-se
     CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0, Benign) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX.
     CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr.
     CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata
    -ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none
    +ENUM_CODEGENOPT(FramePointer, FramePointerKind, 3, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,non-leaf-no-reserve,reserved,none
     
     ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible)
     
    diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
    index 5d5cf250b56b9..6c445253d518b 100644
    --- a/clang/include/clang/Basic/CodeGenOptions.h
    +++ b/clang/include/clang/Basic/CodeGenOptions.h
    @@ -155,10 +155,13 @@ class CodeGenOptions : public CodeGenOptionsBase {
       std::string BinutilsVersion;
     
       enum class FramePointerKind {
    -    None,     // Omit all frame pointers.
    -    Reserved, // Maintain valid frame pointer chain.
    -    NonLeaf,  // Keep non-leaf frame pointers.
    -    All,      // Keep all frame pointers.
    +    NonLeafNoReserve, // Keep non-leaf frame pointers, allow the FP to be used
    +                      // as a GPR in leaf functions.
    +    None,             // Omit all frame pointers.
    +    Reserved,         // Maintain valid frame pointer chain.
    +    NonLeaf, // Keep non-leaf frame pointers, don't allow the FP to be used as a
    +             // GPR in leaf functions.
    +    All,     // Keep all frame pointers.
       };
     
       static StringRef getFramePointerKindName(FramePointerKind Kind) {
    @@ -167,6 +170,8 @@ class CodeGenOptions : public CodeGenOptionsBase {
           return "none";
         case FramePointerKind::Reserved:
           return "reserved";
    +    case FramePointerKind::NonLeafNoReserve:
    +      return "non-leaf-no-reserve";
         case FramePointerKind::NonLeaf:
           return "non-leaf";
         case FramePointerKind::All:
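
Read together, the five kinds differ mainly in how leaf functions are treated. An interpretive sketch of the comments above (the helper names are illustrative, not part of the patch):

```c++
#include "clang/Basic/CodeGenOptions.h"

using FPK = clang::CodeGenOptions::FramePointerKind;

// Does this kind keep a frame pointer in leaf functions?
constexpr bool keepsFPInLeaf(FPK K) { return K == FPK::All; }

// May the frame-pointer register be reused as a GPR in leaf functions?
constexpr bool fpUsableAsGPRInLeaf(FPK K) {
  return K == FPK::None || K == FPK::NonLeafNoReserve;
}
```
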
    diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def
    index a768b12fa4e0d..ea3636ffa1af1 100644
    --- a/clang/include/clang/Basic/DebugOptions.def
    +++ b/clang/include/clang/Basic/DebugOptions.def
    @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2,
     DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm
                                                  ///< is enabled.
     
    +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled.
    +
     DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain
                                                ///< inline line tables.
     
    diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
    index 83980e3ac35b7..98e08c2faa59e 100644
    --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
    @@ -207,6 +207,9 @@ def err_drv_amdgpu_ieee_without_no_honor_nans : Error<
       "invalid argument '-mno-amdgpu-ieee' only allowed with relaxed NaN handling">;
     def err_drv_argument_not_allowed_with : Error<
       "invalid argument '%0' not allowed with '%1'">;
    +def warn_drv_argument_not_allowed_with : Warning<
    +  "invalid argument '%0' not allowed with '%1'">,
    +  InGroup;
     def err_drv_cannot_open_randomize_layout_seed_file : Error<
       "cannot read randomize layout seed file '%0'">;
     def err_drv_invalid_version_number : Error<
    @@ -312,6 +315,8 @@ def warn_drv_yc_multiple_inputs_clang_cl : Warning<
     def warn_drv_potentially_misspelled_joined_argument : Warning<
       "joined argument treated as '%0'; did you mean '%1'?">, InGroup;
     
+def err_drv_too_many_actions : Error<
+    "only one action option is allowed; got %0">;
     def err_drv_invalid_value : Error<"invalid value '%1' in '%0'">;
     def err_drv_invalid_int_value : Error<"invalid integral value '%1' in '%0'">;
     def err_drv_invalid_value_with_suggestion : Error<
    diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
    index 8aa3489a2a62b..1e0321de3f4b6 100644
    --- a/clang/include/clang/Basic/DiagnosticGroups.td
    +++ b/clang/include/clang/Basic/DiagnosticGroups.td
    @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">;
     def MicrosoftInclude : DiagGroup<"microsoft-include">;
     def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">;
     def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">;
    +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">;
     def MicrosoftSealed : DiagGroup<"microsoft-sealed">;
     def MicrosoftAbstract : DiagGroup<"microsoft-abstract">;
     def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">;
    @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft",
          MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag,
          MicrosoftCommentPaste, MicrosoftEndOfFile,
          MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined,
    -     MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>;
    +     MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction,
    +     MicrosoftEmptyEnum]>;
     
     def ClangClPch : DiagGroup<"clang-cl-pch">;
     
    diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
    index c7fe6e1db6d1f..417187222e448 100644
    --- a/clang/include/clang/Basic/DiagnosticLexKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
    @@ -90,6 +90,14 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">;
     
     def err_conflict_marker : Error<"version control conflict marker in file">;
     
    +def err_counter_overflow : Error<
    +  "'__COUNTER__' value cannot exceed 2'147'483'647">;
    +def ext_counter : Extension<
    +  "'__COUNTER__' is a C2y extension">, InGroup;
    +def warn_counter : Warning<
    +  "'__COUNTER__' is incompatible with standards before C2y">,
+  InGroup<CPre2yCompat>, DefaultIgnore;
    +
     def err_raw_delim_too_long : Error<
       "raw string delimiter longer than 16 characters"
       "; use PREFIX( )PREFIX to delimit raw string">;
    diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
    index e5e071f43fa75..aa0ccb0c05101 100644
    --- a/clang/include/clang/Basic/DiagnosticParseKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
    @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error<
     def ext_ms_c_enum_fixed_underlying_type : Extension<
       "enumeration types with a fixed underlying type are a Microsoft extension">,
   InGroup<MicrosoftFixedEnum>;
    +def ext_ms_c_empty_enum_type : Extension<
    +  "empty enumeration types are a Microsoft extension">,
+  InGroup<MicrosoftEmptyEnum>;
     def ext_c23_enum_fixed_underlying_type : Extension<
       "enumeration types with a fixed underlying type are a C23 extension">,
   InGroup<C23>;
    diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
    index 4e369be0bbb92..3e864475f22a1 100644
    --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
    +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
    @@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error<
       "init methods must return an object pointer type, not %0">;
     def err_attribute_invalid_size : Error<
       "vector size not an integral multiple of component size">;
    +def err_attribute_vec_negative_size
    +    : Error<"vector must have non-negative size">;
     def err_attribute_zero_size : Error<"zero %0 size">;
     def err_attribute_size_too_large : Error<"%0 size too large">;
     def err_typecheck_sve_rvv_ambiguous : Error<
    @@ -8114,6 +8116,17 @@ def ext_gnu_ptr_func_arith : Extension<
       "arithmetic on%select{ a|}0 pointer%select{|s}0 to%select{ the|}2 function "
       "type%select{|s}2 %1%select{| and %3}2 is a GNU extension">,
   InGroup<PointerArith>;
    +def ext_gnu_counted_by_void_ptr
    +    : Extension<
    +          "'%select{counted_by|sized_by|counted_by_or_null|sized_by_or_null}0' "
    +          "on a pointer to void is a GNU extension, treated as "
    +          "'%select{sized_by|sized_by|sized_by_or_null|sized_by_or_null}0'">,
    +      InGroup;
    +def note_gnu_counted_by_void_ptr_use_sized_by
    +    : Note<"use "
    +           "'%select{__sized_by|__sized_by|__sized_by_or_null|__sized_by_or_"
    +           "null}0' "
    +           "to suppress this warning">;
     def err_readonly_message_assignment : Error<
       "assigning to 'readonly' return result of an Objective-C message not allowed">;
     def ext_c2y_increment_complex : Extension<
    @@ -12124,6 +12137,9 @@ def err_omp_unexpected_schedule_modifier : Error<
       "modifier '%0' cannot be used along with modifier '%1'">;
     def err_omp_schedule_nonmonotonic_static : Error<
       "'nonmonotonic' modifier can only be specified with 'dynamic' or 'guided' schedule kind">;
    +def err_omp_incompatible_dyn_groupprivate_modifier
    +    : Error<"modifier '%0' cannot be used along with modifier '%1' in "
    +            "dyn_groupprivate">;
     def err_omp_simple_clause_incompatible_with_ordered : Error<
       "'%0' clause with '%1' modifier cannot be specified if an 'ordered' clause is specified">;
     def err_omp_ordered_simd : Error<
    @@ -13182,6 +13198,7 @@ def err_hlsl_semantic_indexing_not_supported
         : Error<"semantic %0 does not allow indexing">;
     def err_hlsl_init_priority_unsupported : Error<
       "initializer priorities are not supported in HLSL">;
    +def err_hlsl_semantic_index_overlap : Error<"semantic index overlap %0">;
     
     def warn_hlsl_user_defined_type_missing_member: Warning<"binding type '%select{t|u|b|s|c}0' only applies to types containing %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric types}0">, InGroup;
     def err_hlsl_binding_type_mismatch: Error<"binding type '%select{t|u|b|s|c}0' only applies to %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric variables in the global scope}0">;
    diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
    index e4044bcdfcc60..b27492d19a65b 100644
    --- a/clang/include/clang/Basic/IdentifierTable.h
    +++ b/clang/include/clang/Basic/IdentifierTable.h
    @@ -46,6 +46,57 @@ class LangOptions;
     class MultiKeywordSelector;
     class SourceLocation;
     
    +/// Constants for TokenKinds.def
    +enum TokenKey : unsigned {
    +  KEYC99 = 0x1,
    +  KEYCXX = 0x2,
    +  KEYCXX11 = 0x4,
    +  KEYGNU = 0x8,
    +  KEYMS = 0x10,
    +  BOOLSUPPORT = 0x20,
    +  KEYALTIVEC = 0x40,
    +  KEYNOCXX = 0x80,
    +  KEYBORLAND = 0x100,
    +  KEYOPENCLC = 0x200,
    +  KEYC23 = 0x400,
    +  KEYNOMS18 = 0x800,
    +  KEYNOOPENCL = 0x1000,
    +  WCHARSUPPORT = 0x2000,
    +  HALFSUPPORT = 0x4000,
    +  CHAR8SUPPORT = 0x8000,
    +  KEYOBJC = 0x10000,
    +  KEYZVECTOR = 0x20000,
    +  KEYCOROUTINES = 0x40000,
    +  KEYMODULES = 0x80000,
    +  KEYCXX20 = 0x100000,
    +  KEYOPENCLCXX = 0x200000,
    +  KEYMSCOMPAT = 0x400000,
    +  KEYSYCL = 0x800000,
    +  KEYCUDA = 0x1000000,
    +  KEYZOS = 0x2000000,
    +  KEYNOZOS = 0x4000000,
    +  KEYHLSL = 0x8000000,
    +  KEYFIXEDPOINT = 0x10000000,
    +  KEYMAX = KEYFIXEDPOINT, // The maximum key
    +  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
    +  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
    +           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
    +};
    +
    +/// How a keyword is treated in the selected standard. This enum is ordered
    +/// intentionally so that the value that 'wins' is the most 'permissive'.
    +enum KeywordStatus {
    +  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
    +  KS_Disabled,  // Disabled
    +  KS_Future,    // Is a keyword in future standard
    +  KS_Extension, // Is an extension
    +  KS_Enabled,   // Enabled
    +};
    +
    +/// Translates flags as specified in TokenKinds.def into keyword status
    +/// in the given language standard.
    +KeywordStatus getKeywordStatus(const LangOptions &LangOpts, unsigned Flags);
    +
     enum class ReservedIdentifierStatus {
       NotReserved = 0,
       StartsWithUnderscoreAtGlobalScope,
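
The `KEYALL` expression relies on `KEYMAX` being the highest single key bit: `KEYMAX | (KEYMAX - 1)` sets every bit up to and including it, and the three `KEYNO*` exclusion bits are then masked off. A quick sanity sketch, assuming the enum above is in scope:

```c++
static_assert(KEYMAX == 0x10000000u, "highest single key bit");
static_assert((KEYMAX | (KEYMAX - 1)) == 0x1FFFFFFFu, "every key bit set");
static_assert((KEYALL & (KEYNOMS18 | KEYNOOPENCL | KEYNOZOS)) == 0,
              "exclusion bits cleared");
```
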
    diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
    index 8d6b8a14740ce..40fc66ea12e34 100644
    --- a/clang/include/clang/Basic/LangOptions.def
    +++ b/clang/include/clang/Basic/LangOptions.def
    @@ -216,6 +216,7 @@ LANGOPT(OpenCLGenericAddressSpace, 1, 0, NotCompatible, "OpenCL generic keyword"
     LANGOPT(OpenCLPipes              , 1, 0, NotCompatible, "OpenCL pipes language constructs and built-ins")
     LANGOPT(NativeHalfType    , 1, 0, NotCompatible, "Native half type support")
     LANGOPT(NativeHalfArgsAndReturns, 1, 0, NotCompatible, "Native half args and returns")
    +LANGOPT(NativeInt16Type   , 1, 1, NotCompatible, "Native int 16 type support")
     LANGOPT(CUDA              , 1, 0, NotCompatible, "CUDA")
     LANGOPT(HIP               , 1, 0, NotCompatible, "HIP")
     LANGOPT(OpenMP            , 32, 0, NotCompatible, "OpenMP support and version of OpenMP (31, 40 or 45)")
    @@ -453,7 +454,7 @@ LANGOPT(BranchTargetEnforcement, 1, 0, NotCompatible, "Branch-target enforcement
     LANGOPT(BranchProtectionPAuthLR, 1, 0, NotCompatible, "Use PC as a diversifier using PAuthLR NOP instructions.")
     LANGOPT(GuardedControlStack, 1, 0, NotCompatible, "Guarded control stack enabled")
     
    -LANGOPT(SpeculativeLoadHardening, 1, 0, NotCompatible, "Speculative load hardening enabled")
    +LANGOPT(SpeculativeLoadHardening, 1, 0, Benign, "Speculative load hardening enabled")
     
     LANGOPT(RelativeCXXABIVTables, 1, 0, NotCompatible,
             "Use an ABI-incompatible v-table layout that uses relative references")
    diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
    index 328a0747a82a8..166b80314f687 100644
    --- a/clang/include/clang/Basic/OpenMPKinds.def
    +++ b/clang/include/clang/Basic/OpenMPKinds.def
    @@ -86,6 +86,12 @@
     #ifndef OPENMP_GRAINSIZE_MODIFIER
     #define OPENMP_GRAINSIZE_MODIFIER(Name)
     #endif
    +#ifndef OPENMP_DYN_GROUPPRIVATE_MODIFIER
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)
    +#endif
    +#ifndef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)
    +#endif
     #ifndef OPENMP_NUMTASKS_MODIFIER
     #define OPENMP_NUMTASKS_MODIFIER(Name)
     #endif
    @@ -242,6 +248,14 @@ OPENMP_BIND_KIND(thread)
     // Modifiers for the 'grainsize' clause.
     OPENMP_GRAINSIZE_MODIFIER(strict)
     
    +// Modifiers for the 'dyn_groupprivate' clause.
    +OPENMP_DYN_GROUPPRIVATE_MODIFIER(cgroup)
    +
    +// Fallback modifiers for the 'dyn_groupprivate' clause.
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(abort)
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(null)
    +OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(default_mem)
    +
     // Modifiers for the 'num_tasks' clause.
     OPENMP_NUMTASKS_MODIFIER(strict)
     
    @@ -263,6 +277,8 @@ OPENMP_THREADSET_KIND(omp_team)
     
     #undef OPENMP_NUMTASKS_MODIFIER
     #undef OPENMP_NUMTHREADS_MODIFIER
    +#undef OPENMP_DYN_GROUPPRIVATE_MODIFIER
    +#undef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
     #undef OPENMP_GRAINSIZE_MODIFIER
     #undef OPENMP_BIND_KIND
     #undef OPENMP_ADJUST_ARGS_KIND
    diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
    index c9ddbcd6d46c1..41b2c4e41dcb8 100644
    --- a/clang/include/clang/Basic/OpenMPKinds.h
    +++ b/clang/include/clang/Basic/OpenMPKinds.h
    @@ -224,6 +224,20 @@ enum OpenMPGrainsizeClauseModifier {
       OMPC_GRAINSIZE_unknown
     };
     
    +enum OpenMPDynGroupprivateClauseModifier {
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) OMPC_DYN_GROUPPRIVATE_##Name,
    +#include "clang/Basic/OpenMPKinds.def"
    +  OMPC_DYN_GROUPPRIVATE_unknown
    +};
    +
    +enum OpenMPDynGroupprivateClauseFallbackModifier {
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown = OMPC_DYN_GROUPPRIVATE_unknown,
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name,
    +#include "clang/Basic/OpenMPKinds.def"
    +  OMPC_DYN_GROUPPRIVATE_FALLBACK_last
    +};
    +
     enum OpenMPNumTasksClauseModifier {
     #define OPENMP_NUMTASKS_MODIFIER(Name) OMPC_NUMTASKS_##Name,
     #include "clang/Basic/OpenMPKinds.def"
    diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
    index ed967fd47dc83..bc9e97863556d 100644
    --- a/clang/include/clang/Basic/SourceManager.h
    +++ b/clang/include/clang/Basic/SourceManager.h
@@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// If the location is an expansion record, walk through it until we find
       /// the final location expanded.
       FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const {
    -    FileID FID = getFileID(Loc);
    -    auto *E = getSLocEntryOrNull(FID);
    -    if (!E)
    -      return std::make_pair(FileID(), 0);
    -
    -    unsigned Offset = Loc.getOffset()-E->getOffset();
    -    if (Loc.isFileID())
    -      return std::make_pair(FID, Offset);
    -
    -    return getDecomposedExpansionLocSlowCase(E);
    +    return getDecomposedLoc(getExpansionLoc(Loc));
       }
     
       /// Decompose the specified location into a raw FileID + Offset pair.
@@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// If the location is an expansion record, walk through it until we find
       /// its spelling record.
       FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const {
    -    FileID FID = getFileID(Loc);
    -    auto *E = getSLocEntryOrNull(FID);
    -    if (!E)
    -      return std::make_pair(FileID(), 0);
    -
    -    unsigned Offset = Loc.getOffset()-E->getOffset();
    -    if (Loc.isFileID())
    -      return std::make_pair(FID, Offset);
    -    return getDecomposedSpellingLocSlowCase(E, Offset);
    +    return getDecomposedLoc(getSpellingLoc(Loc));
       }
     
       /// Returns the "included/expanded in" decomposed location of the given
@@ -1426,10 +1409,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// before calling this method.
       unsigned getColumnNumber(FileID FID, unsigned FilePos,
                                bool *Invalid = nullptr) const;
    +  unsigned getColumnNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
       unsigned getSpellingColumnNumber(SourceLocation Loc,
    -                                   bool *Invalid = nullptr) const;
    +                                   bool *Invalid = nullptr) const {
    +    return getColumnNumber(getSpellingLoc(Loc), Invalid);
    +  }
       unsigned getExpansionColumnNumber(SourceLocation Loc,
    -                                    bool *Invalid = nullptr) const;
    +                                    bool *Invalid = nullptr) const {
    +    return getColumnNumber(getExpansionLoc(Loc), Invalid);
    +  }
       unsigned getPresumedColumnNumber(SourceLocation Loc,
                                        bool *Invalid = nullptr) const;
     
@@ -1440,8 +1428,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
       /// MemoryBuffer, so this is not cheap: use only when about to emit a
       /// diagnostic.
       unsigned getLineNumber(FileID FID, unsigned FilePos, bool *Invalid = nullptr) const;
    -  unsigned getSpellingLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    -  unsigned getExpansionLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    +  unsigned getLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
    +  unsigned getSpellingLineNumber(SourceLocation Loc,
    +                                 bool *Invalid = nullptr) const {
    +    return getLineNumber(getSpellingLoc(Loc), Invalid);
    +  }
    +  unsigned getExpansionLineNumber(SourceLocation Loc,
    +                                  bool *Invalid = nullptr) const {
    +    return getLineNumber(getExpansionLoc(Loc), Invalid);
    +  }
       unsigned getPresumedLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
     
       /// Return the filename or buffer identifier of the buffer the
@@ -1979,10 +1974,6 @@ class SourceManager : public RefCountedBase<SourceManager> {
       SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const;
       SourceLocation getFileLocSlowCase(SourceLocation Loc) const;
     
    -  FileIDAndOffset
    -  getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const;
    -  FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E,
    -                                                   unsigned Offset) const;
       void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const;
       void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache,
                                              FileID FID,
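
The rewritten accessors are straight compositions of existing queries, so observable results should be unchanged. A usage sketch of the equivalence (assuming a valid `SM` and `Loc`):

```c++
#include "clang/Basic/SourceManager.h"
#include <cassert>

// The decomposed expansion location is now literally the plain decomposition
// of the expansion location; spelling locations work the same way.
void checkEquivalence(const clang::SourceManager &SM, clang::SourceLocation Loc) {
  clang::FileIDAndOffset A = SM.getDecomposedExpansionLoc(Loc);
  clang::FileIDAndOffset B = SM.getDecomposedLoc(SM.getExpansionLoc(Loc));
  assert(A == B && "delegation preserves the result");
}
```
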
    diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
    index ea73ed915bf03..39af84c8d0872 100644
    --- a/clang/include/clang/Basic/TargetInfo.h
    +++ b/clang/include/clang/Basic/TargetInfo.h
    @@ -229,6 +229,7 @@ class TargetInfo : public TransferrableTargetInfo,
     protected:
       // Target values set by the ctor of the actual target implementation.  Default
       // values are specified by the TargetInfo constructor.
    +  bool HasMustTail;
       bool BigEndian;
       bool TLSSupported;
       bool VLASupported;
    @@ -669,6 +670,8 @@ class TargetInfo : public TransferrableTargetInfo,
                                            : getLongFractScale() + 1;
       }
     
    +  virtual bool hasMustTail() const { return HasMustTail; }
    +
       /// Determine whether the __int128 type is supported on this target.
       virtual bool hasInt128Type() const {
         return (getPointerWidth(LangAS::Default) >= 64) ||
    diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
    index ef196103035e8..e91d7ce975d31 100644
    --- a/clang/include/clang/Basic/arm_neon.td
    +++ b/clang/include/clang/Basic/arm_neon.td
    @@ -1466,26 +1466,51 @@ def SCALAR_UCVTFD : SInst<"vcvt_f64", "(1F)(1!)", "SUl">;
     ////////////////////////////////////////////////////////////////////////////////
     // Scalar Floating-point Converts
     def SCALAR_FCVTXN  : IInst<"vcvtx_f32", "(1F<)(1!)", "Sd">;
    -def SCALAR_FCVTNSS : SInst<"vcvtn_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTNUS : SInst<"vcvtn_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTNSD : SInst<"vcvtn_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTNUD : SInst<"vcvtn_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTMSS : SInst<"vcvtm_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTMUS : SInst<"vcvtm_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTMSD : SInst<"vcvtm_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTMUD : SInst<"vcvtm_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTASS : SInst<"vcvta_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTAUS : SInst<"vcvta_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTASD : SInst<"vcvta_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTAUD : SInst<"vcvta_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTPSS : SInst<"vcvtp_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTPUS : SInst<"vcvtp_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTPSD : SInst<"vcvtp_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTPUD : SInst<"vcvtp_u64", "(1U)1", "Sd">;
    -def SCALAR_FCVTZSS : SInst<"vcvt_s32", "(1S)1", "Sf">;
    -def SCALAR_FCVTZUS : SInst<"vcvt_u32", "(1U)1", "Sf">;
    -def SCALAR_FCVTZSD : SInst<"vcvt_s64", "(1S)1", "Sd">;
    -def SCALAR_FCVTZUD : SInst<"vcvt_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTN_F32toSS  : SInst<"vcvtn_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTN_F32toUS  : SInst<"vcvtn_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTN_F64toSS  : SInst<"vcvtn_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTN_F64toUS  : SInst<"vcvtn_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTN_F32toSD  : SInst<"vcvtn_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTN_F32toUD  : SInst<"vcvtn_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTN_F64toSD  : SInst<"vcvtn_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTN_F64toUD  : SInst<"vcvtn_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTM_F32toSS  : SInst<"vcvtm_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTM_F32toUS  : SInst<"vcvtm_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTM_F64toSS  : SInst<"vcvtm_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTM_F64toUS  : SInst<"vcvtm_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTM_F32toSD  : SInst<"vcvtm_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTM_F32toUD  : SInst<"vcvtm_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTM_F64toSD  : SInst<"vcvtm_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTM_F64toUD  : SInst<"vcvtm_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTA_F32toSS  : SInst<"vcvta_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTA_F32toUS  : SInst<"vcvta_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTA_F64toSS  : SInst<"vcvta_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTA_F64toUS  : SInst<"vcvta_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTA_F32toSD  : SInst<"vcvta_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTA_F32toUD  : SInst<"vcvta_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTA_F64toSD  : SInst<"vcvta_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTA_F64toUD  : SInst<"vcvta_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTP_F32toSS  : SInst<"vcvtp_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTP_F32toUS  : SInst<"vcvtp_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTP_F64toSS  : SInst<"vcvtp_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTP_F64toUS  : SInst<"vcvtp_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTP_F32toSD  : SInst<"vcvtp_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTP_F32toUD  : SInst<"vcvtp_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTP_F64toSD  : SInst<"vcvtp_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTP_F64toUD  : SInst<"vcvtp_u64", "(1U)1", "Sd">;
    +
    +def SCALAR_FCVTZ_F32toSS  : SInst<"vcvt_s32", "(1S)1", "Sf">;
    +def SCALAR_FCVTZ_F32toUS  : SInst<"vcvt_u32", "(1U)1", "Sf">;
    +def SCALAR_FCVTZ_F64toSS  : SInst<"vcvt_s32", "(1S<)1", "Sd">;
    +def SCALAR_FCVTZ_F64toUS  : SInst<"vcvt_u32", "(1U<)1", "Sd">;
    +def SCALAR_FCVTZ_F32toSD  : SInst<"vcvt_s64", "(1S>)1", "Sf">;
    +def SCALAR_FCVTZ_F32toUD  : SInst<"vcvt_u64", "(1U>)1", "Sf">;
    +def SCALAR_FCVTZ_F64toSD  : SInst<"vcvt_s64", "(1S)1", "Sd">;
    +def SCALAR_FCVTZ_F64toUD  : SInst<"vcvt_u64", "(1U)1", "Sd">;
     
     ////////////////////////////////////////////////////////////////////////////////
     // Scalar Floating-point Reciprocal Estimate
    @@ -1896,6 +1921,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
       def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
     }
     
    +let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
    +  def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
    +}
    +
    +let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
    +  def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
    +}
    +
     let TargetGuard = "i8mm,neon" in {
       def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
    diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td
    index 89e644a078682..0371279aafc08 100644
    --- a/clang/include/clang/Basic/riscv_sifive_vector.td
    +++ b/clang/include/clang/Basic/riscv_sifive_vector.td
@@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> {
     defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "i", suffixes_prototypes>;
     }
     
+multiclass RVVVFEXPBuiltinSet<list<list<string>> suffixes_prototypes, string type_range> {
    +  let UnMaskedPolicyScheme = HasPassthruOperand,
    +      OverloadedName = NAME,
    +      Log2LMUL = [-2, -1, 0, 1, 2, 3] in
+    defm NAME : RVVOutBuiltinSet<NAME, type_range, suffixes_prototypes>;
    +}
    +
 multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> {
       let Log2LMUL = [-3, -2, -1, 0, 1, 2],
           Name = NAME,
    @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in
         defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
       }
     
    +let RequiredFeatures = ["xsfvfbfexp16e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexp16e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexp32e"] in {
    +  defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexpa"] in {
    +  defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">;
    +}
    +
    +let RequiredFeatures = ["xsfvfexpa64e"] in {
    +  defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">;
    +}
    +
     let UnMaskedPolicyScheme = HasPolicyOperand in
       let RequiredFeatures = ["xsfvfwmaccqqq"] in
         defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>;
    diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
    index 2b361ed0982c6..16258513239d9 100644
    --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
    +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
    @@ -4089,6 +4089,57 @@ def CIR_PrefetchOp : CIR_Op<"prefetch"> {
         }];
     }
     
    +//===----------------------------------------------------------------------===//
    +// ObjSizeOp
    +//===----------------------------------------------------------------------===//
    +
    +def CIR_ObjSizeOp : CIR_Op<"objsize", [Pure]> {
    +  let summary = "Implements the llvm.objsize builtin";
    +  let description = [{
+    The `cir.objsize` operation provides information to the optimizer to
+    determine whether a) an operation (like memcpy) will overflow a buffer
+    that corresponds to an object, or b) a runtime check for overflow is
+    unnecessary. An object in this context means an allocation of a specific
+    class, structure, array, or other object.
    +
    +    When the `min` attribute is present, the operation returns the minimum
    +    guaranteed accessible size. When absent (max mode), it returns the maximum
    +    possible object size. Corresponds to `llvm.objectsize`'s `min` argument.
    +    
    +    The `dynamic` attribute determines if the value should be evaluated at
    +    runtime. Corresponds to `llvm.objectsize`'s `dynamic` argument.
    +
    +    The `nullunknown` attribute controls how null pointers are handled. When
    +    present, null pointers are treated as having unknown size. When absent,
    +    null pointers are treated as having 0 size (in min mode) or -1 size
    +    (in max mode). Corresponds to `llvm.objectsize`'s `nullunknown` argument.
    +
    +    Example:
    +
    +    ```mlir
+    %size = cir.objsize min %ptr : !cir.ptr<!void> -> i64
+    %dsize = cir.objsize max dynamic %ptr : !cir.ptr<!void> -> i64
+    %nsize = cir.objsize min nullunknown %ptr : !cir.ptr<!void> -> i64
    +    ```
    +  }];
    +
    +  let arguments = (ins
    +    CIR_PointerType:$ptr,
    +    UnitAttr:$min,
    +    UnitAttr:$nullunknown,
    +    UnitAttr:$dynamic
    +  );
    +
    +  let results = (outs CIR_AnyFundamentalIntType:$result);
    +
    +  let assemblyFormat = [{
    +      (`min` $min^) : (`max`)?
    +      (`nullunknown` $nullunknown^)?
    +      (`dynamic` $dynamic^)?
    +      $ptr `:` qualified(type($ptr)) `->` qualified(type($result)) attr-dict
    +  }];
    +}
    +
     //===----------------------------------------------------------------------===//
     // PtrDiffOp
     //===----------------------------------------------------------------------===//
    @@ -4171,6 +4222,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> {
       }];
     }
     
    +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> {
    +  let summary = "Computes the ceiling of the specified value";
    +  let description = [{
    +    `cir.ceil` computes the ceiling of a given value and returns a result
    +    of the same type.
    +
    +    Floating-point exceptions are ignored, and it does not set `errno`.
    +  }];
    +}
    +
     def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
       let summary = "Computes the floating-point cosine value";
       let description = [{
    @@ -4181,6 +4242,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
       }];
     }
     
    +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> {
    +  let summary = "Computes the floating-point base-e exponential value";
    +  let description = [{
    +    `cir.exp` computes the exponential of a floating-point operand and returns
    +    a result of the same type.
    +
    +    Floating-point exceptions are ignored, and it does not set `errno`.
    +  }];
    +}
    +
     def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> {
       let summary = "Computes the floating-point absolute value";
       let description = [{
    diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
    index 48ef8be9fb782..a673ae6f9eb71 100644
    --- a/clang/include/clang/CIR/MissingFeatures.h
    +++ b/clang/include/clang/CIR/MissingFeatures.h
    @@ -153,6 +153,7 @@ struct MissingFeatures {
       static bool coroEndBuiltinCall() { return false; }
       static bool coroutineFrame() { return false; }
       static bool emitBodyAndFallthrough() { return false; }
    +  static bool coroOutsideFrameMD() { return false; }
     
       // Various handling of deferred processing in CIRGenModule.
       static bool cgmRelease() { return false; }
    @@ -180,6 +181,8 @@ struct MissingFeatures {
       static bool atomicSyncScopeID() { return false; }
       static bool atomicTypes() { return false; }
       static bool atomicUseLibCall() { return false; }
    +  static bool atomicMicrosoftVolatile() { return false; }
    +  static bool atomicOpenMP() { return false; }
     
       // Global ctor handling
       static bool globalCtorLexOrder() { return false; }
    @@ -187,6 +190,7 @@ struct MissingFeatures {
     
       // Misc
       static bool abiArgInfo() { return false; }
    +  static bool addAutoInitAnnotation() { return false; }
       static bool addHeapAllocSiteMetadata() { return false; }
       static bool aggEmitFinalDestCopyRValue() { return false; }
       static bool aggValueSlot() { return false; }
    @@ -196,6 +200,7 @@ struct MissingFeatures {
       static bool aggValueSlotMayOverlap() { return false; }
       static bool aggValueSlotVolatile() { return false; }
       static bool alignCXXRecordDecl() { return false; }
    +  static bool appleKext() { return false; }
       static bool armComputeVolatileBitfields() { return false; }
       static bool asmGoto() { return false; }
       static bool asmInputOperands() { return false; }
    @@ -213,6 +218,7 @@ struct MissingFeatures {
       static bool builtinCallMathErrno() { return false; }
       static bool builtinCheckKind() { return false; }
       static bool cgCapturedStmtInfo() { return false; }
    +  static bool countedBySize() { return false; }
       static bool cgFPOptionsRAII() { return false; }
       static bool checkBitfieldClipping() { return false; }
       static bool cirgenABIInfo() { return false; }
    @@ -241,6 +247,7 @@ struct MissingFeatures {
       static bool deleteArray() { return false; }
       static bool devirtualizeDestructor() { return false; }
       static bool devirtualizeMemberFunction() { return false; }
    +  static bool dtorCleanups() { return false; }
       static bool ehCleanupFlags() { return false; }
       static bool ehCleanupHasPrebranchedFallthrough() { return false; }
       static bool ehCleanupScope() { return false; }
    @@ -286,11 +293,13 @@ struct MissingFeatures {
       static bool objCGC() { return false; }
       static bool objCLifetime() { return false; }
       static bool hlsl() { return false; }
    +  static bool msvcBuiltins() { return false; }
       static bool openCL() { return false; }
       static bool openMP() { return false; }
       static bool opTBAA() { return false; }
       static bool peepholeProtection() { return false; }
       static bool pgoUse() { return false; }
    +  static bool pointerAuthentication() { return false; }
       static bool pointerOverflowSanitizer() { return false; }
       static bool preservedAccessIndexRegion() { return false; }
       static bool requiresCleanups() { return false; }
    @@ -300,6 +309,10 @@ struct MissingFeatures {
       static bool setNonGC() { return false; }
       static bool setObjCGCLValueClass() { return false; }
       static bool setTargetAttributes() { return false; }
    +  static bool shouldCreateMemCpyFromGlobal() { return false; }
    +  static bool shouldSplitConstantStore() { return false; }
    +  static bool shouldUseBZeroPlusStoresToInitialize() { return false; }
    +  static bool shouldUseMemSetToInitialize() { return false; }
       static bool simplifyCleanupEntry() { return false; }
       static bool sourceLanguageCases() { return false; }
       static bool stackBase() { return false; }
    @@ -311,16 +324,14 @@ struct MissingFeatures {
       static bool thunks() { return false; }
       static bool tryEmitAsConstant() { return false; }
       static bool typeChecks() { return false; }
    -  static bool weakRefReference() { return false; }
    -  static bool writebacks() { return false; }
    -  static bool appleKext() { return false; }
    -  static bool dtorCleanups() { return false; }
    +  static bool vaArgABILowering() { return false; }
    +  static bool vectorConstants() { return false; }
    +  static bool vlas() { return false; }
       static bool vtableInitialization() { return false; }
       static bool vtableEmitMetadata() { return false; }
       static bool vtableRelativeLayout() { return false; }
    -  static bool msvcBuiltins() { return false; }
    -  static bool vaArgABILowering() { return false; }
    -  static bool vlas() { return false; }
    +  static bool weakRefReference() { return false; }
    +  static bool writebacks() { return false; }
     
       // Missing types
       static bool dataMemberType() { return false; }
    diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt
    index 47ac70cd21690..77a44e4c48de5 100644
    --- a/clang/include/clang/CMakeLists.txt
    +++ b/clang/include/clang/CMakeLists.txt
    @@ -3,7 +3,7 @@ add_subdirectory(Basic)
     if(CLANG_ENABLE_CIR)
       add_subdirectory(CIR)
     endif()
    -add_subdirectory(Driver)
    +add_subdirectory(Options)
     add_subdirectory(Parse)
     add_subdirectory(Sema)
     add_subdirectory(Serialization)
    diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h
    index f1b8229edd362..4298ba06c472e 100644
    --- a/clang/include/clang/CodeGen/ModuleBuilder.h
    +++ b/clang/include/clang/CodeGen/ModuleBuilder.h
    @@ -120,6 +120,23 @@ CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags,
                                      llvm::LLVMContext &C,
                                      CoverageSourceInfo *CoverageInfo = nullptr);
     
    +namespace CodeGen {
+/// Demangle the artificial function name (\p FuncName) used to encode trap
+/// reasons in the debug info emitted for traps (e.g. __builtin_verbose_trap).
+/// See `CGDebugInfo::CreateTrapFailureMessageFor`.
    +///
    +/// \param FuncName - The function name to demangle.
    +///
    +/// \return A std::optional. If demangling succeeds the optional will contain
    +/// a pair of StringRefs where the first field is the trap category and the
+/// second is the trap message; both can be empty. If demangling fails, the
+/// optional will not contain a value. Note that the returned StringRefs, if
+/// non-empty, point into the underlying storage of \p FuncName and thus share
+/// its lifetime.
+std::optional<std::pair<StringRef, StringRef>>
    +DemangleTrapReasonInDebugInfo(StringRef FuncName);
    +} // namespace CodeGen
    +
     } // end namespace clang
     
     #endif
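
A usage sketch for the new helper; the surrounding function is illustrative:

```c++
#include "clang/CodeGen/ModuleBuilder.h"

void reportTrap(llvm::StringRef FuncName) {
  if (auto Reason = clang::CodeGen::DemangleTrapReasonInDebugInfo(FuncName)) {
    llvm::StringRef Category = Reason->first; // trap category, may be empty
    llvm::StringRef Message = Reason->second; // trap message, may be empty
    // ... surface Category and Message in the trap diagnostic ...
  }
  // No value: FuncName was not an artificial trap-reason symbol.
}
```
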
    diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
    index b9b187ada8add..aa86bffb802a4 100644
    --- a/clang/include/clang/Driver/Driver.h
    +++ b/clang/include/clang/Driver/Driver.h
    @@ -15,11 +15,11 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/DriverDiagnostic.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Phases.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Types.h"
     #include "clang/Driver/Util.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/STLFunctionalExtras.h"
     #include "llvm/ADT/StringMap.h"
    diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
    index 2852c4a2916a4..b6f124f948b59 100644
    --- a/clang/include/clang/Format/Format.h
    +++ b/clang/include/clang/Format/Format.h
    @@ -62,49 +62,28 @@ struct FormatStyle {
       /// \version 3.3
       int AccessModifierOffset;
     
    -  /// Different styles for aligning after open brackets.
    -  enum BracketAlignmentStyle : int8_t {
    -    /// Align parameters on the open bracket, e.g.:
    -    /// \code
    -    ///   someLongFunction(argument1,
    -    ///                    argument2);
    -    /// \endcode
    -    BAS_Align,
    -    /// Don't align, instead use ``ContinuationIndentWidth``, e.g.:
    -    /// \code
    -    ///   someLongFunction(argument1,
    -    ///       argument2);
    -    /// \endcode
    -    BAS_DontAlign,
    -    /// Always break after an open bracket, if the parameters don't fit
    -    /// on a single line, e.g.:
    -    /// \code
    -    ///   someLongFunction(
    -    ///       argument1, argument2);
    -    /// \endcode
    -    BAS_AlwaysBreak,
    -    /// Always break after an open bracket, if the parameters don't fit
    -    /// on a single line. Closing brackets will be placed on a new line.
    -    /// E.g.:
    -    /// \code
    -    ///   someLongFunction(
    -    ///       argument1, argument2
    -    ///   )
    -    /// \endcode
    -    ///
    -    /// \note
    -    ///  This currently only applies to braced initializer lists (when
    -    ///  ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
    -    /// \endnote
    -    BAS_BlockIndent,
    -  };
    -
       /// If ``true``, horizontally aligns arguments after an open bracket.
       ///
    +  /// \code
    +  ///   true:                         vs.   false
    +  ///   someLongFunction(argument1,         someLongFunction(argument1,
    +  ///                    argument2);            argument2);
    +  /// \endcode
    +  ///
    +  /// \note
+  ///   As of clang-format 22 this option is a bool: the previous ``Align``
+  ///   value maps to ``true`` and ``DontAlign`` to ``false``; the previous
+  ///   ``AlwaysBreak`` and ``BlockIndent`` values map to ``true`` combined
+  ///   with the new style options ``BreakAfterOpenBracketBracedList``,
+  ///   ``BreakAfterOpenBracketFunction``, ``BreakAfterOpenBracketIf``,
+  ///   ``BreakBeforeCloseBracketBracedList``,
+  ///   ``BreakBeforeCloseBracketFunction``, and ``BreakBeforeCloseBracketIf``.
    +  /// \endnote
    +  ///
       /// This applies to round brackets (parentheses), angle brackets and square
       /// brackets.
       /// \version 3.8
    -  BracketAlignmentStyle AlignAfterOpenBracket;
    +  bool AlignAfterOpenBracket;
     
       /// Different style for aligning array initializers.
       enum ArrayInitializerAlignmentStyle : int8_t {
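To make the migration concrete, a sketch (not part of the patch) of how code that builds a FormatStyle programmatically might express the old enum values through the new fields:

    clang::format::FormatStyle Style = clang::format::getLLVMStyle();
    // BAS_DontAlign becomes:
    Style.AlignAfterOpenBracket = false;
    // BAS_AlwaysBreak (here for function parentheses) becomes:
    Style.AlignAfterOpenBracket = true;
    Style.BreakAfterOpenBracketFunction = true;
    // BAS_BlockIndent additionally moves the closing bracket to its own line:
    Style.BreakBeforeCloseBracketFunction = true;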
    @@ -622,9 +601,19 @@ struct FormatStyle {
         ///   int abcdef; // but this isn't
         /// \endcode
         unsigned OverEmptyLines;
     +    /// Whether comments that follow a preprocessor directive should be
     +    /// aligned with comments that don't.
    +    /// \code
    +    ///   true:                               false:
    +    ///   #define A  // Comment   vs.         #define A  // Comment
    +    ///   #define AB // Aligned               #define AB // Aligned
    +    ///   int i;     // Aligned               int i; // Not aligned
    +    /// \endcode
    +    bool AlignPPAndNotPP;
     
         bool operator==(const TrailingCommentsAlignmentStyle &R) const {
    -      return Kind == R.Kind && OverEmptyLines == R.OverEmptyLines;
    +      return Kind == R.Kind && OverEmptyLines == R.OverEmptyLines &&
    +             AlignPPAndNotPP == R.AlignPPAndNotPP;
         }
         bool operator!=(const TrailingCommentsAlignmentStyle &R) const {
           return !(*this == R);
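A one-line sketch (assuming a FormatStyle variable named Style) of toggling the new sub-option:

    // Give directive comments their own alignment column, separate from
    // trailing comments on ordinary code.
    Style.AlignTrailingComments.AlignPPAndNotPP = false;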
    @@ -1708,6 +1697,57 @@ struct FormatStyle {
       /// \version 16
       AttributeBreakingStyle BreakAfterAttributes;
     
    +  /// Force break after the left bracket of a braced initializer list (when
    +  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
    +  /// limit.
    +  /// \code
    +  ///   true:                             false:
     +  ///   vector<int> x {    vs.       vector<int> x {1,
     +  ///      1, 2, 3}                                  2, 3}
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketBracedList;
    +
    +  /// Force break after the left parenthesis of a function (declaration,
    +  /// definition, call) when the parameters exceed the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   foo (                   vs.       foo (a,
    +  ///      a , b)                              b)
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketFunction;
    +
    +  /// Force break after the left parenthesis of an if control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   if constexpr (          vs.       if constexpr (a ||
    +  ///      a || b)                                      b)
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketIf;
    +
    +  /// Force break after the left parenthesis of a loop control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   while (                  vs.      while (a &&
    +  ///      a && b) {                             b) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketLoop;
    +
    +  /// Force break after the left parenthesis of a switch control statement
    +  /// when the expression exceeds the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   switch (                 vs.      switch (a +
    +  ///      a + b) {                               b) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakAfterOpenBracketSwitch;
    +
       /// The function declaration return type breaking style to use.
       /// \version 19
       ReturnTypeBreakingStyle BreakAfterReturnType;
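Illustrative use of the control-statement variants added above (again assuming a FormatStyle variable named Style):

    // Break right after the opening parenthesis whenever the condition of a
    // control statement exceeds the column limit.
    Style.BreakAfterOpenBracketIf = true;
    Style.BreakAfterOpenBracketLoop = true;
    Style.BreakAfterOpenBracketSwitch = true;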
    @@ -2221,6 +2261,69 @@ struct FormatStyle {
       /// \version 3.7
       BraceBreakingStyle BreakBeforeBraces;
     
    +  /// Force break before the right bracket of a braced initializer list (when
    +  /// ``Cpp11BracedListStyle`` is ``true``) when the list exceeds the column
    +  /// limit. The break before the right bracket is only made if there is a
    +  /// break after the opening bracket.
    +  /// \code
    +  ///   true:                             false:
     +  ///   vector<int> x {    vs.       vector<int> x {
     +  ///      1, 2, 3                           1, 2, 3}
     +  ///   }
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketBracedList;
    +
    +  /// Force break before the right parenthesis of a function (declaration,
    +  /// definition, call) when the parameters exceed the column limit.
    +  /// \code
    +  ///   true:                             false:
    +  ///   foo (                   vs.       foo (
    +  ///      a , b                             a , b)
    +  ///   )
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketFunction;
    +
    +  /// Force break before the right parenthesis of an if control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   if constexpr (          vs.       if constexpr (
    +  ///      a || b                            a || b )
    +  ///   )
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketIf;
    +
    +  /// Force break before the right parenthesis of a loop control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   while (                  vs.      while (
    +  ///      a && b                            a && b) {
    +  ///   ) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketLoop;
    +
    +  /// Force break before the right parenthesis of a switch control statement
    +  /// when the expression exceeds the column limit. The break before the
    +  /// closing parenthesis is only made if there is a break after the opening
    +  /// parenthesis.
    +  /// \code
    +  ///   true:                             false:
    +  ///   switch (                 vs.      switch (
    +  ///      a + b                             a + b) {
    +  ///   ) {
    +  /// \endcode
    +  /// \version 22
    +  bool BreakBeforeCloseBracketSwitch;
    +
       /// Different ways to break before concept declarations.
       enum BreakBeforeConceptDeclarationsStyle : int8_t {
         /// Keep the template declaration line together with ``concept``.
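As the comments above note, a close-bracket option only fires when the matching open-bracket option broke the line, so the two are naturally set in pairs (assumed Style variable):

    // Reproduce a block-indent layout for if-conditions:
    //   if constexpr (
    //       a || b
    //   )
    Style.BreakAfterOpenBracketIf = true;
    Style.BreakBeforeCloseBracketIf = true;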
    @@ -5530,10 +5633,23 @@ struct FormatStyle {
                BreakAdjacentStringLiterals == R.BreakAdjacentStringLiterals &&
                BreakAfterAttributes == R.BreakAfterAttributes &&
                BreakAfterJavaFieldAnnotations == R.BreakAfterJavaFieldAnnotations &&
    +           BreakAfterOpenBracketBracedList ==
    +               R.BreakAfterOpenBracketBracedList &&
    +           BreakAfterOpenBracketFunction == R.BreakAfterOpenBracketFunction &&
    +           BreakAfterOpenBracketIf == R.BreakAfterOpenBracketIf &&
    +           BreakAfterOpenBracketLoop == R.BreakAfterOpenBracketLoop &&
    +           BreakAfterOpenBracketSwitch == R.BreakAfterOpenBracketSwitch &&
                BreakAfterReturnType == R.BreakAfterReturnType &&
                BreakArrays == R.BreakArrays &&
                BreakBeforeBinaryOperators == R.BreakBeforeBinaryOperators &&
                BreakBeforeBraces == R.BreakBeforeBraces &&
    +           BreakBeforeCloseBracketBracedList ==
    +               R.BreakBeforeCloseBracketBracedList &&
    +           BreakBeforeCloseBracketFunction ==
    +               R.BreakBeforeCloseBracketFunction &&
    +           BreakBeforeCloseBracketIf == R.BreakBeforeCloseBracketIf &&
    +           BreakBeforeCloseBracketLoop == R.BreakBeforeCloseBracketLoop &&
    +           BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch &&
                BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations &&
                BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon &&
                BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser &&
    diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h
    index 2403cbbb652dd..18ad7bf292f1e 100644
    --- a/clang/include/clang/Frontend/CompilerInstance.h
    +++ b/clang/include/clang/Frontend/CompilerInstance.h
    @@ -946,6 +946,12 @@ class CompilerInstance : public ModuleLoader {
         DependencyCollectors.push_back(std::move(Listener));
       }
     
    +  void clearDependencyCollectors() { DependencyCollectors.clear(); }
    +
     +  std::vector<std::shared_ptr<DependencyCollector>> &getDependencyCollectors() {
    +    return DependencyCollectors;
    +  }
    +
        void setExternalSemaSource(IntrusiveRefCntPtr<ExternalSemaSource> ESS);
     
       ModuleCache &getModuleCache() const { return *ModCache; }
    diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h
    index e147d2ba6087e..51787d914e1ec 100644
    --- a/clang/include/clang/Frontend/CompilerInvocation.h
    +++ b/clang/include/clang/Frontend/CompilerInvocation.h
    @@ -147,6 +147,13 @@ class CompilerInvocationBase {
       }
       /// @}
     
    +  /// Visitation.
    +  /// @{
    +  /// Visits paths stored in the invocation. The callback may return true to
    +  /// short-circuit the visitation, or return false to continue visiting.
    +  void visitPaths(llvm::function_ref Callback) const;
    +  /// @}
    +
       /// Command line generation.
       /// @{
        using StringAllocator = llvm::function_ref<const char *(const llvm::Twine &)>;
    @@ -181,6 +188,12 @@ class CompilerInvocationBase {
       /// This is a (less-efficient) wrapper over generateCC1CommandLine().
        std::vector<std::string> getCC1CommandLine() const;
     
    +protected:
    +  /// Visits paths stored in the invocation. This is generally unsafe to call
     +  /// directly, and each subclass needs to ensure that calling this doesn't
     +  /// violate its invariants.
    +  void visitPathsImpl(llvm::function_ref Predicate);
    +
     private:
       /// Generate command line options from DiagnosticOptions.
       static void GenerateDiagnosticArgs(const DiagnosticOptions &Opts,
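A usage sketch for the new visitation API (not part of the patch; the callback is assumed to receive the path as a StringRef and return bool, per the short-circuit comment above):

    // Collect every path stored in a CompilerInvocation.
    std::vector<std::string> Paths;
    Invocation.visitPaths([&](llvm::StringRef Path) {
      Paths.push_back(Path.str());
      return false; // false = keep visiting; true would stop early
    });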
    diff --git a/clang/include/clang/Frontend/FrontendActions.h b/clang/include/clang/Frontend/FrontendActions.h
    index 73308c004bd23..87a9f0d4cb06c 100644
    --- a/clang/include/clang/Frontend/FrontendActions.h
    +++ b/clang/include/clang/Frontend/FrontendActions.h
    @@ -320,15 +320,6 @@ class PrintPreprocessedAction : public PreprocessorFrontendAction {
       bool hasPCHSupport() const override { return true; }
     };
     
    -class GetDependenciesByModuleNameAction : public PreprocessOnlyAction {
    -  StringRef ModuleName;
    -  void ExecuteAction() override;
    -
    -public:
    -  GetDependenciesByModuleNameAction(StringRef ModuleName)
    -      : ModuleName(ModuleName) {}
    -};
    -
     //===----------------------------------------------------------------------===//
     // HLSL Specific Actions
     //===----------------------------------------------------------------------===//
    diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
    index c919a53ae089e..ba7da56cb9fce 100644
    --- a/clang/include/clang/Frontend/FrontendOptions.h
    +++ b/clang/include/clang/Frontend/FrontendOptions.h
    @@ -241,6 +241,8 @@ class FrontendInputFile {
       /// Whether we're dealing with a 'system' input (vs. a 'user' input).
       bool IsSystem = false;
     
    +  friend class CompilerInvocationBase;
    +
     public:
       FrontendInputFile() = default;
       FrontendInputFile(StringRef File, InputKind Kind, bool IsSystem = false)
    diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
    index 49fd920d1ec43..ed2703c76f18d 100644
    --- a/clang/include/clang/Frontend/Utils.h
    +++ b/clang/include/clang/Frontend/Utils.h
    @@ -15,8 +15,8 @@
     
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/LLVM.h"
    -#include "clang/Driver/OptionUtils.h"
     #include "clang/Frontend/DependencyOutputOptions.h"
    +#include "clang/Options/OptionUtils.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/IntrusiveRefCntPtr.h"
     #include "llvm/ADT/StringMap.h"
    diff --git a/clang/include/clang/Lex/PPEmbedParameters.h b/clang/include/clang/Lex/PPEmbedParameters.h
    index c4fb8d02f6f35..41a69664df366 100644
    --- a/clang/include/clang/Lex/PPEmbedParameters.h
    +++ b/clang/include/clang/Lex/PPEmbedParameters.h
    @@ -6,7 +6,7 @@
     //
     //===----------------------------------------------------------------------===//
     //
    -// Defines all of the preprocessor directive parmeters for #embed
    +// Defines all of the preprocessor directive parameters for #embed
     //
     //===----------------------------------------------------------------------===//
     
    diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
    index 39754847a93e4..b1c648e647f41 100644
    --- a/clang/include/clang/Lex/Preprocessor.h
    +++ b/clang/include/clang/Lex/Preprocessor.h
    @@ -226,7 +226,7 @@ class Preprocessor {
           LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine;
     
       // Next __COUNTER__ value, starts at 0.
    -  unsigned CounterValue = 0;
    +  uint32_t CounterValue = 0;
     
       enum {
         /// Maximum depth of \#includes.
    @@ -1327,6 +1327,7 @@ class Preprocessor {
                                                     std::move(Callbacks));
         Callbacks = std::move(C);
       }
    +  void removePPCallbacks() { Callbacks.reset(); }
       /// \}
     
       /// Get the number of tokens processed so far.
    @@ -2421,8 +2422,8 @@ class Preprocessor {
       bool SawDateOrTime() const {
         return DATELoc != SourceLocation() || TIMELoc != SourceLocation();
       }
    -  unsigned getCounterValue() const { return CounterValue; }
    -  void setCounterValue(unsigned V) { CounterValue = V; }
    +  uint32_t getCounterValue() const { return CounterValue; }
    +  void setCounterValue(uint32_t V) { CounterValue = V; }
     
       LangOptions::FPEvalMethodKind getCurrentFPEvalMethod() const {
         assert(CurrentFPEvalMethod != LangOptions::FEM_UnsetOnCommandLine &&
    diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h
    index d4c4e1ccbf2c4..1c2f6e72e1b93 100644
    --- a/clang/include/clang/Lex/PreprocessorOptions.h
    +++ b/clang/include/clang/Lex/PreprocessorOptions.h
    @@ -198,6 +198,10 @@ class PreprocessorOptions {
       /// If set, the UNIX timestamp specified by SOURCE_DATE_EPOCH.
        std::optional<uint64_t> SourceDateEpoch;
     
     +  /// The initial value for __COUNTER__; typically zero, but it can be set
     +  /// via a -cc1 flag for testing purposes.
    +  uint32_t InitialCounterValue = 0;
    +
     public:
       PreprocessorOptions() : PrecompiledPreambleBytes(0, false) {}
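The intended effect, illustrated (assuming the -finitial-counter-value= cc1 flag defined later in this patch is what seeds the field):

    // clang -cc1 -finitial-counter-value=100 ...
    int A = __COUNTER__; // expands to 100
    int B = __COUNTER__; // expands to 101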
     
    diff --git a/clang/include/clang/Driver/CMakeLists.txt b/clang/include/clang/Options/CMakeLists.txt
    similarity index 100%
    rename from clang/include/clang/Driver/CMakeLists.txt
    rename to clang/include/clang/Options/CMakeLists.txt
    diff --git a/clang/include/clang/Driver/ClangOptionDocs.td b/clang/include/clang/Options/ClangOptionDocs.td
    similarity index 100%
    rename from clang/include/clang/Driver/ClangOptionDocs.td
    rename to clang/include/clang/Options/ClangOptionDocs.td
    diff --git a/clang/include/clang/Driver/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h
    similarity index 94%
    rename from clang/include/clang/Driver/OptionUtils.h
    rename to clang/include/clang/Options/OptionUtils.h
    index 922f536bf33ea..83c48bd7d6843 100644
    --- a/clang/include/clang/Driver/OptionUtils.h
    +++ b/clang/include/clang/Options/OptionUtils.h
    @@ -10,8 +10,8 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H
    -#define LLVM_CLANG_DRIVER_OPTIONUTILS_H
    +#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H
    +#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H
     
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/LLVM.h"
    @@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args,
     
     } // namespace clang
     
    -#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H
    +#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H
    diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Options/Options.h
    similarity index 83%
    rename from clang/include/clang/Driver/Options.h
    rename to clang/include/clang/Options/Options.h
    index 0797410e9940e..ac98699001965 100644
    --- a/clang/include/clang/Driver/Options.h
    +++ b/clang/include/clang/Options/Options.h
    @@ -6,14 +6,13 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#ifndef LLVM_CLANG_DRIVER_OPTIONS_H
    -#define LLVM_CLANG_DRIVER_OPTIONS_H
    +#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H
    +#define LLVM_CLANG_OPTIONS_OPTIONS_H
     
     #include "llvm/Option/OptTable.h"
     #include "llvm/Option/Option.h"
     
     namespace clang {
    -namespace driver {
     
     namespace options {
     /// Flags specifically for clang options.  Must not overlap with
    @@ -42,16 +41,15 @@ enum ClangVisibility {
     };
     
     enum ID {
    -    OPT_INVALID = 0, // This is not an option ID.
    +  OPT_INVALID = 0, // This is not an option ID.
     #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
    -#include "clang/Driver/Options.inc"
    -    LastOption
    +#include "clang/Options/Options.inc"
    +  LastOption
     #undef OPTION
    -  };
    -}
    +};
    +} // namespace options
     
     const llvm::opt::OptTable &getDriverOptTable();
    -}
    -}
    +} // namespace clang
     
    -#endif
    +#endif // LLVM_CLANG_OPTIONS_OPTIONS_H
    diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Options/Options.td
    similarity index 99%
    rename from clang/include/clang/Driver/Options.td
    rename to clang/include/clang/Options/Options.td
    index cb5cb888c6da7..2f7434d8afe11 100644
    --- a/clang/include/clang/Driver/Options.td
    +++ b/clang/include/clang/Options/Options.td
    @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>,
         "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. "
         "Flang will use the GCC installation with the largest version">;
     def gcc_triple_EQ : Joined<["--"], "gcc-triple=">,
    +  Visibility<[ClangOption, FlangOption]>,
       HelpText<"Search for the GCC installation with the specified triple.">;
     def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>,
        Group<Preprocessor_Group>,
    @@ -950,9 +951,9 @@ def Xarch__
       the host system, which can be used to suppress incompatible GPU arguments.}]>,
           MetaVarName<" ">;
     def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>,
    -  HelpText<"Pass  to the CUDA/HIP host compilation">, MetaVarName<"">;
    +  HelpText<"Pass  to host compilation in the offloading toolchain">, MetaVarName<"">;
     def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>,
    -  HelpText<"Pass  to the CUDA/HIP device compilation">, MetaVarName<"">;
    +  HelpText<"Pass  to device compilation in the offloading toolchain">, MetaVarName<"">;
     def Xassembler : Separate<["-"], "Xassembler">,
       HelpText<"Pass  to the assembler">, MetaVarName<"">,
       Group;
    @@ -2154,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers",
        PosFlag<SetTrue, [], [ClangOption, CC1Option], "Allow">,
        NegFlag<SetFalse, [], [ClangOption, CC1Option], "Disallow">,
       BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>;
    -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group;
    -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group;
    +
    +defm dwarf2_cfi_asm
    +    : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse,
     +                  PosFlag<SetTrue>,
     +                  NegFlag<SetFalse>>;
    +
     defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm",
       CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse,
        NegFlag<SetTrue, [], [ClangOption, CC1Option]>,
    @@ -5661,6 +5666,9 @@ def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings
      def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
      def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
        HelpText<"Omit frame pointer setup for leaf functions">;
     +def mno_reserve_frame_pointer_reg : Flag<["-"], "mno-reserve-frame-pointer-reg">, Group<m_Group>;
     +def mreserve_frame_pointer_reg : Flag<["-"], "mreserve-frame-pointer-reg">, Group<m_Group>,
     +  HelpText<"Reserve the frame pointer register even if the function doesn't have a frame">;
      def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
      def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>;
      def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
     @@ -6695,8 +6703,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
      def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
      def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
      def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
     -def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
     -def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
      def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
      def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
      def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
    @@ -8447,6 +8453,10 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">,
        MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>,
        ShouldParseIf<faligned_allocation.KeyPath>;
     
    +def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">,
    +  HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
    +  MarshallingInfoInt, "0">;
    +
     } // let Visibility = [CC1Option]
     
     //===----------------------------------------------------------------------===//
    @@ -8487,8 +8497,8 @@ def pic_is_pie : Flag<["-"], "pic-is-pie">,
        MarshallingInfoFlag<LangOpts<"PIE">>;
     
     def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
    -  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,reserved,none">,
    -  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "Reserved", "None"]>,
    +  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,non-leaf-no-reserve,reserved,none">,
    +  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "NonLeafNoReserve", "Reserved", "None"]>,
       MarshallingInfoEnum, "None">;
     
     
    @@ -8628,6 +8638,11 @@ def fobjc_subscripting_legacy_runtime : Flag<["-"], "fobjc-subscripting-legacy-r
     def vtordisp_mode_EQ : Joined<["-"], "vtordisp-mode=">,
       HelpText<"Control vtordisp placement on win32 targets">,
       MarshallingInfoInt, "1">;
    +def fnative_int16_type : Flag<["-"], "fnative-int16-type">,
    +  HelpText<"Use 16 bit integer types">,
    +  // This option is implied unless we are in HLSL lang mode
    +  ImpliedByAnyOf<[!strconcat("!", hlsl.KeyPath)]>,
    +  MarshallingInfoFlag>;
     def fnative_half_type: Flag<["-"], "fnative-half-type">,
       HelpText<"Use the native half type for __fp16 instead of promoting to float">,
        MarshallingInfoFlag<LangOpts<"NativeHalfType">>,
     @@ -9206,6 +9221,12 @@ def : CLFlag<"Qscatter-">, Alias<mno_scatter>,
     
     def _SLASH_arch : CLCompileJoined<"arch:">,
       HelpText<"Set architecture for code generation">;
    +def _SLASH_vlen : CLFlag<"vlen">,
    +  HelpText<"Set default vector length for autovectorization and other optimizations">;
    +def _SLASH_vlen_EQ_256 : CLFlag<"vlen=256">,
    +  HelpText<"Set vector length of 256 bits for autovectorization and other optimizations">;
    +def _SLASH_vlen_EQ_512 : CLFlag<"vlen=512">,
    +  HelpText<"Set vector length of 512 bits for autovectorization and other optimizations">;
     
     def _SLASH_M_Group : OptionGroup<"">, Group;
     def _SLASH_volatile_Group : OptionGroup<"">,
    @@ -9520,7 +9541,7 @@ def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">,
       HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all."
                "Same as -S + -emit-llvm + -disable-llvm-passes.">;
     def fcgl : DXCFlag<"fcgl">, Alias;
    -def enable_16bit_types : DXCFlag<"enable-16bit-types">, Alias,
    +def enable_16bit_types : DXCFlag<"enable-16bit-types">,
       HelpText<"Enable 16-bit types and disable min precision types."
                "Available in HLSL 2018 and shader model 6.2.">;
     def fdx_rootsignature_version :
    diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
    index dad8efd0f017f..58eb1c0a7c114 100644
    --- a/clang/include/clang/Parse/Parser.h
    +++ b/clang/include/clang/Parse/Parser.h
    @@ -5223,11 +5223,7 @@ class Parser : public CodeCompletionHandler {
       ///         assignment-expression
       ///         '{' ...
       /// \endverbatim
    -  ExprResult ParseInitializer() {
    -    if (Tok.isNot(tok::l_brace))
    -      return ParseAssignmentExpression();
    -    return ParseBraceInitializer();
    -  }
    +  ExprResult ParseInitializer(Decl *DeclForInitializer = nullptr);
     
       /// MayBeDesignationStart - Return true if the current token might be the
       /// start of a designator.  If we can tell it is impossible that it is a
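The out-of-line definition is not part of this excerpt; as a sketch under that caveat, it presumably keeps the old dispatch (the removed lines above) and records the declaration for the DeclForInitializer machinery added to Sema below:

    // Hypothetical Parser.cpp body; the bookkeeping comment is an assumption.
    ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) {
      // (Assumed) note DeclForInitializer in the current expression
      // evaluation context, then parse exactly as before.
      if (Tok.isNot(tok::l_brace))
        return ParseAssignmentExpression();
      return ParseBraceInitializer();
    }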
    diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h
    index 4103c3f006a8f..20a2030f56034 100644
    --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h
    +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h
    @@ -14,15 +14,19 @@
     #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H
     
     #include "clang/AST/Decl.h"
    +#include "clang/Sema/ScopeInfo.h"
     #include "llvm/ADT/DenseMap.h"
    +#include "llvm/ADT/MapVector.h"
      #include <memory>
     
     namespace clang {
     
    +class AnalysisDeclContext;
     class Decl;
     class FunctionDecl;
     class QualType;
     class Sema;
    +class VarDecl;
     namespace sema {
       class FunctionScopeInfo;
       class SemaPPCallbacks;
    @@ -57,6 +61,8 @@ class AnalysisBasedWarnings {
     
       enum VisitFlag { NotVisited = 0, Visited = 1, Pending = 2 };
        llvm::DenseMap<const FunctionDecl *, VisitFlag> VisitedFD;
     +  std::multimap<VarDecl *, PossiblyUnreachableDiag>
    +      VarDeclPossiblyUnreachableDiags;
     
       Policy PolicyOverrides;
       void clearOverrides();
    @@ -107,6 +113,10 @@ class AnalysisBasedWarnings {
       // Issue warnings that require whole-translation-unit analysis.
       void IssueWarnings(TranslationUnitDecl *D);
     
    +  void registerVarDeclWarning(VarDecl *VD, PossiblyUnreachableDiag PUD);
    +
    +  void issueWarningsForRegisteredVarDecl(VarDecl *VD);
    +
       // Gets the default policy which is in effect at the given source location.
       Policy getPolicyInEffectAt(SourceLocation Loc);
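A flow sketch for the new registration pair (hypothetical call sites; S, VD, and PUD are placeholders):

    // While analyzing an initializer: defer a diagnostic that sits in
    // possibly-unreachable code instead of emitting it immediately.
    S.AnalysisWarnings.registerVarDeclWarning(VD, PUD);
    // Once the whole initializer has been processed and reachability is known:
    S.AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD);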
     
    diff --git a/clang/include/clang/Sema/Attr.h b/clang/include/clang/Sema/Attr.h
    index 3f0b10212789a..5836231818eec 100644
    --- a/clang/include/clang/Sema/Attr.h
    +++ b/clang/include/clang/Sema/Attr.h
    @@ -123,6 +123,12 @@ inline bool isInstanceMethod(const Decl *D) {
       return false;
     }
     
    +inline bool hasImplicitObjectParameter(const Decl *D) {
     +  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(D))
    +    return MethodDecl->isImplicitObjectMemberFunction();
    +  return false;
    +}
    +
     /// Diagnose mutually exclusive attributes when present on a given
     /// declaration. Returns true if diagnosed.
      template <typename AttrTy>
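Why this replaces isInstanceMethod in the format-string logic (illustrative, not from the patch): a C++23 explicit-object member function is an instance method yet has no implicit this, so the implicit-parameter adjustment must not be applied to it:

    struct S {
      // Implicit object parameter: 'fmt' is index 2 because 'this' counts.
      void f(const char *fmt, ...) __attribute__((format(printf, 2, 3)));
      // Explicit object parameter ("deducing this"): no implicit 'this'; the
      // object parameter is an ordinary declared parameter instead.
      void g(this S self, const char *fmt, ...);
    };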
    diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
    index 52904c72d1cfc..163ab32fafa48 100644
    --- a/clang/include/clang/Sema/Sema.h
    +++ b/clang/include/clang/Sema/Sema.h
    @@ -2608,13 +2608,13 @@ class Sema final : public SemaBase {
       };
     
       /// Given a function and its FormatAttr or FormatMatchesAttr info, attempts to
    -  /// populate the FomatStringInfo parameter with the attribute's correct
    +  /// populate the FormatStringInfo parameter with the attribute's correct
       /// format_idx and firstDataArg. Returns true when the format fits the
       /// function and the FormatStringInfo has been populated.
       static bool getFormatStringInfo(const Decl *Function, unsigned FormatIdx,
                                       unsigned FirstArg, FormatStringInfo *FSI);
       static bool getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
    -                                  bool IsCXXMember, bool IsVariadic,
    +                                  bool HasImplicitThisParam, bool IsVariadic,
                                       FormatStringInfo *FSI);
     
       // Used by C++ template instantiation.
    @@ -5119,7 +5119,7 @@ class Sema final : public SemaBase {
         // In C++ the implicit 'this' function parameter also counts.
         // Parameters are counted from one.
         bool HP = hasFunctionProto(D);
    -    bool HasImplicitThisParam = isInstanceMethod(D);
    +    bool HasImplicitThisParam = hasImplicitObjectParameter(D);
         bool IV = HP && isFunctionOrMethodVariadic(D);
         unsigned NumParams =
             (HP ? getFunctionOrMethodNumParams(D) : 0) + HasImplicitThisParam;
    @@ -6756,6 +6756,11 @@ class Sema final : public SemaBase {
         /// suffice, e.g., in a default function argument.
         Decl *ManglingContextDecl;
     
     +    /// The declaration whose initializer is currently being parsed, if
     +    /// any. Used when an expression in the initializer has a possibly
     +    /// unreachable diagnostic, so it can reference the declaration as a whole.
    +    VarDecl *DeclForInitializer = nullptr;
    +
         /// If we are processing a decltype type, a set of call expressions
         /// for which we have deferred checking the completeness of the return type.
          SmallVector<CallExpr *, 8> DelayedDecltypeCalls;
    @@ -11309,9 +11314,6 @@ class Sema final : public SemaBase {
                               InventedParameterInfos.end());
       }
     
    -  /// The number of SFINAE diagnostics that have been trapped.
    -  unsigned NumSFINAEErrors;
    -
        ArrayRef<sema::FunctionScopeInfo *> getFunctionScopes() const {
         return llvm::ArrayRef(FunctionScopes.begin() + FunctionScopesStart,
                               FunctionScopes.end());
    @@ -11668,7 +11670,7 @@ class Sema final : public SemaBase {
           ASTTemplateArgsPtr TemplateArgsIn, SourceLocation RAngleLoc);
     
       DeclResult ActOnVarTemplateSpecialization(
    -      Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous,
    +      Scope *S, Declarator &D, TypeSourceInfo *TSI, LookupResult &Previous,
           SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams,
           StorageClass SC, bool IsPartialSpecialization);
     
    @@ -12385,49 +12387,65 @@ class Sema final : public SemaBase {
       ///@{
     
     public:
    -  /// When true, access checking violations are treated as SFINAE
    -  /// failures rather than hard errors.
    -  bool AccessCheckingSFINAE;
    +  class SFINAETrap;
    +
    +  struct SFINAEContextBase {
    +    SFINAEContextBase(Sema &S, SFINAETrap *Cur)
    +        : S(S), Prev(std::exchange(S.CurrentSFINAEContext, Cur)) {}
    +
    +  protected:
    +    Sema &S;
    +    ~SFINAEContextBase() { S.CurrentSFINAEContext = Prev; }
    +
    +  private:
    +    SFINAETrap *Prev;
    +  };
    +
    +  struct NonSFINAEContext : SFINAEContextBase {
    +    NonSFINAEContext(Sema &S) : SFINAEContextBase(S, nullptr) {}
    +  };
     
       /// RAII class used to determine whether SFINAE has
       /// trapped any errors that occur during template argument
       /// deduction.
    -  class SFINAETrap {
    -    Sema &SemaRef;
    -    unsigned PrevSFINAEErrors;
    -    bool PrevInNonInstantiationSFINAEContext;
    -    bool PrevAccessCheckingSFINAE;
    -    bool PrevLastDiagnosticIgnored;
    +  class SFINAETrap : SFINAEContextBase {
     +    bool HasErrorOccurred = false;
    +    bool WithAccessChecking = false;
    +    bool PrevLastDiagnosticIgnored =
    +        S.getDiagnostics().isLastDiagnosticIgnored();
    +    sema::TemplateDeductionInfo *DeductionInfo = nullptr;
    +
    +    SFINAETrap(Sema &S, sema::TemplateDeductionInfo *Info,
    +               bool WithAccessChecking)
    +        : SFINAEContextBase(S, this), WithAccessChecking(WithAccessChecking),
    +          DeductionInfo(Info) {}
     
       public:
    -    /// \param ForValidityCheck If true, discard all diagnostics (from the
    +    /// \param WithAccessChecking If true, discard all diagnostics (from the
         /// immediate context) instead of adding them to the currently active
    -    /// \ref TemplateDeductionInfo (as returned by \ref isSFINAEContext).
    -    explicit SFINAETrap(Sema &SemaRef, bool ForValidityCheck = false)
    -        : SemaRef(SemaRef), PrevSFINAEErrors(SemaRef.NumSFINAEErrors),
    -          PrevInNonInstantiationSFINAEContext(
    -              SemaRef.InNonInstantiationSFINAEContext),
    -          PrevAccessCheckingSFINAE(SemaRef.AccessCheckingSFINAE),
    -          PrevLastDiagnosticIgnored(
    -              SemaRef.getDiagnostics().isLastDiagnosticIgnored()) {
    -      if (ForValidityCheck || !SemaRef.isSFINAEContext())
    -        SemaRef.InNonInstantiationSFINAEContext = true;
    -      SemaRef.AccessCheckingSFINAE = ForValidityCheck;
    -    }
    +    /// \ref TemplateDeductionInfo.
    +    explicit SFINAETrap(Sema &S, bool WithAccessChecking = false)
    +        : SFINAETrap(S, /*Info=*/nullptr, WithAccessChecking) {}
    +
    +    SFINAETrap(Sema &S, sema::TemplateDeductionInfo &Info)
    +        : SFINAETrap(S, &Info, /*WithAccessChecking=*/false) {}
     
         ~SFINAETrap() {
    -      SemaRef.NumSFINAEErrors = PrevSFINAEErrors;
    -      SemaRef.InNonInstantiationSFINAEContext =
    -          PrevInNonInstantiationSFINAEContext;
    -      SemaRef.AccessCheckingSFINAE = PrevAccessCheckingSFINAE;
    -      SemaRef.getDiagnostics().setLastDiagnosticIgnored(
    -          PrevLastDiagnosticIgnored);
    +      S.getDiagnostics().setLastDiagnosticIgnored(PrevLastDiagnosticIgnored);
         }
     
    -    /// Determine whether any SFINAE errors have been trapped.
    -    bool hasErrorOccurred() const {
    -      return SemaRef.NumSFINAEErrors > PrevSFINAEErrors;
    +    SFINAETrap(const SFINAETrap &) = delete;
    +    SFINAETrap &operator=(const SFINAETrap &) = delete;
    +
    +    sema::TemplateDeductionInfo *getDeductionInfo() const {
    +      return DeductionInfo;
         }
    +
    +    /// Determine whether any SFINAE errors have been trapped.
     +    bool hasErrorOccurred() const { return HasErrorOccurred; }
     +    void setErrorOccurred() { HasErrorOccurred = true; }
    +
    +    bool withAccessChecking() const { return WithAccessChecking; }
       };
     
       /// RAII class used to indicate that we are performing provisional
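A call-site sketch of the reworked trap (hypothetical; CheckPlaceholderExpr merely stands in for any checking entry point):

    // Probe whether an expression is valid without emitting hard errors;
    // diagnostics from the immediate context are discarded.
    bool isValidExpr(clang::Sema &S, clang::Expr *E) {
      clang::Sema::SFINAETrap Trap(S, /*WithAccessChecking=*/true);
      clang::ExprResult R = S.CheckPlaceholderExpr(E);
      return !R.isInvalid() && !Trap.hasErrorOccurred();
    }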
    @@ -13148,9 +13166,6 @@ class Sema final : public SemaBase {
           PartialOrderingTTP,
         } Kind;
     
    -    /// Was the enclosing context a non-instantiation SFINAE context?
    -    bool SavedInNonInstantiationSFINAEContext;
    -
         /// Whether we're substituting into constraints.
         bool InConstraintSubstitution;
     
    @@ -13195,22 +13210,15 @@ class Sema final : public SemaBase {
           return {TemplateArgs, NumTemplateArgs};
         }
     
    -    /// The template deduction info object associated with the
    -    /// substitution or checking of explicit or deduced template arguments.
    -    sema::TemplateDeductionInfo *DeductionInfo;
    -
         /// The source range that covers the construct that cause
         /// the instantiation, e.g., the template-id that causes a class
         /// template instantiation.
         SourceRange InstantiationRange;
     
         CodeSynthesisContext()
    -        : Kind(TemplateInstantiation),
    -          SavedInNonInstantiationSFINAEContext(false),
    -          InConstraintSubstitution(false),
    +        : Kind(TemplateInstantiation), InConstraintSubstitution(false),
               InParameterMappingSubstitution(false), Entity(nullptr),
    -          Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0),
    -          DeductionInfo(nullptr) {}
    +          Template(nullptr), TemplateArgs(nullptr), NumTemplateArgs(0) {}
     
         /// Determines whether this template is an actual instantiation
         /// that should be counted toward the maximum instantiation depth.
    @@ -13262,7 +13270,6 @@ class Sema final : public SemaBase {
                               FunctionTemplateDecl *FunctionTemplate,
                            ArrayRef<TemplateArgument> TemplateArgs,
                               CodeSynthesisContext::SynthesisKind Kind,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13270,7 +13277,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               TemplateDecl *Template,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13279,7 +13285,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               ClassTemplatePartialSpecializationDecl *PartialSpec,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating as part of template
    @@ -13288,7 +13293,6 @@ class Sema final : public SemaBase {
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               VarTemplatePartialSpecializationDecl *PartialSpec,
                            ArrayRef<TemplateArgument> TemplateArgs,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// Note that we are instantiating a default argument for a function
    @@ -13334,7 +13338,6 @@ class Sema final : public SemaBase {
         /// concept.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               ConstraintSubstitution, NamedDecl *Template,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange);
     
         struct ConstraintNormalization {};
    @@ -13354,7 +13357,6 @@ class Sema final : public SemaBase {
         /// a requirement of a requires expression.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               concepts::Requirement *Req,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange = SourceRange());
     
         /// \brief Note that we are checking the satisfaction of the constraint
    @@ -13366,7 +13368,6 @@ class Sema final : public SemaBase {
         /// \brief Note that we are checking a requires clause.
         InstantiatingTemplate(Sema &SemaRef, SourceLocation PointOfInstantiation,
                               const RequiresExpr *E,
    -                          sema::TemplateDeductionInfo &DeductionInfo,
                               SourceRange InstantiationRange);
     
         struct BuildingDeductionGuidesTag {};
    @@ -13399,8 +13400,7 @@ class Sema final : public SemaBase {
                               SourceLocation PointOfInstantiation,
                               SourceRange InstantiationRange, Decl *Entity,
                               NamedDecl *Template = nullptr,
     -                          ArrayRef<TemplateArgument> TemplateArgs = {},
     -                          sema::TemplateDeductionInfo *DeductionInfo = nullptr);
     +                          ArrayRef<TemplateArgument> TemplateArgs = {});
     
         InstantiatingTemplate(const InstantiatingTemplate &) = delete;
     
    @@ -13541,12 +13541,7 @@ class Sema final : public SemaBase {
       /// recent visible declaration of that namespace.
        llvm::DenseMap<NamespaceDecl *, NamespaceDecl *> VisibleNamespaceCache;
     
    -  /// Whether we are in a SFINAE context that is not associated with
    -  /// template instantiation.
    -  ///
    -  /// This is used when setting up a SFINAE trap (\c see SFINAETrap) outside
    -  /// of a template instantiation or template argument deduction.
    -  bool InNonInstantiationSFINAEContext;
    +  SFINAETrap *CurrentSFINAEContext = nullptr;
     
       /// The number of \p CodeSynthesisContexts that are not template
       /// instantiations and, therefore, should not be counted as part of the
    @@ -13617,15 +13612,13 @@ class Sema final : public SemaBase {
         PrintInstantiationStack(getDefaultDiagFunc());
       }
     
    -  /// Determines whether we are currently in a context where
    -  /// template argument substitution failures are not considered
    -  /// errors.
    -  ///
    -  /// \returns An empty \c Optional if we're not in a SFINAE context.
    -  /// Otherwise, contains a pointer that, if non-NULL, contains the nearest
    -  /// template-deduction context object, which can be used to capture
    -  /// diagnostics that will be suppressed.
     -  std::optional<sema::TemplateDeductionInfo *> isSFINAEContext() const;
    +  /// Returns a pointer to the current SFINAE context, if any.
    +  [[nodiscard]] SFINAETrap *getSFINAEContext() const {
    +    return CurrentSFINAEContext;
    +  }
    +  [[nodiscard]] bool isSFINAEContext() const {
    +    return CurrentSFINAEContext != nullptr;
    +  }
     
       /// Perform substitution on the type T with a given set of template
       /// arguments.
    @@ -14637,7 +14630,8 @@ class Sema final : public SemaBase {
          ArrayRef<UnexpandedParameterPack> Unexpanded,
           const MultiLevelTemplateArgumentList &TemplateArgs,
           bool FailOnPackProducingTemplates, bool &ShouldExpand,
    -      bool &RetainExpansion, UnsignedOrNone &NumExpansions);
    +      bool &RetainExpansion, UnsignedOrNone &NumExpansions,
    +      bool Diagnose = true);
     
       /// Determine the number of arguments in the given pack expansion
       /// type.
    diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
    index 8c3b6ae176389..28b03ac4c4676 100644
    --- a/clang/include/clang/Sema/SemaHLSL.h
    +++ b/clang/include/clang/Sema/SemaHLSL.h
    @@ -20,7 +20,9 @@
     #include "clang/Basic/DiagnosticSema.h"
     #include "clang/Basic/SourceLocation.h"
     #include "clang/Sema/SemaBase.h"
    +#include "llvm/ADT/DenseMap.h"
     #include "llvm/ADT/SmallVector.h"
    +#include "llvm/ADT/StringSet.h"
     #include "llvm/TargetParser/Triple.h"
      #include <initializer_list>
     
    @@ -259,9 +261,11 @@ class SemaHLSL : public SemaBase {
       HLSLSemanticAttr *createSemantic(const SemanticInfo &Semantic,
                                        DeclaratorDecl *TargetDecl);
       bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *D,
    -                                       SemanticInfo &ActiveSemantic);
    +                                       SemanticInfo &ActiveSemantic,
    +                                       llvm::StringSet<> &ActiveInputSemantics);
       bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
    -                               SemanticInfo &ActiveSemantic);
    +                               SemanticInfo &ActiveSemantic,
    +                               llvm::StringSet<> &ActiveInputSemantics);
     
       void processExplicitBindingsOnDecl(VarDecl *D);
     
    diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
    index ba12b403d9b9a..e5628e845c9ee 100644
    --- a/clang/include/clang/Sema/SemaOpenMP.h
    +++ b/clang/include/clang/Sema/SemaOpenMP.h
    @@ -1411,6 +1411,13 @@ class SemaOpenMP : public SemaBase {
                                                 SourceLocation LParenLoc,
                                                 SourceLocation EndLoc);
     
    +  /// Called on a well-formed 'dyn_groupprivate' clause.
    +  OMPClause *ActOnOpenMPDynGroupprivateClause(
    +      OpenMPDynGroupprivateClauseModifier M1,
    +      OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +      SourceLocation M2Loc, SourceLocation EndLoc);
    +
       /// Called on well-formed 'doacross' clause.
       OMPClause *
       ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType,
    diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
    index af856a8097ab1..4ca45a16408a6 100644
    --- a/clang/include/clang/Serialization/ASTReader.h
    +++ b/clang/include/clang/Serialization/ASTReader.h
    @@ -220,8 +220,8 @@ class ASTReaderListener {
       }
     
       /// Receives __COUNTER__ value.
    -  virtual void ReadCounter(const serialization::ModuleFile &M,
    -                           unsigned Value) {}
    +  virtual void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) {
    +  }
     
       /// This is called for each AST file loaded.
       virtual void visitModuleFile(StringRef Filename,
    @@ -312,7 +312,7 @@ class ChainedASTReaderListener : public ASTReaderListener {
                                    bool Complain,
                                    std::string &SuggestedPredefines) override;
     
    -  void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override;
    +  void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) override;
       bool needsInputFileVisitation() override;
       bool needsSystemInputFileVisitation() override;
       void visitModuleFile(StringRef Filename,
    @@ -352,7 +352,7 @@ class PCHValidator : public ASTReaderListener {
                                    StringRef ModuleFilename,
                                    StringRef SpecificModuleCachePath,
                                    bool Complain) override;
    -  void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override;
    +  void ReadCounter(const serialization::ModuleFile &M, uint32_t Value) override;
     };
     
     /// ASTReaderListenter implementation to set SuggestedPredefines of
    diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    index 4aee16576ebd1..66da79970ca19 100644
    --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
    @@ -372,12 +372,10 @@ class CallEvent {
       ProgramPoint getProgramPoint(bool IsPreVisit = false,
                                    const ProgramPointTag *Tag = nullptr) const;
     
    -  /// Returns a new state with all argument regions invalidated.
    -  ///
    -  /// This accepts an alternate state in case some processing has already
    -  /// occurred.
    +  /// Invalidates the regions (arguments, globals, special regions like 'this')
    +  /// that may have been written by this call, returning the updated state.
       ProgramStateRef invalidateRegions(unsigned BlockCount,
    -                                    ProgramStateRef Orig = nullptr) const;
    +                                    ProgramStateRef State) const;
     
        using FrameBindingTy = std::pair<Loc, SVal>;
        using BindingsTy = SmallVectorImpl<FrameBindingTy>;
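A checker-side sketch (hypothetical callback body) of the new contract, in which the caller now supplies the state explicitly:

    // Conservatively invalidate everything the call may write, then continue
    // exploration from the updated state.
    ProgramStateRef State = C.getState();
    State = Call.invalidateRegions(C.blockCount(), State);
    C.addTransition(State);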
    diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    index f222ded8a966a..ed562f46cfdaa 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
    @@ -154,11 +154,52 @@ class DependencyScanningTool {
       /// Given a compilation context specified via the Clang driver command-line,
       /// gather modular dependencies of module with the given name, and return the
       /// information needed for explicit build.
    +  /// TODO: this method should be removed as soon as Swift and our C-APIs adopt
    +  /// CompilerInstanceWithContext. We are keeping it here so that it is easier
    +  /// to coordinate with Swift and C-API changes.
        llvm::Expected<TranslationUnitDeps> getModuleDependencies(
            StringRef ModuleName, const std::vector<std::string> &CommandLine,
            StringRef CWD, const llvm::DenseSet<ModuleID> &AlreadySeen,
           LookupModuleOutputCallback LookupModuleOutput);
     
     +  /// The following three methods provide a new interface for by-name
     +  /// dependency scanning. The new interface is intended to improve dependency
     +  /// scanning performance when a sequence of names is looked up with the same
     +  /// current working directory and command line.
    +
     +  /// @brief Initializes the context and the compiler instance.
     +  ///        This method must be called before calling
     +  ///        computeDependenciesByNameWithContext.
     +  /// @param CWD The current working directory used during the scan.
     +  /// @param CommandLine The command line used for the scan.
     +  /// @return Error if the initialization fails.
    +  llvm::Error initializeCompilerInstanceWithContext(
     +      StringRef CWD, const std::vector<std::string> &CommandLine);
    +
     +  /// @brief Computes the dependencies for the module named ModuleName.
     +  /// @param ModuleName The name of the module for which this method computes
     +  ///                   dependencies.
    +  /// @param AlreadySeen This stores modules which have previously been
    +  ///                    reported. Use the same instance for all calls to this
    +  ///                    function for a single \c DependencyScanningTool in a
    +  ///                    single build. Note that this parameter is not part of
    +  ///                    the context because it can be shared across different
    +  ///                    worker threads and each worker thread may update it.
    +  /// @param LookupModuleOutput This function is called to fill in
    +  ///                           "-fmodule-file=", "-o" and other output
    +  ///                           arguments for dependencies.
    +  /// @return An instance of \c TranslationUnitDeps if the scan is successful.
    +  ///         Otherwise it returns an error.
     +  llvm::Expected<TranslationUnitDeps> computeDependenciesByNameWithContext(
     +      StringRef ModuleName, const llvm::DenseSet<ModuleID> &AlreadySeen,
    +      LookupModuleOutputCallback LookupModuleOutput);
    +
     +  /// @brief Finalizes the compiler instance: it finalizes the diagnostics
     +  ///        and deletes the compiler instance. Call this method once all
     +  ///        names for the same command line have been scanned.
     +  /// @return Error if an error occurred during finalization.
    +  llvm::Error finalizeCompilerInstanceWithContext();
    +
       llvm::vfs::FileSystem &getWorkerVFS() const { return Worker.getVFS(); }
     
     private:
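An end-to-end sketch of the three-call protocol (hypothetical driver code; Tool, CWD, Args, and LookupOutput are placeholders):

    // Scan several modules that share one working directory and command line.
    llvm::Error scanAll(DependencyScanningTool &Tool, StringRef CWD,
                        const std::vector<std::string> &Args,
                        LookupModuleOutputCallback LookupOutput) {
      if (llvm::Error E = Tool.initializeCompilerInstanceWithContext(CWD, Args))
        return E;
      llvm::DenseSet<ModuleID> Seen;
      for (StringRef Name : {"ModA", "ModB"}) {
        auto Deps =
            Tool.computeDependenciesByNameWithContext(Name, Seen, LookupOutput);
        if (!Deps)
          return Deps.takeError();
        // ... consume *Deps ...
      }
      return Tool.finalizeCompilerInstanceWithContext();
    }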
    diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    index 6060e4b43312e..e2c353a254bf3 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h
    @@ -29,6 +29,7 @@ namespace tooling {
     namespace dependencies {
     
     class DependencyScanningWorkerFilesystem;
    +class CompilerInstanceWithContext;
     
     /// A command-line tool invocation that is part of building a TU.
     ///
    @@ -89,6 +90,8 @@ class DependencyScanningWorker {
       DependencyScanningWorker(DependencyScanningService &Service,
                                 llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
     
    +  ~DependencyScanningWorker();
    +
       /// Run the dependency scanning tool for a given clang driver command-line,
       /// and report the discovered dependencies to the provided consumer. If
       /// TUBuffer is not nullopt, it is used as TU input for the dependency
    @@ -103,18 +106,6 @@ class DependencyScanningWorker {
           DiagnosticConsumer &DiagConsumer,
          std::optional<llvm::MemoryBufferRef> TUBuffer = std::nullopt);
     
    -  /// Run the dependency scanning tool for a given clang driver command-line
    -  /// for a specific module.
    -  ///
    -  /// \returns false if clang errors occurred (with diagnostics reported to
    -  /// \c DiagConsumer), true otherwise.
    -  bool computeDependencies(StringRef WorkingDirectory,
     -                           const std::vector<std::string> &CommandLine,
    -                           DependencyConsumer &DepConsumer,
    -                           DependencyActionController &Controller,
    -                           DiagnosticConsumer &DiagConsumer,
    -                           StringRef ModuleName);
    -
       /// Run the dependency scanning tool for a given clang driver command-line
       /// for a specific translation unit via file system or memory buffer.
       ///
    @@ -125,16 +116,46 @@ class DependencyScanningWorker {
           DependencyConsumer &Consumer, DependencyActionController &Controller,
          std::optional<llvm::MemoryBufferRef> TUBuffer = std::nullopt);
     
    -  /// Run the dependency scanning tool for a given clang driver command-line
    -  /// for a specific module.
    -  ///
    -  /// \returns A \c StringError with the diagnostic output if clang errors
    -  /// occurred, success otherwise.
    -  llvm::Error computeDependencies(StringRef WorkingDirectory,
    -                                  const std::vector &CommandLine,
    -                                  DependencyConsumer &Consumer,
    -                                  DependencyActionController &Controller,
    -                                  StringRef ModuleName);
     +  /// The three methods below implement a new interface for by-name
     +  /// dependency scanning. Together they enable the dependency scanning worker
     +  /// to scan a sequence of modules by name more efficiently when the CWD and
     +  /// command line do not change across the queries.
    +
     +  /// @brief Initializes the context and the compiler instance.
     +  /// @param CWD The current working directory used during the scan.
     +  /// @param CommandLine The command line used for the scan.
     +  /// @return Error if the initialization fails.
    +  llvm::Error initializeCompilerInstanceWithContextOrError(
     +      StringRef CWD, const std::vector<std::string> &CommandLine);
    +
     +  /// @brief Performs dependency scanning for the module whose name is
     +  ///        specified.
     +  /// @param ModuleName  The name of the module whose dependencies will be
     +  ///                    scanned.
    +  /// @param Consumer The dependency consumer that stores the results.
    +  /// @param Controller The controller for the dependency scanning action.
    +  /// @return Error if the scanner incurs errors.
    +  llvm::Error computeDependenciesByNameWithContextOrError(
    +      StringRef ModuleName, DependencyConsumer &Consumer,
    +      DependencyActionController &Controller);
    +
    +  /// @brief Finalizes the diagnostics engine and deletes the compiler instance.
    +  /// @return Error if errors occur during finalization.
    +  llvm::Error finalizeCompilerInstanceWithContextOrError();
    +
+  /// The three methods below provide the same functionality as the
+  /// three methods above. Instead of returning `llvm::Error`s, these
+  /// three methods return a flag indicating whether the call succeeded.
+  /// The initialization function optionally takes a DiagnosticConsumer
+  /// from the client and directs diagnostics to it.
    +  bool initializeCompilerInstanceWithContext(
+      StringRef CWD, const std::vector<std::string> &CommandLine,
    +      DiagnosticConsumer *DC = nullptr);
    +  bool
    +  computeDependenciesByNameWithContext(StringRef ModuleName,
    +                                       DependencyConsumer &Consumer,
    +                                       DependencyActionController &Controller);
    +  bool finalizeCompilerInstance();
     
       llvm::vfs::FileSystem &getVFS() const { return *BaseFS; }
     
    @@ -151,14 +172,16 @@ class DependencyScanningWorker {
       /// (passed in the constructor).
   llvm::IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS;
     
    +  friend CompilerInstanceWithContext;
+  std::unique_ptr<CompilerInstanceWithContext> CIWithContext;
    +
       /// Private helper functions.
       bool scanDependencies(StringRef WorkingDirectory,
                         const std::vector<std::string> &CommandLine,
                             DependencyConsumer &Consumer,
                             DependencyActionController &Controller,
                             DiagnosticConsumer &DC,
-                        llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
-                        std::optional<StringRef> ModuleName);
+                        llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
     };
     
     } // end namespace dependencies
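To make the intended call pattern concrete, here is a minimal usage sketch of the new three-phase interface (hypothetical caller code written for this review; the function name, batch loop, and setup are assumptions, not part of the patch):

    llvm::Error scanModulesByName(DependencyScanningWorker &Worker,
                                  StringRef CWD,
                                  const std::vector<std::string> &CommandLine,
                                  llvm::ArrayRef<std::string> ModuleNames,
                                  DependencyConsumer &Consumer,
                                  DependencyActionController &Controller) {
      // Phase 1: parse the command line and set up the CompilerInstance once.
      if (llvm::Error E = Worker.initializeCompilerInstanceWithContextOrError(
              CWD, CommandLine))
        return E;
      // Phase 2: reuse the same context for every module in the batch.
      for (const std::string &Name : ModuleNames)
        if (llvm::Error E = Worker.computeDependenciesByNameWithContextOrError(
                Name, Consumer, Controller))
          return E;
      // Phase 3: finalize diagnostics and tear down the compiler instance.
      return Worker.finalizeCompilerInstanceWithContextOrError();
    }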
    diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    index 4136cb73f7043..b0a91b60ff6da 100644
    --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
    @@ -288,6 +288,8 @@ class ModuleDepCollector final : public DependencyCollector {
       void attachToPreprocessor(Preprocessor &PP) override;
       void attachToASTReader(ASTReader &R) override;
     
    +  PPCallbacks *getPPCallbacks() { return CollectorPPPtr; }
    +
       /// Apply any changes implied by the discovered dependencies to the given
       /// invocation, (e.g. disable implicit modules, add explicit module paths).
       void applyDiscoveredDependencies(CompilerInvocation &CI);
    @@ -339,6 +341,11 @@ class ModuleDepCollector final : public DependencyCollector {
   std::optional<P1689ModuleInfo> ProvidedStdCXXModule;
   std::vector<std::string> RequiredStdCXXModules;
     
    +  /// A pointer to the preprocessor callback so we can invoke it directly
    +  /// if needed. The callback is created and added to a Preprocessor instance by
    +  /// attachToPreprocessor and the Preprocessor instance owns it.
    +  ModuleDepCollectorPP *CollectorPPPtr = nullptr;
    +
       /// Checks whether the module is known as being prebuilt.
       bool isPrebuiltModule(const Module *M);
     
    diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
    index c5535262ae38c..a11c8683c601e 100644
    --- a/clang/include/module.modulemap
    +++ b/clang/include/module.modulemap
    @@ -146,6 +146,7 @@ module Clang_Lex {
       module * { export * }
     }
     
    +module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } }
     module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } }
     module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } }
     module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } }
    diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
    index 2669f62456711..fab907b9c1a40 100644
    --- a/clang/lib/AST/ASTContext.cpp
    +++ b/clang/lib/AST/ASTContext.cpp
    @@ -3108,9 +3108,9 @@ TypeSourceInfo *ASTContext::CreateTypeSourceInfo(QualType T,
     
     TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T,
                                                          SourceLocation L) const {
    -  TypeSourceInfo *DI = CreateTypeSourceInfo(T);
-  DI->getTypeLoc().initialize(const_cast<ASTContext &>(*this), L);
    -  return DI;
    +  TypeSourceInfo *TSI = CreateTypeSourceInfo(T);
+  TSI->getTypeLoc().initialize(const_cast<ASTContext &>(*this), L);
    +  return TSI;
     }
     
     const ASTRecordLayout &
    @@ -5891,11 +5891,11 @@ TypeSourceInfo *ASTContext::getTemplateSpecializationTypeInfo(
       QualType TST = getTemplateSpecializationType(
           Keyword, Name, SpecifiedArgs.arguments(), CanonicalArgs, Underlying);
     
    -  TypeSourceInfo *DI = CreateTypeSourceInfo(TST);
-  DI->getTypeLoc().castAs<TemplateSpecializationTypeLoc>().set(
    +  TypeSourceInfo *TSI = CreateTypeSourceInfo(TST);
+  TSI->getTypeLoc().castAs<TemplateSpecializationTypeLoc>().set(
           ElaboratedKeywordLoc, QualifierLoc, TemplateKeywordLoc, NameLoc,
           SpecifiedArgs);
    -  return DI;
    +  return TSI;
     }
     
     QualType ASTContext::getTemplateSpecializationType(
    diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
    index bf51c3e42719c..735f3157b694e 100644
    --- a/clang/lib/AST/ASTImporter.cpp
    +++ b/clang/lib/AST/ASTImporter.cpp
    @@ -696,6 +696,10 @@ namespace clang {
         ExpectedStmt VisitCXXFoldExpr(CXXFoldExpr *E);
         ExpectedStmt VisitRequiresExpr(RequiresExpr* E);
         ExpectedStmt VisitConceptSpecializationExpr(ConceptSpecializationExpr* E);
    +    ExpectedStmt
    +    VisitSubstNonTypeTemplateParmPackExpr(SubstNonTypeTemplateParmPackExpr *E);
    +    ExpectedStmt VisitPseudoObjectExpr(PseudoObjectExpr *E);
    +    ExpectedStmt VisitCXXParenListInitExpr(CXXParenListInitExpr *E);
     
         // Helper for chaining together multiple imports. If an error is detected,
         // subsequent imports will return default constructed nodes, so that failure
    @@ -9273,6 +9277,50 @@ ASTNodeImporter::VisitConceptSpecializationExpr(ConceptSpecializationExpr *E) {
           const_cast(CSD), &Satisfaction);
     }
     
    +ExpectedStmt ASTNodeImporter::VisitSubstNonTypeTemplateParmPackExpr(
    +    SubstNonTypeTemplateParmPackExpr *E) {
    +  Error Err = Error::success();
    +  auto ToType = importChecked(Err, E->getType());
    +  auto ToPackLoc = importChecked(Err, E->getParameterPackLocation());
    +  auto ToArgPack = importChecked(Err, E->getArgumentPack());
    +  auto ToAssociatedDecl = importChecked(Err, E->getAssociatedDecl());
    +  if (Err)
    +    return std::move(Err);
    +
    +  return new (Importer.getToContext()) SubstNonTypeTemplateParmPackExpr(
    +      ToType, E->getValueKind(), ToPackLoc, ToArgPack, ToAssociatedDecl,
    +      E->getIndex(), E->getFinal());
    +}
    +
    +ExpectedStmt ASTNodeImporter::VisitPseudoObjectExpr(PseudoObjectExpr *E) {
+  SmallVector<Expr *, 4> ToSemantics(E->getNumSemanticExprs());
    +  if (Error Err = ImportContainerChecked(E->semantics(), ToSemantics))
    +    return std::move(Err);
    +  auto ToSyntOrErr = import(E->getSyntacticForm());
    +  if (!ToSyntOrErr)
    +    return ToSyntOrErr.takeError();
    +  return PseudoObjectExpr::Create(Importer.getToContext(), *ToSyntOrErr,
    +                                  ToSemantics, E->getResultExprIndex());
    +}
    +
    +ExpectedStmt
    +ASTNodeImporter::VisitCXXParenListInitExpr(CXXParenListInitExpr *E) {
    +  Error Err = Error::success();
    +  auto ToType = importChecked(Err, E->getType());
    +  auto ToInitLoc = importChecked(Err, E->getInitLoc());
    +  auto ToBeginLoc = importChecked(Err, E->getBeginLoc());
    +  auto ToEndLoc = importChecked(Err, E->getEndLoc());
    +  if (Err)
    +    return std::move(Err);
    +
+  SmallVector<Expr *, 4> ToArgs(E->getInitExprs().size());
    +  if (Error Err = ImportContainerChecked(E->getInitExprs(), ToArgs))
    +    return std::move(Err);
    +  return CXXParenListInitExpr::Create(Importer.getToContext(), ToArgs, ToType,
    +                                      E->getUserSpecifiedInitExprs().size(),
    +                                      ToInitLoc, ToBeginLoc, ToEndLoc);
    +}
    +
     Error ASTNodeImporter::ImportOverriddenMethods(CXXMethodDecl *ToMethod,
                                                    CXXMethodDecl *FromMethod) {
       Error ImportErrors = Error::success();
    diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
    index 6c088469a3ca2..1243380ca8a6b 100644
    --- a/clang/lib/AST/ByteCode/Compiler.cpp
    +++ b/clang/lib/AST/ByteCode/Compiler.cpp
    @@ -476,8 +476,9 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
         return this->delegate(SubExpr);
     
       case CK_BitCast: {
    +    QualType CETy = CE->getType();
         // Reject bitcasts to atomic types.
    -    if (CE->getType()->isAtomicType()) {
    +    if (CETy->isAtomicType()) {
           if (!this->discard(SubExpr))
             return false;
           return this->emitInvalidCast(CastKind::Reinterpret, /*Fatal=*/true, CE);
    @@ -494,6 +495,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
     
         assert(isPtrType(*FromT));
         assert(isPtrType(*ToT));
    +    bool SrcIsVoidPtr = SubExprTy->isVoidPointerType();
         if (FromT == ToT) {
           if (CE->getType()->isVoidPointerType() &&
               !SubExprTy->isFunctionPointerType()) {
    @@ -502,6 +504,10 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) {
     
           if (!this->visit(SubExpr))
             return false;
    +      if (!this->emitCheckBitCast(CETy->getPointeeType().getTypePtr(),
    +                                  SrcIsVoidPtr, CE))
    +        return false;
    +
           if (CE->getType()->isFunctionPointerType() ||
               SubExprTy->isFunctionPointerType()) {
             return this->emitFnPtrCast(CE);
    @@ -2490,7 +2496,7 @@ bool Compiler::VisitAbstractConditionalOperator(
       };
     
   if (std::optional<bool> BoolValue = getBoolValue(Condition)) {
    -    if (BoolValue)
    +    if (*BoolValue)
           return visitChildExpr(TrueExpr);
         return visitChildExpr(FalseExpr);
       }
    @@ -3217,7 +3223,8 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) {
           return this->visitInitializer(E->getArg(0));
     
         // Zero initialization.
    -    if (E->requiresZeroInitialization()) {
    +    bool ZeroInit = E->requiresZeroInitialization();
    +    if (ZeroInit) {
           const Record *R = getRecord(E->getType());
     
           if (!this->visitZeroRecordInitializer(R, E))
    @@ -3228,6 +3235,19 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) {
             return true;
         }
     
    +    // Avoid materializing a temporary for an elidable copy/move constructor.
    +    if (!ZeroInit && E->isElidable()) {
    +      const Expr *SrcObj = E->getArg(0);
    +      assert(SrcObj->isTemporaryObject(Ctx.getASTContext(), Ctor->getParent()));
    +      assert(Ctx.getASTContext().hasSameUnqualifiedType(E->getType(),
    +                                                        SrcObj->getType()));
+      if (const auto *ME = dyn_cast<MaterializeTemporaryExpr>(SrcObj)) {
    +        if (!this->emitCheckFunctionDecl(Ctor, E))
    +          return false;
    +        return this->visitInitializer(ME->getSubExpr());
    +      }
    +    }
    +
         const Function *Func = getFunction(Ctor);
     
         if (!Func)
    @@ -4157,7 +4177,7 @@ bool Compiler::VisitStmtExpr(const StmtExpr *E) {
   StmtExprScope<Emitter> SS(this);
     
       const CompoundStmt *CS = E->getSubStmt();
    -  const Stmt *Result = CS->getStmtExprResult();
    +  const Stmt *Result = CS->body_back();
       for (const Stmt *S : CS->body()) {
         if (S != Result) {
           if (!this->visitStmt(S))
    @@ -5412,8 +5432,7 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
       unsigned EndIndex = 0;
       // Find the init list.
       for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
    -    if (InitStack[StartIndex].Kind == InitLink::K_InitList ||
    -        InitStack[StartIndex].Kind == InitLink::K_This) {
    +    if (InitStack[StartIndex].Kind == InitLink::K_DIE) {
           EndIndex = StartIndex;
           --StartIndex;
           break;
    @@ -5426,7 +5445,8 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
           continue;
     
         if (InitStack[StartIndex].Kind != InitLink::K_Field &&
    -        InitStack[StartIndex].Kind != InitLink::K_Elem)
    +        InitStack[StartIndex].Kind != InitLink::K_Elem &&
    +        InitStack[StartIndex].Kind != InitLink::K_DIE)
           break;
       }
     
    @@ -5437,7 +5457,8 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) {
     
       // Emit the instructions.
       for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) {
    -    if (InitStack[I].Kind == InitLink::K_InitList)
    +    if (InitStack[I].Kind == InitLink::K_InitList ||
    +        InitStack[I].Kind == InitLink::K_DIE)
           continue;
     if (!InitStack[I].template emit<Emitter>(this, E))
           return false;
    @@ -5989,6 +6010,8 @@ bool Compiler::visitSwitchStmt(const SwitchStmt *S) {
           CaseLabels[SC] = this->getLabel();
     
           const Expr *Value = CS->getLHS();
    +      if (Value->isValueDependent())
    +        return false;
           PrimType ValueT = this->classifyPrim(Value->getType());
     
           // Compare the case statement's value to the switch condition.
    @@ -6306,8 +6329,8 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) {
     
           unsigned FirstLinkOffset =
               R->getField(cast(IFD->chain()[0]))->Offset;
-      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       InitLinkScope<Emitter> ILS(this, InitLink::Field(FirstLinkOffset));
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
           if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr,
                                     IsUnion))
             return false;
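The elidable-constructor path added to VisitCXXConstructExpr skips materializing the temporary and initializes the target directly from the inner expression, with emitCheckFunctionDecl still verifying that the skipped constructor would be callable in a constant expression. A source-level illustration of the pattern (an assumed example, not a test from this patch):

    struct S { int v; };
    // The outer copy construction from the materialized temporary S{1} is
    // elidable, so the interpreter initializes s directly from S{1}.
    constexpr S s = S(S{1});
    static_assert(s.v == 1);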
    diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
    index 5c46f75af4da3..0c6cab9276531 100644
    --- a/clang/lib/AST/ByteCode/Compiler.h
    +++ b/clang/lib/AST/ByteCode/Compiler.h
    @@ -52,12 +52,14 @@ struct InitLink {
         K_Decl = 3,
         K_Elem = 5,
         K_RVO = 6,
    -    K_InitList = 7
    +    K_InitList = 7,
    +    K_DIE = 8,
       };
     
       static InitLink This() { return InitLink{K_This}; }
       static InitLink InitList() { return InitLink{K_InitList}; }
       static InitLink RVO() { return InitLink{K_RVO}; }
    +  static InitLink DIE() { return InitLink{K_DIE}; }
       static InitLink Field(unsigned Offset) {
         InitLink IL{K_Field};
         IL.Offset = Offset;
@@ -668,22 +670,29 @@ template <class Emitter> class InitLinkScope final {
     
       ~InitLinkScope() { this->Ctx->InitStack.pop_back(); }
     
    -private:
    +public:
+  Compiler<Emitter> *Ctx;
     };
     
 template <class Emitter> class InitStackScope final {
 public:
   InitStackScope(Compiler<Emitter> *Ctx, bool Active)
    -      : Ctx(Ctx), OldValue(Ctx->InitStackActive) {
    +      : Ctx(Ctx), OldValue(Ctx->InitStackActive), Active(Active) {
         Ctx->InitStackActive = Active;
    +    if (Active)
    +      Ctx->InitStack.push_back(InitLink::DIE());
       }
     
    -  ~InitStackScope() { this->Ctx->InitStackActive = OldValue; }
    +  ~InitStackScope() {
    +    this->Ctx->InitStackActive = OldValue;
    +    if (Active)
    +      Ctx->InitStack.pop_back();
    +  }
     
     private:
   Compiler<Emitter> *Ctx;
       bool OldValue;
    +  bool Active;
     };
     
     } // namespace interp
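InitStackScope now pushes a K_DIE link while it is active, and VisitCXXThisExpr walks the init chain up to that link when resolving the implicit object. Assuming K_DIE marks default-initializer expressions (CXXDefaultInitExpr), which the `isa<CXXDefaultInitExpr>(InitExpr)` activation condition in compileConstructor suggests, the affected source pattern looks like:

    struct A {
      int x = 1;
      int y = x + 1;  // CXXDefaultInitExpr: reads x through the implicit this
    };
    constexpr A a{};  // evaluating y's initializer resolves this via the init stack
    static_assert(a.y == 2);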
    diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
    index fd0903f2e652c..638028f84ff24 100644
    --- a/clang/lib/AST/ByteCode/Disasm.cpp
    +++ b/clang/lib/AST/ByteCode/Disasm.cpp
    @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset,
     
           FO += ElemDesc->getAllocSize();
         }
    +  } else if (isPrimitiveArray()) {
    +    OS.indent(Spaces) << "Elements: " << getNumElems() << '\n';
    +    OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType())
    +                      << '\n';
    +    unsigned FO = Offset + sizeof(InitMapPtr);
    +    for (unsigned I = 0; I != getNumElems(); ++I) {
    +      OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n';
    +      FO += getElemSize();
    +    }
       } else if (isRecord()) {
         ElemRecord->dump(OS, Indent + 1, Offset);
    +    unsigned I = 0;
    +    for (const Record::Field &F : ElemRecord->fields()) {
    +      OS.indent(Spaces) << "- Field " << I << ": ";
    +      {
    +        ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true});
    +        OS << F.Decl->getName();
    +      }
    +      OS << ". Offset " << (Offset + F.Offset) << "\n";
    +      F.Desc->dumpFull(Offset + F.Offset, Indent + 1);
    +      ++I;
    +    }
       } else if (isPrimitive()) {
       } else {
       }
    diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
    index 659892e720abf..cc918dc12deb6 100644
    --- a/clang/lib/AST/ByteCode/Floating.h
    +++ b/clang/lib/AST/ByteCode/Floating.h
    @@ -45,7 +45,8 @@ class Floating final {
         if (singleWord())
           return APFloat(getSemantics(), APInt(BitWidth, Val));
         unsigned NumWords = numWords();
    -    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
    +    return APFloat(getSemantics(),
    +                   APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)));
       }
     
     public:
    diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
    index 6683db941c736..b11e6eea28e3f 100644
    --- a/clang/lib/AST/ByteCode/IntegralAP.h
    +++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -63,7 +63,7 @@ template <bool Signed> class IntegralAP final {
         if (singleWord())
           return APInt(BitWidth, Val, Signed);
         unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
    -    return llvm::APInt(BitWidth, NumWords, Memory);
    +    return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
       }
     
     public:
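Both this hunk and the Floating.h one replace the word-count/pointer APInt constructor with the ArrayRef overload, which carries the length together with the data. A minimal sketch of the pattern (the helper name is hypothetical):

    #include "llvm/ADT/APInt.h"

    llvm::APInt wordsToAPInt(unsigned BitWidth, const uint64_t *Memory) {
      unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
      // ArrayRef bundles the pointer and the word count into one argument.
      return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
    }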
    diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
    index a2fb0fb331f8a..1f2ae92f6068b 100644
    --- a/clang/lib/AST/ByteCode/Interp.cpp
    +++ b/clang/lib/AST/ByteCode/Interp.cpp
    @@ -919,33 +919,8 @@ bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
       return true;
     }
     
    -static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
    -
    -  if (F->isVirtual() && !S.getLangOpts().CPlusPlus20) {
    -    const SourceLocation &Loc = S.Current->getLocation(OpPC);
    -    S.CCEDiag(Loc, diag::note_constexpr_virtual_call);
    -    return false;
    -  }
    -
    -  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    -    return false;
    -
    -  if (F->isValid() && F->hasBody() && F->isConstexpr())
    -    return true;
    -
    -  const FunctionDecl *DiagDecl = F->getDecl();
    -  const FunctionDecl *Definition = nullptr;
    -  DiagDecl->getBody(Definition);
    -
    -  if (!Definition && S.checkingPotentialConstantExpression() &&
    -      DiagDecl->isConstexpr()) {
    -    return false;
    -  }
    -
    -  // Implicitly constexpr.
    -  if (F->isLambdaStaticInvoker())
    -    return true;
    -
    +static bool diagnoseCallableDecl(InterpState &S, CodePtr OpPC,
    +                                 const FunctionDecl *DiagDecl) {
       // Bail out if the function declaration itself is invalid.  We will
       // have produced a relevant diagnostic while parsing it, so just
       // note the problematic sub-expression.
    @@ -953,11 +928,10 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         return Invalid(S, OpPC);
     
       // Diagnose failed assertions specially.
    -  if (S.Current->getLocation(OpPC).isMacroID() &&
    -      F->getDecl()->getIdentifier()) {
    +  if (S.Current->getLocation(OpPC).isMacroID() && DiagDecl->getIdentifier()) {
         // FIXME: Instead of checking for an implementation-defined function,
         // check and evaluate the assert() macro.
    -    StringRef Name = F->getDecl()->getName();
    +    StringRef Name = DiagDecl->getName();
         bool AssertFailed =
             Name == "__assert_rtn" || Name == "__assert_fail" || Name == "_wassert";
         if (AssertFailed) {
    @@ -1004,7 +978,7 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         // for a constant expression. It might be defined at the point we're
         // actually calling it.
         bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
    -    bool IsDefined = F->isDefined();
    +    bool IsDefined = DiagDecl->isDefined();
         if (!IsDefined && !IsExtern && DiagDecl->isConstexpr() &&
             S.checkingPotentialConstantExpression())
           return false;
    @@ -1027,6 +1001,35 @@ static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
       return false;
     }
     
    +static bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
    +  if (F->isVirtual() && !S.getLangOpts().CPlusPlus20) {
    +    const SourceLocation &Loc = S.Current->getLocation(OpPC);
    +    S.CCEDiag(Loc, diag::note_constexpr_virtual_call);
    +    return false;
    +  }
    +
    +  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    +    return false;
    +
    +  if (F->isValid() && F->hasBody() && F->isConstexpr())
    +    return true;
    +
    +  const FunctionDecl *DiagDecl = F->getDecl();
    +  const FunctionDecl *Definition = nullptr;
    +  DiagDecl->getBody(Definition);
    +
    +  if (!Definition && S.checkingPotentialConstantExpression() &&
    +      DiagDecl->isConstexpr()) {
    +    return false;
    +  }
    +
    +  // Implicitly constexpr.
    +  if (F->isLambdaStaticInvoker())
    +    return true;
    +
    +  return diagnoseCallableDecl(S, OpPC, DiagDecl);
    +}
    +
     static bool CheckCallDepth(InterpState &S, CodePtr OpPC) {
       if ((S.Current->getDepth() + 1) > S.getLangOpts().ConstexprCallDepth) {
         S.FFDiag(S.Current->getSource(OpPC),
    @@ -1500,6 +1503,21 @@ bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
       return CheckActive(S, OpPC, Ptr, AK_Destroy);
     }
     
    +/// Opcode. Check if the function decl can be called at compile time.
    +bool CheckFunctionDecl(InterpState &S, CodePtr OpPC, const FunctionDecl *FD) {
    +  if (S.checkingPotentialConstantExpression() && S.Current->getDepth() != 0)
    +    return false;
    +
    +  const FunctionDecl *Definition = nullptr;
    +  const Stmt *Body = FD->getBody(Definition);
    +
    +  if (Definition && Body &&
    +      (Definition->isConstexpr() || Definition->hasAttr()))
    +    return true;
    +
    +  return diagnoseCallableDecl(S, OpPC, FD);
    +}
    +
     static void compileFunction(InterpState &S, const Function *Func) {
       const FunctionDecl *Definition = Func->getDecl()->getDefinition();
       if (!Definition)
    diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
    index 5ab9c8ee75a51..cbd60c9f2b37c 100644
    --- a/clang/lib/AST/ByteCode/Interp.h
    +++ b/clang/lib/AST/ByteCode/Interp.h
    @@ -117,6 +117,7 @@ bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits,
                       bool TargetIsUCharOrByte);
     bool CheckBCPResult(InterpState &S, const Pointer &Ptr);
     bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
    +bool CheckFunctionDecl(InterpState &S, CodePtr OpPC, const FunctionDecl *FD);
     
     bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC,
                                   const FixedPoint &FP);
    @@ -1915,6 +1916,9 @@ bool Load(InterpState &S, CodePtr OpPC) {
         return false;
       if (!Ptr.isBlockPointer())
         return false;
    +  if (const Descriptor *D = Ptr.getFieldDesc();
    +      !(D->isPrimitive() || D->isPrimitiveArray()) || D->getPrimType() != Name)
    +    return false;
   S.Stk.push<T>(Ptr.deref<T>());
       return true;
     }
    @@ -1926,6 +1930,9 @@ bool LoadPop(InterpState &S, CodePtr OpPC) {
         return false;
       if (!Ptr.isBlockPointer())
         return false;
    +  if (const Descriptor *D = Ptr.getFieldDesc();
    +      !(D->isPrimitive() || D->isPrimitiveArray()) || D->getPrimType() != Name)
    +    return false;
   S.Stk.push<T>(Ptr.deref<T>());
       return true;
     }
    @@ -3283,17 +3290,69 @@ inline bool SideEffect(InterpState &S, CodePtr OpPC) {
       return S.noteSideEffect();
     }
     
    +inline bool CheckBitCast(InterpState &S, CodePtr OpPC, const Type *TargetType,
    +                         bool SrcIsVoidPtr) {
+  const auto &Ptr = S.Stk.peek<Pointer>();
    +  if (Ptr.isZero())
    +    return true;
    +  if (!Ptr.isBlockPointer())
    +    return true;
    +
    +  if (TargetType->isIntegerType())
    +    return true;
    +
    +  if (SrcIsVoidPtr && S.getLangOpts().CPlusPlus) {
    +    bool HasValidResult = !Ptr.isZero();
    +
    +    if (HasValidResult) {
    +      if (S.getStdAllocatorCaller("allocate"))
    +        return true;
    +
+      const auto *E = cast<CastExpr>(S.Current->getExpr(OpPC));
    +      if (S.getLangOpts().CPlusPlus26 &&
    +          S.getASTContext().hasSimilarType(Ptr.getType(),
    +                                           QualType(TargetType, 0)))
    +        return true;
    +
    +      S.CCEDiag(E, diag::note_constexpr_invalid_void_star_cast)
    +          << E->getSubExpr()->getType() << S.getLangOpts().CPlusPlus26
    +          << Ptr.getType().getCanonicalType() << E->getType()->getPointeeType();
    +    } else if (!S.getLangOpts().CPlusPlus26) {
    +      const SourceInfo &E = S.Current->getSource(OpPC);
    +      S.CCEDiag(E, diag::note_constexpr_invalid_cast)
    +          << diag::ConstexprInvalidCastKind::CastFrom << "'void *'"
    +          << S.Current->getRange(OpPC);
    +    }
    +  }
    +
    +  QualType PtrType = Ptr.getType();
    +  if (PtrType->isRecordType() &&
    +      PtrType->getAsRecordDecl() != TargetType->getAsRecordDecl()) {
    +    S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast)
    +        << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret
    +        << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
    +    return false;
    +  }
    +  return true;
    +}
    +
     /// Same here, but only for casts.
     inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind,
                             bool Fatal) {
       const SourceLocation &Loc = S.Current->getLocation(OpPC);
     
    -  if (Kind == CastKind::Reinterpret) {
    +  switch (Kind) {
    +  case CastKind::Reinterpret:
         S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
    -        << static_cast(Kind) << S.Current->getRange(OpPC);
    +        << diag::ConstexprInvalidCastKind::Reinterpret
    +        << S.Current->getRange(OpPC);
         return !Fatal;
    -  }
    -  if (Kind == CastKind::Volatile) {
    +  case CastKind::ReinterpretLike:
    +    S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
    +        << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret
    +        << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
    +    return !Fatal;
    +  case CastKind::Volatile:
         if (!S.checkingPotentialConstantExpression()) {
           const auto *E = cast(S.Current->getExpr(OpPC));
           if (S.getLangOpts().CPlusPlus)
    @@ -3304,14 +3363,13 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind,
         }
     
         return false;
    -  }
    -  if (Kind == CastKind::Dynamic) {
    +  case CastKind::Dynamic:
         assert(!S.getLangOpts().CPlusPlus20);
    -    S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast)
    +    S.CCEDiag(Loc, diag::note_constexpr_invalid_cast)
             << diag::ConstexprInvalidCastKind::Dynamic;
         return true;
       }
    -
    +  llvm_unreachable("Unhandled CastKind");
       return false;
     }
     
    diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    index 8b57b963c538f..6c7b2f502cc51 100644
    --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
    @@ -2841,76 +2841,6 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
       return true;
     }
     
    -static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
    -                                        const CallExpr *Call) {
    -  assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
-  const Pointer &Control = S.Stk.pop<Pointer>();
-  const Pointer &Src = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -
    -  unsigned NumElems = Dst.getNumElems();
    -  assert(NumElems == Control.getNumElems());
    -  assert(NumElems == Dst.getNumElems());
    -
    -  for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
    -    uint8_t Ctlb = static_cast(Control.elem(Idx));
    -
    -    if (Ctlb & 0x80) {
    -      Dst.elem(Idx) = 0;
    -    } else {
    -      unsigned LaneBase = (Idx / 16) * 16;
    -      unsigned SrcOffset = Ctlb & 0x0F;
    -      unsigned SrcIdx = LaneBase + SrcOffset;
    -
    -      Dst.elem(Idx) = Src.elem(SrcIdx);
    -    }
    -  }
    -  Dst.initializeAllElements();
    -  return true;
    -}
    -
    -static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
    -                                       const CallExpr *Call, bool IsShufHW) {
    -  assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
    -  APSInt ControlImm = popToAPSInt(S, Call->getArg(1));
-  const Pointer &Src = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -
    -  unsigned NumElems = Dst.getNumElems();
    -  PrimType ElemT = Dst.getFieldDesc()->getPrimType();
    -
    -  unsigned ElemBits = static_cast(primSize(ElemT) * 8);
    -  if (ElemBits != 16 && ElemBits != 32)
    -    return false;
    -
    -  unsigned LaneElts = 128u / ElemBits;
    -  assert(LaneElts && (NumElems % LaneElts == 0));
    -
    -  uint8_t Ctl = static_cast(ControlImm.getZExtValue());
    -
    -  for (unsigned Idx = 0; Idx != NumElems; Idx++) {
    -    unsigned LaneBase = (Idx / LaneElts) * LaneElts;
    -    unsigned LaneIdx = Idx % LaneElts;
    -    unsigned SrcIdx = Idx;
    -    unsigned Sel = (Ctl >> (2 * (LaneIdx & 0x3))) & 0x3;
    -    if (ElemBits == 32) {
    -      SrcIdx = LaneBase + Sel;
    -    } else {
    -      constexpr unsigned HalfSize = 4;
    -      bool InHigh = LaneIdx >= HalfSize;
    -      if (!IsShufHW && !InHigh) {
    -        SrcIdx = LaneBase + Sel;
    -      } else if (IsShufHW && InHigh) {
    -        SrcIdx = LaneBase + HalfSize + Sel;
    -      }
    -    }
    -
-    INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(Idx) = Src.elem<T>(SrcIdx); });
    -  }
    -  Dst.initializeAllElements();
    -  return true;
    -}
    -
     static bool interp__builtin_ia32_test_op(
         InterpState &S, CodePtr OpPC, const CallExpr *Call,
         llvm::function_ref Fn) {
    @@ -3377,56 +3307,70 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC,
       return true;
     }
     
    -static bool interp__builtin_x86_byteshift(
    -    InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID,
-    llvm::function_ref<APInt(const Pointer &, unsigned, unsigned, unsigned)>
-        Fn) {
    -  assert(Call->getNumArgs() == 2);
    -
    -  APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
    -  uint64_t Shift = ImmAPS.getZExtValue() & 0xff;
    -
-  const Pointer &Src = S.Stk.pop<Pointer>();
    -  if (!Src.getFieldDesc()->isPrimitiveArray())
    -    return false;
    -
    -  unsigned NumElems = Src.getNumElems();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
    -  PrimType ElemT = Src.getFieldDesc()->getPrimType();
    -
    -  for (unsigned Lane = 0; Lane != NumElems; Lane += 16) {
    -    for (unsigned I = 0; I != 16; ++I) {
    -      unsigned Base = Lane + I;
    -      APSInt Result = APSInt(Fn(Src, Lane, I, Shift));
    -      INT_TYPE_SWITCH_NO_BOOL(ElemT,
-                              { Dst.elem<T>(Base) = static_cast<T>(Result); });
    -    }
    -  }
    -
    -  Dst.initializeAllElements();
    -
    -  return true;
    -}
    -
     static bool interp__builtin_ia32_shuffle_generic(
         InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
             GetSourceIndex) {
     
    -  assert(Call->getNumArgs() == 3);
    -  unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
    +  assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3);
    +
    +  unsigned ShuffleMask = 0;
    +  Pointer A, MaskVector, B;
    +  bool IsVectorMask = false;
    +  bool IsSingleOperand = (Call->getNumArgs() == 2);
    +
    +  if (IsSingleOperand) {
    +    QualType MaskType = Call->getArg(1)->getType();
    +    if (MaskType->isVectorType()) {
    +      IsVectorMask = true;
+      MaskVector = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
+      B = A;
+    } else if (MaskType->isIntegerType()) {
+      ShuffleMask = popToAPSInt(S, Call->getArg(1)).getZExtValue();
+      A = S.Stk.pop<Pointer>();
+      B = A;
+    } else {
+      return false;
+    }
+  } else {
+    QualType Arg2Type = Call->getArg(2)->getType();
+    if (Arg2Type->isVectorType()) {
+      IsVectorMask = true;
+      B = S.Stk.pop<Pointer>();
+      MaskVector = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
+    } else if (Arg2Type->isIntegerType()) {
+      ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+      B = S.Stk.pop<Pointer>();
+      A = S.Stk.pop<Pointer>();
    +    } else {
    +      return false;
    +    }
    +  }
     
       QualType Arg0Type = Call->getArg(0)->getType();
   const auto *VecT = Arg0Type->castAs<VectorType>();
       PrimType ElemT = *S.getContext().classify(VecT->getElementType());
       unsigned NumElems = VecT->getNumElements();
     
-  const Pointer &B = S.Stk.pop<Pointer>();
-  const Pointer &A = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
     
    +  PrimType MaskElemT = PT_Uint32;
    +  if (IsVectorMask) {
    +    QualType Arg1Type = Call->getArg(1)->getType();
+    const auto *MaskVecT = Arg1Type->castAs<VectorType>();
    +    QualType MaskElemType = MaskVecT->getElementType();
    +    MaskElemT = *S.getContext().classify(MaskElemType);
    +  }
    +
       for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
    +    if (IsVectorMask) {
    +      INT_TYPE_SWITCH(MaskElemT, {
+        ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
    +      });
    +    }
    +
         auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
     
         if (SrcIdx < 0) {
    @@ -3803,6 +3747,42 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
               return Result;
             });
     
    +  case clang::X86::BI__builtin_ia32_ktestcqi:
    +  case clang::X86::BI__builtin_ia32_ktestchi:
    +  case clang::X86::BI__builtin_ia32_ktestcsi:
    +  case clang::X86::BI__builtin_ia32_ktestcdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (~A & B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_ktestzqi:
    +  case clang::X86::BI__builtin_ia32_ktestzhi:
    +  case clang::X86::BI__builtin_ia32_ktestzsi:
    +  case clang::X86::BI__builtin_ia32_ktestzdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (A & B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_kortestcqi:
    +  case clang::X86::BI__builtin_ia32_kortestchi:
    +  case clang::X86::BI__builtin_ia32_kortestcsi:
    +  case clang::X86::BI__builtin_ia32_kortestcdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, ~(A | B) == 0);
    +        });
    +
    +  case clang::X86::BI__builtin_ia32_kortestzqi:
    +  case clang::X86::BI__builtin_ia32_kortestzhi:
    +  case clang::X86::BI__builtin_ia32_kortestzsi:
    +  case clang::X86::BI__builtin_ia32_kortestzdi:
    +    return interp__builtin_elementwise_int_binop(
    +        S, OpPC, Call, [](const APSInt &A, const APSInt &B) {
    +          return APInt(sizeof(unsigned char) * 8, (A | B) == 0);
    +        });
    +
       case clang::X86::BI__builtin_ia32_lzcnt_u16:
       case clang::X86::BI__builtin_ia32_lzcnt_u32:
       case clang::X86::BI__builtin_ia32_lzcnt_u64:
    @@ -4434,25 +4414,115 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
             return std::pair{0, static_cast<int>(DstIdx)};
               }
             });
    +  case X86::BI__builtin_ia32_vpermi2varq128:
    +  case X86::BI__builtin_ia32_vpermi2varpd128:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x1;
    +          unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2vard128:
    +  case X86::BI__builtin_ia32_vpermi2varps128:
    +  case X86::BI__builtin_ia32_vpermi2varq256:
    +  case X86::BI__builtin_ia32_vpermi2varpd256:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x3;
    +          unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varhi128:
    +  case X86::BI__builtin_ia32_vpermi2vard256:
    +  case X86::BI__builtin_ia32_vpermi2varps256:
    +  case X86::BI__builtin_ia32_vpermi2varq512:
    +  case X86::BI__builtin_ia32_vpermi2varpd512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x7;
    +          unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi128:
    +  case X86::BI__builtin_ia32_vpermi2varhi256:
    +  case X86::BI__builtin_ia32_vpermi2vard512:
    +  case X86::BI__builtin_ia32_vpermi2varps512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0xF;
    +          unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi256:
    +  case X86::BI__builtin_ia32_vpermi2varhi512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x1F;
    +          unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
    +  case X86::BI__builtin_ia32_vpermi2varqi512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          int Offset = ShuffleMask & 0x3F;
    +          unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
    +          return std::pair{SrcIdx, Offset};
    +        });
       case X86::BI__builtin_ia32_pshufb128:
       case X86::BI__builtin_ia32_pshufb256:
       case X86::BI__builtin_ia32_pshufb512:
    -    return interp__builtin_ia32_pshufb(S, OpPC, Call);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          uint8_t Ctlb = static_cast<uint8_t>(ShuffleMask);
    +          if (Ctlb & 0x80)
    +            return std::make_pair(0, -1);
    +
    +          unsigned LaneBase = (DstIdx / 16) * 16;
    +          unsigned SrcOffset = Ctlb & 0x0F;
    +          unsigned SrcIdx = LaneBase + SrcOffset;
+          return std::make_pair(0, static_cast<int>(SrcIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshuflw:
       case X86::BI__builtin_ia32_pshuflw256:
       case X86::BI__builtin_ia32_pshuflw512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 8) * 8;
    +          unsigned LaneIdx = DstIdx % 8;
+          if (LaneIdx < 4) {
+            unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3;
+            return std::make_pair(0, static_cast<int>(LaneBase + Sel));
+          }
+
+          return std::make_pair(0, static_cast<int>(DstIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshufhw:
       case X86::BI__builtin_ia32_pshufhw256:
       case X86::BI__builtin_ia32_pshufhw512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, true);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 8) * 8;
    +          unsigned LaneIdx = DstIdx % 8;
+          if (LaneIdx >= 4) {
+            unsigned Sel = (ShuffleMask >> (2 * (LaneIdx - 4))) & 0x3;
+            return std::make_pair(0, static_cast<int>(LaneBase + 4 + Sel));
+          }
+
+          return std::make_pair(0, static_cast<int>(DstIdx));
    +        });
     
       case X86::BI__builtin_ia32_pshufd:
       case X86::BI__builtin_ia32_pshufd256:
       case X86::BI__builtin_ia32_pshufd512:
    -    return interp__builtin_ia32_pshuf(S, OpPC, Call, false);
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
    +          unsigned LaneBase = (DstIdx / 4) * 4;
    +          unsigned LaneIdx = DstIdx % 4;
    +          unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3;
+          return std::make_pair(0, static_cast<int>(LaneBase + Sel));
    +        });
     
       case X86::BI__builtin_ia32_kandqi:
       case X86::BI__builtin_ia32_kandhi:
    @@ -4610,13 +4680,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         // The lane width is hardcoded to 16 to match the SIMD register size,
         // but the algorithm processes one byte per iteration,
         // so APInt(8, ...) is correct and intentional.
    -    return interp__builtin_x86_byteshift(
    -        S, OpPC, Call, BuiltinID,
    -        [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) {
    -          if (I < Shift) {
    -            return APInt(8, 0);
    -          }
    -          return APInt(8, Src.elem(Lane + I - Shift));
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
+          unsigned LaneBase = (DstIdx / 16) * 16;
+          unsigned LaneIdx = DstIdx % 16;
+          if (LaneIdx < Shift)
+            return std::make_pair(0, -1);
+
+          return std::make_pair(0,
+                                static_cast<int>(LaneBase + LaneIdx - Shift));
             });
     
       case X86::BI__builtin_ia32_psrldqi128_byteshift:
    @@ -4626,14 +4699,40 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         // The lane width is hardcoded to 16 to match the SIMD register size,
         // but the algorithm processes one byte per iteration,
         // so APInt(8, ...) is correct and intentional.
    -    return interp__builtin_x86_byteshift(
    -        S, OpPC, Call, BuiltinID,
    -        [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) {
    -          if (I + Shift < 16) {
    -            return APInt(8, Src.elem(Lane + I + Shift));
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
+          unsigned LaneBase = (DstIdx / 16) * 16;
+          unsigned LaneIdx = DstIdx % 16;
+          if (LaneIdx + Shift < 16)
+            return std::make_pair(0,
+                                  static_cast<int>(LaneBase + LaneIdx + Shift));
+
+          return std::make_pair(0, -1);
    +        });
    +
    +  case X86::BI__builtin_ia32_palignr128:
    +  case X86::BI__builtin_ia32_palignr256:
    +  case X86::BI__builtin_ia32_palignr512:
    +    return interp__builtin_ia32_shuffle_generic(
    +        S, OpPC, Call, [](unsigned DstIdx, unsigned Shift) {
    +          // Default to -1 → zero-fill this destination element
    +          unsigned VecIdx = 1;
    +          int ElemIdx = -1;
    +
    +          int Lane = DstIdx / 16;
    +          int Offset = DstIdx % 16;
    +
    +          // Elements come from VecB first, then VecA after the shift boundary
    +          unsigned ShiftedIdx = Offset + (Shift & 0xFF);
    +          if (ShiftedIdx < 16) { // from VecB
    +            ElemIdx = ShiftedIdx + (Lane * 16);
    +          } else if (ShiftedIdx < 32) { // from VecA
    +            VecIdx = 0;
    +            ElemIdx = (ShiftedIdx - 16) + (Lane * 16);
               }
     
    -          return APInt(8, 0);
    +          return std::pair{VecIdx, ElemIdx};
             });
     
       default:
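For readers checking the palignr lambda, its index math can be restated as a standalone function over one 16-byte lane (a re-statement for illustration, not code from this patch):

    #include <utility>

    std::pair<unsigned, int> palignrSource(unsigned DstIdx, unsigned Shift) {
      unsigned Lane = DstIdx / 16, Offset = DstIdx % 16;
      unsigned ShiftedIdx = Offset + (Shift & 0xFF);
      if (ShiftedIdx < 16)  // low part of the concatenation: from VecB
        return {1, static_cast<int>(ShiftedIdx + Lane * 16)};
      if (ShiftedIdx < 32)  // high part: from VecA
        return {0, static_cast<int>(ShiftedIdx - 16 + Lane * 16)};
      return {1, -1};       // shifted past both sources: zero-fill
    }

For example, with Shift = 4, destination byte 0 reads VecB[4]; byte 12 reads ShiftedIdx = 16, i.e. VecA[0]; and any Shift >= 32 zero-fills the whole lane.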
    diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
    index 1c17ad9e95d05..1785fcf4a7b20 100644
    --- a/clang/lib/AST/ByteCode/Opcodes.td
    +++ b/clang/lib/AST/ByteCode/Opcodes.td
    @@ -53,6 +53,7 @@ def ArgBool : ArgType { let Name = "bool"; }
     def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
     
     def ArgFunction : ArgType { let Name = "const Function *"; }
    +def ArgFunctionDecl : ArgType { let Name = "const FunctionDecl *"; }
     def ArgRecordDecl : ArgType { let Name = "const RecordDecl *"; }
     def ArgRecordField : ArgType { let Name = "const Record::Field *"; }
     def ArgFltSemantics : ArgType { let Name = "const llvm::fltSemantics *"; }
    @@ -421,6 +422,8 @@ def CheckLiteralType : Opcode {
     }
     
     def CheckArraySize : Opcode { let Args = [ArgUint64]; }
    +def CheckFunctionDecl : Opcode { let Args = [ArgFunctionDecl]; }
    +def CheckBitCast : Opcode { let Args = [ArgTypePtr, ArgBool]; }
     
     // [] -> [Value]
     def GetGlobal : AccessOpcode;
    diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
    index 54fd39ac6fcc8..f0454b484ff98 100644
    --- a/clang/lib/AST/ByteCode/PrimType.h
    +++ b/clang/lib/AST/ByteCode/PrimType.h
    @@ -101,6 +101,7 @@ inline constexpr bool isSignedType(PrimType T) {
     
     enum class CastKind : uint8_t {
       Reinterpret,
    +  ReinterpretLike,
       Volatile,
       Dynamic,
     };
    @@ -111,6 +112,9 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
       case interp::CastKind::Reinterpret:
         OS << "reinterpret_cast";
         break;
    +  case interp::CastKind::ReinterpretLike:
    +    OS << "reinterpret_like";
    +    break;
       case interp::CastKind::Volatile:
         OS << "volatile";
         break;
    diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
    index e0b2852f0e906..c468303efea7e 100644
    --- a/clang/lib/AST/ByteCode/Program.cpp
    +++ b/clang/lib/AST/ByteCode/Program.cpp
    @@ -36,30 +36,19 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) {
       const size_t BitWidth = CharWidth * Ctx.getCharBit();
       unsigned StringLength = S->getLength();
     
    -  PrimType CharType;
    -  switch (CharWidth) {
    -  case 1:
    -    CharType = PT_Sint8;
    -    break;
    -  case 2:
    -    CharType = PT_Uint16;
    -    break;
    -  case 4:
    -    CharType = PT_Uint32;
    -    break;
    -  default:
    -    llvm_unreachable("unsupported character width");
    -  }
    +  OptPrimType CharType =
    +      Ctx.classify(S->getType()->castAsArrayTypeUnsafe()->getElementType());
    +  assert(CharType);
     
       if (!Base)
         Base = S;
     
       // Create a descriptor for the string.
    -  Descriptor *Desc =
    -      allocateDescriptor(Base, CharType, Descriptor::GlobalMD, StringLength + 1,
    -                         /*isConst=*/true,
    -                         /*isTemporary=*/false,
    -                         /*isMutable=*/false);
    +  Descriptor *Desc = allocateDescriptor(Base, *CharType, Descriptor::GlobalMD,
    +                                        StringLength + 1,
    +                                        /*isConst=*/true,
    +                                        /*isTemporary=*/false,
    +                                        /*isMutable=*/false);
     
       // Allocate storage for the string.
       // The byte length does not include the null terminator.
    @@ -79,26 +68,9 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) {
       } else {
         // Construct the string in storage.
         for (unsigned I = 0; I <= StringLength; ++I) {
    -      const uint32_t CodePoint = I == StringLength ? 0 : S->getCodeUnit(I);
    -      switch (CharType) {
-      case PT_Sint8: {
-        using T = PrimConv<PT_Sint8>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
-      case PT_Uint16: {
-        using T = PrimConv<PT_Uint16>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
-      case PT_Uint32: {
-        using T = PrimConv<PT_Uint32>::T;
-        Ptr.elem<T>(I) = T::from(CodePoint, BitWidth);
-        break;
-      }
    -      default:
    -        llvm_unreachable("unsupported character type");
    -      }
    +      uint32_t CodePoint = I == StringLength ? 0 : S->getCodeUnit(I);
    +      INT_TYPE_SWITCH_NO_BOOL(*CharType,
+                              Ptr.elem<T>(I) = T::from(CodePoint, BitWidth););
         }
       }
       Ptr.initializeAllElements();
    @@ -218,21 +190,43 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) {
         return std::nullopt;
     
       Global *NewGlobal = Globals[*Idx];
    +  // Note that this loop has one iteration where Redecl == VD.
       for (const Decl *Redecl : VD->redecls()) {
    -    unsigned &PIdx = GlobalIndices[Redecl];
    +
    +    // If this redecl was registered as a dummy variable, it is now a proper
    +    // global variable and points to the block we just created.
    +    if (auto DummyIt = DummyVariables.find(Redecl);
    +        DummyIt != DummyVariables.end()) {
    +      Global *Dummy = Globals[DummyIt->second];
    +      Dummy->block()->movePointersTo(NewGlobal->block());
    +      Globals[DummyIt->second] = NewGlobal;
    +      DummyVariables.erase(DummyIt);
    +    }
+    // If the redeclaration hasn't been registered at all yet, we just set its
+    // global index to Idx. If it has already been registered, it might have
+    // pointers pointing to it, and we need to transfer those pointers to the
+    // new block.
    +    auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl);
    +    if (Inserted) {
    +      GlobalIndices[Redecl] = *Idx;
    +      continue;
    +    }
    +
         if (Redecl != VD) {
    -      if (Block *RedeclBlock = Globals[PIdx]->block();
    +      if (Block *RedeclBlock = Globals[Iter->second]->block();
               RedeclBlock->isExtern()) {
    -        Globals[PIdx] = NewGlobal;
    +
             // All pointers pointing to the previous extern decl now point to the
             // new decl.
             // A previous iteration might've already fixed up the pointers for this
             // global.
             if (RedeclBlock != NewGlobal->block())
               RedeclBlock->movePointersTo(NewGlobal->block());
    +
    +        Globals[Iter->second] = NewGlobal;
           }
         }
    -    PIdx = *Idx;
    +    Iter->second = *Idx;
       }
     
       return *Idx;
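The reworked redeclaration walk matters for source like the following, where a pointer is created while only an extern declaration exists (possibly registered as a dummy) and must be retargeted once the defining declaration produces the real global (illustrative example, not a test from this patch):

    extern const int G;           // may be registered as an extern/dummy block
    constexpr const int *P = &G;  // P points into that block
    const int G = 42;             // the definition creates the proper global
    static_assert(*P == 42);      // P must now resolve through the new block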
    diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
    index 28fcc97f5339d..cc9127dc77860 100644
    --- a/clang/lib/AST/ByteCode/Program.h
    +++ b/clang/lib/AST/ByteCode/Program.h
    @@ -205,7 +205,6 @@ class Program final {
         const Block *block() const { return &B; }
     
       private:
    -    /// Required metadata - does not actually track pointers.
         Block B;
       };
     
    diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp
    index 27ff5ab1f0c6b..d5ba240cb2bde 100644
    --- a/clang/lib/AST/CommentSema.cpp
    +++ b/clang/lib/AST/CommentSema.cpp
    @@ -225,7 +225,7 @@ static ParamCommandPassDirection getParamPassDirection(StringRef Arg) {
   return llvm::StringSwitch<ParamCommandPassDirection>(Arg)
           .Case("[in]", ParamCommandPassDirection::In)
           .Case("[out]", ParamCommandPassDirection::Out)
    -      .Cases("[in,out]", "[out,in]", ParamCommandPassDirection::InOut)
    +      .Cases({"[in,out]", "[out,in]"}, ParamCommandPassDirection::InOut)
       .Default(static_cast<ParamCommandPassDirection>(-1));
     }
     
    diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp
    index e0cf0deb12bd2..638080ea781a9 100644
    --- a/clang/lib/AST/ComputeDependence.cpp
    +++ b/clang/lib/AST/ComputeDependence.cpp
    @@ -178,7 +178,7 @@ ExprDependence clang::computeDependence(StmtExpr *E, unsigned TemplateDepth) {
       auto D = toExprDependenceForImpliedType(E->getType()->getDependence());
       // Propagate dependence of the result.
       if (const auto *CompoundExprResult =
-          dyn_cast_or_null<ValueStmt>(E->getSubStmt()->getStmtExprResult()))
+          dyn_cast_or_null<ValueStmt>(E->getSubStmt()->body_back()))
         if (const Expr *ResultExpr = CompoundExprResult->getExprStmt())
           D |= ResultExpr->getDependence();
       // Note: we treat a statement-expression in a dependent context as always
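Both call sites now take the last statement of the compound body directly rather than going through getStmtExprResult(). The construct involved is a GNU statement expression, whose value is that final statement:

    int f() {
      // The statement expression evaluates to its last body statement, x * 3.
      return ({ int x = 2; x * 3; });  // 6
    }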
    diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
    index 8579e51e45697..eff2b81d61a50 100644
    --- a/clang/lib/AST/Decl.cpp
    +++ b/clang/lib/AST/Decl.cpp
    @@ -3180,7 +3180,7 @@ void FunctionDecl::DefaultedOrDeletedFunctionInfo::setDeletedMessage(
     }
     
     FunctionDecl::DefaultedOrDeletedFunctionInfo *
    -FunctionDecl::getDefalutedOrDeletedInfo() const {
    +FunctionDecl::getDefaultedOrDeletedInfo() const {
       return FunctionDeclBits.HasDefaultedOrDeletedInfo ? DefaultedOrDeletedInfo
                                                         : nullptr;
     }
    diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
    index 97eeba8b9d6cc..1bfea24b228e8 100644
    --- a/clang/lib/AST/ExprConstant.cpp
    +++ b/clang/lib/AST/ExprConstant.cpp
    @@ -3829,6 +3829,351 @@ static bool CheckArraySize(EvalInfo &Info, const ConstantArrayType *CAT,
           /*Diag=*/true);
     }
     
    +static bool handleScalarCast(EvalInfo &Info, const FPOptions FPO, const Expr *E,
    +                             QualType SourceTy, QualType DestTy,
    +                             APValue const &Original, APValue &Result) {
+  // Booleans must be checked before integers,
+  // since isIntegerType() is also true for bool.
    +  if (SourceTy->isBooleanType()) {
    +    if (DestTy->isBooleanType()) {
    +      Result = Original;
    +      return true;
    +    }
    +    if (DestTy->isIntegerType() || DestTy->isRealFloatingType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      QualType IntType = DestTy->isIntegerType()
    +                             ? DestTy
    +                             : Info.Ctx.getIntTypeForBitwidth(64, false);
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, IntType));
    +    }
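     +    // For a bool -> float cast, the branch above materialized the bool as
     +    // a 64-bit integer; the branch below converts that integer to the
     +    // destination floating type.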
    +    if (DestTy->isRealFloatingType()) {
    +      APValue Result2 = APValue(APFloat(0.0));
    +      if (!HandleIntToFloatCast(Info, E, FPO,
    +                                Info.Ctx.getIntTypeForBitwidth(64, false),
    +                                Result.getInt(), DestTy, Result2.getFloat()))
    +        return false;
    +      Result = Result2;
    +    }
    +    return true;
    +  }
    +  if (SourceTy->isIntegerType()) {
    +    if (DestTy->isRealFloatingType()) {
    +      Result = APValue(APFloat(0.0));
    +      return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
    +                                  DestTy, Result.getFloat());
    +    }
    +    if (DestTy->isBooleanType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
    +      return true;
    +    }
    +    if (DestTy->isIntegerType()) {
    +      Result = APValue(
    +          HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
    +      return true;
    +    }
    +  } else if (SourceTy->isRealFloatingType()) {
    +    if (DestTy->isRealFloatingType()) {
    +      Result = Original;
    +      return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
    +                                    Result.getFloat());
    +    }
    +    if (DestTy->isBooleanType()) {
    +      bool BoolResult;
    +      if (!HandleConversionToBool(Original, BoolResult))
    +        return false;
    +      uint64_t IntResult = BoolResult;
    +      Result = APValue(Info.Ctx.MakeIntValue(IntResult, DestTy));
    +      return true;
    +    }
    +    if (DestTy->isIntegerType()) {
    +      Result = APValue(APSInt());
    +      return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
    +                                  DestTy, Result.getInt());
    +    }
    +  }
    +
    +  Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +  return false;
    +}
    +
     +// Do the heavy lifting for casting to aggregate types, where bit-fields
     +// need special handling.
    +static bool constructAggregate(EvalInfo &Info, const FPOptions FPO,
    +                               const Expr *E, APValue &Result,
    +                               QualType ResultType,
     +                               SmallVectorImpl<APValue> &Elements,
     +                               SmallVectorImpl<QualType> &ElTypes) {
    +
     +  SmallVector<std::tuple<APValue *, QualType, unsigned>> WorkList = {
    +      {&Result, ResultType, 0}};
    +
    +  unsigned ElI = 0;
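     +  // Depth-first walk of the destination type: each scalar slot consumes
     +  // the next flattened source element.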
    +  while (!WorkList.empty() && ElI < Elements.size()) {
    +    auto [Res, Type, BitWidth] = WorkList.pop_back_val();
    +
    +    if (Type->isRealFloatingType()) {
    +      if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
    +                            *Res))
    +        return false;
    +      ElI++;
    +      continue;
    +    }
    +    if (Type->isIntegerType()) {
    +      if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], Type, Elements[ElI],
    +                            *Res))
    +        return false;
    +      if (BitWidth > 0) {
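     +        // Bit-field destination: truncate to the declared width, then
     +        // extend back so the value keeps its storage bit width.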
    +        if (!Res->isInt())
    +          return false;
    +        APSInt &Int = Res->getInt();
    +        unsigned OldBitWidth = Int.getBitWidth();
    +        unsigned NewBitWidth = BitWidth;
    +        if (NewBitWidth < OldBitWidth)
    +          Int = Int.trunc(NewBitWidth).extend(OldBitWidth);
    +      }
    +      ElI++;
    +      continue;
    +    }
    +    if (Type->isVectorType()) {
     +      QualType ElTy = Type->castAs<VectorType>()->getElementType();
     +      unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
     +      SmallVector<APValue> Vals(NumEl);
    +      for (unsigned I = 0; I < NumEl; ++I) {
    +        if (!handleScalarCast(Info, FPO, E, ElTypes[ElI], ElTy, Elements[ElI],
    +                              Vals[I]))
    +          return false;
    +        ElI++;
    +      }
    +      *Res = APValue(Vals.data(), NumEl);
    +      continue;
    +    }
    +    if (Type->isConstantArrayType()) {
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
     +                          ->getElementType();
     +      uint64_t Size =
     +          cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
    +      *Res = APValue(APValue::UninitArray(), Size, Size);
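     +      // Push elements in reverse so they pop off the worklist in source
     +      // order.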
    +      for (int64_t I = Size - 1; I > -1; --I)
    +        WorkList.emplace_back(&Res->getArrayInitializedElt(I), ElTy, 0u);
    +      continue;
    +    }
    +    if (Type->isRecordType()) {
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
    +      unsigned NumBases = 0;
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
    +        NumBases = CXXRD->getNumBases();
    +
    +      *Res = APValue(APValue::UninitStruct(), NumBases,
    +                     std::distance(RD->field_begin(), RD->field_end()));
    +
     +      SmallVector<std::tuple<APValue *, QualType, unsigned>> ReverseList;
     +      // We need to traverse backwards.
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          ReverseList.emplace_back(&Res->getStructBase(0), BS.getType(), 0u);
    +        }
    +      }
    +
    +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        unsigned FDBW = 0;
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        if (FD->isBitField()) {
    +          FDBW = FD->getBitWidthValue();
    +        }
    +
    +        ReverseList.emplace_back(&Res->getStructField(FD->getFieldIndex()),
    +                                 FD->getType(), FDBW);
    +      }
    +
    +      std::reverse(ReverseList.begin(), ReverseList.end());
    +      llvm::append_range(WorkList, ReverseList);
    +      continue;
    +    }
    +    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +    return false;
    +  }
    +  return true;
    +}
    +
    +static bool handleElementwiseCast(EvalInfo &Info, const Expr *E,
    +                                  const FPOptions FPO,
     +                                  SmallVectorImpl<APValue> &Elements,
     +                                  SmallVectorImpl<QualType> &SrcTypes,
     +                                  SmallVectorImpl<QualType> &DestTypes,
     +                                  SmallVectorImpl<APValue> &Results) {
    +
    +  assert((Elements.size() == SrcTypes.size()) &&
    +         (Elements.size() == DestTypes.size()));
    +
    +  for (unsigned I = 0, ESz = Elements.size(); I < ESz; ++I) {
    +    APValue Original = Elements[I];
    +    QualType SourceTy = SrcTypes[I];
    +    QualType DestTy = DestTypes[I];
    +
    +    if (!handleScalarCast(Info, FPO, E, SourceTy, DestTy, Original, Results[I]))
    +      return false;
    +  }
    +  return true;
    +}
    +
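     +// Count the scalar (integer, floating-point, or bool) slots a type
     +// flattens to, skipping unnamed bit-fields.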
    +static unsigned elementwiseSize(EvalInfo &Info, QualType BaseTy) {
    +
     +  SmallVector<QualType> WorkList = {BaseTy};
    +
    +  unsigned Size = 0;
    +  while (!WorkList.empty()) {
    +    QualType Type = WorkList.pop_back_val();
    +    if (Type->isRealFloatingType() || Type->isIntegerType() ||
    +        Type->isBooleanType()) {
    +      ++Size;
    +      continue;
    +    }
    +    if (Type->isVectorType()) {
     +      unsigned NumEl = Type->castAs<VectorType>()->getNumElements();
    +      Size += NumEl;
    +      continue;
    +    }
    +    if (Type->isConstantArrayType()) {
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
     +                          ->getElementType();
     +      uint64_t ArrSize =
     +          cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))->getZExtSize();
    +      for (uint64_t I = 0; I < ArrSize; ++I) {
    +        WorkList.push_back(ElTy);
    +      }
    +      continue;
    +    }
    +    if (Type->isRecordType()) {
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          WorkList.push_back(BS.getType());
    +        }
    +      }
    +
     +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        WorkList.push_back(FD->getType());
    +      }
    +      continue;
    +    }
    +  }
    +  return Size;
    +}
    +
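     +// Evaluate the source of an HLSLAggregateSplatCast down to a single
     +// scalar; a one-element vector source is unwrapped to its element.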
    +static bool hlslAggSplatHelper(EvalInfo &Info, const Expr *E, APValue &SrcVal,
    +                               QualType &SrcTy) {
    +  SrcTy = E->getType();
    +
    +  if (!Evaluate(SrcVal, Info, E))
    +    return false;
    +
     +  assert((SrcVal.isFloat() || SrcVal.isInt() ||
     +          (SrcVal.isVector() && SrcVal.getVectorLength() == 1)) &&
     +         "Not a valid HLSLAggregateSplatCast.");
    +
    +  if (SrcVal.isVector()) {
    +    assert(SrcTy->isVectorType() && "Type mismatch.");
     +    SrcTy = SrcTy->castAs<VectorType>()->getElementType();
    +    SrcVal = SrcVal.getVectorElt(0);
    +  }
    +  return true;
    +}
    +
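     +// Flatten an evaluated value into at most Size leaf scalars and their
     +// types, visiting a sole base class before the fields.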
    +static bool flattenAPValue(EvalInfo &Info, const Expr *E, APValue Value,
     +                           QualType BaseTy, SmallVectorImpl<APValue> &Elements,
     +                           SmallVectorImpl<QualType> &Types, unsigned Size) {
    +
     +  SmallVector<std::pair<APValue, QualType>> WorkList = {{Value, BaseTy}};
    +  unsigned Populated = 0;
    +  while (!WorkList.empty() && Populated < Size) {
    +    auto [Work, Type] = WorkList.pop_back_val();
    +
    +    if (Work.isFloat() || Work.isInt()) {
    +      Elements.push_back(Work);
    +      Types.push_back(Type);
    +      Populated++;
    +      continue;
    +    }
    +    if (Work.isVector()) {
    +      assert(Type->isVectorType() && "Type mismatch.");
     +      QualType ElTy = Type->castAs<VectorType>()->getElementType();
    +      for (unsigned I = 0; I < Work.getVectorLength() && Populated < Size;
    +           I++) {
    +        Elements.push_back(Work.getVectorElt(I));
    +        Types.push_back(ElTy);
    +        Populated++;
    +      }
    +      continue;
    +    }
    +    if (Work.isArray()) {
    +      assert(Type->isConstantArrayType() && "Type mismatch.");
     +      QualType ElTy = cast<ConstantArrayType>(Info.Ctx.getAsArrayType(Type))
    +                          ->getElementType();
    +      for (int64_t I = Work.getArraySize() - 1; I > -1; --I) {
    +        WorkList.emplace_back(Work.getArrayInitializedElt(I), ElTy);
    +      }
    +      continue;
    +    }
    +
    +    if (Work.isStruct()) {
    +      assert(Type->isRecordType() && "Type mismatch.");
    +
    +      const RecordDecl *RD = Type->getAsRecordDecl();
    +
     +      SmallVector<std::pair<APValue, QualType>> ReverseList;
    +      // Visit the fields.
    +      for (FieldDecl *FD : RD->fields()) {
    +        if (FD->isUnnamedBitField())
    +          continue;
    +        ReverseList.emplace_back(Work.getStructField(FD->getFieldIndex()),
    +                                 FD->getType());
    +      }
    +
    +      std::reverse(ReverseList.begin(), ReverseList.end());
    +      llvm::append_range(WorkList, ReverseList);
    +
    +      // Visit the base classes.
     +      if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
    +        if (CXXRD->getNumBases() > 0) {
    +          assert(CXXRD->getNumBases() == 1);
    +          const CXXBaseSpecifier &BS = CXXRD->bases_begin()[0];
    +          const APValue &Base = Work.getStructBase(0);
    +
    +          // Can happen in error cases.
    +          if (!Base.isStruct())
    +            return false;
    +
    +          WorkList.emplace_back(Base, BS.getType());
    +        }
    +      }
    +      continue;
    +    }
    +    Info.FFDiag(E, diag::note_invalid_subexpr_in_const_expr);
    +    return false;
    +  }
    +  return true;
    +}
    +
     namespace {
     /// A handle to a complete object (an object that is not a subobject of
     /// another object).
    @@ -4639,6 +4984,30 @@ handleLValueToRValueConversion(EvalInfo &Info, const Expr *Conv, QualType Type,
       return Obj && extractSubobject(Info, Conv, Obj, LVal.Designator, RVal, AK);
     }
     
    +static bool hlslElementwiseCastHelper(EvalInfo &Info, const Expr *E,
    +                                      QualType DestTy,
     +                                      SmallVectorImpl<APValue> &SrcVals,
     +                                      SmallVectorImpl<QualType> &SrcTypes) {
    +  APValue Val;
    +  if (!Evaluate(Val, Info, E))
    +    return false;
    +
     +  // Must be dealing with a record.
    +  if (Val.isLValue()) {
    +    LValue LVal;
    +    LVal.setFrom(Info.Ctx, Val);
    +    if (!handleLValueToRValueConversion(Info, E, E->getType(), LVal, Val))
    +      return false;
    +  }
    +
    +  unsigned NEls = elementwiseSize(Info, DestTy);
     +  // Flatten the source.
    +  if (!flattenAPValue(Info, E, Val, E->getType(), SrcVals, SrcTypes, NEls))
    +    return false;
    +
    +  return true;
    +}
    +
     /// Perform an assignment of Val to LVal. Takes ownership of Val.
     static bool handleAssignment(EvalInfo &Info, const Expr *E, const LValue &LVal,
                                  QualType LValType, APValue &Val) {
    @@ -5452,10 +5821,13 @@ static EvalStmtResult EvaluateSwitch(StmtResult &Result, EvalInfo &Info,
         }
     
          const CaseStmt *CS = cast<CaseStmt>(SC);
    -    APSInt LHS = CS->getLHS()->EvaluateKnownConstInt(Info.Ctx);
    -    APSInt RHS = CS->getRHS() ? CS->getRHS()->EvaluateKnownConstInt(Info.Ctx)
    -                              : LHS;
    -    if (LHS <= Value && Value <= RHS) {
    +    const Expr *LHS = CS->getLHS();
    +    const Expr *RHS = CS->getRHS();
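     +    // EvaluateKnownConstInt cannot be called on value-dependent
     +    // expressions, so give up on the evaluation instead.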
    +    if (LHS->isValueDependent() || (RHS && RHS->isValueDependent()))
    +      return ESR_Failed;
    +    APSInt LHSValue = LHS->EvaluateKnownConstInt(Info.Ctx);
    +    APSInt RHSValue = RHS ? RHS->EvaluateKnownConstInt(Info.Ctx) : LHSValue;
    +    if (LHSValue <= Value && Value <= RHSValue) {
           Found = SC;
           break;
         }
    @@ -8667,6 +9039,25 @@ class ExprEvaluatorBase
         case CK_UserDefinedConversion:
           return StmtVisitorTy::Visit(E->getSubExpr());
     
    +    case CK_HLSLArrayRValue: {
    +      const Expr *SubExpr = E->getSubExpr();
    +      if (!SubExpr->isGLValue()) {
    +        APValue Val;
    +        if (!Evaluate(Val, Info, SubExpr))
    +          return false;
    +        return DerivedSuccess(Val, E);
    +      }
    +
    +      LValue LVal;
    +      if (!EvaluateLValue(SubExpr, LVal, Info))
    +        return false;
    +      APValue RVal;
    +      // Note, we use the subexpression's type in order to retain cv-qualifiers.
    +      if (!handleLValueToRValueConversion(Info, E, SubExpr->getType(), LVal,
    +                                          RVal))
    +        return false;
    +      return DerivedSuccess(RVal, E);
    +    }
         case CK_LValueToRValue: {
           LValue LVal;
           if (!EvaluateLValue(E->getSubExpr(), LVal, Info))
    @@ -10851,6 +11242,42 @@ bool RecordExprEvaluator::VisitCastExpr(const CastExpr *E) {
         Result = *Value;
         return true;
       }
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, E->getSubExpr(), Val, ValTy))
    +      return false;
    +
    +    unsigned NEls = elementwiseSize(Info, E->getType());
     +    // Splat our Val.
     +    SmallVector<APValue> SplatEls(NEls, Val);
     +    SmallVector<QualType> SplatType(NEls, ValTy);
    +
     +    // Cast the elements and construct our struct result.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
    +                            SplatType))
    +      return false;
    +
    +    return true;
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcEls;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, E->getSubExpr(), E->getType(), SrcEls,
    +                                   SrcTypes))
    +      return false;
    +
     +    // Cast the elements and construct our struct result.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
    +                            SrcTypes))
    +      return false;
    +
    +    return true;
    +  }
       }
     }
     
    @@ -11346,6 +11773,38 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
           Elements.push_back(Val.getVectorElt(I));
         return Success(Elements, E);
       }
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
    +      return false;
    +
     +    // Cast our Val once.
    +    APValue Result;
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!handleScalarCast(Info, FPO, E, ValTy, VTy->getElementType(), Val,
    +                          Result))
    +      return false;
    +
     +    SmallVector<APValue> SplatEls(NElts, Result);
    +    return Success(SplatEls, E);
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcVals, SrcTypes))
    +      return false;
    +
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
     +    SmallVector<QualType> DestTypes(NElts, VTy->getElementType());
     +    SmallVector<APValue> ResultEls(NElts);
    +    if (!handleElementwiseCast(Info, E, FPO, SrcVals, SrcTypes, DestTypes,
    +                               ResultEls))
    +      return false;
    +    return Success(ResultEls, E);
    +  }
       default:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
       }
    @@ -11628,28 +12087,75 @@ static bool evalShuffleGeneric(
       if (!VT)
         return false;
     
    -  APSInt MaskImm;
    -  if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
    -    return false;
     -  unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +  unsigned ShuffleMask = 0;
    +  APValue A, MaskVector, B;
    +  bool IsVectorMask = false;
    +  bool IsSingleOperand = (Call->getNumArgs() == 2);
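     +  // Shuffles come in one- and two-operand forms, and the mask is either an
     +  // immediate integer or a per-element control vector.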
     
    -  APValue A, B;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    -      !EvaluateAsRValue(Info, Call->getArg(1), B))
    -    return false;
    +  if (IsSingleOperand) {
    +    QualType MaskType = Call->getArg(1)->getType();
    +    if (MaskType->isVectorType()) {
    +      IsVectorMask = true;
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), MaskVector))
    +        return false;
    +      B = A;
    +    } else if (MaskType->isIntegerType()) {
    +      APSInt MaskImm;
    +      if (!EvaluateInteger(Call->getArg(1), MaskImm, Info))
    +        return false;
     +      ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A))
    +        return false;
    +      B = A;
    +    } else {
    +      return false;
    +    }
    +  } else {
    +    QualType Arg2Type = Call->getArg(2)->getType();
    +    if (Arg2Type->isVectorType()) {
    +      IsVectorMask = true;
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
    +          !EvaluateAsRValue(Info, Call->getArg(2), B))
    +        return false;
    +    } else if (Arg2Type->isIntegerType()) {
    +      APSInt MaskImm;
    +      if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
    +        return false;
     +      ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
    +      if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
    +          !EvaluateAsRValue(Info, Call->getArg(1), B))
    +        return false;
    +    } else {
    +      return false;
    +    }
    +  }
     
       unsigned NumElts = VT->getNumElements();
     -  SmallVector<APValue> ResultElements;
     +  SmallVector<APValue> ResultElements;
       ResultElements.reserve(NumElts);
     
       for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
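     +    // With a vector mask, each destination element has its own control
     +    // value; reload it on every iteration.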
    +    if (IsVectorMask) {
     +      ShuffleMask = static_cast<unsigned>(
    +          MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
    +    }
         auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
     
         if (SrcIdx < 0) {
           // Zero out this element
           QualType ElemTy = VT->getElementType();
    -      ResultElements.push_back(
    -          APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
    +      if (ElemTy->isRealFloatingType()) {
    +        ResultElements.push_back(
    +            APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
    +      } else if (ElemTy->isIntegerType()) {
    +        APValue Zero(Info.Ctx.MakeIntValue(0, ElemTy));
     +        ResultElements.push_back(Zero);
     +      } else {
     +        // Fall back to an empty APValue for other element types.
    +        ResultElements.push_back(APValue());
    +      }
         } else {
           const APValue &Src = (SrcVecIdx == 0) ? A : B;
           ResultElements.push_back(Src.getVectorElt(SrcIdx));
    @@ -11660,98 +12166,6 @@ static bool evalShuffleGeneric(
       return true;
     }
     
    -static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
    -                              APValue &Out) {
    -  APValue SrcVec, ControlVec;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
    -    return false;
    -  if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
    -    return false;
    -
     -  const auto *VT = Call->getType()->getAs<VectorType>();
    -  if (!VT)
    -    return false;
    -
    -  QualType ElemT = VT->getElementType();
    -  unsigned NumElts = VT->getNumElements();
    -
     -  SmallVector<APValue> ResultElements;
    -  ResultElements.reserve(NumElts);
    -
    -  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    -    APValue CtlVal = ControlVec.getVectorElt(Idx);
    -    APSInt CtlByte = CtlVal.getInt();
     -    uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
    -
    -    if (Ctl & 0x80) {
    -      APValue Zero(Info.Ctx.MakeIntValue(0, ElemT));
    -      ResultElements.push_back(Zero);
    -    } else {
    -      unsigned LaneBase = (Idx / 16) * 16;
    -      unsigned SrcOffset = Ctl & 0x0F;
    -      unsigned SrcIdx = LaneBase + SrcOffset;
    -
    -      ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
    -    }
    -  }
    -  Out = APValue(ResultElements.data(), ResultElements.size());
    -  return true;
    -}
    -
    -static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
    -                             bool IsShufHW, APValue &Out) {
    -  APValue Vec;
    -  APSInt Imm;
    -  if (!EvaluateAsRValue(Info, Call->getArg(0), Vec))
    -    return false;
    -  if (!EvaluateInteger(Call->getArg(1), Imm, Info))
    -    return false;
    -
     -  const auto *VT = Call->getType()->getAs<VectorType>();
    -  if (!VT)
    -    return false;
    -
    -  QualType ElemT = VT->getElementType();
    -  unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
    -  unsigned NumElts = VT->getNumElements();
    -
    -  unsigned LaneBits = 128u;
    -  unsigned LaneElts = LaneBits / ElemBits;
    -  if (!LaneElts || (NumElts % LaneElts) != 0)
    -    return false;
    -
     -  uint8_t Ctl = static_cast<uint8_t>(Imm.getZExtValue());
    -
     -  SmallVector<APValue> ResultElements;
    -  ResultElements.reserve(NumElts);
    -
    -  for (unsigned Idx = 0; Idx != NumElts; Idx++) {
    -    unsigned LaneBase = (Idx / LaneElts) * LaneElts;
    -    unsigned LaneIdx = Idx % LaneElts;
    -    unsigned SrcIdx = Idx;
    -    unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3;
    -
    -    if (ElemBits == 32) {
    -      SrcIdx = LaneBase + Sel;
    -    } else {
    -      constexpr unsigned HalfSize = 4;
    -      bool InHigh = LaneIdx >= HalfSize;
    -      if (!IsShufHW && !InHigh) {
    -        SrcIdx = LaneBase + Sel;
    -      } else if (IsShufHW && InHigh) {
    -        unsigned Rel = LaneIdx - HalfSize;
    -        Sel = (Ctl >> (2 * Rel)) & 0x3;
    -        SrcIdx = LaneBase + HalfSize + Sel;
    -      }
    -    }
    -
    -    ResultElements.push_back(Vec.getVectorElt(SrcIdx));
    -  }
    -
    -  Out = APValue(ResultElements.data(), ResultElements.size());
    -  return true;
    -}
    -
     bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       if (!IsConstantEvaluatedBuiltinCall(E))
         return ExprEvaluatorBaseTy::VisitCallExpr(E);
    @@ -12517,7 +12931,19 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufb256:
       case X86::BI__builtin_ia32_pshufb512: {
         APValue R;
    -    if (!evalPshufbBuiltin(Info, E, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
    +            [](unsigned DstIdx,
     +               unsigned ShuffleMask) -> std::pair<unsigned, int> {
     +              uint8_t Ctlb = static_cast<uint8_t>(ShuffleMask);
    +              if (Ctlb & 0x80)
    +                return std::make_pair(0, -1);
    +
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned SrcOffset = Ctlb & 0x0F;
    +              unsigned SrcIdx = LaneBase + SrcOffset;
    +              return std::make_pair(0, static_cast(SrcIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12526,7 +12952,21 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshuflw256:
       case X86::BI__builtin_ia32_pshuflw512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, false, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 16u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              constexpr unsigned HalfSize = 4;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              if (LaneIdx < HalfSize) {
    +                unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3;
    +                return std::make_pair(0, static_cast(LaneBase + Sel));
    +              }
    +              return std::make_pair(0, static_cast(DstIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12535,7 +12975,23 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufhw256:
       case X86::BI__builtin_ia32_pshufhw512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, true, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 16u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              constexpr unsigned HalfSize = 4;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              if (LaneIdx >= HalfSize) {
    +                unsigned Rel = LaneIdx - HalfSize;
    +                unsigned Sel = (Mask >> (2 * Rel)) & 0x3;
    +                return std::make_pair(
    +                    0, static_cast(LaneBase + HalfSize + Sel));
    +              }
    +              return std::make_pair(0, static_cast(DstIdx));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -12544,7 +13000,17 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pshufd256:
       case X86::BI__builtin_ia32_pshufd512: {
         APValue R;
    -    if (!evalPshufBuiltin(Info, E, false, R))
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
    +              constexpr unsigned LaneBits = 128u;
    +              constexpr unsigned ElemBits = 32u;
    +              constexpr unsigned LaneElts = LaneBits / ElemBits;
    +              unsigned LaneBase = (DstIdx / LaneElts) * LaneElts;
    +              unsigned LaneIdx = DstIdx % LaneElts;
    +              unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3;
    +              return std::make_pair(0, static_cast(LaneBase + Sel));
    +            }))
           return false;
         return Success(R, E);
       }
    @@ -13024,61 +13490,144 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       case X86::BI__builtin_ia32_pslldqi128_byteshift:
       case X86::BI__builtin_ia32_pslldqi256_byteshift:
       case X86::BI__builtin_ia32_pslldqi512_byteshift: {
    -    assert(E->getNumArgs() == 2);
    -
    -    APValue Src;
    -    APSInt Imm;
    -    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
    -        !EvaluateInteger(E->getArg(1), Imm, Info))
    +    APValue R;
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned LaneIdx = DstIdx % 16;
    +              if (LaneIdx < Shift)
    +                return std::make_pair(0, -1);
    +
    +              return std::make_pair(
    +                  0, static_cast(LaneBase + LaneIdx - Shift));
    +            }))
           return false;
    -
    -    unsigned VecLen = Src.getVectorLength();
    -    unsigned Shift = Imm.getZExtValue() & 0xff;
    -
     -    SmallVector<APValue> ResultElements;
    -    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
    -      for (unsigned I = 0; I != 16; ++I) {
    -        if (I < Shift) {
    -          APSInt Zero(8, /*isUnsigned=*/true);
    -          Zero = 0;
    -          ResultElements.push_back(APValue(Zero));
    -        } else {
    -          ResultElements.push_back(Src.getVectorElt(Lane + I - Shift));
    -        }
    -      }
    -    }
    -
    -    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
    +    return Success(R, E);
       }
     
       case X86::BI__builtin_ia32_psrldqi128_byteshift:
       case X86::BI__builtin_ia32_psrldqi256_byteshift:
       case X86::BI__builtin_ia32_psrldqi512_byteshift: {
    -    assert(E->getNumArgs() == 2);
    -
    -    APValue Src;
    -    APSInt Imm;
    -    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
    -        !EvaluateInteger(E->getArg(1), Imm, Info))
    +    APValue R;
    +    if (!evalShuffleGeneric(
    +            Info, E, R,
     +            [](unsigned DstIdx, unsigned Shift) -> std::pair<unsigned, int> {
    +              unsigned LaneBase = (DstIdx / 16) * 16;
    +              unsigned LaneIdx = DstIdx % 16;
    +              if (LaneIdx + Shift < 16)
    +                return std::make_pair(
    +                    0, static_cast(LaneBase + LaneIdx + Shift));
    +
    +              return std::make_pair(0, -1);
    +            }))
           return false;
    +    return Success(R, E);
    +  }
     
    -    unsigned VecLen = Src.getVectorLength();
    -    unsigned Shift = Imm.getZExtValue() & 0xff;
    -
     -    SmallVector<APValue> ResultElements;
    -    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
    -      for (unsigned I = 0; I != 16; ++I) {
    -        if (I + Shift < 16) {
    -          ResultElements.push_back(Src.getVectorElt(Lane + I + Shift));
    -        } else {
    -          APSInt Zero(8, /*isUnsigned=*/true);
    -          Zero = 0;
    -          ResultElements.push_back(APValue(Zero));
    -        }
    -      }
    -    }
    +  case X86::BI__builtin_ia32_palignr128:
    +  case X86::BI__builtin_ia32_palignr256:
    +  case X86::BI__builtin_ia32_palignr512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Shift) {
     +          // Default to -1, i.e. zero-fill this destination element.
    +          unsigned VecIdx = 1;
    +          int ElemIdx = -1;
    +
    +          int Lane = DstIdx / 16;
    +          int Offset = DstIdx % 16;
    +
    +          // Elements come from VecB first, then VecA after the shift boundary
    +          unsigned ShiftedIdx = Offset + (Shift & 0xFF);
    +          if (ShiftedIdx < 16) { // from VecB
    +            ElemIdx = ShiftedIdx + (Lane * 16);
    +          } else if (ShiftedIdx < 32) { // from VecA
    +            VecIdx = 0;
    +            ElemIdx = (ShiftedIdx - 16) + (Lane * 16);
    +          }
     
    -    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
    +          return std::pair{VecIdx, ElemIdx};
    +        }))
    +      return false;
    +    return Success(R, E);
    +  }
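     +  // For the vpermi2var cases below, the low mask bits select the element
     +  // index and the next bit selects between the two source vectors.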
    +  case X86::BI__builtin_ia32_vpermi2varq128:
    +  case X86::BI__builtin_ia32_vpermi2varpd128: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x1;
    +                              unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2vard128:
    +  case X86::BI__builtin_ia32_vpermi2varps128:
    +  case X86::BI__builtin_ia32_vpermi2varq256:
    +  case X86::BI__builtin_ia32_vpermi2varpd256: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x3;
    +                              unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varhi128:
    +  case X86::BI__builtin_ia32_vpermi2vard256:
    +  case X86::BI__builtin_ia32_vpermi2varps256:
    +  case X86::BI__builtin_ia32_vpermi2varq512:
    +  case X86::BI__builtin_ia32_vpermi2varpd512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x7;
    +                              unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi128:
    +  case X86::BI__builtin_ia32_vpermi2varhi256:
    +  case X86::BI__builtin_ia32_vpermi2vard512:
    +  case X86::BI__builtin_ia32_vpermi2varps512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0xF;
    +                              unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi256:
    +  case X86::BI__builtin_ia32_vpermi2varhi512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x1F;
    +                              unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
    +  }
    +  case X86::BI__builtin_ia32_vpermi2varqi512: {
    +    APValue R;
    +    if (!evalShuffleGeneric(Info, E, R,
    +                            [](unsigned DstIdx, unsigned ShuffleMask) {
    +                              int Offset = ShuffleMask & 0x3F;
    +                              unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
    +                              return std::pair{SrcIdx, Offset};
    +                            }))
    +      return false;
    +    return Success(R, E);
       }
       }
     }
    @@ -13218,6 +13767,7 @@ namespace {
         bool VisitCallExpr(const CallExpr *E) {
           return handleCallExpr(E, Result, &This);
         }
    +    bool VisitCastExpr(const CastExpr *E);
         bool VisitInitListExpr(const InitListExpr *E,
                                QualType AllocType = QualType());
         bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
    @@ -13288,6 +13838,49 @@ static bool MaybeElementDependentArrayFiller(const Expr *FillerExpr) {
       return true;
     }
     
    +bool ArrayExprEvaluator::VisitCastExpr(const CastExpr *E) {
    +  const Expr *SE = E->getSubExpr();
    +
    +  switch (E->getCastKind()) {
    +  default:
    +    return ExprEvaluatorBaseTy::VisitCastExpr(E);
    +  case CK_HLSLAggregateSplatCast: {
    +    APValue Val;
    +    QualType ValTy;
    +
    +    if (!hlslAggSplatHelper(Info, SE, Val, ValTy))
    +      return false;
    +
    +    unsigned NEls = elementwiseSize(Info, E->getType());
    +
     +    SmallVector<APValue> SplatEls(NEls, Val);
     +    SmallVector<QualType> SplatType(NEls, ValTy);
    +
     +    // Cast the elements.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SplatEls,
    +                            SplatType))
    +      return false;
    +
    +    return true;
    +  }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcEls;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SE, E->getType(), SrcEls, SrcTypes))
    +      return false;
    +
     +    // Cast the elements.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    if (!constructAggregate(Info, FPO, E, Result, E->getType(), SrcEls,
    +                            SrcTypes))
    +      return false;
    +    return true;
    +  }
    +  }
    +}
    +
     bool ArrayExprEvaluator::VisitInitListExpr(const InitListExpr *E,
                                                QualType AllocType) {
       const ConstantArrayType *CAT = Info.Ctx.getAsConstantArrayType(
    @@ -15646,6 +16239,54 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
         return Success(Val, E);
       }
     
    +  case clang::X86::BI__builtin_ia32_ktestcqi:
    +  case clang::X86::BI__builtin_ia32_ktestchi:
    +  case clang::X86::BI__builtin_ia32_ktestcsi:
    +  case clang::X86::BI__builtin_ia32_ktestcdi: {
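     +    // ktestc sets the carry flag when ~A & B == 0, i.e. when B is a
     +    // subset of A.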
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((~A & B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_ktestzqi:
    +  case clang::X86::BI__builtin_ia32_ktestzhi:
    +  case clang::X86::BI__builtin_ia32_ktestzsi:
    +  case clang::X86::BI__builtin_ia32_ktestzdi: {
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((A & B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_kortestcqi:
    +  case clang::X86::BI__builtin_ia32_kortestchi:
    +  case clang::X86::BI__builtin_ia32_kortestcsi:
    +  case clang::X86::BI__builtin_ia32_kortestcdi: {
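     +    // kortestc sets the carry flag when A | B is all ones.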
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success(~(A | B) == 0, E);
    +  }
    +
    +  case clang::X86::BI__builtin_ia32_kortestzqi:
    +  case clang::X86::BI__builtin_ia32_kortestzhi:
    +  case clang::X86::BI__builtin_ia32_kortestzsi:
    +  case clang::X86::BI__builtin_ia32_kortestzdi: {
    +    APSInt A, B;
    +    if (!EvaluateInteger(E->getArg(0), A, Info) ||
    +        !EvaluateInteger(E->getArg(1), B, Info))
    +      return false;
    +
    +    return Success((A | B) == 0, E);
    +  }
    +
       case clang::X86::BI__builtin_ia32_lzcnt_u16:
       case clang::X86::BI__builtin_ia32_lzcnt_u32:
       case clang::X86::BI__builtin_ia32_lzcnt_u64: {
    @@ -17094,7 +17735,6 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
       case CK_NoOp:
       case CK_LValueToRValueBitCast:
       case CK_HLSLArrayRValue:
    -  case CK_HLSLElementwiseCast:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
     
       case CK_MemberPointerToBoolean:
    @@ -17241,6 +17881,21 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) {
           return Error(E);
         return Success(Val.getVectorElt(0), E);
       }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SubExpr, DestType, SrcVals, SrcTypes))
    +      return false;
    +
     +    // Cast our single element.
    +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
    +    APValue ResultVal;
    +    if (!handleScalarCast(Info, FPO, E, SrcTypes[0], DestType, SrcVals[0],
    +                          ResultVal))
    +      return false;
    +    return Success(ResultVal, E);
    +  }
       }
     
       llvm_unreachable("unknown cast resulting in integral value");
    @@ -17778,6 +18433,9 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
       default:
         return ExprEvaluatorBaseTy::VisitCastExpr(E);
     
    +  case CK_HLSLAggregateSplatCast:
    +    llvm_unreachable("invalid cast kind for floating value");
    +
       case CK_IntegralToFloating: {
         APSInt IntResult;
         const FPOptions FPO = E->getFPFeaturesInEffect(
    @@ -17816,6 +18474,23 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) {
           return Error(E);
         return Success(Val.getVectorElt(0), E);
       }
    +  case CK_HLSLElementwiseCast: {
     +    SmallVector<APValue> SrcVals;
     +    SmallVector<QualType> SrcTypes;
    +
    +    if (!hlslElementwiseCastHelper(Info, SubExpr, E->getType(), SrcVals,
    +                                   SrcTypes))
    +      return false;
     +
     +    // Cast our single element.
     +    const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
     +    APValue ResultVal;
     +    if (!handleScalarCast(Info, FPO, E, SrcTypes[0], E->getType(), SrcVals[0],
     +                          ResultVal))
     +      return false;
     +    return Success(ResultVal, E);
    +  }
       }
     }
     
    diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
    index 9f4dba9f14fa6..89abf888cbbba 100644
    --- a/clang/lib/AST/JSONNodeDumper.cpp
    +++ b/clang/lib/AST/JSONNodeDumper.cpp
    @@ -272,15 +272,13 @@ void JSONNodeDumper::writeIncludeStack(PresumedLoc Loc, bool JustFirst) {
       JOS.attributeEnd();
     }
     
    -void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc,
    -                                             bool IsSpelling) {
    +void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc) {
       PresumedLoc Presumed = SM.getPresumedLoc(Loc);
    -  unsigned ActualLine = IsSpelling ? SM.getSpellingLineNumber(Loc)
    -                                   : SM.getExpansionLineNumber(Loc);
    -  StringRef ActualFile = SM.getBufferName(Loc);
    -
       if (Presumed.isValid()) {
    -    JOS.attribute("offset", SM.getDecomposedLoc(Loc).second);
    +    StringRef ActualFile = SM.getBufferName(Loc);
    +    auto [FID, FilePos] = SM.getDecomposedLoc(Loc);
    +    unsigned ActualLine = SM.getLineNumber(FID, FilePos);
    +    JOS.attribute("offset", FilePos);
         if (LastLocFilename != ActualFile) {
           JOS.attribute("file", ActualFile);
           JOS.attribute("line", ActualLine);
    @@ -318,18 +316,17 @@ void JSONNodeDumper::writeSourceLocation(SourceLocation Loc) {
       if (Expansion != Spelling) {
         // If the expansion and the spelling are different, output subobjects
         // describing both locations.
    -    JOS.attributeObject("spellingLoc", [Spelling, this] {
    -      writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
    -    });
    +    JOS.attributeObject(
    +        "spellingLoc", [Spelling, this] { writeBareSourceLocation(Spelling); });
         JOS.attributeObject("expansionLoc", [Expansion, Loc, this] {
    -      writeBareSourceLocation(Expansion, /*IsSpelling*/ false);
    +      writeBareSourceLocation(Expansion);
           // If there is a macro expansion, add extra information if the interesting
           // bit is the macro arg expansion.
           if (SM.isMacroArgExpansion(Loc))
             JOS.attribute("isMacroArgExpansion", true);
         });
       } else
    -    writeBareSourceLocation(Spelling, /*IsSpelling*/ true);
    +    writeBareSourceLocation(Spelling);
     }
     
     void JSONNodeDumper::writeSourceRange(SourceRange R) {
    diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
    index 59d94590e04d1..0640fed823771 100644
    --- a/clang/lib/AST/OpenMPClause.cpp
    +++ b/clang/lib/AST/OpenMPClause.cpp
    @@ -105,6 +105,8 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
         return static_cast(C);
       case OMPC_ompx_dyn_cgroup_mem:
          return static_cast<const OMPXDynCGroupMemClause *>(C);
    +  case OMPC_dyn_groupprivate:
     +    return static_cast<const OMPDynGroupprivateClause *>(C);
       case OMPC_message:
          return static_cast<const OMPMessageClause *>(C);
       case OMPC_default:
    @@ -2857,6 +2859,24 @@ void OMPClausePrinter::VisitOMPXDynCGroupMemClause(
       OS << ")";
     }
     
    +void OMPClausePrinter::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *Node) {
    +  OS << "dyn_groupprivate(";
    +  if (Node->getDynGroupprivateModifier() != OMPC_DYN_GROUPPRIVATE_unknown) {
    +    OS << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate,
    +                                        Node->getDynGroupprivateModifier());
    +    if (Node->getDynGroupprivateFallbackModifier() !=
    +        OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown) {
    +      OS << ", ";
    +      OS << getOpenMPSimpleClauseTypeName(
    +          OMPC_dyn_groupprivate, Node->getDynGroupprivateFallbackModifier());
    +    }
    +    OS << ": ";
    +  }
    +  Node->getSize()->printPretty(OS, nullptr, Policy, 0);
    +  OS << ')';
    +}
    +
     void OMPClausePrinter::VisitOMPDoacrossClause(OMPDoacrossClause *Node) {
       OS << "doacross(";
       OpenMPDoacrossClauseModifier DepType = Node->getDependenceType();
    diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
    index c909e1bcecd38..4a8c638c85331 100644
    --- a/clang/lib/AST/StmtProfile.cpp
    +++ b/clang/lib/AST/StmtProfile.cpp
    @@ -968,6 +968,12 @@ void OMPClauseProfiler::VisitOMPXDynCGroupMemClause(
       if (Expr *Size = C->getSize())
         Profiler->VisitStmt(Size);
     }
    +void OMPClauseProfiler::VisitOMPDynGroupprivateClause(
    +    const OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  if (auto *Size = C->getSize())
    +    Profiler->VisitStmt(Size);
    +}
     void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
       VisitOMPClauseList(C);
     }
    diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
    index 42f124ba852ed..0874b3d0c45f5 100644
    --- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
    +++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXTryStmt> cxxTryStmt;
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXThrowExpr> cxxThrowExpr;
      const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
      const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
     +const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
     +    fileScopeAsmDecl;
      const internal::VariadicDynCastAllOfMatcher<Stmt, CXXBoolLiteralExpr>
          cxxBoolLiteral;
      const internal::VariadicDynCastAllOfMatcher<Stmt, StringLiteral> stringLiteral;
    diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    index 01c03f309a77b..66848f7c42127 100644
    --- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    +++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
    @@ -246,6 +246,7 @@ RegistryMaps::RegistryMaps() {
       REGISTER_MATCHER(expr);
       REGISTER_MATCHER(exprWithCleanups);
       REGISTER_MATCHER(fieldDecl);
    +  REGISTER_MATCHER(fileScopeAsmDecl);
       REGISTER_MATCHER(fixedPointLiteral);
       REGISTER_MATCHER(floatLiteral);
       REGISTER_MATCHER(forCallable);
    diff --git a/clang/lib/Analysis/AnalysisDeclContext.cpp b/clang/lib/Analysis/AnalysisDeclContext.cpp
    index 5a52056f3e6a5..f188fc6921ed1 100644
    --- a/clang/lib/Analysis/AnalysisDeclContext.cpp
    +++ b/clang/lib/Analysis/AnalysisDeclContext.cpp
    @@ -117,6 +117,11 @@ Stmt *AnalysisDeclContext::getBody(bool &IsAutosynthesized) const {
         return BD->getBody();
        else if (const auto *FunTmpl = dyn_cast_or_null<FunctionTemplateDecl>(D))
         return FunTmpl->getTemplatedDecl()->getBody();
     +  else if (const auto *VD = dyn_cast_or_null<VarDecl>(D)) {
    +    if (VD->isFileVarDecl()) {
     +      return const_cast<Expr *>(dyn_cast_or_null<Expr>(VD->getInit()));
    +    }
    +  }
     
       llvm_unreachable("unknown code decl");
     }
    diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    index 75b17c545bb78..2f40c7e4888e3 100644
    --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
    @@ -238,10 +238,12 @@ const auto isMoveOnly = [] {
     };
     
      template <typename T> struct NodeID;
     -template <> struct NodeID<Expr> { static constexpr StringRef value = "expr"; };
     -template <> struct NodeID<Decl> { static constexpr StringRef value = "decl"; };
     -constexpr StringRef NodeID<Expr>::value;
     -constexpr StringRef NodeID<Decl>::value;
     +template <> struct NodeID<Expr> {
     +  static constexpr StringRef value = "expr";
     +};
     +template <> struct NodeID<Decl> {
     +  static constexpr StringRef value = "decl";
     +};
     
      template <class T,
    @@ -746,11 +748,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) {
                         Stm, Context));
       if (MemberCallExpr)
         return MemberCallExpr;
    -  const auto Matches =
    -      match(stmt(forEachDescendant(
    -                memberExpr(hasObjectExpression(canResolveToExprPointee(Exp)))
     -                    .bind(NodeID<Expr>::value))),
    -            Stm, Context);
    +  const auto Matches = match(
    +      stmt(forEachDescendant(
    +          expr(anyOf(memberExpr(
    +                         hasObjectExpression(canResolveToExprPointee(Exp))),
    +                     binaryOperator(hasOperatorName("->*"),
    +                                    hasLHS(canResolveToExprPointee(Exp)))))
     +              .bind(NodeID<Expr>::value))),
    +      Stm, Context);
       return findExprMutation(Matches);
     }
     
    diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    index 0fa333eedcfdd..d90f5d4eaf7bb 100644
    --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
    @@ -1153,26 +1153,34 @@ auto buildDiagnoseMatchSwitch(
       // FIXME: Evaluate the efficiency of matchers. If using matchers results in a
       // lot of duplicated work (e.g. string comparisons), consider providing APIs
       // that avoid it through memoization.
    -  auto IgnorableOptional = ignorableOptional(Options);
    -  return CFGMatchSwitchBuilder<
    -             const Environment,
     -             llvm::SmallVector<SourceLocation>>()
    -      // optional::value
     -      .CaseOfCFGStmt<CXXMemberCallExpr>(
    -          valueCall(IgnorableOptional),
    -          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &,
    -             const Environment &Env) {
    -            return diagnoseUnwrapCall(E->getImplicitObjectArgument(), Env);
    -          })
    -
    -      // optional::operator*, optional::operator->
     -      .CaseOfCFGStmt<CallExpr>(valueOperatorCall(IgnorableOptional),
    -                               [](const CallExpr *E,
    +  const auto IgnorableOptional = ignorableOptional(Options);
    +
    +  auto DiagBuilder =
    +      CFGMatchSwitchBuilder<
    +          const Environment,
     +          llvm::SmallVector<SourceLocation>>()
    +          // optional::operator*, optional::operator->
     +          .CaseOfCFGStmt<CallExpr>(
    +              valueOperatorCall(IgnorableOptional),
    +              [](const CallExpr *E, const MatchFinder::MatchResult &,
    +                 const Environment &Env) {
    +                return diagnoseUnwrapCall(E->getArg(0), Env);
    +              });
    +
    +  auto Builder = Options.IgnoreValueCalls
    +                     ? std::move(DiagBuilder)
    +                     : std::move(DiagBuilder)
    +                           // optional::value
     +                           .CaseOfCFGStmt<CXXMemberCallExpr>(
    +                               valueCall(IgnorableOptional),
    +                               [](const CXXMemberCallExpr *E,
                                       const MatchFinder::MatchResult &,
                                       const Environment &Env) {
    -                                 return diagnoseUnwrapCall(E->getArg(0), Env);
    -                               })
    -      .Build();
    +                                 return diagnoseUnwrapCall(
    +                                     E->getImplicitObjectArgument(), Env);
    +                               });
    +
    +  return std::move(Builder).Build();
     }
     
     } // namespace
    diff --git a/clang/lib/Analysis/LifetimeSafety/Dataflow.h b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    index 2f7bcb6e5dc81..de821bb17eb6b 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    +++ b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
    @@ -67,10 +67,10 @@ class DataflowAnalysis {
        llvm::DenseMap<const CFGBlock *, Lattice> InStates;
       /// The dataflow state after a basic block is processed.
        llvm::DenseMap<const CFGBlock *, Lattice> OutStates;
    -  /// The dataflow state at a Program Point.
    +  /// Dataflow state at each program point, indexed by Fact ID.
       /// In a forward analysis, this is the state after the Fact at that point has
       /// been applied, while in a backward analysis, it is the state before.
     -  llvm::DenseMap<ProgramPoint, Lattice> PerPointStates;
     +  llvm::SmallVector<Lattice> PointToState;
     
       static constexpr bool isForward() { return Dir == Direction::Forward; }
     
    @@ -86,6 +86,8 @@ class DataflowAnalysis {
          Derived &D = static_cast<Derived &>(*this);
         llvm::TimeTraceScope Time(D.getAnalysisName());
     
    +    PointToState.resize(FactMgr.getNumFacts());
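     +    // Fact IDs are dense, so a vector indexed by ID replaces the old
     +    // DenseMap keyed on program point.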
    +
         using Worklist =
              std::conditional_t<isForward(), ForwardDataflowWorklist,
                                 BackwardDataflowWorklist>;
    @@ -116,7 +118,9 @@ class DataflowAnalysis {
       }
     
     protected:
    -  Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); }
    +  Lattice getState(ProgramPoint P) const {
    +    return PointToState[P->getID().Value];
    +  }
     
       std::optional getInState(const CFGBlock *B) const {
         auto It = InStates.find(B);
    @@ -144,12 +148,12 @@ class DataflowAnalysis {
         if constexpr (isForward()) {
           for (const Fact *F : Facts) {
             State = transferFact(State, F);
    -        PerPointStates[F] = State;
    +        PointToState[F->getID().Value] = State;
           }
         } else {
           for (const Fact *F : llvm::reverse(Facts)) {
             // In backward analysis, capture the state before applying the fact.
    -        PerPointStates[F] = State;
    +        PointToState[F->getID().Value] = State;
             State = transferFact(State, F);
           }
         }
    diff --git a/clang/lib/Analysis/LifetimeSafety/Facts.cpp b/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    index 1aea64f864367..190c038f46401 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/Facts.cpp
    @@ -53,7 +53,7 @@ void ReturnOfOriginFact::dump(llvm::raw_ostream &OS, const LoanManager &,
     void UseFact::dump(llvm::raw_ostream &OS, const LoanManager &,
                        const OriginManager &OM) const {
       OS << "Use (";
    -  OM.dump(getUsedOrigin(OM), OS);
    +  OM.dump(getUsedOrigin(), OS);
       OS << ", " << (isWritten() ? "Write" : "Read") << ")\n";
     }
     
    @@ -64,12 +64,11 @@ void TestPointFact::dump(llvm::raw_ostream &OS, const LoanManager &,
     
     llvm::StringMap<ProgramPoint> FactManager::getTestPoints() const {
       llvm::StringMap<ProgramPoint> AnnotationToPointMap;
    -  for (const CFGBlock *Block : BlockToFactsMap.keys()) {
    -    for (const Fact *F : getFacts(Block)) {
    +  for (const auto &BlockFacts : BlockToFacts) {
    +    for (const Fact *F : BlockFacts) {
           if (const auto *TPF = F->getAs<TestPointFact>()) {
             StringRef PointName = TPF->getAnnotation();
    -        assert(AnnotationToPointMap.find(PointName) ==
    -                   AnnotationToPointMap.end() &&
    +        assert(!AnnotationToPointMap.contains(PointName) &&
                    "more than one test points with the same name");
             AnnotationToPointMap[PointName] = F;
           }
    @@ -88,12 +87,9 @@ void FactManager::dump(const CFG &Cfg, AnalysisDeclContext &AC) const {
       // Print blocks in the order as they appear in code for a stable ordering.
       for (const CFGBlock *B : *AC.getAnalysis<PostOrderCFGView>()) {
         llvm::dbgs() << "  Block B" << B->getBlockID() << ":\n";
    -    auto It = BlockToFactsMap.find(B);
    -    if (It != BlockToFactsMap.end()) {
    -      for (const Fact *F : It->second) {
    -        llvm::dbgs() << "    ";
    -        F->dump(llvm::dbgs(), LoanMgr, OriginMgr);
    -      }
    +    for (const Fact *F : getFacts(B)) {
    +      llvm::dbgs() << "    ";
    +      F->dump(llvm::dbgs(), LoanMgr, OriginMgr);
         }
         llvm::dbgs() << "  End of Block\n";
       }
    diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    index 9b68de107e314..381ff99aae420 100644
    --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp
    @@ -176,6 +176,15 @@ void FactsGenerator::VisitBinaryOperator(const BinaryOperator *BO) {
         handleAssignment(BO->getLHS(), BO->getRHS());
     }
     
    +void FactsGenerator::VisitConditionalOperator(const ConditionalOperator *CO) {
    +  if (hasOrigin(CO)) {
    +    // Merge origins from both branches of the conditional operator.
    +    // Kill first to clear the initial state, then merge in both branches.
    +    killAndFlowOrigin(*CO, *CO->getTrueExpr());
    +    flowOrigin(*CO, *CO->getFalseExpr());
    +  }
    +}
    +
     void FactsGenerator::VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) {
       // Assignment operators have special "kill-then-propagate" semantics
       // and are handled separately.
    @@ -333,7 +342,7 @@ void FactsGenerator::handleAssignment(const Expr *LHSExpr,
     // (e.g. on the left-hand side of an assignment).
     void FactsGenerator::handleUse(const DeclRefExpr *DRE) {
       if (isPointerType(DRE->getType())) {
    -    UseFact *UF = FactMgr.createFact<UseFact>(DRE);
    +    UseFact *UF = FactMgr.createFact<UseFact>(DRE, FactMgr.getOriginMgr());
         CurrentBlockFacts.push_back(UF);
         assert(!UseFacts.contains(DRE));
         UseFacts[DRE] = UF;
    diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    index 00c7ed90503e7..a51ba4280f284 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
    @@ -41,6 +41,7 @@ void LifetimeSafetyAnalysis::run() {
       const CFG &Cfg = *AC.getCFG();
       DEBUG_WITH_TYPE("PrintCFG", Cfg.dump(AC.getASTContext().getLangOpts(),
                                            /*ShowColors=*/true));
    +  FactMgr.init(Cfg);
     
       FactsGenerator FactGen(FactMgr, AC);
       FactGen.run();
    diff --git a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    index cddb3f3ce4c1c..59f594e50fb46 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp
    @@ -111,7 +111,7 @@ class AnalysisImpl
       /// dominates this program point. A write operation kills the liveness of
       /// the origin since it overwrites the value.
       Lattice transfer(Lattice In, const UseFact &UF) {
    -    OriginID OID = UF.getUsedOrigin(FactMgr.getOriginMgr());
    +    OriginID OID = UF.getUsedOrigin();
         // Write kills liveness.
         if (UF.isWritten())
           return Lattice(Factory.remove(In.LiveOrigins, OID));
    diff --git a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    index 387097e705f94..0e6c194123df8 100644
    --- a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp
    @@ -5,36 +5,114 @@
     // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
     //
     //===----------------------------------------------------------------------===//
    -#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
    -#include "Dataflow.h"
    +#include 
     #include 
     
    +#include "Dataflow.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
    +#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
    +#include "clang/Analysis/AnalysisDeclContext.h"
    +#include "clang/Analysis/CFG.h"
    +#include "clang/Basic/LLVM.h"
    +#include "llvm/ADT/BitVector.h"
    +#include "llvm/ADT/SmallVector.h"
    +#include "llvm/Support/TimeProfiler.h"
    +#include "llvm/Support/raw_ostream.h"
    +
     namespace clang::lifetimes::internal {
    +
    +// Prepass to find persistent origins. An origin is persistent if it is
    +// referenced in more than one basic block.
    +static llvm::BitVector computePersistentOrigins(const FactManager &FactMgr,
    +                                                const CFG &C) {
    +  llvm::TimeTraceScope("ComputePersistentOrigins");
    +  unsigned NumOrigins = FactMgr.getOriginMgr().getNumOrigins();
    +  llvm::BitVector PersistentOrigins(NumOrigins);
    +
    +  llvm::SmallVector<const CFGBlock *> OriginToFirstSeenBlock(NumOrigins,
    +                                                             nullptr);
    +  for (const CFGBlock *B : C) {
    +    for (const Fact *F : FactMgr.getFacts(B)) {
    +      auto CheckOrigin = [&](OriginID OID) {
    +        if (PersistentOrigins.test(OID.Value))
    +          return;
    +        auto &FirstSeenBlock = OriginToFirstSeenBlock[OID.Value];
    +        if (FirstSeenBlock == nullptr)
    +          FirstSeenBlock = B;
    +        if (FirstSeenBlock != B) {
    +          // We saw this origin in more than one block.
    +          PersistentOrigins.set(OID.Value);
    +        }
    +      };
    +
    +      switch (F->getKind()) {
    +      case Fact::Kind::Issue:
    +        CheckOrigin(F->getAs<IssueFact>()->getOriginID());
    +        break;
    +      case Fact::Kind::OriginFlow: {
    +        const auto *OF = F->getAs<OriginFlowFact>();
    +        CheckOrigin(OF->getDestOriginID());
    +        CheckOrigin(OF->getSrcOriginID());
    +        break;
    +      }
    +      case Fact::Kind::ReturnOfOrigin:
    +        CheckOrigin(F->getAs<ReturnOfOriginFact>()->getReturnedOriginID());
    +        break;
    +      case Fact::Kind::Use:
    +        CheckOrigin(F->getAs<UseFact>()->getUsedOrigin());
    +        break;
    +      case Fact::Kind::Expire:
    +      case Fact::Kind::TestPoint:
    +        break;
    +      }
    +    }
    +  }
    +  return PersistentOrigins;
    +}
    +
     namespace {
    +
     /// Represents the dataflow lattice for loan propagation.
     ///
     /// This lattice tracks which loans each origin may hold at a given program
     /// point. The lattice has a finite height: An origin's loan set is bounded by
     /// the total number of loans in the function.
    -/// TODO(opt): To reduce the lattice size, propagate origins of declarations,
    -/// not expressions, because expressions are not visible across blocks.
     struct Lattice {
       /// The map from an origin to the set of loans it contains.
    -  OriginLoanMap Origins = OriginLoanMap(nullptr);
    -
    -  explicit Lattice(const OriginLoanMap &S) : Origins(S) {}
    +  /// Origins that appear in multiple blocks. These participate in joins.
    +  OriginLoanMap PersistentOrigins = OriginLoanMap(nullptr);
    +  /// Origins confined to a single block. Discarded at block boundaries.
    +  OriginLoanMap BlockLocalOrigins = OriginLoanMap(nullptr);
    +
    +  explicit Lattice(const OriginLoanMap &Persistent,
    +                   const OriginLoanMap &BlockLocal)
    +      : PersistentOrigins(Persistent), BlockLocalOrigins(BlockLocal) {}
       Lattice() = default;
     
       bool operator==(const Lattice &Other) const {
    -    return Origins == Other.Origins;
    +    return PersistentOrigins == Other.PersistentOrigins &&
    +           BlockLocalOrigins == Other.BlockLocalOrigins;
       }
       bool operator!=(const Lattice &Other) const { return !(*this == Other); }
     
       void dump(llvm::raw_ostream &OS) const {
         OS << "LoanPropagationLattice State:\n";
    -    if (Origins.isEmpty())
    +    OS << " Persistent Origins:\n";
    +    if (PersistentOrigins.isEmpty())
           OS << "  \n";
    -    for (const auto &Entry : Origins) {
    +    for (const auto &Entry : PersistentOrigins) {
    +      if (Entry.second.isEmpty())
    +        OS << "  Origin " << Entry.first << " contains no loans\n";
    +      for (const LoanID &LID : Entry.second)
    +        OS << "  Origin " << Entry.first << " contains Loan " << LID << "\n";
    +    }
    +    OS << " Block-Local Origins:\n";
    +    if (BlockLocalOrigins.isEmpty())
    +      OS << "  \n";
    +    for (const auto &Entry : BlockLocalOrigins) {
           if (Entry.second.isEmpty())
             OS << "  Origin " << Entry.first << " contains no loans\n";
           for (const LoanID &LID : Entry.second)
    @@ -50,7 +128,8 @@ class AnalysisImpl
                    OriginLoanMap::Factory &OriginLoanMapFactory,
                    LoanSet::Factory &LoanSetFactory)
           : DataflowAnalysis(C, AC, F), OriginLoanMapFactory(OriginLoanMapFactory),
    -        LoanSetFactory(LoanSetFactory) {}
    +        LoanSetFactory(LoanSetFactory),
    +        PersistentOrigins(computePersistentOrigins(F, C)) {}
     
       using Base::transfer;
     
    @@ -59,10 +138,10 @@ class AnalysisImpl
       Lattice getInitialState() { return Lattice{}; }
     
       /// Merges two lattices by taking the union of loans for each origin.
    -  // TODO(opt): Keep the state small by removing origins which become dead.
    +  /// Only persistent origins are joined; block-local origins are discarded.
       Lattice join(Lattice A, Lattice B) {
         OriginLoanMap JoinedOrigins = utils::join(
    -        A.Origins, B.Origins, OriginLoanMapFactory,
    +        A.PersistentOrigins, B.PersistentOrigins, OriginLoanMapFactory,
             [&](const LoanSet *S1, const LoanSet *S2) {
               assert((S1 || S2) && "unexpectedly merging 2 empty sets");
               if (!S1)
    @@ -74,16 +153,15 @@ class AnalysisImpl
             // Asymmetric join is a performance win. For origins present only on one
             // branch, the loan set can be carried over as-is.
             utils::JoinKind::Asymmetric);
    -    return Lattice(JoinedOrigins);
    +    return Lattice(JoinedOrigins, OriginLoanMapFactory.getEmptyMap());
       }
     
       /// A new loan is issued to the origin. Old loans are erased.
       Lattice transfer(Lattice In, const IssueFact &F) {
         OriginID OID = F.getOriginID();
         LoanID LID = F.getLoanID();
    -    return Lattice(OriginLoanMapFactory.add(
    -        In.Origins, OID,
    -        LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID)));
    +    LoanSet NewLoans = LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID);
    +    return setLoans(In, OID, NewLoans);
       }
     
       /// A flow from source to destination. If `KillDest` is true, this replaces
    @@ -98,7 +176,7 @@ class AnalysisImpl
         LoanSet SrcLoans = getLoans(In, SrcOID);
         LoanSet MergedLoans = utils::join(DestLoans, SrcLoans, LoanSetFactory);
     
    -    return Lattice(OriginLoanMapFactory.add(In.Origins, DestOID, MergedLoans));
    +    return setLoans(In, DestOID, MergedLoans);
       }
     
       LoanSet getLoans(OriginID OID, ProgramPoint P) const {
    @@ -106,14 +184,33 @@ class AnalysisImpl
       }
     
     private:
    +  /// Returns true if the origin is persistent (referenced in multiple blocks).
    +  bool isPersistent(OriginID OID) const {
    +    return PersistentOrigins.test(OID.Value);
    +  }
    +
    +  Lattice setLoans(Lattice L, OriginID OID, LoanSet Loans) {
    +    if (isPersistent(OID))
    +      return Lattice(OriginLoanMapFactory.add(L.PersistentOrigins, OID, Loans),
    +                     L.BlockLocalOrigins);
    +    return Lattice(L.PersistentOrigins,
    +                   OriginLoanMapFactory.add(L.BlockLocalOrigins, OID, Loans));
    +  }
    +
       LoanSet getLoans(Lattice L, OriginID OID) const {
    -    if (auto *Loans = L.Origins.lookup(OID))
    +    const OriginLoanMap *Map =
    +        isPersistent(OID) ? &L.PersistentOrigins : &L.BlockLocalOrigins;
    +    if (auto *Loans = Map->lookup(OID))
           return *Loans;
         return LoanSetFactory.getEmptySet();
       }
     
       OriginLoanMap::Factory &OriginLoanMapFactory;
       LoanSet::Factory &LoanSetFactory;
    +  /// Boolean vector indexed by origin ID. If true, the origin appears in
    +  /// multiple basic blocks and must participate in join operations. If false,
    +  /// the origin is block-local and can be discarded at block boundaries.
    +  llvm::BitVector PersistentOrigins;
     };
     } // namespace
     
    diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    index ea51a75324e06..0f2eaa94a5987 100644
    --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
    @@ -34,6 +34,8 @@ Origin &OriginManager::addOrigin(OriginID ID, const clang::Expr &E) {
     
     // TODO: Mark this method as const once we remove the call to getOrCreate.
     OriginID OriginManager::get(const Expr &E) {
    +  if (auto *ParenIgnored = E.IgnoreParens(); ParenIgnored != &E)
    +    return get(*ParenIgnored);
       auto It = ExprToOriginID.find(&E);
       if (It != ExprToOriginID.end())
         return It->second;
    diff --git a/clang/lib/Basic/BuiltinTargetFeatures.h b/clang/lib/Basic/BuiltinTargetFeatures.h
    index 9754acda2a68f..bf227a17f7869 100644
    --- a/clang/lib/Basic/BuiltinTargetFeatures.h
    +++ b/clang/lib/Basic/BuiltinTargetFeatures.h
    @@ -20,7 +20,7 @@ using llvm::StringRef;
     namespace clang {
     namespace Builtin {
     /// TargetFeatures - This class is used to check whether the builtin function
    -/// has the required tagert specific features. It is able to support the
    +/// has the required target specific features. It is able to support the
     /// combination of ','(and), '|'(or), and '()'. By default, the priority of
     /// ',' is higher than that of '|' .
     /// E.g:
    diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
    index 2dec26ecacf26..5e9da245e2b43 100644
    --- a/clang/lib/Basic/Diagnostic.cpp
    +++ b/clang/lib/Basic/Diagnostic.cpp
    @@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input,
     void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) {
       static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError;
       for (const auto &SectionEntry : sections()) {
    -    StringRef DiagGroup = SectionEntry.SectionStr;
    +    StringRef DiagGroup = SectionEntry.name();
         if (DiagGroup == "*") {
           // Drop the default section introduced by special case list, we only
           // support exact diagnostic group names.
    diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
    index 4a2b77cd16bfc..d1c959b9687c4 100644
    --- a/clang/lib/Basic/IdentifierTable.cpp
    +++ b/clang/lib/Basic/IdentifierTable.cpp
    @@ -77,57 +77,6 @@ IdentifierTable::IdentifierTable(const LangOptions &LangOpts,
     // Language Keyword Implementation
     //===----------------------------------------------------------------------===//
     
    -// Constants for TokenKinds.def
    -namespace {
    -
    -enum TokenKey : unsigned {
    -  KEYC99 = 0x1,
    -  KEYCXX = 0x2,
    -  KEYCXX11 = 0x4,
    -  KEYGNU = 0x8,
    -  KEYMS = 0x10,
    -  BOOLSUPPORT = 0x20,
    -  KEYALTIVEC = 0x40,
    -  KEYNOCXX = 0x80,
    -  KEYBORLAND = 0x100,
    -  KEYOPENCLC = 0x200,
    -  KEYC23 = 0x400,
    -  KEYNOMS18 = 0x800,
    -  KEYNOOPENCL = 0x1000,
    -  WCHARSUPPORT = 0x2000,
    -  HALFSUPPORT = 0x4000,
    -  CHAR8SUPPORT = 0x8000,
    -  KEYOBJC = 0x10000,
    -  KEYZVECTOR = 0x20000,
    -  KEYCOROUTINES = 0x40000,
    -  KEYMODULES = 0x80000,
    -  KEYCXX20 = 0x100000,
    -  KEYOPENCLCXX = 0x200000,
    -  KEYMSCOMPAT = 0x400000,
    -  KEYSYCL = 0x800000,
    -  KEYCUDA = 0x1000000,
    -  KEYZOS = 0x2000000,
    -  KEYNOZOS = 0x4000000,
    -  KEYHLSL = 0x8000000,
    -  KEYFIXEDPOINT = 0x10000000,
    -  KEYMAX = KEYFIXEDPOINT, // The maximum key
    -  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
    -  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
    -           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
    -};
    -
    -/// How a keyword is treated in the selected standard. This enum is ordered
    -/// intentionally so that the value that 'wins' is the most 'permissive'.
    -enum KeywordStatus {
    -  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
    -  KS_Disabled,  // Disabled
    -  KS_Future,    // Is a keyword in future standard
    -  KS_Extension, // Is an extension
    -  KS_Enabled,   // Enabled
    -};
    -
    -} // namespace
    -
     // This works on a single TokenKey flag and checks the LangOpts to get the
     // KeywordStatus based exclusively on this flag, so that it can be merged in
     // getKeywordStatus. Most should be enabled/disabled, but some might imply
    @@ -220,9 +169,7 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts,
       }
     }
     
    -/// Translates flags as specified in TokenKinds.def into keyword status
    -/// in the given language standard.
    -static KeywordStatus getKeywordStatus(const LangOptions &LangOpts,
    +KeywordStatus clang::getKeywordStatus(const LangOptions &LangOpts,
                                           unsigned Flags) {
       // KEYALL means always enabled, so special case this one.
       if (Flags == KEYALL) return KS_Enabled;
    diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
    index 3d41f2d197b81..8e60fc26a7947 100644
    --- a/clang/lib/Basic/OpenMPKinds.cpp
    +++ b/clang/lib/Basic/OpenMPKinds.cpp
    @@ -196,6 +196,16 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
           return OMPC_GRAINSIZE_unknown;
         return Type;
       }
    +  case OMPC_dyn_groupprivate: {
    +    return llvm::StringSwitch<unsigned>(Str)
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)                                 \
    +  .Case(#Name, OMPC_DYN_GROUPPRIVATE_##Name)
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  .Case(#Name, OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name)                          \
    +      .Case("fallback(" #Name ")", OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name)
    +#include "clang/Basic/OpenMPKinds.def"
    +        .Default(OMPC_DYN_GROUPPRIVATE_unknown);
    +  }
       case OMPC_num_tasks: {
         unsigned Type = llvm::StringSwitch<unsigned>(Str)
     #define OPENMP_NUMTASKS_MODIFIER(Name) .Case(#Name, OMPC_NUMTASKS_##Name)
    @@ -544,6 +554,20 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
     #include "clang/Basic/OpenMPKinds.def"
         }
         llvm_unreachable("Invalid OpenMP 'grainsize' clause modifier");
    +  case OMPC_dyn_groupprivate:
    +    switch (Type) {
    +    case OMPC_DYN_GROUPPRIVATE_unknown:
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_last:
    +      return "unknown";
    +#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)                                 \
    +  case OMPC_DYN_GROUPPRIVATE_##Name:                                           \
    +    return #Name;
    +#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)                        \
    +  case OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name:                                  \
    +    return "fallback(" #Name ")";
    +#include "clang/Basic/OpenMPKinds.def"
    +    }
    +    llvm_unreachable("Invalid OpenMP 'dyn_groupprivate' clause modifier");
       case OMPC_num_tasks:
         switch (Type) {
         case OMPC_NUMTASKS_unknown:
    diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
    index 9cb118893a0d9..8727057eb78d1 100644
    --- a/clang/lib/Basic/ProfileList.cpp
    +++ b/clang/lib/Basic/ProfileList.cpp
    @@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList {
     
       bool hasPrefix(StringRef Prefix) const {
         for (const auto &It : sections())
    -      if (It.Entries.count(Prefix) > 0)
    +      if (It.hasPrefix(Prefix))
             return true;
         return false;
       }
    diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    index 56f551628cf89..928c086898097 100644
    --- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    +++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
    @@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() {
         SanitizerMask Mask;
     
     #define SANITIZER(NAME, ID)                                                    \
    -  if (S.SectionMatcher.matchAny(NAME))                                         \
    +  if (S.matchName(NAME))                                                       \
         Mask |= SanitizerKind::ID;
     #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID)
     
    @@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix,
         if (S.Mask & Mask) {
           unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category);
           if (LineNum > 0)
    -        return {S.S.FileIdx, LineNum};
    +        return {S.S.fileIndex(), LineNum};
         }
       }
       return NotFound;
    diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
    index 938c6485125ee..b6cc6ec9365f5 100644
    --- a/clang/lib/Basic/SourceManager.cpp
    +++ b/clang/lib/Basic/SourceManager.cpp
    @@ -907,58 +907,27 @@ getExpansionLocSlowCase(SourceLocation Loc) const {
     
     SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const {
       do {
    -    FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
    -    Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc();
    -    Loc = Loc.getLocWithOffset(LocInfo.second);
    +    const SLocEntry &Entry = getSLocEntry(getFileID(Loc));
    +    Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset(
    +        Loc.getOffset() - Entry.getOffset());
       } while (!Loc.isFileID());
       return Loc;
     }
     
     SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const {
       do {
    -    if (isMacroArgExpansion(Loc))
    -      Loc = getImmediateSpellingLoc(Loc);
    -    else
    -      Loc = getImmediateExpansionRange(Loc).getBegin();
    +    const SLocEntry &Entry = getSLocEntry(getFileID(Loc));
    +    const ExpansionInfo &ExpInfo = Entry.getExpansion();
    +    if (ExpInfo.isMacroArgExpansion()) {
    +      Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() -
    +                                                      Entry.getOffset());
    +    } else {
    +      Loc = ExpInfo.getExpansionLocStart();
    +    }
       } while (!Loc.isFileID());
       return Loc;
     }
     
    -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase(
    -    const SrcMgr::SLocEntry *E) const {
    -  // If this is an expansion record, walk through all the expansion points.
    -  FileID FID;
    -  SourceLocation Loc;
    -  unsigned Offset;
    -  do {
    -    Loc = E->getExpansion().getExpansionLocStart();
    -
    -    FID = getFileID(Loc);
    -    E = &getSLocEntry(FID);
    -    Offset = Loc.getOffset()-E->getOffset();
    -  } while (!Loc.isFileID());
    -
    -  return std::make_pair(FID, Offset);
    -}
    -
    -FileIDAndOffset
    -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E,
    -                                                unsigned Offset) const {
    -  // If this is an expansion record, walk through all the expansion points.
    -  FileID FID;
    -  SourceLocation Loc;
    -  do {
    -    Loc = E->getExpansion().getSpellingLoc();
    -    Loc = Loc.getLocWithOffset(Offset);
    -
    -    FID = getFileID(Loc);
    -    E = &getSLocEntry(FID);
    -    Offset = Loc.getOffset()-E->getOffset();
    -  } while (!Loc.isFileID());
    -
    -  return std::make_pair(FID, Offset);
    -}
    -
     /// getImmediateSpellingLoc - Given a SourceLocation object, return the
     /// spelling location referenced by the ID.  This is the first level down
     /// towards the place where the characters that make up the lexed token can be
    @@ -1190,17 +1159,11 @@ static bool isInvalid(LocType Loc, bool *Invalid) {
       return MyInvalid;
     }
     
    -unsigned SourceManager::getSpellingColumnNumber(SourceLocation Loc,
    -                                                bool *Invalid) const {
    -  if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
    -  return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
    -}
    -
    -unsigned SourceManager::getExpansionColumnNumber(SourceLocation Loc,
    -                                                 bool *Invalid) const {
    +unsigned SourceManager::getColumnNumber(SourceLocation Loc,
    +                                        bool *Invalid) const {
    +  assert(Loc.isFileID());
       if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
    +  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
       return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
     }
     
    @@ -1398,18 +1361,13 @@ unsigned SourceManager::getLineNumber(FileID FID, unsigned FilePos,
       return LineNo;
     }
     
    -unsigned SourceManager::getSpellingLineNumber(SourceLocation Loc,
    -                                              bool *Invalid) const {
    -  if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
    -  return getLineNumber(LocInfo.first, LocInfo.second);
    -}
    -unsigned SourceManager::getExpansionLineNumber(SourceLocation Loc,
    -                                               bool *Invalid) const {
    +unsigned SourceManager::getLineNumber(SourceLocation Loc, bool *Invalid) const {
    +  assert(Loc.isFileID());
       if (isInvalid(Loc, Invalid)) return 0;
    -  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
    +  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
       return getLineNumber(LocInfo.first, LocInfo.second);
     }
    +
     unsigned SourceManager::getPresumedLineNumber(SourceLocation Loc,
                                                   bool *Invalid) const {
       PresumedLoc PLoc = getPresumedLoc(Loc);
    diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
    index f4d7c1288cc04..9a5db6e164f66 100644
    --- a/clang/lib/Basic/TargetInfo.cpp
    +++ b/clang/lib/Basic/TargetInfo.cpp
    @@ -59,6 +59,7 @@ static const LangASMap FakeAddrSpaceMap = {
     TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
       // Set defaults.  Defaults are set for a 32-bit RISC platform, like PPC or
       // SPARC.  These should be overridden by concrete targets as needed.
    +  HasMustTail = true;
       BigEndian = !T.isLittleEndian();
       TLSSupported = true;
       VLASupported = true;
    diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
    index 2673669bc9035..90b4ac1b857cc 100644
    --- a/clang/lib/Basic/Targets/AVR.cpp
    +++ b/clang/lib/Basic/Targets/AVR.cpp
    @@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo {
     
     // NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0.
     static MCUInfo AVRMcus[] = {
    -    {"avr1", NULL, "1", 0},
    +    {"avr1", nullptr, "1", 0},
         {"at90s1200", "__AVR_AT90S1200__", "1", 0},
         {"attiny11", "__AVR_ATtiny11__", "1", 0},
         {"attiny12", "__AVR_ATtiny12__", "1", 0},
         {"attiny15", "__AVR_ATtiny15__", "1", 0},
         {"attiny28", "__AVR_ATtiny28__", "1", 0},
    -    {"avr2", NULL, "2", 1},
    +    {"avr2", nullptr, "2", 1},
         {"at90s2313", "__AVR_AT90S2313__", "2", 1},
         {"at90s2323", "__AVR_AT90S2323__", "2", 1},
         {"at90s2333", "__AVR_AT90S2333__", "2", 1},
    @@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = {
         {"at90s8515", "__AVR_AT90S8515__", "2", 1},
         {"at90c8534", "__AVR_AT90c8534__", "2", 1},
         {"at90s8535", "__AVR_AT90S8535__", "2", 1},
    -    {"avr25", NULL, "25", 1},
    +    {"avr25", nullptr, "25", 1},
         {"ata5272", "__AVR_ATA5272__", "25", 1},
         {"ata6616c", "__AVR_ATA6616c__", "25", 1},
         {"attiny13", "__AVR_ATtiny13__", "25", 1},
    @@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = {
         {"attiny48", "__AVR_ATtiny48__", "25", 1},
         {"attiny88", "__AVR_ATtiny88__", "25", 1},
         {"attiny828", "__AVR_ATtiny828__", "25", 1},
    -    {"avr3", NULL, "3", 1},
    +    {"avr3", nullptr, "3", 1},
         {"at43usb355", "__AVR_AT43USB355__", "3", 1},
         {"at76c711", "__AVR_AT76C711__", "3", 1},
    -    {"avr31", NULL, "31", 1},
    +    {"avr31", nullptr, "31", 1},
         {"atmega103", "__AVR_ATmega103__", "31", 1},
         {"at43usb320", "__AVR_AT43USB320__", "31", 1},
    -    {"avr35", NULL, "35", 1},
    +    {"avr35", nullptr, "35", 1},
         {"attiny167", "__AVR_ATtiny167__", "35", 1},
         {"at90usb82", "__AVR_AT90USB82__", "35", 1},
         {"at90usb162", "__AVR_AT90USB162__", "35", 1},
    @@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = {
         {"atmega16u2", "__AVR_ATmega16U2__", "35", 1},
         {"atmega32u2", "__AVR_ATmega32U2__", "35", 1},
         {"attiny1634", "__AVR_ATtiny1634__", "35", 1},
    -    {"avr4", NULL, "4", 1},
    +    {"avr4", nullptr, "4", 1},
         {"atmega8", "__AVR_ATmega8__", "4", 1},
         {"ata6289", "__AVR_ATA6289__", "4", 1},
         {"atmega8a", "__AVR_ATmega8A__", "4", 1},
    @@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = {
         {"at90pwm3", "__AVR_AT90PWM3__", "4", 1},
         {"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1},
         {"at90pwm81", "__AVR_AT90PWM81__", "4", 1},
    -    {"avr5", NULL, "5", 1},
    +    {"avr5", nullptr, "5", 1},
         {"ata5702m322", "__AVR_ATA5702M322__", "5", 1},
         {"ata5782", "__AVR_ATA5782__", "5", 1},
         {"ata5790", "__AVR_ATA5790__", "5", 1},
    @@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = {
         {"at90scr100", "__AVR_AT90SCR100__", "5", 1},
         {"at94k", "__AVR_AT94K__", "5", 1},
         {"m3000", "__AVR_AT000__", "5", 1},
    -    {"avr51", NULL, "51", 2},
    +    {"avr51", nullptr, "51", 2},
         {"atmega128", "__AVR_ATmega128__", "51", 2},
         {"atmega128a", "__AVR_ATmega128A__", "51", 2},
         {"atmega1280", "__AVR_ATmega1280__", "51", 2},
    @@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = {
         {"at90can128", "__AVR_AT90CAN128__", "51", 2},
         {"at90usb1286", "__AVR_AT90USB1286__", "51", 2},
         {"at90usb1287", "__AVR_AT90USB1287__", "51", 2},
    -    {"avr6", NULL, "6", 4},
    +    {"avr6", nullptr, "6", 4},
         {"atmega2560", "__AVR_ATmega2560__", "6", 4},
         {"atmega2561", "__AVR_ATmega2561__", "6", 4},
         {"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4},
         {"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4},
    -    {"avrxmega2", NULL, "102", 1},
    +    {"avrxmega2", nullptr, "102", 1},
         {"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1},
         {"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1},
         {"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1},
    @@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = {
         {"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1},
         {"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1},
         {"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1},
    -    {"avrxmega4", NULL, "104", 1},
    +    {"avrxmega4", nullptr, "104", 1},
         {"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1},
         {"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1},
         {"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1},
    @@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = {
         {"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1},
         {"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1},
         {"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1},
    -    {"avrxmega5", NULL, "105", 1},
    +    {"avrxmega5", nullptr, "105", 1},
         {"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1},
         {"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1},
    -    {"avrxmega6", NULL, "106", 6},
    +    {"avrxmega6", nullptr, "106", 6},
         {"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2},
         {"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2},
         {"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2},
    @@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = {
         {"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4},
         {"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6},
         {"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6},
    -    {"avrxmega7", NULL, "107", 2},
    +    {"avrxmega7", nullptr, "107", 2},
         {"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2},
         {"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2},
         {"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2},
    -    {"avrtiny", NULL, "100", 0},
    +    {"avrtiny", nullptr, "100", 0},
         {"attiny4", "__AVR_ATtiny4__", "100", 0},
         {"attiny5", "__AVR_ATtiny5__", "100", 0},
         {"attiny9", "__AVR_ATtiny9__", "100", 0},
    @@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = {
         {"attiny40", "__AVR_ATtiny40__", "100", 0},
         {"attiny102", "__AVR_ATtiny102__", "100", 0},
         {"attiny104", "__AVR_ATtiny104__", "100", 0},
    -    {"avrxmega3", NULL, "103", 1},
    +    {"avrxmega3", nullptr, "103", 1},
         {"attiny202", "__AVR_ATtiny202__", "103", 1},
         {"attiny402", "__AVR_ATtiny402__", "103", 1},
         {"attiny204", "__AVR_ATtiny204__", "103", 1},
    diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
    index 9651c3832f51d..ec4e40b0db6eb 100644
    --- a/clang/lib/Basic/Targets/NVPTX.cpp
    +++ b/clang/lib/Basic/Targets/NVPTX.cpp
    @@ -171,7 +171,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const {
     
     bool NVPTXTargetInfo::hasFeature(StringRef Feature) const {
       return llvm::StringSwitch<bool>(Feature)
    -      .Cases("ptx", "nvptx", true)
    +      .Cases({"ptx", "nvptx"}, true)
           .Default(false);
     }
     
    diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
    index 846b240218172..d4ada2a0e0c38 100644
    --- a/clang/lib/Basic/Targets/PPC.h
    +++ b/clang/lib/Basic/Targets/PPC.h
    @@ -125,9 +125,8 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                   .Cases({"power3", "pwr3"}, ArchDefinePpcgr)
                   .Cases({"power4", "pwr4"},
                          ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
    -              .Cases("power5", "pwr5",
    -                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
    -                         ArchDefinePpcsq)
    +              .Cases({"power5", "pwr5"}, ArchDefinePwr5 | ArchDefinePwr4 |
    +                                             ArchDefinePpcgr | ArchDefinePpcsq)
                   .Cases({"power5x", "pwr5x"},
                          ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
                              ArchDefinePpcgr | ArchDefinePpcsq)
    @@ -166,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                             ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                             ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                             ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
    -              .Cases("8548", "e500", ArchDefineE500)
    +              .Cases({"8548", "e500"}, ArchDefineE500)
                   .Default(ArchDefineNone);
         }
         return CPUKnown;
    @@ -445,27 +444,17 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
         LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
         IntMaxType = SignedLong;
         Int64Type = SignedLong;
    -    std::string DataLayout;
     
         if (Triple.isOSAIX()) {
           // TODO: Set appropriate ABI for AIX platform.
    -      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
           LongDoubleWidth = 64;
           LongDoubleAlign = DoubleAlign = 32;
           LongDoubleFormat = &llvm::APFloat::IEEEdouble();
    -    } else if ((Triple.getArch() == llvm::Triple::ppc64le)) {
    -      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
    +    } else if ((Triple.getArch() == llvm::Triple::ppc64le) ||
    +               Triple.isPPC64ELFv2ABI()) {
           ABI = "elfv2";
         } else {
    -      DataLayout = "E-m:e";
    -      if (Triple.isPPC64ELFv2ABI()) {
    -        ABI = "elfv2";
    -        DataLayout += "-Fn32";
    -      } else {
    -        ABI = "elfv1";
    -        DataLayout += "-Fi64";
    -      }
    -      DataLayout += "-i64:64-i128:128-n32:64";
    +      ABI = "elfv1";
         }
     
         if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || Triple.isMusl()) {
    @@ -473,14 +462,12 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
           LongDoubleFormat = &llvm::APFloat::IEEEdouble();
         }
     
    -    if (Triple.isOSAIX() || Triple.isOSLinux())
    -      DataLayout += "-S128-v256:256:256-v512:512:512";
    -    resetDataLayout(DataLayout);
    -
         // Newer PPC64 instruction sets support atomics up to 16 bytes.
         MaxAtomicPromoteWidth = 128;
         // Baseline PPC64 supports inlining atomics up to 8 bytes.
         MaxAtomicInlineWidth = 64;
    +
    +    calculateDataLayout();
       }
     
       void setMaxAtomicWidth() override {
    @@ -495,10 +482,33 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
         return TargetInfo::CharPtrBuiltinVaList;
       }
     
    +  void calculateDataLayout() {
    +    std::string DataLayout;
    +
    +    if (getTriple().isOSAIX()) {
    +      DataLayout = "E-m:a-Fi64-i64:64-i128:128-n32:64";
    +    } else if ((getTriple().getArch() == llvm::Triple::ppc64le)) {
    +      DataLayout = "e-m:e-Fn32-i64:64-i128:128-n32:64";
    +    } else {
    +      DataLayout = "E-m:e";
    +      if (ABI == "elfv2") {
    +        DataLayout += "-Fn32";
    +      } else {
    +        DataLayout += "-Fi64";
    +      }
    +      DataLayout += "-i64:64-i128:128-n32:64";
    +    }
    +
    +    if (getTriple().isOSAIX() || getTriple().isOSLinux())
    +      DataLayout += "-S128-v256:256:256-v512:512:512";
    +    resetDataLayout(DataLayout);
    +  }
    +
       // PPC64 Linux-specific ABI options.
       bool setABI(const std::string &Name) override {
         if (Name == "elfv1" || Name == "elfv2") {
           ABI = Name;
    +      calculateDataLayout();
           return true;
         }
         return false;
    diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
    index 85fa4cc07dccf..21555b94fe65d 100644
    --- a/clang/lib/Basic/Targets/RISCV.h
    +++ b/clang/lib/Basic/Targets/RISCV.h
    @@ -126,7 +126,7 @@ class RISCVTargetInfo : public TargetInfo {
       llvm::APInt getFMVPriority(ArrayRef<StringRef> Features) const override;
     
       std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
    -    return std::make_pair(32, 32);
    +    return std::make_pair(64, 64);
       }
     
       bool supportsCpuSupports() const override { return getTriple().isOSLinux(); }
    diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
    index 55ffe1df0ba08..5bbb7af4c2ca1 100644
    --- a/clang/lib/Basic/Targets/WebAssembly.cpp
    +++ b/clang/lib/Basic/Targets/WebAssembly.cpp
    @@ -213,6 +213,7 @@ bool WebAssemblyTargetInfo::initFeatureMap(
     
     bool WebAssemblyTargetInfo::handleTargetFeatures(
         std::vector<std::string> &Features, DiagnosticsEngine &Diags) {
    +  HasMustTail = false;
       for (const auto &Feature : Features) {
         if (Feature == "+atomics") {
           HasAtomics = true;
    @@ -345,10 +346,12 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
         }
         if (Feature == "+tail-call") {
           HasTailCall = true;
    +      HasMustTail = true;
           continue;
         }
         if (Feature == "-tail-call") {
           HasTailCall = false;
    +      HasMustTail = false;
           continue;
         }
         if (Feature == "+wide-arithmetic") {
    diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
    index e71f10c4c16fc..7a90c89dd7dc0 100644
    --- a/clang/lib/Basic/Targets/X86.cpp
    +++ b/clang/lib/Basic/Targets/X86.cpp
    @@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
           HasAMXFP8 = true;
         } else if (Feature == "+amx-movrs") {
           HasAMXMOVRS = true;
    -    } else if (Feature == "+amx-transpose") {
    -      HasAMXTRANSPOSE = true;
         } else if (Feature == "+amx-avx512") {
           HasAMXAVX512 = true;
         } else if (Feature == "+amx-tf32") {
    @@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
         Builder.defineMacro("__AMX_FP8__");
       if (HasAMXMOVRS)
         Builder.defineMacro("__AMX_MOVRS__");
    -  if (HasAMXTRANSPOSE)
    -    Builder.defineMacro("__AMX_TRANSPOSE__");
       if (HasAMXAVX512)
         Builder.defineMacro("__AMX_AVX512__");
       if (HasAMXTF32)
    @@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
           .Case("amx-movrs", true)
           .Case("amx-tf32", true)
           .Case("amx-tile", true)
    -      .Case("amx-transpose", true)
           .Case("avx", true)
           .Case("avx10.1", true)
           .Case("avx10.2", true)
    @@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
           .Case("amx-movrs", HasAMXMOVRS)
           .Case("amx-tf32", HasAMXTF32)
           .Case("amx-tile", HasAMXTILE)
    -      .Case("amx-transpose", HasAMXTRANSPOSE)
           .Case("avx", SSELevel >= AVX)
           .Case("avx10.1", HasAVX10_1)
           .Case("avx10.2", HasAVX10_2)
    diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
    index be3a473174370..e7da2622e78b5 100644
    --- a/clang/lib/Basic/Targets/X86.h
    +++ b/clang/lib/Basic/Targets/X86.h
    @@ -160,7 +160,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
       bool HasAMXCOMPLEX = false;
       bool HasAMXFP8 = false;
       bool HasAMXMOVRS = false;
    -  bool HasAMXTRANSPOSE = false;
       bool HasAMXAVX512 = false;
       bool HasAMXTF32 = false;
       bool HasSERIALIZE = false;
    diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    index 7db6e283ec0a5..cd4c1f0e5b769 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
    @@ -27,6 +27,7 @@ class AtomicInfo {
       CharUnits atomicAlign;
       CharUnits valueAlign;
       TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
    +  bool useLibCall = true;
       LValue lvalue;
       mlir::Location loc;
     
    @@ -62,8 +63,8 @@ class AtomicInfo {
           assert(!cir::MissingFeatures::atomicInfo());
           cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
         }
    -
    -    assert(!cir::MissingFeatures::atomicUseLibCall());
    +    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
    +        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
       }
     
       QualType getValueType() const { return valueTy; }
    @@ -75,6 +76,8 @@ class AtomicInfo {
         assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
         return nullptr;
       }
    +  bool shouldUseLibCall() const { return useLibCall; }
    +  const LValue &getAtomicLValue() const { return lvalue; }
       Address getAtomicAddress() const {
         mlir::Type elemTy;
         if (lvalue.isSimple()) {
    @@ -96,6 +99,8 @@ class AtomicInfo {
     
       bool emitMemSetZeroIfNecessary() const;
     
    +  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
    +
       /// Cast the given pointer to an integer pointer suitable for atomic
       /// operations on the source.
       Address castToAtomicIntPointer(Address addr) const;
    @@ -105,6 +110,9 @@ class AtomicInfo {
       /// copy the value across.
       Address convertToAtomicIntPointer(Address addr) const;
     
    +  /// Converts an rvalue to an integer value.
    +  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
    +
       /// Copy an atomic r-value into atomic-layout memory.
       void emitCopyIntoMemory(RValue rvalue) const;
     
    @@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
       return tempAlloca;
     }
     
    +mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
    +  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
    +    return rvalue.getValue();
    +  return nullptr;
    +}
    +
     Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
       auto intTy = mlir::dyn_cast<cir::IntType>(addr.getElementType());
       // Don't bother with int casts if the integer size is the same.
    @@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
         return false;
     
       cgf.cgm.errorNYI(loc,
    -                   "AtomicInfo::emitMemSetZeroIfNecaessary: emit memset zero");
    +                   "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
       return false;
     }
     
    +/// Return true if \p valueTy is a type that should be cast to an integer
    +/// around the atomic memory operation. If \p cmpxchg is true, the cast of a
    +/// floating point type is made because that instruction cannot have floating
    +/// point operands. TODO: Allow compare-and-exchange and FP - see comment in
    +/// CIRGenAtomicExpandPass.cpp.
    +static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
    +  if (cir::isAnyFloatingPointType(valueTy))
    +    return isa<cir::FP80Type>(valueTy) || cmpxchg;
    +  return !isa<cir::IntType>(valueTy) && !isa<cir::PointerType>(valueTy);
    +}
    +
    +mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
    +  // If we've got a scalar value of the right size, try to avoid going
    +  // through memory. Floats get casted if needed by AtomicExpandPass.
    +  if (mlir::Value value = getScalarRValValueOrNull(rvalue)) {
    +    if (!shouldCastToInt(value.getType(), cmpxchg))
    +      return cgf.emitToMemory(value, valueTy);
    +
    +    cgf.cgm.errorNYI(
    +        loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int");
    +    return nullptr;
    +  }
    +
    +  cgf.cgm.errorNYI(
    +      loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int");
    +  return nullptr;
    +}
    +
     /// Copy an r-value into memory as part of storing to an atomic type.
     /// This needs to create a bit-pattern suitable for atomic operations.
     void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
    @@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) {
           e->getExprLoc());
     }
     
    +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) {
    +  bool isVolatile = dest.isVolatileQualified();
    +  auto order = cir::MemOrder::SequentiallyConsistent;
    +  if (!dest.getType()->isAtomicType()) {
    +    assert(!cir::MissingFeatures::atomicMicrosoftVolatile());
    +  }
    +  return emitAtomicStore(rvalue, dest, order, isVolatile, isInit);
    +}
    +
    +/// Emit a store to an l-value of atomic type.
    +///
    +/// Note that the r-value is expected to be an r-value of the atomic type; this
    +/// means that for aggregate r-values, it should include storage for any padding
    +/// that was necessary.
    +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest,
    +                                     cir::MemOrder order, bool isVolatile,
    +                                     bool isInit) {
    +  // If this is an aggregate r-value, it should agree in type except
    +  // maybe for address-space qualification.
    +  mlir::Location loc = dest.getPointer().getLoc();
    +  assert(!rvalue.isAggregate() ||
    +         rvalue.getAggregateAddress().getElementType() ==
    +             dest.getAddress().getElementType());
    +
    +  AtomicInfo atomics(*this, dest, loc);
    +  LValue lvalue = atomics.getAtomicLValue();
    +
    +  if (lvalue.isSimple()) {
    +    // If this is an initialization, just put the value there normally.
    +    if (isInit) {
    +      atomics.emitCopyIntoMemory(rvalue);
    +      return;
    +    }
    +
    +    // Check whether we should use a library call.
    +    if (atomics.shouldUseLibCall()) {
    +      assert(!cir::MissingFeatures::atomicUseLibCall());
    +      cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call");
    +      return;
    +    }
    +
    +    // Okay, we're doing this natively.
    +    mlir::Value valueToStore = atomics.convertRValueToInt(rvalue);
    +
    +    // Do the atomic store.
    +    Address addr = atomics.getAtomicAddress();
    +    if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) {
    +      if (shouldCastToInt(value.getType(), /*cmpxchg=*/false)) {
    +        addr = atomics.castToAtomicIntPointer(addr);
    +        valueToStore =
    +            builder.createIntCast(valueToStore, addr.getElementType());
    +      }
    +    }
    +    cir::StoreOp store = builder.createStore(loc, valueToStore, addr);
    +
    +    // Initializations don't need to be atomic.
    +    if (!isInit) {
    +      assert(!cir::MissingFeatures::atomicOpenMP());
    +      store.setMemOrder(order);
    +    }
    +
    +    // Other decoration.
    +    if (isVolatile)
    +      store.setIsVolatile(true);
    +
    +    assert(!cir::MissingFeatures::opLoadStoreTbaa());
    +    return;
    +  }
    +
    +  cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue");
    +  assert(!cir::MissingFeatures::opLoadStoreAtomic());
    +}
    +
     void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) {
       AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange()));
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    index e35100ffe4b6b..4e6a5ee7ee210 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
    @@ -211,6 +211,28 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
         assert(!cir::MissingFeatures::fastMathFlags());
         return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
     
    +  case Builtin::BIceil:
    +  case Builtin::BIceilf:
    +  case Builtin::BIceill:
    +  case Builtin::BI__builtin_ceil:
    +  case Builtin::BI__builtin_ceilf:
    +  case Builtin::BI__builtin_ceilf16:
    +  case Builtin::BI__builtin_ceill:
    +  case Builtin::BI__builtin_ceilf128:
    +    assert(!cir::MissingFeatures::fastMathFlags());
    +    return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
    +
    +  case Builtin::BIexp:
    +  case Builtin::BIexpf:
    +  case Builtin::BIexpl:
    +  case Builtin::BI__builtin_exp:
    +  case Builtin::BI__builtin_expf:
    +  case Builtin::BI__builtin_expf16:
    +  case Builtin::BI__builtin_expl:
    +  case Builtin::BI__builtin_expf128:
    +    assert(!cir::MissingFeatures::fastMathFlags());
    +    return emitUnaryMaybeConstrainedFPBuiltin(*this, *e);
    +
       case Builtin::BIfabs:
       case Builtin::BIfabsf:
       case Builtin::BIfabsl:
    @@ -459,6 +481,19 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
         return emitCall(e->getCallee()->getType(), CIRGenCallee::forDirect(fnOp), e,
                         returnValue);
       }
    +  case Builtin::BI__builtin_dynamic_object_size:
    +  case Builtin::BI__builtin_object_size: {
    +    unsigned type =
    +        e->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
    +    auto resType = mlir::cast<cir::IntType>(convertType(e->getType()));
    +
    +    // We pass this builtin onto the optimizer so that it can figure out the
    +    // object size in more complex cases.
    +    bool isDynamic = builtinID == Builtin::BI__builtin_dynamic_object_size;
    +    return RValue::get(emitBuiltinObjectSize(e->getArg(0), type, resType,
    +                                             /*EmittedE=*/nullptr, isDynamic));
    +  }
    +
       case Builtin::BI__builtin_prefetch: {
         auto evaluateOperandAsInt = [&](const Expr *arg) {
           Expr::EvalResult res;
    @@ -641,3 +676,42 @@ mlir::Value CIRGenFunction::emitVAArg(VAArgExpr *ve) {
       mlir::Value vaList = emitVAListRef(ve->getSubExpr()).getPointer();
       return cir::VAArgOp::create(builder, loc, type, vaList);
     }
    +
    +mlir::Value CIRGenFunction::emitBuiltinObjectSize(const Expr *e, unsigned type,
    +                                                  cir::IntType resType,
    +                                                  mlir::Value emittedE,
    +                                                  bool isDynamic) {
    +  assert(!cir::MissingFeatures::opCallImplicitObjectSizeArgs());
    +
    +  // LLVM can't handle type=3 appropriately, and __builtin_object_size shouldn't
    +  // evaluate e for side-effects. In either case, just like original LLVM
    +  // lowering, we shouldn't lower to `cir.objsize` but to a constant instead.
    +  if (type == 3 || (!emittedE && e->HasSideEffects(getContext())))
    +    return builder.getConstInt(getLoc(e->getSourceRange()), resType,
    +                               (type & 2) ? 0 : -1);
    +
    +  mlir::Value ptr = emittedE ? emittedE : emitScalarExpr(e);
    +  assert(mlir::isa<cir::PointerType>(ptr.getType()) &&
    +         "Non-pointer passed to __builtin_object_size?");
    +
    +  assert(!cir::MissingFeatures::countedBySize());
    +
    +  // Extract the min/max mode from type. CIR only supports type 0
    +  // (max, whole object) and type 2 (min, whole object), not type 1 or 3
    +  // (closest subobject variants).
    +  const bool min = ((type & 2) != 0);
    +  // For GCC compatibility, __builtin_object_size treats NULL as unknown size.
    +  auto op =
    +      cir::ObjSizeOp::create(builder, getLoc(e->getSourceRange()), resType, ptr,
    +                             min, /*nullUnknown=*/true, isDynamic);
    +  return op.getResult();
    +}
    +
    +mlir::Value CIRGenFunction::evaluateOrEmitBuiltinObjectSize(
    +    const Expr *e, unsigned type, cir::IntType resType, mlir::Value emittedE,
    +    bool isDynamic) {
    +  uint64_t objectSize;
    +  if (!e->tryEvaluateObjectSize(objectSize, getContext(), type))
    +    return emitBuiltinObjectSize(e, type, resType, emittedE, isDynamic);
    +  return builder.getConstInt(getLoc(e->getSourceRange()), resType, objectSize);
    +}
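    A minimal C sketch of the "type" encoding handled above (illustrative only;
    __builtin_object_size is the documented Clang/GCC builtin):

        #include <stddef.h>

        size_t demo(void) {
          char buf[16];
          // Bit 1 of type selects min vs. max: type 0 = max, type 2 = min.
          size_t maxWhole = __builtin_object_size(buf, 0); // 16
          size_t minWhole = __builtin_object_size(buf, 2); // 16, or 0 if unknown
          // type 3 is never lowered to cir.objsize; per the code above it is
          // emitted as a constant when it cannot be fully evaluated.
          return maxWhole + minWhole;
        }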
    diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    index 3c9c7ecf35aff..0198a9d4eb192 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    @@ -771,14 +771,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
       case X86::BI_WriteBarrier:
       case X86::BI_AddressOfReturnAddress:
       case X86::BI__stosb:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
       case X86::BI__ud2:
       case X86::BI__int2c:
       case X86::BI__readfsbyte:
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    index 851328a7db680..437db306f3369 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp
    @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) {
     
       assert(!cir::MissingFeatures::innermostEHScope());
     
    -  EHCleanupScope *scope = new (buffer)
    -      EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup);
    +  EHCleanupScope *scope = new (buffer) EHCleanupScope(
    +      size, branchFixups.size(), innermostNormalCleanup, innermostEHScope);
     
       if (isNormalCleanup)
         innermostNormalCleanup = stable_begin();
    @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() {
     EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) {
       char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers));
       assert(!cir::MissingFeatures::innermostEHScope());
    -  EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers);
    +  EHCatchScope *scope =
    +      new (buffer) EHCatchScope(numHandlers, innermostEHScope);
    +  innermostEHScope = stable_begin();
       return scope;
     }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    index 61a09a59b05c0..a035d792ef6d1 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h
    @@ -30,6 +30,8 @@ struct CatchTypeInfo {
     
     /// A protected scope for zero-cost EH handling.
     class EHScope {
    +  EHScopeStack::stable_iterator enclosingEHScope;
    +
       class CommonBitFields {
         friend class EHScope;
         unsigned kind : 3;
    @@ -79,7 +81,10 @@ class EHScope {
     public:
       enum Kind { Cleanup, Catch, Terminate, Filter };
     
    -  EHScope(Kind kind) { commonBits.kind = kind; }
    +  EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope)
    +      : enclosingEHScope(enclosingEHScope) {
    +    commonBits.kind = kind;
    +  }
     
       Kind getKind() const { return static_cast<Kind>(commonBits.kind); }
     
    @@ -90,6 +95,10 @@ class EHScope {
         assert(!cir::MissingFeatures::ehstackBranches());
         return false;
       }
    +
    +  EHScopeStack::stable_iterator getEnclosingEHScope() const {
    +    return enclosingEHScope;
    +  }
     };
     
     /// A scope which attempts to handle some, possibly all, types of
    @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope {
     
         /// The catch handler for this type.
         mlir::Region *region;
    +
    +    bool isCatchAll() const { return type.rtti == nullptr; }
       };
     
     private:
    @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope {
     
       Handler *getHandlers() { return reinterpret_cast<Handler *>(this + 1); }
     
    +  const Handler *getHandlers() const {
    +    return reinterpret_cast<const Handler *>(this + 1);
    +  }
    +
     public:
       static size_t getSizeForNumHandlers(unsigned n) {
         return sizeof(EHCatchScope) + n * sizeof(Handler);
       }
     
    -  EHCatchScope(unsigned numHandlers) : EHScope(Catch) {
    +  EHCatchScope(unsigned numHandlers,
    +               EHScopeStack::stable_iterator enclosingEHScope)
    +      : EHScope(Catch, enclosingEHScope) {
         catchBits.numHandlers = numHandlers;
         assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?");
       }
    @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope {
         getHandlers()[i].region = region;
       }
     
    +  const Handler &getHandler(unsigned i) const {
    +    assert(i < getNumHandlers());
    +    return getHandlers()[i];
    +  }
    +
       // Clear all handler blocks.
       // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a
       // 'takeHandler' or some such function which removes ownership from the
    @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope {
         // The blocks are owned by TryOp, nothing to delete.
       }
     
    +  using iterator = const Handler *;
    +  iterator begin() const { return getHandlers(); }
    +  iterator end() const { return getHandlers() + getNumHandlers(); }
    +
       static bool classof(const EHScope *scope) {
         return scope->getKind() == Catch;
       }
    @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope
       }
     
       EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth,
    -                 EHScopeStack::stable_iterator enclosingNormal)
    -      : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal),
    -        fixupDepth(fixupDepth) {
    +                 EHScopeStack::stable_iterator enclosingNormal,
    +                 EHScopeStack::stable_iterator enclosingEH)
    +      : EHScope(EHScope::Cleanup, enclosingEH),
    +        enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) {
         // TODO(cir): When exception handling is upstreamed, isNormalCleanup and
         // isEHCleanup will be arguments to the constructor.
         cleanupBits.isNormalCleanup = true;
    @@ -235,13 +262,45 @@ class EHScopeStack::iterator {
     
       EHScope *get() const { return reinterpret_cast<EHScope *>(ptr); }
     
    +  EHScope *operator->() const { return get(); }
       EHScope &operator*() const { return *get(); }
    +
    +  iterator &operator++() {
    +    size_t size;
    +    switch (get()->getKind()) {
    +    case EHScope::Catch:
    +      size = EHCatchScope::getSizeForNumHandlers(
    +          static_cast<const EHCatchScope *>(get())->getNumHandlers());
    +      break;
    +
    +    case EHScope::Filter:
    +      llvm_unreachable("EHScopeStack::iterator Filter");
    +      break;
    +
    +    case EHScope::Cleanup:
    +      llvm_unreachable("EHScopeStack::iterator Cleanup");
    +      break;
    +
    +    case EHScope::Terminate:
    +      llvm_unreachable("EHScopeStack::iterator Terminate");
    +      break;
    +    }
    +    ptr += llvm::alignTo(size, ScopeStackAlignment);
    +    return *this;
    +  }
    +
    +  bool operator==(iterator other) const { return ptr == other.ptr; }
    +  bool operator!=(iterator other) const { return ptr != other.ptr; }
     };
     
     inline EHScopeStack::iterator EHScopeStack::begin() const {
       return iterator(startOfData);
     }
     
    +inline EHScopeStack::iterator EHScopeStack::end() const {
    +  return iterator(endOfBuffer);
    +}
    +
     inline EHScopeStack::iterator
     EHScopeStack::find(stable_iterator savePoint) const {
       assert(savePoint.isValid() && "finding invalid savepoint");
    @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() {
       assert(!empty() && "popping exception stack when not empty");
     
       EHCatchScope &scope = llvm::cast(*begin());
    -  assert(!cir::MissingFeatures::innermostEHScope());
    +  innermostEHScope = scope.getEnclosingEHScope();
       deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers()));
     }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    index 930ae55405756..05fb1aedcbf4a 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp
    @@ -13,6 +13,7 @@
     #include "CIRGenFunction.h"
     #include "mlir/Support/LLVM.h"
     #include "clang/AST/StmtCXX.h"
    +#include "clang/AST/StmtVisitor.h"
     #include "clang/Basic/TargetInfo.h"
     #include "clang/CIR/Dialect/IR/CIRTypes.h"
     #include "clang/CIR/MissingFeatures.h"
    @@ -33,6 +34,65 @@ struct clang::CIRGen::CGCoroData {
     CIRGenFunction::CGCoroInfo::CGCoroInfo() {}
     CIRGenFunction::CGCoroInfo::~CGCoroInfo() {}
     
    +namespace {
    +// FIXME: both GetParamRef and ParamReferenceReplacerRAII are good template
    +// candidates to be shared among LLVM / CIR codegen.
    +
    +// Hunts for the parameter reference in the parameter copy/move declaration.
    +struct GetParamRef : public StmtVisitor<GetParamRef> {
    +public:
    +  DeclRefExpr *expr = nullptr;
    +  GetParamRef() {}
    +  void VisitDeclRefExpr(DeclRefExpr *e) {
    +    assert(expr == nullptr && "multiple declref in param move");
    +    expr = e;
    +  }
    +  void VisitStmt(Stmt *s) {
    +    for (Stmt *c : s->children()) {
    +      if (c)
    +        Visit(c);
    +    }
    +  }
    +};
    +
    +// This class replaces references to parameters with references to their
    +// copies by changing the addresses in CGF.LocalDeclMap, and restores the
    +// original values in its destructor.
    +struct ParamReferenceReplacerRAII {
    +  CIRGenFunction::DeclMapTy savedLocals;
    +  CIRGenFunction::DeclMapTy &localDeclMap;
    +
    +  ParamReferenceReplacerRAII(CIRGenFunction::DeclMapTy &localDeclMap)
    +      : localDeclMap(localDeclMap) {}
    +
    +  void addCopy(const DeclStmt *pm) {
    +    // Figure out what param it refers to.
    +
    +    assert(pm->isSingleDecl());
    +    const VarDecl *vd = static_cast<const VarDecl *>(pm->getSingleDecl());
    +    const Expr *initExpr = vd->getInit();
    +    GetParamRef visitor;
    +    visitor.Visit(const_cast<Expr *>(initExpr));
    +    assert(visitor.expr);
    +    DeclRefExpr *dreOrig = visitor.expr;
    +    auto *pd = dreOrig->getDecl();
    +
    +    auto it = localDeclMap.find(pd);
    +    assert(it != localDeclMap.end() && "parameter is not found");
    +    savedLocals.insert({pd, it->second});
    +
    +    auto copyIt = localDeclMap.find(vd);
    +    assert(copyIt != localDeclMap.end() && "parameter copy is not found");
    +    it->second = copyIt->getSecond();
    +  }
    +
    +  ~ParamReferenceReplacerRAII() {
    +    for (auto &&savedLocal : savedLocals) {
    +      localDeclMap.insert({savedLocal.first, savedLocal.second});
    +    }
    +  }
    +};
    +} // namespace
     static void createCoroData(CIRGenFunction &cgf,
                                CIRGenFunction::CGCoroInfo &curCoro,
                                cir::CallOp coroId) {
    @@ -149,7 +209,47 @@ CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) {
       if (s.getReturnStmtOnAllocFailure())
         cgm.errorNYI("handle coroutine return alloc failure");
     
    -  assert(!cir::MissingFeatures::generateDebugInfo());
    -  assert(!cir::MissingFeatures::emitBodyAndFallthrough());
    +  {
    +    assert(!cir::MissingFeatures::generateDebugInfo());
    +    ParamReferenceReplacerRAII paramReplacer(localDeclMap);
    +    // Create mapping between parameters and copy-params for coroutine
    +    // function.
    +    llvm::ArrayRef<const Stmt *> paramMoves = s.getParamMoves();
    +    assert((paramMoves.size() == 0 || (paramMoves.size() == fnArgs.size())) &&
    +           "ParamMoves and FnArgs should be the same size for coroutine "
    +           "function");
    +    // For zipping the arg map into debug info.
    +    assert(!cir::MissingFeatures::generateDebugInfo());
    +
    +    // Create parameter copies. We do it before creating a promise, since an
    +    // evolution of coroutine TS may allow promise constructor to observe
    +    // parameter copies.
    +    assert(!cir::MissingFeatures::coroOutsideFrameMD());
    +    for (auto *pm : paramMoves) {
    +      if (emitStmt(pm, /*useCurrentScope=*/true).failed())
    +        return mlir::failure();
    +      paramReplacer.addCopy(cast<DeclStmt>(pm));
    +    }
    +
    +    if (emitStmt(s.getPromiseDeclStmt(), /*useCurrentScope=*/true).failed())
    +      return mlir::failure();
    +    // returnValue should be valid as long as the coroutine's return type
    +    // is not void. The assertion could help us to reduce the check later.
    +    assert(returnValue.isValid() == (bool)s.getReturnStmt());
    +    // Now we have the promise, initialize the GRO.
    +    // We need to emit `get_return_object` first. According to:
    +    // [dcl.fct.def.coroutine]p7
    +    // The call to get_return_object is sequenced before the call to
    +    // initial_suspend and is invoked at most once.
    +    //
    +    // So we can't emit the return value when we emit the return statement;
    +    // otherwise the call to get_return_object wouldn't come before the
    +    // call to initial_suspend.
    +    if (returnValue.isValid())
    +      emitAnyExprToMem(s.getReturnValue(), returnValue,
    +                       s.getReturnValue()->getType().getQualifiers(),
    +                       /*isInit*/ true);
    +    assert(!cir::MissingFeatures::emitBodyAndFallthrough());
    +  }
       return mlir::success();
     }
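    Illustrative C++20 input for the param-move handling above; the Task type
    below is a stand-in promise/return type, not something the patch defines:

        #include <coroutine>
        #include <string>

        struct Task {
          struct promise_type {
            Task get_return_object() { return {}; }
            std::suspend_never initial_suspend() { return {}; }
            std::suspend_never final_suspend() noexcept { return {}; }
            void return_void() {}
            void unhandled_exception() {}
          };
        };

        // The by-value parameter 's' is copied into the coroutine frame; the
        // code above emits that copy before constructing the promise, and
        // ParamReferenceReplacerRAII redirects uses of 's' to the copy.
        Task echo(std::string s) { co_return; }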
    diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    index aeea0efeb77c3..325875d10d6ea 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
    @@ -50,6 +50,41 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d,
     
       Address address = Address::invalid();
       if (ty->isConstantSizeType()) {
    +    // If this value is an array, struct, or vector with a statically
    +    // determinable constant initializer, there are optimizations we can do.
    +    //
    +    // TODO: We should constant-evaluate the initializer of any variable,
    +    // as long as it is initialized by a constant expression. Currently,
    +    // isConstantInitializer produces wrong answers for structs with
    +    // reference or bitfield members, and a few other cases, and checking
    +    // for POD-ness protects us from some of these.
    +    if (d.getInit() &&
    +        (ty->isArrayType() || ty->isRecordType() || ty->isVectorType()) &&
    +        (d.isConstexpr() ||
    +         ((ty.isPODType(getContext()) ||
    +           getContext().getBaseElementType(ty)->isObjCObjectPointerType()) &&
    +          d.getInit()->isConstantInitializer(getContext(), false)))) {
    +
    +      // If the variable's a const type, and it's neither an NRVO
    +      // candidate nor a __block variable and has no mutable members,
    +      // emit it as a global instead.
    +      // The exception is a variable located in a non-constant address
    +      // space in OpenCL.
    +      // TODO(cir): perhaps we don't need this at all at CIR since this can
    +      // be done as part of lowering down to LLVM.
    +      bool needsDtor =
    +          d.needsDestruction(getContext()) == QualType::DK_cxx_destructor;
    +      if ((!getContext().getLangOpts().OpenCL ||
    +           ty.getAddressSpace() == LangAS::opencl_constant) &&
    +          (cgm.getCodeGenOpts().MergeAllConstants && !nrvo &&
    +           !d.isEscapingByref() &&
    +           ty.isConstantStorage(getContext(), true, !needsDtor))) {
    +        cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: type constant");
    +      }
    +      // Otherwise, tell the initialization code that we're in this case.
    +      emission.isConstantAggregate = true;
    +    }
    +
         // A normal fixed sized variable becomes an alloca in the entry block,
         // unless:
         // - it's an NRVO variable.
    @@ -131,6 +166,47 @@ bool CIRGenFunction::isTrivialInitializer(const Expr *init) {
       return false;
     }
     
    +static void emitStoresForConstant(CIRGenModule &cgm, const VarDecl &d,
    +                                  Address addr, bool isVolatile,
    +                                  CIRGenBuilderTy &builder,
    +                                  mlir::TypedAttr constant) {
    +  mlir::Type ty = constant.getType();
    +  cir::CIRDataLayout layout{cgm.getModule()};
    +  uint64_t constantSize = layout.getTypeAllocSize(ty);
    +  if (!constantSize)
    +    return;
    +  assert(!cir::MissingFeatures::addAutoInitAnnotation());
    +  assert(!cir::MissingFeatures::vectorConstants());
    +  assert(!cir::MissingFeatures::shouldUseBZeroPlusStoresToInitialize());
    +  assert(!cir::MissingFeatures::shouldUseMemSetToInitialize());
    +  assert(!cir::MissingFeatures::shouldSplitConstantStore());
    +  assert(!cir::MissingFeatures::shouldCreateMemCpyFromGlobal());
    +  // In CIR we want to emit a store for the whole thing; the later
    +  // lowering-prepare-to-LLVM stage should unwrap this into the best
    +  // policy (see the asserts above).
    +  //
    +  // FIXME(cir): This is closer to memcpy behavior but less optimal, instead of
    +  // copy from a global, we just create a cir.const out of it.
    +
    +  if (addr.getElementType() != ty)
    +    addr = addr.withElementType(builder, ty);
    +
    +  // If the address is an alloca, set the init attribute.
    +  // The address is usually an alloca, but there is at least one case where
    +  // emitAutoVarInit is called from the OpenACC codegen with an address that
    +  // is not an alloca.
    +  auto allocaOp = addr.getDefiningOp<cir::AllocaOp>();
    +  if (allocaOp)
    +    allocaOp.setInitAttr(mlir::UnitAttr::get(&cgm.getMLIRContext()));
    +
    +  // There are cases where OpenACC codegen calls emitAutoVarInit with a
    +  // temporary decl that doesn't have a source range set.
    +  mlir::Location loc = builder.getUnknownLoc();
    +  if (d.getSourceRange().isValid())
    +    loc = cgm.getLoc(d.getSourceRange());
    +  builder.createStore(loc, builder.getConstant(loc, constant), addr);
    +}
    +
     void CIRGenFunction::emitAutoVarInit(
         const CIRGenFunction::AutoVarEmission &emission) {
       assert(emission.variable && "emission was not valid!");
    @@ -237,6 +313,9 @@ void CIRGenFunction::emitAutoVarInit(
         return emitStoreThroughLValue(
             RValue::get(builder.getConstant(initLoc, typedConstant)), lv);
       }
    +
    +  emitStoresForConstant(cgm, d, addr, type.isVolatileQualified(), builder,
    +                        typedConstant);
     }
     
     void CIRGenFunction::emitAutoVarCleanups(
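    Illustrative C input for the emitStoresForConstant path above: the local
    constant aggregate becomes a single cir.const plus one cir.store, with the
    lowering stage left to pick memset/memcpy-style expansions:

        void init_table(void) {
          const int table[4] = {1, 2, 3, 4}; // one store of the whole constant
          (void)table;
        }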
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    index 5ccb431e626ae..9bb76894c13f1 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
     
     void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
                                            bool isVolatile, QualType ty,
    -                                       bool isInit, bool isNontemporal) {
    +                                       LValueBaseInfo baseInfo, bool isInit,
    +                                       bool isNontemporal) {
       assert(!cir::MissingFeatures::opLoadStoreThreadLocal());
     
       if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) {
    @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
     
       value = emitToMemory(value, ty);
     
    -  assert(!cir::MissingFeatures::opLoadStoreAtomic());
    +  assert(!cir::MissingFeatures::opLoadStoreTbaa());
    +  LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo);
    +  if (ty->isAtomicType() ||
    +      (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) {
    +    emitAtomicStore(RValue::get(value), atomicLValue, isInit);
    +    return;
    +  }
     
       // Update the alloca with more info on initialization.
       assert(addr.getPointer() && "expected pointer to exist");
    @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue,
       }
     
       emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(),
    -                    lvalue.getType(), isInit, /*isNontemporal=*/false);
    +                    lvalue.getType(), lvalue.getBaseInfo(), isInit,
    +                    /*isNontemporal=*/false);
     }
     
     mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile,
    @@ -1630,7 +1638,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot,
                                        bool ignoreResult) {
       switch (CIRGenFunction::getEvaluationKind(e->getType())) {
       case cir::TEK_Scalar:
    -    return RValue::get(emitScalarExpr(e));
    +    return RValue::get(emitScalarExpr(e, ignoreResult));
       case cir::TEK_Complex:
         return RValue::getComplex(emitComplexExpr(e));
       case cir::TEK_Aggregate: {
    @@ -2126,79 +2134,6 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce,
           ce, md, returnValue, hasQualifier, qualifier, isArrow, base);
     }
     
    -void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
    -                                          AggValueSlot dest) {
    -  assert(!dest.isIgnored() && "Must have a destination!");
    -  const CXXConstructorDecl *cd = e->getConstructor();
    -
    -  // If we require zero initialization before (or instead of) calling the
    -  // constructor, as can be the case with a non-user-provided default
    -  // constructor, emit the zero initialization now, unless destination is
    -  // already zeroed.
    -  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
    -    switch (e->getConstructionKind()) {
    -    case CXXConstructionKind::Delegating:
    -    case CXXConstructionKind::Complete:
    -      emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(),
    -                             e->getType());
    -      break;
    -    case CXXConstructionKind::VirtualBase:
    -    case CXXConstructionKind::NonVirtualBase:
    -      cgm.errorNYI(e->getSourceRange(),
    -                   "emitCXXConstructExpr: base requires initialization");
    -      break;
    -    }
    -  }
    -
    -  // If this is a call to a trivial default constructor, do nothing.
    -  if (cd->isTrivial() && cd->isDefaultConstructor())
    -    return;
    -
    -  // Elide the constructor if we're constructing from a temporary
    -  if (getLangOpts().ElideConstructors && e->isElidable()) {
    -    // FIXME: This only handles the simplest case, where the source object is
    -    //        passed directly as the first argument to the constructor. This
    -    //        should also handle stepping through implicit casts and conversion
    -    //        sequences which involve two steps, with a conversion operator
    -    //        follwed by a converting constructor.
    -    const Expr *srcObj = e->getArg(0);
    -    assert(srcObj->isTemporaryObject(getContext(), cd->getParent()));
    -    assert(
    -        getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType()));
    -    emitAggExpr(srcObj, dest);
    -    return;
    -  }
    -
    -  if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) {
    -    assert(!cir::MissingFeatures::sanitizers());
    -    emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false);
    -  } else {
    -
    -    clang::CXXCtorType type = Ctor_Complete;
    -    bool forVirtualBase = false;
    -    bool delegating = false;
    -
    -    switch (e->getConstructionKind()) {
    -    case CXXConstructionKind::Complete:
    -      type = Ctor_Complete;
    -      break;
    -    case CXXConstructionKind::Delegating:
    -      // We should be emitting a constructor; GlobalDecl will assert this
    -      type = curGD.getCtorType();
    -      delegating = true;
    -      break;
    -    case CXXConstructionKind::VirtualBase:
    -      forVirtualBase = true;
    -      [[fallthrough]];
    -    case CXXConstructionKind::NonVirtualBase:
    -      type = Ctor_Base;
    -      break;
    -    }
    -
    -    emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
    -  }
    -}
    -
     RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) {
       // Emit the expression as an lvalue.
       LValue lv = emitLValue(e);
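    Illustrative C11 input for the emitStoreOfScalar change above: a store
    through an _Atomic lvalue is now routed to emitAtomicStore instead of
    being emitted as a plain cir.store:

        #include <stdatomic.h>

        void set_flag(_Atomic int *p) {
          *p = 1; // atomic store, not a plain store
        }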
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    index 3d3030ca87e2a..dcded94b012f4 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
    @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
         cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr");
       }
       void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) {
    -    cgf.cgm.errorNYI(dae->getSourceRange(),
    -                     "AggExprEmitter: VisitCXXDefaultArgExpr");
    +    CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae);
    +    Visit(dae->getExpr());
       }
       void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) {
         cgf.cgm.errorNYI(e->getSourceRange(),
    @@ -779,8 +779,8 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr(
         Expr *e, ArrayRef args, FieldDecl *initializedFieldInUnion,
         Expr *arrayFiller) {
     
    -  const AggValueSlot dest =
    -      ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType());
    +  const mlir::Location loc = cgf.getLoc(e->getSourceRange());
    +  const AggValueSlot dest = ensureSlot(loc, e->getType());
     
       if (e->getType()->isConstantArrayType()) {
         cir::ArrayType arrayTy =
    @@ -819,10 +819,23 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr(
       if (auto *cxxrd = dyn_cast<CXXRecordDecl>(record)) {
         assert(numInitElements >= cxxrd->getNumBases() &&
                "missing initializer for base class");
    -    if (cxxrd->getNumBases() > 0) {
    -      cgf.cgm.errorNYI(e->getSourceRange(),
    -                       "visitCXXParenListOrInitListExpr base class init");
    -      return;
    +    for (auto &base : cxxrd->bases()) {
    +      assert(!base.isVirtual() && "should not see vbases here");
    +      CXXRecordDecl *baseRD = base.getType()->getAsCXXRecordDecl();
    +      Address address = cgf.getAddressOfDirectBaseInCompleteClass(
    +          loc, dest.getAddress(), cxxrd, baseRD,
    +          /*baseIsVirtual=*/false);
    +      assert(!cir::MissingFeatures::aggValueSlotGC());
    +      AggValueSlot aggSlot = AggValueSlot::forAddr(
    +          address, Qualifiers(), AggValueSlot::IsDestructed,
    +          AggValueSlot::IsNotAliased,
    +          cgf.getOverlapForBaseInit(cxxrd, baseRD, false));
    +      cgf.emitAggExpr(args[curInitIndex++], aggSlot);
    +      if (base.getType().isDestructedType()) {
    +        cgf.cgm.errorNYI(e->getSourceRange(),
    +                         "push deferred deactivation cleanup");
    +        return;
    +      }
         }
       }
     
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    index 9dd9b6d550763..ac126965a95a5 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    @@ -234,6 +234,89 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall(
       return emitCall(fnInfo, callee, returnValue, args, nullptr, loc);
     }
     
    +static void emitNullBaseClassInitialization(CIRGenFunction &cgf,
    +                                            Address destPtr,
    +                                            const CXXRecordDecl *base) {
    +  if (base->isEmpty())
    +    return;
    +
    +  cgf.cgm.errorNYI(base->getSourceRange(),
    +                   "emitNullBaseClassInitialization: not empty");
    +}
    +
    +void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
    +                                          AggValueSlot dest) {
    +  assert(!dest.isIgnored() && "Must have a destination!");
    +  const CXXConstructorDecl *cd = e->getConstructor();
    +
    +  // If we require zero initialization before (or instead of) calling the
    +  // constructor, as can be the case with a non-user-provided default
    +  // constructor, emit the zero initialization now, unless destination is
    +  // already zeroed.
    +  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
    +    switch (e->getConstructionKind()) {
    +    case CXXConstructionKind::Delegating:
    +    case CXXConstructionKind::Complete:
    +      emitNullInitialization(getLoc(e->getSourceRange()), dest.getAddress(),
    +                             e->getType());
    +      break;
    +    case CXXConstructionKind::VirtualBase:
    +    case CXXConstructionKind::NonVirtualBase:
    +      emitNullBaseClassInitialization(*this, dest.getAddress(),
    +                                      cd->getParent());
    +      break;
    +    }
    +  }
    +
    +  // If this is a call to a trivial default constructor, do nothing.
    +  if (cd->isTrivial() && cd->isDefaultConstructor())
    +    return;
    +
    +  // Elide the constructor if we're constructing from a temporary
    +  if (getLangOpts().ElideConstructors && e->isElidable()) {
    +    // FIXME: This only handles the simplest case, where the source object is
    +    //        passed directly as the first argument to the constructor. This
    +    //        should also handle stepping through implicit casts and conversion
    +    //        sequences which involve two steps, with a conversion operator
    +    //        followed by a converting constructor.
    +    const Expr *srcObj = e->getArg(0);
    +    assert(srcObj->isTemporaryObject(getContext(), cd->getParent()));
    +    assert(
    +        getContext().hasSameUnqualifiedType(e->getType(), srcObj->getType()));
    +    emitAggExpr(srcObj, dest);
    +    return;
    +  }
    +
    +  if (const ArrayType *arrayType = getContext().getAsArrayType(e->getType())) {
    +    assert(!cir::MissingFeatures::sanitizers());
    +    emitCXXAggrConstructorCall(cd, arrayType, dest.getAddress(), e, false);
    +  } else {
    +
    +    clang::CXXCtorType type = Ctor_Complete;
    +    bool forVirtualBase = false;
    +    bool delegating = false;
    +
    +    switch (e->getConstructionKind()) {
    +    case CXXConstructionKind::Complete:
    +      type = Ctor_Complete;
    +      break;
    +    case CXXConstructionKind::Delegating:
    +      // We should be emitting a constructor; GlobalDecl will assert this
    +      type = curGD.getCtorType();
    +      delegating = true;
    +      break;
    +    case CXXConstructionKind::VirtualBase:
    +      forVirtualBase = true;
    +      [[fallthrough]];
    +    case CXXConstructionKind::NonVirtualBase:
    +      type = Ctor_Base;
    +      break;
    +    }
    +
    +    emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
    +  }
    +}
    +
     static CharUnits calculateCookiePadding(CIRGenFunction &cgf,
                                             const CXXNewExpr *e) {
       if (!e->isArray())
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    index 047f3599eed03..9ed920085c8c6 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
    @@ -339,7 +339,7 @@ mlir::Value ComplexExprEmitter::emitLoadOfLValue(LValue lv,
         cgf.cgm.errorNYI(loc, "emitLoadOfLValue with Atomic LV");
     
       const Address srcAddr = lv.getAddress();
    -  return builder.createLoad(cgf.getLoc(loc), srcAddr);
    +  return builder.createLoad(cgf.getLoc(loc), srcAddr, lv.isVolatileQualified());
     }
     
     /// EmitStoreOfComplex - Store the specified real/imag parts into the
    @@ -353,7 +353,7 @@ void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
       }
     
       const Address destAddr = lv.getAddress();
    -  builder.createStore(loc, val, destAddr);
    +  builder.createStore(loc, val, destAddr, lv.isVolatileQualified());
     }
     
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    index 119314fe27dce..4461875fcf678 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
    @@ -78,11 +78,15 @@ struct BinOpInfo {
     class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
       CIRGenFunction &cgf;
       CIRGenBuilderTy &builder;
    +  // Unlike classic codegen, we set this to false or use std::exchange to
    +  // read the value, instead of calling TestAndClearIgnoreResultAssign, to
    +  // make it explicit when the value is used.
       bool ignoreResultAssign;
     
     public:
    -  ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder)
    -      : cgf(cgf), builder(builder) {}
    +  ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
    +                    bool ignoreResultAssign = false)
    +      : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {}
     
       //===--------------------------------------------------------------------===//
       //                               Utilities
    @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
       }
     
       mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) {
    +    ignoreResultAssign = false;
    +
         if (e->getBase()->getType()->isVectorType()) {
           assert(!cir::MissingFeatures::scalableVectors());
     
    @@ -432,6 +438,10 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
         return cgf.emitVAArg(ve);
       }
     
    +  mlir::Value VisitCXXRewrittenBinaryOperator(CXXRewrittenBinaryOperator *e) {
    +    return Visit(e->getSemanticForm());
    +  }
    +
       mlir::Value VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *e);
       mlir::Value
       VisitAbstractConditionalOperator(const AbstractConditionalOperator *e);
    @@ -839,6 +849,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     
       BinOpInfo emitBinOps(const BinaryOperator *e,
                            QualType promotionType = QualType()) {
    +    ignoreResultAssign = false;
         BinOpInfo result;
         result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType);
         result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType);
    @@ -924,6 +935,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     #undef HANDLEBINOP
     
       mlir::Value emitCmp(const BinaryOperator *e) {
    +    ignoreResultAssign = false;
         const mlir::Location loc = cgf.getLoc(e->getExprLoc());
         mlir::Value result;
         QualType lhsTy = e->getLHS()->getType();
    @@ -1406,11 +1418,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) {
     }
     
     /// Emit the computation of the specified expression of scalar type.
    -mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) {
    +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e,
    +                                           bool ignoreResultAssign) {
       assert(e && hasScalarEvaluationKind(e->getType()) &&
              "Invalid scalar expression to emit");
     
    -  return ScalarExprEmitter(*this, builder).Visit(const_cast<Expr *>(e));
    +  return ScalarExprEmitter(*this, builder, ignoreResultAssign)
    +      .Visit(const_cast<Expr *>(e));
     }
     
     mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e,
    @@ -1919,6 +1933,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
         return builder.createIntToPtr(middleVal, destCIRTy);
       }
     
    +  case CK_UncheckedDerivedToBase:
    +  case CK_DerivedToBase: {
    +    // The EmitPointerWithAlignment path does this fine; just discard
    +    // the alignment.
    +    return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce),
    +                                     ce->getType()->getPointeeType());
    +  }
    +
       case CK_Dynamic: {
         Address v = cgf.emitPointerWithAlignment(subExpr);
       const auto *dce = cast<CXXDynamicCastExpr>(ce);
    @@ -2054,6 +2076,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) {
     mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) {
       const unsigned numInitElements = e->getNumInits();
     
    +  [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false);
    +  assert((ignore == false ||
    +          (numInitElements == 0 && e->getType()->isVoidType())) &&
    +         "init list ignored");
    +
       if (e->hadArrayRangeDesignator()) {
         cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator");
         return {};
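    Illustrative C++ for the new CK_DerivedToBase handling in VisitCastExpr:
    the implicit pointer upcast is emitted via emitPointerWithAlignment and
    the alignment is then discarded:

        struct B { int b; };
        struct D : B { int d; };
        B *upcast(D *p) { return p; } // implicit CK_DerivedToBase cast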
    diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    index 71ff20a3b0e43..cc75acc18c211 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
    @@ -242,12 +242,19 @@ void CIRGenFunction::LexicalScope::cleanup() {
         }
       };
     
    -  if (returnBlock != nullptr) {
    -    // Write out the return block, which loads the value from `__retval` and
    -    // issues the `cir.return`.
    +  // Cleanups are done right before codegen resumes a scope. This is where
    +  // objects are destroyed. Process all of this scope's return blocks.
    +  // TODO(cir): Handle returning from a switch statement through a cleanup
    +  // block. We can't simply jump to the cleanup block, because the cleanup block
    +  // is not part of the case region. Either reemit all cleanups in the return
    +  // block or wait for MLIR structured control flow to support early exits.
    +  llvm::SmallVector<mlir::Block *> retBlocks;
    +  for (mlir::Block *retBlock : localScope->getRetBlocks()) {
         mlir::OpBuilder::InsertionGuard guard(builder);
    -    builder.setInsertionPointToEnd(returnBlock);
    -    (void)emitReturn(*returnLoc);
    +    builder.setInsertionPointToEnd(retBlock);
    +    retBlocks.push_back(retBlock);
    +    mlir::Location retLoc = localScope->getRetLoc(retBlock);
    +    emitReturn(retLoc);
       }
     
       auto insertCleanupAndLeave = [&](mlir::Block *insPt) {
    @@ -274,19 +281,22 @@ void CIRGenFunction::LexicalScope::cleanup() {
     
         if (localScope->depth == 0) {
           // Reached the end of the function.
    -      if (returnBlock != nullptr) {
    -        if (returnBlock->getUses().empty()) {
    -          returnBlock->erase();
    +      // Special handling only for single return block case
    +      if (localScope->getRetBlocks().size() == 1) {
    +        mlir::Block *retBlock = localScope->getRetBlocks()[0];
    +        mlir::Location retLoc = localScope->getRetLoc(retBlock);
    +        if (retBlock->getUses().empty()) {
    +          retBlock->erase();
             } else {
               // Thread return block via cleanup block.
               if (cleanupBlock) {
    -            for (mlir::BlockOperand &blockUse : returnBlock->getUses()) {
    +            for (mlir::BlockOperand &blockUse : retBlock->getUses()) {
                       cir::BrOp brOp = mlir::cast<cir::BrOp>(blockUse.getOwner());
                   brOp.setSuccessor(cleanupBlock);
                 }
               }
     
    -          cir::BrOp::create(builder, *returnLoc, returnBlock);
    +          cir::BrOp::create(builder, retLoc, retBlock);
               return;
             }
           }
    @@ -324,8 +334,10 @@ void CIRGenFunction::LexicalScope::cleanup() {
       bool entryBlock = builder.getInsertionBlock()->isEntryBlock();
       if (!entryBlock && curBlock->empty()) {
         curBlock->erase();
    -    if (returnBlock != nullptr && returnBlock->getUses().empty())
    -      returnBlock->erase();
    +    for (mlir::Block *retBlock : retBlocks) {
    +      if (retBlock->getUses().empty())
    +        retBlock->erase();
    +    }
         return;
       }
     
    @@ -620,6 +632,10 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
     
         startFunction(gd, retTy, fn, funcType, args, loc, bodyRange.getBegin());
     
    +    // Save parameters for coroutine function.
    +    if (body && isa_and_nonnull<CoroutineBodyStmt>(body))
    +      llvm::append_range(fnArgs, funcDecl->parameters());
    +
         if (isa(funcDecl)) {
           emitDestructorBody(args);
         } else if (isa(funcDecl)) {
    diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
    index c3fcd1a69a88e..b71a28c54dbef 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
    +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
    @@ -152,6 +152,9 @@ class CIRGenFunction : public CIRGenTypeCache {
       /// global initializers.
       mlir::Operation *curFn = nullptr;
     
    +  /// Saved parameter declarations for coroutines.
    +  llvm::SmallVector<const ParmVarDecl *> fnArgs;
    +
       using DeclMapTy = llvm::DenseMap<const clang::Decl *, Address>;
       /// This keeps track of the CIR allocas or globals for local C
       /// declarations.
    @@ -497,6 +500,12 @@ class CIRGenFunction : public CIRGenTypeCache {
       VlaSizePair getVLASize(const VariableArrayType *type);
       VlaSizePair getVLASize(QualType type);
     
    +  Address getAsNaturalAddressOf(Address addr, QualType pointeeTy);
    +
    +  mlir::Value getAsNaturalPointerTo(Address addr, QualType pointeeType) {
    +    return getAsNaturalAddressOf(addr, pointeeType).getBasePointer();
    +  }
    +
       void finishFunction(SourceLocation endLoc);
     
       /// Determine whether the given initializer is trivial in the sense
    @@ -1103,44 +1112,69 @@ class CIRGenFunction : public CIRGenTypeCache {
         // ---
     
       private:
    -    // `returnBlock`, `returnLoc`, and all the functions that deal with them
    -    // will change and become more complicated when `switch` statements are
    -    // upstreamed.  `case` statements within the `switch` are in the same scope
    -    // but have their own regions.  Therefore the LexicalScope will need to
    -    // keep track of multiple return blocks.
    -    mlir::Block *returnBlock = nullptr;
    -    std::optional<mlir::Location> returnLoc;
    -
    -    // See the comment on `getOrCreateRetBlock`.
    +    // On switches we need one return block per region, since cases don't
    +    // have their own scopes but are distinct regions nonetheless.
    +
    +    // TODO: This implementation should change once we have support for early
    +    //       exits in MLIR structured control flow (llvm-project#161575)
    +    llvm::SmallVector<mlir::Block *> retBlocks;
    +    llvm::DenseMap<mlir::Block *, mlir::Location> retLocs;
    +    llvm::DenseMap<cir::CaseOp, size_t> retBlockInCaseIndex;
    +    std::optional<size_t> normalRetBlockIndex;
    +
    +    // There's usually only one return block per scope, but this needs to
    +    // be get-or-create because of potentially unreachable return
    +    // statements; note that for those, all source locations map to the
    +    // first block found.
         mlir::Block *createRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
    -      assert(returnBlock == nullptr && "only one return block per scope");
    -      // Create the cleanup block but don't hook it up just yet.
    +      assert((isa_and_nonnull<cir::CaseOp>(
    +                  cgf.builder.getBlock()->getParentOp()) ||
    +              retBlocks.size() == 0) &&
    +             "only switches can hold more than one ret block");
    +
    +      // Create the return block but don't hook it up just yet.
           mlir::OpBuilder::InsertionGuard guard(cgf.builder);
    -      returnBlock =
    -          cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
    -      updateRetLoc(returnBlock, loc);
    -      return returnBlock;
    +      auto *b = cgf.builder.createBlock(cgf.builder.getBlock()->getParent());
    +      retBlocks.push_back(b);
    +      updateRetLoc(b, loc);
    +      return b;
         }
     
         cir::ReturnOp emitReturn(mlir::Location loc);
         void emitImplicitReturn();
     
       public:
    -    mlir::Block *getRetBlock() { return returnBlock; }
    -    mlir::Location getRetLoc(mlir::Block *b) { return *returnLoc; }
    -    void updateRetLoc(mlir::Block *b, mlir::Location loc) { returnLoc = loc; }
    -
    -    // Create the return block for this scope, or return the existing one.
    -    // This get-or-create logic is necessary to handle multiple return
    -    // statements within the same scope, which can happen if some of them are
    -    // dead code or if there is a `goto` into the middle of the scope.
    +    llvm::ArrayRef<mlir::Block *> getRetBlocks() { return retBlocks; }
    +    mlir::Location getRetLoc(mlir::Block *b) { return retLocs.at(b); }
    +    void updateRetLoc(mlir::Block *b, mlir::Location loc) {
    +      retLocs.insert_or_assign(b, loc);
    +    }
    +
         mlir::Block *getOrCreateRetBlock(CIRGenFunction &cgf, mlir::Location loc) {
    -      if (returnBlock == nullptr) {
    -        returnBlock = createRetBlock(cgf, loc);
    -        return returnBlock;
    +      // Check if we're inside a case region
    +      if (auto caseOp = mlir::dyn_cast_if_present<cir::CaseOp>(
    +              cgf.builder.getBlock()->getParentOp())) {
    +        auto iter = retBlockInCaseIndex.find(caseOp);
    +        if (iter != retBlockInCaseIndex.end()) {
    +          // Reuse existing return block
    +          mlir::Block *ret = retBlocks[iter->second];
    +          updateRetLoc(ret, loc);
    +          return ret;
    +        }
    +        // Create new return block
    +        mlir::Block *ret = createRetBlock(cgf, loc);
    +        retBlockInCaseIndex[caseOp] = retBlocks.size() - 1;
    +        return ret;
    +      }
    +
    +      if (normalRetBlockIndex) {
    +        mlir::Block *ret = retBlocks[*normalRetBlockIndex];
    +        updateRetLoc(ret, loc);
    +        return ret;
           }
    -      updateRetLoc(returnBlock, loc);
    -      return returnBlock;
    +
    +      mlir::Block *ret = createRetBlock(cgf, loc);
    +      normalRetBlockIndex = retBlocks.size() - 1;
    +      return ret;
         }
     
         mlir::Block *getEntryBlock() { return entryBlock; }
    @@ -1246,6 +1280,9 @@ class CIRGenFunction : public CIRGenTypeCache {
     
       RValue emitAtomicExpr(AtomicExpr *e);
       void emitAtomicInit(Expr *init, LValue dest);
    +  void emitAtomicStore(RValue rvalue, LValue dest, bool isInit);
    +  void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order,
    +                       bool isVolatile, bool isInit);
     
       AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d,
                                         mlir::OpBuilder::InsertPoint ip = {});
    @@ -1279,6 +1316,28 @@ class CIRGenFunction : public CIRGenTypeCache {
       RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID,
                              const clang::CallExpr *e, ReturnValueSlot returnValue);
     
    +  /// Returns a Value corresponding to the size of the given expression by
    +  /// emitting a `cir.objsize` operation.
    +  ///
    +  /// \param e The expression whose object size to compute
    +  /// \param type Determines the semantics of the object size computation.
    +  ///   The type parameter is a 2-bit value where:
    +  ///     bit 0 (type & 1): 0 = whole object, 1 = closest subobject
    +  ///     bit 1 (type & 2): 0 = maximum size, 2 = minimum size
    +  /// \param resType The result type for the size value
    +  /// \param emittedE Optional pre-emitted pointer value. If non-null, we'll
    +  ///   call `cir.objsize` on this value rather than emitting e.
    +  /// \param isDynamic If true, allows runtime evaluation via dynamic mode
    +  mlir::Value emitBuiltinObjectSize(const clang::Expr *e, unsigned type,
    +                                    cir::IntType resType, mlir::Value emittedE,
    +                                    bool isDynamic);
    +
    +  mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
    +                                              unsigned type,
    +                                              cir::IntType resType,
    +                                              mlir::Value emittedE,
    +                                              bool isDynamic);
    +
       RValue emitCall(const CIRGenFunctionInfo &funcInfo,
                       const CIRGenCallee &callee, ReturnValueSlot returnValue,
                       const CallArgList &args, cir::CIRCallOpInterface *callOp,
    @@ -1476,7 +1535,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                                   llvm::ArrayRef args = {});
     
       /// Emit the computation of the specified expression of scalar type.
    -  mlir::Value emitScalarExpr(const clang::Expr *e);
    +  mlir::Value emitScalarExpr(const clang::Expr *e,
    +                             bool ignoreResultAssign = false);
     
       mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv,
                                           cir::UnaryOpKind kind, bool isPre);
    @@ -1654,8 +1714,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                               bool isInit);
     
       void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
    -                         clang::QualType ty, bool isInit = false,
    -                         bool isNontemporal = false);
    +                         clang::QualType ty, LValueBaseInfo baseInfo,
    +                         bool isInit = false, bool isNontemporal = false);
       void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit);
     
       /// Store the specified rvalue into the specified
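    Illustrative C for the per-case return blocks: each return below sits in
    its own cir.case region, so getOrCreateRetBlock keys the block on the
    enclosing CaseOp instead of sharing one scope-wide return block:

        int pick(int c) {
          switch (c) {
          case 0:
            return 10; // return block owned by this case region
          case 1:
            return 20; // separate return block
          default:
            return -1;
          }
        }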
    diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    index 50101373f3e9c..527dfd21db8a5 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
    @@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final
             .CaseLower("default", mlir::acc::DeviceType::Default)
             .CaseLower("host", mlir::acc::DeviceType::Host)
             .CaseLower("multicore", mlir::acc::DeviceType::Multicore)
    -        .CasesLower("nvidia", "acc_device_nvidia",
    +        .CasesLower({"nvidia", "acc_device_nvidia"},
                         mlir::acc::DeviceType::Nvidia)
             .CaseLower("radeon", mlir::acc::DeviceType::Radeon);
       }
    diff --git a/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp
    new file mode 100644
    index 0000000000000..20b0646fdab44
    --- /dev/null
    +++ b/clang/lib/CIR/CodeGen/CIRGenPointerAuth.cpp
    @@ -0,0 +1,23 @@
    +//===--- CIRGenPointerAuth.cpp - CIR generation for ptr auth --------------===//
    +//
    +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    +// See https://llvm.org/LICENSE.txt for license information.
    +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    +//
    +//===----------------------------------------------------------------------===//
    +//
    +// This file contains common routines relating to the emission of
    +// pointer authentication operations.
    +//
    +//===----------------------------------------------------------------------===//
    +
    +#include "CIRGenFunction.h"
    +
    +using namespace clang;
    +using namespace clang::CIRGen;
    +
    +Address CIRGenFunction::getAsNaturalAddressOf(Address addr,
    +                                              QualType pointeeTy) {
    +  assert(!cir::MissingFeatures::pointerAuthentication());
    +  return addr;
    +}
    diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    index 1eb7199ce6dfe..7bb8c2153056a 100644
    --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
    @@ -66,7 +66,7 @@ static mlir::LogicalResult emitStmtWithResult(CIRGenFunction &cgf,
     mlir::LogicalResult CIRGenFunction::emitCompoundStmtWithoutScope(
         const CompoundStmt &s, Address *lastValue, AggValueSlot slot) {
       mlir::LogicalResult result = mlir::success();
    -  const Stmt *exprResult = s.getStmtExprResult();
    +  const Stmt *exprResult = s.body_back();
       assert((!lastValue || (lastValue && exprResult)) &&
              "If lastValue is not null then the CompoundStmt must have a "
              "StmtExprResult");
    diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
    index 7c31beacc5fb3..d3e2290ceea0b 100644
    --- a/clang/lib/CIR/CodeGen/CMakeLists.txt
    +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
    @@ -35,6 +35,7 @@ add_clang_library(clangCIR
       CIRGenOpenACC.cpp
       CIRGenOpenACCClause.cpp
       CIRGenOpenACCRecipe.cpp
    +  CIRGenPointerAuth.cpp
       CIRGenRecordLayoutBuilder.cpp
       CIRGenStmt.cpp
       CIRGenStmtOpenACC.cpp
    diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h
    index 4198c23c9cbed..9005b0106b2a4 100644
    --- a/clang/lib/CIR/CodeGen/EHScopeStack.h
    +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h
    @@ -155,6 +155,9 @@ class EHScopeStack {
       /// The innermost normal cleanup on the stack.
       stable_iterator innermostNormalCleanup = stable_end();
     
    +  /// The innermost EH scope on the stack.
    +  stable_iterator innermostEHScope = stable_end();
    +
       /// The CGF this Stack belong to
       CIRGenFunction *cgf = nullptr;
     
    @@ -226,6 +229,8 @@ class EHScopeStack {
       }
       stable_iterator getInnermostActiveNormalCleanup() const;
     
    +  stable_iterator getInnermostEHScope() const { return innermostEHScope; }
    +
       /// An unstable reference to a scope-stack depth.  Invalidated by
       /// pushes but not pops.
       class iterator;
    @@ -233,6 +238,9 @@ class EHScopeStack {
       /// Returns an iterator pointing to the innermost EH scope.
       iterator begin() const;
     
    +  /// Returns an iterator pointing to the outermost EH scope.
    +  iterator end() const;
    +
       /// Create a stable reference to the top of the EH stack.  The
       /// returned reference is valid until that scope is popped off the
       /// stack.
    diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    index 5897352829891..f7907c76c8ccb 100644
    --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
    @@ -341,7 +341,7 @@ RecordType::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
       if (isUnion())
         return dataLayout.getTypeSize(getLargestMember(dataLayout));
     
    -  unsigned recordSize = computeStructSize(dataLayout);
    +  auto recordSize = static_cast<uint64_t>(computeStructSize(dataLayout));
       return llvm::TypeSize::getFixed(recordSize * 8);
     }
     
    diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    index fbecab9774f5b..2ef09b74dc968 100644
    --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
    @@ -26,6 +26,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRCANONICALIZE
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
     /// Removes branches between two blocks if it is the only branch.
@@ -101,7 +106,8 @@ struct RemoveEmptySwitch : public OpRewritePattern<SwitchOp> {
     // CIRCanonicalizePass
     //===----------------------------------------------------------------------===//
     
-struct CIRCanonicalizePass : public CIRCanonicalizeBase<CIRCanonicalizePass> {
+struct CIRCanonicalizePass
+    : public impl::CIRCanonicalizeBase<CIRCanonicalizePass> {
       using CIRCanonicalizeBase::CIRCanonicalizeBase;
     
       // The same operation rewriting done here could have been performed
    diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    index 3c6f76892d5cb..dcef9ddee1bb4 100644
    --- a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
    @@ -21,6 +21,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRSIMPLIFY
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     //===----------------------------------------------------------------------===//
     // Rewrite patterns
     //===----------------------------------------------------------------------===//
@@ -283,7 +288,7 @@ struct SimplifyVecSplat : public OpRewritePattern<VecSplatOp> {
     // CIRSimplifyPass
     //===----------------------------------------------------------------------===//
     
-struct CIRSimplifyPass : public CIRSimplifyBase<CIRSimplifyPass> {
+struct CIRSimplifyPass : public impl::CIRSimplifyBase<CIRSimplifyPass> {
       using CIRSimplifyBase::CIRSimplifyBase;
     
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    index ca7554e4e3754..69a5334ca2423 100644
    --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
    @@ -26,6 +26,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_CIRFLATTENCFG
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
     /// Lowers operations with the terminator trait that have a single successor.
    @@ -50,7 +55,7 @@ void walkRegionSkipping(
       });
     }
     
-struct CIRFlattenCFGPass : public CIRFlattenCFGBase<CIRFlattenCFGPass> {
+struct CIRFlattenCFGPass : public impl::CIRFlattenCFGBase<CIRFlattenCFGPass> {
     
       CIRFlattenCFGPass() = default;
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    index c0db98440a902..00972b6976295 100644
    --- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
    @@ -14,9 +14,14 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_GOTOSOLVER
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
-struct GotoSolverPass : public GotoSolverBase<GotoSolverPass> {
+struct GotoSolverPass : public impl::GotoSolverBase<GotoSolverPass> {
       GotoSolverPass() = default;
       void runOnOperation() override;
     };
    diff --git a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    index 72bbf08c79b16..74b22faadc8ae 100644
    --- a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
    @@ -20,9 +20,14 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_HOISTALLOCAS
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     namespace {
     
-struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
+struct HoistAllocasPass : public impl::HoistAllocasBase<HoistAllocasPass> {
     
       HoistAllocasPass() = default;
       void runOnOperation() override;
    diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    index cba04649ca05e..29b1211d2c351 100644
    --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
    @@ -23,6 +23,11 @@
     using namespace mlir;
     using namespace cir;
     
    +namespace mlir {
    +#define GEN_PASS_DEF_LOWERINGPREPARE
    +#include "clang/CIR/Dialect/Passes.h.inc"
    +} // namespace mlir
    +
     static SmallString<128> getTransformedFileName(mlir::ModuleOp mlirModule) {
       SmallString<128> fileName;
     
    @@ -53,7 +58,8 @@ static cir::FuncOp getCalledFunction(cir::CallOp callOp) {
     }
     
     namespace {
-struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
+struct LoweringPreparePass
+    : public impl::LoweringPrepareBase<LoweringPreparePass> {
       LoweringPreparePass() = default;
       void runOnOperation() override;
     
    diff --git a/clang/lib/CIR/Dialect/Transforms/PassDetail.h b/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    index 600dde56d679f..ef42a85cc2751 100644
    --- a/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    +++ b/clang/lib/CIR/Dialect/Transforms/PassDetail.h
    @@ -21,7 +21,7 @@ namespace mlir {
 template <typename ConcreteDialect>
 void registerDialect(DialectRegistry &registry);
     
    -#define GEN_PASS_CLASSES
    +#define GEN_PASS_DECL
     #include "clang/CIR/Dialect/Passes.h.inc"
     
     } // namespace mlir
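
Note: the GEN_PASS_CLASSES -> GEN_PASS_DECL / GEN_PASS_DEF_* switch in these
passes follows the upstream MLIR tablegen convention: the shared header keeps
only declarations, and each pass .cpp instantiates its generated CRTP base in
the `impl` namespace. A minimal sketch of the pattern, with pass and include
names that are illustrative rather than taken from this patch:

    // MyPass.cpp (illustrative)
    namespace mlir {
    #define GEN_PASS_DEF_MYPASS
    #include "MyDialect/Passes.h.inc" // generated from Passes.td
    } // namespace mlir

    namespace {
    // impl::MyPassBase carries the pass name, argument, description, and
    // options declared in Passes.td.
    struct MyPass : public mlir::impl::MyPassBase<MyPass> {
      using MyPassBase::MyPassBase;
      void runOnOperation() override { /* rewrite logic here */ }
    };
    } // namespace
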
    diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    index 5a6193fa8d840..b4afed7019417 100644
    --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
    @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite(
       return mlir::success();
     }
     
    +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite(
    +    cir::ExpOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::ExpOp>(op, resTy, adaptor.getSrc());
    +  return mlir::success();
    +}
    +
     static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter,
                                       mlir::Value llvmSrc, mlir::Type llvmDstIntTy,
                                       bool isUnsigned, uint64_t cirSrcWidth,
    @@ -1336,6 +1344,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
       return mlir::success();
     }
     
    +mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite(
    +    cir::CeilOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::FCeilOp>(op, resTy, adaptor.getSrc());
    +  return mlir::success();
    +}
    +
     mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
         cir::AllocaOp op, OpAdaptor adaptor,
         mlir::ConversionPatternRewriter &rewriter) const {
    @@ -2816,6 +2832,29 @@ static void collectUnreachable(mlir::Operation *parent,
       }
     }
     
    +mlir::LogicalResult CIRToLLVMObjSizeOpLowering::matchAndRewrite(
    +    cir::ObjSizeOp op, OpAdaptor adaptor,
    +    mlir::ConversionPatternRewriter &rewriter) const {
    +  mlir::Type llvmResTy = getTypeConverter()->convertType(op.getType());
    +  mlir::Location loc = op->getLoc();
    +
    +  mlir::IntegerType i1Ty = rewriter.getI1Type();
    +
    +  auto i1Val = [&rewriter, &loc, &i1Ty](bool val) {
    +    return mlir::LLVM::ConstantOp::create(rewriter, loc, i1Ty, val);
    +  };
    +
    +  replaceOpWithCallLLVMIntrinsicOp(rewriter, op, "llvm.objectsize", llvmResTy,
    +                                   {
    +                                       adaptor.getPtr(),
    +                                       i1Val(op.getMin()),
    +                                       i1Val(op.getNullunknown()),
    +                                       i1Val(op.getDynamic()),
    +                                   });
    +
    +  return mlir::LogicalResult::success();
    +}
    +
     void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) {
       // Lower the module attributes to LLVM equivalents.
       if (mlir::Attribute tripleAttr =
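
Note: `llvm.objectsize` takes the pointer plus three `i1` flags, which is what
the `min`, `nullunknown`, and `dynamic` attributes of the CIR op are forwarded
into above. The flags correspond to the modes of Clang's
`__builtin_object_size` / `__builtin_dynamic_object_size`; a hedged C++
illustration of the source-level semantics:

    #include <cstddef>

    // Type 0 = maximum estimate (unknown size => (size_t)-1); type 2 = the
    // minimum estimate (unknown size => 0), i.e. the `min` flag on the
    // intrinsic.
    size_t max_size(char (&buf)[16]) {
      return __builtin_object_size(buf, 0); // 16, folded at compile time
    }

    size_t min_size(char *p) {
      return __builtin_object_size(p, 2); // 0 when nothing is known
    }
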
    diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
    index 4f2218b583e41..e90b009da606a 100644
    --- a/clang/lib/CMakeLists.txt
    +++ b/clang/lib/CMakeLists.txt
    @@ -13,6 +13,7 @@ add_subdirectory(Edit)
     add_subdirectory(ExtractAPI)
     add_subdirectory(Rewrite)
     add_subdirectory(Driver)
    +add_subdirectory(Options)
     add_subdirectory(Serialization)
     add_subdirectory(Frontend)
     add_subdirectory(FrontendTool)
    diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
    index 3c313149ca1fc..b967a26dd19d7 100644
    --- a/clang/lib/CodeGen/BackendUtil.cpp
    +++ b/clang/lib/CodeGen/BackendUtil.cpp
    @@ -313,7 +313,7 @@ getCodeModel(const CodeGenOptions &CodeGenOpts) {
                                .Case("kernel", llvm::CodeModel::Kernel)
                                .Case("medium", llvm::CodeModel::Medium)
                                .Case("large", llvm::CodeModel::Large)
    -                           .Cases("default", "", ~1u)
    +                           .Cases({"default", ""}, ~1u)
                                .Default(~0u);
       assert(CodeModel != ~0u && "invalid code model!");
       if (CodeModel == ~1u)
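
Note: this moves to the `StringSwitch::Cases` overload taking a braced list of
keys rather than variadic string arguments. A minimal sketch (names and values
illustrative):

    #include "llvm/ADT/StringSwitch.h"

    unsigned classify(llvm::StringRef Name) {
      return llvm::StringSwitch<unsigned>(Name)
          .Case("tiny", 0)
          .Cases({"default", ""}, 1) // several keys sharing one value
          .Default(2);
    }
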
    diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
    index b81e0d02da2c9..0a2ea416e5e4d 100644
    --- a/clang/lib/CodeGen/CGBuiltin.cpp
    +++ b/clang/lib/CodeGen/CGBuiltin.cpp
    @@ -1211,14 +1211,10 @@ llvm::Value *CodeGenFunction::emitCountedByPointerSize(
             getContext().getTypeSizeInChars(ElementTy->getPointeeType());
     
         if (ElementSize.isZero()) {
    -      // This might be a __sized_by on a 'void *', which counts bytes, not
    -      // elements.
-      auto *CAT = ElementTy->getAs<CountAttributedType>();
    -      if (!CAT || (CAT->getKind() != CountAttributedType::SizedBy &&
    -                   CAT->getKind() != CountAttributedType::SizedByOrNull))
    -        // Okay, not sure what it is now.
    -        // FIXME: Should this be an assert?
-        return std::optional<CharUnits>();
    +      // This might be a __sized_by (or __counted_by) on a
    +      // 'void *', which counts bytes, not elements.
+      [[maybe_unused]] auto *CAT = ElementTy->getAs<CountAttributedType>();
+      assert(CAT && "must have a CountAttributedType");
     
           ElementSize = CharUnits::One();
         }
    @@ -3992,6 +3988,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       case Builtin::BI__builtin_elementwise_exp10:
         return RValue::get(emitBuiltinWithOneOverloadedType<1>(
             *this, E, Intrinsic::exp10, "elt.exp10"));
    +  case Builtin::BI__builtin_elementwise_ldexp: {
    +    Value *Src = EmitScalarExpr(E->getArg(0));
    +    Value *Exp = EmitScalarExpr(E->getArg(1));
    +    Value *Result = Builder.CreateLdexp(Src, Exp, {}, "elt.ldexp");
    +    return RValue::get(Result);
    +  }
       case Builtin::BI__builtin_elementwise_log:
         return RValue::get(emitBuiltinWithOneOverloadedType<1>(
             *this, E, Intrinsic::log, "elt.log"));
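
Note: `Builder.CreateLdexp` emits the `llvm.ldexp` intrinsic, so every lane of
the result is the corresponding lane of the first operand scaled by two raised
to the matching integer exponent lane. A hedged usage sketch of the new
builtin, with illustrative vector typedefs:

    typedef float float4 __attribute__((ext_vector_type(4)));
    typedef int int4 __attribute__((ext_vector_type(4)));

    float4 scale(float4 x, int4 e) {
      return __builtin_elementwise_ldexp(x, e); // lane-wise x[i] * 2^e[i]
    }
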
    diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
    index 465f3f4e670c2..efacb3cc04c01 100644
    --- a/clang/lib/CodeGen/CGCall.cpp
    +++ b/clang/lib/CodeGen/CGCall.cpp
    @@ -38,6 +38,7 @@
     #include "llvm/IR/Attributes.h"
     #include "llvm/IR/CallingConv.h"
     #include "llvm/IR/DataLayout.h"
    +#include "llvm/IR/DebugInfoMetadata.h"
     #include "llvm/IR/InlineAsm.h"
     #include "llvm/IR/IntrinsicInst.h"
     #include "llvm/IR/Intrinsics.h"
    @@ -1990,6 +1991,7 @@ static void getTrivialDefaultFunctionAttributes(
           // This is the default behavior.
           break;
         case CodeGenOptions::FramePointerKind::Reserved:
    +    case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
         case CodeGenOptions::FramePointerKind::NonLeaf:
         case CodeGenOptions::FramePointerKind::All:
           FuncAttrs.addAttribute("frame-pointer",
    @@ -6277,6 +6279,24 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         pushDestroy(QualType::DK_nontrivial_c_struct, Ret.getAggregateAddress(),
                     RetTy);
     
+  // Generate function declaration DISubprogram in order to be used
    +  // in debug info about call sites.
    +  if (CGDebugInfo *DI = getDebugInfo()) {
    +    // Ensure call site info would actually be emitted before collecting
    +    // further callee info.
+    if (CalleeDecl && !CalleeDecl->hasAttr<NoDebugAttr>() &&
    +        DI->getCallSiteRelatedAttrs() != llvm::DINode::FlagZero) {
    +      CodeGenFunction CalleeCGF(CGM);
    +      const GlobalDecl &CalleeGlobalDecl =
    +          Callee.getAbstractInfo().getCalleeDecl();
    +      CalleeCGF.CurGD = CalleeGlobalDecl;
    +      FunctionArgList Args;
    +      QualType ResTy = CalleeCGF.BuildFunctionArgList(CalleeGlobalDecl, Args);
    +      DI->EmitFuncDeclForCallSite(
    +          CI, DI->getFunctionType(CalleeDecl, ResTy, Args), CalleeGlobalDecl);
    +    }
    +  }
    +
       return Ret;
     }
     
    diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
    index ca579c915f49d..bda7b7487f59b 100644
    --- a/clang/lib/CodeGen/CGDebugInfo.cpp
    +++ b/clang/lib/CodeGen/CGDebugInfo.cpp
    @@ -4957,7 +4957,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc,
     
     void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                               QualType CalleeType,
    -                                          const FunctionDecl *CalleeDecl) {
    +                                          GlobalDecl CalleeGlobalDecl) {
       if (!CallOrInvoke)
         return;
   auto *Func = dyn_cast<llvm::Function>(CallOrInvoke->getCalledOperand());
    @@ -4966,6 +4966,9 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
       if (Func->getSubprogram())
         return;
     
    +  const FunctionDecl *CalleeDecl =
+      cast<FunctionDecl>(CalleeGlobalDecl.getDecl());
    +
       // Do not emit a declaration subprogram for a function with nodebug
       // attribute, or if call site info isn't required.
   if (CalleeDecl->hasAttr<NoDebugAttr>() ||
    @@ -4976,7 +4979,8 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
       // create the one describing the function in order to have complete
       // call site debug info.
       if (!CalleeDecl->isStatic() && !CalleeDecl->isInlined())
    -    EmitFunctionDecl(CalleeDecl, CalleeDecl->getLocation(), CalleeType, Func);
    +    EmitFunctionDecl(CalleeGlobalDecl, CalleeDecl->getLocation(), CalleeType,
    +                     Func);
     }
     
     void CGDebugInfo::EmitInlineFunctionStart(CGBuilderTy &Builder, GlobalDecl GD) {
    diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
    index 78c3eb9c5792e..2378bdd780b3b 100644
    --- a/clang/lib/CodeGen/CGDebugInfo.h
    +++ b/clang/lib/CodeGen/CGDebugInfo.h
    @@ -511,7 +511,7 @@ class CGDebugInfo {
       /// This is needed for call site debug info.
       void EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                    QualType CalleeType,
    -                               const FunctionDecl *CalleeDecl);
    +                               GlobalDecl CalleeGlobalDecl);
     
       /// Constructs the debug code for exiting a function.
       void EmitFunctionEnd(CGBuilderTy &Builder, llvm::Function *Fn);
    @@ -678,6 +678,10 @@ class CGDebugInfo {
       /// Emit symbol for debugger that holds the pointer to the vtable.
       void emitVTableSymbol(llvm::GlobalVariable *VTable, const CXXRecordDecl *RD);
     
    +  /// Return flags which enable debug info emission for call sites, provided
    +  /// that it is supported and enabled.
    +  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
    +
     private:
       /// Amend \p I's DebugLoc with \p Group (its source atom group) and \p
       /// Rank (lower nonzero rank is higher precedence). Does nothing if \p I
    @@ -827,11 +831,6 @@ class CGDebugInfo {
                              unsigned LineNo, StringRef LinkageName,
                              llvm::GlobalVariable *Var, llvm::DIScope *DContext);
     
    -
    -  /// Return flags which enable debug info emission for call sites, provided
    -  /// that it is supported and enabled.
    -  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
    -
       /// Get the printing policy for producing names for debug info.
       PrintingPolicy getPrintingPolicy() const;
     
    diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
    index 01f2161f27555..a837f00732748 100644
    --- a/clang/lib/CodeGen/CGExpr.cpp
    +++ b/clang/lib/CodeGen/CGExpr.cpp
    @@ -6632,15 +6632,6 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
                              E == MustTailCall, E->getExprLoc());
     
   if (auto *CalleeDecl = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
    -    // Generate function declaration DISuprogram in order to be used
    -    // in debug info about call sites.
    -    if (CGDebugInfo *DI = getDebugInfo()) {
    -      FunctionArgList Args;
    -      QualType ResTy = BuildFunctionArgList(CalleeDecl, Args);
    -      DI->EmitFuncDeclForCallSite(LocalCallOrInvoke,
    -                                  DI->getFunctionType(CalleeDecl, ResTy, Args),
    -                                  CalleeDecl);
    -    }
     if (CalleeDecl->hasAttr<RestrictAttr>() ||
         CalleeDecl->hasAttr<AllocSizeAttr>()) {
           // Function has 'malloc' (aka. 'restrict') or 'alloc_size' attribute.
    diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
    index eee397f1f3d19..4e61a6f61948f 100644
    --- a/clang/lib/CodeGen/CGExprAgg.cpp
    +++ b/clang/lib/CodeGen/CGExprAgg.cpp
    @@ -755,10 +755,9 @@ void AggExprEmitter::VisitOpaqueValueExpr(OpaqueValueExpr *e) {
     
     void
     AggExprEmitter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
    -  if (Dest.isPotentiallyAliased() &&
    -      E->getType().isPODType(CGF.getContext())) {
    -    // For a POD type, just emit a load of the lvalue + a copy, because our
    -    // compound literal might alias the destination.
    +  if (Dest.isPotentiallyAliased()) {
    +    // Just emit a load of the lvalue + a copy, because our compound literal
    +    // might alias the destination.
         EmitAggLoadOfLValue(E);
         return;
       }
    diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    index fbf4a5722caed..b6928ce7d9c44 100644
    --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
    @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) {
       return LastInst;
     }
     
    +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF,
    +                                        const CallExpr *E) {
    +  Value *Op0 = CGF.EmitScalarExpr(E->getArg(0));
    +  QualType Op0Ty = E->getArg(0)->getType();
    +  llvm::Type *ResType = CGF.FloatTy;
    +  uint64_t NumElements = 0;
    +  if (Op0->getType()->isVectorTy()) {
    +    NumElements =
+        E->getArg(0)->getType()->castAs<VectorType>()->getNumElements();
    +    ResType =
    +        llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements));
    +  }
    +  if (!Op0Ty->hasUnsignedIntegerRepresentation())
    +    llvm_unreachable(
    +        "f16tof32 operand must have an unsigned int representation");
    +
    +  if (CGF.CGM.getTriple().isDXIL())
    +    return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32,
+                                       ArrayRef<Value *>{Op0}, nullptr,
    +                                       "hlsl.f16tof32");
    +
    +  if (CGF.CGM.getTriple().isSPIRV()) {
    +    // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the
    +    // Int16 and Float16 capabilities
    +    auto UnpackType =
    +        llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2));
    +    if (NumElements == 0) {
    +      // a scalar input - simply extract the first element of the unpacked
    +      // vector
    +      Value *Unpack = CGF.Builder.CreateIntrinsic(
+          UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef<Value *>{Op0});
    +      return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
    +    } else {
    +      // a vector input - build a congruent output vector by iterating through
    +      // the input vector calling unpackhalf2x16 for each element
    +      Value *Result = PoisonValue::get(ResType);
    +      for (uint64_t i = 0; i < NumElements; i++) {
    +        Value *InVal = CGF.Builder.CreateExtractElement(Op0, i);
    +        Value *Unpack = CGF.Builder.CreateIntrinsic(
    +            UnpackType, Intrinsic::spv_unpackhalf2x16,
+            ArrayRef<Value *>{InVal});
    +        Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
    +        Result = CGF.Builder.CreateInsertElement(Result, Res, i);
    +      }
    +      return Result;
    +    }
    +  }
    +
    +  llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture");
    +}
    +
     static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr,
                                    LValue &Stride) {
       // Figure out the stride of the buffer elements from the handle type.
    @@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
             /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(),
         ArrayRef<Value *>{X}, nullptr, "hlsl.degrees");
       }
    +  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
    +    return handleElementwiseF16ToF32(*this, E);
    +  }
       case Builtin::BI__builtin_hlsl_elementwise_frac: {
         Value *Op0 = EmitScalarExpr(E->getArg(0));
         if (!E->getArg(0)->getType()->hasFloatingRepresentation())
    diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
    index 945f9e2451bc1..4bdba9b3da502 100644
    --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
    +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
    @@ -261,12 +261,12 @@ static std::optional initializeLocalResourceArray(
     
     llvm::Type *
     CGHLSLRuntime::convertHLSLSpecificType(const Type *T,
-                                       SmallVector<int32_t> *Packoffsets) {
    +                                       const CGHLSLOffsetInfo &OffsetInfo) {
       assert(T->isHLSLSpecificType() && "Not an HLSL specific type!");
     
       // Check if the target has a specific translation for this type first.
       if (llvm::Type *TargetTy =
    -          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, Packoffsets))
    +          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, OffsetInfo))
         return TargetTy;
     
       llvm_unreachable("Generic handling of HLSL types is not supported.");
    @@ -357,25 +357,14 @@ createBufferHandleType(const HLSLBufferDecl *BufDecl) {
   return cast<HLSLAttributedResourceType>(QT.getTypePtr());
     }
     
    -// Iterates over all declarations in the HLSL buffer and based on the
    -// packoffset or register(c#) annotations it fills outs the Layout
    -// vector with the user-specified layout offsets.
    -// The buffer offsets can be specified 2 ways:
    -// 1. declarations in cbuffer {} block can have a packoffset annotation
    -//    (translates to HLSLPackOffsetAttr)
    -// 2. default constant buffer declarations at global scope can have
    -//    register(c#) annotations (translates to HLSLResourceBindingAttr with
    -//    RegisterType::C)
    -// It is not guaranteed that all declarations in a buffer have an annotation.
    -// For those where it is not specified a -1 value is added to the Layout
    -// vector. In the final layout these declarations will be placed at the end
    -// of the HLSL buffer after all of the elements with specified offset.
    -static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
-                                 SmallVector<int32_t> &Layout) {
    -  assert(Layout.empty() && "expected empty vector for layout");
    -  assert(BufDecl->hasValidPackoffset());
    +CGHLSLOffsetInfo CGHLSLOffsetInfo::fromDecl(const HLSLBufferDecl &BufDecl) {
    +  CGHLSLOffsetInfo Result;
     
    -  for (Decl *D : BufDecl->buffer_decls()) {
    +  // If we don't have packoffset info, just return an empty result.
    +  if (!BufDecl.hasValidPackoffset())
    +    return Result;
    +
    +  for (Decl *D : BufDecl.buffer_decls()) {
         if (isa(D) || isa(D)) {
           continue;
         }
    @@ -384,11 +373,11 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
           continue;
     
         if (!VD->hasAttrs()) {
    -      Layout.push_back(-1);
    +      Result.Offsets.push_back(Unspecified);
           continue;
         }
     
    -    int32_t Offset = -1;
    +    uint32_t Offset = Unspecified;
         for (auto *Attr : VD->getAttrs()) {
       if (auto *POA = dyn_cast<HLSLPackOffsetAttr>(Attr)) {
             Offset = POA->getOffsetInBytes();
    @@ -401,8 +390,9 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
             break;
           }
         }
    -    Layout.push_back(Offset);
    +    Result.Offsets.push_back(Offset);
       }
    +  return Result;
     }
     
     // Codegen for HLSLBufferDecl
    @@ -419,13 +409,9 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) {
         return;
     
       // create global variable for the constant buffer
-  SmallVector<int32_t> Layout;
    -  if (BufDecl->hasValidPackoffset())
    -    fillPackoffsetLayout(BufDecl, Layout);
    -
    -  llvm::TargetExtType *TargetTy =
-      cast<llvm::TargetExtType>(convertHLSLSpecificType(
    -          ResHandleTy, BufDecl->hasValidPackoffset() ? &Layout : nullptr));
    +  CGHLSLOffsetInfo OffsetInfo = CGHLSLOffsetInfo::fromDecl(*BufDecl);
+  llvm::TargetExtType *TargetTy = cast<llvm::TargetExtType>(
    +      convertHLSLSpecificType(ResHandleTy, OffsetInfo));
       llvm::GlobalVariable *BufGV = new GlobalVariable(
           TargetTy, /*isConstant*/ false,
           GlobalValue::LinkageTypes::ExternalLinkage, PoisonValue::get(TargetTy),
    @@ -549,6 +535,16 @@ static void addSPIRVBuiltinDecoration(llvm::GlobalVariable *GV,
       GV->addMetadata("spirv.Decorations", *Decoration);
     }
     
    +static void addLocationDecoration(llvm::GlobalVariable *GV, unsigned Location) {
    +  LLVMContext &Ctx = GV->getContext();
    +  IRBuilder<> B(GV->getContext());
    +  MDNode *Operands =
    +      MDNode::get(Ctx, {ConstantAsMetadata::get(B.getInt32(/* Location */ 30)),
    +                        ConstantAsMetadata::get(B.getInt32(Location))});
    +  MDNode *Decoration = MDNode::get(Ctx, {Operands});
    +  GV->addMetadata("spirv.Decorations", *Decoration);
    +}
    +
     static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
                                                llvm::Type *Ty, const Twine &Name,
                                                unsigned BuiltInID) {
    @@ -562,6 +558,69 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
       return B.CreateLoad(Ty, GV);
     }
     
    +static llvm::Value *createSPIRVLocationLoad(IRBuilder<> &B, llvm::Module &M,
    +                                            llvm::Type *Ty, unsigned Location,
    +                                            StringRef Name) {
    +  auto *GV = new llvm::GlobalVariable(
    +      M, Ty, /* isConstant= */ true, llvm::GlobalValue::ExternalLinkage,
    +      /* Initializer= */ nullptr, /* Name= */ Name, /* insertBefore= */ nullptr,
    +      llvm::GlobalVariable::GeneralDynamicTLSModel,
    +      /* AddressSpace */ 7, /* isExternallyInitialized= */ true);
    +  GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
    +  addLocationDecoration(GV, Location);
    +  return B.CreateLoad(Ty, GV);
    +}
    +
    +llvm::Value *
    +CGHLSLRuntime::emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
    +                                         HLSLSemanticAttr *Semantic,
+                                         std::optional<unsigned> Index) {
    +  Twine BaseName = Twine(Semantic->getAttrName()->getName());
    +  Twine VariableName = BaseName.concat(Twine(Index.value_or(0)));
    +
    +  unsigned Location = SPIRVLastAssignedInputSemanticLocation;
    +
+  // DXC completely ignores the semantic/index pair. Locations are assigned
+  // from the first semantic to the last.
+  llvm::ArrayType *AT = dyn_cast<llvm::ArrayType>(Type);
    +  unsigned ElementCount = AT ? AT->getNumElements() : 1;
    +  SPIRVLastAssignedInputSemanticLocation += ElementCount;
    +  return createSPIRVLocationLoad(B, CGM.getModule(), Type, Location,
    +                                 VariableName.str());
    +}
    +
    +llvm::Value *
    +CGHLSLRuntime::emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
    +                                        HLSLSemanticAttr *Semantic,
+                                        std::optional<unsigned> Index) {
    +  Twine BaseName = Twine(Semantic->getAttrName()->getName());
    +  Twine VariableName = BaseName.concat(Twine(Index.value_or(0)));
    +
    +  // DXIL packing rules etc shall be handled here.
    +  // FIXME: generate proper sigpoint, index, col, row values.
    +  // FIXME: also DXIL loads vectors element by element.
+  SmallVector<Value *> Args{B.getInt32(4), B.getInt32(0), B.getInt32(0),
    +                            B.getInt8(0),
    +                            llvm::PoisonValue::get(B.getInt32Ty())};
    +
    +  llvm::Intrinsic::ID IntrinsicID = llvm::Intrinsic::dx_load_input;
    +  llvm::Value *Value = B.CreateIntrinsic(/*ReturnType=*/Type, IntrinsicID, Args,
    +                                         nullptr, VariableName);
    +  return Value;
    +}
    +
    +llvm::Value *CGHLSLRuntime::emitUserSemanticLoad(
    +    IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl,
+    HLSLSemanticAttr *Semantic, std::optional<unsigned> Index) {
    +  if (CGM.getTarget().getTriple().isSPIRV())
    +    return emitSPIRVUserSemanticLoad(B, Type, Semantic, Index);
    +
    +  if (CGM.getTarget().getTriple().isDXIL())
    +    return emitDXILUserSemanticLoad(B, Type, Semantic, Index);
    +
    +  llvm_unreachable("Unsupported target for user-semantic load.");
    +}
    +
     llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad(
         IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl,
     Attr *Semantic, std::optional<unsigned> Index) {
    @@ -626,6 +685,9 @@ CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD,
   std::optional<unsigned> Index = std::nullopt;
       if (Semantic->isSemanticIndexExplicit())
         Index = Semantic->getSemanticIndex();
    +
    +  if (isa(Semantic))
    +    return emitUserSemanticLoad(B, Type, Decl, Semantic, Index);
       return emitSystemSemanticLoad(B, Type, Decl, Semantic, Index);
     }
     
    diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
    index d35df524fdc84..488a322ca7569 100644
    --- a/clang/lib/CodeGen/CGHLSLRuntime.h
    +++ b/clang/lib/CodeGen/CGHLSLRuntime.h
    @@ -81,6 +81,33 @@ class CodeGenModule;
     class CodeGenFunction;
     class LValue;
     
    +class CGHLSLOffsetInfo {
+  SmallVector<uint32_t> Offsets;
    +
    +public:
    +  static const uint32_t Unspecified = ~0U;
    +
    +  /// Iterates over all declarations in the HLSL buffer and based on the
+  /// packoffset or register(c#) annotations it fills out the Offsets vector
    +  /// with the user-specified layout offsets. The buffer offsets can be
    +  /// specified 2 ways: 1. declarations in cbuffer {} block can have a
    +  /// packoffset annotation (translates to HLSLPackOffsetAttr) 2. default
    +  /// constant buffer declarations at global scope can have register(c#)
    +  /// annotations (translates to HLSLResourceBindingAttr with RegisterType::C)
    +  /// It is not guaranteed that all declarations in a buffer have an annotation.
    +  /// For those where it is not specified a `~0U` value is added to the Offsets
    +  /// vector. In the final layout these declarations will be placed at the end
    +  /// of the HLSL buffer after all of the elements with specified offset.
    +  static CGHLSLOffsetInfo fromDecl(const HLSLBufferDecl &BufDecl);
    +
    +  /// Get the given offset, or `~0U` if there is no offset for the member.
    +  uint32_t operator[](size_t I) const {
    +    if (Offsets.empty())
    +      return Unspecified;
    +    return Offsets[I];
    +  }
    +};
    +
     class CGHLSLRuntime {
     public:
       //===----------------------------------------------------------------------===//
    @@ -167,9 +194,11 @@ class CGHLSLRuntime {
       CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {}
       virtual ~CGHLSLRuntime() {}
     
    -  llvm::Type *
    -  convertHLSLSpecificType(const Type *T,
-                          SmallVector<int32_t> *Packoffsets = nullptr);
    +  llvm::Type *convertHLSLSpecificType(const Type *T,
    +                                      const CGHLSLOffsetInfo &OffsetInfo);
    +  llvm::Type *convertHLSLSpecificType(const Type *T) {
    +    return convertHLSLSpecificType(T, CGHLSLOffsetInfo());
    +  }
     
       void generateGlobalCtorDtorCalls();
     
    @@ -200,9 +229,25 @@ class CGHLSLRuntime {
                                         llvm::GlobalVariable *BufGV);
       void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
                                        llvm::GlobalVariable *GV);
    +  void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
    +                                   llvm::GlobalVariable *GV,
    +                                   HLSLResourceBindingAttr *RBA);
    +
+  llvm::Value *emitSPIRVUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                         HLSLSemanticAttr *Semantic,
+                                         std::optional<unsigned> Index);
+  llvm::Value *emitDXILUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                        HLSLSemanticAttr *Semantic,
+                                        std::optional<unsigned> Index);
+  llvm::Value *emitUserSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type,
+                                    const clang::DeclaratorDecl *Decl,
+                                    HLSLSemanticAttr *Semantic,
+                                    std::optional<unsigned> Index);
    +
       llvm::Triple::ArchType getArch();
     
   llvm::DenseMap<const clang::RecordType *, llvm::TargetExtType *> LayoutTypes;
    +  unsigned SPIRVLastAssignedInputSemanticLocation = 0;
     };
     
     } // namespace CodeGen
    diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    index 121de42248e3b..1224fa681cdc0 100644
    --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
    @@ -2000,22 +2000,29 @@ void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
       // Prepare arguments and build a call to __kmpc_critical
       if (!CGF.HaveInsertPoint())
         return;
    +  llvm::FunctionCallee RuntimeFcn = OMPBuilder.getOrCreateRuntimeFunction(
    +      CGM.getModule(),
    +      Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical);
    +  llvm::Value *LockVar = getCriticalRegionLock(CriticalName);
    +  unsigned LockVarArgIdx = 2;
+  if (cast<llvm::GlobalVariable>(LockVar)->getAddressSpace() !=
    +      RuntimeFcn.getFunctionType()
    +          ->getParamType(LockVarArgIdx)
    +          ->getPointerAddressSpace())
    +    LockVar = CGF.Builder.CreateAddrSpaceCast(
    +        LockVar, RuntimeFcn.getFunctionType()->getParamType(LockVarArgIdx));
       llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
    -                         getCriticalRegionLock(CriticalName)};
    +                         LockVar};
   llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
                                                     std::end(Args));
       if (Hint) {
         EnterArgs.push_back(CGF.Builder.CreateIntCast(
             CGF.EmitScalarExpr(Hint), CGM.Int32Ty, /*isSigned=*/false));
       }
    -  CommonActionTy Action(
    -      OMPBuilder.getOrCreateRuntimeFunction(
    -          CGM.getModule(),
    -          Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical),
    -      EnterArgs,
    -      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
    -                                            OMPRTL___kmpc_end_critical),
    -      Args);
    +  CommonActionTy Action(RuntimeFcn, EnterArgs,
    +                        OMPBuilder.getOrCreateRuntimeFunction(
    +                            CGM.getModule(), OMPRTL___kmpc_end_critical),
    +                        Args);
       CriticalOpGen.setAction(Action);
       emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
     }
    @@ -10006,19 +10013,44 @@ static llvm::Value *emitDeviceID(
       return DeviceID;
     }
     
    -static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
    -                                      CodeGenFunction &CGF) {
    -  llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
    -
-  if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
    -    CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
    -    llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
    -        DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
    -    DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
    -                                             /*isSigned=*/false);
+static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
    +emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
    +  llvm::Value *DynGP = CGF.Builder.getInt32(0);
    +  auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
    +
+  if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
    +    CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
    +    llvm::Value *DynGPVal =
    +        CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
    +    DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
    +                                      /*isSigned=*/false);
    +    auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier();
    +    switch (FallbackModifier) {
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort:
    +      DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
    +      break;
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_null:
    +      DynGPFallback = OMPDynGroupprivateFallbackType::Null;
    +      break;
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem:
    +    case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown:
    +      // This is the default for dyn_groupprivate.
    +      DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem;
    +      break;
    +    default:
    +      llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate");
    +    }
    +  } else if (auto *OMPXDynCGClause =
+                 D.getSingleClause<OMPXDynCGroupMemClause>()) {
    +    CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
    +    llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(),
    +                                                  /*IgnoreResultAssign=*/true);
    +    DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
    +                                      /*isSigned=*/false);
       }
    -  return DynCGroupMem;
    +  return {DynGP, DynGPFallback};
     }
    +
     static void genMapInfoForCaptures(
         MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
         const CapturedStmt &CS, llvm::SmallVectorImpl &CapturedVars,
    @@ -10227,7 +10259,7 @@ static void emitTargetCallKernelLaunch(
         llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
         llvm::Value *NumIterations =
             OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
    -    llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
    +    auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF);
         llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
             CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
     
    @@ -10237,7 +10269,7 @@ static void emitTargetCallKernelLaunch(
     
         llvm::OpenMPIRBuilder::TargetKernelArgs Args(
             NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
    -        DynCGGroupMem, HasNoWait);
    +        DynCGroupMem, HasNoWait, DynCGroupMemFallback);
     
         llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
             cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
    diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
    index fdc1a11f6c55c..36be3295950b8 100644
    --- a/clang/lib/CodeGen/CGStmt.cpp
    +++ b/clang/lib/CodeGen/CGStmt.cpp
    @@ -582,48 +582,45 @@ CodeGenFunction::EmitCompoundStmtWithoutScope(const CompoundStmt &S,
                                                   bool GetLast,
                                                   AggValueSlot AggSlot) {
     
    -  const Stmt *ExprResult = S.getStmtExprResult();
    -  assert((!GetLast || (GetLast && ExprResult)) &&
    -         "If GetLast is true then the CompoundStmt must have a StmtExprResult");
    +  for (CompoundStmt::const_body_iterator I = S.body_begin(),
    +                                         E = S.body_end() - GetLast;
    +       I != E; ++I)
    +    EmitStmt(*I);
     
       Address RetAlloca = Address::invalid();
    -
    -  for (auto *CurStmt : S.body()) {
    -    if (GetLast && ExprResult == CurStmt) {
    -      // We have to special case labels here.  They are statements, but when put
    -      // at the end of a statement expression, they yield the value of their
    -      // subexpression.  Handle this by walking through all labels we encounter,
    -      // emitting them before we evaluate the subexpr.
    -      // Similar issues arise for attributed statements.
-      while (!isa<Expr>(ExprResult)) {
-        if (const auto *LS = dyn_cast<LabelStmt>(ExprResult)) {
-          EmitLabel(LS->getDecl());
-          ExprResult = LS->getSubStmt();
-        } else if (const auto *AS = dyn_cast<AttributedStmt>(ExprResult)) {
    -          // FIXME: Update this if we ever have attributes that affect the
    -          // semantics of an expression.
    -          ExprResult = AS->getSubStmt();
    -        } else {
    -          llvm_unreachable("unknown value statement");
    -        }
    +  if (GetLast) {
    +    // We have to special case labels here.  They are statements, but when put
    +    // at the end of a statement expression, they yield the value of their
    +    // subexpression.  Handle this by walking through all labels we encounter,
    +    // emitting them before we evaluate the subexpr.
    +    // Similar issues arise for attributed statements.
    +    const Stmt *LastStmt = S.body_back();
+    while (!isa<Expr>(LastStmt)) {
+      if (const auto *LS = dyn_cast<LabelStmt>(LastStmt)) {
+        EmitLabel(LS->getDecl());
+        LastStmt = LS->getSubStmt();
+      } else if (const auto *AS = dyn_cast<AttributedStmt>(LastStmt)) {
    +        // FIXME: Update this if we ever have attributes that affect the
    +        // semantics of an expression.
    +        LastStmt = AS->getSubStmt();
    +      } else {
    +        llvm_unreachable("unknown value statement");
           }
    +    }
     
    -      EnsureInsertPoint();
    +    EnsureInsertPoint();
     
-      const Expr *E = cast<Expr>(ExprResult);
    -      QualType ExprTy = E->getType();
    -      if (hasAggregateEvaluationKind(ExprTy)) {
    -        EmitAggExpr(E, AggSlot);
    -      } else {
    -        // We can't return an RValue here because there might be cleanups at
    -        // the end of the StmtExpr.  Because of that, we have to emit the result
    -        // here into a temporary alloca.
    -        RetAlloca = CreateMemTemp(ExprTy);
    -        EmitAnyExprToMem(E, RetAlloca, Qualifiers(),
    -                         /*IsInit*/ false);
    -      }
+    const Expr *E = cast<Expr>(LastStmt);
    +    QualType ExprTy = E->getType();
    +    if (hasAggregateEvaluationKind(ExprTy)) {
    +      EmitAggExpr(E, AggSlot);
         } else {
    -      EmitStmt(CurStmt);
    +      // We can't return an RValue here because there might be cleanups at
    +      // the end of the StmtExpr.  Because of that, we have to emit the result
    +      // here into a temporary alloca.
    +      RetAlloca = CreateMemTemp(ExprTy);
    +      EmitAnyExprToMem(E, RetAlloca, Qualifiers(),
    +                       /*IsInit*/ false);
         }
       }
     
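
Note: the label walk kept in both the old and new code exists because, in a
GNU statement expression, a trailing label (or attributed statement) still
wraps the value-yielding expression. A small illustration of the construct
being handled (GNU extension):

    int f(bool b) {
      return ({
        if (b)
          goto done;
      done: // LabelStmt wrapping the expression that yields the value
        42;
      });
    }
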
    diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
    index 0fea57b2e1799..08c66bdbbb9f8 100644
    --- a/clang/lib/CodeGen/CodeGenModule.cpp
    +++ b/clang/lib/CodeGen/CodeGenModule.cpp
    @@ -1512,6 +1512,9 @@ void CodeGenModule::Release() {
       case CodeGenOptions::FramePointerKind::Reserved:
         getModule().setFramePointer(llvm::FramePointerKind::Reserved);
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    getModule().setFramePointer(llvm::FramePointerKind::NonLeafNoReserve);
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         getModule().setFramePointer(llvm::FramePointerKind::NonLeaf);
         break;
    @@ -2368,9 +2371,8 @@ static QualType GeneralizeTransparentUnion(QualType Ty) {
       const RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf();
   if (!UD->hasAttr<TransparentUnionAttr>())
         return Ty;
    -  for (const auto *it : UD->fields()) {
    -    return it->getType();
    -  }
    +  if (!UD->fields().empty())
    +    return UD->fields().begin()->getType();
       return Ty;
     }
     
    @@ -4936,6 +4938,11 @@ void CodeGenModule::setMultiVersionResolverAttributes(llvm::Function *Resolver,
     
       setDSOLocal(Resolver);
     
    +  // The resolver must be exempt from sanitizer instrumentation, as it can run
    +  // before the sanitizer is initialized.
    +  // (https://github.com/llvm/llvm-project/issues/163369)
    +  Resolver->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation);
    +
       // Set the default target-specific attributes, such as PAC and BTI ones on
       // AArch64. Not passing Decl to prevent setting unrelated attributes,
       // as Resolver can be shared by multiple declarations.
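
Note: the rewritten loop in `GeneralizeTransparentUnion` is
behavior-preserving, since it only ever needs the first field. For reference,
the construct it generalizes looks like this (illustrative):

    // A transparent union argument is passed with the ABI of its first
    // member, so signature generalization substitutes that member's type.
    typedef union {
      int *ip;
      const volatile int *cvip;
    } int_ptr_arg __attribute__((transparent_union));

    void take(int_ptr_arg p); // callable with either pointer flavor
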
    diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
    index 8f095649f87ce..06d7380b4e37c 100644
    --- a/clang/lib/CodeGen/CodeGenPGO.cpp
    +++ b/clang/lib/CodeGen/CodeGenPGO.cpp
    @@ -58,9 +58,10 @@ enum PGOHashVersion : unsigned {
       PGO_HASH_V1,
       PGO_HASH_V2,
       PGO_HASH_V3,
    +  PGO_HASH_V4,
     
       // Keep this set to the latest hash version.
    -  PGO_HASH_LATEST = PGO_HASH_V3
    +  PGO_HASH_LATEST = PGO_HASH_V4
     };
     
     namespace {
    @@ -152,7 +153,9 @@ static PGOHashVersion getPGOHashVersion(llvm::IndexedInstrProfReader *PGOReader,
         return PGO_HASH_V1;
       if (PGOReader->getVersion() <= 5)
         return PGO_HASH_V2;
    -  return PGO_HASH_V3;
    +  if (PGOReader->getVersion() <= 12)
    +    return PGO_HASH_V3;
    +  return PGO_HASH_V4;
     }
     
     /// A RecursiveASTVisitor that fills a map of statements to PGO counters.
    @@ -1099,6 +1102,8 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) {
       assert(Walker.NextCounter > 0 && "no entry counter mapped for decl");
       NumRegionCounters = Walker.NextCounter;
       FunctionHash = Walker.Hash.finalize();
    +  if (HashVersion >= PGO_HASH_V4)
    +    FunctionHash &= llvm::NamedInstrProfRecord::FUNC_HASH_MASK;
     }
     
     bool CodeGenPGO::skipRegionMappingForDecl(const Decl *D) {
    diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    index 838903cdcd1ee..4bc6d565fd41f 100644
    --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
    @@ -66,8 +66,9 @@ namespace CodeGen {
     // annotation though. For those that don't, the PackOffsets array will contain
     // -1 value instead. These elements must be placed at the end of the layout
     // after all of the elements with specific offset.
    -llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType(
-    const RecordType *RT, const llvm::SmallVector<int32_t> *PackOffsets) {
    +llvm::TargetExtType *
    +HLSLBufferLayoutBuilder::createLayoutType(const RecordType *RT,
    +                                          const CGHLSLOffsetInfo &OffsetInfo) {
     
       // check if we already have the layout type for this struct
       if (llvm::TargetExtType *Ty =
    @@ -101,14 +102,10 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType(
         const CXXRecordDecl *RD = RecordDecls.pop_back_val();
     
         for (const auto *FD : RD->fields()) {
    -      assert((!PackOffsets || Index < PackOffsets->size()) &&
    -             "number of elements in layout struct does not match number of "
    -             "packoffset annotations");
    -
           // No PackOffset info at all, or have a valid packoffset/register(c#)
           // annotations value -> layout the field.
    -      const int PO = PackOffsets ? (*PackOffsets)[Index++] : -1;
    -      if (!PackOffsets || PO != -1) {
    +      const uint32_t PO = OffsetInfo[Index++];
    +      if (PO != CGHLSLOffsetInfo::Unspecified) {
             if (!layoutField(FD, EndOffset, FieldOffset, FieldType, PO))
               return nullptr;
             Layout.push_back(FieldOffset);
    @@ -175,7 +172,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
                                               unsigned &EndOffset,
                                               unsigned &FieldOffset,
                                               llvm::Type *&FieldType,
    -                                          int Packoffset) {
    +                                          uint32_t Packoffset) {
     
       // Size of element; for arrays this is a size of a single element in the
       // array. Total array size of calculated as (ArrayCount-1) * ArrayStride +
    @@ -201,8 +198,9 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
         // For array of structures, create a new array with a layout type
         // instead of the structure type.
         if (Ty->isStructureOrClassType()) {
    +      CGHLSLOffsetInfo EmptyOffsets;
       llvm::Type *NewTy = cast<llvm::TargetExtType>(
-          createLayoutType(Ty->getAsCanonical<RecordType>()));
+          createLayoutType(Ty->getAsCanonical<RecordType>(), EmptyOffsets));
           if (!NewTy)
             return false;
       assert(isa<llvm::TargetExtType>(NewTy) && "expected target type");
    @@ -216,17 +214,20 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
           ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy);
         }
         ArrayStride = llvm::alignTo(ElemSize, CBufferRowSizeInBytes);
    -    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
    +    ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset
    +                                                               : NextRowOffset;
     
       } else if (FieldTy->isStructureOrClassType()) {
         // Create a layout type for the structure
    +    CGHLSLOffsetInfo EmptyOffsets;
         ElemLayoutTy = createLayoutType(
-        cast<RecordType>(FieldTy->getAsCanonical<RecordType>()));
+        cast<RecordType>(FieldTy->getAsCanonical<RecordType>()), EmptyOffsets);
         if (!ElemLayoutTy)
           return false;
     assert(isa<llvm::TargetExtType>(ElemLayoutTy) && "expected target type");
     ElemSize = cast<llvm::TargetExtType>(ElemLayoutTy)->getIntParameter(0);
    -    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
    +    ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset
    +                                                               : NextRowOffset;
     
       } else {
         // scalar or vector - find element size and alignment
    @@ -246,7 +247,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
         }
     
         // calculate or get element offset for the vector or scalar
    -    if (Packoffset != -1) {
    +    if (Packoffset != CGHLSLOffsetInfo::Unspecified) {
           ElemOffset = Packoffset;
         } else {
           ElemOffset = llvm::alignTo(EndOffset, Align);
    @@ -269,5 +270,13 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
       return true;
     }
     
    +bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD,
    +                                          unsigned &EndOffset,
    +                                          unsigned &FieldOffset,
    +                                          llvm::Type *&FieldType) {
    +  return layoutField(FD, EndOffset, FieldOffset, FieldType,
    +                     CGHLSLOffsetInfo::Unspecified);
    +}
    +
     } // namespace CodeGen
     } // namespace clang
    diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    index 61240b280cfcb..916e60e83e2c0 100644
    --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
    @@ -14,6 +14,7 @@ class RecordType;
     class FieldDecl;
     
     namespace CodeGen {
    +class CGHLSLOffsetInfo;
     class CodeGenModule;
     
     //===----------------------------------------------------------------------===//
    @@ -33,14 +34,15 @@ class HLSLBufferLayoutBuilder {
       // Returns LLVM target extension type with the name LayoutTypeName
       // for given structure type and layout data. The first number in
       // the Layout is the size followed by offsets for each struct element.
    -  llvm::TargetExtType *
    -  createLayoutType(const RecordType *StructType,
-                   const llvm::SmallVector<int32_t> *Packoffsets = nullptr);
    +  llvm::TargetExtType *createLayoutType(const RecordType *StructType,
    +                                        const CGHLSLOffsetInfo &OffsetInfo);
     
     private:
       bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset,
                        unsigned &FieldOffset, llvm::Type *&FieldType,
    -                   int Packoffset = -1);
    +                   uint32_t Packoffset);
    +  bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset,
    +                   unsigned &FieldOffset, llvm::Type *&FieldType);
     };
     
     } // namespace CodeGen
    diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
    index 96f3f6221e20f..8ec8aef311656 100644
    --- a/clang/lib/CodeGen/ModuleBuilder.cpp
    +++ b/clang/lib/CodeGen/ModuleBuilder.cpp
    @@ -23,6 +23,7 @@
     #include "llvm/IR/DataLayout.h"
     #include "llvm/IR/LLVMContext.h"
     #include "llvm/IR/Module.h"
    +#include "llvm/Support/FormatVariadic.h"
     #include "llvm/Support/VirtualFileSystem.h"
 #include <memory>
     
    @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName,
                                    HeaderSearchOpts, PreprocessorOpts, CGO, C,
                                    CoverageInfo);
     }
    +
    +namespace clang {
    +namespace CodeGen {
+std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
    +DemangleTrapReasonInDebugInfo(StringRef FuncName) {
    +  static auto TrapRegex =
    +      llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str());
+  llvm::SmallVector<llvm::StringRef> Matches;
    +  std::string *ErrorPtr = nullptr;
    +#ifndef NDEBUG
    +  std::string Error;
    +  ErrorPtr = &Error;
    +#endif
    +  if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) {
    +    assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern");
    +    return {};
    +  }
    +
    +  if (Matches.size() != 3) {
    +    assert(0 && "Expected 3 matches from Regex::match");
    +    return {};
    +  }
    +
    +  // Returns { Trap Category, Trap Message }
    +  return std::make_pair(Matches[1], Matches[2]);
    +}
    +} // namespace CodeGen
    +} // namespace clang
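
DemangleTrapReasonInDebugInfo splits an artificial debug-info function name of the form <prefix>$<category>$<message> back into its two components. A standalone sketch of the same parse; the concrete prefix string is an assumption here, since only the ClangTrapPrefix identifier appears in the diff:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Regex.h"
#include <optional>
#include <utility>

// Standalone sketch; "__clang_trap_msg" stands in for ClangTrapPrefix.
static std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
demangleTrapReason(llvm::StringRef FuncName) {
  static llvm::Regex TrapRegex(
      llvm::formatv("^{0}\\$(.*)\\$(.*)$", "__clang_trap_msg").str());
  llvm::SmallVector<llvm::StringRef, 3> Matches;
  if (!TrapRegex.match(FuncName, &Matches) || Matches.size() != 3)
    return std::nullopt;
  return std::make_pair(Matches[1], Matches[2]); // {category, message}
}
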
    diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    index 15fa78ddba715..d4b0b81d3d87f 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
    @@ -590,6 +590,7 @@ struct ARMVectorIntrinsicInfo {
           Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
           TypeModifier }
     
    +// clang-format off
     static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
       NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
       NEONMAP0(splat_lane_v),
    @@ -1217,35 +1218,55 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
       NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
       NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
       NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtad_s32_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
       NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtad_u32_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtas_s64_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
       NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtas_u64_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtd_s32_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtd_u32_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP0(vcvth_bf16_f32),
    +  NEONMAP1(vcvtmd_s32_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
       NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtmd_u32_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtms_s64_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
       NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtms_u64_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtnd_s32_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
       NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtnd_u32_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtns_s64_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
       NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtns_u64_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtpd_s32_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
       NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtpd_u32_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtps_s64_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
       NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvtps_u64_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvts_s64_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
       NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
    +  NEONMAP1(vcvts_u64_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
       NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
       NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
       NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
    @@ -1446,6 +1467,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
       NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
       NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
     };
    +// clang-format on
     
     // Some intrinsics are equivalent for codegen.
      static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
    @@ -7624,6 +7646,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
         Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
         return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
       }
    +  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
    +    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
    +                           {llvm::FixedVectorType::get(HalfTy, 8),
    +                            llvm::FixedVectorType::get(Int8Ty, 16)},
    +                           Ops, E, "fmmla");
    +  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
    +    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
    +                           {llvm::FixedVectorType::get(FloatTy, 4),
    +                            llvm::FixedVectorType::get(Int8Ty, 16)},
    +                           Ops, E, "fmmla");
       case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
         ExtractLow = true;
         [[fallthrough]];
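
The new NEONMAP1 rows above fill in the cross-width scalar converts (f64 source with 32-bit integer result and vice versa), and the two vmmlaq_*_mf8_fpm cases lower the FP8 matrix-multiply builtins to aarch64_neon_fmmla. A hedged usage sketch for one of the newly mapped converts; the ACLE signature is inferred from the naming convention, not taken from this patch:

#include <arm_neon.h>

// Requires an AArch64 target. vcvtad_s32_f64 converts a double to a
// 32-bit signed integer, rounding to nearest with ties away from zero
// (the FCVTAS instruction the mapping above selects).
int32_t round_away_to_i32(float64_t X) {
  return vcvtad_s32_f64(X);
}
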
    diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    index 6da65b681df1e..8a1cab3417d98 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
    @@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                                            CGF.EmitScalarExpr(E->getArg(1))});
     }
     
    -static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
    -                           const CallExpr *E, CodeGenFunction &CGF) {
    +static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E,
    +                                    CodeGenFunction &CGF) {
       auto &C = CGF.CGM.getContext();
    -  if (!(C.getLangOpts().NativeHalfType ||
    -        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
    +  if (!C.getLangOpts().NativeHalfType &&
    +      C.getTargetInfo().useFP16ConversionIntrinsics()) {
         CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                            " requires native half type support.");
    -    return nullptr;
    +    return false;
       }
    +  return true;
    +}
     
    -  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    -    return MakeLdg(CGF, E);
    -
    -  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    -    return MakeLdu(IntrinsicID, CGF, E);
    +static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID,
    +                           const CallExpr *E, CodeGenFunction &CGF) {
    +  if (!EnsureNativeHalfSupport(BuiltinID, E, CGF))
    +    return nullptr;
     
        SmallVector<Value *, 16> Args;
    -  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
    -  auto *FTy = F->getFunctionType();
    +  auto *FTy = Intrinsic->getFunctionType();
       unsigned ICEArguments = 0;
       ASTContext::GetBuiltinTypeError Error;
    -  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
    +  CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
       assert(Error == ASTContext::GE_None && "Should not codegen an error");
       for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
         assert((ICEArguments & (1 << i)) == 0);
    @@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
         Args.push_back(ArgValue);
       }
     
    -  return CGF.Builder.CreateCall(F, Args);
    +  return CGF.Builder.CreateCall(Intrinsic, Args);
     }
    +
    +static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
    +                           const CallExpr *E, CodeGenFunction &CGF) {
    +  return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
    +}
    +
     } // namespace
     
     Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
    @@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
       }
       // The following builtins require half type support
       case NVPTX::BI__nvvm_ex2_approx_f16:
    -    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
    +    return MakeHalfType(
    +        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()),
    +        BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ex2_approx_f16x2:
    -    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
    +    return MakeHalfType(
    +        CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx,
    +                         FixedVectorType::get(Builder.getHalfTy(), 2)),
    +        BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ff2f16x2_rn:
         return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
       case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    @@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
       case NVPTX::BI__nvvm_fabs_d:
         return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                             EmitScalarExpr(E->getArg(0)));
    +  case NVPTX::BI__nvvm_ex2_approx_d:
    +  case NVPTX::BI__nvvm_ex2_approx_f:
    +    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx,
    +                                        EmitScalarExpr(E->getArg(0)));
    +  case NVPTX::BI__nvvm_ex2_approx_ftz_f:
    +    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz,
    +                                        EmitScalarExpr(E->getArg(0)));
       case NVPTX::BI__nvvm_ldg_h:
       case NVPTX::BI__nvvm_ldg_h2:
    -    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
    +    return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E)
    +                                                        : nullptr;
       case NVPTX::BI__nvvm_ldu_h:
       case NVPTX::BI__nvvm_ldu_h2:
    -    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
    +    return EnsureNativeHalfSupport(BuiltinID, E, *this)
    +               ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E)
    +               : nullptr;
       case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
         return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                            Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
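
Two things change above: the native-half check is factored into EnsureNativeHalfSupport so the ldg/ldu builtins can call their own emission helpers behind a single guard, and the ex2.approx builtins move to a type-overloaded llvm.nvvm.ex2.approx intrinsic. A hedged source-level sketch of what one of those builtins computes:

// Hedged sketch: __nvvm_ex2_approx_f approximates 2^x (PTX ex2.approx.f32).
// Compile for an NVPTX target; under this patch the builtin lowers through
// the type-overloaded llvm.nvvm.ex2.approx intrinsic instead of a
// dedicated per-type intrinsic.
float exp2_approx(float X) {
  return __nvvm_ex2_approx_f(X);
}
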
    diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    index b924407b6ddd7..2381b2e7cf2cf 100644
    --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
    @@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
         // instruction, but it will create a memset that won't be optimized away.
         return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
       }
     -  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
    -  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
    -    Intrinsic::ID IID;
    -    switch (BuiltinID) {
    -    default:
    -      llvm_unreachable("Unsupported intrinsic!");
    -    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
    -      break;
    -    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
    -      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
    -      break;
    -    }
    -
    -    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
    -    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
    -                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
    -
     -    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
    -    assert(PtrTy && "arg3 must be of pointer type");
    -    QualType PtreeTy = PtrTy->getPointeeType();
    -    llvm::Type *TyPtee = ConvertType(PtreeTy);
    -
    -    // Bitcast amx type (x86_amx) to vector type (256 x i32)
    -    // Then store tile0 into DstPtr0
    -    Value *T0 = Builder.CreateExtractValue(Call, 0);
    -    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
    -                                           {TyPtee}, {T0});
    -    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
    -
    -    // Then store tile1 into DstPtr1
    -    Value *T1 = Builder.CreateExtractValue(Call, 1);
    -    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
    -                                           {TyPtee}, {T1});
    -    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
    -
     -    // Note: We deliberately avoid using x86_tilestored64_internal to store
     -    // the results, because it cannot bound the scope of the memory it
     -    // writes. That may cause shape reloads after the first AMX intrinsic,
     -    // which the current AMX register allocation is unable to handle.
    -
    -    return Store;
    -  }
       case X86::BI__ud2:
         // llvm.trap makes a ud2a instruction on x86.
         return EmitTrapCall(Intrinsic::trap);
    diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
    index f63e900669d97..383f52f298d2e 100644
    --- a/clang/lib/CodeGen/TargetInfo.h
    +++ b/clang/lib/CodeGen/TargetInfo.h
    @@ -39,6 +39,7 @@ class ABIInfo;
     class CallArgList;
     class CodeGenFunction;
     class CGBlockInfo;
    +class CGHLSLOffsetInfo;
     class SwiftABIInfo;
     
     /// TargetCodeGenInfo - This class organizes various target-specific
    @@ -442,9 +443,8 @@ class TargetCodeGenInfo {
       }
     
       /// Return an LLVM type that corresponds to a HLSL type
    -  virtual llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *T,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const {
    +  virtual llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T,
    +                                  const CGHLSLOffsetInfo &OffsetInfo) const {
         return nullptr;
       }
     
    diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
    index bb41a14f5d2f3..d42fcd8b3237c 100644
    --- a/clang/lib/CodeGen/Targets/AArch64.cpp
    +++ b/clang/lib/CodeGen/Targets/AArch64.cpp
    @@ -422,6 +422,12 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
       }
     
       // Empty records:
    +  // AAPCS64 does not say that empty records are ignored as arguments,
    +  // but other compilers do so in certain situations, and we copy that behavior.
    +  // Those situations are in fact language-mode-specific, which seems really
    +  // unfortunate, but it's something we just have to accept. If this doesn't
    +  // apply, just fall through to the standard argument-handling path.
    +  // Darwin overrides the psABI here to ignore all empty records in all modes.
       uint64_t Size = getContext().getTypeSize(Ty);
       bool IsEmpty = isEmptyRecord(getContext(), Ty, true);
       if (!Ty->isSVESizelessBuiltinType() && (IsEmpty || Size == 0)) {
    @@ -434,9 +440,6 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
         // behaviour here.
         if (Size == 0)
           return ABIArgInfo::getIgnore();
    -
    -    // Otherwise, they are passed as if they have a size of 1 byte.
    -    return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext()));
       }
     
       // Homogeneous Floating-point Aggregates (HFAs) need to be expanded.
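
The hunk above documents why empty records are special-cased and drops the unconditional "pass as i8" fallback, letting non-ignored empty records fall through to the standard argument path. A small example of the case in question; how it is ultimately passed is mode- and target-dependent, as the comment says:

// An empty record used as an argument. In C++ it still has size 1, so
// whether it is ignored or passed through the normal path depends on the
// language mode and on Darwin's psABI override described above.
struct Empty {};

int take(Empty, int X) { return X; }
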
    diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp
    index b4cebb9a32aca..f30b30284cb12 100644
    --- a/clang/lib/CodeGen/Targets/DirectX.cpp
    +++ b/clang/lib/CodeGen/Targets/DirectX.cpp
    @@ -29,14 +29,13 @@ class DirectXTargetCodeGenInfo : public TargetCodeGenInfo {
       DirectXTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
           : TargetCodeGenInfo(std::make_unique(CGT)) {}
     
    -  llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *T,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const override;
    +  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T,
    +                          const CGHLSLOffsetInfo &OffsetInfo) const override;
     };
     
     llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(
         CodeGenModule &CGM, const Type *Ty,
     -    const SmallVector<int32_t> *Packoffsets) const {
    +    const CGHLSLOffsetInfo &OffsetInfo) const {
       auto *ResType = dyn_cast<HLSLAttributedResourceType>(Ty);
       if (!ResType)
         return nullptr;
    @@ -78,7 +77,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(
         llvm::Type *BufferLayoutTy =
             HLSLBufferLayoutBuilder(CGM, "dx.Layout")
              .createLayoutType(ContainedTy->castAsCanonical<RecordType>(),
    -                              Packoffsets);
    +                              OffsetInfo);
         if (!BufferLayoutTy)
           return nullptr;
     
    diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
    index 15d0b353d748c..be7e9ccecae9f 100644
    --- a/clang/lib/CodeGen/Targets/SPIR.cpp
    +++ b/clang/lib/CodeGen/Targets/SPIR.cpp
    @@ -53,9 +53,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
     
       unsigned getDeviceKernelCallingConv() const override;
       llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override;
    -  llvm::Type *
    -  getHLSLType(CodeGenModule &CGM, const Type *Ty,
     -              const SmallVector<int32_t> *Packoffsets = nullptr) const override;
    +  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty,
    +                          const CGHLSLOffsetInfo &OffsetInfo) const override;
       llvm::Type *getSPIRVImageTypeFromHLSLResource(
           const HLSLAttributedResourceType::Attributes &attributes,
           QualType SampledType, CodeGenModule &CGM) const;
    @@ -260,7 +259,16 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
       LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType()
                       ? LangAS::Default
                       : QT->getPointeeType().getAddressSpace();
    -  if (AS == LangAS::Default || AS == LangAS::opencl_generic)
     +  unsigned ASAsInt = static_cast<unsigned>(AS);
    +  unsigned FirstTargetASAsInt =
     +      static_cast<unsigned>(LangAS::FirstTargetAddressSpace);
    +  unsigned CodeSectionINTELAS = FirstTargetASAsInt + 9;
    +  // As per SPV_INTEL_function_pointers, it is illegal to addrspacecast
    +  // function pointers to/from the generic AS.
    +  bool IsFunctionPtrAS =
    +      CGM.getTriple().isSPIRV() && ASAsInt == CodeSectionINTELAS;
    +  if (AS == LangAS::Default || AS == LangAS::opencl_generic ||
    +      AS == LangAS::opencl_constant || IsFunctionPtrAS)
         return llvm::ConstantPointerNull::get(PT);
     
       auto &Ctx = CGM.getContext();
    @@ -509,7 +517,7 @@ static llvm::Type *getInlineSpirvType(CodeGenModule &CGM,
     
     llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(
         CodeGenModule &CGM, const Type *Ty,
     -    const SmallVector<int32_t> *Packoffsets) const {
    +    const CGHLSLOffsetInfo &OffsetInfo) const {
       llvm::LLVMContext &Ctx = CGM.getLLVMContext();
     
       if (auto *SpirvType = dyn_cast<HLSLInlineSpirvType>(Ty))
    @@ -558,7 +566,7 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(
         llvm::Type *BufferLayoutTy =
             HLSLBufferLayoutBuilder(CGM, "spirv.Layout")
                  .createLayoutType(ContainedTy->castAsCanonical<RecordType>(),
    -                              Packoffsets);
    +                              OffsetInfo);
         uint32_t StorageClass = /* Uniform storage class */ 2;
         return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", {BufferLayoutTy},
                                         {StorageClass, false});
    diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
    index 8daf8eb1d39f1..f9a84ecca074f 100644
    --- a/clang/lib/CodeGen/Targets/X86.cpp
    +++ b/clang/lib/CodeGen/Targets/X86.cpp
    @@ -27,6 +27,14 @@ bool IsX86_MMXType(llvm::Type *IRType) {
     static llvm::Type *X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                               StringRef Constraint,
                                               llvm::Type *Ty) {
     +  bool IsMMXCons = llvm::StringSwitch<bool>(Constraint)
    +                       .Cases({"y", "&y", "^Ym"}, true)
    +                       .Default(false);
    +  if (IsMMXCons && Ty->isVectorTy() &&
     +      cast<llvm::VectorType>(Ty)->getPrimitiveSizeInBits().getFixedValue() !=
    +          64)
    +    return nullptr; // Invalid MMX constraint
    +
       if (Constraint == "k") {
         llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
         return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
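
The added check rejects MMX register constraints whose operand is not a 64-bit vector instead of silently adjusting the type. A sketch of inline asm that satisfies the check (hypothetical example, not from the patch):

// A 64-bit vector matches the MMX "y" constraint and passes the new
// size check; binding a 128-bit vector to "y" would now be rejected.
typedef int v2si __attribute__((vector_size(8)));

v2si passthrough(v2si A) {
  asm("" : "+y"(A));
  return A;
}
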
    diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
    index 7c4f70b966c48..8052659e9836b 100644
    --- a/clang/lib/Driver/CMakeLists.txt
    +++ b/clang/lib/Driver/CMakeLists.txt
    @@ -19,12 +19,10 @@ add_clang_library(clangDriver
       Compilation.cpp
       Distro.cpp
       Driver.cpp
    -  DriverOptions.cpp
       Job.cpp
       Multilib.cpp
       MultilibBuilder.cpp
       OffloadBundler.cpp
    -  OptionUtils.cpp
       Phases.cpp
       SanitizerArgs.cpp
       Tool.cpp
    @@ -99,5 +97,6 @@ add_clang_library(clangDriver
       LINK_LIBS
       clangBasic
       clangLex
    +  clangOptions
       ${system_libs}
       )
    diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
    index 4e300316ae9ba..f8ca2a3d09407 100644
    --- a/clang/lib/Driver/Compilation.cpp
    +++ b/clang/lib/Driver/Compilation.cpp
    @@ -11,9 +11,9 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Util.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Option/OptSpecifier.h"
     #include "llvm/Option/Option.h"
    diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
    index 71c52807091ba..9fd64d4aac514 100644
    --- a/clang/lib/Driver/Driver.cpp
    +++ b/clang/lib/Driver/Driver.cpp
    @@ -60,13 +60,13 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Phases.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Types.h"
     #include "clang/Lex/DependencyDirectivesScanner.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/SmallSet.h"
    @@ -2540,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) {
       }
     
       if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
     -    if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
    -      llvm::outs() << *RuntimePath << '\n';
    -    else
    -      llvm::outs() << TC.getCompilerRTPath() << '\n';
    +    for (auto RuntimePath :
    +         {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) {
    +      if (RuntimePath && getVFS().exists(*RuntimePath)) {
    +        llvm::outs() << *RuntimePath << '\n';
    +        return false;
    +      }
    +    }
    +    llvm::outs() << "(runtime dir is not present)" << '\n';
         return false;
       }
     
    @@ -3853,6 +3857,9 @@ class OffloadingActionBuilder final {
       /// Flag set to true if all valid builders allow file bundling/unbundling.
       bool CanUseBundler;
     
    +  /// Flag set to false if an argument turns off bundling.
    +  bool ShouldUseBundler;
    +
     public:
       OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
                               const Driver::InputList &Inputs)
    @@ -3887,6 +3894,9 @@ class OffloadingActionBuilder final {
         }
         CanUseBundler =
             ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling;
    +
    +    ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output,
    +                                    options::OPT_no_gpu_bundle_output, true);
       }
     
       ~OffloadingActionBuilder() {
    @@ -4038,11 +4048,11 @@ class OffloadingActionBuilder final {
           SB->appendTopLevelActions(OffloadAL);
         }
     
    -    // If we can use the bundler, replace the host action by the bundling one in
    -    // the resulting list. Otherwise, just append the device actions. For
    -    // device only compilation, HostAction is a null pointer, therefore only do
    -    // this when HostAction is not a null pointer.
    -    if (CanUseBundler && HostAction &&
    +    // If we can and should use the bundler, replace the host action by the
    +    // bundling one in the resulting list. Otherwise, just append the device
    +    // actions. For device only compilation, HostAction is a null pointer,
    +    // therefore only do this when HostAction is not a null pointer.
    +    if (CanUseBundler && ShouldUseBundler && HostAction &&
             HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) {
           // Add the host action to the list in order to create the bundling action.
           OffloadAL.push_back(HostAction);
    @@ -6459,9 +6469,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
                   (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC &&
                    TC->getTriple().isAMDGPU()));
         };
    -    if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC &&
    -        (C.getArgs().hasArg(options::OPT_emit_llvm) ||
    -         IsAMDRDCInCompilePhase(JA, C.getArgs())))
    +
     +    // The linker wrapper may not support using the same file as both its
     +    // input and output, and without the ".tmp" suffix -save-temps can fail.
    +    bool IsLinkerWrapper =
     +        JA.getType() == types::TY_Object && isa<LinkerWrapperJobAction>(JA);
    +    bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC &&
    +                         (C.getArgs().hasArg(options::OPT_emit_llvm) ||
    +                          IsAMDRDCInCompilePhase(JA, C.getArgs()));
    +
    +    if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode))
           Suffixed += ".tmp";
         Suffixed += '.';
         Suffixed += Suffix;
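
The effect of the new condition is purely on naming: intermediate outputs that would otherwise collide with the final output gain a ".tmp" component before their suffix. A tiny sketch of the suffixing rule (helper name hypothetical):

#include <string>

// foo + .tmp? + .o  ->  "foo.tmp.o" for linker-wrapper/bitcode
// temporaries, "foo.o" otherwise.
std::string namedOutput(std::string Base, bool IsTemporary,
                        const std::string &Suffix) {
  if (IsTemporary)
    Base += ".tmp";
  return Base + "." + Suffix;
}
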
    diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
    index 5dd48f53b9069..420c4cddbc8dd 100644
    --- a/clang/lib/Driver/SanitizerArgs.cpp
    +++ b/clang/lib/Driver/SanitizerArgs.cpp
    @@ -8,8 +8,8 @@
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Basic/Sanitizers.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallVector.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/ADT/StringSwitch.h"
    diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
    index eea5c2f7f4a6a..5ff7d83946137 100644
    --- a/clang/lib/Driver/ToolChain.cpp
    +++ b/clang/lib/Driver/ToolChain.cpp
    @@ -21,9 +21,9 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringRef.h"
    @@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
     
     Multilib::flags_list
     ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const {
    -  using namespace clang::driver::options;
    +  using namespace clang::options;
     
       std::vector Result;
       const llvm::Triple Triple(ComputeEffectiveClangTriple(Args));
    @@ -851,8 +851,11 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
                        options::OPT_fno_openmp, false)) {
         Driver::OpenMPRuntimeKind OMPRuntime = getDriver().getOpenMPRuntime(Args);
         ToolChain::RuntimeLibType RuntimeLib = GetRuntimeLibType(Args);
    -    if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc)
    +    if ((OMPRuntime == Driver::OMPRT_OMP &&
    +         RuntimeLib == ToolChain::RLT_Libgcc) &&
    +        !getTriple().isKnownWindowsMSVCEnvironment()) {
           CmdArgs.push_back("-latomic");
    +    }
       }
     }
     
    @@ -1802,7 +1805,7 @@ void ToolChain::TranslateXarchArgs(
       unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos));
       unsigned Prev = Index;
       std::unique_ptr XarchArg(Opts.ParseOneArg(
    -      Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption)));
    +      Args, Index, llvm::opt::Visibility(options::ClangOption)));
     
       // If the argument parsing failed or more than one argument was
       // consumed, the -Xarch_ argument's parameter tried to consume
    diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
    index 066b59305fe3f..a8acf9cfc44c9 100644
    --- a/clang/lib/Driver/ToolChains/AIX.cpp
    +++ b/clang/lib/Driver/ToolChains/AIX.cpp
    @@ -9,8 +9,8 @@
     #include "AIX.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
    @@ -19,6 +19,7 @@
     #include 
     
     using AIX = clang::driver::toolchains::AIX;
    +using namespace clang;
     using namespace clang::driver;
     using namespace clang::driver::tools;
     using namespace clang::driver::toolchains;
    @@ -167,8 +168,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
            Args.hasArg(options::OPT_coverage))
         CmdArgs.push_back("-bdbg:namedsects:ss");
     
    -  if (Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) {
         StringRef BuildId = A->getValue();
         if (BuildId[0] != '0' || BuildId[1] != 'x' ||
             BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos)
    diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
    index 654a382e87e40..9dc2c6ce39ae4 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
    +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Error.h"
    @@ -22,6 +22,7 @@
     #include "llvm/Support/Process.h"
     #include "llvm/Support/VirtualFileSystem.h"
     #include "llvm/TargetParser/Host.h"
    +#include "llvm/TargetParser/TargetParser.h"
     #include 
     #include 
     
    @@ -322,27 +323,24 @@ RocmInstallationDetector::RocmInstallationDetector(
         const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib)
         : D(D) {
       Verbose = Args.hasArg(options::OPT_v);
    -  RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ);
    -  PrintROCmSearchDirs =
    -      Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs);
    +  RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ);
    +  PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs);
       RocmDeviceLibPathArg =
    -      Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ);
    -  HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ);
    -  HIPStdParPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ);
    +      Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ);
    +  HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ);
    +  HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ);
       HasHIPStdParLibrary =
         !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg +
                                                        "/hipstdpar_lib.hpp");
       HIPRocThrustPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ);
    +      Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ);
       HasRocThrustLibrary = !HIPRocThrustPathArg.empty() &&
                             D.getVFS().exists(HIPRocThrustPathArg + "/thrust");
    -  HIPRocPrimPathArg =
    -    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ);
    +  HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ);
       HasRocPrimLibrary = !HIPRocPrimPathArg.empty() &&
                           D.getVFS().exists(HIPRocPrimPathArg + "/rocprim");
     
    -  if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) {
    +  if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) {
         HIPVersionArg = A->getValue();
         unsigned Major = ~0U;
         unsigned Minor = ~0U;
    @@ -1095,9 +1093,21 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption(
       if (K != SanitizerKind::Address)
         return true;
     
     +  // First check whether the processor implies 'xnack+' unconditionally.
    +  llvm::StringRef Processor =
    +      getProcessorFromTargetID(TC.getTriple(), TargetID);
    +  auto ProcKind = TC.getTriple().isAMDGCN()
    +                      ? llvm::AMDGPU::parseArchAMDGCN(Processor)
    +                      : llvm::AMDGPU::parseArchR600(Processor);
    +  auto Features = TC.getTriple().isAMDGCN()
    +                      ? llvm::AMDGPU::getArchAttrAMDGCN(ProcKind)
    +                      : llvm::AMDGPU::getArchAttrR600(ProcKind);
    +  if (Features & llvm::AMDGPU::FEATURE_XNACK_ALWAYS)
    +    return false;
    +
    +  // Look for the xnack feature in TargetID
       llvm::StringMap<bool> FeatureMap;
       auto OptionalGpuArch = parseTargetID(TC.getTriple(), TargetID, &FeatureMap);
    -
       assert(OptionalGpuArch && "Invalid Target ID");
       (void)OptionalGpuArch;
       auto Loc = FeatureMap.find("xnack");
    diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
    index e90a5736911e4..7b999c311154f 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPU.h
    +++ b/clang/lib/Driver/ToolChains/AMDGPU.h
    @@ -11,9 +11,9 @@
     
     #include "Gnu.h"
     #include "clang/Basic/TargetID.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/TargetParser/TargetParser.h"
     
    diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    index 2b41d54a9eb73..e14bc574d139a 100644
    --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    @@ -10,8 +10,8 @@
     #include "AMDGPU.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     
     using namespace clang::driver;
    diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
    index 731076d9754a9..588255dc5a0cd 100644
    --- a/clang/lib/Driver/ToolChains/AVR.cpp
    +++ b/clang/lib/Driver/ToolChains/AVR.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    index e8d5e38a9064f..d6fb2a57539ed 100644
    --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
    @@ -9,7 +9,7 @@
     #include "AArch64.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/AArch64TargetParser.h"
     #include "llvm/TargetParser/Host.h"
    @@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
         // Default to 'A' profile if the architecture is not specified.
         success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions);
     
    -  if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)))
    +  if (success && (A = Args.getLastArg(options::OPT_mtune_EQ)))
         success =
             getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features);
       else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ)))
    diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    index 61beb0455147d..55eb2dcf7ddf4 100644
    --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
    @@ -8,7 +8,7 @@
     
     #include "ARM.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/ARMTargetParser.h"
    @@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) {
     // Get Arch/CPU from args.
     void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch,
                                     llvm::StringRef &CPU, bool FromAs) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
         CPU = A->getValue();
       if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
         Arch = A->getValue();
    diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    index 2fd2c72147f5b..65f6534e4d038 100644
    --- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
    @@ -8,7 +8,7 @@
     
     #include "CSKY.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/CSKYTargetParser.h"
    @@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args,
         return std::optional(A->getValue());
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue());
         if (ArchKind == llvm::CSKY::ArchKind::INVALID) {
           D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
    @@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple,
         archName = A->getValue();
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue());
         if (Kind == llvm::CSKY::ArchKind::INVALID) {
           D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
    diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    index 156ea03045569..da084bdabaee3 100644
    --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    @@ -11,7 +11,7 @@
     #include "clang/Basic/DiagnosticDriver.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/TargetParser/Host.h"
     #include "llvm/TargetParser/LoongArchTargetParser.h"
     
    @@ -130,8 +130,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                                const ArgList &Args,
                                                std::vector &Features) {
       // Enable the `lsx` feature on 64-bit LoongArch by default.
    -  if (Triple.isLoongArch64() &&
    -      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
    +  if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ)))
         Features.push_back("+lsx");
     
       // -mrelax is default, unless -mno-relax is specified.
    diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    index 1037c0ea80bf6..a620597f10475 100644
    --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
    @@ -8,7 +8,7 @@
     
     #include "M68k.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Regex.h"
    @@ -21,7 +21,7 @@ using namespace llvm::opt;
     
     /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting.
     std::string m68k::getM68kTargetCPU(const ArgList &Args) {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         // The canonical CPU name is capitalized. However, we allow it to
         // start with a lower-case letter or with digits only.
         StringRef CPUName = A->getValue();
    @@ -36,26 +36,26 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) {
           return "generic";
     
         return llvm::StringSwitch<std::string>(CPUName)
    -        .Cases("m68000", "68000", "M68000")
    -        .Cases("m68010", "68010", "M68010")
    -        .Cases("m68020", "68020", "M68020")
    -        .Cases("m68030", "68030", "M68030")
    -        .Cases("m68040", "68040", "M68040")
    -        .Cases("m68060", "68060", "M68060")
    +        .Cases({"m68000", "68000"}, "M68000")
    +        .Cases({"m68010", "68010"}, "M68010")
    +        .Cases({"m68020", "68020"}, "M68020")
    +        .Cases({"m68030", "68030"}, "M68030")
    +        .Cases({"m68040", "68040"}, "M68040")
    +        .Cases({"m68060", "68060"}, "M68060")
             .Default(CPUName.str());
       }
       // FIXME: Throw error when multiple sub-architecture flag exist
    -  if (Args.hasArg(clang::driver::options::OPT_m68000))
    +  if (Args.hasArg(options::OPT_m68000))
         return "M68000";
    -  if (Args.hasArg(clang::driver::options::OPT_m68010))
    +  if (Args.hasArg(options::OPT_m68010))
         return "M68010";
    -  if (Args.hasArg(clang::driver::options::OPT_m68020))
    +  if (Args.hasArg(options::OPT_m68020))
         return "M68020";
    -  if (Args.hasArg(clang::driver::options::OPT_m68030))
    +  if (Args.hasArg(options::OPT_m68030))
         return "M68030";
    -  if (Args.hasArg(clang::driver::options::OPT_m68040))
    +  if (Args.hasArg(options::OPT_m68040))
         return "M68040";
    -  if (Args.hasArg(clang::driver::options::OPT_m68060))
    +  if (Args.hasArg(options::OPT_m68060))
         return "M68060";
     
       return "";
    diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    index 6a6a4ee1a647b..103aae7018fbf 100644
    --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
    @@ -9,7 +9,7 @@
     #include "Mips.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     
    @@ -49,8 +49,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
         DefMips64CPU = "mips3";
       }
     
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ,
    -                               options::OPT_mcpu_EQ))
    +  if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ))
         CPUName = A->getValue();
     
       if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
    @@ -117,7 +116,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
         // Deduce CPU name from ABI name.
          CPUName = llvm::StringSwitch<const char *>(ABIName)
                       .Case("o32", DefMips32CPU)
    -                  .Cases("n32", "n64", DefMips64CPU)
    +                  .Cases({"n32", "n64"}, DefMips64CPU)
                       .Default("");
       }
     
    @@ -467,7 +466,7 @@ bool mips::isNaN2008(const Driver &D, const ArgList &Args,
     
       // NaN2008 is the default for MIPS32r6/MIPS64r6.
       return llvm::StringSwitch<bool>(getCPUName(D, Args, Triple))
    -      .Cases("mips32r6", "mips64r6", true)
    +      .Cases({"mips32r6", "mips64r6"}, true)
           .Default(false);
     }
     
    diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    index 361a68a892a8f..44afdd249fea5 100644
    --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
    @@ -9,7 +9,7 @@
     #include "PPC.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    index f2e79e71f93d4..1dcce6d053a39 100644
    --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
    @@ -10,7 +10,7 @@
     #include "../Clang.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Error.h"
     #include "llvm/TargetParser/Host.h"
    diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    index 94a94f1e9c487..49256d80cbdf6 100644
    --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
    @@ -8,7 +8,7 @@
     
     #include "Sparc.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
    @@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D,
     
     std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
                                          const llvm::Triple &Triple) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
         StringRef CPUName = A->getValue();
         if (CPUName == "native") {
           std::string CPU = std::string(llvm::sys::getHostCPUName());
    diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    index 75b6afd925245..1ef6a725483e8 100644
    --- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
    @@ -8,7 +8,7 @@
     
     #include "SystemZ.h"
     #include "clang/Config/config.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/TargetParser/Host.h"
     
    @@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
         D.Diag(diag::err_drv_unsupported_opt)
           << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args);
     
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float,
    -                               options::OPT_mhard_float))
    -    if (A->getOption().matches(clang::driver::options::OPT_msoft_float))
    +  if (Arg *A =
    +          Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float))
    +    if (A->getOption().matches(options::OPT_msoft_float))
           ABI = systemz::FloatABI::Soft;
     
       return ABI;
    @@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
     
     std::string systemz::getSystemZTargetCPU(const ArgList &Args,
                                              const llvm::Triple &T) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         llvm::StringRef CPUName = A->getValue();
     
         if (CPUName == "native") {
    diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp
    index adc0873586588..c8353d7dc5f3a 100644
    --- a/clang/lib/Driver/ToolChains/Arch/VE.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp
    @@ -8,7 +8,7 @@
     
     #include "VE.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     
     using namespace clang::driver;
    diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
    index 1373905a5120e..092069b6ade56 100644
    --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
    +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
    @@ -8,7 +8,7 @@
     
     #include "X86.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringMap.h"
     #include "llvm/Option/ArgList.h"
    @@ -21,7 +21,7 @@ using namespace llvm::opt;
     
     std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
                                      const llvm::Triple &Triple) {
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         StringRef CPU = A->getValue();
         if (CPU != "native")
           return std::string(CPU);
    @@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                    std::vector &Features) {
       // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
       // "ms_abi" as default function attributes.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
         StringRef DefaultAbi =
             (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv";
         if (A->getValue() != DefaultAbi)
    @@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
       }
     
       // If -march=native, autodetect the feature list.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
         if (StringRef(A->getValue()) == "native") {
           for (auto &F : llvm::sys::getHostCPUFeatures())
             Features.push_back(
    @@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
       // flags). This is a bit hacky but keeps existing usages working. We should
       // consider deprecating this and instead warn if the user requests external
       // retpoline thunks and *doesn't* request some form of retpolines.
    -  auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
    +  auto SpectreOpt = options::ID::OPT_INVALID;
       if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
                              options::OPT_mspeculative_load_hardening,
                              options::OPT_mno_speculative_load_hardening)) {
    @@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
         SpectreOpt = options::OPT_mretpoline_external_thunk;
       }
     
    -  auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
    +  auto LVIOpt = options::ID::OPT_INVALID;
       if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
                        false)) {
         Features.push_back("+lvi-load-hardening");
    @@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
               << D.getOpts().getOptionName(options::OPT_mlvi_hardening)
               << D.getOpts().getOptionName(options::OPT_m_seses);
     
    -    if (SpectreOpt != clang::driver::options::ID::OPT_INVALID)
    +    if (SpectreOpt != options::ID::OPT_INVALID)
           D.Diag(diag::err_drv_argument_not_allowed_with)
               << D.getOpts().getOptionName(SpectreOpt)
               << D.getOpts().getOptionName(options::OPT_m_seses);
    @@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
         }
       }
     
    -  if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
    -      LVIOpt != clang::driver::options::ID::OPT_INVALID) {
    +  if (SpectreOpt != options::ID::OPT_INVALID &&
    +      LVIOpt != options::ID::OPT_INVALID) {
         D.Diag(diag::err_drv_argument_not_allowed_with)
             << D.getOpts().getOptionName(SpectreOpt)
             << D.getOpts().getOptionName(LVIOpt);
    diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
    index 9b7f58c392885..8d598be9ffb0a 100644
    --- a/clang/lib/Driver/ToolChains/BareMetal.cpp
    +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
    @@ -18,7 +18,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
    @@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
     bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
                                         const llvm::opt::ArgList &Args) {
       if (Args.getLastArg(options::OPT_gcc_toolchain) ||
    -      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
    +      Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) {
         GCCInstallation.init(Triple, Args);
         return GCCInstallation.isValid();
       }
    diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    index e4db3307ee3aa..c561d7d38da5b 100644
    --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
    index 4e8f63ea49480..80389937ee218 100644
    --- a/clang/lib/Driver/ToolChains/Clang.cpp
    +++ b/clang/lib/Driver/ToolChains/Clang.cpp
    @@ -29,10 +29,10 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/Types.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ScopeExit.h"
     #include "llvm/ADT/SmallSet.h"
     #include "llvm/ADT/StringExtras.h"
    @@ -65,7 +65,7 @@ using namespace clang;
     using namespace llvm::opt;
     
     static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
    +  if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC,
                                    options::OPT_fminimize_whitespace,
                                    options::OPT_fno_minimize_whitespace,
                                    options::OPT_fkeep_system_includes,
    @@ -1661,7 +1661,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
     
       AddAAPCSVolatileBitfieldArgs(Args, CmdArgs);
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         CmdArgs.push_back("-tune-cpu");
         if (strcmp(A->getValue(), "native") == 0)
           CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName()));
    @@ -2067,7 +2067,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args,
         CmdArgs.push_back("hard");
       }
     
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         StringRef Name = A->getValue();
         std::string TuneCPU;
         if (Name == "native")
    @@ -2173,12 +2173,11 @@ void Clang::AddX86TargetArgs(const ArgList &Args,
     
       // Default to "generic" unless -march is present or targetting the PS4/PS5.
       std::string TuneCPU;
    -  if (!Args.hasArg(clang::driver::options::OPT_march_EQ) &&
    -      !getToolChain().getTriple().isPS())
    +  if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS())
         TuneCPU = "generic";
     
       // Override based on -mtune.
    -  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
         StringRef Name = A->getValue();
     
         if (Name == "native") {
    @@ -3708,6 +3707,7 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs,
           options::OPT_emit_obj,
           options::OPT_disable_llvm_passes,
           options::OPT_fnative_half_type,
    +      options::OPT_fnative_int16_type,
           options::OPT_hlsl_entrypoint,
           options::OPT_fdx_rootsignature_define,
           options::OPT_fdx_rootsignature_version,
    @@ -5704,6 +5704,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       case CodeGenOptions::FramePointerKind::Reserved:
         FPKeepKindStr = "-mframe-pointer=reserved";
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve";
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         FPKeepKindStr = "-mframe-pointer=non-leaf";
         break;
    @@ -7878,10 +7881,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                            !TC.getTriple().isAndroid() && TC.useIntegratedAs()))
         CmdArgs.push_back("-faddrsig");
     
    -  if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) &&
    +  const bool HasDefaultDwarf2CFIASM =
    +      (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) &&
           (EH || UnwindTables || AsyncUnwindTables ||
    -       DebugInfoKind != llvm::codegenoptions::NoDebugInfo))
    -    CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1");
    +       DebugInfoKind != llvm::codegenoptions::NoDebugInfo);
    +  if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm,
    +                   options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM))
    +    CmdArgs.push_back("-fdwarf2-cfi-asm");
     
       if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) {
         std::string Str = A->getAsString(Args);
    @@ -8262,6 +8268,30 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
                                                          << "/kernel";
      }
     
    +  if (const Arg *A = Args.getLastArg(options::OPT__SLASH_vlen,
    +                                     options::OPT__SLASH_vlen_EQ_256,
    +                                     options::OPT__SLASH_vlen_EQ_512)) {
    +    llvm::Triple::ArchType AT = getToolChain().getArch();
    +    StringRef Default = AT == llvm::Triple::x86 ? "IA32" : "SSE2";
    +    StringRef Arch = Args.getLastArgValue(options::OPT__SLASH_arch, Default);
    +
    +    if (A->getOption().matches(options::OPT__SLASH_vlen_EQ_512)) {
    +      if (Arch == "AVX512F" || Arch == "AVX512")
    +        CmdArgs.push_back("-mprefer-vector-width=512");
    +      else
    +        D.Diag(diag::warn_drv_argument_not_allowed_with)
    +            << "/vlen=512" << std::string("/arch:").append(Arch);
    +    }
    +
    +    if (A->getOption().matches(options::OPT__SLASH_vlen_EQ_256)) {
    +      if (Arch == "AVX512F" || Arch == "AVX512")
    +        CmdArgs.push_back("-mprefer-vector-width=256");
    +      else if (Arch != "AVX" && Arch != "AVX2")
    +        D.Diag(diag::warn_drv_argument_not_allowed_with)
    +            << "/vlen=256" << std::string("/arch:").append(Arch);
    +    }
    +  }
    +
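
A standalone sketch of the /vlen-to-cc1 mapping this hunk adds; the helper name and the std::optional return are illustrative, not the driver's API. An empty result covers both the silent case (/vlen=256 under /arch:AVX or /arch:AVX2) and the diagnosed mismatches:

#include <optional>
#include <string>

std::optional<std::string> preferVectorWidth(const std::string &Vlen,
                                             const std::string &Arch) {
  const bool HasAVX512 = Arch == "AVX512F" || Arch == "AVX512";
  if (Vlen == "512" && HasAVX512)
    return "-mprefer-vector-width=512";
  if (Vlen == "256" && HasAVX512)
    return "-mprefer-vector-width=256";
  // /vlen=256 is accepted but silent on AVX/AVX2; every other mismatch is
  // reported via warn_drv_argument_not_allowed_with.
  return std::nullopt;
}
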
       Arg *MostGeneralArg = Args.getLastArg(options::OPT__SLASH_vmg);
       Arg *BestCaseArg = Args.getLastArg(options::OPT__SLASH_vmb);
       if (MostGeneralArg && BestCaseArg)
    diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
    index ec8dcdc81db56..4c036f0f8dee3 100644
    --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
    +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
    @@ -31,11 +31,11 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Driver/Util.h"
     #include "clang/Driver/XRayArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/SmallSet.h"
     #include "llvm/ADT/SmallString.h"
    @@ -69,8 +69,7 @@ using namespace llvm::opt;
     
     static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
                                                   const llvm::Triple &Triple) {
    -  if (Args.hasArg(clang::driver::options::OPT_pg) &&
    -      !Args.hasArg(clang::driver::options::OPT_mfentry))
    +  if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
         return true;
     
       if (Triple.isAndroid())
    @@ -222,26 +221,39 @@ static bool framePointerImpliesLeafFramePointer(const llvm::opt::ArgList &Args,
     clang::CodeGenOptions::FramePointerKind
     getFramePointerKind(const llvm::opt::ArgList &Args,
                         const llvm::Triple &Triple) {
    -  // There are three things to consider here:
    +  // There are four things to consider here:
       // * Should a frame record be created for non-leaf functions?
       // * Should a frame record be created for leaf functions?
    -  // * Is the frame pointer register reserved, i.e. must it always point to
    -  //   either a new, valid frame record or be un-modified?
    +  // * Is the frame pointer register reserved in non-leaf functions?
    +  //   i.e. must it always point to either a new, valid frame record or be
    +  //   un-modified?
    +  // * Is the frame pointer register reserved in leaf functions?
       //
       //  Not all combinations of these are valid:
       //  * It's not useful to have leaf frame records without non-leaf ones.
       //  * It's not useful to have frame records without reserving the frame
       //    pointer.
       //
    -  // | Non-leaf | Leaf | Reserved |
    -  // | N        | N    | N        | FramePointerKind::None
    -  // | N        | N    | Y        | FramePointerKind::Reserved
    -  // | N        | Y    | N        | Invalid
    -  // | N        | Y    | Y        | Invalid
    -  // | Y        | N    | N        | Invalid
    -  // | Y        | N    | Y        | FramePointerKind::NonLeaf
    -  // | Y        | Y    | N        | Invalid
    -  // | Y        | Y    | Y        | FramePointerKind::All
    +  // | Frame Setup     | Reg Reserved    |
    +  // |-----------------|-----------------|
    +  // | Non-leaf | Leaf | Non-Leaf | Leaf |
    +  // |----------|------|----------|------|
    +  // | N        | N    | N        | N    | FramePointerKind::None
    +  // | N        | N    | N        | Y    | Invalid
    +  // | N        | N    | Y        | N    | Invalid
    +  // | N        | N    | Y        | Y    | FramePointerKind::Reserved
    +  // | N        | Y    | N        | N    | Invalid
    +  // | N        | Y    | N        | Y    | Invalid
    +  // | N        | Y    | Y        | N    | Invalid
    +  // | N        | Y    | Y        | Y    | Invalid
    +  // | Y        | N    | N        | N    | Invalid
    +  // | Y        | N    | N        | Y    | Invalid
    +  // | Y        | N    | Y        | N    | FramePointerKind::NonLeafNoReserve
    +  // | Y        | N    | Y        | Y    | FramePointerKind::NonLeaf
    +  // | Y        | Y    | N        | N    | Invalid
    +  // | Y        | Y    | N        | Y    | Invalid
    +  // | Y        | Y    | Y        | N    | Invalid
    +  // | Y        | Y    | Y        | Y    | FramePointerKind::All
       //
       // The FramePointerKind::Reserved case is currently only reachable for Arm,
       // which has the -mframe-chain= option which can (in combination with
    @@ -249,24 +261,29 @@ getFramePointerKind(const llvm::opt::ArgList &Args,
       // without requiring new frame records to be created.
     
       bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple);
    -  bool EnableFP =
    -      mustUseNonLeafFramePointerForTarget(Triple) ||
    -      Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer,
    -                   clang::driver::options::OPT_fomit_frame_pointer, DefaultFP);
    +  bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) ||
    +                  Args.hasFlag(options::OPT_fno_omit_frame_pointer,
    +                               options::OPT_fomit_frame_pointer, DefaultFP);
     
       bool DefaultLeafFP =
           useLeafFramePointerForTargetByDefault(Triple) ||
           (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple));
    -  bool EnableLeafFP = Args.hasFlag(
    -      clang::driver::options::OPT_mno_omit_leaf_frame_pointer,
    -      clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
    +  bool EnableLeafFP =
    +      Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer,
    +                   options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
     
    -  bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple);
    +  bool FPRegReserved = Args.hasFlag(options::OPT_mreserve_frame_pointer_reg,
    +                                    options::OPT_mno_reserve_frame_pointer_reg,
    +                                    mustMaintainValidFrameChain(Args, Triple));
     
       if (EnableFP) {
         if (EnableLeafFP)
           return clang::CodeGenOptions::FramePointerKind::All;
    -    return clang::CodeGenOptions::FramePointerKind::NonLeaf;
    +
    +    if (FPRegReserved)
    +      return clang::CodeGenOptions::FramePointerKind::NonLeaf;
    +
    +    return clang::CodeGenOptions::FramePointerKind::NonLeafNoReserve;
       }
       if (FPRegReserved)
         return clang::CodeGenOptions::FramePointerKind::Reserved;
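
For reference, the table above reduces to a small total function. This is an editorial sketch, not driver code: the enum mirrors clang::CodeGenOptions::FramePointerKind, and std::nullopt stands in for the rows marked Invalid.

#include <optional>

enum class FramePointerKind { None, Reserved, NonLeafNoReserve, NonLeaf, All };

std::optional<FramePointerKind> classify(bool NonLeafFP, bool LeafFP,
                                         bool NonLeafRes, bool LeafRes) {
  if (!NonLeafFP && !LeafFP && !NonLeafRes && !LeafRes)
    return FramePointerKind::None;
  if (!NonLeafFP && !LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::Reserved;
  if (NonLeafFP && !LeafFP && NonLeafRes && !LeafRes)
    return FramePointerKind::NonLeafNoReserve;
  if (NonLeafFP && !LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::NonLeaf;
  if (NonLeafFP && LeafFP && NonLeafRes && LeafRes)
    return FramePointerKind::All;
  return std::nullopt; // all remaining combinations are invalid
}
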
    @@ -753,7 +770,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
       case llvm::Triple::ppcle:
       case llvm::Triple::ppc64:
       case llvm::Triple::ppc64le:
    -    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
    +    if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
           return std::string(
               llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
         return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
    @@ -1733,7 +1750,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
         if (SanArgs.needsFuzzerInterceptors())
           addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false,
                               true);
    -    if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) {
    +    if (!Args.hasArg(options::OPT_nostdlibxx)) {
           bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                      !Args.hasArg(options::OPT_static);
           if (OnlyLibstdcxxStatic)
    @@ -3385,7 +3402,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args,
 // Otherwise, return an empty string and issue a diagnostic message if needed.
     StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
                                                    const llvm::opt::ArgList &Args) {
    -  Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ);
    +  Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
       if (!A)
         return "";
     
    diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp
    index 51c892fc91718..6df5315e8fff8 100644
    --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp
    +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp
    @@ -10,8 +10,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
    index 07201cc4676ac..6cc73ff5fc1f6 100644
    --- a/clang/lib/Driver/ToolChains/Cuda.cpp
    +++ b/clang/lib/Driver/ToolChains/Cuda.cpp
    @@ -14,7 +14,7 @@
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
     #include "llvm/Option/ArgList.h"
    @@ -153,16 +153,16 @@ CudaInstallationDetector::CudaInstallationDetector(
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
       auto &FS = D.getVFS();
     
    -  if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
    +  if (Args.hasArg(options::OPT_cuda_path_EQ)) {
         Candidates.emplace_back(
    -        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
    +        Args.getLastArgValue(options::OPT_cuda_path_EQ).str());
       } else if (HostTriple.isOSWindows()) {
         for (const char *Ver : Versions)
           Candidates.emplace_back(
               D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
               Ver);
       } else {
    -    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
    +    if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) {
           // Try to find ptxas binary. If the executable is located in a directory
           // called 'bin/', its parent directory might be a good guess for a valid
           // CUDA installation.
    diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp
    index d9c16347daa34..55438125ce0f1 100644
    --- a/clang/lib/Driver/ToolChains/Cygwin.cpp
    +++ b/clang/lib/Driver/ToolChains/Cygwin.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
    index cc5bcd1816c52..fc3cd9030f71d 100644
    --- a/clang/lib/Driver/ToolChains/Darwin.cpp
    +++ b/clang/lib/Driver/ToolChains/Darwin.cpp
    @@ -14,8 +14,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
    @@ -1035,12 +1035,12 @@ static const char *ArmMachOArchName(StringRef Arch) {
           .Case("xscale", "xscale")
           .Case("armv4t", "armv4t")
           .Case("armv7", "armv7")
    -      .Cases("armv7a", "armv7-a", "armv7")
    -      .Cases("armv7r", "armv7-r", "armv7")
    -      .Cases("armv7em", "armv7e-m", "armv7em")
    -      .Cases("armv7k", "armv7-k", "armv7k")
    -      .Cases("armv7m", "armv7-m", "armv7m")
    -      .Cases("armv7s", "armv7-s", "armv7s")
    +      .Cases({"armv7a", "armv7-a"}, "armv7")
    +      .Cases({"armv7r", "armv7-r"}, "armv7")
    +      .Cases({"armv7em", "armv7e-m"}, "armv7em")
    +      .Cases({"armv7k", "armv7-k"}, "armv7k")
    +      .Cases({"armv7m", "armv7-m"}, "armv7m")
    +      .Cases({"armv7s", "armv7-s"}, "armv7s")
           .Default(nullptr);
     }
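
The rewritten .Cases() calls rely on the llvm::StringSwitch overload that takes the alternatives as an initializer list. A minimal usage sketch (the free function is illustrative):

#include "llvm/ADT/StringSwitch.h"

static const char *machOArchFor(llvm::StringRef Arch) {
  return llvm::StringSwitch<const char *>(Arch)
      .Cases({"armv7a", "armv7-a"}, "armv7")
      .Cases({"armv7s", "armv7-s"}, "armv7s")
      .Default(nullptr);
}
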
     
    @@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const {
     
       case llvm::Triple::thumb:
       case llvm::Triple::arm:
    -    if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ))
    +    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
           if (const char *Arch = ArmMachOArchName(A->getValue()))
             return Arch;
     
    @@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args,
       if (!BoundArch.empty()) {
         StringRef Name = BoundArch;
         const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ);
    -    const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ);
    +    const Option MArch = Opts.getOption(options::OPT_march_EQ);
     
         // This code must be kept in sync with LLVM's getArchTypeForDarwinArch,
         // which defines the list of which architectures we accept.
    diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp
    index 524f5f2ff391e..d4a6d6ae3e349 100644
    --- a/clang/lib/Driver/ToolChains/DragonFly.cpp
    +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    @@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
    index 88bce181d40d2..270904de544d6 100644
    --- a/clang/lib/Driver/ToolChains/Flang.cpp
    +++ b/clang/lib/Driver/ToolChains/Flang.cpp
    @@ -11,7 +11,7 @@
     
     #include "clang/Basic/CodeGenOptions.h"
     #include "clang/Driver/CommonArgs.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Frontend/Debug/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/TargetParser/Host.h"
    @@ -230,7 +230,7 @@ void Flang::addCodegenOptions(const ArgList &Args,
            options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays,
            options::OPT_ftime_report, options::OPT_ftime_report_EQ,
            options::OPT_funroll_loops, options::OPT_fno_unroll_loops});
    -  if (Args.hasArg(clang::driver::options::OPT_fcoarray))
    +  if (Args.hasArg(options::OPT_fcoarray))
         CmdArgs.push_back("-fcoarray");
     }
     
    @@ -1071,6 +1071,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
       case CodeGenOptions::FramePointerKind::Reserved:
         FPKeepKindStr = "-mframe-pointer=reserved";
         break;
    +  case CodeGenOptions::FramePointerKind::NonLeafNoReserve:
    +    FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve";
    +    break;
       case CodeGenOptions::FramePointerKind::NonLeaf:
         FPKeepKindStr = "-mframe-pointer=non-leaf";
         break;
    diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
    index b17b76233ad30..70e66a2f5c3e7 100644
    --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
    @@ -13,8 +13,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
    index 507cc03b27513..9edfc4de3d602 100644
    --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
    +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const {
     
     ToolChain::RuntimeLibType
     Fuchsia::GetRuntimeLibType(const ArgList &Args) const {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
         StringRef Value = A->getValue();
         if (Value != "compiler-rt")
           getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
    diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
    index 7616076847a2c..1bfcd1f4f3a7c 100644
    --- a/clang/lib/Driver/ToolChains/Gnu.cpp
    +++ b/clang/lib/Driver/ToolChains/Gnu.cpp
    @@ -20,9 +20,9 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/MultilibBuilder.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSet.h"
     #include "llvm/ADT/Twine.h"
     #include "llvm/Option/ArgList.h"
    @@ -1750,11 +1750,11 @@ static void findRISCVBareMetalMultilibs(const Driver &D,
       std::string MArch = tools::riscv::getRISCVArch(Args, TargetTriple);
       for (auto Element : RISCVMultilibSet) {
         addMultilibFlag(MArch == Element.march,
    -                    Twine("-march=", Element.march).str().c_str(), Flags);
    +                    Twine("-march=", Element.march).str(), Flags);
         if (!Added_ABIs.count(Element.mabi)) {
           Added_ABIs.insert(Element.mabi);
           addMultilibFlag(ABIName == Element.mabi,
    -                      Twine("-mabi=", Element.mabi).str().c_str(), Flags);
    +                      Twine("-mabi=", Element.mabi).str(), Flags);
         }
       }
     
    @@ -2058,7 +2058,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) {
     
     static llvm::StringRef getGCCToolchainDir(const ArgList &Args,
                                               llvm::StringRef SysRoot) {
    -  const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain);
    +  const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain);
       if (A)
         return A->getValue();
     
    @@ -2111,8 +2111,7 @@ void Generic_GCC::GCCInstallationDetector::init(
                                CandidateBiarchTripleAliases);
     
       // If --gcc-install-dir= is specified, skip filesystem detection.
    -  if (const Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ);
    +  if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ);
           A && A->getValue()[0]) {
         StringRef InstallDir = A->getValue();
         if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) {
    @@ -2135,8 +2134,7 @@ void Generic_GCC::GCCInstallationDetector::init(
     
       // If --gcc-triple is specified use this instead of trying to
       // auto-detect a triple.
    -  if (const Arg *A =
    -          Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) {
    +  if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) {
         StringRef GCCTriple = A->getValue();
         CandidateTripleAliases.clear();
         CandidateTripleAliases.push_back(GCCTriple);
    diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
    index c0c8afec07264..0fbfa090ed9d3 100644
    --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
    @@ -15,8 +15,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     #include "llvm/TargetParser/TargetParser.h"
    diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
    index bce7f46dea468..be0f49d8e1497 100644
    --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     
    @@ -211,7 +211,7 @@ HIPSPVToolChain::getDeviceLibs(
       // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH.
       auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues(
       // --hip-device-lib-path is an alias for this option.
    -      clang::driver::options::OPT_rocm_device_lib_path_EQ);
    +      options::OPT_rocm_device_lib_path_EQ);
       for (auto Path : HipDeviceLibPathArgs)
         LibraryPaths.push_back(DriverArgs.MakeArgString(Path));
     
    diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
    index 732403e69a075..1af2ae6470f1e 100644
    --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
    +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
    @@ -9,7 +9,7 @@
     #include "HIPUtility.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/Object/Archive.h"
    diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
    index 20a320ea233d4..5d7221b8718a9 100644
    --- a/clang/lib/Driver/ToolChains/HLSL.cpp
    +++ b/clang/lib/Driver/ToolChains/HLSL.cpp
    @@ -498,6 +498,15 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
           continue;
         }
     
    +    if (A->getOption().getID() == options::OPT_enable_16bit_types) {
    +      // Translate -enable-16bit-types into -fnative-half-type and
    +      // -fnative-int16-type
    +      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_half_type));
    +      DAL->AddFlagArg(nullptr, Opts.getOption(options::OPT_fnative_int16_type));
    +      A->claim();
    +      continue;
    +    }
    +
         DAL->append(A);
       }
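
A rough standalone sketch of the expansion this hunk performs, phrased over plain strings instead of the DerivedArgList machinery (the helper is hypothetical):

#include <string>
#include <vector>

std::vector<std::string>
translateHLSLArgs(const std::vector<std::string> &In) {
  std::vector<std::string> Out;
  for (const std::string &A : In) {
    if (A == "-enable-16bit-types") {
      // The legacy flag is claimed and replaced by the two fine-grained ones.
      Out.push_back("-fnative-half-type");
      Out.push_back("-fnative-int16-type");
      continue;
    }
    Out.push_back(A);
  }
  return Out;
}
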
     
    @@ -558,3 +567,7 @@ bool HLSLToolChain::isLastJob(DerivedArgList &Args,
       // output to the result file.
       return true;
     }
    +
    +void HLSLToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
    +  CC1Args.push_back("-Wconversion");
    +}
    diff --git a/clang/lib/Driver/ToolChains/HLSL.h b/clang/lib/Driver/ToolChains/HLSL.h
    index 3aed904648429..5bf385e13e962 100644
    --- a/clang/lib/Driver/ToolChains/HLSL.h
    +++ b/clang/lib/Driver/ToolChains/HLSL.h
    @@ -91,6 +91,8 @@ class LLVM_LIBRARY_VISIBILITY HLSLToolChain : public ToolChain {
   // Set default DWARF version to 4, since DXIL uses version 4.
       unsigned GetDefaultDwarfVersion() const override { return 4; }
     
    +  void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override;
    +
     private:
   mutable std::unique_ptr<tools::hlsl::Validator> Validator;
   mutable std::unique_ptr<tools::hlsl::MetalConverter> MetalConverter;
    diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
    index 9f8b676fc7dc2..084f51721315c 100644
    --- a/clang/lib/Driver/ToolChains/Hexagon.cpp
    +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
    @@ -11,7 +11,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp
    index 43121233ea7d0..53ee4d4c0cbde 100644
    --- a/clang/lib/Driver/ToolChains/Hurd.cpp
    +++ b/clang/lib/Driver/ToolChains/Hurd.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
    index 94a9fe8b1a63f..020e7465548fe 100644
    --- a/clang/lib/Driver/ToolChains/Linux.cpp
    +++ b/clang/lib/Driver/ToolChains/Linux.cpp
    @@ -16,8 +16,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Distro.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
     #include "llvm/Support/Path.h"
    @@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       // Add 'include' in the resource directory, which is similar to
    diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp
    index 9eca1ad5f2865..3cc56bb7e832e 100644
    --- a/clang/lib/Driver/ToolChains/MSP430.cpp
    +++ b/clang/lib/Driver/ToolChains/MSP430.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/InputInfo.h"
     #include "clang/Driver/Multilib.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
    index bb469ff095cd4..fcae5b7a18f34 100644
    --- a/clang/lib/Driver/ToolChains/MSVC.cpp
    +++ b/clang/lib/Driver/ToolChains/MSVC.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/Arg.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/ConvertUTF.h"
    diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
    index da4a9072317f4..1bbabdfc631b8 100644
    --- a/clang/lib/Driver/ToolChains/Managarm.cpp
    +++ b/clang/lib/Driver/ToolChains/Managarm.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     
    @@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
       const Driver &D = getDriver();
       std::string SysRoot = computeSysRoot();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
    index 1bb9bcfe6aab2..2c9a174069f70 100644
    --- a/clang/lib/Driver/ToolChains/MinGW.cpp
    +++ b/clang/lib/Driver/ToolChains/MinGW.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp
    index 7dd3936613296..58d6b5031f536 100644
    --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp
    +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp
    @@ -9,7 +9,7 @@
     #include "MipsLinux.h"
     #include "Arch/Mips.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D,
     
     void MipsLLVMToolChain::AddClangSystemIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       const Driver &D = getDriver();
    diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp
    index 8db00deeb80df..ea722b59853d6 100644
    --- a/clang/lib/Driver/ToolChains/NetBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp
    @@ -14,8 +14,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     
    @@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp
    index 00991504e97a8..607eb714f85dc 100644
    --- a/clang/lib/Driver/ToolChains/OHOS.cpp
    +++ b/clang/lib/Driver/ToolChains/OHOS.cpp
    @@ -12,8 +12,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/ProfileData/InstrProf.h"
     #include "llvm/Support/FileSystem.h"
    @@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     
     ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
         const ArgList &Args) const {
    -  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
    +  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
         StringRef Value = A->getValue();
         if (Value != "compiler-rt")
           getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
    diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
    index 8f589186af343..5e7b4f1a664e6 100644
    --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
    @@ -13,8 +13,8 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
     #include "llvm/Support/VirtualFileSystem.h"
    @@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs(
         llvm::opt::ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
    diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    index 8d381c4f14371..76180431ee682 100644
    --- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    +++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
    @@ -8,7 +8,7 @@
     
     #include "PPCFreeBSD.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/Path.h"
     
     using namespace clang::driver::toolchains;
    @@ -16,7 +16,7 @@ using namespace llvm::opt;
     
     void PPCFreeBSDToolChain::AddClangSystemIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
    +  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
           !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
         const Driver &D = getDriver();
         SmallString<128> P(D.ResourceDir);
    diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp
    index 768214e416bd7..672ebd5b7b98d 100644
    --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp
    +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp
    @@ -8,7 +8,7 @@
     
     #include "PPCLinux.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
     
    @@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D,
     
     void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                       ArgStringList &CC1Args) const {
    -  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
    +  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
           !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
         const Driver &D = getDriver();
         SmallString<128> P(D.ResourceDir);
    diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
    index 34ec65ae59602..5b5b5607da69e 100644
    --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
    +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
     #include "llvm/Support/Path.h"
    @@ -488,6 +488,9 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple,
       // control of header or library search. If we're not linking, don't check
       // for missing libraries.
       auto CheckSDKPartExists = [&](StringRef Dir, StringRef Desc) {
+    // In ThinLTO code generation mode, SDK files are not required.
    +    if (Args.hasArgNoClaim(options::OPT_fthinlto_index_EQ))
    +      return true;
         if (llvm::sys::fs::exists(Dir))
           return true;
         D.Diag(clang::diag::warn_drv_unable_to_find_directory_expected)
    diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp
    index ea824dbad54cb..27de55cfebfc1 100644
    --- a/clang/lib/Driver/ToolChains/SPIRV.cpp
    +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/InputInfo.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     
     using namespace clang::driver;
     using namespace clang::driver::toolchains;
    diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
    index 0232b047a6c4b..85859f344b491 100644
    --- a/clang/lib/Driver/ToolChains/SYCL.cpp
    +++ b/clang/lib/Driver/ToolChains/SYCL.cpp
    @@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector(
     
     void SYCLInstallationDetector::addSYCLIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc))
    +  if (DriverArgs.hasArg(options::OPT_nobuiltininc))
         return;
     
       // Add the SYCL header search locations in the specified order.
    diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
    index 02aa59817449d..ad0f41144f393 100644
    --- a/clang/lib/Driver/ToolChains/Solaris.cpp
    +++ b/clang/lib/Driver/ToolChains/Solaris.cpp
    @@ -13,9 +13,9 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    @@ -346,7 +346,7 @@ SanitizerMask Solaris::getSupportedSanitizers() const {
     const char *Solaris::getDefaultLinker() const {
       // FIXME: Only handle Solaris ld and GNU ld here.
   return llvm::StringSwitch<const char *>(getDriver().getPreferredLinker())
    -      .Cases("bfd", "gld", "/usr/gnu/bin/ld")
    +      .Cases({"bfd", "gld"}, "/usr/gnu/bin/ld")
           .Default("/usr/bin/ld");
     }
     
    @@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
       const Driver &D = getDriver();
     
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
    diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp
    index d2be147c7b9f6..7732e37f8061d 100644
    --- a/clang/lib/Driver/ToolChains/UEFI.cpp
    +++ b/clang/lib/Driver/ToolChains/UEFI.cpp
    @@ -11,8 +11,8 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/SanitizerArgs.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/Arg.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
    diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp
    index ad9129046c3e1..78509bcdae0fe 100644
    --- a/clang/lib/Driver/ToolChains/VEToolchain.cpp
    +++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/Path.h"
 #include <cstdlib> // ::getenv
    @@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; }
     
     void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                 ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       if (DriverArgs.hasArg(options::OPT_nobuiltininc) &&
    @@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs,
     
     void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                                    ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc) ||
           DriverArgs.hasArg(options::OPT_nostdincxx))
         return;
    diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
    index 5054868b5ff4d..15c6f19e87fee 100644
    --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
    +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
    @@ -12,7 +12,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/FileSystem.h"
    @@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; }
     void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args,
                                             Action::OffloadKind) const {
    -  if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array,
    +  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                               options::OPT_fno_use_init_array, true))
         CC1Args.push_back("-fno-use-init-array");
     
    @@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const {
     
     void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                 ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
    +  if (DriverArgs.hasArg(options::OPT_nostdinc))
         return;
     
       const Driver &D = getDriver();
    diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp
    index 6a2a75cb99739..dd26c11affffb 100644
    --- a/clang/lib/Driver/ToolChains/XCore.cpp
    +++ b/clang/lib/Driver/ToolChains/XCore.cpp
    @@ -10,7 +10,7 @@
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
 #include <cstdlib> // ::getenv
     
    @@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; }
     
     void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                    ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc))
         return;
       if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) {
    @@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs,
     
     void XCoreToolChain::AddClangCXXStdlibIncludeArgs(
         const ArgList &DriverArgs, ArgStringList &CC1Args) const {
    -  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
    +  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
           DriverArgs.hasArg(options::OPT_nostdlibinc) ||
           DriverArgs.hasArg(options::OPT_nostdincxx))
         return;
    diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
    index 9a3c45323a3cf..eac8f623f9a50 100644
    --- a/clang/lib/Driver/ToolChains/ZOS.cpp
    +++ b/clang/lib/Driver/ToolChains/ZOS.cpp
    @@ -9,7 +9,7 @@
     #include "ZOS.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Compilation.h"
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/ArgList.h"
     #include "llvm/Support/VirtualFileSystem.h"
     #include "llvm/Support/WithColor.h"
    diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
    index 0325296f84b19..4c2d11751a363 100644
    --- a/clang/lib/Driver/XRayArgs.cpp
    +++ b/clang/lib/Driver/XRayArgs.cpp
    @@ -8,8 +8,8 @@
     #include "clang/Driver/XRayArgs.h"
     #include "clang/Driver/CommonArgs.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringSwitch.h"
     #include "llvm/Support/SpecialCaseList.h"
    diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
    index e5abf833194d4..9ab024a03fbd7 100644
    --- a/clang/lib/Format/ContinuationIndenter.cpp
    +++ b/clang/lib/Format/ContinuationIndenter.cpp
    @@ -356,9 +356,11 @@ bool ContinuationIndenter::canBreak(const LineState &State) {
         return CurrentState.BreakBeforeClosingBrace;
       }
     
    -  // Allow breaking before the right parens with block indentation if there was
    -  // a break after the left parens, which is tracked by BreakBeforeClosingParen.
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
+  // Check whether we need to break before the right parens if there was a
+  // break after the left parens, which is tracked by BreakBeforeClosingParen.
    +  if ((Style.BreakBeforeCloseBracketFunction ||
    +       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
    +       Style.BreakBeforeCloseBracketSwitch) &&
           Current.is(tok::r_paren)) {
         return CurrentState.BreakBeforeClosingParen;
       }
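
Assuming the BreakAfterOpenBracket*/BreakBeforeCloseBracket* options referenced here are plain bool members of clang::format::FormatStyle, as the patch suggests, enabling them programmatically would look like this sketch:

#include "clang/Format/Format.h"

clang::format::FormatStyle makeStyle() {
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.BreakAfterOpenBracketFunction = true;   // assumed new member
  Style.BreakBeforeCloseBracketFunction = true; // assumed new member
  return Style;
}
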
    @@ -837,32 +839,38 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
           return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) &&
                  Style.Cpp11BracedListStyle != FormatStyle::BLS_Block;
         };
    -    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) &&
    -        !IsStartOfBracedList()) {
    +    if (IsStartOfBracedList())
    +      return Style.BreakAfterOpenBracketBracedList;
    +    if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square))
           return false;
    -    }
         if (!Tok.Previous)
           return true;
         if (Tok.Previous->isIf())
    -      return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak;
    -    return Tok.Previous->isNoneOf(TT_CastRParen, tok::kw_for, tok::kw_while,
    -                                  tok::kw_switch) &&
    -           !(Style.isJavaScript() && Tok.Previous->is(Keywords.kw_await));
    +      return Style.BreakAfterOpenBracketIf;
    +    if (Tok.Previous->isLoop(Style))
    +      return Style.BreakAfterOpenBracketLoop;
    +    if (Tok.Previous->is(tok::kw_switch))
    +      return Style.BreakAfterOpenBracketSwitch;
    +    if (Style.BreakAfterOpenBracketFunction) {
    +      return !Tok.Previous->is(TT_CastRParen) &&
    +             !(Style.isJavaScript() && Tok.is(Keywords.kw_await));
    +    }
    +    return false;
       };
       auto IsFunctionCallParen = [](const FormatToken &Tok) {
         return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous &&
                Tok.Previous->is(tok::identifier);
       };
    -  auto IsInTemplateString = [this](const FormatToken &Tok) {
    +  auto IsInTemplateString = [this](const FormatToken &Tok, bool NestBlocks) {
         if (!Style.isJavaScript())
           return false;
         for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) {
           if (Prev->is(TT_TemplateString) && Prev->opensScope())
             return true;
    -      if (Prev->opensScope() ||
    -          (Prev->is(TT_TemplateString) && Prev->closesScope())) {
    -        break;
    -      }
    +      if (Prev->opensScope() && !NestBlocks)
    +        return false;
    +      if (Prev->is(TT_TemplateString) && Prev->closesScope())
    +        return false;
         }
         return false;
       };
    @@ -884,21 +892,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
              Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) {
           return true;
         }
    -    if (const auto *Previous = Tok.Previous;
    -        !Previous || (Previous->isNoneOf(TT_FunctionDeclarationLParen,
    -                                         TT_LambdaDefinitionLParen) &&
    -                      !IsFunctionCallParen(*Previous))) {
    +    const auto *Previous = TokAfterLParen.Previous;
    +    assert(Previous); // IsOpeningBracket(Previous)
    +    if (Previous->Previous &&
    +        (Previous->Previous->isIf() || Previous->Previous->isLoop(Style) ||
    +         Previous->Previous->is(tok::kw_switch))) {
    +      return false;
    +    }
    +    if (Previous->isNoneOf(TT_FunctionDeclarationLParen,
    +                           TT_LambdaDefinitionLParen) &&
    +        !IsFunctionCallParen(*Previous)) {
           return true;
         }
    -    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok))
    +    if (IsOpeningBracket(Tok) || IsInTemplateString(Tok, true))
           return true;
         const auto *Next = Tok.Next;
         return !Next || Next->isMemberAccess() ||
                Next->is(TT_FunctionDeclarationLParen) || IsFunctionCallParen(*Next);
       };
    -  if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak ||
    -       Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) &&
    -      IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
    +  if (IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) &&
           // Don't do this for simple (no expressions) one-argument function calls
           // as that feels like needlessly wasting whitespace, e.g.:
           //
    @@ -920,7 +932,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       // Note: This doesn't apply to macro expansion lines, which are MACRO( , , )
       // with args as children of the '(' and ',' tokens. It does not make sense to
       // align the commas with the opening paren.
    -  if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign &&
    +  if (Style.AlignAfterOpenBracket &&
           !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() &&
           Previous.isNoneOf(TT_ObjCMethodExpr, TT_RequiresClause,
                             TT_TableGenDAGArgOpener,
    @@ -933,7 +945,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
              Previous.Previous->isNoneOf(tok::identifier, tok::l_paren,
                                          BK_BracedInit))) ||
            Previous.is(TT_VerilogMultiLineListLParen)) &&
    -      !IsInTemplateString(Current)) {
    +      !IsInTemplateString(Current, false)) {
         CurrentState.Indent = State.Column + Spaces;
         CurrentState.IsAligned = true;
       }
    @@ -1271,8 +1283,20 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State,
       }
     
       if (PreviousNonComment && PreviousNonComment->is(tok::l_paren)) {
    -    CurrentState.BreakBeforeClosingParen =
    -        Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent;
    +    if (auto Previous = PreviousNonComment->Previous) {
    +      if (Previous->isIf()) {
    +        CurrentState.BreakBeforeClosingParen = Style.BreakBeforeCloseBracketIf;
    +      } else if (Previous->isLoop(Style)) {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketLoop;
    +      } else if (Previous->is(tok::kw_switch)) {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketSwitch;
    +      } else {
    +        CurrentState.BreakBeforeClosingParen =
    +            Style.BreakBeforeCloseBracketFunction;
    +      }
    +    }
       }
     
       if (PreviousNonComment && PreviousNonComment->is(TT_TemplateOpener))
    @@ -1416,13 +1440,17 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
       }
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent &&
    -      (Current.is(tok::r_paren) ||
    -       (Current.is(tok::r_brace) && Current.MatchingParen &&
    -        Current.MatchingParen->is(BK_BracedInit))) &&
    +  if (Style.BreakBeforeCloseBracketBracedList && Current.is(tok::r_brace) &&
    +      Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit) &&
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
       }
    +  if ((Style.BreakBeforeCloseBracketFunction ||
    +       Style.BreakBeforeCloseBracketIf || Style.BreakBeforeCloseBracketLoop ||
    +       Style.BreakBeforeCloseBracketSwitch) &&
    +      Current.is(tok::r_paren) && State.Stack.size() > 1) {
    +    return State.Stack[State.Stack.size() - 2].LastSpace;
    +  }
       if (Style.BreakBeforeTemplateCloser && Current.is(TT_TemplateCloser) &&
           State.Stack.size() > 1) {
         return State.Stack[State.Stack.size() - 2].LastSpace;
    @@ -1844,8 +1872,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
              PrecedenceLevel < prec::Assignment) &&
             (!Previous || Previous->isNot(tok::kw_return) ||
              (!Style.isJava() && PrecedenceLevel > 0)) &&
    -        (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign ||
    -         PrecedenceLevel > prec::Comma || Current.NestingLevel == 0) &&
    +        (Style.AlignAfterOpenBracket || PrecedenceLevel > prec::Comma ||
    +         Current.NestingLevel == 0) &&
             (!Style.isTableGen() ||
              (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma,
                                             TT_TableGenDAGArgListCommaToBreak)))) {
    @@ -1885,8 +1913,7 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
         if (PrecedenceLevel > prec::Unknown)
           NewParenState.LastSpace = std::max(NewParenState.LastSpace, State.Column);
         if (PrecedenceLevel != prec::Conditional &&
    -        Current.isNot(TT_UnaryOperator) &&
    -        Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
    +        Current.isNot(TT_UnaryOperator) && Style.AlignAfterOpenBracket) {
           NewParenState.StartOfFunctionCall = State.Column;
         }
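// --- Illustrative sketch: output shapes driven by the options above ---
// A minimal sketch assuming a style with the per-construct booleans from
// this patch; exact output also depends on column limit and penalties.
//
// BreakAfterOpenBracketIf + BreakBeforeCloseBracketIf:
//   if (
//       aVeryLongCondition && anotherVeryLongCondition
//   ) {
//   }
//
// BreakAfterOpenBracketFunction alone (the old AlwaysBreak shape):
//   someFunction(
//       longArgumentOne, longArgumentTwo);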
     
    diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
    index edd126c7724b8..9bbb33cb14502 100644
    --- a/clang/lib/Format/Format.cpp
    +++ b/clang/lib/Format/Format.cpp
    @@ -32,6 +32,13 @@ using clang::format::FormatStyle;
     
     LLVM_YAML_IS_SEQUENCE_VECTOR(FormatStyle::RawStringFormat)
     
    +enum BracketAlignmentStyle : int8_t {
    +  BAS_Align,
    +  BAS_DontAlign,
    +  BAS_AlwaysBreak,
    +  BAS_BlockIndent
    +};
    +
     namespace llvm {
     namespace yaml {
     template <>
    @@ -204,16 +211,16 @@ template <> struct MappingTraits {
       }
     };
     
     -template <> struct ScalarEnumerationTraits<FormatStyle::BracketAlignmentStyle> {
    -  static void enumeration(IO &IO, FormatStyle::BracketAlignmentStyle &Value) {
    -    IO.enumCase(Value, "Align", FormatStyle::BAS_Align);
    -    IO.enumCase(Value, "DontAlign", FormatStyle::BAS_DontAlign);
    -    IO.enumCase(Value, "AlwaysBreak", FormatStyle::BAS_AlwaysBreak);
    -    IO.enumCase(Value, "BlockIndent", FormatStyle::BAS_BlockIndent);
     +template <> struct ScalarEnumerationTraits<BracketAlignmentStyle> {
    +  static void enumeration(IO &IO, BracketAlignmentStyle &Value) {
    +    IO.enumCase(Value, "Align", BAS_Align);
    +    IO.enumCase(Value, "DontAlign", BAS_DontAlign);
     
         // For backward compatibility.
    -    IO.enumCase(Value, "true", FormatStyle::BAS_Align);
    -    IO.enumCase(Value, "false", FormatStyle::BAS_DontAlign);
    +    IO.enumCase(Value, "true", BAS_Align);
    +    IO.enumCase(Value, "false", BAS_DontAlign);
    +    IO.enumCase(Value, "AlwaysBreak", BAS_AlwaysBreak);
    +    IO.enumCase(Value, "BlockIndent", BAS_BlockIndent);
       }
     };
     
     @@ -869,27 +876,28 @@ template <> struct MappingTraits<FormatStyle::TrailingCommentsAlignmentStyle> {
                             FormatStyle::TrailingCommentsAlignmentStyle &Value) {
         IO.enumCase(Value, "Leave",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Leave, 0}));
    +                    {FormatStyle::TCAS_Leave, 0, true}));
     
         IO.enumCase(Value, "Always",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Always, 0}));
    +                    {FormatStyle::TCAS_Always, 0, true}));
     
         IO.enumCase(Value, "Never",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Never, 0}));
    +                    {FormatStyle::TCAS_Never, 0, true}));
     
         // For backwards compatibility
         IO.enumCase(Value, "true",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Always, 0}));
    +                    {FormatStyle::TCAS_Always, 0, true}));
         IO.enumCase(Value, "false",
                     FormatStyle::TrailingCommentsAlignmentStyle(
    -                    {FormatStyle::TCAS_Never, 0}));
    +                    {FormatStyle::TCAS_Never, 0, true}));
       }
     
       static void mapping(IO &IO,
                           FormatStyle::TrailingCommentsAlignmentStyle &Value) {
    +    IO.mapOptional("AlignPPAndNotPP", Value.AlignPPAndNotPP);
         IO.mapOptional("Kind", Value.Kind);
         IO.mapOptional("OverEmptyLines", Value.OverEmptyLines);
       }
     @@ -979,6 +987,54 @@ template <> struct MappingTraits<FormatStyle> {
         bool SpacesInCStyleCastParentheses = false;
         bool SpacesInParentheses = false;
     
    +    if (IO.outputting()) {
    +      IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
    +    } else {
    +      // For backward compatibility.
    +      BracketAlignmentStyle LocalBAS = BAS_Align;
    +      if (IsGoogleOrChromium) {
    +        FormatStyle::LanguageKind Language = Style.Language;
    +        if (Language == FormatStyle::LK_None)
    +          Language = ((FormatStyle *)IO.getContext())->Language;
    +        if (Language == FormatStyle::LK_JavaScript)
    +          LocalBAS = BAS_AlwaysBreak;
    +        else if (Language == FormatStyle::LK_Java)
    +          LocalBAS = BAS_DontAlign;
    +      } else if (BasedOnStyle.equals_insensitive("webkit")) {
    +        LocalBAS = BAS_DontAlign;
    +      }
    +      IO.mapOptional("AlignAfterOpenBracket", LocalBAS);
    +      Style.BreakAfterOpenBracketBracedList = false;
    +      Style.BreakAfterOpenBracketFunction = false;
    +      Style.BreakAfterOpenBracketIf = false;
    +      Style.BreakAfterOpenBracketLoop = false;
    +      Style.BreakAfterOpenBracketSwitch = false;
    +      Style.BreakBeforeCloseBracketBracedList = false;
    +      Style.BreakBeforeCloseBracketFunction = false;
    +      Style.BreakBeforeCloseBracketIf = false;
    +      Style.BreakBeforeCloseBracketLoop = false;
    +      Style.BreakBeforeCloseBracketSwitch = false;
    +
    +      switch (LocalBAS) {
    +      case BAS_DontAlign:
    +        Style.AlignAfterOpenBracket = false;
    +        break;
    +      case BAS_BlockIndent:
    +        Style.BreakBeforeCloseBracketBracedList = true;
    +        Style.BreakBeforeCloseBracketFunction = true;
    +        Style.BreakBeforeCloseBracketIf = true;
    +        [[fallthrough]];
    +      case BAS_AlwaysBreak:
    +        Style.BreakAfterOpenBracketBracedList = true;
    +        Style.BreakAfterOpenBracketFunction = true;
    +        Style.BreakAfterOpenBracketIf = true;
    +        [[fallthrough]];
    +      case BAS_Align:
    +        Style.AlignAfterOpenBracket = true;
    +        break;
    +      }
    +    }
    +
         // For backward compatibility.
         if (!IO.outputting()) {
           IO.mapOptional("AlignEscapedNewlinesLeft", Style.AlignEscapedNewlines);
     @@ -1014,7 +1070,6 @@ template <> struct MappingTraits<FormatStyle> {
         }
     
         IO.mapOptional("AccessModifierOffset", Style.AccessModifierOffset);
    -    IO.mapOptional("AlignAfterOpenBracket", Style.AlignAfterOpenBracket);
         IO.mapOptional("AlignArrayOfStructures", Style.AlignArrayOfStructures);
         IO.mapOptional("AlignConsecutiveAssignments",
                        Style.AlignConsecutiveAssignments);
     @@ -1079,10 +1134,29 @@ template <> struct MappingTraits<FormatStyle> {
         IO.mapOptional("BreakAfterAttributes", Style.BreakAfterAttributes);
         IO.mapOptional("BreakAfterJavaFieldAnnotations",
                        Style.BreakAfterJavaFieldAnnotations);
    +    IO.mapOptional("BreakAfterOpenBracketBracedList",
    +                   Style.BreakAfterOpenBracketBracedList);
    +    IO.mapOptional("BreakAfterOpenBracketFunction",
    +                   Style.BreakAfterOpenBracketFunction);
    +    IO.mapOptional("BreakAfterOpenBracketIf", Style.BreakAfterOpenBracketIf);
    +    IO.mapOptional("BreakAfterOpenBracketLoop",
    +                   Style.BreakAfterOpenBracketLoop);
    +    IO.mapOptional("BreakAfterOpenBracketSwitch",
    +                   Style.BreakAfterOpenBracketSwitch);
         IO.mapOptional("BreakAfterReturnType", Style.BreakAfterReturnType);
         IO.mapOptional("BreakArrays", Style.BreakArrays);
         IO.mapOptional("BreakBeforeBinaryOperators",
                        Style.BreakBeforeBinaryOperators);
    +    IO.mapOptional("BreakBeforeCloseBracketBracedList",
    +                   Style.BreakBeforeCloseBracketBracedList);
    +    IO.mapOptional("BreakBeforeCloseBracketFunction",
    +                   Style.BreakBeforeCloseBracketFunction);
    +    IO.mapOptional("BreakBeforeCloseBracketIf",
    +                   Style.BreakBeforeCloseBracketIf);
    +    IO.mapOptional("BreakBeforeCloseBracketLoop",
    +                   Style.BreakBeforeCloseBracketLoop);
    +    IO.mapOptional("BreakBeforeCloseBracketSwitch",
    +                   Style.BreakBeforeCloseBracketSwitch);
         IO.mapOptional("BreakBeforeConceptDeclarations",
                        Style.BreakBeforeConceptDeclarations);
         IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces);
    @@ -1561,7 +1635,7 @@ static void expandPresetsSpacesInParens(FormatStyle &Expanded) {
     FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       FormatStyle LLVMStyle;
       LLVMStyle.AccessModifierOffset = -2;
    -  LLVMStyle.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  LLVMStyle.AlignAfterOpenBracket = true;
       LLVMStyle.AlignArrayOfStructures = FormatStyle::AIAS_None;
       LLVMStyle.AlignConsecutiveAssignments = {};
       LLVMStyle.AlignConsecutiveAssignments.PadOperators = true;
    @@ -1578,6 +1652,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       LLVMStyle.AlignTrailingComments = {};
       LLVMStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Always;
       LLVMStyle.AlignTrailingComments.OverEmptyLines = 0;
    +  LLVMStyle.AlignTrailingComments.AlignPPAndNotPP = true;
       LLVMStyle.AllowAllArgumentsOnNextLine = true;
       LLVMStyle.AllowAllParametersOfDeclarationOnNextLine = true;
       LLVMStyle.AllowBreakBeforeNoexceptSpecifier = FormatStyle::BBNSS_Never;
    @@ -1621,10 +1696,20 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
       LLVMStyle.BreakAdjacentStringLiterals = true;
       LLVMStyle.BreakAfterAttributes = FormatStyle::ABS_Leave;
       LLVMStyle.BreakAfterJavaFieldAnnotations = false;
    +  LLVMStyle.BreakAfterOpenBracketBracedList = false;
    +  LLVMStyle.BreakAfterOpenBracketFunction = false;
    +  LLVMStyle.BreakAfterOpenBracketIf = false;
    +  LLVMStyle.BreakAfterOpenBracketLoop = false;
    +  LLVMStyle.BreakAfterOpenBracketSwitch = false;
       LLVMStyle.BreakAfterReturnType = FormatStyle::RTBS_None;
       LLVMStyle.BreakArrays = true;
       LLVMStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       LLVMStyle.BreakBeforeBraces = FormatStyle::BS_Attach;
    +  LLVMStyle.BreakBeforeCloseBracketBracedList = false;
    +  LLVMStyle.BreakBeforeCloseBracketFunction = false;
    +  LLVMStyle.BreakBeforeCloseBracketIf = false;
    +  LLVMStyle.BreakBeforeCloseBracketLoop = false;
    +  LLVMStyle.BreakBeforeCloseBracketSwitch = false;
       LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always;
       LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline;
       LLVMStyle.BreakBeforeTemplateCloser = false;
    @@ -1877,7 +1962,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
       GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200;
     
       if (Language == FormatStyle::LK_Java) {
    -    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +    GoogleStyle.AlignAfterOpenBracket = false;
         GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
         GoogleStyle.AlignTrailingComments = {};
         GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
    @@ -1889,7 +1974,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
         GoogleStyle.SpaceAfterCStyleCast = true;
         GoogleStyle.SpacesBeforeTrailingComments = 1;
       } else if (Language == FormatStyle::LK_JavaScript) {
    -    GoogleStyle.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +    GoogleStyle.BreakAfterOpenBracketBracedList = true;
    +    GoogleStyle.BreakAfterOpenBracketFunction = true;
    +    GoogleStyle.BreakAfterOpenBracketIf = true;
         GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
         GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
         // TODO: still under discussion whether to switch to SLS_All.
    @@ -2026,7 +2113,7 @@ FormatStyle getMozillaStyle() {
     FormatStyle getWebKitStyle() {
       FormatStyle Style = getLLVMStyle();
       Style.AccessModifierOffset = -4;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       Style.AlignTrailingComments = {};
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
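// --- Illustrative sketch: the backward-compatibility expansion above ---
// Reading an old-style config still works; the enum value is expanded into
// the new booleans per the fallthrough switch in this patch:
//   AlignAfterOpenBracket: DontAlign   => AlignAfterOpenBracket = false
//   AlignAfterOpenBracket: Align       => AlignAfterOpenBracket = true
//   AlignAfterOpenBracket: AlwaysBreak => Align, plus
//       BreakAfterOpenBracket{BracedList,Function,If} = true
//   AlignAfterOpenBracket: BlockIndent => AlwaysBreak, plus
//       BreakBeforeCloseBracket{BracedList,Function,If} = true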
    diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
    index d1c62642efd43..28fdbcbf0e47f 100644
    --- a/clang/lib/Format/FormatToken.cpp
    +++ b/clang/lib/Format/FormatToken.cpp
    @@ -68,7 +68,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const {
       assert(MatchingParen);
       assert(MatchingParen->is(tok::l_brace));
       if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block ||
    -      Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) {
    +      !Style.BreakBeforeCloseBracketBracedList) {
         return false;
       }
       const auto *LBrace = MatchingParen;
    @@ -198,7 +198,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
         return;
     
       // Column format doesn't really make sense if we don't align after brackets.
    -  if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign)
    +  if (!Style.AlignAfterOpenBracket)
         return;
     
       FormatToken *ItemBegin = Token->Next;
    diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
    index 6f3d24aefc1ca..d833130a538f1 100644
    --- a/clang/lib/Format/FormatToken.h
    +++ b/clang/lib/Format/FormatToken.h
    @@ -666,6 +666,12 @@ struct FormatToken {
                (endsSequence(tok::identifier, tok::kw_if) && AllowConstexprMacro);
       }
     
    +  bool isLoop(const FormatStyle &Style) const {
    +    return isOneOf(tok::kw_for, tok::kw_while) ||
    +           (Style.isJavaScript() && isNot(tok::l_paren) && Previous &&
    +            Previous->is(tok::kw_for));
    +  }
    +
       bool closesScopeAfterBlock() const {
         if (getBlockKind() == BK_Block)
           return true;
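// --- Illustrative sketch: what isLoop(Style) above matches ---
// 'for' and 'while' keywords match in every language; in JavaScript the
// token between 'for' and '(' is also treated as part of the loop header,
// so the 'await' below counts as a loop token:
//   for await (const chunk of stream) { ... }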
    diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
    index ab3293841a2a4..a9ea5ec9009c4 100644
    --- a/clang/lib/Format/FormatTokenLexer.cpp
    +++ b/clang/lib/Format/FormatTokenLexer.cpp
    @@ -318,14 +318,21 @@ void FormatTokenLexer::tryMergePreviousTokens() {
                                {tok::equal, tok::greater},
                                {tok::star, tok::greater},
                                {tok::pipeequal, tok::greater},
    -                           {tok::pipe, tok::arrow},
    -                           {tok::hash, tok::minus, tok::hash},
    -                           {tok::hash, tok::equal, tok::hash}},
    +                           {tok::pipe, tok::arrow}},
                               TT_BinaryOperator) ||
             Tokens.back()->is(tok::arrow)) {
           Tokens.back()->ForcedPrecedence = prec::Comma;
           return;
         }
    +    if (Tokens.size() >= 3 &&
    +        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
    +        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
    +        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
    +        tryMergeTokens(3, TT_BinaryOperator)) {
    +      Tokens.back()->setFinalizedType(TT_BinaryOperator);
    +      Tokens.back()->ForcedPrecedence = prec::Comma;
    +      return;
    +    }
       } else if (Style.isTableGen()) {
         // TableGen's Multi line string starts with [{
         if (tryMergeTokens({tok::l_square, tok::l_brace},
    diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
    index 021d8c658eb11..cb41756c56bf7 100644
    --- a/clang/lib/Format/TokenAnnotator.cpp
    +++ b/clang/lib/Format/TokenAnnotator.cpp
    @@ -358,11 +358,11 @@ class AnnotatingParser {
           Contexts.back().IsExpression = false;
         } else if (OpeningParen.Previous &&
                    (OpeningParen.Previous->isOneOf(
    -                    tok::kw_static_assert, tok::kw_noexcept, tok::kw_explicit,
    -                    tok::kw_while, tok::l_paren, tok::comma, TT_CastRParen,
    +                    tok::kw_noexcept, tok::kw_explicit, tok::kw_while,
    +                    tok::l_paren, tok::comma, TT_CastRParen,
                         TT_BinaryOperator) ||
                     OpeningParen.Previous->isIf())) {
    -      // static_assert, if and while usually contain expressions.
    +      // if and while usually contain expressions.
           Contexts.back().IsExpression = true;
         } else if (Style.isJavaScript() && OpeningParen.Previous &&
                    (OpeningParen.Previous->is(Keywords.kw_function) ||
    @@ -454,6 +454,11 @@ class AnnotatingParser {
         if (StartsObjCSelector)
           OpeningParen.setType(TT_ObjCSelector);
     
    +    const bool IsStaticAssert =
    +        PrevNonComment && PrevNonComment->is(tok::kw_static_assert);
    +    if (IsStaticAssert)
    +      Contexts.back().InStaticAssertFirstArgument = true;
    +
         // MightBeFunctionType and ProbablyFunctionType are used for
         // function pointer and reference types as well as Objective-C
         // block types:
    @@ -583,8 +588,12 @@ class AnnotatingParser {
           }
           // When we discover a 'new', we set CanBeExpression to 'false' in order to
           // parse the type correctly. Reset that after a comma.
    -      if (CurrentToken->is(tok::comma))
    -        Contexts.back().CanBeExpression = true;
    +      if (CurrentToken->is(tok::comma)) {
    +        if (IsStaticAssert)
    +          Contexts.back().InStaticAssertFirstArgument = false;
    +        else
    +          Contexts.back().CanBeExpression = true;
    +      }
     
           if (Style.isTableGen()) {
             if (CurrentToken->is(tok::comma)) {
    @@ -2144,6 +2153,7 @@ class AnnotatingParser {
         bool CaretFound = false;
         bool InCpp11AttributeSpecifier = false;
         bool InCSharpAttributeSpecifier = false;
    +    bool InStaticAssertFirstArgument = false;
         bool VerilogAssignmentFound = false;
         // Whether the braces may mean concatenation instead of structure or array
         // literal.
    @@ -2440,7 +2450,8 @@ class AnnotatingParser {
         } else if (Current.isPointerOrReference()) {
           Current.setType(determineStarAmpUsage(
               Current,
    -          Contexts.back().CanBeExpression && Contexts.back().IsExpression,
    +          (Contexts.back().CanBeExpression && Contexts.back().IsExpression) ||
    +              Contexts.back().InStaticAssertFirstArgument,
               Contexts.back().ContextType == Context::TemplateArgument));
         } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret) ||
                    (Style.isVerilog() && Current.is(tok::pipe))) {
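// --- Illustrative sketch: why InStaticAssertFirstArgument exists ---
// Inside static_assert's first argument, '*' and '&' are expression
// operators even when the surrounding heuristics would lean type-like;
// after the comma (the message argument) the flag above is cleared again.
static_assert(sizeof(long) * 2 >= sizeof(long long),
              "a condition where '*' must annotate as a binary operator");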
    @@ -4427,10 +4438,8 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
     
       if (Left.is(tok::l_paren) && Style.PenaltyBreakOpenParenthesis != 0)
         return Style.PenaltyBreakOpenParenthesis;
    -  if (Left.is(tok::l_paren) && InFunctionDecl &&
    -      Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign) {
    +  if (Left.is(tok::l_paren) && InFunctionDecl && Style.AlignAfterOpenBracket)
         return 100;
    -  }
       if (Left.is(tok::l_paren) && Left.Previous &&
           (Left.Previous->isOneOf(tok::kw_for, tok::kw__Generic) ||
            Left.Previous->isIf())) {
    @@ -4446,7 +4455,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
         // If we aren't aligning after opening parens/braces we can always break
         // here unless the style does not want us to place all arguments on the
         // next line.
    -    if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign &&
    +    if (!Style.AlignAfterOpenBracket &&
             (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) {
           return 0;
         }
    @@ -6226,24 +6235,31 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
                                        (Right.isBlockIndentedInitRBrace(Style)));
       }
     
    -  // We only break before r_paren if we're in a block indented context.
    +  // We can break before r_paren if we're in a block indented context or
    +  // a control statement with an explicit style option.
       if (Right.is(tok::r_paren)) {
    -    if (Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent ||
    -        !Right.MatchingParen) {
    +    if (!Right.MatchingParen)
           return false;
    -    }
         auto Next = Right.Next;
         if (Next && Next->is(tok::r_paren))
           Next = Next->Next;
         if (Next && Next->is(tok::l_paren))
           return false;
         const FormatToken *Previous = Right.MatchingParen->Previous;
    -    return !(Previous && (Previous->is(tok::kw_for) || Previous->isIf()));
    +    if (!Previous)
    +      return false;
    +    if (Previous->isIf())
    +      return Style.BreakBeforeCloseBracketIf;
    +    if (Previous->isLoop(Style))
    +      return Style.BreakBeforeCloseBracketLoop;
    +    if (Previous->is(tok::kw_switch))
    +      return Style.BreakBeforeCloseBracketSwitch;
    +    return Style.BreakBeforeCloseBracketFunction;
       }
     
       if (Left.isOneOf(tok::r_paren, TT_TrailingAnnotation) &&
           Right.is(TT_TrailingAnnotation) &&
    -      Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) {
    +      Style.BreakBeforeCloseBracketFunction) {
         return false;
       }
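// --- Illustrative sketch: the per-construct dispatch above ---
// Whether a break before ')' is allowed now depends on what opened it:
//   if (...)        -> BreakBeforeCloseBracketIf
//   for/while (...) -> BreakBeforeCloseBracketLoop
//   switch (...)    -> BreakBeforeCloseBracketSwitch
//   anything else   -> BreakBeforeCloseBracketFunction
// e.g. with BreakBeforeCloseBracketSwitch:
//   switch (
//       computeSelector(value)
//   ) { ... }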
     
    diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
    index ac9c81d4416c9..d31d656a63fc5 100644
    --- a/clang/lib/Format/UnwrappedLineFormatter.cpp
    +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
    @@ -285,7 +285,8 @@ class LineJoiner {
           if (Tok && Tok->is(tok::kw_typedef))
             Tok = Tok->getNextNonComment();
           if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union,
    -                              tok::kw_extern, Keywords.kw_interface)) {
    +                              tok::kw_extern, Keywords.kw_interface,
    +                              Keywords.kw_record)) {
             return !Style.BraceWrapping.SplitEmptyRecord && EmptyBlock
                        ? tryMergeSimpleBlock(I, E, Limit)
                        : 0;
    @@ -498,7 +499,8 @@ class LineJoiner {
             ShouldMerge = Style.AllowShortEnumsOnASingleLine;
           } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
             ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
    -      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) {
    +      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
    +                                        TT_RecordLBrace)) {
             // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
             // and structs, but it seems that wrapping is still handled correctly
             // elsewhere.
    @@ -507,7 +509,7 @@ class LineJoiner {
                            !Style.BraceWrapping.SplitEmptyRecord);
           } else if (TheLine->InPPDirective ||
                      TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum,
    -                                          tok::kw_struct)) {
    +                                          tok::kw_struct, Keywords.kw_record)) {
             // Try to merge a block with left brace unwrapped that wasn't yet
             // covered.
             ShouldMerge = !Style.BraceWrapping.AfterFunction ||
    diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
    index 5e2584edac8f4..8b7dd02d548af 100644
    --- a/clang/lib/Format/UnwrappedLineParser.cpp
    +++ b/clang/lib/Format/UnwrappedLineParser.cpp
    @@ -948,7 +948,11 @@ static bool isIIFE(const UnwrappedLine &Line,
     }
     
     static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
    -                                   const FormatToken &InitialToken) {
    +                                   const FormatToken &InitialToken,
    +                                   const bool IsJavaRecord) {
    +  if (IsJavaRecord)
    +    return Style.BraceWrapping.AfterClass;
    +
       tok::TokenKind Kind = InitialToken.Tok.getKind();
       if (InitialToken.is(TT_NamespaceMacro))
         Kind = tok::kw_namespace;
    @@ -3200,7 +3204,7 @@ void UnwrappedLineParser::parseNamespace() {
       if (FormatTok->is(tok::l_brace)) {
         FormatTok->setFinalizedType(TT_NamespaceLBrace);
     
    -    if (ShouldBreakBeforeBrace(Style, InitialToken))
    +    if (ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false))
           addUnwrappedLine();
     
         unsigned AddLevels =
    @@ -3865,7 +3869,7 @@ bool UnwrappedLineParser::parseEnum() {
       }
     
       if (!Style.AllowShortEnumsOnASingleLine &&
    -      ShouldBreakBeforeBrace(Style, InitialToken)) {
    +      ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false)) {
         addUnwrappedLine();
       }
       // Parse enum body.
    @@ -4160,7 +4164,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr, bool IsJavaRecord) {
         if (ParseAsExpr) {
           parseChildBlock();
         } else {
    -      if (ShouldBreakBeforeBrace(Style, InitialToken))
    +      if (ShouldBreakBeforeBrace(Style, InitialToken, IsJavaRecord))
             addUnwrappedLine();
     
           unsigned AddLevels = Style.IndentAccessModifiers ? 2u : 1u;
    diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
    index f24b8ab14bdce..67f2db2d8bb8d 100644
    --- a/clang/lib/Format/WhitespaceManager.cpp
    +++ b/clang/lib/Format/WhitespaceManager.cpp
    @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches,
           CurrentChangeWidthRight = CurrentChange.TokenLength;
         const FormatToken *MatchingParenToEncounter = nullptr;
         for (unsigned J = I + 1;
    -         J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter);
    +         J != E && (Changes[J].NewlinesBefore == 0 ||
    +                    MatchingParenToEncounter || Changes[J].IsAligned);
              ++J) {
           const auto &Change = Changes[J];
           const auto *Tok = Change.Tok;
    @@ -1006,9 +1007,13 @@ void WhitespaceManager::alignTrailingComments() {
         return;
     
       const int Size = Changes.size();
    +  if (Size == 0)
    +    return;
    +
       int MinColumn = 0;
       int StartOfSequence = 0;
       bool BreakBeforeNext = false;
    +  bool IsInPP = Changes.front().Tok->Tok.is(tok::hash);
       int NewLineThreshold = 1;
       if (Style.AlignTrailingComments.Kind == FormatStyle::TCAS_Always)
         NewLineThreshold = Style.AlignTrailingComments.OverEmptyLines + 1;
    @@ -1017,7 +1022,19 @@ void WhitespaceManager::alignTrailingComments() {
         auto &C = Changes[I];
         if (C.StartOfBlockComment)
           continue;
    -    Newlines += C.NewlinesBefore;
    +    if (C.NewlinesBefore != 0) {
    +      Newlines += C.NewlinesBefore;
    +      const bool WasInPP = std::exchange(
    +          IsInPP, C.Tok->Tok.is(tok::hash) || (IsInPP && C.IsTrailingComment) ||
    +                      C.ContinuesPPDirective);
    +      if (IsInPP != WasInPP && !Style.AlignTrailingComments.AlignPPAndNotPP) {
    +        alignTrailingComments(StartOfSequence, I, MinColumn);
    +        MinColumn = 0;
    +        MaxColumn = INT_MAX;
    +        StartOfSequence = I;
    +        Newlines = 0;
    +      }
    +    }
         if (!C.IsTrailingComment)
           continue;
     
    @@ -1215,7 +1232,10 @@ void WhitespaceManager::alignArrayInitializers() {
           bool FoundComplete = false;
           for (unsigned InsideIndex = ChangeIndex + 1; InsideIndex < ChangeEnd;
                ++InsideIndex) {
    -        if (Changes[InsideIndex].Tok == C.Tok->MatchingParen) {
    +        const auto *Tok = Changes[InsideIndex].Tok;
    +        if (Tok->is(tok::pp_define))
    +          break;
    +        if (Tok == C.Tok->MatchingParen) {
               alignArrayInitializers(ChangeIndex, InsideIndex + 1);
               ChangeIndex = InsideIndex + 1;
               FoundComplete = true;
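// --- Illustrative sketch: AlignTrailingComments.AlignPPAndNotPP above ---
// With the option set to false, trailing comments on preprocessor lines and
// on ordinary code lines align as separate sequences (a sketch; true, the
// default, preserves today's single-sequence behavior):
#define FLAG 1        // pp-side comment
int someVariable = 0; // code-side comment, no longer pulled to the pp column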
    diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
    index 6cc7094846155..1169acb389acf 100644
    --- a/clang/lib/Frontend/ASTUnit.cpp
    +++ b/clang/lib/Frontend/ASTUnit.cpp
    @@ -518,14 +518,14 @@ class ASTInfoCollector : public ASTReaderListener {
       LangOptions &LangOpts;
       CodeGenOptions &CodeGenOpts;
       TargetOptions &TargetOpts;
    -  unsigned &Counter;
    +  uint32_t &Counter;
     
     public:
       ASTInfoCollector(HeaderSearchOptions &HSOpts,
                        std::string &SpecificModuleCachePath,
                        PreprocessorOptions &PPOpts, LangOptions &LangOpts,
                        CodeGenOptions &CodeGenOpts, TargetOptions &TargetOpts,
    -                   unsigned &Counter)
    +                   uint32_t &Counter)
           : HSOpts(HSOpts), SpecificModuleCachePath(SpecificModuleCachePath),
             PPOpts(PPOpts), LangOpts(LangOpts), CodeGenOpts(CodeGenOpts),
             TargetOpts(TargetOpts), Counter(Counter) {}
    @@ -577,7 +577,7 @@ class ASTInfoCollector : public ASTReaderListener {
       }
     
       void ReadCounter(const serialization::ModuleFile &M,
    -                   unsigned NewCounter) override {
    +                   uint32_t NewCounter) override {
         Counter = NewCounter;
       }
     };
    diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
    index a916667208845..dac9e0d26f393 100644
    --- a/clang/lib/Frontend/CMakeLists.txt
    +++ b/clang/lib/Frontend/CMakeLists.txt
    @@ -52,6 +52,7 @@ add_clang_library(clangFrontend
       clangAST
       clangBasic
       clangDriver
    +  clangOptions
       clangEdit
       clangLex
       clangParse
    diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
    index 6b09f7f9fc1e3..8034ce9c3f221 100644
    --- a/clang/lib/Frontend/CompilerInstance.cpp
    +++ b/clang/lib/Frontend/CompilerInstance.cpp
    @@ -1058,7 +1058,9 @@ void CompilerInstance::printDiagnosticStats() {
           if (!getLangOpts().CUDAIsDevice) {
             OS << " when compiling for host";
           } else {
    -        OS << " when compiling for " << getTargetOpts().CPU;
    +        OS << " when compiling for "
    +           << (!getTargetOpts().CPU.empty() ? getTargetOpts().CPU
    +                                            : getTarget().getTriple().str());
           }
         }
         OS << ".\n";
    diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
    index bd36eb4ecf9da..a95796924311b 100644
    --- a/clang/lib/Frontend/CompilerInvocation.cpp
    +++ b/clang/lib/Frontend/CompilerInvocation.cpp
    @@ -27,7 +27,6 @@
     #include "clang/Basic/XRayInstr.h"
     #include "clang/Config/config.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/CommandLineSourceLoc.h"
     #include "clang/Frontend/DependencyOutputOptions.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
    @@ -38,6 +37,7 @@
     #include "clang/Frontend/Utils.h"
     #include "clang/Lex/HeaderSearchOptions.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Serialization/ASTBitCodes.h"
     #include "clang/Serialization/ModuleFileExtension.h"
     #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
    @@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() {
     using ArgumentConsumer = CompilerInvocation::ArgumentConsumer;
     
     #define OPTTABLE_STR_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_STR_TABLE_CODE
     
     static llvm::StringRef lookupStrInTable(unsigned Offset) {
    @@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) {
     }
     
     #define SIMPLE_ENUM_VALUE_TABLE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef SIMPLE_ENUM_VALUE_TABLE
     
      static std::optional<bool> normalizeSimpleFlag(OptSpecifier Opt,
    @@ -981,7 +981,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts,
     
     #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef ANALYZER_OPTION_WITH_MARSHALLING
     
       if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) {
    @@ -1068,7 +1068,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args,
     
     #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef ANALYZER_OPTION_WITH_MARSHALLING
     
       if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) {
    @@ -1575,7 +1575,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
     
     #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef CODEGEN_OPTION_WITH_MARSHALLING
     
       if (Opts.OptimizationLevel > 0) {
    @@ -1880,7 +1880,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
     
     #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef CODEGEN_OPTION_WITH_MARSHALLING
     
       // At O0 we want to fully disable inlining outside of cases marked with
    @@ -2371,7 +2371,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts,
       const DependencyOutputOptions &DependencyOutputOpts = Opts;
     #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
     
       if (Opts.ShowIncludesDest != ShowIncludesDestination::None)
    @@ -2406,7 +2406,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts,
       DependencyOutputOptions &DependencyOutputOpts = Opts;
     #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
     
       if (Args.hasArg(OPT_show_includes)) {
    @@ -2534,7 +2534,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts,
     
     #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
     }
     
    @@ -2546,7 +2546,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args,
     
     #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
     
       return Diags.getNumErrors() == NumErrorsBefore;
    @@ -2557,7 +2557,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts,
       const MigratorOptions &MigratorOpts = Opts;
     #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef MIGRATOR_OPTION_WITH_MARSHALLING
     }
     
    @@ -2569,7 +2569,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args,
     
     #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef MIGRATOR_OPTION_WITH_MARSHALLING
     
       return Diags.getNumErrors() == NumErrorsBefore;
    @@ -2581,7 +2581,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs(
       const DiagnosticOptions *DiagnosticOpts = &Opts;
     #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DIAG_OPTION_WITH_MARSHALLING
     
       if (!Opts.DiagnosticSerializationFile.empty())
    @@ -2686,7 +2686,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args,
     
     #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
       PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef DIAG_OPTION_WITH_MARSHALLING
     
       llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes);
    @@ -2836,7 +2836,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts,
       const FrontendOptions &FrontendOpts = Opts;
     #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FRONTEND_OPTION_WITH_MARSHALLING
     
        std::optional<frontend::ActionKind> ProgramActionOpt =
    @@ -3006,7 +3006,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
     
     #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef FRONTEND_OPTION_WITH_MARSHALLING
     
       Opts.ProgramAction = frontend::ParseSyntaxOnly;
    @@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
       const HeaderSearchOptions *HeaderSearchOpts = &Opts;
     #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
     
       if (Opts.UseLibcxx)
    @@ -3403,7 +3403,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
     
     #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
     
       if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ))
    @@ -3736,7 +3736,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
     
     #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef LANG_OPTION_WITH_MARSHALLING
     
       // The '-fcf-protection=' option is generated by CodeGenOpts generator.
    @@ -4049,18 +4049,18 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       // -cl-std only applies for OpenCL language standards.
       // Override the -std option in this case.
       if (const Arg *A = Args.getLastArg(OPT_cl_std_EQ)) {
    -    LangStandard::Kind OpenCLLangStd
     -      = llvm::StringSwitch<LangStandard::Kind>(A->getValue())
    -        .Cases("cl", "CL", LangStandard::lang_opencl10)
    -        .Cases("cl1.0", "CL1.0", LangStandard::lang_opencl10)
    -        .Cases("cl1.1", "CL1.1", LangStandard::lang_opencl11)
    -        .Cases("cl1.2", "CL1.2", LangStandard::lang_opencl12)
    -        .Cases("cl2.0", "CL2.0", LangStandard::lang_opencl20)
    -        .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30)
    -        .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10)
    -        .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10)
    -        .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021)
    -        .Default(LangStandard::lang_unspecified);
    +    LangStandard::Kind OpenCLLangStd =
     +        llvm::StringSwitch<LangStandard::Kind>(A->getValue())
    +            .Cases({"cl", "CL"}, LangStandard::lang_opencl10)
    +            .Cases({"cl1.0", "CL1.0"}, LangStandard::lang_opencl10)
    +            .Cases({"cl1.1", "CL1.1"}, LangStandard::lang_opencl11)
    +            .Cases({"cl1.2", "CL1.2"}, LangStandard::lang_opencl12)
    +            .Cases({"cl2.0", "CL2.0"}, LangStandard::lang_opencl20)
    +            .Cases({"cl3.0", "CL3.0"}, LangStandard::lang_opencl30)
    +            .Cases({"clc++", "CLC++"}, LangStandard::lang_openclcpp10)
    +            .Cases({"clc++1.0", "CLC++1.0"}, LangStandard::lang_openclcpp10)
    +            .Cases({"clc++2021", "CLC++2021"}, LangStandard::lang_openclcpp2021)
    +            .Default(LangStandard::lang_unspecified);
     
         if (OpenCLLangStd == LangStandard::lang_unspecified) {
           Diags.Report(diag::err_drv_invalid_value)
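// --- Illustrative sketch: the brace-list Cases() overload used above ---
// A minimal sketch assuming the llvm::StringSwitch API this patch targets,
// where Cases() takes an initializer list of spellings:
static LangStandard::Kind classify(llvm::StringRef Value) {
  return llvm::StringSwitch<LangStandard::Kind>(Value)
      .Cases({"cl", "CL"}, LangStandard::lang_opencl10)
      .Default(LangStandard::lang_unspecified);
}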
    @@ -4082,7 +4082,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
     
     #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef LANG_OPTION_WITH_MARSHALLING
     
       if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) {
    @@ -4600,7 +4600,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
             // Validate that if fnative-half-type is given, that
             // the language standard is at least hlsl2018, and that
             // the target shader model is at least 6.2.
    -        if (Args.getLastArg(OPT_fnative_half_type)) {
    +        if (Args.getLastArg(OPT_fnative_half_type) ||
    +            Args.getLastArg(OPT_fnative_int16_type)) {
               const LangStandard &Std =
                   LangStandard::getLangStandardForKind(Opts.LangStd);
               if (!(Opts.LangStd >= LangStandard::lang_hlsl2018 &&
    @@ -4614,12 +4615,16 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
               Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported)
                   << VulkanEnv << T.getOSName() << T.str();
             }
    -        if (Args.getLastArg(OPT_fnative_half_type)) {
    +        if (Args.getLastArg(OPT_fnative_half_type) ||
    +            Args.getLastArg(OPT_fnative_int16_type)) {
    +          const char *Str = Args.getLastArg(OPT_fnative_half_type)
    +                                ? "-fnative-half-type"
    +                                : "-fnative-int16-type";
               const LangStandard &Std =
                   LangStandard::getLangStandardForKind(Opts.LangStd);
               if (!(Opts.LangStd >= LangStandard::lang_hlsl2018))
                 Diags.Report(diag::err_drv_hlsl_16bit_types_unsupported)
    -                << "-fnative-half-type" << false << Std.getName();
    +                << Str << false << Std.getName();
             }
           } else {
             llvm_unreachable("expected DXIL or SPIR-V target");
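// --- Illustrative sketch: the shared gating above ---
// -fnative-half-type and -fnative-int16-type now trip the same checks
// (HLSL standard >= 2018, shader model >= 6.2), and the diagnostic names
// whichever flag was actually passed, via the Str selection above.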
    @@ -4740,7 +4745,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts,
     
     #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
     
       if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate)
    @@ -4814,7 +4819,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args,
     
     #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
     
       Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) ||
    @@ -4907,7 +4912,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts,
     
     #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
     
       bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP;
    @@ -4928,7 +4933,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts,
     
     #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
     
       Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM);
    @@ -4943,7 +4948,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts,
       const TargetOptions *TargetOpts = &Opts;
     #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
       GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef TARGET_OPTION_WITH_MARSHALLING
     
       if (!Opts.SDKVersion.empty())
    @@ -4962,7 +4967,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args,
     
     #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
       PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef TARGET_OPTION_WITH_MARSHALLING
     
       if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) {
    @@ -5275,6 +5280,86 @@ std::string CompilerInvocation::getModuleHash() const {
       return toString(llvm::APInt(64, Hash), 36, /*Signed=*/false);
     }
     
    +void CompilerInvocationBase::visitPathsImpl(
     +    llvm::function_ref<bool(std::string &)> Predicate) {
    +#define RETURN_IF(PATH)                                                        \
    +  do {                                                                         \
    +    if (Predicate(PATH))                                                       \
    +      return;                                                                  \
    +  } while (0)
    +
    +#define RETURN_IF_MANY(PATHS)                                                  \
    +  do {                                                                         \
    +    if (llvm::any_of(PATHS, Predicate))                                        \
    +      return;                                                                  \
    +  } while (0)
    +
    +  auto &HeaderSearchOpts = *this->HSOpts;
    +  // Header search paths.
    +  RETURN_IF(HeaderSearchOpts.Sysroot);
    +  for (auto &Entry : HeaderSearchOpts.UserEntries)
    +    if (Entry.IgnoreSysRoot)
    +      RETURN_IF(Entry.Path);
    +  RETURN_IF(HeaderSearchOpts.ResourceDir);
    +  RETURN_IF(HeaderSearchOpts.ModuleCachePath);
    +  RETURN_IF(HeaderSearchOpts.ModuleUserBuildPath);
    +  for (auto &[Name, File] : HeaderSearchOpts.PrebuiltModuleFiles)
    +    RETURN_IF(File);
    +  RETURN_IF_MANY(HeaderSearchOpts.PrebuiltModulePaths);
    +  RETURN_IF_MANY(HeaderSearchOpts.VFSOverlayFiles);
    +
    +  // Preprocessor options.
    +  auto &PPOpts = *this->PPOpts;
    +  RETURN_IF_MANY(PPOpts.MacroIncludes);
    +  RETURN_IF_MANY(PPOpts.Includes);
    +  RETURN_IF(PPOpts.ImplicitPCHInclude);
    +
    +  // Frontend options.
    +  auto &FrontendOpts = *this->FrontendOpts;
    +  for (auto &Input : FrontendOpts.Inputs) {
    +    if (Input.isBuffer())
    +      continue;
    +
    +    RETURN_IF(Input.File);
    +  }
    +  RETURN_IF(FrontendOpts.CodeCompletionAt.FileName);
    +  RETURN_IF_MANY(FrontendOpts.ModuleMapFiles);
    +  RETURN_IF_MANY(FrontendOpts.ModuleFiles);
    +  RETURN_IF_MANY(FrontendOpts.ModulesEmbedFiles);
    +  RETURN_IF_MANY(FrontendOpts.ASTMergeFiles);
    +  RETURN_IF(FrontendOpts.OverrideRecordLayoutsFile);
    +  RETURN_IF(FrontendOpts.StatsFile);
    +
    +  // Filesystem options.
    +  auto &FileSystemOpts = *this->FSOpts;
    +  RETURN_IF(FileSystemOpts.WorkingDir);
    +
    +  // Codegen options.
    +  auto &CodeGenOpts = *this->CodeGenOpts;
    +  RETURN_IF(CodeGenOpts.DebugCompilationDir);
    +  RETURN_IF(CodeGenOpts.CoverageCompilationDir);
    +
    +  // Sanitizer options.
    +  RETURN_IF_MANY(LangOpts->NoSanitizeFiles);
    +
    +  // Coverage mappings.
    +  RETURN_IF(CodeGenOpts.ProfileInstrumentUsePath);
    +  RETURN_IF(CodeGenOpts.SampleProfileFile);
    +  RETURN_IF(CodeGenOpts.ProfileRemappingFile);
    +
    +  // Dependency output options.
    +  for (auto &ExtraDep : DependencyOutputOpts->ExtraDeps)
    +    RETURN_IF(ExtraDep.first);
    +}
    +
    +void CompilerInvocationBase::visitPaths(
    +    llvm::function_ref<bool(StringRef)> Callback) const {
    +  // The const_cast here is OK, because visitPathsImpl() itself doesn't modify
    +  // the invocation, and our callback takes immutable StringRefs.
    +  return const_cast<CompilerInvocationBase *>(this)->visitPathsImpl(
    +      [&Callback](std::string &Path) { return Callback(StringRef(Path)); });
    +}
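
A minimal usage sketch for the new visitor (not part of the patch; it assumes a fully initialized CompilerInvocation CI). Returning false from the callback keeps the traversal going, while returning true stops at the first match:

    // Illustrative only: collect every path the invocation references.
    llvm::SmallVector<std::string, 16> AllPaths;
    CI.visitPaths([&](llvm::StringRef Path) {
      if (!Path.empty())
        AllPaths.push_back(Path.str());
      return false; // false == keep visiting
    });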
    +
     void CompilerInvocationBase::generateCC1CommandLine(
         ArgumentConsumer Consumer) const {
       llvm::Triple T(getTargetOpts().Triple);
    diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    index 99212b81fe064..73b81ed906808 100644
    --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
    @@ -14,11 +14,11 @@
     #include "clang/Driver/Action.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/StringRef.h"
     #include "llvm/Option/ArgList.h"
    @@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef<const char *> ArgList,
       if (!C)
         return nullptr;
     
    -  if (C->getArgs().hasArg(driver::options::OPT_fdriver_only))
    +  if (C->getArgs().hasArg(options::OPT_fdriver_only))
         return nullptr;
     
       // Just print the cc1 options if -### was present.
    -  if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) {
    +  if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) {
         C->getJobs().Print(llvm::errs(), "\n", true);
         return nullptr;
       }
    diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp
    index 15fa7de35df97..93e012b163878 100644
    --- a/clang/lib/Frontend/DependencyFile.cpp
    +++ b/clang/lib/Frontend/DependencyFile.cpp
    @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks {
                                         /*IsMissing*/ false);
       }
     
    +  bool EmbedFileNotFound(StringRef FileName) override {
    +    DepCollector.maybeAddDependency(
    +        llvm::sys::path::remove_leading_dotslash(FileName),
    +        /*FromModule=*/false,
    +        /*IsSystem=*/false,
    +        /*IsModuleFile=*/false,
    +        /*IsMissing=*/true);
    +    // Return true to silence the file not found diagnostic.
    +    return true;
    +  }
    +
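
A sketch of what this callback enables (assumptions: a C23 #embed source and make-style dependency output such as -MD): a resource that does not exist yet is recorded as a missing dependency instead of failing the scan, mirroring how missing #include headers are handled:

    // Illustrative only — main.c, built with: clang -MD -c main.c
    const unsigned char Logo[] = {
    #embed "generated/logo.bin" /* may not exist at scan time; now recorded
                                   as a missing dependency rather than an error */
    };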
       void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
                               StringRef FileName, bool IsAngled,
                               CharSourceRange FilenameRange,
    diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
    index d7d56b8166350..3595bbc6c9b9e 100644
    --- a/clang/lib/Frontend/FrontendActions.cpp
    +++ b/clang/lib/Frontend/FrontendActions.cpp
    @@ -1233,20 +1233,6 @@ void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() {
                                         llvm::outs());
     }
     
    -void GetDependenciesByModuleNameAction::ExecuteAction() {
    -  CompilerInstance &CI = getCompilerInstance();
    -  Preprocessor &PP = CI.getPreprocessor();
    -  SourceManager &SM = PP.getSourceManager();
    -  FileID MainFileID = SM.getMainFileID();
    -  SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID);
    -  SmallVector<IdentifierLoc, 2> Path;
    -  IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName);
    -  Path.emplace_back(FileStart, ModuleID);
    -  auto ModResult = CI.loadModule(FileStart, Path, Module::Hidden, false);
    -  PPCallbacks *CB = PP.getPPCallbacks();
    -  CB->moduleImport(SourceLocation(), Path, ModResult);
    -}
    -
     //===----------------------------------------------------------------------===//
     // HLSL Specific Actions
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
    index 47f1d5a6b636c..b88d9f89c5f71 100644
    --- a/clang/lib/Frontend/InitPreprocessor.cpp
    +++ b/clang/lib/Frontend/InitPreprocessor.cpp
    @@ -399,7 +399,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
         Builder.defineMacro("__HLSL_202y",
                             Twine((unsigned)LangOptions::HLSLLangStd::HLSL_202y));
     
    -    if (LangOpts.NativeHalfType)
    +    if (LangOpts.NativeHalfType && LangOpts.NativeInt16Type)
           Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1");
     
         // Shader target information
    @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
       if (LangOpts.PointerAuthIntrinsics)
         Builder.defineMacro("__PTRAUTH__");
     
    +  if (CGOpts.Dwarf2CFIAsm)
    +    Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM");
    +
       // Get other target #defines.
       TI.getTargetDefines(LangOpts, Builder);
     }
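
For illustration (a sketch, not from the patch): GCC predefines the same macro whenever it emits .cfi_* directives, so hand-written inline assembly can guard its CFI annotations on it:

    // Illustrative only: annotate inline asm with CFI when it is available.
    void probe(void) {
      __asm__(
    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
          ".cfi_remember_state\n\t"
    #endif
          "nop\n\t"
    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
          ".cfi_restore_state\n\t"
    #endif
      );
    }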
    @@ -1542,6 +1545,9 @@ void clang::InitializePreprocessor(Preprocessor &PP,
       llvm::raw_string_ostream Predefines(PredefineBuffer);
       MacroBuilder Builder(Predefines);
     
    +  // Ensure that the initial value of __COUNTER__ is hooked up.
    +  PP.setCounterValue(InitOpts.InitialCounterValue);
    +
       // Emit line markers for various builtin sections of the file. The 3 here
       // marks <built-in> as being a system header, which suppresses warnings when
       // the same macro is defined multiple times.
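
As a reminder of what the counter drives (a sketch, independent of this patch): __COUNTER__ expands to the preprocessor's current counter and post-increments it, so a nonzero InitialCounterValue shifts the whole generated sequence:

    // Illustrative only.
    #define CONCAT_(a, b) a##b
    #define CONCAT(a, b) CONCAT_(a, b) // extra level so __COUNTER__ expands first
    #define UNIQUE_NAME(base) CONCAT(base, __COUNTER__)

    int UNIQUE_NAME(tmp_); // tmp_0 with the default initial counter value of 0
    int UNIQUE_NAME(tmp_); // tmp_1; a nonzero initial value shifts both names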
    diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
    index aea3e72d92a84..10032184b5d94 100644
    --- a/clang/lib/Frontend/TextDiagnostic.cpp
    +++ b/clang/lib/Frontend/TextDiagnostic.cpp
    @@ -349,14 +349,13 @@ struct SourceColumnMap {
     
     /// When the source code line we want to print is too long for
     /// the terminal, select the "interesting" region.
    -static void selectInterestingSourceRegion(std::string &SourceLine,
    -                                          std::string &CaretLine,
    -                                          std::string &FixItInsertionLine,
    -                                          Columns NonGutterColumns,
    -                                          const SourceColumnMap &Map) {
    -  Columns CaretColumns = Columns(CaretLine.size());
    -  Columns FixItColumns =
    -      Columns(llvm::sys::locale::columnWidth(FixItInsertionLine));
    +static void selectInterestingSourceRegion(
    +    std::string &SourceLine, std::string &CaretLine,
    +    std::string &FixItInsertionLine, Columns NonGutterColumns,
    +    const SourceColumnMap &Map,
    +    SmallVectorImpl<TextDiagnostic::StyleRange> &Styles) {
    +  Columns CaretColumns = CaretLine.size();
    +  Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine);
       Columns MaxColumns =
           std::max({Map.columns().V, CaretColumns.V, FixItColumns.V});
       // if the number of columns is less than the desired number we're done
    @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
       // Find the slice that we need to display the full caret line
       // correctly.
       Columns CaretStart = 0, CaretEnd = CaretLine.size();
    -  for (; CaretStart != CaretEnd; CaretStart = CaretStart.next())
    -    if (!isWhitespace(CaretLine[CaretStart.V]))
    -      break;
    +  while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V]))
    +    CaretStart = CaretStart.next();
     
    -  for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev())
    -    if (!isWhitespace(CaretLine[CaretEnd.V - 1]))
    -      break;
    +  while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V - 1]))
    +    CaretEnd = CaretEnd.prev();
     
       // caret has already been inserted into CaretLine so the above whitespace
       // check is guaranteed to include the caret
    @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine,
       assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved >
              NonGutterColumns);
     
    +  // Since we've modified the SourceLine, we also need to adjust the line's
    +  // highlighting information. In particular, if we've removed text
    +  // from the front of the line, we need to shift the style ranges to the
    +  // left and drop the ranges that are no longer needed.
    +  // Note in particular that variables like CaretEnd are indices into the
    +  // CaretLine, which only contains ASCII, while the style ranges are defined
    +  // on the source line, where we have to account for the byte-index !=
    +  // column-index case.
    +  Bytes BytesRemoved =
    +      FrontColumnsRemoved > FrontEllipse.size()
    +          ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size()))
    +          : 0;
    +  Bytes CodeEnd =
    +      CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V;
    +  for (TextDiagnostic::StyleRange &R : Styles) {
    +    // Remove style ranges before and after the new truncated snippet.
    +    if (R.Start >= static_cast<unsigned>(CodeEnd.V) ||
    +        R.End < static_cast<unsigned>(BytesRemoved.V)) {
    +      R.Start = R.End = std::numeric_limits<unsigned>::max();
    +      continue;
    +    }
    +    // Move them left. (Note that this can wrap R.Start, but that doesn't
    +    // matter).
    +    R.Start -= BytesRemoved.V;
    +    R.End -= BytesRemoved.V;
    +
    +    // Don't leak into the ellipsis at the end.
    +    if (R.Start < static_cast<unsigned>(CodeEnd.V) &&
    +        R.End > static_cast<unsigned>(CodeEnd.V))
    +      R.End = CodeEnd.V + 1; // R.End is inclusive.
    +  }
    +
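
Worked example (all-ASCII line, so byte index == column index): with FrontEllipse = "..." and FrontColumnsRemoved = 10, BytesRemoved is 10 - 3 = 7. A style range covering bytes 12..20 shifts to 5..13, a range that ends before byte 7 is discarded, and a range reaching past CodeEnd is clamped so the highlighting does not leak into the trailing ellipsis.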
       // The line needs some truncation, and we'd prefer to keep the front
       //  if possible, so remove the back
       if (BackColumnsRemoved > Columns(BackEllipse.size()))
         SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse);
     
       // If that's enough then we're done
    -  if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns))
    +  if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns)
         return;
     
       // Otherwise remove the front as well
    @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret(
           OS.indent(MaxLineNoDisplayWidth + 2) << "| ";
       };
     
    +  Columns MessageLength = DiagOpts.MessageLength;
    +  // If we don't have enough columns available, just abort now.
    +  if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4))
    +    return;
    +
       // Prepare source highlighting information for the lines we're about to
       // emit, starting from the first line.
       std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
    @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret(
     
         // If the source line is too long for our terminal, select only the
         // "interesting" source region within that line.
    -    Columns MessageLength = DiagOpts.MessageLength;
    -    if (MessageLength.V != 0)
    +    if (MessageLength != 0) {
    +      Columns NonGutterColumns = MessageLength;
    +      if (MaxLineNoDisplayWidth != 0)
    +        NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4);
           selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine,
    -                                    MessageLength, SourceColMap);
    +                                    NonGutterColumns, SourceColMap,
    +                                    SourceStyles[LineNo - Lines.first]);
    +    }
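
For example, with -fmessage-length=80 and a snippet whose largest line number has three digits, MaxLineNoDisplayWidth is 3 and the gutter " 123 | " costs 3 + 4 = 7 columns, leaving NonGutterColumns = 73; the early return added above fires only when the gutter alone would exhaust the budget.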
     
         // If we are in -fdiagnostics-print-source-range-info mode, we are trying
         // to produce easily machine parsable output.  Add a space before the
    @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
       // Print the source line one character at a time.
       bool PrintReversed = false;
       std::optional<llvm::raw_ostream::Colors> CurrentColor;
    -  size_t I = 0;
    +  size_t I = 0; // Bytes.
       while (I < SourceLine.size()) {
         auto [Str, WasPrintable] =
             printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop);
    diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt
    index 061e54c3e62d0..66213f76eb968 100644
    --- a/clang/lib/FrontendTool/CMakeLists.txt
    +++ b/clang/lib/FrontendTool/CMakeLists.txt
    @@ -7,6 +7,7 @@ set(link_libs
       clangBasic
       clangCodeGen
       clangDriver
    +  clangOptions
       clangExtractAPI
       clangFrontend
       clangRewriteFrontend
    diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    index c8aad4daa1c10..e571193c6a9c8 100644
    --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
    @@ -13,7 +13,6 @@
     
     #include "clang/CodeGen/CodeGenAction.h"
     #include "clang/Config/config.h"
    -#include "clang/Driver/Options.h"
     #include "clang/ExtractAPI/FrontendActions.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/CompilerInvocation.h"
    @@ -22,6 +21,7 @@
     #include "clang/Frontend/FrontendPluginRegistry.h"
     #include "clang/Frontend/Utils.h"
     #include "clang/FrontendTool/Utils.h"
    +#include "clang/Options/Options.h"
     #include "clang/Rewrite/Frontend/FrontendActions.h"
     #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h"
     #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
    @@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) {
     
       // Honor -help.
       if (Clang->getFrontendOpts().ShowHelp) {
    -    driver::getDriverOptTable().printHelp(
    +    getDriverOptTable().printHelp(
             llvm::outs(), "clang -cc1 [options] file...",
             "LLVM 'Clang' Compiler: http://clang.llvm.org",
             /*ShowHidden=*/false, /*ShowAllAliases=*/false,
    -        llvm::opt::Visibility(driver::options::CC1Option));
    +        llvm::opt::Visibility(options::CC1Option));
         return true;
       }
     
    diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
    index 18589125697b0..33fff7645df65 100644
    --- a/clang/lib/Headers/CMakeLists.txt
    +++ b/clang/lib/Headers/CMakeLists.txt
    @@ -162,18 +162,12 @@ set(x86_files
       adxintrin.h
       ammintrin.h
       amxavx512intrin.h
    -  amxbf16transposeintrin.h
       amxcomplexintrin.h
    -  amxcomplextransposeintrin.h
       amxfp16intrin.h
    -  amxfp16transposeintrin.h
       amxfp8intrin.h
       amxintrin.h
       amxmovrsintrin.h
    -  amxmovrstransposeintrin.h
       amxtf32intrin.h
    -  amxtf32transposeintrin.h
    -  amxtransposeintrin.h
       avx10_2_512bf16intrin.h
       avx10_2_512convertintrin.h
       avx10_2_512minmaxintrin.h
    diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
    index 86123727a1bc3..0226fe95abab6 100644
    --- a/clang/lib/Headers/__clang_cuda_device_functions.h
    +++ b/clang/lib/Headers/__clang_cuda_device_functions.h
    @@ -528,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
     __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
     __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
     __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
    -__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
    +__DEVICE__ __attribute__((noreturn)) void __trap(void) { __builtin_trap(); }
     __DEVICE__ unsigned short
     __usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
       return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
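
A sketch of why the noreturn annotation matters (illustrative, not from the patch): __builtin_trap() lowers to the PTX trap instruction, and once __trap() is known not to return, the compiler can prune the fall-through path:

    // Illustrative only.
    __device__ int checked_div(int a, int b) {
      if (b == 0)
        __trap();   // noreturn: nothing after this call stays live
      return a / b; // the optimizer may assume b != 0 on this path
    }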
    diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
    deleted file mode 100644
    index 86f09f2ad8db2..0000000000000
    --- a/clang/lib/Headers/amxbf16transposeintrin.h
    +++ /dev/null
    @@ -1,94 +0,0 @@
    -/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_BF16TRANSPOSEINTRIN_H
    -#define __AMX_BF16TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -/* Define the default attributes for the functions in this file. */
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-bf16,amx-transpose")))
    -
    -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
    -///    tiles \a a and \a b, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in \a dst, and store the
    -///    32-bit result back to tile \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO (a.colsb / 4) - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
    -///					FP32(b.row[k].bf16[2*n+0])
    -///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
    -///					FP32(b.row[k].bf16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))
    -
    -/// This is internal intrinsic. C/C++ user should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
    -///    tiles src0 and src1, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in "dst", and store the
    -///    32-bit result back to tile "dst".
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
    -                                        __tile1024i src1) {
    -  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
    -                                       src0.tile, src1.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
    deleted file mode 100644
    index 11abaf98e9371..0000000000000
    --- a/clang/lib/Headers/amxcomplextransposeintrin.h
    +++ /dev/null
    @@ -1,303 +0,0 @@
    -/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; include  instead."
    -#endif // __IMMINTRIN_H
    -
    -#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
    -#define __AMX_COMPLEXTRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-complex,amx-transpose")))
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles \a a and \a b is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -/// Calculates the imaginary part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The imaginary part of the \a a element
    -///    is multiplied with the real part of the corresponding \a b element, and
    -///    the real part of the \a a element is multiplied with the imaginary part
    -///    of the corresponding \a b elements. The two accumulated results are
    -///    added, and then accumulated into the corresponding row and column of
    -///    \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tcmmimfp16ps(dst, a, b)                                          \
    -  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles \a a and \a b is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -/// Calculates the real part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The real part of the \a a element is
    -///    multiplied with the real part of the corresponding \a b element, and the
    -///    negated imaginary part of the \a a element is multiplied with the
    -///    imaginary part of the corresponding \a b elements. The two accumulated
    -///    results are added, and then accumulated into the corresponding row and
    -///    column of \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
    -///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tcmmrlfp16ps(dst, a, b)                                          \
    -  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))
    -
    -/// Perform matrix conjugate transpose and multiplication of two tiles
    -///    containing complex elements and accumulate the results into a packed
    -///    single precision tile. Each dword element in input tiles \a a and \a b
    -///    is interpreted as a complex number with FP16 real part and FP16 imaginary
    -///    part.
    -/// Calculates the imaginary part of the result. For each possible combination
    -///    of (transposed column of \a a, column of \a b), it performs a set of
    -///    multiplication and accumulations on all corresponding complex numbers
    -///    (one from \a a and one from \a b). The negated imaginary part of the \a a
    -///    element is multiplied with the real part of the corresponding \a b
    -///    element, and the real part of the \a a element is multiplied with the
    -///    imaginary part of the corresponding \a b elements. The two accumulated
    -///    results are added, and then accumulated into the corresponding row and
    -///    column of \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO a.rows - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
    -///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
    -  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))
    -
    -/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
    -///    and writes the result to \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_conjtfp16(__tile dst, __tile a);
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR i := 0 TO dst.rows - 1
    -///	FOR j := 0 TO (dst.colsb / 4) - 1
    -///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
    -///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
    -///	ENDFOR
    -///	write_row_and_zero(dst, i, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The source tile. Max size is 1024 Bytes.
    -#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
    -    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    -    _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
    -  return __builtin_ia32_tconjtfp16_internal(m, n, src);
    -}
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles src0 and src1 is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -///    This function calculates the imaginary part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                __tile1024i src1) {
    -  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
    -                                          dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform matrix multiplication of two tiles containing complex elements and
    -///    accumulate the results into a packed single precision tile. Each dword
    -///    element in input tiles src0 and src1 is interpreted as a complex number
    -///    with FP16 real part and FP16 imaginary part.
    -///    This function calculates the real part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                __tile1024i src1) {
    -  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
    -                                          dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform matrix conjugate transpose and multiplication of two tiles
    -///    containing complex elements and accumulate the results into a packed
    -///    single precision tile. Each dword element in input tiles src0 and src1
    -///    is interpreted as a complex number with FP16 real part and FP16 imaginary
    -///    part.
    -///    This function calculates the imaginary part of the result.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                    __tile1024i src1) {
    -  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
    -                                              dst->tile, src0.tile, src1.tile);
    -}
    -
    -/// Perform conjugate transpose of an FP16-pair of complex elements from src and
    -///    writes the result to dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -///    The source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
    -  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif // __x86_64__
    -#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
    diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h
    deleted file mode 100644
    index 191f8c6097a2c..0000000000000
    --- a/clang/lib/Headers/amxfp16transposeintrin.h
    +++ /dev/null
    @@ -1,94 +0,0 @@
    -/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_FP16TRANSPOSEINTRIN_H
    -#define __AMX_FP16TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -/* Define the default attributes for the functions in this file. */
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-fp16,amx-transpose")))
    -
    -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
    -///    tiles \a a and \a b, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in \a dst, and store the
    -///    32-bit result back to tile \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
    -/// \endcode
    -///
    -/// \code{.operation}
    -/// FOR m := 0 TO dst.rows - 1
    -///	tmp := dst.row[m]
    -///	FOR k := 0 TO (a.colsb / 4) - 1
    -///		FOR n := 0 TO (dst.colsb / 4) - 1
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
    -///					FP32(b.row[k].fp16[2*n+0])
    -///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
    -///					FP32(b.row[k].fp16[2*n+1])
    -///		ENDFOR
    -///	ENDFOR
    -///	write_row_and_zero(dst, m, tmp, dst.colsb)
    -/// ENDFOR
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))
    -
    -/// This is internal intrinsic. C/C++ user should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS
    -_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
    -///    tiles src0 and src1, accumulating the intermediate single-precision
    -///    (32-bit) floating-point elements with elements in "dst", and store the
    -///    32-bit result back to tile "dst".
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS
    -static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
    -                                        __tile1024i src1) {
    -  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
    -                                       src0.tile, src1.tile);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
    index a7da10d9951e7..208aa3580625f 100644
    --- a/clang/lib/Headers/amxintrin.h
    +++ b/clang/lib/Headers/amxintrin.h
    @@ -230,8 +230,6 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
     /// bytes. Since there is no 2D type in llvm IR, we use vector type to
     /// represent 2D tile and the fixed size is maximum amx tile register size.
     typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
    -typedef int _tile1024i_1024a
    -    __attribute__((__vector_size__(1024), __aligned__(1024)));
     
     /// This is internal intrinsic. C/C++ user should avoid calling it directly.
     static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
    diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
    deleted file mode 100644
    index 5f48cba949f34..0000000000000
    --- a/clang/lib/Headers/amxmovrstransposeintrin.h
    +++ /dev/null
    @@ -1,200 +0,0 @@
    -/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - * ===-----------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
    -#define __AMX_MOVRS_TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS                                                     \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-transpose,amx-movrs")))
    -
    -#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
    -#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
    -  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
    -#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
    -#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
    -  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  // Use __tile1024i_1024a* to escape the alignment check in
    -  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
    -  __builtin_ia32_t2rpntlvwz0rs_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz0rst1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1rs_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1rst1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RST1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
    -                                  const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                                &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1RS </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely become
    -/// read shared in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1RST1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS
    -static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
    -                                  const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                                &dst1->tile, base, stride);
    -}
    -
    -#undef __DEFAULT_FN_ATTRS
    -#endif /* __x86_64__ */
    -#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
    diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h
    deleted file mode 100644
    index e1b90c1adfb22..0000000000000
    --- a/clang/lib/Headers/amxtf32transposeintrin.h
    +++ /dev/null
    @@ -1,105 +0,0 @@
    -/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - *===------------------------------------------------------------------------===
    - */
    -#ifndef __IMMINTRIN_H
    -#error                                                                         \
    -    "Never use  directly; include  instead."
    -#endif // __IMMINTRIN_H
    -
    -#ifndef __AMX_TF32TRANSPOSEINTRIN_H
    -#define __AMX_TF32TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE                                      \
    -  __attribute__((__always_inline__, __nodebug__,                               \
    -                 __target__("amx-tf32,amx-transpose")))
    -
    -/// \code
    -/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
    -///                        constexpr int b);
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
    -///
    -/// \param srcdst
    -/// 	The destination tile. Max size is 1024 Bytes.
    -/// \param a
    -/// 	The 1st source tile. Max size is 1024 Bytes.
    -/// \param b
    -/// 	The 2nd source tile. Max size is 1024 Bytes.
    -///
    -/// \code{.operation}
    -/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
    -/// 	dword[12:0] := 0
    -/// 	dword[31:13] := x[31:13]
    -/// 	return dword
    -/// }
    -///
    -/// DEFINE silence_snan_fp32(x[31:0]) {
    -/// 	IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
    -/// 		x.fraction[22] := 1
    -/// 	return x
    -/// }
    -///
    -/// elements_dest:= srcdst.colsb/4
    -///
    -/// FOR m := 0 TO (srcdst.rows-1)
    -/// 	tmp[511:0] := 0
    -/// 	FOR k := 0 TO (a.rows-1)
    -/// 		FOR n := 0 TO (elements_dest-1)
    -/// 			a1e := silence_snan_fp32(a.row[k].fp32[m])
    -/// 			a2e := silence_snan_fp32(b.row[k].fp32[n])
    -/// 			s1e := zero_lower_mantissa_bits_fp32(a1e)
    -/// 			s2e := zero_lower_mantissa_bits_fp32(a2e)
    -/// 			tmp.fp32[n] += s1e * s2e
    -/// 		ENDFOR
    -/// 	ENDFOR
    -///
    -/// 	FOR n := 0 TO (elements_dest-1)
    -/// 		tmp.fp32[n] += srcdst.row[m].fp32[n]
    -/// 	ENDFOR
    -///	write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
    -///
    -/// ENDFOR
    -///
    -/// zero_upper_rows(srcdst, srcdst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -#define _tile_tmmultf32ps(srcdst, a, b)                                        \
    -  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))
    -
    -// dst = m x n (srcdest), src1 = k x m, src2 = k x n
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
    -_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
    -                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
    -  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
    -}
    -
    -/// Compute the transpose of src0, do matrix multiplication with src1, and then
    -/// add the result to dst. All the calculation is based on float32, but with
    -/// the lower 13 bits of each element set to 0.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src0
    -///    The 1st source tile. Max size is 1024 Bytes.
    -/// \param src1
    -///    The 2nd source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
    -static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
    -                               __tile1024i src1) {
    -  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
    -                                         dst->tile, src0.tile, src1.tile);
    -}
    -
    -#endif // __x86_64__
    -#endif // __AMX_TF32TRANSPOSEINTRIN_H
    diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
    deleted file mode 100644
    index b3fa37d766c45..0000000000000
    --- a/clang/lib/Headers/amxtransposeintrin.h
    +++ /dev/null
    @@ -1,248 +0,0 @@
    -/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
    - *
    - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    - * See https://llvm.org/LICENSE.txt for license information.
    - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    - *
    - * ===-----------------------------------------------------------------------===
    - */
    -
    -#ifndef __IMMINTRIN_H
    -#error "Never use  directly; use  instead."
    -#endif /* __IMMINTRIN_H */
    -
    -#ifndef __AMX_TRANSPOSEINTRIN_H
    -#define __AMX_TRANSPOSEINTRIN_H
    -#ifdef __x86_64__
    -
    -#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
    -  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
    -
    -#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
    -  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
    -#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
    -#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
    -  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
    -#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
    -  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
    -
    -/// Transpose 32-bit elements from \a src and write the result to \a dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// \code
    -/// void _tile_transposed(__tile dst, __tile src);
    -/// \endcode
    -///
    -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
    -///
    -/// \param dst
    -/// 	The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -/// 	The source tile. Max size is 1024 Bytes.
    -///
    -/// \code{.operation}
    -///
    -/// FOR i := 0 TO (dst.rows-1)
    -/// 	tmp[511:0] := 0
    -/// 	FOR j := 0 TO (dst.colsb/4-1)
    -/// 		tmp.dword[j] := src.row[j].dword[i]
    -/// 	ENDFOR
    -/// 	dst.row[i] := tmp
    -/// ENDFOR
    -///
    -/// zero_upper_rows(dst, dst.rows)
    -/// zero_tileconfig_start()
    -/// \endcode
    -#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  // Use __tile1024i_1024a* to escape the alignment check in
    -  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
    -  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
    -                                      (_tile1024i_1024a *)dst1, base,
    -                                      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz0t1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
    -                                      (_tile1024i_1024a *)dst1, base,
    -                                      (__SIZE_TYPE__)(stride));
    -}
    -
    -static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    -    unsigned short row, unsigned short col0, unsigned short col1,
    -    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    -    __SIZE_TYPE__ stride) {
    -  __builtin_ia32_t2rpntlvwz1t1_internal(
    -      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
    -      (__SIZE_TYPE__)(stride));
    -}
    -
    -// This is an internal intrinsic. C/C++ users should avoid calling it directly.
    -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
    -_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
    -  return __builtin_ia32_ttransposed_internal(m, n, src);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
    -                              const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                            &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
    -                              const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                            &dst1->tile, base, stride);
    -}
    -
    -/// Converts a pair of tiles from memory into VNNI format, and places the
    -/// results in a pair of destinations specified by dst. The pair of tiles
    -/// in memory is specified via a tsib; the second tile is after the first
    -/// one, separated by the same stride that separates each row.
    -/// The tile configuration for the destination tiles indicates the amount
    -/// of data to read from memory. The instruction will load a number of rows
    -/// that is equal to twice the number of rows in tmm1. The size of each row
    -/// is equal to the average width of the destination tiles. If the second
    -/// tile is configured with zero rows and columns, only the first tile will
    -/// be written. The last row will not be read from memory but instead
    -/// filled with zeros.
    -/// Provides a hint to the implementation that the data will likely not be
    -/// reused in the near future and the data caching can be optimized.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
    -///
    -/// \param dst0
    -///    First tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param dst1
    -///    Second tile of destination tile pair. Max size is 1024*2 Bytes.
    -/// \param base
    -///    A pointer to base address.
    -/// \param stride
    -///    The stride between the rows' data to be loaded in memory.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
    -                                const void *base, __SIZE_TYPE__ stride) {
    -  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
    -                              &dst1->tile, base, stride);
    -}
    -
    -/// Transpose 32-bit elements from src and write the result to dst.
    -///
    -/// \headerfile <immintrin.h>
    -///
    -/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
    -///
    -/// \param dst
    -///    The destination tile. Max size is 1024 Bytes.
    -/// \param src
    -///    The source tile. Max size is 1024 Bytes.
    -__DEFAULT_FN_ATTRS_TRANSPOSE
    -static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
    -  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
    -}
    -
    -#endif /* __x86_64__ */
    -#endif /* __AMX_TRANSPOSEINTRIN_H */
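    Similarly, a hedged sketch of the removed transpose and paired-load
    helpers; the tile shapes and function name are assumptions for
    illustration, not part of the patch:

        // Hypothetical usage of the removed AMX-TRANSPOSE helpers.
        #include <immintrin.h>

        void load_pair_then_transpose(const void *base, __SIZE_TYPE__ stride,
                                      void *out, __SIZE_TYPE__ out_stride) {
          __tile1024i lo = {16, 64}, hi = {16, 64}; // destination pair
          __tile1024i t = {16, 64};                 // transposed result
          __tile_2rpntlvwz0(&lo, &hi, base, stride); // paired VNNI load
          __tile_transposed(&t, lo);                 // 32-bit transpose of lo
          __tile_stored(out, out_stride, t);
        }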
    diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
    index 37ebc4f46a826..46ec12a63ef9c 100644
    --- a/clang/lib/Headers/avx10_2_512bf16intrin.h
    +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
    @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
       __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"),        \
                      __min_vector_width__(512)))
     
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
    +#endif
    +
     static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
       return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
     }
    @@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
                                                     (__v32bf)__A);
     }
     
    -static __inline__ __m512bh __DEFAULT_FN_ATTRS512
    +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
       return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                       (__v32hi)__B);
    @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
           (__v32bf)_mm512_setzero_pbh());
     }
     
    +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS512
     
     #endif
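    The practical effect of the new *_CONSTEXPR attribute set is that, under
    C++11 or later, the annotated permute becomes usable in constant
    expressions. A minimal sketch, assuming the underlying builtin is
    constant-evaluatable as this patch intends (wrapper name is hypothetical):

        #include <immintrin.h>

        // Valid only because _mm512_permutex2var_pbh is now declared
        // constexpr in C++ mode.
        constexpr __m512bh select2(__m512bh a, __m512i idx, __m512bh b) {
          return _mm512_permutex2var_pbh(a, idx, b);
        }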
    diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
    index 765cd682986b4..8fb8cd7cd0865 100644
    --- a/clang/lib/Headers/avx10_2bf16intrin.h
    +++ b/clang/lib/Headers/avx10_2bf16intrin.h
    @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
       __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"),        \
                      __min_vector_width__(128)))
     
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
    +#endif
    +
     static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
       return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
     }
    @@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
                                                     (__v16bf)__A);
     }
     
    -static __inline__ __m128bh __DEFAULT_FN_ATTRS128
    +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
       return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                       (__v8hi)__B);
     }
     
    -static __inline__ __m256bh __DEFAULT_FN_ATTRS256
    +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
       return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                       (__v16hi)__B);
    @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
     
     #undef __DEFAULT_FN_ATTRS128
     #undef __DEFAULT_FN_ATTRS256
    -
    +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
    +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
     #endif
     #endif
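    The 128- and 256-bit variants gain the same property; e.g., a hedged
    256-bit counterpart of the sketch above (name again hypothetical):

        #include <immintrin.h>

        constexpr __m256bh select2_256(__m256bh a, __m256i idx, __m256bh b) {
          return _mm256_permutex2var_pbh(a, idx, b); // constexpr per this patch
        }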
    diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
    index ac75b6ccde735..4a02c96620335 100644
    --- a/clang/lib/Headers/avx512bwintrin.h
    +++ b/clang/lib/Headers/avx512bwintrin.h
    @@ -92,69 +92,65 @@ _kxor_mask64(__mmask64 __A, __mmask64 __B) {
       return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) {
       return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
       return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
    @@ -515,7 +511,7 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -523,9 +519,8 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                            (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
                                            (__v32hi)_mm512_packs_epi32(__A, __B),
                                            (__v32hi)__W);
    @@ -536,7 +531,7 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -544,7 +539,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -557,7 +552,7 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -565,7 +560,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                            (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
    @@ -578,7 +573,7 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -586,7 +581,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
    @@ -599,17 +594,15 @@ _mm512_adds_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_adds_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_adds_epi8(__A, __B),
                                             (__v64qi)_mm512_setzero_si512());
    @@ -620,7 +613,7 @@ _mm512_adds_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -628,7 +621,7 @@ _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -641,7 +634,7 @@ _mm512_adds_epu8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -649,7 +642,7 @@ _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -662,17 +655,15 @@ _mm512_adds_epu16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                             (__v32hi)_mm512_adds_epu16(__A, __B),
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                             (__v32hi)_mm512_adds_epu16(__A, __B),
                                             (__v32hi)_mm512_setzero_si512());
    @@ -890,7 +881,7 @@ _mm512_subs_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -898,7 +889,7 @@ _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -911,7 +902,7 @@ _mm512_subs_epi16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -919,7 +910,7 @@ _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -932,7 +923,7 @@ _mm512_subs_epu8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -940,7 +931,7 @@ _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
    @@ -953,7 +944,7 @@ _mm512_subs_epu16(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -961,7 +952,7 @@ _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
     {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
    @@ -969,35 +960,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
                                             (__v32hi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                      (__v32hi)__B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512(__U,
                                   (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                                   (__v32hi)_mm512_setzero_si512());
    @@ -1199,14 +1186,14 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
                                               62, 64+62, 63, 64+63);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpackhi_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpackhi_epi8(__A, __B),
    @@ -1226,14 +1213,14 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
                                               30, 32+30, 31, 32+31);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpackhi_epi16(__A, __B),
                                            (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpackhi_epi16(__A, __B),
    @@ -1261,14 +1248,14 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
                                               54, 64+54, 55, 64+55);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpacklo_epi8(__A, __B),
                                             (__v64qi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                             (__v64qi)_mm512_unpacklo_epi8(__A, __B),
    @@ -1288,14 +1275,14 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
                                               26, 32+26, 27, 32+27);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpacklo_epi16(__A, __B),
                                            (__v32hi)__W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                            (__v32hi)_mm512_unpacklo_epi16(__A, __B),
    @@ -1574,7 +1561,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) {
       ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a),         \
                                                     (int)(imm)))
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
     {
       return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
    @@ -1582,23 +1569,21 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
                     (__v32hi) __W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
       return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
                     (__v32hi) __A,
                     (__v32hi) _mm512_setzero_si512 ());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
       return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
                     (__v64qi) __A,
                     (__v64qi) __W);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
     {
       return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
    @@ -1606,7 +1591,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
                     (__v64qi) _mm512_setzero_si512 ());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
     {
       return (__m512i) __builtin_ia32_selectb_512(__M,
    @@ -1614,9 +1599,8 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
                                                   (__v64qi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       return (__m512i) __builtin_ia32_selectb_512(__M,
                                                   (__v64qi) _mm512_set1_epi8(__A),
                                                   (__v64qi) _mm512_setzero_si512());
    @@ -1809,7 +1793,7 @@ _mm512_broadcastb_epi8(__m128i __A) {
                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectb_512(__M,
    @@ -1817,15 +1801,14 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
                                                  (__v64qi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       return (__m512i)__builtin_ia32_selectb_512(__M,
                                                  (__v64qi) _mm512_broadcastb_epi8(__A),
                                                  (__v64qi) _mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
     {
       return (__m512i) __builtin_ia32_selectw_512(__M,
    @@ -1833,9 +1816,8 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
                                                   (__v32hi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       return (__m512i) __builtin_ia32_selectw_512(__M,
                                                   (__v32hi) _mm512_set1_epi16(__A),
                                                   (__v32hi) _mm512_setzero_si512());
    @@ -1848,7 +1830,7 @@ _mm512_broadcastw_epi16(__m128i __A) {
                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectw_512(__M,
    @@ -1856,7 +1838,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
                                                  (__v32hi) __O);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
     {
       return (__m512i)__builtin_ia32_selectw_512(__M,
    diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h
    index fb6dcb6dd8ad1..f9de207b764a2 100644
    --- a/clang/lib/Headers/avx512cdintrin.h
    +++ b/clang/lib/Headers/avx512cdintrin.h
    @@ -17,8 +17,8 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS                                                     \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512cd"), __min_vector_width__(512)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"),       \
    +                 __min_vector_width__(512))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS                                                     \
       __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"),       \
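    The change here only moves constexpr from before the GNU attribute list to
    after it; both spellings declare the same entity, and the new ordering
    matches the convention the rest of this patch uses. A reduced,
    self-contained illustration with a hypothetical function:

        // Attribute list first, then constexpr -- the ordering adopted here.
        __attribute__((__always_inline__, __nodebug__)) constexpr int
        add_one(int x) {
          return x + 1;
        }
        static_assert(add_one(41) == 42, "usable in constant expressions");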
    diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
    index fef1a2d64d538..29156e7e96434 100644
    --- a/clang/lib/Headers/avx512dqintrin.h
    +++ b/clang/lib/Headers/avx512dqintrin.h
    @@ -59,55 +59,49 @@ _kxor_mask8(__mmask8 __A, __mmask8 __B) {
       return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
       return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
       return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
    diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
    index 18c4a44a4c76e..997e9608e112f 100644
    --- a/clang/lib/Headers/avx512fintrin.h
    +++ b/clang/lib/Headers/avx512fintrin.h
    @@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
     
     /* Vector permutations */
     
    -static __inline __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                     (__v16si) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectd_512(__U,
                                   (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                                   (__v16si)_mm512_setzero_si512());
     }
     
    -static __inline __m512i __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
    -{
    +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                     (__v8di) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS512
    +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
    -                                __m512i __B)
    -{
    +                                __m512i __B) {
       return (__m512i)__builtin_ia32_selectq_512(__U,
                                    (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                    (__v8di)_mm512_setzero_si512());
    @@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
                                             (__v16sf)_mm512_setzero_ps());
     }
     
    -static __inline __m512d __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
    -{
    +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) {
       return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                      (__v8df)__B);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
    -{
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I,
    +                            __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)__A);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
    -                             __m512d __B)
    -{
    +                             __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)(__m512d)__I);
     }
     
    -static __inline__ __m512d __DEFAULT_FN_ATTRS512
    +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
    -                             __m512d __B)
    -{
    +                             __m512d __B) {
       return (__m512d)__builtin_ia32_selectpd_512(__U,
                                       (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                       (__v8df)_mm512_setzero_pd());
     }
     
    -static __inline __m512 __DEFAULT_FN_ATTRS512
    -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
    -{
    +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) {
       return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                     (__v16sf) __B);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I,
    +                            __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)__A);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U,
    +                             __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)(__m512)__I);
     }
     
    -static __inline__ __m512 __DEFAULT_FN_ATTRS512
    -_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
    -{
    +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
    +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I,
    +                             __m512 __B) {
       return (__m512)__builtin_ia32_selectps_512(__U,
                                      (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                      (__v16sf)_mm512_setzero_ps());
     }
     
    -
     #define _mm512_cvtt_roundpd_epu32(A, R) \
       ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                                   (__v8si)_mm256_undefined_si256(), \
    @@ -8081,31 +8068,27 @@ _mm512_kor(__mmask16 __A, __mmask16 __B) {
       return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ int __DEFAULT_FN_ATTRS
    -_mm512_kortestc (__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_kortestc(__mmask16 __A, __mmask16 __B) {
       return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ int __DEFAULT_FN_ATTRS
    -_mm512_kortestz (__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_kortestz(__mmask16 __A, __mmask16 __B) {
       return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    -_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
    -{
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
    +_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
       return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
     }
     
    -static __inline__ unsigned char __DEFAULT_FN_ATTRS
    +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
     _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
       *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
       return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
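    With the kortest/ktest helpers now constexpr in C++ mode, trivially-known
    mask tests can fold at translation time. An illustrative check, assuming
    AVX512F is enabled and the builtin is constant-evaluatable as this patch
    intends:

        #include <immintrin.h>

        // _kortestz_mask16_u8 returns 1 when (__A | __B) == 0.
        static_assert(_kortestz_mask16_u8(0, 0) == 1,
                      "OR of two zero masks tests as all-zero");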
    diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
    index 142cc079c2c4b..25051228f3e0a 100644
    --- a/clang/lib/Headers/avx512fp16intrin.h
    +++ b/clang/lib/Headers/avx512fp16intrin.h
    @@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
                                                   (__v32hf)__A);
     }
     
    -static __inline__ __m512h __DEFAULT_FN_ATTRS512
    +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
       return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                      (__v32hi)__B);
     }
     
    -static __inline__ __m512h __DEFAULT_FN_ATTRS512
    +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
     _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
       return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
     }
    diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h
    index 625a8ff66dc60..f73b607df797f 100644
    --- a/clang/lib/Headers/avx512ifmaintrin.h
    +++ b/clang/lib/Headers/avx512ifmaintrin.h
    @@ -17,9 +17,8 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS                                                     \
    -  constexpr                                                                    \
    -      __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \
    -                     __min_vector_width__(512)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),     \
    +                 __min_vector_width__(512))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS                                                     \
       __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),     \
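
The hunk above only moves the constexpr keyword to follow the GNU attribute list; the expanded declarations are otherwise unchanged. Illustrative expansion for one of this header's intrinsics in a C++11 translation unit (a sketch, not part of the patch):

    static __inline__ __m512i
    __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),
                   __min_vector_width__(512))) constexpr
    _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z);
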
    diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
    index b377c17166ffb..51d5210e5aa5d 100644
    --- a/clang/lib/Headers/avx512ifmavlintrin.h
    +++ b/clang/lib/Headers/avx512ifmavlintrin.h
    @@ -18,13 +18,13 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512ifma,avx512vl"),                  \
    -                           __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512ifma,avx512vl"),                            \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512ifma,avx512vl"),                  \
    -                           __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512ifma,avx512vl"),                            \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__,                               \
    @@ -34,7 +34,6 @@
       __attribute__((__always_inline__, __nodebug__,                               \
                      __target__("avx512ifma,avx512vl"),                            \
                      __min_vector_width__(256)))
    -
     #endif
     
     #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__))
    diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h
    index 964535c4c4900..84fda5c5849e8 100644
    --- a/clang/lib/Headers/avx512vbmiintrin.h
    +++ b/clang/lib/Headers/avx512vbmiintrin.h
    @@ -19,59 +19,57 @@
       __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"),     \
                      __min_vector_width__(512)))
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
    -{
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
    +#endif
    +
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
       return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
                                                      (__v64qi) __B);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
    -                              __m512i __B)
    -{
    +                              __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)__A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)__I);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
     _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
    -                               __m512i __B)
    -{
    +                               __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512(__U,
                                    (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                    (__v64qi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
    -        __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                          (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                          (__v64qi)_mm512_setzero_si512());
     }
     
    -static __inline__ __m512i __DEFAULT_FN_ATTRS
    -_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
    -             __m512i __B)
    -{
    +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
    +_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
    +                             __m512i __B) {
       return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                          (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                          (__v64qi)__W);
    @@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
                                     (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                                     (__v64qi)_mm512_setzero_si512());
     }
    -
    -
    +#undef __DEFAULT_FN_ATTRS_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS
    -
     #endif
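
Unlike the ifma headers, this file derives a local _CONSTEXPR alias from the existing attribute macro instead of repeating the attribute list, and #undefs it symmetrically at the end of the header. The guard in isolation is just (sketch with hypothetical FOO_* names):

    #if defined(__cplusplus) && (__cplusplus >= 201103L)
    #define FOO_ATTRS_CONSTEXPR FOO_ATTRS constexpr /* C++11+: add constexpr */
    #else
    #define FOO_ATTRS_CONSTEXPR FOO_ATTRS           /* C / pre-C++11: no-op */
    #endif
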
    diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h
    index 4c50be7d9e7e5..58a48dadff863 100644
    --- a/clang/lib/Headers/avx512vbmivlintrin.h
    +++ b/clang/lib/Headers/avx512vbmivlintrin.h
    @@ -24,117 +24,110 @@
                      __target__("avx512vbmi,avx512vl"),                            \
                      __min_vector_width__(256)))
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
    -{
    +#if defined(__cplusplus) && (__cplusplus >= 201103L)
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
    +#else
    +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
    +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
    +#endif
    +
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
       return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
                                                      (__v16qi)__I,
                                                      (__v16qi)__B);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
    -                           __m128i __B)
    -{
    +                           __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)__I);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128(__U,
                                       (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                       (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
       return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
                                                      (__v32qi)__B);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
    -                              __m256i __B)
    -{
    +                              __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)__A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)__I);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256(__U,
                                    (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                    (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                             (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                             (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
    -          __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A,
    +                          __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                             (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                             (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
    -        __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                          (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                          (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
    -             __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A,
    +                             __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                          (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                          (__v32qi)__W);
    @@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
                                     (__v32qi)_mm256_setzero_si256());
     }
     
    -
    +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
    +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
     #undef __DEFAULT_FN_ATTRS128
     #undef __DEFAULT_FN_ATTRS256
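
Once the 128-bit byte permutes above are constant-evaluable, a compile-time byte reversal becomes expressible. A hedged sketch, assuming this patch plus a Clang where vector literals, vector casts, and element access are all accepted in constant expressions (compile with -mavx512vbmi -mavx512vl, C++11 or later):

    #include <immintrin.h>

    // Result lane i takes source byte rev_idx[i]; 15..0 reverses the order.
    constexpr __m128i rev_idx = (__m128i)(__v16qi){15, 14, 13, 12, 11, 10, 9, 8,
                                                   7, 6, 5, 4, 3, 2, 1, 0};
    constexpr __m128i bytes = (__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7,
                                                 8, 9, 10, 11, 12, 13, 14, 15};
    constexpr __m128i rev = _mm_permutexvar_epi8(rev_idx, bytes);
    static_assert(((__v16qi)rev)[0] == 15, "old last byte is now first");
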
     
    diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
    index 263a1079b26d5..d23188ab02b6c 100644
    --- a/clang/lib/Headers/avx512vlbwintrin.h
    +++ b/clang/lib/Headers/avx512vlbwintrin.h
    @@ -536,14 +536,14 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) {
                                                  (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
                                                  (__v8hi)_mm_packs_epi32(__A, __B),
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -551,7 +551,7 @@ _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -559,7 +559,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                               (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -567,7 +567,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                               (__v16hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -575,7 +575,7 @@ _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -583,7 +583,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -591,7 +591,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                               (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -599,7 +599,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                               (__v32qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -607,7 +607,7 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
    @@ -615,7 +615,7 @@ _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -623,7 +623,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                              (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
    @@ -631,7 +631,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                              (__v16hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -639,7 +639,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                                 (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
    @@ -647,7 +647,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                                 (__v16qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -655,7 +655,7 @@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                              (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
    @@ -663,7 +663,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                              (__v32qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -671,7 +671,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -679,7 +679,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -687,7 +687,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -695,7 +695,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -703,7 +703,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -711,7 +711,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -719,7 +719,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -727,7 +727,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -735,7 +735,7 @@ _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -743,7 +743,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -751,7 +751,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -759,7 +759,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -767,7 +767,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -775,7 +775,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -783,7 +783,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1095,7 +1095,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
                                              (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1103,7 +1103,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1111,7 +1111,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1119,7 +1119,7 @@ _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1127,7 +1127,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1135,7 +1135,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1143,7 +1143,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1151,7 +1151,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1159,7 +1159,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1167,7 +1167,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
    @@ -1175,7 +1175,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                                  (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1183,7 +1183,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
    @@ -1191,7 +1191,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                                 (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1199,7 +1199,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1207,7 +1207,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
           __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1215,7 +1215,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
                                                (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
                                                (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) {
       return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                      (__v8hi) __B);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
    -                            __m128i __B)
    -{
    +                            __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
    -                             __m128i __B)
    -{
    +                             __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)__I);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
    -            __m128i __B)
    -{
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
    +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I,
    +                             __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128(__U,
                                       (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                       (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) {
       return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                      (__v16hi)__B);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
    -                               __m256i __B)
    -{
    +                               __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)__A);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
    -                                __m256i __B)
    -{
    +                                __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)__I);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
    -                                 __m256i __B)
    -{
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
    +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I,
    +                                __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256(__U,
                                   (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                                   (__v16hi)_mm256_setzero_si256());
    @@ -1440,14 +1432,14 @@ _mm_cvtepi16_epi8(__m128i __A) {
           12, 13, 14, 15);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
       return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                    (__v16qi) __O,
                    __M);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
       return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                    (__v16qi) _mm_setzero_si128(),
    @@ -1596,112 +1588,112 @@ _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
           (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpackhi_epi8(__A, __B),
                                                (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                             (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                             (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                                (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                                (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                            (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                            (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                                (__v16qi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                                (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                                (__v16qi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                             (__v32qi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                             (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                             (__v32qi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                                (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                                (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                                (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                            (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                            (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                            (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1709,7 +1701,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1717,7 +1709,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1725,7 +1717,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                                  (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1734,7 +1726,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
     }
     
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1742,7 +1734,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1750,7 +1742,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
                                                  (__v8hi)_mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1758,7 +1750,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                                  (__v16hi)__W);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
    @@ -1885,7 +1877,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
                                               (__v16hi)_mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -1893,7 +1885,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
                                                  (__v8hi)__W);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
     {
       return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
    @@ -2181,7 +2173,7 @@ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
                     (__v32qi) _mm256_setzero_si256 ());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
     {
       return (__m128i) __builtin_ia32_selectb_128(__M,
    @@ -2189,7 +2181,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
                                                   (__v16qi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
     {
      return (__m128i) __builtin_ia32_selectb_128(__M,
    @@ -2197,7 +2189,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
                                                  (__v16qi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
     {
       return (__m256i) __builtin_ia32_selectb_256(__M,
    @@ -2205,7 +2197,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
                                                   (__v32qi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
     {
       return (__m256i) __builtin_ia32_selectb_256(__M,
    @@ -2536,7 +2528,7 @@ _mm256_movm_epi16 (__mmask16 __A)
       return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectb_128(__M,
    @@ -2544,7 +2536,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
                                                  (__v16qi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectb_128(__M,
    @@ -2552,7 +2544,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
                                                  (__v16qi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectb_256(__M,
    @@ -2560,7 +2552,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
                                                  (__v32qi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectb_256(__M,
    @@ -2568,7 +2560,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
                                                  (__v32qi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128(__M,
    @@ -2576,7 +2568,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
                                                  (__v8hi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
     {
       return (__m128i)__builtin_ia32_selectw_128(__M,
    @@ -2584,7 +2576,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
                                                  (__v8hi) _mm_setzero_si128());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256(__M,
    @@ -2592,7 +2584,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
                                                  (__v16hi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
     {
       return (__m256i)__builtin_ia32_selectw_256(__M,
    @@ -2600,7 +2592,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
                                                  (__v16hi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
     {
       return (__m256i) __builtin_ia32_selectw_256 (__M,
    @@ -2608,7 +2600,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
                                                    (__v16hi) __O);
     }
     
    -static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
     {
       return (__m256i) __builtin_ia32_selectw_256(__M,
    @@ -2616,7 +2608,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
                                                   (__v16hi) _mm256_setzero_si256());
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
     {
       return (__m128i) __builtin_ia32_selectw_128(__M,
    @@ -2624,7 +2616,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
                                                   (__v8hi) __O);
     }
     
    -static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_maskz_set1_epi16 (__mmask8 __M, short __A)
     {
       return (__m128i) __builtin_ia32_selectw_128(__M,
    diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h
    index 7719680faf93a..df66e1df3bf13 100644
    --- a/clang/lib/Headers/avx512vlcdintrin.h
    +++ b/clang/lib/Headers/avx512vlcdintrin.h
    @@ -16,13 +16,13 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512vl,avx512cd"),                    \
    -                           __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512vl,avx512cd"),                              \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avx512vl,avx512cd"),                    \
    -                           __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__,                               \
    +                 __target__("avx512vl,avx512cd"),                              \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__,                               \
    diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
    index 5b2b3f0d0bbd4..885231b030b23 100644
    --- a/clang/lib/Headers/avx512vlfp16intrin.h
    +++ b/clang/lib/Headers/avx512vlfp16intrin.h
    @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
                                                   (__v16hf)__A);
     }
     
    -static __inline__ __m128h __DEFAULT_FN_ATTRS128
    +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
       return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                      (__v8hi)__B);
     }
     
    -static __inline__ __m256h __DEFAULT_FN_ATTRS256
    +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
       return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                      (__v16hi)__B);
     }
     
    -static __inline__ __m128h __DEFAULT_FN_ATTRS128
    +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
     _mm_permutexvar_ph(__m128i __A, __m128h __B) {
       return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
     }
     
    -static __inline__ __m256h __DEFAULT_FN_ATTRS256
    +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
     _mm256_permutexvar_ph(__m256i __A, __m256h __B) {
       return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
     }
    diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
    index 92bb444aeb5b8..e5249926b934e 100644
    --- a/clang/lib/Headers/avx512vlintrin.h
    +++ b/clang/lib/Headers/avx512vlintrin.h
    @@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                                    (__v8sf)_mm256_setzero_ps());
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
         return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
                                                       (__v4si)__B);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
                                   __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)__A);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)__I);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectd_128(__U,
    @@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4si)_mm_setzero_si128());
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
         return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
                                                       (__v8si) __B);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
                                      __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)__A);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)__I);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectd_256(__U,
    @@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v8si)_mm256_setzero_si256());
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
         return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
                                                        (__v2df)__B);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I,
    +                           __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)__A);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U,
    +                            __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)(__m128d)__I);
       }
     
    -  static __inline__ __m128d __DEFAULT_FN_ATTRS128
    -  _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
    +  static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
    +  _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I,
    +                            __m128d __B) {
         return (__m128d)__builtin_ia32_selectpd_128(__U,
                                            (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                            (__v2df)_mm_setzero_pd());
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
         return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
                                                        (__v4df)__B);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
                                   __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)__A);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
                                    __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)(__m256d)__I);
       }
     
    -  static __inline__ __m256d __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
                                    __m256d __B) {
         return (__m256d)__builtin_ia32_selectpd_256(__U,
    @@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v4df)_mm256_setzero_pd());
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
                                                       (__v4sf)__B);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)__A);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)(__m128)__I);
       }
     
    -  static __inline__ __m128 __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
         return (__m128)__builtin_ia32_selectps_128(__U,
                                            (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                            (__v4sf)_mm_setzero_ps());
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
         return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
                                                       (__v8sf) __B);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    -  _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
    +  _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I,
    +                              __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
                                         (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
                                         (__v8sf)__A);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
                                    __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
    @@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v8sf)(__m256)__I);
       }
     
    -  static __inline__ __m256 __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
                                    __m256 __B) {
         return (__m256)__builtin_ia32_selectps_256(__U,
    @@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v8sf)_mm256_setzero_ps());
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
         return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
                                                       (__v2di)__B);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
                                   __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)__A);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)__I);
       }
     
    -  static __inline__ __m128i __DEFAULT_FN_ATTRS128
    +  static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
       _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
                                    __m128i __B) {
         return (__m128i)__builtin_ia32_selectq_128(__U,
    @@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                         (__v2di)_mm_setzero_si128());
       }
     
    -
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
         return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
                                                       (__v4di) __B);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
                                      __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    @@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v4di)__A);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    @@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                      (__v4di)__I);
       }
     
    -  static __inline__ __m256i __DEFAULT_FN_ATTRS256
    +  static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
       _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
                                       __m256i __B) {
         return (__m256i)__builtin_ia32_selectq_256(__U,
    diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
    index e452d5f0920e9..30df01caed6cf 100644
    --- a/clang/lib/Headers/avxifmaintrin.h
    +++ b/clang/lib/Headers/avxifmaintrin.h
    @@ -17,11 +17,11 @@
     /* Define the default attributes for the functions in this file. */
     #if defined(__cplusplus) && (__cplusplus >= 201103L)
     #define __DEFAULT_FN_ATTRS128                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avxifma"), __min_vector_width__(128)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    +                 __min_vector_width__(128))) constexpr
     #define __DEFAULT_FN_ATTRS256                                                  \
    -  constexpr __attribute__((__always_inline__, __nodebug__,                     \
    -                           __target__("avxifma"), __min_vector_width__(256)))
    +  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    +                 __min_vector_width__(256))) constexpr
     #else
     #define __DEFAULT_FN_ATTRS128                                                  \
       __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
    diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    index a918af39e4074..2e2703de18cb1 100644
    --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
    @@ -1053,76 +1053,25 @@ _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2)
     float4 exp2(float4);
     
     //===----------------------------------------------------------------------===//
    -// firstbithigh builtins
    +// f16tof32 builtins
     //===----------------------------------------------------------------------===//
     
    -/// \fn T firstbithigh(T Val)
    -/// \brief Returns the location of the first set bit starting from the highest
    -/// order bit and working downward, per component.
    -/// \param Val the input value.
    -
    -#ifdef __HLSL_ENABLE_16_BIT
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int16_t);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int16_t2);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int16_t3);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int16_t4);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint16_t);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint16_t2);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint16_t3);
    -_HLSL_AVAILABILITY(shadermodel, 6.2)
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint16_t4);
    -#endif
    +/// \fn float f16tof32(uint x)
    +/// \brief Returns the half value stored in the low 16 bits of the uint arg
    +/// converted to a float.
    +/// \param x The uint containing two half values.
    +///
     +/// The float value of the half value found in the low 16 bits of the \a x
    +/// parameter.
     
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(int64_t);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(int64_t2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(int64_t3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(int64_t4);
    -
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint firstbithigh(uint64_t);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint2 firstbithigh(uint64_t2);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint3 firstbithigh(uint64_t3);
    -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
    -uint4 firstbithigh(uint64_t4);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float f16tof32(uint);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float2 f16tof32(uint2);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float3 f16tof32(uint3);
    +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32)
    +float4 f16tof32(uint4);
     
     //===----------------------------------------------------------------------===//
     // firstbitlow builtins
    @@ -2090,9 +2039,17 @@ T select(bool, T, T);
     /// \param FalseVals The vector values are chosen from when conditions are
     /// false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, vector<T, 2>);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>);
     +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, vector<T, 3>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, vector<T, 4>);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal,
      ///                         vector<T,Sz> FalseVals)
     @@ -2102,9 +2059,17 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>);
     /// \param FalseVals The vector values are chosen from when conditions are
     /// false.
     
     -template <typename T, int Sz>
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>);
     +vector<T, 2> select(vector<bool, 2>, T, vector<T, 2>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 3> select(vector<bool, 3>, T, vector<T, 3>);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 4> select(vector<bool, 4>, T, vector<T, 4>);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals,
      ///                         T FalseVal)
     @@ -2113,9 +2078,17 @@ vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>);
     /// \param TrueVals The vector values are chosen from when conditions are true.
     /// \param FalseVal The scalar value to splat from when conditions are false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, T);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, T);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T);
     +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, T);
     
      /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal,
      ///                         T FalseVal)
     @@ -2124,10 +2097,20 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T);
     /// \param TrueVal The scalar value to splat from when conditions are true.
     /// \param FalseVal The scalar value to splat from when conditions are false.
     
     -template <typename T, int Sz>
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 2>> select(
     +    vector<bool, 2>, T, T);
     +
     +template <typename T>
     +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 3>> select(
     +    vector<bool, 3>, T, T);
     +
     +template <typename T>
      _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select)
     -__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, Sz>> select(
     -    vector<bool, Sz>, T, T);
     +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 4>> select(
     +    vector<bool, 4>, T, T);
     
     //===----------------------------------------------------------------------===//
     // sin builtins
    diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    index fe4277ed4a7d2..ee243abef6a41 100644
    --- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
    @@ -7,7 +7,7 @@
     //===----------------------------------------------------------------------===//
     
     #ifndef _HLSL_COMPAT_OVERLOADS_H_
    -#define _HLSl_COMPAT_OVERLOADS_H_
    +#define _HLSL_COMPAT_OVERLOADS_H_
     
     namespace hlsl {
     
    diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
    index c877234479ad1..3d8fe7ea701a6 100644
    --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
    +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h
     @@ -148,6 +148,18 @@ template <typename T> constexpr T ldexp_impl(T X, T Exp) {
       return exp2(Exp) * X;
     }
     
     +template <typename T, typename K, int BitWidth>
    +constexpr K firstbithigh_impl(T X) {
    +  K FBH = __builtin_hlsl_elementwise_firstbithigh(X);
    +#if defined(__DIRECTX__)
    +  // The firstbithigh DXIL ops count bits from the wrong side, so we need to
    +  // invert it for DirectX.
    +  K Inversion = (BitWidth - 1) - FBH;
    +  FBH = select(FBH == -1, FBH, Inversion);
    +#endif
    +  return FBH;
    +}
    +
     } // namespace __detail
     } // namespace hlsl
     
    diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
    index 5ba5bfb9abde0..33ed14328ee8a 100644
    --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
    +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
     @@ -261,6 +261,67 @@ faceforward(__detail::HLSL_FIXED_VECTOR<float, L> N,
       return __detail::faceforward_impl(N, I, Ng);
     }
     
    +//===----------------------------------------------------------------------===//
    +// firstbithigh builtins
    +//===----------------------------------------------------------------------===//
    +
    +/// \fn T firstbithigh(T Val)
     +/// \brief Returns the location of the first set bit starting from the highest
     +/// order bit and working downward, per component.
    +/// \param Val the input value.
    +
    +#ifdef __HLSL_ENABLE_16_BIT
    +
     +template <typename T>
     +_HLSL_AVAILABILITY(shadermodel, 6.2)
     +const inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
     +                                       __detail::is_same<uint16_t, T>::value,
     +                                   uint> firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 16>(X);
    +}
    +
     +template <typename T, int N>
     +_HLSL_AVAILABILITY(shadermodel, 6.2)
     +const
     +    inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value ||
     +                                     __detail::is_same<uint16_t, T>::value,
     +                                 vector<uint, N>> firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 16>(X);
    +}
    +
    +#endif
    +
     +template <typename T>
     +const inline __detail::enable_if_t<
     +    __detail::is_same<int, T>::value || __detail::is_same<uint, T>::value, uint>
     +firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 32>(X);
    +}
    +
     +template <typename T, int N>
     +const inline __detail::enable_if_t<__detail::is_same<int, T>::value ||
     +                                       __detail::is_same<uint, T>::value,
     +                                   vector<uint, N>>
     +firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 32>(X);
    +}
    +
     +template <typename T>
     +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
     +                                       __detail::is_same<uint64_t, T>::value,
     +                                   uint>
     +firstbithigh(T X) {
     +  return __detail::firstbithigh_impl<T, uint, 64>(X);
    +}
    +
     +template <typename T, int N>
     +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value ||
     +                                       __detail::is_same<uint64_t, T>::value,
     +                                   vector<uint, N>>
     +firstbithigh(vector<T, N> X) {
     +  return __detail::firstbithigh_impl<vector<T, N>, vector<uint, N>, 64>(X);
    +}
    +
     //===----------------------------------------------------------------------===//
     // fmod builtins
     //===----------------------------------------------------------------------===//
    diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h
    index fd120a589f64f..19309a40d6dd1 100644
    --- a/clang/lib/Headers/hvx_hexagon_protos.h
    +++ b/clang/lib/Headers/hvx_hexagon_protos.h
    @@ -5605,6 +5605,399 @@
       __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv)
     #endif /* __HEXAGON_ARCH___ >= 79 */
     
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vabs(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vabs_Vhf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vabs(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vabs_Vqf16(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vabs(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vabs_Vqf32(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vabs(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vabs_Vsf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32=valign4(Vu32,Vv32,Rt8)
     +   C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu,
     +                          HVX_Vector Vv, Word32 Rt)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_V_valign4_VVR(Vu, Vv, Rt)                                           \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.bf=Vuu32.qf32
    +   C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vbf_equals_Wqf32(Vuu)                                               \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.f8=Vu32.qf16
    +   C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_V_equals_Vqf16(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.h=Vu32.hf:rnd
    +   C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vh_equals_Vhf_rnd(Vu)                                               \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vdd32.qf16=Vu32.f8
    +   C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu)
    +   Instruction Type:      CVI_VP_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Wqf16_equals_V(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=Vu32.hf
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_equals_Vhf(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=Vu32.qf16
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_equals_Vqf16(Vu)                                              \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=Vu32.qf32
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_equals_Vqf32(Vu)                                              \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=Vu32.sf
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_equals_Vsf(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qd4=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv)                                            \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv)                                    \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv)                                     \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)(                   \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.hf,Vv32.hf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv)                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qd4=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv)                                            \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4&=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv)                                    \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4|=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv)                                     \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)(                   \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Qx4^=vcmp.eq(Vu32.sf,Vv32.sf)
     +   C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(
     +                          HVX_VectorPred Qx, HVX_Vector Vu, HVX_Vector Vv)
     +   Instruction Type:      CVI_VA
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv)                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)(                         \
    +      (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)(                  \
    +          __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu,   \
    +          Vv)),                                                                \
    +      -1)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vhf(Vu)                                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vqf16(Vu)                                                 \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vqf32(Vu)                                                 \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.w=vilog2(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vw_vilog2_Vsf(Vu)                                                   \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vneg(Vu32.hf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vneg_Vhf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vneg(Vu32.qf16)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vneg_Vqf16(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vneg(Vu32.qf32)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vneg_Vqf32(Vu)                                                \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vneg(Vu32.sf)
    +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu)
    +   Instruction Type:      CVI_VS
    +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vneg_Vsf(Vu)                                                  \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf16=vsub(Vu32.hf,Vv32.qf16)
     +   C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VS
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv)                                         \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
    +#if __HVX_ARCH__ >= 81
    +/* ==========================================================================
    +   Assembly Syntax:       Vd32.qf32=vsub(Vu32.sf,Vv32.qf32)
     +   C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu,
     +                          HVX_Vector Vv)
     +   Instruction Type:      CVI_VS
     +   Execution Slots:       SLOT0123
    +   ========================================================================== */
    +
    +#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv)                                         \
    +  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv)
    +#endif /* __HEXAGON_ARCH___ >= 81 */
    +
     #endif /* __HVX__ */
     
     #endif
    diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
    index 35f012cc70043..19064a4ff5cea 100644
    --- a/clang/lib/Headers/immintrin.h
    +++ b/clang/lib/Headers/immintrin.h
    @@ -475,24 +475,12 @@ _storebe_i64(void * __P, long long __D) {
     
     #include 
     
    -#include 
    -
     #include 
     
    -#include 
    -
     #include 
     
     #include 
     
    -#include 
    -
    -#include 
    -
    -#include 
    -
    -#include 
    -
     #include 
     
     #include 
    diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    index 1da22abd0bc48..d79e7fa041ad4 100644
    --- a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    +++ b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h
    @@ -34,13 +34,13 @@ _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
     _Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!");
     _Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!");
     
    -#if defined(__GLIBC__) && __cplusplus >= 201703L
    +#if defined(__GLIBC__) && __cplusplus >= 201103L
     #define at_quick_exit atexit
     #endif
     
      #include <llvm-libc-decls/stdlib.h>
     
    -#if defined(__GLIBC__) && __cplusplus >= 201703L
    +#if defined(__GLIBC__) && __cplusplus >= 201103L
     #undef at_quick_exit
     #endif
     
    diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
    index 2e4d533356569..c13dd3fd48ac8 100644
    --- a/clang/lib/Headers/module.modulemap
    +++ b/clang/lib/Headers/module.modulemap
    @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] {
       export *
     }
     
    +module _Builtin_stdckdint [system] {
    +  header "stdckdint.h"
    +  export *
    +}
    +
     module _Builtin_stdcountof [system] {
       header "stdcountof.h"
       export *
    diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
    index 42bd343e326de..6b152bde29fc1 100644
    --- a/clang/lib/Headers/pmmintrin.h
    +++ b/clang/lib/Headers/pmmintrin.h
    @@ -166,7 +166,7 @@ _mm_moveldup_ps(__m128 __a)
     ///    A 128-bit vector of [2 x double] containing the right source operand.
     /// \returns A 128-bit vector of [2 x double] containing the alternating sums
     ///    and differences of both operands.
    -static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
     _mm_addsub_pd(__m128d __a, __m128d __b) {
       return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
     }
    diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
    index cde354c9cd8d1..7764fa7dc92b9 100644
    --- a/clang/lib/Interpreter/Interpreter.cpp
    +++ b/clang/lib/Interpreter/Interpreter.cpp
    @@ -33,7 +33,6 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/FrontendAction.h"
    @@ -43,6 +42,7 @@
     #include "clang/Interpreter/Interpreter.h"
     #include "clang/Interpreter/Value.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Sema/Lookup.h"
     #include "clang/Serialization/ObjectFilePCHContainerReader.h"
     #include "llvm/ExecutionEngine/JITSymbol.h"
    @@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT,
        llvm::ArrayRef<const char *> RF = llvm::ArrayRef(ClangArgv);
        std::unique_ptr<driver::Compilation> Compilation(Driver.BuildCompilation(RF));
     
    -  if (Compilation->getArgs().hasArg(driver::options::OPT_v))
    +  if (Compilation->getArgs().hasArg(options::OPT_v))
         Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false);
     
       auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get());
    @@ -394,36 +394,48 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
     
      llvm::Expected<std::string>
     Interpreter::getOrcRuntimePath(const driver::ToolChain &TC) {
     -  std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath();
     -  std::optional<std::string> ResourceDir = TC.getRuntimePath();
     +  const std::array<const char *, 3> OrcRTLibNames = {
    +      "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
    +
     +  auto findInDir = [&](llvm::StringRef Base) -> std::optional<std::string> {
    +    for (const char *LibName : OrcRTLibNames) {
    +      llvm::SmallString<256> CandidatePath(Base);
    +      llvm::sys::path::append(CandidatePath, LibName);
    +      if (llvm::sys::fs::exists(CandidatePath))
    +        return std::string(CandidatePath.str());
    +    }
    +    return std::nullopt;
    +  };
    +
    +  std::string SearchedPaths;
     
    -  if (!CompilerRTPath) {
     +  if (std::optional<std::string> CompilerRTPath = TC.getCompilerRTPath()) {
    +    if (auto Found = findInDir(*CompilerRTPath))
    +      return *Found;
    +    SearchedPaths += *CompilerRTPath;
    +  } else {
         return llvm::make_error("CompilerRT path not found",
                                                    std::error_code());
       }
     
     -  const std::array<const char *, 3> OrcRTLibNames = {
    -      "liborc_rt.a", "liborc_rt_osx.a", "liborc_rt-x86_64.a"};
    -
    -  for (const char *LibName : OrcRTLibNames) {
    -    llvm::SmallString<256> CandidatePath((*CompilerRTPath).c_str());
    -    llvm::sys::path::append(CandidatePath, LibName);
    -
    -    if (llvm::sys::fs::exists(CandidatePath)) {
    -      return CandidatePath.str().str();
    -    }
     +  if (std::optional<std::string> ResourceDir = TC.getRuntimePath()) {
    +    if (auto Found = findInDir(*ResourceDir))
    +      return *Found;
    +    if (!SearchedPaths.empty())
    +      SearchedPaths += "; ";
    +    SearchedPaths += *ResourceDir;
    +  } else {
    +    return llvm::make_error("ResourceDir path not found",
    +                                               std::error_code());
       }
     
        return llvm::make_error<llvm::StringError>(
    -      llvm::Twine("OrcRuntime library not found in: ") + (*CompilerRTPath),
    +      llvm::Twine("OrcRuntime library not found in: ") + SearchedPaths,
           std::error_code());
     }
     
      llvm::Expected<std::unique_ptr<Interpreter>>
      Interpreter::create(std::unique_ptr<CompilerInstance> CI, JITConfig Config) {
    -  llvm::Error Err = llvm::Error::success();
    -
     -  std::unique_ptr<llvm::orc::LLJITBuilder> JB;
     
       if (Config.IsOutOfProcess) {
         const TargetInfo &TI = CI->getTarget();
    @@ -453,6 +465,9 @@ Interpreter::create(std::unique_ptr CI, JITConfig Config) {
         }
       }
     
    +  llvm::Error Err = llvm::Error::success();
     +  std::unique_ptr<llvm::orc::LLJITBuilder> JB;
    +
        auto Interp = std::unique_ptr<Interpreter>(new Interpreter(
           std::move(CI), Err, std::move(JB), /*Consumer=*/nullptr, Config));
       if (auto E = std::move(Err))
    diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h
    index fbf9814b0d4a7..4efe8b9fbc6cc 100644
    --- a/clang/lib/Interpreter/InterpreterUtils.h
    +++ b/clang/lib/Interpreter/InterpreterUtils.h
    @@ -21,11 +21,11 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/TextDiagnosticBuffer.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     
     #include "clang/Sema/Lookup.h"
     #include "llvm/IR/Module.h"
    diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
    index 637a08fe4dcdb..b8202ea11be36 100644
    --- a/clang/lib/Lex/ModuleMap.cpp
    +++ b/clang/lib/Lex/ModuleMap.cpp
    @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) {
                .Case("stdarg.h", true)
                .Case("stdatomic.h", true)
                .Case("stdbool.h", true)
    +           .Case("stdckdint.h", true)
                .Case("stdcountof.h", true)
                .Case("stddef.h", true)
                .Case("stdint.h", true)
    diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
    index 6a5e5d4bad3a6..891c8ab7f3155 100644
    --- a/clang/lib/Lex/PPDirectives.cpp
    +++ b/clang/lib/Lex/PPDirectives.cpp
    @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok,
           this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile);
       if (!MaybeFileRef) {
         // could not find file
    -    if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) {
    +    if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) {
           return;
         }
         Diag(FilenameTok, diag::err_pp_file_not_found) << Filename;
    diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
    index dd80ae586a1f6..5efa4b5b3f872 100644
    --- a/clang/lib/Lex/PPMacroExpansion.cpp
    +++ b/clang/lib/Lex/PPMacroExpansion.cpp
    @@ -1735,7 +1735,19 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
           Diag(getLastFPEvalPragmaLocation(), diag::note_pragma_entered_here);
         }
       } else if (II == Ident__COUNTER__) {
    -    // __COUNTER__ expands to a simple numeric value.
    +    Diag(Tok.getLocation(),
    +         getLangOpts().C2y ? diag::warn_counter : diag::ext_counter);
    +    // __COUNTER__ expands to a simple numeric value that must be less than
    +    // 2147483647.
     +    constexpr uint32_t MaxPosValue = std::numeric_limits<int32_t>::max();
    +    if (CounterValue > MaxPosValue) {
    +      Diag(Tok.getLocation(), diag::err_counter_overflow);
    +      // Retain the maximal value so we don't issue conversion-related
    +      // diagnostics by overflowing into a long long. While this does produce
    +      // a duplicate value, there's no way to ignore this error so there's no
    +      // translation anyway.
    +      CounterValue = MaxPosValue;
    +    }
         OS << CounterValue++;
         Tok.setKind(tok::numeric_constant);
       } else if (II == Ident__has_feature) {
    diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt
    new file mode 100644
    index 0000000000000..a762e9918b41c
    --- /dev/null
    +++ b/clang/lib/Options/CMakeLists.txt
    @@ -0,0 +1,18 @@
    +set(LLVM_LINK_COMPONENTS
    +  Option
    +  Support
    +)
    +
    +add_clang_library(clangOptions
    +  DriverOptions.cpp
    +  OptionUtils.cpp
    +  
    +  DEPENDS
    +  ClangDriverOptions
    +  # These generated headers are included transitively.
    +  target_parser_gen
    +
    +  LINK_LIBS
    +  clangBasic
    +  ${system_libs}
    +)
    diff --git a/clang/lib/Driver/DriverOptions.cpp b/clang/lib/Options/DriverOptions.cpp
    similarity index 76%
    rename from clang/lib/Driver/DriverOptions.cpp
    rename to clang/lib/Options/DriverOptions.cpp
    index cde1f8989935b..d91e9291fb2f6 100644
    --- a/clang/lib/Driver/DriverOptions.cpp
    +++ b/clang/lib/Options/DriverOptions.cpp
    @@ -6,33 +6,32 @@
     //
     //===----------------------------------------------------------------------===//
     
    -#include "clang/Driver/Options.h"
    +#include "clang/Options/Options.h"
     #include "llvm/Option/OptTable.h"
     #include 
     
    -using namespace clang::driver;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm::opt;
     
     #define OPTTABLE_STR_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_STR_TABLE_CODE
     
     #define OPTTABLE_VALUES_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_VALUES_CODE
     
     #define OPTTABLE_PREFIXES_TABLE_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_PREFIXES_TABLE_CODE
     
     #define OPTTABLE_PREFIXES_UNION_CODE
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTTABLE_PREFIXES_UNION_CODE
     
     static constexpr OptTable::Info InfoTable[] = {
     #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
    -#include "clang/Driver/Options.inc"
    +#include "clang/Options/Options.inc"
     #undef OPTION
     };
     
    @@ -44,9 +43,9 @@ class DriverOptTable : public PrecomputedOptTable {
           : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable,
                                 OptionPrefixesUnion) {}
     };
    -}
    +} // anonymous namespace
     
    -const llvm::opt::OptTable &clang::driver::getDriverOptTable() {
    +const llvm::opt::OptTable &clang::getDriverOptTable() {
       static DriverOptTable Table;
       return Table;
     }
    diff --git a/clang/lib/Driver/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp
    similarity index 97%
    rename from clang/lib/Driver/OptionUtils.cpp
    rename to clang/lib/Options/OptionUtils.cpp
    index 1f36ffc03cab3..fcafd3c83c6b3 100644
    --- a/clang/lib/Driver/OptionUtils.cpp
    +++ b/clang/lib/Options/OptionUtils.cpp
    @@ -6,9 +6,9 @@
     //
     //===----------------------------------------------------------------------===//
     
    +#include "clang/Options/OptionUtils.h"
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/DiagnosticDriver.h"
    -#include "clang/Driver/OptionUtils.h"
     #include "llvm/Option/ArgList.h"
     
     using namespace clang;
    diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
    index e4b158e4a6248..8688ccf41acb5 100644
    --- a/clang/lib/Parse/ParseDecl.cpp
    +++ b/clang/lib/Parse/ParseDecl.cpp
    @@ -2613,7 +2613,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
           }
     
           PreferredType.enterVariableInit(Tok.getLocation(), ThisDecl);
    -      ExprResult Init = ParseInitializer();
    +      ExprResult Init = ParseInitializer(ThisDecl);
     
           // If this is the only decl in (possibly) range based for statement,
           // our best guess is that the user meant ':' instead of '='.
    @@ -4248,6 +4248,13 @@ void Parser::ParseDeclarationSpecifiers(
     
         // type-specifier
         case tok::kw_short:
    +      if (!getLangOpts().NativeInt16Type) {
    +        Diag(Tok, diag::err_unknown_typename) << Tok.getName();
    +        DS.SetTypeSpecError();
    +        DS.SetRangeEnd(Tok.getLocation());
    +        ConsumeToken();
    +        goto DoneWithDeclSpec;
    +      }
           isInvalid = DS.SetTypeSpecWidth(TypeSpecifierWidth::Short, Loc, PrevSpec,
                                           DiagID, Policy);
           break;
    @@ -5363,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl,
       T.consumeOpen();
     
       // C does not allow an empty enumerator-list, C++ does [dcl.enum].
    -  if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus)
    -    Diag(Tok, diag::err_empty_enum);
    +  if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) {
    +    if (getLangOpts().MicrosoftExt)
    +      Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type)
    +          << SourceRange(T.getOpenLocation(), Tok.getLocation());
    +    else
    +      Diag(Tok, diag::err_empty_enum);
    +  }
     
       SmallVector EnumConstantDecls;
       SmallVector EnumAvailabilityDiags;
    diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
    index b96968d4592f5..d8ed7e3ff96bd 100644
    --- a/clang/lib/Parse/ParseDeclCXX.cpp
    +++ b/clang/lib/Parse/ParseDeclCXX.cpp
    @@ -3359,7 +3359,7 @@ ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction,
         Diag(Tok, diag::err_ms_property_initializer) << PD;
         return ExprError();
       }
    -  return ParseInitializer();
    +  return ParseInitializer(D);
     }
     
     void Parser::SkipCXXMemberSpecification(SourceLocation RecordLoc,
    diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
    index 74f87a8cb63c3..7a5d28caf8521 100644
    --- a/clang/lib/Parse/ParseExprCXX.cpp
    +++ b/clang/lib/Parse/ParseExprCXX.cpp
    @@ -772,9 +772,11 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
     
       // Produce a diagnostic if we're not tentatively parsing; otherwise track
       // that our parse has failed.
     -  auto Invalid = [&](llvm::function_ref<void()> Action) {
     +  auto Result = [&](llvm::function_ref<void()> Action,
     +                    LambdaIntroducerTentativeParse State =
     +                        LambdaIntroducerTentativeParse::Invalid) {
         if (Tentative) {
    -      *Tentative = LambdaIntroducerTentativeParse::Invalid;
    +      *Tentative = State;
           return false;
         }
         Action();
    @@ -824,7 +826,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
               break;
             }
     
    -        return Invalid([&] {
    +        return Result([&] {
               Diag(Tok.getLocation(), diag::err_expected_comma_or_rsquare);
             });
           }
    @@ -861,7 +863,7 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
             ConsumeToken();
             Kind = LCK_StarThis;
           } else {
    -        return Invalid([&] {
    +        return Result([&] {
               Diag(Tok.getLocation(), diag::err_expected_star_this_capture);
             });
           }
    @@ -875,8 +877,9 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
           // or the start of a capture (in the "&" case) with the rest of the
           // capture missing. Both are an error but a misplaced capture-default
           // is more likely if we don't already have a capture default.
    -      return Invalid(
    -          [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); });
    +      return Result(
    +          [&] { Diag(Tok.getLocation(), diag::err_capture_default_first); },
    +          LambdaIntroducerTentativeParse::Incomplete);
         } else {
           TryConsumeToken(tok::ellipsis, EllipsisLocs[0]);
     
    @@ -899,14 +902,13 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
             Id = Tok.getIdentifierInfo();
             Loc = ConsumeToken();
           } else if (Tok.is(tok::kw_this)) {
    -        return Invalid([&] {
    +        return Result([&] {
               // FIXME: Suggest a fixit here.
               Diag(Tok.getLocation(), diag::err_this_captured_by_reference);
             });
           } else {
    -        return Invalid([&] {
    -          Diag(Tok.getLocation(), diag::err_expected_capture);
    -        });
    +        return Result(
    +            [&] { Diag(Tok.getLocation(), diag::err_expected_capture); });
           }
     
           TryConsumeToken(tok::ellipsis, EllipsisLocs[2]);
    diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
    index a3be3744a9327..0e86c4c48d5e4 100644
    --- a/clang/lib/Parse/ParseInit.cpp
    +++ b/clang/lib/Parse/ParseInit.cpp
    @@ -581,3 +581,26 @@ bool Parser::ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs,
     
       return !trailingComma;
     }
    +
    +ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) {
    +  // Set DeclForInitializer for file-scope variables.
    +  // For constexpr references, set it to suppress runtime warnings.
    +  // For non-constexpr references, don't set it to avoid evaluation issues
    +  // with self-referencing initializers. Local variables (including local
    +  // constexpr) should emit runtime warnings.
    +  if (DeclForInitializer && !Actions.ExprEvalContexts.empty()) {
     +    if (auto *VD = dyn_cast<VarDecl>(DeclForInitializer);
    +        VD && VD->isFileVarDecl() &&
    +        (!VD->getType()->isReferenceType() || VD->isConstexpr()))
    +      Actions.ExprEvalContexts.back().DeclForInitializer = VD;
    +  }
    +
    +  ExprResult init;
    +  if (Tok.isNot(tok::l_brace)) {
    +    init = ParseAssignmentExpression();
    +  } else {
    +    init = ParseBraceInitializer();
    +  }
    +
    +  return init;
    +}
    diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
    index 31bc941e6a015..32a406e2c065f 100644
    --- a/clang/lib/Parse/ParseOpenMP.cpp
    +++ b/clang/lib/Parse/ParseOpenMP.cpp
    @@ -339,7 +339,7 @@ void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) {
         }
     
         PreferredType.enterVariableInit(Tok.getLocation(), OmpPrivParm);
    -    ExprResult Init = ParseInitializer();
    +    ExprResult Init = ParseInitializer(OmpPrivParm);
     
         if (Init.isInvalid()) {
           SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch);
    @@ -3178,6 +3178,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
       case OMPC_align:
       case OMPC_message:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
         // OpenMP [2.5, Restrictions]
         //  At most one num_threads clause can appear on the directive.
         // OpenMP [2.8.1, simd construct, Restrictions]
    @@ -3216,7 +3217,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
             PP.LookAhead(/*N=*/0).isNot(tok::l_paren))
           Clause = ParseOpenMPClause(CKind, WrongDirective);
         else if (CKind == OMPC_grainsize || CKind == OMPC_num_tasks ||
    -             CKind == OMPC_num_threads)
    +             CKind == OMPC_num_threads || CKind == OMPC_dyn_groupprivate)
           Clause = ParseOpenMPSingleExprWithArgClause(DKind, CKind, WrongDirective);
         else
           Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
    @@ -4009,6 +4010,83 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
           Arg.push_back(OMPC_GRAINSIZE_unknown);
           KLoc.emplace_back();
         }
    +  } else if (Kind == OMPC_dyn_groupprivate) {
    +    enum { SimpleModifier, ComplexModifier, NumberOfModifiers };
    +    Arg.resize(NumberOfModifiers);
    +    KLoc.resize(NumberOfModifiers);
    +    Arg[SimpleModifier] = OMPC_DYN_GROUPPRIVATE_unknown;
    +    Arg[ComplexModifier] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
    +
    +    auto ConsumeModifier = [&]() {
    +      unsigned Type = NumberOfModifiers;
    +      unsigned Modifier;
    +      SourceLocation Loc;
    +      if (!Tok.isAnnotation() && PP.getSpelling(Tok) == "fallback" &&
    +          NextToken().is(tok::l_paren)) {
    +        ConsumeToken();
    +        BalancedDelimiterTracker ParenT(*this, tok::l_paren, tok::r_paren);
    +        ParenT.consumeOpen();
    +
    +        Modifier = getOpenMPSimpleClauseType(
    +            Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
    +        if (Modifier <= OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown ||
    +            Modifier >= OMPC_DYN_GROUPPRIVATE_FALLBACK_last) {
    +          Diag(Tok.getLocation(), diag::err_expected)
    +              << "'abort', 'null' or 'default_mem' in fallback modifier";
    +          SkipUntil(tok::r_paren);
    +          return std::make_tuple(Type, Modifier, Loc);
    +        }
    +        Type = ComplexModifier;
    +        Loc = Tok.getLocation();
    +        if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
    +            Tok.isNot(tok::annot_pragma_openmp_end))
    +          ConsumeAnyToken();
    +        ParenT.consumeClose();
    +      } else {
    +        Modifier = getOpenMPSimpleClauseType(
    +            Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
    +        if (Modifier < OMPC_DYN_GROUPPRIVATE_unknown) {
    +          Type = SimpleModifier;
    +          Loc = Tok.getLocation();
    +          if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
    +              Tok.isNot(tok::annot_pragma_openmp_end))
    +            ConsumeAnyToken();
    +        }
    +      }
    +      return std::make_tuple(Type, Modifier, Loc);
    +    };
    +
    +    auto SaveModifier = [&](unsigned Type, unsigned Modifier,
    +                            SourceLocation Loc) {
    +      assert(Type < NumberOfModifiers && "Unexpected modifier type");
    +      if (!KLoc[Type].isValid()) {
    +        Arg[Type] = Modifier;
    +        KLoc[Type] = Loc;
    +      } else {
    +        Diag(Loc, diag::err_omp_incompatible_dyn_groupprivate_modifier)
    +            << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Modifier)
    +            << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Arg[Type]);
    +      }
    +    };
    +
    +    // Parse 'modifier'
    +    auto [Type1, Mod1, Loc1] = ConsumeModifier();
    +    if (Type1 < NumberOfModifiers) {
    +      SaveModifier(Type1, Mod1, Loc1);
    +      if (Tok.is(tok::comma)) {
    +        // Parse ',' 'modifier'
    +        ConsumeAnyToken();
    +        auto [Type2, Mod2, Loc2] = ConsumeModifier();
    +        if (Type2 < NumberOfModifiers)
    +          SaveModifier(Type2, Mod2, Loc2);
    +      }
    +      // Parse ':'
    +      if (Tok.is(tok::colon))
    +        ConsumeAnyToken();
    +      else
    +        Diag(Tok, diag::warn_pragma_expected_colon)
    +            << "dyn_groupprivate modifier";
    +    }
       } else if (Kind == OMPC_num_tasks) {
         // Parse optional  ':'
         OpenMPNumTasksClauseModifier Modifier =
    @@ -4083,11 +4161,11 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
         }
       }
     
    -  bool NeedAnExpression = (Kind == OMPC_schedule && DelimLoc.isValid()) ||
    -                          (Kind == OMPC_dist_schedule && DelimLoc.isValid()) ||
    -                          Kind == OMPC_if || Kind == OMPC_device ||
    -                          Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
    -                          Kind == OMPC_num_threads;
    +  bool NeedAnExpression =
    +      (Kind == OMPC_schedule && DelimLoc.isValid()) ||
    +      (Kind == OMPC_dist_schedule && DelimLoc.isValid()) || Kind == OMPC_if ||
    +      Kind == OMPC_device || Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
    +      Kind == OMPC_num_threads || Kind == OMPC_dyn_groupprivate;
       if (NeedAnExpression) {
         SourceLocation ELoc = Tok.getLocation();
         ExprResult LHS(
    diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
    index 92038985f9163..7e73d89c2a18c 100644
    --- a/clang/lib/Parse/ParseStmt.cpp
    +++ b/clang/lib/Parse/ParseStmt.cpp
    @@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
               return StmtError();
           }
         } else {
    -      LHS = Expr;
    +      LHS = Actions.ActOnCaseExpr(CaseLoc, Expr);
           MissingCase = false;
         }
     
    @@ -1079,16 +1079,10 @@ bool Parser::ConsumeNullStmt(StmtVector &Stmts) {
     StmtResult Parser::handleExprStmt(ExprResult E, ParsedStmtContext StmtCtx) {
       bool IsStmtExprResult = false;
       if ((StmtCtx & ParsedStmtContext::InStmtExpr) != ParsedStmtContext()) {
    -    // For GCC compatibility we skip past NullStmts.
    -    unsigned LookAhead = 0;
    -    while (GetLookAheadToken(LookAhead).is(tok::semi)) {
    -      ++LookAhead;
    -    }
    -    // Then look to see if the next two tokens close the statement expression;
    -    // if so, this expression statement is the last statement in a statement
    -    // expression.
    -    IsStmtExprResult = GetLookAheadToken(LookAhead).is(tok::r_brace) &&
    -                       GetLookAheadToken(LookAhead + 1).is(tok::r_paren);
    +    // Look ahead to see if the next two tokens close the statement expression;
    +    // if so, this expression statement is the last statement in a
     +    // statement expression.
    +    IsStmtExprResult = Tok.is(tok::r_brace) && NextToken().is(tok::r_paren);
       }
     
       if (IsStmtExprResult)
    diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
    index 140b709dbb651..41a98323450e4 100644
    --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
    +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
    @@ -2734,6 +2734,70 @@ static void flushDiagnostics(Sema &S, const sema::FunctionScopeInfo *fscope) {
         S.Diag(D.Loc, D.PD);
     }
     
     +template <typename Iterator>
     +static void emitPossiblyUnreachableDiags(Sema &S, AnalysisDeclContext &AC,
     +                                         std::pair<Iterator, Iterator> PUDs) {
    +
    +  if (PUDs.first == PUDs.second)
    +    return;
    +
    +  for (auto I = PUDs.first; I != PUDs.second; ++I) {
    +    for (const Stmt *S : I->Stmts)
    +      AC.registerForcedBlockExpression(S);
    +  }
    +
    +  if (AC.getCFG()) {
    +    CFGReverseBlockReachabilityAnalysis *Analysis =
    +        AC.getCFGReachablityAnalysis();
    +
    +    for (auto I = PUDs.first; I != PUDs.second; ++I) {
    +      const auto &D = *I;
    +      if (llvm::all_of(D.Stmts, [&](const Stmt *St) {
    +            const CFGBlock *Block = AC.getBlockForRegisteredExpression(St);
    +            // FIXME: We should be able to assert that block is non-null, but
    +            // the CFG analysis can skip potentially-evaluated expressions in
    +            // edge cases; see test/Sema/vla-2.c.
    +            if (Block && Analysis)
    +              if (!Analysis->isReachable(&AC.getCFG()->getEntry(), Block))
    +                return false;
    +            return true;
    +          })) {
    +        S.Diag(D.Loc, D.PD);
    +      }
    +    }
    +  } else {
    +    for (auto I = PUDs.first; I != PUDs.second; ++I)
    +      S.Diag(I->Loc, I->PD);
    +  }
    +}
    +
    +void sema::AnalysisBasedWarnings::registerVarDeclWarning(
    +    VarDecl *VD, clang::sema::PossiblyUnreachableDiag PUD) {
    +  VarDeclPossiblyUnreachableDiags.emplace(VD, PUD);
    +}
    +
    +void sema::AnalysisBasedWarnings::issueWarningsForRegisteredVarDecl(
    +    VarDecl *VD) {
    +  if (!llvm::is_contained(VarDeclPossiblyUnreachableDiags, VD))
    +    return;
    +
    +  AnalysisDeclContext AC(/*Mgr=*/nullptr, VD);
    +
    +  AC.getCFGBuildOptions().PruneTriviallyFalseEdges = true;
    +  AC.getCFGBuildOptions().AddEHEdges = false;
    +  AC.getCFGBuildOptions().AddInitializers = true;
    +  AC.getCFGBuildOptions().AddImplicitDtors = true;
    +  AC.getCFGBuildOptions().AddTemporaryDtors = true;
    +  AC.getCFGBuildOptions().AddCXXNewAllocator = false;
    +  AC.getCFGBuildOptions().AddCXXDefaultInitExprInCtors = true;
    +
    +  auto Range = VarDeclPossiblyUnreachableDiags.equal_range(VD);
    +  auto SecondRange =
    +      llvm::make_second_range(llvm::make_range(Range.first, Range.second));
    +  emitPossiblyUnreachableDiags(
    +      S, AC, std::make_pair(SecondRange.begin(), SecondRange.end()));
    +}
    +
     // An AST Visitor that calls a callback function on each callable DEFINITION
     // that is NOT in a dependent context:
     class CallableVisitor : public DynamicRecursiveASTVisitor {
    @@ -2945,45 +3009,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
       }
     
       // Emit delayed diagnostics.
    -  if (!fscope->PossiblyUnreachableDiags.empty()) {
    -    bool analyzed = false;
    -
    -    // Register the expressions with the CFGBuilder.
    -    for (const auto &D : fscope->PossiblyUnreachableDiags) {
    -      for (const Stmt *S : D.Stmts)
    -        AC.registerForcedBlockExpression(S);
    -    }
    -
    -    if (AC.getCFG()) {
    -      analyzed = true;
    -      for (const auto &D : fscope->PossiblyUnreachableDiags) {
    -        bool AllReachable = true;
    -        for (const Stmt *S : D.Stmts) {
    -          const CFGBlock *block = AC.getBlockForRegisteredExpression(S);
    -          CFGReverseBlockReachabilityAnalysis *cra =
    -              AC.getCFGReachablityAnalysis();
    -          // FIXME: We should be able to assert that block is non-null, but
    -          // the CFG analysis can skip potentially-evaluated expressions in
    -          // edge cases; see test/Sema/vla-2.c.
    -          if (block && cra) {
    -            // Can this block be reached from the entrance?
    -            if (!cra->isReachable(&AC.getCFG()->getEntry(), block)) {
    -              AllReachable = false;
    -              break;
    -            }
    -          }
    -          // If we cannot map to a basic block, assume the statement is
    -          // reachable.
    -        }
    -
    -        if (AllReachable)
    -          S.Diag(D.Loc, D.PD);
    -      }
    -    }
    -
    -    if (!analyzed)
    -      flushDiagnostics(S, fscope);
    -  }
    +  auto &PUDs = fscope->PossiblyUnreachableDiags;
    +  emitPossiblyUnreachableDiags(S, AC, std::make_pair(PUDs.begin(), PUDs.end()));
     
       // Warning: check missing 'return'
       if (P.enableCheckFallThrough) {
    diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
    index 23bf7f217a01a..46addea232b03 100644
    --- a/clang/lib/Sema/Sema.cpp
    +++ b/clang/lib/Sema/Sema.cpp
    @@ -321,9 +321,8 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
                static_cast<unsigned>(ComparisonCategoryType::Last) + 1),
           StdSourceLocationImplDecl(nullptr), CXXTypeInfoDecl(nullptr),
           GlobalNewDeleteDeclared(false), DisableTypoCorrection(false),
    -      TyposCorrected(0), IsBuildingRecoveryCallExpr(false), NumSFINAEErrors(0),
    -      AccessCheckingSFINAE(false), CurrentInstantiationScope(nullptr),
    -      InNonInstantiationSFINAEContext(false), NonInstantiationEntries(0),
    +      TyposCorrected(0), IsBuildingRecoveryCallExpr(false),
    +      CurrentInstantiationScope(nullptr), NonInstantiationEntries(0),
           ArgPackSubstIndex(std::nullopt), SatisfactionCache(Context) {
       assert(pp.TUKind == TUKind);
       TUScope = nullptr;
     @@ -670,7 +669,9 @@ void Sema::addExternalSource(IntrusiveRefCntPtr<ExternalSemaSource> E) {
     
     void Sema::PrintStats() const {
       llvm::errs() << "\n*** Semantic Analysis Stats:\n";
    -  llvm::errs() << NumSFINAEErrors << " SFINAE diagnostics trapped.\n";
    +  if (SFINAETrap *Trap = getSFINAEContext())
    +    llvm::errs() << int(Trap->hasErrorOccurred())
    +                 << " SFINAE diagnostics trapped.\n";
     
       BumpAlloc.PrintStats();
       AnalysisWarnings.PrintStats();
    @@ -1681,7 +1682,8 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
       // issue I am not seeing yet), then there should at least be a clarifying
       // comment somewhere.
       Diagnostic DiagInfo(&Diags, DB);
     -  if (std::optional<sema::TemplateDeductionInfo *> Info = isSFINAEContext()) {
    +  if (SFINAETrap *Trap = getSFINAEContext()) {
    +    sema::TemplateDeductionInfo *Info = Trap->getDeductionInfo();
         switch (DiagnosticIDs::getDiagnosticSFINAEResponse(DiagInfo.getID())) {
         case DiagnosticIDs::SFINAE_Report:
           // We'll report the diagnostic below.
    @@ -1690,37 +1692,37 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
         case DiagnosticIDs::SFINAE_SubstitutionFailure:
           // Count this failure so that we know that template argument deduction
           // has failed.
    -      ++NumSFINAEErrors;
    +      Trap->setErrorOccurred();
     
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information.
    -      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
    -        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
    -                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
    -      }
    +      if (Info && !Info->hasSFINAEDiagnostic())
    +        Info->addSFINAEDiagnostic(
    +            DiagInfo.getLocation(),
    +            PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
     
           Diags.setLastDiagnosticIgnored(true);
           return;
     
         case DiagnosticIDs::SFINAE_AccessControl: {
           // Per C++ Core Issue 1170, access control is part of SFINAE.
    -      // Additionally, the AccessCheckingSFINAE flag can be used to temporarily
    +      // Additionally, the WithAccessChecking flag can be used to temporarily
           // make access control a part of SFINAE for the purposes of checking
           // type traits.
    -      if (!AccessCheckingSFINAE && !getLangOpts().CPlusPlus11)
    +      if (!Trap->withAccessChecking() && !getLangOpts().CPlusPlus11)
             break;
     
           SourceLocation Loc = DiagInfo.getLocation();
     
           // Suppress this diagnostic.
    -      ++NumSFINAEErrors;
    +      Trap->setErrorOccurred();
     
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information.
    -      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
    -        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
    -                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
    -      }
    +      if (Info && !Info->hasSFINAEDiagnostic())
    +        Info->addSFINAEDiagnostic(
    +            DiagInfo.getLocation(),
    +            PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
     
           Diags.setLastDiagnosticIgnored(true);
     
    @@ -1740,13 +1742,13 @@ void Sema::EmitDiagnostic(unsigned DiagID, const DiagnosticBuilder &DB) {
             return;
           // Make a copy of this suppressed diagnostic and store it with the
           // template-deduction information;
    -      if (*Info) {
    -        (*Info)->addSuppressedDiagnostic(
    +      if (Info) {
    +        Info->addSuppressedDiagnostic(
                 DiagInfo.getLocation(),
                 PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
             if (!Diags.getDiagnosticIDs()->isNote(DiagID))
               PrintContextStack([Info](SourceLocation Loc, PartialDiagnostic PD) {
    -            (*Info)->addSuppressedDiagnostic(Loc, std::move(PD));
    +            Info->addSuppressedDiagnostic(Loc, std::move(PD));
               });
           }
     
    diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
    index 139c4abc040df..cece22092bb14 100644
    --- a/clang/lib/Sema/SemaAMDGPU.cpp
    +++ b/clang/lib/Sema/SemaAMDGPU.cpp
    @@ -558,6 +558,8 @@ AMDGPUMaxNumWorkGroupsAttr *SemaAMDGPU::CreateAMDGPUMaxNumWorkGroupsAttr(
         const AttributeCommonInfo &CI, Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
       ASTContext &Context = getASTContext();
       AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
    +  assert(!SemaRef.isSFINAEContext() &&
    +         "Can't produce SFINAE diagnostic pointing to temporary attribute");
     
       if (checkAMDGPUMaxNumWorkGroupsArguments(SemaRef, XExpr, YExpr, ZExpr,
                                                TmpAttr))
    diff --git a/clang/lib/Sema/SemaBoundsSafety.cpp b/clang/lib/Sema/SemaBoundsSafety.cpp
    index 39ab13653f5fe..de9adf8ef5a1b 100644
    --- a/clang/lib/Sema/SemaBoundsSafety.cpp
    +++ b/clang/lib/Sema/SemaBoundsSafety.cpp
    @@ -132,9 +132,23 @@ bool Sema::CheckCountedByAttrOnField(FieldDecl *FD, Expr *E, bool CountInBytes,
         // `BoundsSafetyCheckUseOfCountAttrPtr`
         //
         // * When the pointee type is always an incomplete type (e.g.
    -    // `void`) the attribute is disallowed by this method because we know the
    -    // type can never be completed so there's no reason to allow it.
    -    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
    +    // `void` in strict C mode) the attribute is disallowed by this method
    +    // because we know the type can never be completed so there's no reason
    +    // to allow it.
    +    //
    +    // Exception: void has an implicit size of 1 byte for pointer arithmetic
    +    // (following GNU convention). Therefore, counted_by on void* is allowed
    +    // and behaves equivalently to sized_by (treating the count as bytes).
    +    bool IsVoidPtr = PointeeTy->isVoidType();
    +    if (IsVoidPtr) {
    +      // Emit a warning that this is a GNU extension.
    +      Diag(FD->getBeginLoc(), diag::ext_gnu_counted_by_void_ptr) << Kind;
    +      Diag(FD->getBeginLoc(), diag::note_gnu_counted_by_void_ptr_use_sized_by)
    +          << Kind;
    +      assert(InvalidTypeKind == CountedByInvalidPointeeTypeKind::VALID);
    +    } else {
    +      InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
    +    }
       } else if (PointeeTy->isSizelessType()) {
         InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
       } else if (PointeeTy->isFunctionType()) {
    @@ -272,6 +286,9 @@ GetCountedByAttrOnIncompletePointee(QualType Ty, NamedDecl **ND) {
       if (!PointeeTy->isIncompleteType(ND))
         return {};
     
    +  if (PointeeTy->isVoidType())
    +    return {};
    +
       return {CATy, PointeeTy};
     }
     
    diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp
    index c52fc5bf815af..17ae7ca5627a9 100644
    --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp
    +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp
    @@ -780,6 +780,11 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo,
     
       if (!Found.empty()) {
          const auto *ND = Found.getAsSingle<NamedDecl>();
    +    if (!ND) {
    +      Diag(IdInfo.IdentifierLoc, diag::err_expected_class_or_namespace)
    +          << IdInfo.Identifier << getLangOpts().CPlusPlus;
    +      return true;
    +    }
         if (::ExtendNestedNameSpecifier(*this, SS, ND, IdInfo.IdentifierLoc,
                                         IdInfo.CCLoc)) {
           const Type *T = SS.getScopeRep().getAsType();
    diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
    index f4517877b04c8..a8e3fe6c07b12 100644
    --- a/clang/lib/Sema/SemaChecking.cpp
    +++ b/clang/lib/Sema/SemaChecking.cpp
    @@ -2609,6 +2609,18 @@ static ExprResult BuiltinInvoke(Sema &S, CallExpr *TheCall) {
                              Args.drop_front(), TheCall->getRParenLoc());
     }
     
    +// Performs a similar job to Sema::UsualUnaryConversions, but without any
    +// implicit promotion of integral/enumeration types.
    +static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
    +  // First, convert to an r-value.
    +  ExprResult Res = S.DefaultFunctionArrayLvalueConversion(E);
    +  if (Res.isInvalid())
    +    return ExprError();
    +
    +  // Promote floating-point types.
    +  return S.UsualUnaryFPConversions(Res.get());
    +}
    +
     ExprResult
     Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
                                    CallExpr *TheCall) {
    @@ -3273,6 +3285,46 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
           return ExprError();
         break;
     
    +  case Builtin::BI__builtin_elementwise_ldexp: {
    +    if (checkArgCount(TheCall, 2))
    +      return ExprError();
    +
    +    ExprResult A = BuiltinVectorMathConversions(*this, TheCall->getArg(0));
    +    if (A.isInvalid())
    +      return ExprError();
    +    QualType TyA = A.get()->getType();
    +    if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA,
    +                                    EltwiseBuiltinArgTyRestriction::FloatTy, 1))
    +      return ExprError();
    +
    +    ExprResult Exp = UsualUnaryConversions(TheCall->getArg(1));
    +    if (Exp.isInvalid())
    +      return ExprError();
    +    QualType TyExp = Exp.get()->getType();
    +    if (checkMathBuiltinElementType(*this, Exp.get()->getBeginLoc(), TyExp,
    +                                    EltwiseBuiltinArgTyRestriction::IntegerTy,
    +                                    2))
    +      return ExprError();
    +
    +    // Check the two arguments are either scalars or vectors of equal length.
     +    const auto *Vec0 = TyA->getAs<VectorType>();
     +    const auto *Vec1 = TyExp->getAs<VectorType>();
    +    unsigned Arg0Length = Vec0 ? Vec0->getNumElements() : 0;
    +    unsigned Arg1Length = Vec1 ? Vec1->getNumElements() : 0;
    +    if (Arg0Length != Arg1Length) {
    +      Diag(Exp.get()->getBeginLoc(),
    +           diag::err_typecheck_vector_lengths_not_equal)
    +          << TyA << TyExp << A.get()->getSourceRange()
    +          << Exp.get()->getSourceRange();
    +      return ExprError();
    +    }
    +
    +    TheCall->setArg(0, A.get());
    +    TheCall->setArg(1, Exp.get());
    +    TheCall->setType(TyA);
    +    break;
    +  }
    +
       // These builtins restrict the element type to floating point
       // types only, and take in two arguments.
       case Builtin::BI__builtin_elementwise_minnum:
    @@ -3542,9 +3594,7 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) {
     
     bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
                                    unsigned FirstArg, FormatStringInfo *FSI) {
    -  bool IsCXXMember = false;
     -  if (const auto *MD = dyn_cast<CXXMethodDecl>(D))
    -    IsCXXMember = MD->isInstance();
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       bool IsVariadic = false;
       if (const FunctionType *FnTy = D->getFunctionType())
          IsVariadic = cast<FunctionProtoType>(FnTy)->isVariadic();
    @@ -3553,11 +3603,12 @@ bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
        else if (const auto *OMD = dyn_cast<ObjCMethodDecl>(D))
         IsVariadic = OMD->isVariadic();
     
    -  return getFormatStringInfo(FormatIdx, FirstArg, IsCXXMember, IsVariadic, FSI);
    +  return getFormatStringInfo(FormatIdx, FirstArg, HasImplicitThisParam,
    +                             IsVariadic, FSI);
     }
     
     bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
    -                               bool IsCXXMember, bool IsVariadic,
    +                               bool HasImplicitThisParam, bool IsVariadic,
                                    FormatStringInfo *FSI) {
       if (FirstArg == 0)
         FSI->ArgPassingKind = FAPK_VAList;
    @@ -3571,7 +3622,7 @@ bool Sema::getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
       // The way the format attribute works in GCC, the implicit this argument
       // of member functions is counted. However, it doesn't appear in our own
       // lists, so decrement format_idx in that case.
    -  if (IsCXXMember) {
    +  if (HasImplicitThisParam) {
         if(FSI->FormatIdx == 0)
           return false;
         --FSI->FormatIdx;
    @@ -15993,18 +16044,6 @@ void Sema::CheckAddressOfPackedMember(Expr *rhs) {
                          _2, _3, _4));
     }
     
    -// Performs a similar job to Sema::UsualUnaryConversions, but without any
    -// implicit promotion of integral/enumeration types.
    -static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
    -  // First, convert to an r-value.
    -  ExprResult Res = S.DefaultFunctionArrayLvalueConversion(E);
    -  if (Res.isInvalid())
    -    return ExprError();
    -
    -  // Promote floating-point types.
    -  return S.UsualUnaryFPConversions(Res.get());
    -}
    -
     bool Sema::PrepareBuiltinElementwiseMathOneArgCall(
         CallExpr *TheCall, EltwiseBuiltinArgTyRestriction ArgTyRestr) {
       if (checkArgCount(TheCall, 1))
    diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
    index 0514d1033f74f..aa93507ab5c30 100644
    --- a/clang/lib/Sema/SemaCodeComplete.cpp
    +++ b/clang/lib/Sema/SemaCodeComplete.cpp
    @@ -10208,6 +10208,24 @@ void SemaCodeCompletion::CodeCompletePreprocessorDirective(bool InConditional) {
       Builder.AddPlaceholderChunk("message");
       Results.AddResult(Builder.TakeString());
     
    +  if (getLangOpts().C23) {
    +    // #embed "file"
    +    Builder.AddTypedTextChunk("embed");
    +    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
    +    Builder.AddTextChunk("\"");
    +    Builder.AddPlaceholderChunk("file");
    +    Builder.AddTextChunk("\"");
    +    Results.AddResult(Builder.TakeString());
    +
     +    // #embed <file>
    +    Builder.AddTypedTextChunk("embed");
    +    Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
    +    Builder.AddTextChunk("<");
    +    Builder.AddPlaceholderChunk("file");
    +    Builder.AddTextChunk(">");
    +    Results.AddResult(Builder.TakeString());
    +  }
    +
       // Note: #ident and #sccs are such crazy anachronisms that we don't provide
       // completions for them. And __include_macros is a Clang-internal extension
       // that we don't want to encourage anyone to use.
    diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
    index fb4d0b4582684..883e3410a35e0 100644
    --- a/clang/lib/Sema/SemaConcept.cpp
    +++ b/clang/lib/Sema/SemaConcept.cpp
    @@ -526,12 +526,12 @@ ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint(
             S, AtomicExpr->getBeginLoc(),
             Sema::InstantiatingTemplate::ConstraintSubstitution{},
             // FIXME: improve const-correctness of InstantiatingTemplate
     -        const_cast<NamedDecl *>(Template), Info, AtomicExpr->getSourceRange());
     +        const_cast<NamedDecl *>(Template), AtomicExpr->getSourceRange());
         if (Inst.isInvalid())
           return ExprError();
     
         // We do not want error diagnostics escaping here.
    -    Sema::SFINAETrap Trap(S);
    +    Sema::SFINAETrap Trap(S, Info);
         SubstitutedExpression =
              S.SubstConstraintExpr(const_cast<Expr *>(AtomicExpr), MLTAL);
     
    @@ -599,16 +599,15 @@ ConstraintSatisfactionChecker::SubstitutionInTemplateArguments(
         return MultiLevelTemplateArgumentList();
     
       TemplateDeductionInfo Info(Constraint.getBeginLoc());
    +  Sema::SFINAETrap Trap(S, Info);
       Sema::InstantiatingTemplate Inst(
           S, Constraint.getBeginLoc(),
           Sema::InstantiatingTemplate::ConstraintSubstitution{},
           // FIXME: improve const-correctness of InstantiatingTemplate
     -      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
     +      const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
       if (Inst.isInvalid())
         return std::nullopt;
     
    -  Sema::SFINAETrap Trap(S);
    -
       TemplateArgumentListInfo SubstArgs;
       Sema::ArgPackSubstIndexRAII SubstIndex(
           S, Constraint.getPackSubstitutionIndex()
    @@ -778,9 +777,6 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
         const FoldExpandedConstraint &FE,
         const MultiLevelTemplateArgumentList &MLTAL) {
     
    -  // We should ignore errors in the presence of packs of different size.
    -  Sema::SFINAETrap Trap(S);
    -
        Expr *Pattern = const_cast<Expr *>(FE.getPattern());
     
        SmallVector<UnexpandedParameterPack, 2> Unexpanded;
    @@ -792,18 +788,12 @@ ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
       if (S.CheckParameterPacksForExpansion(
               Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
               /*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion,
    -          NumExpansions) ||
    +          NumExpansions, /*Diagnose=*/false) ||
           !Expand || RetainExpansion)
         return std::nullopt;
     
    -  if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
    -    S.Diag(Pattern->getExprLoc(),
    -           clang::diag::err_fold_expression_limit_exceeded)
    -        << *NumExpansions << S.getLangOpts().BracketDepth
    -        << Pattern->getSourceRange();
    -    S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth);
    +  if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions)
         return std::nullopt;
    -  }
       return NumExpansions;
     }
     
    @@ -921,7 +911,6 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
         return ExprError();
       }
     
    -  Sema::SFINAETrap Trap(S);
       Sema::ArgPackSubstIndexRAII SubstIndex(
           S, Constraint.getPackSubstitutionIndex()
                  ? Constraint.getPackSubstitutionIndex()
    @@ -930,9 +919,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
       const ASTTemplateArgumentListInfo *Ori =
           ConceptId->getTemplateArgsAsWritten();
       TemplateDeductionInfo Info(TemplateNameLoc);
    -  Sema::InstantiatingTemplate _(
    +  Sema::SFINAETrap Trap(S, Info);
    +  Sema::InstantiatingTemplate _2(
           S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{},
     -      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
     +      const_cast<NamedDecl *>(Template), Constraint.getSourceRange());
     
       TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc);
       if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) ||
    @@ -1142,13 +1132,21 @@ static bool CheckConstraintSatisfaction(
       if (TemplateArgsLists.getNumLevels() != 0)
         Args = TemplateArgsLists.getInnermost();
     
     -  std::optional<Sema::InstantiatingTemplate> SynthesisContext;
    -  if (!TopLevelConceptId) {
    -    SynthesisContext.emplace(S, TemplateIDRange.getBegin(),
    -                             Sema::InstantiatingTemplate::ConstraintsCheck{},
     -                             const_cast<NamedDecl *>(Template), Args,
    +  struct SynthesisContextPair {
    +    Sema::InstantiatingTemplate Inst;
    +    Sema::NonSFINAEContext NSC;
    +    SynthesisContextPair(Sema &S, NamedDecl *Template,
     +                         ArrayRef<TemplateArgument> TemplateArgs,
    +                         SourceRange InstantiationRange)
    +        : Inst(S, InstantiationRange.getBegin(),
    +               Sema::InstantiatingTemplate::ConstraintsCheck{}, Template,
    +               TemplateArgs, InstantiationRange),
    +          NSC(S) {}
    +  };
     +  std::optional<SynthesisContextPair> SynthesisContext;
    +  if (!TopLevelConceptId)
     +    SynthesisContext.emplace(S, const_cast<NamedDecl *>(Template), Args,
                                  TemplateIDRange);
    -  }
     
       const NormalizedConstraint *C =
           S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints);
    @@ -1478,8 +1476,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
       if (MLTAL.getNumSubstitutedLevels() == 0)
         return ConstrExpr;
     
    -  Sema::SFINAETrap SFINAE(S);
    -
    +  Sema::NonSFINAEContext _(S);
       Sema::InstantiatingTemplate Inst(
           S, DeclInfo.getLocation(),
           Sema::InstantiatingTemplate::ConstraintNormalization{},
    @@ -1554,7 +1551,7 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
           Sema::ReuseLambdaContextDecl);
       ExprResult SubstConstr = S.SubstConstraintExprWithoutSatisfaction(
            const_cast<Expr *>(ConstrExpr), MLTAL);
    -  if (SFINAE.hasErrorOccurred() || !SubstConstr.isUsable())
    +  if (!SubstConstr.isUsable())
         return nullptr;
       return SubstConstr.get();
     }
    @@ -2104,6 +2101,7 @@ bool SubstituteParameterMappings::substitute(
         InstLocBegin = SR.getBegin();
         InstLocEnd = SR.getEnd();
       }
    +  Sema::NonSFINAEContext _(SemaRef);
       Sema::InstantiatingTemplate Inst(
           SemaRef, InstLocBegin,
           Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
    @@ -2171,6 +2169,7 @@ bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
         InstLocBegin = SR.getBegin();
         InstLocEnd = SR.getEnd();
       }
    +  Sema::NonSFINAEContext _(SemaRef);
       // This is useful for name lookup across modules; see Sema::getLookupModules.
       Sema::InstantiatingTemplate Inst(
           SemaRef, InstLocBegin,
    @@ -2311,6 +2310,7 @@ NormalizedConstraint *NormalizedConstraint::fromConstraintExpr(
        } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(E)) {
         NormalizedConstraint *SubNF;
         {
    +      Sema::NonSFINAEContext _(S);
           Sema::InstantiatingTemplate Inst(
               S, CSE->getExprLoc(),
               Sema::InstantiatingTemplate::ConstraintNormalization{},
    @@ -2546,8 +2546,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
       };
     
       {
    -    // The subsumption checks might cause diagnostics
    -    SFINAETrap Trap(*this);
         auto *Normalized1 = getNormalizedAssociatedConstraints(D1, AC1);
         if (!Normalized1)
           return false;
    diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
    index fc3aabf5741ca..25b89d65847ad 100644
    --- a/clang/lib/Sema/SemaDecl.cpp
    +++ b/clang/lib/Sema/SemaDecl.cpp
    @@ -59,6 +59,7 @@
     #include "clang/Sema/SemaWasm.h"
     #include "clang/Sema/Template.h"
     #include "llvm/ADT/STLForwardCompat.h"
    +#include "llvm/ADT/ScopeExit.h"
     #include "llvm/ADT/SmallPtrSet.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/StringExtras.h"
    @@ -8492,12 +8493,11 @@ void Sema::CheckShadow(NamedDecl *D, NamedDecl *ShadowedDecl,
       DeclContext *NewDC = D->getDeclContext();
     
        if (FieldDecl *FD = dyn_cast<FieldDecl>(ShadowedDecl)) {
     -    if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(NewDC)) {
    -      // Fields are not shadowed by variables in C++ static methods.
    -      if (MD->isStatic())
    -        return;
    -
    -      if (!MD->getParent()->isLambda() && MD->isExplicitObjectMemberFunction())
     +    if (const auto *MD =
     +            dyn_cast<CXXMethodDecl>(getFunctionLevelDeclContext())) {
    +      // Fields aren't shadowed in C++ static members or in member functions
    +      // with an explicit object parameter.
    +      if (MD->isStatic() || MD->isExplicitObjectMemberFunction())
             return;
         }
         // Fields shadowed by constructor parameters are a special case. Usually
    @@ -13118,6 +13118,13 @@ namespace {
         if (isa(OrigDecl))
           return;
     
    +    // Skip checking for file-scope constexpr variables - constant evaluation
    +    // will produce appropriate errors without needing runtime diagnostics.
    +    // Local constexpr should still emit runtime warnings.
     +    if (auto *VD = dyn_cast<VarDecl>(OrigDecl);
    +        VD && VD->isConstexpr() && VD->isFileVarDecl())
    +      return;
    +
         E = E->IgnoreParens();
     
         // Skip checking T a = a where T is not a record or reference type.
    @@ -13745,6 +13752,11 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
     }
     
     void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
    +  auto ResetDeclForInitializer = llvm::make_scope_exit([this]() {
     +    if (!this->ExprEvalContexts.empty())
    +      this->ExprEvalContexts.back().DeclForInitializer = nullptr;
    +  });
    +
       // If there is no declaration, there was an error parsing it.  Just ignore
       // the initializer.
       if (!RealDecl) {
    @@ -15070,6 +15082,10 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) {
       if (!VD)
         return;
     
    +  // Emit any deferred warnings for the variable's initializer, even if the
     +  // variable is invalid.
    +  AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD);
    +
       // Apply an implicit SectionAttr if '#pragma clang section bss|data|rodata' is active
       if (VD->hasGlobalStorage() && VD->isThisDeclarationADefinition() &&
            !inTemplateInstantiation() && !VD->hasAttr<SectionAttr>()) {
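The AddInitializerToDecl hunk pairs the new ScopeExit.h include with
llvm::make_scope_exit so that DeclForInitializer is reset on every exit path,
including the early return for a null RealDecl. A minimal standalone
illustration of the idiom, using a made-up State type:

    #include "llvm/ADT/ScopeExit.h"

    struct State {
      bool Active = false;
      bool valid() const { return true; }
    };

    void example(State &S) {
      S.Active = true;
      // Fires on every return path, like the DeclForInitializer reset above.
      auto Cleanup = llvm::make_scope_exit([&S] { S.Active = false; });
      if (!S.valid())
        return; // Cleanup runs here
      // ... normal processing; Cleanup also runs on fall-through.
    }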
    diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
    index 964a2a791e18f..a9e7b44ac9d73 100644
    --- a/clang/lib/Sema/SemaDeclAttr.cpp
    +++ b/clang/lib/Sema/SemaDeclAttr.cpp
    @@ -3785,7 +3785,7 @@ static bool handleFormatAttrCommon(Sema &S, Decl *D, const ParsedAttr &AL,
     
       // In C++ the implicit 'this' function parameter also counts, and they are
       // counted from one.
    -  bool HasImplicitThisParam = isInstanceMethod(D);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       Info->NumArgs = getFunctionOrMethodNumParams(D) + HasImplicitThisParam;
     
       Info->Identifier = AL.getArgAsIdent(0)->getIdentifierInfo();
    @@ -3926,7 +3926,7 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
         return;
       }
     
    -  bool HasImplicitThisParam = isInstanceMethod(D);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(D);
       int32_t NumArgs = getFunctionOrMethodNumParams(D);
     
       FunctionDecl *FD = D->getAsFunction();
    @@ -4110,7 +4110,7 @@ static void handleLifetimeCaptureByAttr(Sema &S, Decl *D,
     }
     
     void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) {
    -  bool HasImplicitThisParam = isInstanceMethod(FD);
    +  bool HasImplicitThisParam = hasImplicitObjectParameter(FD);
        SmallVector<LifetimeCaptureByAttr *> Attrs;
       for (ParmVarDecl *PVD : FD->parameters())
          if (auto *A = PVD->getAttr<LifetimeCaptureByAttr>())
    diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
    index d41ab126c426f..8030aac3d8771 100644
    --- a/clang/lib/Sema/SemaDeclCXX.cpp
    +++ b/clang/lib/Sema/SemaDeclCXX.cpp
    @@ -8035,7 +8035,7 @@ class DefaultedComparisonVisitor {
       DefaultedComparisonVisitor(Sema &S, CXXRecordDecl *RD, FunctionDecl *FD,
                                  DefaultedComparisonKind DCK)
           : S(S), RD(RD), FD(FD), DCK(DCK) {
    -    if (auto *Info = FD->getDefalutedOrDeletedInfo()) {
    +    if (auto *Info = FD->getDefaultedOrDeletedInfo()) {
           // FIXME: Change CreateOverloadedBinOp to take an ArrayRef instead of an
           // UnresolvedSet to avoid this copy.
           Fns.assign(Info->getUnqualifiedLookups().begin(),
    diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp
    index 3df9f9c1d68c7..53ff818a2af53 100644
    --- a/clang/lib/Sema/SemaDeclObjC.cpp
    +++ b/clang/lib/Sema/SemaDeclObjC.cpp
    @@ -4730,13 +4730,13 @@ ParmVarDecl *SemaObjC::ActOnMethodParmDeclaration(Scope *S,
                                                       bool MethodDefinition) {
       ASTContext &Context = getASTContext();
       QualType ArgType;
    -  TypeSourceInfo *DI;
    +  TypeSourceInfo *TSI;
     
       if (!ArgInfo.Type) {
         ArgType = Context.getObjCIdType();
    -    DI = nullptr;
    +    TSI = nullptr;
       } else {
    -    ArgType = SemaRef.GetTypeFromParser(ArgInfo.Type, &DI);
    +    ArgType = SemaRef.GetTypeFromParser(ArgInfo.Type, &TSI);
       }
       LookupResult R(SemaRef, ArgInfo.Name, ArgInfo.NameLoc,
                      Sema::LookupOrdinaryName,
    @@ -4753,14 +4753,14 @@ ParmVarDecl *SemaObjC::ActOnMethodParmDeclaration(Scope *S,
         }
       }
       SourceLocation StartLoc =
    -      DI ? DI->getTypeLoc().getBeginLoc() : ArgInfo.NameLoc;
    +      TSI ? TSI->getTypeLoc().getBeginLoc() : ArgInfo.NameLoc;
     
       // Temporarily put parameter variables in the translation unit. This is what
       // ActOnParamDeclarator does in the case of C arguments to the Objective-C
       // method too.
       ParmVarDecl *Param = SemaRef.CheckParameter(
           Context.getTranslationUnitDecl(), StartLoc, ArgInfo.NameLoc, ArgInfo.Name,
    -      ArgType, DI, SC_None);
    +      ArgType, TSI, SC_None);
       Param->setObjCMethodScopeInfo(ParamIndex);
       Param->setObjCDeclQualifier(
           CvtQTToAstBitMask(ArgInfo.DeclSpec.getObjCDeclQualifier()));
    diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
    index a50c27610dc96..10f0ec3010c6c 100644
    --- a/clang/lib/Sema/SemaExpr.cpp
    +++ b/clang/lib/Sema/SemaExpr.cpp
    @@ -12653,10 +12653,10 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
           // This is a gcc extension compatibility comparison.
           // In a SFINAE context, we treat this as a hard error to maintain
           // conformance with the C++ standard.
    -      diagnoseFunctionPointerToVoidComparison(
    -          *this, Loc, LHS, RHS, /*isError*/ (bool)isSFINAEContext());
    +      bool IsError = isSFINAEContext();
    +      diagnoseFunctionPointerToVoidComparison(*this, Loc, LHS, RHS, IsError);
     
    -      if (isSFINAEContext())
    +      if (IsError)
             return QualType();
     
           RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
    @@ -14598,11 +14598,11 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
       unsigned AddressOfError = AO_No_Error;
     
       if (lval == Expr::LV_ClassTemporary || lval == Expr::LV_ArrayTemporary) {
    -    bool sfinae = (bool)isSFINAEContext();
    -    Diag(OpLoc, isSFINAEContext() ? diag::err_typecheck_addrof_temporary
    -                                  : diag::ext_typecheck_addrof_temporary)
    -      << op->getType() << op->getSourceRange();
    -    if (sfinae)
    +    bool IsError = isSFINAEContext();
    +    Diag(OpLoc, IsError ? diag::err_typecheck_addrof_temporary
    +                        : diag::ext_typecheck_addrof_temporary)
    +        << op->getType() << op->getSourceRange();
    +    if (IsError)
           return QualType();
         // Materialize the temporary as an lvalue so that we can take its address.
         OrigOp = op =
    @@ -16185,9 +16185,7 @@ ExprResult Sema::BuildStmtExpr(SourceLocation LPLoc, Stmt *SubStmt,
       QualType Ty = Context.VoidTy;
       bool StmtExprMayBindToTemp = false;
       if (!Compound->body_empty()) {
    -    // For GCC compatibility we get the last Stmt excluding trailing NullStmts.
     -    if (const auto *LastStmt =
     -            dyn_cast<ValueStmt>(Compound->getStmtExprResult())) {
     +    if (const auto *LastStmt = dyn_cast<ValueStmt>(Compound->body_back())) {
           if (const Expr *Value = LastStmt->getExprStmt()) {
             StmtExprMayBindToTemp = true;
             Ty = Value->getType();
    @@ -20567,31 +20565,36 @@ void Sema::MarkDeclarationsReferencedInExpr(Expr *E,
     }
     
     /// Emit a diagnostic when statements are reachable.
    -/// FIXME: check for reachability even in expressions for which we don't build a
    -///        CFG (eg, in the initializer of a global or in a constant expression).
    -///        For example,
    -///        namespace { auto *p = new double[3][false ? (1, 2) : 3]; }
      bool Sema::DiagIfReachable(SourceLocation Loc, ArrayRef<const Stmt *> Stmts,
                                const PartialDiagnostic &PD) {
    -  if (!Stmts.empty() && getCurFunctionOrMethodDecl()) {
    -    if (!FunctionScopes.empty())
    -      FunctionScopes.back()->PossiblyUnreachableDiags.push_back(
    -          sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    -    return true;
    -  }
    -
    +  VarDecl *Decl = ExprEvalContexts.back().DeclForInitializer;
       // The initializer of a constexpr variable or of the first declaration of a
       // static data member is not syntactically a constant evaluated constant,
       // but nonetheless is always required to be a constant expression, so we
       // can skip diagnosing.
    -  // FIXME: Using the mangling context here is a hack.
     -  if (auto *VD = dyn_cast_or_null<VarDecl>(
    -          ExprEvalContexts.back().ManglingContextDecl)) {
    -    if (VD->isConstexpr() ||
    -        (VD->isStaticDataMember() && VD->isFirstDecl() && !VD->isInline()))
    -      return false;
    -    // FIXME: For any other kind of variable, we should build a CFG for its
    -    // initializer and check whether the context in question is reachable.
    +  if (Decl &&
    +      (Decl->isConstexpr() || (Decl->isStaticDataMember() &&
    +                               Decl->isFirstDecl() && !Decl->isInline())))
    +    return false;
    +
    +  if (Stmts.empty()) {
    +    Diag(Loc, PD);
    +    return true;
    +  }
    +
    +  if (getCurFunction()) {
    +    FunctionScopes.back()->PossiblyUnreachableDiags.push_back(
    +        sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    +    return true;
    +  }
    +
     +  // For non-constexpr file-scope variables with a reachability context
     +  // (non-empty Stmts), register the diagnostic so a CFG can be built for
     +  // the initializer and the context in question checked for reachability.
    +  if (Decl && Decl->isFileVarDecl()) {
    +    AnalysisWarnings.registerVarDeclWarning(
    +        Decl, sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
    +    return true;
       }
     
       Diag(Loc, PD);
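The DiagIfReachable rework resolves the FIXME deleted above: for a
reachability-sensitive warning inside a file-scope initializer there is no
function CFG, so the diagnostic is now registered on the VarDecl via
AnalysisWarnings.registerVarDeclWarning and emitted later from
FinalizeDeclaration. The deleted FIXME's own example shows the kind of code
this affects:

    // Previously the (1, 2) operand could be diagnosed even though it is
    // unreachable; the warning is now deferred and checked against a CFG
    // built for the initializer.
    namespace {
    auto *p = new double[3][false ? (1, 2) : 3];
    }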
    diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
    index 8590ee831084f..12cc02965e7d3 100644
    --- a/clang/lib/Sema/SemaFunctionEffects.cpp
    +++ b/clang/lib/Sema/SemaFunctionEffects.cpp
    @@ -1208,8 +1208,16 @@ class Analyzer {
             return true;
           }
     
    -      // No Decl, just an Expr. Just check based on its type.
    -      checkIndirectCall(Call, CalleeExpr->getType());
    +      // No Decl, just an Expr. Just check based on its type. Bound member
    +      // functions are a special expression type and need to be specially
    +      // unpacked.
    +      QualType CalleeExprQT = CalleeExpr->getType();
    +      if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) {
    +        QualType QT = Expr::findBoundMemberType(CalleeExpr);
    +        if (!QT.isNull())
    +          CalleeExprQT = QT;
    +      }
    +      checkIndirectCall(Call, CalleeExprQT);
     
           return true;
         }
    @@ -1271,7 +1279,15 @@ class Analyzer {
           const CXXConstructorDecl *Ctor = Construct->getConstructor();
           CallableInfo CI(*Ctor);
           followCall(CI, Construct->getLocation());
    +      return true;
    +    }
     
    +    bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override {
    +      const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor();
    +      if (Dtor != nullptr) {
    +        CallableInfo CI(*Dtor);
    +        followCall(CI, BTE->getBeginLoc());
    +      }
           return true;
         }
     
    @@ -1286,6 +1302,14 @@ class Analyzer {
           return true;
         }
     
    +    bool TraverseCXXRecordDecl(CXXRecordDecl *D) override {
    +      // Completely skip local struct/class/union declarations since their
    +      // methods would otherwise be incorrectly interpreted as part of the
    +      // function we are currently traversing. The initial Sema pass will have
    +      // already recorded any nonblocking methods needing analysis.
    +      return true;
    +    }
    +
         bool TraverseConstructorInitializer(CXXCtorInitializer *Init) override {
           ViolationSite PrevVS = VSite;
           if (Init->isAnyMemberInitializer())
    diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
    index 94a490a8f68dc..e95fe16e6cb6c 100644
    --- a/clang/lib/Sema/SemaHLSL.cpp
    +++ b/clang/lib/Sema/SemaHLSL.cpp
    @@ -775,6 +775,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info,
                                                DeclaratorDecl *TargetDecl) {
       std::string SemanticName = Info.Semantic->getAttrName()->getName().upper();
     
     +  if (isa<HLSLUserSemanticAttr>(Info.Semantic))
     +    return createSemanticAttr<HLSLUserSemanticAttr>(*Info.Semantic, TargetDecl,
    +                                                    Info.Index);
    +
       if (SemanticName == "SV_DISPATCHTHREADID") {
          return createSemanticAttr<HLSLSV_DispatchThreadIDAttr>(
             *Info.Semantic, TargetDecl, Info.Index);
    @@ -797,9 +801,10 @@ HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info,
       return nullptr;
     }
     
    -bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD,
    -                                               DeclaratorDecl *D,
    -                                               SemanticInfo &ActiveSemantic) {
    +bool SemaHLSL::determineActiveSemanticOnScalar(
    +    FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic,
    +    llvm::StringSet<> &ActiveInputSemantics) {
    +
       if (ActiveSemantic.Semantic == nullptr) {
          ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>();
         if (ActiveSemantic.Semantic &&
    @@ -818,11 +823,31 @@ bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD,
     
       checkSemanticAnnotation(FD, D, A);
       FD->addAttr(A);
    +
    +  unsigned Location = ActiveSemantic.Index.value_or(0);
    +
     +  const ConstantArrayType *AT = dyn_cast<ConstantArrayType>(D->getType());
    +  unsigned ElementCount = AT ? AT->getZExtSize() : 1;
    +  ActiveSemantic.Index = Location + ElementCount;
    +
     +  StringRef BaseName = ActiveSemantic.Semantic->getAttrName()->getName();
     +  for (unsigned I = 0; I < ElementCount; ++I) {
     +    // Build the name eagerly; a stored Twine would dangle past this line.
     +    std::string VariableName = (BaseName + Twine(Location + I)).str();
     +    auto [_, Inserted] = ActiveInputSemantics.insert(VariableName);
     +    if (!Inserted) {
     +      Diag(D->getLocation(), diag::err_hlsl_semantic_index_overlap)
     +          << VariableName;
    +      return false;
    +    }
    +  }
    +
       return true;
     }
     
    -bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
    -                                       SemanticInfo &ActiveSemantic) {
    +bool SemaHLSL::determineActiveSemantic(
    +    FunctionDecl *FD, DeclaratorDecl *D, SemanticInfo &ActiveSemantic,
    +    llvm::StringSet<> &ActiveInputSemantics) {
       if (ActiveSemantic.Semantic == nullptr) {
          ActiveSemantic.Semantic = D->getAttr<HLSLSemanticAttr>();
         if (ActiveSemantic.Semantic &&
    @@ -833,12 +858,13 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D,
       const Type *T = D->getType()->getUnqualifiedDesugaredType();
        const RecordType *RT = dyn_cast<RecordType>(T);
       if (!RT)
    -    return determineActiveSemanticOnScalar(FD, D, ActiveSemantic);
    +    return determineActiveSemanticOnScalar(FD, D, ActiveSemantic,
    +                                           ActiveInputSemantics);
     
       const RecordDecl *RD = RT->getDecl();
       for (FieldDecl *Field : RD->fields()) {
         SemanticInfo Info = ActiveSemantic;
    -    if (!determineActiveSemantic(FD, Field, Info)) {
    +    if (!determineActiveSemantic(FD, Field, Info, ActiveInputSemantics)) {
           Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field;
           return false;
         }
    @@ -911,12 +937,14 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) {
         llvm_unreachable("Unhandled environment in triple");
       }
     
    +  llvm::StringSet<> ActiveInputSemantics;
       for (ParmVarDecl *Param : FD->parameters()) {
         SemanticInfo ActiveSemantic;
         ActiveSemantic.Semantic = nullptr;
         ActiveSemantic.Index = std::nullopt;
     
    -    if (!determineActiveSemantic(FD, Param, ActiveSemantic)) {
    +    if (!determineActiveSemantic(FD, Param, ActiveSemantic,
    +                                 ActiveInputSemantics)) {
           Diag(Param->getLocation(), diag::note_previous_decl) << Param;
           FD->setInvalidDecl();
         }
    @@ -947,6 +975,8 @@ void SemaHLSL::checkSemanticAnnotation(FunctionDecl *EntryPoint,
           return;
         DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel});
         break;
    +  case attr::HLSLUserSemantic:
    +    return;
       default:
         llvm_unreachable("Unknown SemanticAttr");
       }
    @@ -1766,7 +1796,7 @@ void SemaHLSL::handleSemanticAttr(Decl *D, const ParsedAttr &AL) {
       if (AL.getAttrName()->getName().starts_with_insensitive("SV_"))
         diagnoseSystemSemanticAttr(D, AL, Index);
       else
    -    Diag(AL.getLoc(), diag::err_hlsl_unknown_semantic) << AL;
    +    D->addAttr(createSemanticAttr(AL, nullptr, Index));
     }
     
     void SemaHLSL::handlePackOffsetAttr(Decl *D, const ParsedAttr &AL) {
    @@ -2802,6 +2832,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc,
       return false;
     }
     
    +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall,
    +                                  unsigned ArgOrdinal, unsigned Width) {
     +  QualType ArgTy = TheCall->getArg(ArgOrdinal)->getType();
     +  if (auto *VTy = ArgTy->getAs<VectorType>())
    +    ArgTy = VTy->getElementType();
    +  // ensure arg type has expected bit width
    +  uint64_t ElementBitCount =
    +      S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8;
    +  if (ElementBitCount != Width) {
     +    S->Diag(TheCall->getArg(ArgOrdinal)->getBeginLoc(),
    +            diag::err_integer_incorrect_bit_count)
    +        << Width << ElementBitCount;
    +    return true;
    +  }
    +  return false;
    +}
    +
     static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                            QualType ReturnType) {
        auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
    @@ -2961,24 +3008,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
                                        CheckUnsignedIntVecRepresentation))
           return true;
     
     -    auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>();
         // ensure arg integers are 32-bits
    -    uint64_t ElementBitCount = getASTContext()
    -                                   .getTypeSizeInChars(VTy->getElementType())
    -                                   .getQuantity() *
    -                               8;
    -    if (ElementBitCount != 32) {
    -      SemaRef.Diag(TheCall->getBeginLoc(),
    -                   diag::err_integer_incorrect_bit_count)
    -          << 32 << ElementBitCount;
    +    if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32))
           return true;
    -    }
     
         // ensure both args are vectors of total bit size of a multiple of 64
     +    auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>();
         int NumElementsArg = VTy->getNumElements();
         if (NumElementsArg != 2 && NumElementsArg != 4) {
           SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count)
    -          << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount;
    +          << 1 /*a multiple of*/ << 64 << NumElementsArg * 32;
           return true;
         }
     
    @@ -3295,7 +3334,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
         break;
       }
       // Note these are llvm builtins that we want to catch invalid intrinsic
    -  // generation. Normal handling of these builitns will occur elsewhere.
    +  // generation. Normal handling of these builtins will occur elsewhere.
       case Builtin::BI__builtin_elementwise_bitreverse: {
         // does not include a check for number of arguments
         // because that is done previously
    @@ -3405,6 +3444,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
         }
         break;
       }
    +  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
    +    if (SemaRef.checkArgCount(TheCall, 1))
    +      return true;
    +    if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall,
    +                                   CheckUnsignedIntRepresentation))
    +      return true;
    +    // ensure arg integers are 32 bits
    +    if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32))
    +      return true;
    +    // check it wasn't a bool type
    +    QualType ArgTy = TheCall->getArg(0)->getType();
     +    if (auto *VTy = ArgTy->getAs<VectorType>())
    +      ArgTy = VTy->getElementType();
    +    if (ArgTy->isBooleanType()) {
    +      SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(),
    +                   diag::err_builtin_invalid_arg_type)
    +          << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3
    +          << /* no fp */ 0 << TheCall->getArg(0)->getType();
    +      return true;
    +    }
    +
    +    SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy);
    +    break;
    +  }
       }
       return false;
     }
    @@ -3847,12 +3910,15 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
         if (VD->getType()->isHLSLIntangibleType())
           collectResourceBindingsOnVarDecl(VD);
     
    -    if (isResourceRecordTypeOrArrayOf(VD) ||
     -        VD->hasAttr<HLSLVkExtBuiltinInputAttr>()) {
    -      // Make the variable for resources static. The global externally visible
    -      // storage is accessed through the handle, which is a member. The variable
    -      // itself is not externally visible.
     +    if (VD->hasAttr<HLSLVkExtBuiltinInputAttr>())
           VD->setStorageClass(StorageClass::SC_Static);
    +
    +    if (isResourceRecordTypeOrArrayOf(VD) &&
    +        VD->getStorageClass() != SC_Static) {
    +      // Add internal linkage attribute to non-static resource variables. The
    +      // global externally visible storage is accessed through the handle, which
    +      // is a member. The variable itself is not externally visible.
    +      VD->addAttr(InternalLinkageAttr::CreateImplicit(getASTContext()));
         }
     
         // process explicit bindings
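determineActiveSemanticOnScalar now reserves one slot per array element in
ActiveInputSemantics, so a two-element array annotated with semantic FOO3
occupies FOO3 and FOO4, and a later parameter landing on FOO4 is diagnosed as
an overlap. A self-contained sketch of the same reservation scheme (the
function and names here are illustrative, not the patch's API):

    #include "llvm/ADT/StringSet.h"
    #include "llvm/ADT/Twine.h"
    #include <cstdio>
    #include <string>

    // Claim Count consecutive indices starting at Start for semantic Base;
    // report the first index that was already taken.
    bool reserve(llvm::StringSet<> &Taken, llvm::StringRef Base,
                 unsigned Start, unsigned Count) {
      for (unsigned I = 0; I < Count; ++I) {
        std::string Name = (Base + llvm::Twine(Start + I)).str();
        if (!Taken.insert(Name).second) {
          std::printf("overlap on %s\n", Name.c_str());
          return false;
        }
      }
      return true;
    }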
    diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
    index 073010d16b428..cc6ddf568d346 100644
    --- a/clang/lib/Sema/SemaInit.cpp
    +++ b/clang/lib/Sema/SemaInit.cpp
    @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity,
         return;
     
        const ConstantMatrixType *MT = DeclType->castAs<ConstantMatrixType>();
    +
     +  // For HLSL, the error reporting for this case is handled in SemaHLSL's
     +  // initializer list diagnostics, which require getNumInits to equal
     +  // getNumElementsFlattened. In other words, execution should never reach
     +  // this point unless that condition holds.
    +  assert(IList->getNumInits() == MT->getNumElementsFlattened() &&
    +         "Inits must equal Matrix element count");
    +
       QualType ElemTy = MT->getElementType();
    -  const unsigned MaxElts = MT->getNumElementsFlattened();
     
    -  unsigned NumEltsInit = 0;
    +  Index = 0;
       InitializedEntity ElemEnt =
           InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);
     
    -  while (NumEltsInit < MaxElts && Index < IList->getNumInits()) {
    +  while (Index < IList->getNumInits()) {
         // Not a sublist: just consume directly.
    -    ElemEnt.setElementIndex(Index);
    -    CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList,
    +    unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() +
    +                             (Index / MT->getNumRows());
    +    ElemEnt.setElementIndex(ColMajorIndex);
    +    CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList,
                             StructuredIndex);
    -    ++NumEltsInit;
    +    ++Index;
       }
    -
    -  // For HLSL The error for this case is handled in SemaHLSL's initializer
    -  // list diagnostics, That means the execution should require NumEltsInit
    -  // to equal Max initializers. In other words  execution should never
    -  // reach this point if this condition is not true".
    -  assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts");
     }
     
     void InitListChecker::CheckVectorType(const InitializedEntity &Entity,
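The reworked CheckMatrixType walks the flat initializer list and remaps each
source index with (Index % Rows) * Cols + (Index / Rows), so consecutive
initializers fill the matrix column by column. A worked example of that
arithmetic for a 2x2 matrix, where flat indices 0,1,2,3 map to 0,2,1,3:

    #include <cstdio>

    int main() {
      const unsigned Rows = 2, Cols = 2;
      for (unsigned Index = 0; Index < Rows * Cols; ++Index) {
        // Same remapping as ColMajorIndex in the hunk above.
        unsigned ColMajorIndex = (Index % Rows) * Cols + (Index / Rows);
        std::printf("%u -> %u\n", Index, ColMajorIndex);
      }
      return 0;
    }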
    diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
    index 256f9521b3a7e..2ab2fd10a942e 100644
    --- a/clang/lib/Sema/SemaOpenMP.cpp
    +++ b/clang/lib/Sema/SemaOpenMP.cpp
    @@ -16532,6 +16532,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
       case OMPC_holds:
         Res = ActOnOpenMPHoldsClause(Expr, StartLoc, LParenLoc, EndLoc);
         break;
    +  case OMPC_dyn_groupprivate:
       case OMPC_grainsize:
       case OMPC_num_tasks:
       case OMPC_num_threads:
    @@ -16658,6 +16659,8 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
       case OMPC_num_teams:
       case OMPC_thread_limit:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
    +    // TODO: This may need to consider teams too.
         if (Leafs[0] == OMPD_target)
           return OMPD_target;
         break;
    @@ -17316,45 +17319,101 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(
             << getOpenMPClauseNameForDiag(OMPC_default);
         return nullptr;
       }
    -
    -  switch (M) {
    -  case OMP_DEFAULT_none:
    -    DSAStack->setDefaultDSANone(MLoc);
    -    break;
    -  case OMP_DEFAULT_shared:
    -    DSAStack->setDefaultDSAShared(MLoc);
    -    break;
    -  case OMP_DEFAULT_firstprivate:
    -    DSAStack->setDefaultDSAFirstPrivate(MLoc);
    -    break;
    -  case OMP_DEFAULT_private:
    -    DSAStack->setDefaultDSAPrivate(MLoc);
    -    break;
    -  default:
    -    llvm_unreachable("DSA unexpected in OpenMP default clause");
    -  }
    -
    -  switch (VCKind) {
    -  case OMPC_DEFAULT_VC_aggregate:
    -    DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_all:
    -    DSAStack->setDefaultDSAVCAll(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_allocatable:
    -    DSAStack->setDefaultDSAVCAllocatable(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_pointer:
    -    DSAStack->setDefaultDSAVCPointer(VCKindLoc);
    -    break;
    -  case OMPC_DEFAULT_VC_scalar:
    -    DSAStack->setDefaultDSAVCScalar(VCKindLoc);
    -    break;
    -  default:
    +  if (VCKind == OMPC_DEFAULT_VC_unknown) {
         Diag(VCKindLoc, diag::err_omp_default_vc)
             << getOpenMPSimpleClauseTypeName(OMPC_default, unsigned(M));
    +    return nullptr;
       }
     
    +  bool IsTargetDefault =
    +      getLangOpts().OpenMP >= 60 &&
    +      isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective());
    +
    +  // OpenMP 6.0, page 224, lines 3-4 default Clause, Semantics
    +  // If data-sharing-attribute is shared then the clause has no effect
    +  // on a target construct;
    +  if (IsTargetDefault && M == OMP_DEFAULT_shared)
    +    return nullptr;
    +
    +  auto SetDefaultClauseAttrs = [&](llvm::omp::DefaultKind M,
    +                                   OpenMPDefaultClauseVariableCategory VCKind) {
    +    OpenMPDefaultmapClauseModifier DefMapMod;
    +    OpenMPDefaultmapClauseKind DefMapKind;
    +    // default data-sharing-attribute
    +    switch (M) {
    +    case OMP_DEFAULT_none:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_none;
    +      else
    +        DSAStack->setDefaultDSANone(MLoc);
    +      break;
    +    case OMP_DEFAULT_firstprivate:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_firstprivate;
    +      else
    +        DSAStack->setDefaultDSAFirstPrivate(MLoc);
    +      break;
    +    case OMP_DEFAULT_private:
    +      if (IsTargetDefault)
    +        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_private;
    +      else
    +        DSAStack->setDefaultDSAPrivate(MLoc);
    +      break;
    +    case OMP_DEFAULT_shared:
    +      assert(!IsTargetDefault && "DSA shared invalid with target directive");
    +      DSAStack->setDefaultDSAShared(MLoc);
    +      break;
    +    default:
    +      llvm_unreachable("unexpected DSA in OpenMP default clause");
    +    }
    +    // default variable-category
    +    switch (VCKind) {
    +    case OMPC_DEFAULT_VC_aggregate:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_aggregate;
    +      else
    +        DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_pointer:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_pointer;
    +      else
    +        DSAStack->setDefaultDSAVCPointer(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_scalar:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_scalar;
    +      else
    +        DSAStack->setDefaultDSAVCScalar(VCKindLoc);
    +      break;
    +    case OMPC_DEFAULT_VC_all:
    +      if (IsTargetDefault)
    +        DefMapKind = OMPC_DEFAULTMAP_all;
    +      else
    +        DSAStack->setDefaultDSAVCAll(VCKindLoc);
    +      break;
    +    default:
    +      llvm_unreachable("unexpected variable category in OpenMP default clause");
    +    }
    +    // OpenMP 6.0, page 224, lines 4-5 default Clause, Semantics
    +    // otherwise, its effect on a target construct is equivalent to
    +    // specifying the defaultmap clause with the same data-sharing-attribute
    +    // and variable-category.
    +    //
     +    // If earlier than OpenMP 6.0, or this is not a target directive, the
     +    // default DSA was already set above.
    +    if (IsTargetDefault) {
    +      if (DefMapKind == OMPC_DEFAULTMAP_all) {
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_aggregate, MLoc);
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_scalar, MLoc);
    +        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_pointer, MLoc);
    +      } else {
    +        DSAStack->setDefaultDMAAttr(DefMapMod, DefMapKind, MLoc);
    +      }
    +    }
    +  };
    +
    +  SetDefaultClauseAttrs(M, VCKind);
       return new (getASTContext())
           OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc);
     }
    @@ -17705,7 +17764,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
         SourceLocation EndLoc) {
       OMPClause *Res = nullptr;
       switch (Kind) {
    -  case OMPC_schedule:
    +  case OMPC_schedule: {
         enum { Modifier1, Modifier2, ScheduleKind, NumberOfElements };
         assert(Argument.size() == NumberOfElements &&
                ArgumentLoc.size() == NumberOfElements);
    @@ -17716,6 +17775,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
             StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
             ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
         break;
    +  }
       case OMPC_if:
         assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
          Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
    @@ -17771,6 +17831,20 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
             static_cast(Argument.back()), Expr,
             StartLoc, LParenLoc, ArgumentLoc.back(), EndLoc);
         break;
    +  case OMPC_dyn_groupprivate: {
    +    enum { Modifier1, Modifier2, NumberOfElements };
    +    assert(Argument.size() == NumberOfElements &&
    +           ArgumentLoc.size() == NumberOfElements &&
    +           "Modifiers for dyn_groupprivate clause and their locations are "
    +           "expected.");
    +    Res = ActOnOpenMPDynGroupprivateClause(
     +        static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier1]),
     +        static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
     +            Argument[Modifier2]),
    +        Expr, StartLoc, LParenLoc, ArgumentLoc[Modifier1],
    +        ArgumentLoc[Modifier2], EndLoc);
    +    break;
    +  }
       case OMPC_num_threads:
         assert(Argument.size() == 1 && ArgumentLoc.size() == 1 &&
                "Modifier for num_threads clause and its location are expected.");
    @@ -18127,6 +18201,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
       case OMPC_affinity:
       case OMPC_when:
       case OMPC_ompx_dyn_cgroup_mem:
    +  case OMPC_dyn_groupprivate:
       default:
         llvm_unreachable("Clause is not allowed.");
       }
    @@ -25246,6 +25321,49 @@ OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size,
           ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
     }
     
    +OMPClause *SemaOpenMP::ActOnOpenMPDynGroupprivateClause(
    +    OpenMPDynGroupprivateClauseModifier M1,
    +    OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +    SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +    SourceLocation M2Loc, SourceLocation EndLoc) {
    +
    +  if ((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ||
    +      (M2Loc.isValid() && M2 == OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown)) {
    +    std::string Values = getListOfPossibleValues(
    +        OMPC_dyn_groupprivate, /*First=*/0, OMPC_DYN_GROUPPRIVATE_unknown);
    +    Diag((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ? M1Loc
    +                                                                  : M2Loc,
    +         diag::err_omp_unexpected_clause_value)
    +        << Values << getOpenMPClauseName(OMPC_dyn_groupprivate);
    +    return nullptr;
    +  }
    +
    +  Expr *ValExpr = Size;
    +  Stmt *HelperValStmt = nullptr;
    +
    +  // OpenMP [2.5, Restrictions]
    +  //  The dyn_groupprivate expression must evaluate to a positive integer
    +  //  value.
    +  if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_dyn_groupprivate,
    +                                 /*StrictlyPositive=*/false))
    +    return nullptr;
    +
    +  OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
    +  OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
    +      DKind, OMPC_dyn_groupprivate, getLangOpts().OpenMP);
    +  if (CaptureRegion != OMPD_unknown &&
    +      !SemaRef.CurContext->isDependentContext()) {
    +    ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
     +    llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
    +    ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
    +    HelperValStmt = buildPreInits(getASTContext(), Captures);
    +  }
    +
    +  return new (getASTContext()) OMPDynGroupprivateClause(
    +      StartLoc, LParenLoc, EndLoc, ValExpr, HelperValStmt, CaptureRegion, M1,
    +      M1Loc, M2, M2Loc);
    +}
    +
     OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause(
         OpenMPDoacrossClauseModifier DepType, SourceLocation DepLoc,
          SourceLocation ColonLoc, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
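Per the OpenMP 6.0 wording quoted in the ActOnOpenMPDefaultClause hunk, a
default clause on a target construct is now lowered as the equivalent
defaultmap, and default(shared) has no effect there. A usage sketch, assuming
a compiler carrying this patch with -fopenmp-version=60:

    void kernel(int n) {
      int s = 0;
      // Behaves like defaultmap(firstprivate) across the scalar, aggregate,
      // and pointer categories, per the all-category expansion above.
      #pragma omp target default(firstprivate)
      { s += n; } // 's' is firstprivate: the update is not visible afterwards
    }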
    diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
    index f39896336053e..5b3ef1adf38e3 100644
    --- a/clang/lib/Sema/SemaStmt.cpp
    +++ b/clang/lib/Sema/SemaStmt.cpp
    @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope,
                                                 SourceLocation LabelLoc,
                                                 bool IsContinue) {
       assert(Target && "not a named break/continue?");
    +
    +  Target->markUsed(S.Context);
    +
       Scope *Found = nullptr;
       for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) {
         if (Scope->isFunctionScope())
    diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
    index 2cc65935def53..4a9e1bc93b918 100644
    --- a/clang/lib/Sema/SemaTemplate.cpp
    +++ b/clang/lib/Sema/SemaTemplate.cpp
    @@ -949,11 +949,11 @@ static TemplateArgumentLoc translateTemplateArgument(Sema &SemaRef,
     
       switch (Arg.getKind()) {
       case ParsedTemplateArgument::Type: {
    -    TypeSourceInfo *DI;
    -    QualType T = SemaRef.GetTypeFromParser(Arg.getAsType(), &DI);
    -    if (!DI)
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(T, Arg.getNameLoc());
    -    return TemplateArgumentLoc(TemplateArgument(T), DI);
    +    TypeSourceInfo *TSI;
    +    QualType T = SemaRef.GetTypeFromParser(Arg.getAsType(), &TSI);
    +    if (!TSI)
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(T, Arg.getNameLoc());
    +    return TemplateArgumentLoc(TemplateArgument(T), TSI);
       }
     
       case ParsedTemplateArgument::NonType: {
    @@ -3846,13 +3846,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
           // within enable_if in a SFINAE context, dig out the specific
           // enable_if condition that failed and present that instead.
           if (isEnableIfAliasTemplate(AliasTemplate)) {
    -        if (auto DeductionInfo = isSFINAEContext()) {
    -          if (*DeductionInfo &&
    -              (*DeductionInfo)->hasSFINAEDiagnostic() &&
    -              (*DeductionInfo)->peekSFINAEDiagnostic().second.getDiagID() ==
    -                diag::err_typename_nested_not_found_enable_if &&
    -              TemplateArgs[0].getArgument().getKind()
    -                == TemplateArgument::Expression) {
    +        if (SFINAETrap *Trap = getSFINAEContext();
    +            TemplateDeductionInfo *DeductionInfo =
    +                Trap ? Trap->getDeductionInfo() : nullptr) {
    +          if (DeductionInfo->hasSFINAEDiagnostic() &&
    +              DeductionInfo->peekSFINAEDiagnostic().second.getDiagID() ==
    +                  diag::err_typename_nested_not_found_enable_if &&
    +              TemplateArgs[0].getArgument().getKind() ==
    +                  TemplateArgument::Expression) {
                 Expr *FailedCond;
                 std::string FailedDescription;
                 std::tie(FailedCond, FailedDescription) =
    @@ -3861,15 +3862,14 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
                 // Remove the old SFINAE diagnostic.
                 PartialDiagnosticAt OldDiag =
                   {SourceLocation(), PartialDiagnostic::NullDiagnostic()};
    -            (*DeductionInfo)->takeSFINAEDiagnostic(OldDiag);
    +            DeductionInfo->takeSFINAEDiagnostic(OldDiag);
     
                 // Add a new SFINAE diagnostic specifying which condition
                 // failed.
    -            (*DeductionInfo)->addSFINAEDiagnostic(
    -              OldDiag.first,
    -              PDiag(diag::err_typename_nested_not_found_requirement)
    -                << FailedDescription
    -                << FailedCond->getSourceRange());
    +            DeductionInfo->addSFINAEDiagnostic(
    +                OldDiag.first,
    +                PDiag(diag::err_typename_nested_not_found_requirement)
    +                    << FailedDescription << FailedCond->getSourceRange());
               }
             }
           }
    @@ -3955,6 +3955,7 @@ QualType Sema::CheckTemplateIdType(ElaboratedTypeKeyword Keyword,
     
         if (Decl->getSpecializationKind() == TSK_Undeclared &&
             ClassTemplate->getTemplatedDecl()->hasAttrs()) {
    +      NonSFINAEContext _(*this);
           InstantiatingTemplate Inst(*this, TemplateLoc, Decl);
           if (!Inst.isInvalid()) {
             MultiLevelTemplateArgumentList TemplateArgLists(Template,
    @@ -4329,7 +4330,7 @@ void Sema::CheckDeductionGuideTemplate(FunctionTemplateDecl *TD) {
     }
     
     DeclResult Sema::ActOnVarTemplateSpecialization(
    -    Scope *S, Declarator &D, TypeSourceInfo *DI, LookupResult &Previous,
    +    Scope *S, Declarator &D, TypeSourceInfo *TSI, LookupResult &Previous,
         SourceLocation TemplateKWLoc, TemplateParameterList *TemplateParams,
         StorageClass SC, bool IsPartialSpecialization) {
       // D must be variable template id.
    @@ -4455,8 +4456,8 @@ DeclResult Sema::ActOnVarTemplateSpecialization(
         VarTemplatePartialSpecializationDecl *Partial =
             VarTemplatePartialSpecializationDecl::Create(
                 Context, VarTemplate->getDeclContext(), TemplateKWLoc,
    -            TemplateNameLoc, TemplateParams, VarTemplate, DI->getType(), DI, SC,
    -            CTAI.CanonicalConverted);
    +            TemplateNameLoc, TemplateParams, VarTemplate, TSI->getType(), TSI,
    +            SC, CTAI.CanonicalConverted);
         Partial->setTemplateArgsAsWritten(TemplateArgs);
     
         if (!PrevPartial)
    @@ -4474,7 +4475,7 @@ DeclResult Sema::ActOnVarTemplateSpecialization(
         // this explicit specialization or friend declaration.
         Specialization = VarTemplateSpecializationDecl::Create(
             Context, VarTemplate->getDeclContext(), TemplateKWLoc, TemplateNameLoc,
    -        VarTemplate, DI->getType(), DI, SC, CTAI.CanonicalConverted);
    +        VarTemplate, TSI->getType(), TSI, SC, CTAI.CanonicalConverted);
         Specialization->setTemplateArgsAsWritten(TemplateArgs);
     
         if (!PrevDecl)
    @@ -5565,12 +5566,11 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc,
     
         auto checkExpr = [&](Expr *E) -> Expr * {
           TemplateArgument SugaredResult, CanonicalResult;
    -      unsigned CurSFINAEErrors = NumSFINAEErrors;
           ExprResult Res = CheckTemplateArgument(
               NTTP, NTTPType, E, SugaredResult, CanonicalResult,
               /*StrictCheck=*/CTAI.MatchingTTP || CTAI.PartialOrdering, CTAK);
           // If the current template argument causes an error, give up now.
    -      if (Res.isInvalid() || CurSFINAEErrors < NumSFINAEErrors)
    +      if (Res.isInvalid())
             return nullptr;
           CTAI.SugaredConverted.push_back(SugaredResult);
           CTAI.CanonicalConverted.push_back(CanonicalResult);
    diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
    index 6964242b39d6e..a287319cc4f88 100644
    --- a/clang/lib/Sema/SemaTemplateDeduction.cpp
    +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
    @@ -3239,10 +3239,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
          ArrayRef<TemplateArgument> Ps, ArrayRef<TemplateArgument> As,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info, bool CopyDeducedArgs) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Entity));
     
       // C++ [temp.deduct.type]p2:
    @@ -3380,10 +3376,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
         Sema &S, TemplateDecl *TD,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(TD));
     
       // C++ [temp.deduct.type]p2:
    @@ -3423,7 +3415,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           S, Sema::ExpressionEvaluationContext::Unevaluated);
    -  Sema::SFINAETrap Trap(S);
    +  Sema::SFINAETrap Trap(S, Info);
     
       // This deduction has no relation to any outer instantiation we might be
       // performing.
    @@ -3441,8 +3433,7 @@ DeduceTemplateArguments(Sema &S, T *Partial,
         return Result;
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    -  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs,
    -                                   Info);
    +  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), Partial, DeducedArgs);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3497,7 +3488,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    +  SFINAETrap Trap(*this, Info);
     
       // This deduction has no relation to any outer instantiation we might be
       // performing.
    @@ -3514,7 +3505,7 @@ Sema::DeduceTemplateArgumentsFromType(TemplateDecl *TD, QualType FromType,
       }
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    -  InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs, Info);
    +  InstantiatingTemplate Inst(*this, Info.getLocation(), TD, DeducedArgs);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3558,6 +3549,9 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
          SmallVectorImpl<QualType> &ParamTypes, QualType *FunctionType,
         TemplateDeductionInfo &Info) {
    +  assert(isSFINAEContext());
    +  assert(isUnevaluatedContext());
    +
       FunctionDecl *Function = FunctionTemplate->getTemplatedDecl();
       TemplateParameterList *TemplateParams
         = FunctionTemplate->getTemplateParameters();
    @@ -3573,11 +3567,6 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
         return TemplateDeductionResult::Success;
       }
     
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       // C++ [temp.arg.explicit]p3:
       //   Template arguments that are present shall be specified in the
       //   declaration order of their corresponding template-parameters. The
    @@ -3590,7 +3579,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
        SmallVector<TemplateArgument, 4> DeducedArgs;
       InstantiatingTemplate Inst(
           *this, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      CodeSynthesisContext::ExplicitTemplateArgumentSubstitution, Info);
    +      CodeSynthesisContext::ExplicitTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -3598,8 +3587,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
       if (CheckTemplateArgumentList(FunctionTemplate, SourceLocation(),
                                     ExplicitTemplateArgs, /*DefaultArgs=*/{},
                                     /*PartialTemplateArgs=*/true, CTAI,
    -                                /*UpdateArgsWithConversions=*/false) ||
    -      Trap.hasErrorOccurred()) {
    +                                /*UpdateArgsWithConversions=*/false)) {
         unsigned Index = CTAI.SugaredConverted.size();
         if (Index >= TemplateParams->size())
           return TemplateDeductionResult::SubstitutionFailure;
    @@ -3688,7 +3676,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
         ResultType =
             SubstType(Proto->getReturnType(), MLTAL,
                       Function->getTypeSpecStartLoc(), Function->getDeclName());
    -    if (ResultType.isNull() || Trap.hasErrorOccurred())
    +    if (ResultType.isNull())
           return TemplateDeductionResult::SubstitutionFailure;
         // CUDA: Kernel function must have 'void' return type.
         if (getLangOpts().CUDA)
    @@ -3714,7 +3702,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments(
                                           Function->getLocation(),
                                           Function->getDeclName(),
                                           EPI);
    -    if (FunctionType->isNull() || Trap.hasErrorOccurred())
    +    if (FunctionType->isNull())
           return TemplateDeductionResult::SubstitutionFailure;
       }
     
    @@ -3912,12 +3900,15 @@ static TemplateDeductionResult instantiateExplicitSpecifierDeferred(
       if (!ExplicitExpr->isValueDependent())
         return TemplateDeductionResult::Success;
     
     +  // By this point, FinishTemplateArgumentDeduction will have reverted to a
     +  // regular non-SFINAE template instantiation context, so set up a new
     +  // SFINAE context.
       Sema::InstantiatingTemplate Inst(
           S, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
    -  Sema::SFINAETrap Trap(S);
    +  Sema::SFINAETrap Trap(S, Info);
       const ExplicitSpecifier InstantiatedES =
           S.instantiateExplicitSpecifier(SubstArgs, ES);
       if (InstantiatedES.isInvalid() || Trap.hasErrorOccurred()) {
    @@ -3937,17 +3928,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
         bool PartialOverloading, bool PartialOrdering,
         bool ForOverloadSetAddressResolution,
          llvm::function_ref<bool(bool)> CheckNonDependent) {
    -  // Unevaluated SFINAE context.
    -  EnterExpressionEvaluationContext Unevaluated(
    -      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       // Enter a new template instantiation context while we instantiate the
       // actual function declaration.
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
       InstantiatingTemplate Inst(
           *this, Info.getLocation(), FunctionTemplate, DeducedArgs,
    -      CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return TemplateDeductionResult::InstantiationDepth;
     
    @@ -4030,18 +4016,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
       // If the template argument list is owned by the function template
       // specialization, release it.
       if (Specialization->getTemplateSpecializationArgs() ==
    -          CanonicalDeducedArgumentList &&
    -      !Trap.hasErrorOccurred())
    +      CanonicalDeducedArgumentList)
         Info.takeCanonical();
     
    -  // There may have been an error that did not prevent us from constructing a
    -  // declaration. Mark the declaration invalid and return with a substitution
    -  // failure.
    -  if (Trap.hasErrorOccurred()) {
    -    Specialization->setInvalidDecl(true);
    -    return TemplateDeductionResult::SubstitutionFailure;
    -  }
    -
       // C++2a [temp.deduct]p5
       //   [...] When all template arguments have been deduced [...] all uses of
       //   template parameters [...] are replaced with the corresponding deduced
    @@ -4553,6 +4530,10 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
           return TemplateDeductionResult::TooManyArguments;
       }
     
    +  EnterExpressionEvaluationContext Unevaluated(
    +      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(*this, Info);
    +
       // The types of the parameters from which we will perform template argument
       // deduction.
       LocalInstantiationScope InstScope(*this);
    @@ -4570,6 +4551,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         });
         if (Result != TemplateDeductionResult::Success)
           return Result;
    +    if (Trap.hasErrorOccurred())
    +      return TemplateDeductionResult::SubstitutionFailure;
     
         NumExplicitlySpecified = Deduced.size();
       } else {
    @@ -4743,6 +4726,11 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
                                        OnlyInitializeNonUserDefinedConversions);
             });
       });
    +  if (Trap.hasErrorOccurred()) {
    +    if (Specialization)
    +      Specialization->setInvalidDecl(true);
    +    return TemplateDeductionResult::SubstitutionFailure;
    +  }
       return Result;
     }
     
    @@ -4795,6 +4783,14 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         = FunctionTemplate->getTemplateParameters();
       QualType FunctionType = Function->getType();
     
    +  bool PotentiallyEvaluated =
    +      currentEvaluationContext().isPotentiallyEvaluated();
    +
    +  // Unevaluated SFINAE context.
    +  EnterExpressionEvaluationContext Unevaluated(
    +      *this, Sema::ExpressionEvaluationContext::Unevaluated);
    +  SFINAETrap Trap(*this, Info);
    +
       // Substitute any explicit template arguments.
       LocalInstantiationScope InstScope(*this);
        SmallVector<DeducedTemplateArgument, 4> Deduced;
    @@ -4809,6 +4805,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         });
         if (Result != TemplateDeductionResult::Success)
           return Result;
    +    if (Trap.hasErrorOccurred())
    +      return TemplateDeductionResult::SubstitutionFailure;
     
         NumExplicitlySpecified = Deduced.size();
       }
    @@ -4820,11 +4818,6 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
         ArgFunctionType = adjustCCAndNoReturn(ArgFunctionType, FunctionType,
                                               /*AdjustExceptionSpec*/false);
     
    -  // Unevaluated SFINAE context.
    -  std::optional Unevaluated(
    -      std::in_place, *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    -
       Deduced.resize(TemplateParams->size());
     
       // If the function has a deduced return type, substitute it for a dependent
    @@ -4865,14 +4858,12 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
           DeduceReturnType(Specialization, Info.getLocation(), false))
         return TemplateDeductionResult::MiscellaneousDeductionFailure;
     
    -  Unevaluated = std::nullopt;
       // [C++26][expr.const]/p17
       // An expression or conversion is immediate-escalating if it is not initially
       // in an immediate function context and it is [...]
       // a potentially-evaluated id-expression that denotes an immediate function.
       if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
    -      Specialization->isImmediateEscalating() &&
    -      currentEvaluationContext().isPotentiallyEvaluated() &&
    +      Specialization->isImmediateEscalating() && PotentiallyEvaluated &&
           CheckIfFunctionSpecializationIsImmediate(Specialization,
                                                    Info.getLocation()))
         return TemplateDeductionResult::MiscellaneousDeductionFailure;
    @@ -4975,7 +4966,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
       // Unevaluated SFINAE context.
       EnterExpressionEvaluationContext Unevaluated(
           *this, Sema::ExpressionEvaluationContext::Unevaluated);
    -  SFINAETrap Trap(*this);
    +  SFINAETrap Trap(*this, Info);
     
       // C++ [temp.deduct.conv]p1:
       //   Template argument deduction is done by comparing the return
    @@ -5614,10 +5605,6 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
         Sema &S, FunctionTemplateDecl *FTD,
          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
         TemplateDeductionInfo &Info, T &&CheckDeductionConsistency) {
    -  EnterExpressionEvaluationContext Unevaluated(
    -      S, Sema::ExpressionEvaluationContext::Unevaluated);
    -  Sema::SFINAETrap Trap(S);
    -
       Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(FTD));
     
       // C++26 [temp.deduct.type]p2:
    @@ -5645,13 +5632,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction(
       // and verify that the instantiated argument is both valid
       // and equivalent to the parameter.
       LocalInstantiationScope InstScope(S);
    -
    -  if (auto TDR = CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
    -      TDR != TemplateDeductionResult::Success)
    -    return TDR;
    -
    -  return Trap.hasErrorOccurred() ? TemplateDeductionResult::SubstitutionFailure
    -                                 : TemplateDeductionResult::Success;
    +  return CheckDeductionConsistency(S, FTD, CTAI.SugaredConverted);
     }
     
     /// Determine whether the function template \p FT1 is at least as
    @@ -5717,9 +5698,12 @@ static bool isAtLeastAsSpecializedAs(
       }
     
        SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    +  EnterExpressionEvaluationContext Unevaluated(
    +      S, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(S, Info);
       Sema::InstantiatingTemplate Inst(
           S, Info.getLocation(), FT2, DeducedArgs,
    -      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution, Info);
    +      Sema::CodeSynthesisContext::DeducedTemplateArgumentSubstitution);
       if (Inst.isInvalid())
         return false;
     
    @@ -5765,7 +5749,7 @@ static bool isAtLeastAsSpecializedAs(
                       });
                 }) == TemplateDeductionResult::Success;
       });
    -  if (!AtLeastAsSpecialized)
    +  if (!AtLeastAsSpecialized || Trap.hasErrorOccurred())
         return false;
     
       // C++0x [temp.deduct.partial]p11:
    @@ -6241,10 +6225,11 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
               /*HasDeducedAnyParam=*/nullptr) != TemplateDeductionResult::Success)
         return false;
     
     -  SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(),
     -                                               Deduced.end());
    -  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs,
    -                                   Info);
     +  SmallVector<TemplateArgument, 4> DeducedArgs(Deduced.begin(), Deduced.end());
    +  EnterExpressionEvaluationContext Unevaluated(
    +      S, Sema::ExpressionEvaluationContext::Unevaluated);
    +  Sema::SFINAETrap Trap(S, Info);
    +  Sema::InstantiatingTemplate Inst(S, Info.getLocation(), P2, DeducedArgs);
       if (Inst.isInvalid())
         return false;
     
    @@ -6252,8 +6237,6 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
       ArrayRef<TemplateArgument>
           Ps = cast<TemplateSpecializationType>(T2)->template_arguments(),
           As = cast<TemplateSpecializationType>(T1)->template_arguments();
     
    -  Sema::SFINAETrap Trap(S);
    -
       TemplateDeductionResult Result;
       S.runWithSufficientStackSpace(Info.getLocation(), [&] {
         Result = ::FinishTemplateArgumentDeduction(
    @@ -6261,14 +6244,7 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2,
             /*IsPartialOrdering=*/true, Ps, As, Deduced, Info,
             /*CopyDeducedArgs=*/false);
       });
    -
    -  if (Result != TemplateDeductionResult::Success)
    -    return false;
    -
    -  if (Trap.hasErrorOccurred())
    -    return false;
    -
    -  return true;
    +  return Result == TemplateDeductionResult::Success && !Trap.hasErrorOccurred();
     }
     
     namespace {
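// Editorial note (not part of the patch): the hunks above make the callers of
// FinishTemplateArgumentDeduction own the SFINAETrap and fold the trap check
// into the returned result. A minimal sketch of the source-level behavior this
// machinery implements: a substitution failure in the immediate context must
// discard a candidate quietly rather than produce a hard error.
template <typename T> typename T::type pick(T); // T = int has no ::type
template <typename T> void pick(T);             // viable fallback
void sfinae_demo() {
  pick(42); // the first candidate's substitution failure is trapped;
            // the second candidate is selected.
}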
    diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    index ad50600f6399c..bfb10665c25b1 100644
    --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
    @@ -632,41 +632,42 @@ struct ConvertConstructorToDeductionGuideTransform {
           ParmVarDecl *OldParam, MultiLevelTemplateArgumentList &Args,
        llvm::SmallVectorImpl<TypedefNameDecl *> &MaterializedTypedefs,
           bool TransformingOuterPatterns) {
    -    TypeSourceInfo *OldDI = OldParam->getTypeSourceInfo();
    -    TypeSourceInfo *NewDI;
     -    if (auto PackTL = OldDI->getTypeLoc().getAs<PackExpansionTypeLoc>()) {
    +    TypeSourceInfo *OldTSI = OldParam->getTypeSourceInfo();
    +    TypeSourceInfo *NewTSI;
     +    if (auto PackTL = OldTSI->getTypeLoc().getAs<PackExpansionTypeLoc>()) {
           // Expand out the one and only element in each inner pack.
           Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, 0u);
    -      NewDI =
    +      NewTSI =
               SemaRef.SubstType(PackTL.getPatternLoc(), Args,
                                 OldParam->getLocation(), OldParam->getDeclName());
    -      if (!NewDI)
    +      if (!NewTSI)
             return nullptr;
    -      NewDI =
    -          SemaRef.CheckPackExpansion(NewDI, PackTL.getEllipsisLoc(),
    +      NewTSI =
    +          SemaRef.CheckPackExpansion(NewTSI, PackTL.getEllipsisLoc(),
                                          PackTL.getTypePtr()->getNumExpansions());
         } else
    -      NewDI = SemaRef.SubstType(OldDI, Args, OldParam->getLocation(),
    -                                OldParam->getDeclName());
    -    if (!NewDI)
    +      NewTSI = SemaRef.SubstType(OldTSI, Args, OldParam->getLocation(),
    +                                 OldParam->getDeclName());
    +    if (!NewTSI)
           return nullptr;
     
         // Extract the type. This (for instance) replaces references to typedef
         // members of the current instantiations with the definitions of those
         // typedefs, avoiding triggering instantiation of the deduced type during
         // deduction.
    -    NewDI = ExtractTypeForDeductionGuide(
    -                SemaRef, MaterializedTypedefs, NestedPattern,
    -                TransformingOuterPatterns ? &Args : nullptr)
    -                .transform(NewDI);
    -
    +    NewTSI = ExtractTypeForDeductionGuide(
    +                 SemaRef, MaterializedTypedefs, NestedPattern,
    +                 TransformingOuterPatterns ? &Args : nullptr)
    +                 .transform(NewTSI);
    +    if (!NewTSI)
    +      return nullptr;
         // Resolving a wording defect, we also inherit default arguments from the
         // constructor.
         ExprResult NewDefArg;
         if (OldParam->hasDefaultArg()) {
           // We don't care what the value is (we won't use it); just create a
           // placeholder to indicate there is a default argument.
    -      QualType ParamTy = NewDI->getType();
    +      QualType ParamTy = NewTSI->getType();
           NewDefArg = new (SemaRef.Context)
               OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(),
                               ParamTy.getNonLValueExprType(SemaRef.Context),
    @@ -675,13 +676,13 @@ struct ConvertConstructorToDeductionGuideTransform {
                                                                  : VK_PRValue);
         }
         // Handle arrays and functions decay.
    -    auto NewType = NewDI->getType();
    +    auto NewType = NewTSI->getType();
         if (NewType->isArrayType() || NewType->isFunctionType())
           NewType = SemaRef.Context.getDecayedType(NewType);
     
         ParmVarDecl *NewParam = ParmVarDecl::Create(
             SemaRef.Context, DC, OldParam->getInnerLocStart(),
    -        OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewDI,
    +        OldParam->getLocation(), OldParam->getIdentifier(), NewType, NewTSI,
             OldParam->getStorageClass(), NewDefArg.get());
         NewParam->setScopeInfo(OldParam->getFunctionScopeDepth(),
                                OldParam->getFunctionScopeIndex());
    @@ -1024,6 +1025,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
                                     TypeAliasTemplateDecl *AliasTemplate,
                                     FunctionTemplateDecl *F, SourceLocation Loc) {
       LocalInstantiationScope Scope(SemaRef);
    +  Sema::NonSFINAEContext _1(SemaRef);
       Sema::InstantiatingTemplate BuildingDeductionGuides(
           SemaRef, AliasTemplate->getLocation(), F,
           Sema::InstantiatingTemplate::BuildingDeductionGuidesTag{});
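// Editorial note (not part of the patch): BuildDeductionGuideForTypeAlias,
// now entered under a NonSFINAEContext above, runs for class template
// argument deduction through an alias template (C++20, P1814R0). A minimal
// usage sketch:
template <typename T, typename U> struct Pair {
  Pair(T, U) {}
};
template <typename V> using VWithInt = Pair<V, int>;
VWithInt p(1.5, 2); // guides are synthesized from Pair's constructors and
                    // rewritten in terms of V; V deduces to double.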
    diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
    index 7f858050db13e..35205f40cbcef 100644
    --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
    +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
    @@ -606,8 +606,7 @@ bool Sema::CodeSynthesisContext::isInstantiationRecord() const {
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind,
         SourceLocation PointOfInstantiation, SourceRange InstantiationRange,
     -    Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo *DeductionInfo)
     +    Decl *Entity, NamedDecl *Template, ArrayRef<TemplateArgument> TemplateArgs)
         : SemaRef(SemaRef) {
       // Don't allow further instantiation if a fatal error and an uncompilable
       // error have occurred. Any diagnostics we might have raised will not be
    @@ -625,7 +624,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
       Inst.Template = Template;
       Inst.TemplateArgs = TemplateArgs.data();
       Inst.NumTemplateArgs = TemplateArgs.size();
    -  Inst.DeductionInfo = DeductionInfo;
       Inst.InstantiationRange = InstantiationRange;
       Inst.InConstraintSubstitution =
           Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
    @@ -671,48 +669,40 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         FunctionTemplateDecl *FunctionTemplate,
         ArrayRef<TemplateArgument> TemplateArgs,
    -    CodeSynthesisContext::SynthesisKind Kind,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    CodeSynthesisContext::SynthesisKind Kind, SourceRange InstantiationRange)
         : InstantiatingTemplate(SemaRef, Kind, PointOfInstantiation,
                                 InstantiationRange, FunctionTemplate, nullptr,
    -                            TemplateArgs, &DeductionInfo) {
    +                            TemplateArgs) {
       assert(Kind == CodeSynthesisContext::ExplicitTemplateArgumentSubstitution ||
              Kind == CodeSynthesisContext::DeducedTemplateArgumentSubstitution ||
              Kind == CodeSynthesisContext::BuildingDeductionGuides);
     }
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
    -    Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    TemplateDecl *Template,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    Sema &SemaRef, SourceLocation PointOfInstantiation, TemplateDecl *Template,
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, Template, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         ClassTemplatePartialSpecializationDecl *PartialSpec,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
         VarTemplatePartialSpecializationDecl *PartialSpec,
     -    ArrayRef<TemplateArgument> TemplateArgs,
     -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
     +    ArrayRef<TemplateArgument> TemplateArgs, SourceRange InstantiationRange)
         : InstantiatingTemplate(
    -          SemaRef,
    -          CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
    +          SemaRef, CodeSynthesisContext::DeducedTemplateArgumentSubstitution,
               PointOfInstantiation, InstantiationRange, PartialSpec, nullptr,
    -          TemplateArgs, &DeductionInfo) {}
    +          TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation, ParmVarDecl *Param,
    @@ -763,12 +753,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    concepts::Requirement *Req, sema::TemplateDeductionInfo &DeductionInfo,
    -    SourceRange InstantiationRange)
    +    concepts::Requirement *Req, SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::RequirementInstantiation,
               PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
    -          /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
    +          /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -781,11 +770,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation, const RequiresExpr *RE,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::RequirementParameterInstantiation,
               PointOfInstantiation, InstantiationRange, /*Entity=*/nullptr,
    -          /*Template=*/nullptr, /*TemplateArgs=*/{}, &DeductionInfo) {}
    +          /*Template=*/nullptr, /*TemplateArgs=*/{}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -797,13 +786,11 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
               TemplateArgs) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
    -    Sema &SemaRef, SourceLocation PointOfInstantiation,
    -    ConstraintSubstitution, NamedDecl *Template,
    -    sema::TemplateDeductionInfo &DeductionInfo, SourceRange InstantiationRange)
    +    Sema &SemaRef, SourceLocation PointOfInstantiation, ConstraintSubstitution,
    +    NamedDecl *Template, SourceRange InstantiationRange)
         : InstantiatingTemplate(
               SemaRef, CodeSynthesisContext::ConstraintSubstitution,
    -          PointOfInstantiation, InstantiationRange, Template, nullptr,
    -          {}, &DeductionInfo) {}
    +          PointOfInstantiation, InstantiationRange, Template, nullptr, {}) {}
     
     Sema::InstantiatingTemplate::InstantiatingTemplate(
         Sema &SemaRef, SourceLocation PointOfInstantiation,
    @@ -835,9 +822,6 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
                                 ArgLoc, InstantiationRange, PArg) {}
     
     bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) {
    -  Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext;
    -  InNonInstantiationSFINAEContext = false;
    -
       if (!Ctx.isInstantiationRecord()) {
         ++NonInstantiationEntries;
       } else {
    @@ -871,8 +855,6 @@ void Sema::popCodeSynthesisContext() {
         --NonInstantiationEntries;
       }
     
    -  InNonInstantiationSFINAEContext = Active.SavedInNonInstantiationSFINAEContext;
    -
       // Name lookup no longer looks in this template's defining module.
       assert(CodeSynthesisContexts.size() >=
                  CodeSynthesisContextLookupModules.size() &&
    @@ -1282,93 +1264,6 @@ void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) {
       }
     }
     
     -std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
    -  if (InNonInstantiationSFINAEContext)
     -    return std::optional<TemplateDeductionInfo *>(nullptr);
    -
     -  for (SmallVectorImpl<CodeSynthesisContext>::const_reverse_iterator
    -         Active = CodeSynthesisContexts.rbegin(),
    -         ActiveEnd = CodeSynthesisContexts.rend();
    -       Active != ActiveEnd;
    -       ++Active)
    -  {
    -    switch (Active->Kind) {
    -    case CodeSynthesisContext::TypeAliasTemplateInstantiation:
    -      // An instantiation of an alias template may or may not be a SFINAE
    -      // context, depending on what else is on the stack.
     -      if (isa<TypeAliasTemplateDecl>(Active->Entity))
    -        break;
    -      [[fallthrough]];
    -    case CodeSynthesisContext::TemplateInstantiation:
    -    case CodeSynthesisContext::DefaultFunctionArgumentInstantiation:
    -    case CodeSynthesisContext::ExceptionSpecInstantiation:
    -    case CodeSynthesisContext::ConstraintsCheck:
    -    case CodeSynthesisContext::ParameterMappingSubstitution:
    -    case CodeSynthesisContext::ConstraintNormalization:
    -    case CodeSynthesisContext::NestedRequirementConstraintsCheck:
    -      // This is a template instantiation, so there is no SFINAE.
    -      return std::nullopt;
    -    case CodeSynthesisContext::LambdaExpressionSubstitution:
    -      // [temp.deduct]p9
    -      // A lambda-expression appearing in a function type or a template
    -      // parameter is not considered part of the immediate context for the
    -      // purposes of template argument deduction.
    -      // CWG2672: A lambda-expression body is never in the immediate context.
    -      return std::nullopt;
    -
    -    case CodeSynthesisContext::DefaultTemplateArgumentInstantiation:
    -    case CodeSynthesisContext::PriorTemplateArgumentSubstitution:
    -    case CodeSynthesisContext::DefaultTemplateArgumentChecking:
    -    case CodeSynthesisContext::RewritingOperatorAsSpaceship:
    -    case CodeSynthesisContext::PartialOrderingTTP:
    -      // A default template argument instantiation and substitution into
    -      // template parameters with arguments for prior parameters may or may
    -      // not be a SFINAE context; look further up the stack.
    -      break;
    -
    -    case CodeSynthesisContext::ExplicitTemplateArgumentSubstitution:
    -    case CodeSynthesisContext::DeducedTemplateArgumentSubstitution:
     -      // We're either substituting explicitly-specified template arguments
     -      // or deduced template arguments. SFINAE applies unless we are in a
     -      // lambda body, see [temp.deduct]p9.
    -    case CodeSynthesisContext::ConstraintSubstitution:
    -    case CodeSynthesisContext::RequirementInstantiation:
    -    case CodeSynthesisContext::RequirementParameterInstantiation:
    -      // SFINAE always applies in a constraint expression or a requirement
    -      // in a requires expression.
    -      assert(Active->DeductionInfo && "Missing deduction info pointer");
    -      return Active->DeductionInfo;
    -
    -    case CodeSynthesisContext::DeclaringSpecialMember:
    -    case CodeSynthesisContext::DeclaringImplicitEqualityComparison:
    -    case CodeSynthesisContext::DefiningSynthesizedFunction:
    -    case CodeSynthesisContext::InitializingStructuredBinding:
    -    case CodeSynthesisContext::MarkingClassDllexported:
    -    case CodeSynthesisContext::BuildingBuiltinDumpStructCall:
    -    case CodeSynthesisContext::BuildingDeductionGuides:
    -      // This happens in a context unrelated to template instantiation, so
    -      // there is no SFINAE.
    -      return std::nullopt;
    -
    -    case CodeSynthesisContext::ExceptionSpecEvaluation:
    -      // FIXME: This should not be treated as a SFINAE context, because
    -      // we will cache an incorrect exception specification. However, clang
     -      // bootstrap relies on this! See PR31692.
    -      break;
    -
    -    case CodeSynthesisContext::Memoization:
    -      break;
    -    }
    -
    -    // The inner context was transparent for SFINAE. If it occurred within a
    -    // non-instantiation SFINAE context, then SFINAE applies.
    -    if (Active->SavedInNonInstantiationSFINAEContext)
     -      return std::optional<TemplateDeductionInfo *>(nullptr);
    -  }
    -
    -  return std::nullopt;
    -}
    -
     //===----------------------------------------------------------------------===/
     // Template Instantiation for Types
     //===----------------------------------------------------------------------===/
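// Editorial sketch (not part of the patch; the names below are hypothetical):
// the isSFINAEContext() stack walk removed above is replaced by explicit
// registration -- SFINAETrap(S, Info) records the active deduction info and
// NonSFINAEContext clears it. The replacement boils down to a save/set/restore
// RAII pattern, roughly:
struct SemaLike {
  void *CurrentSFINAEContext = nullptr; // stand-in for the new Sema member
};
class ScopedSFINAEContext {
  SemaLike &S;
  void *Saved;

public:
  ScopedSFINAEContext(SemaLike &S, void *Info)
      : S(S), Saved(S.CurrentSFINAEContext) {
    S.CurrentSFINAEContext = Info; // registered for the trap's lifetime
  }
  ~ScopedSFINAEContext() { S.CurrentSFINAEContext = Saved; } // restore on exit
};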
    @@ -2674,10 +2569,9 @@ ExprResult TemplateInstantiator::TransformRequiresTypeParams(
         Sema::ExtParameterInfoBuilder &PInfos) {
     
       TemplateDeductionInfo Info(KWLoc);
    -  Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc,
    -                                       RE, Info,
    +  Sema::InstantiatingTemplate TypeInst(SemaRef, KWLoc, RE,
                                            SourceRange{KWLoc, RBraceLoc});
    -  Sema::SFINAETrap Trap(SemaRef);
    +  Sema::SFINAETrap Trap(SemaRef, Info);
     
       unsigned ErrorIdx;
       if (getDerived().TransformFunctionTypeParams(
    @@ -2709,10 +2603,10 @@ TemplateInstantiator::TransformTypeRequirement(concepts::TypeRequirement *Req) {
         return Req;
       }
     
    -  Sema::SFINAETrap Trap(SemaRef);
       TemplateDeductionInfo Info(Req->getType()->getTypeLoc().getBeginLoc());
    -  Sema::InstantiatingTemplate TypeInst(SemaRef,
    -      Req->getType()->getTypeLoc().getBeginLoc(), Req, Info,
    +  Sema::SFINAETrap Trap(SemaRef, Info);
    +  Sema::InstantiatingTemplate TypeInst(
    +      SemaRef, Req->getType()->getTypeLoc().getBeginLoc(), Req,
           Req->getType()->getTypeLoc().getSourceRange());
       if (TypeInst.isInvalid())
         return nullptr;
    @@ -2730,8 +2624,6 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
       if (!Req->isDependent() && !AlwaysRebuild())
         return Req;
     
    -  Sema::SFINAETrap Trap(SemaRef);
    -
       llvm::PointerUnion<Expr *, concepts::Requirement::SubstitutionDiagnostic *>
           TransExpr;
       if (Req->isExprSubstitutionFailure())
    @@ -2739,7 +2631,8 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
       else {
         Expr *E = Req->getExpr();
         TemplateDeductionInfo Info(E->getBeginLoc());
    -    Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req, Info,
    +    Sema::SFINAETrap Trap(SemaRef, Info);
    +    Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req,
                                              E->getSourceRange());
         if (ExprInst.isInvalid())
           return nullptr;
    @@ -2765,8 +2658,9 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
         TemplateParameterList *OrigTPL =
             RetReq.getTypeConstraintTemplateParameterList();
         TemplateDeductionInfo Info(OrigTPL->getTemplateLoc());
    -    Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(),
    -                                        Req, Info, OrigTPL->getSourceRange());
    +    Sema::SFINAETrap Trap(SemaRef, Info);
    +    Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(), Req,
    +                                        OrigTPL->getSourceRange());
         if (TPLInst.isInvalid())
           return nullptr;
         TemplateParameterList *TPL = TransformTemplateParameterList(OrigTPL);
    @@ -2830,11 +2724,9 @@ TemplateInstantiator::TransformNestedRequirement(
     
       bool Success;
       Expr *NewConstraint;
    -  TemplateDeductionInfo Info(Constraint->getBeginLoc());
       {
         EnterExpressionEvaluationContext ContextRAII(
             SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
    -
         Sema::InstantiatingTemplate ConstrInst(
             SemaRef, Constraint->getBeginLoc(), Req,
             Sema::InstantiatingTemplate::ConstraintsCheck(),
    @@ -2843,16 +2735,10 @@ TemplateInstantiator::TransformNestedRequirement(
         if (ConstrInst.isInvalid())
           return nullptr;
     
    -    Sema::SFINAETrap Trap(SemaRef);
    -
         Success = !SemaRef.CheckConstraintSatisfaction(
             Req, AssociatedConstraint(Constraint, SemaRef.ArgPackSubstIndex),
             TemplateArgs, Constraint->getSourceRange(), Satisfaction,
             /*TopLevelConceptId=*/nullptr, &NewConstraint);
    -
    -    assert((!Success || !Trap.hasErrorOccurred()) &&
    -           "Substitution failures must be handled "
    -           "by CheckConstraintSatisfaction.");
       }
     
       if (!Success || Satisfaction.HasSubstitutionFailure())
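// Editorial note (not part of the patch): TransformNestedRequirement above
// handles `requires` clauses nested inside a requires-expression; substitution
// failure there makes the constraint unsatisfied instead of ill-formed. A
// minimal C++20 sketch:
template <typename T>
concept SmallValue = requires {
  typename T::value_type;                                  // type requirement
  requires sizeof(typename T::value_type) <= sizeof(long); // nested requirement
};
static_assert(!SmallValue<int>); // int has no ::value_type; the failure is
                                 // absorbed into "not satisfied".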
    @@ -3156,25 +3042,25 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
                            const MultiLevelTemplateArgumentList &TemplateArgs,
                            int indexAdjustment, UnsignedOrNone NumExpansions,
                            bool ExpectParameterPack, bool EvaluateConstraint) {
    -  TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo();
    -  TypeSourceInfo *NewDI = nullptr;
    +  TypeSourceInfo *OldTSI = OldParm->getTypeSourceInfo();
    +  TypeSourceInfo *NewTSI = nullptr;
     
    -  TypeLoc OldTL = OldDI->getTypeLoc();
    +  TypeLoc OldTL = OldTSI->getTypeLoc();
       if (PackExpansionTypeLoc ExpansionTL = OldTL.getAs<PackExpansionTypeLoc>()) {
     
         // We have a function parameter pack. Substitute into the pattern of the
         // expansion.
    -    NewDI = SubstType(ExpansionTL.getPatternLoc(), TemplateArgs,
    -                      OldParm->getLocation(), OldParm->getDeclName());
    -    if (!NewDI)
    +    NewTSI = SubstType(ExpansionTL.getPatternLoc(), TemplateArgs,
    +                       OldParm->getLocation(), OldParm->getDeclName());
    +    if (!NewTSI)
           return nullptr;
     
    -    if (NewDI->getType()->containsUnexpandedParameterPack()) {
    +    if (NewTSI->getType()->containsUnexpandedParameterPack()) {
           // We still have unexpanded parameter packs, which means that
           // our function parameter is still a function parameter pack.
           // Therefore, make its type a pack expansion type.
    -      NewDI = CheckPackExpansion(NewDI, ExpansionTL.getEllipsisLoc(),
    -                                 NumExpansions);
    +      NewTSI = CheckPackExpansion(NewTSI, ExpansionTL.getEllipsisLoc(),
    +                                  NumExpansions);
         } else if (ExpectParameterPack) {
           // We expected to get a parameter pack but didn't (because the type
           // itself is not a pack expansion type), so complain. This can occur when
    @@ -3182,18 +3068,18 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
           // pack expansion.
           Diag(OldParm->getLocation(),
                diag::err_function_parameter_pack_without_parameter_packs)
    -        << NewDI->getType();
    +          << NewTSI->getType();
           return nullptr;
         }
       } else {
    -    NewDI = SubstType(OldDI, TemplateArgs, OldParm->getLocation(),
    -                      OldParm->getDeclName());
    +    NewTSI = SubstType(OldTSI, TemplateArgs, OldParm->getLocation(),
    +                       OldParm->getDeclName());
       }
     
    -  if (!NewDI)
    +  if (!NewTSI)
         return nullptr;
     
    -  if (NewDI->getType()->isVoidType()) {
    +  if (NewTSI->getType()->isVoidType()) {
         Diag(OldParm->getLocation(), diag::err_param_with_void_type);
         return nullptr;
       }
    @@ -3205,7 +3091,7 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
       // here, when the instantiated versions of those referenced parameters are in
       // scope.
       if (TemplateTypeParmDecl *TTP =
    -          GetContainedInventedTypeParmVisitor().Visit(OldDI->getType())) {
    +          GetContainedInventedTypeParmVisitor().Visit(OldTSI->getType())) {
         if (const TypeConstraint *TC = TTP->getTypeConstraint()) {
            auto *Inst = cast_or_null<TemplateTypeParmDecl>(
               FindInstantiatedDecl(TTP->getLocation(), TTP, TemplateArgs));
    @@ -3219,12 +3105,10 @@ Sema::SubstParmVarDecl(ParmVarDecl *OldParm,
         }
       }
     
    -  ParmVarDecl *NewParm = CheckParameter(Context.getTranslationUnitDecl(),
    -                                        OldParm->getInnerLocStart(),
    -                                        OldParm->getLocation(),
    -                                        OldParm->getIdentifier(),
    -                                        NewDI->getType(), NewDI,
    -                                        OldParm->getStorageClass());
    +  ParmVarDecl *NewParm = CheckParameter(
    +      Context.getTranslationUnitDecl(), OldParm->getInnerLocStart(),
    +      OldParm->getLocation(), OldParm->getIdentifier(), NewTSI->getType(),
    +      NewTSI, OldParm->getStorageClass());
       if (!NewParm)
         return nullptr;
     
    @@ -3308,7 +3192,7 @@ bool Sema::SubstDefaultArgument(
     
       EnterExpressionEvaluationContext EvalContext(
           *this, ExpressionEvaluationContext::PotentiallyEvaluated, Param);
    -
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, Loc, Param, TemplateArgs.getInnermost());
       if (Inst.isInvalid())
         return true;
    @@ -3596,6 +3480,7 @@ bool Sema::InstantiateClassImpl(
         Spec->setPointOfInstantiation(PointOfInstantiation);
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
    @@ -3830,6 +3715,7 @@ bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation,
         MSInfo->setPointOfInstantiation(PointOfInstantiation);
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
    @@ -3894,6 +3780,7 @@ bool Sema::InstantiateInClassInitializer(
         return true;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation);
       if (Inst.isInvalid())
         return true;
     @@ -3977,6 +3864,7 @@ static ActionResult<CXXRecordDecl *> getPatternForClassTemplateSpecialization(
         Sema &S, SourceLocation PointOfInstantiation,
         ClassTemplateSpecializationDecl *ClassTemplateSpec,
         TemplateSpecializationKind TSK, bool PrimaryStrictPackMatch) {
     +  std::optional<Sema::NonSFINAEContext> NSC(S);
       Sema::InstantiatingTemplate Inst(S, PointOfInstantiation, ClassTemplateSpec);
       if (Inst.isInvalid())
         return {/*Invalid=*/true};
    @@ -4078,6 +3966,7 @@ static ActionResult getPatternForClassTemplateSpecialization(
             if (Ambiguous) {
               // Partial ordering did not produce a clear winner. Complain.
               Inst.Clear();
    +          NSC.reset();
               S.Diag(PointOfInstantiation,
                      diag::err_partial_spec_ordering_ambiguous)
                   << ClassTemplateSpec;
    @@ -4509,6 +4398,7 @@ ExprResult Sema::SubstConceptTemplateArguments(
       TemplateArgumentListInfo SubstArgs(ArgsAsWritten->getLAngleLoc(),
                                          ArgsAsWritten->getRAngleLoc());
     
    +  NonSFINAEContext _(*this);
       Sema::InstantiatingTemplate Inst(
           *this, ArgsAsWritten->arguments().front().getSourceRange().getBegin(),
           Sema::InstantiatingTemplate::ConstraintNormalization{},
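// Editorial note (not part of the patch): the NonSFINAEContext markers added
// above keep failures raised outside the immediate context as hard errors,
// even while deduction is in flight. A minimal sketch of the distinction:
template <typename T> struct Wrap {
  using type = typename T::type; // an error here is outside the immediate
                                 // context of any deduction
};
template <typename T> typename Wrap<T>::type use(T); // forces Wrap<T>
template <typename T> void use(T);
void hard_error_demo() {
  // use(0); // ill-formed: instantiating Wrap<int> during deduction fails,
  //         // and that failure is not trapped as SFINAE.
}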
    diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    index 28925cca8f956..1b6b559c1227b 100644
    --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    @@ -1506,17 +1506,17 @@ TemplateDeclInstantiator::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) {
     Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
                                                                bool IsTypeAlias) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    -  if (DI->getType()->isInstantiationDependentType() ||
    -      DI->getType()->isVariablyModifiedType()) {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
    +  if (TSI->getType()->isInstantiationDependentType() ||
    +      TSI->getType()->isVariablyModifiedType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
           Invalid = true;
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(SemaRef.Context.IntTy);
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(SemaRef.Context.IntTy);
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       // HACK: 2012-10-23 g++ has a bug where it gets the value kind of ?: wrong.
    @@ -1525,7 +1525,7 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
       // semantics. See LWG issue 2141 for more information on the bug.  The bugs
       // are fixed in g++ and libstdc++ 4.9.0 (2014-04-22).
       if (SemaRef.getPreprocessor().NeedsStdLibCxxWorkaroundBefore(2014'04'22)) {
     -    const DecltypeType *DT = DI->getType()->getAs<DecltypeType>();
     +    const DecltypeType *DT = TSI->getType()->getAs<DecltypeType>();
          CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D->getDeclContext());
          if (DT && RD && isa<ConditionalOperator>(DT->getUnderlyingExpr()) &&
             DT->isReferenceType() &&
    @@ -1534,18 +1534,18 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
             D->getIdentifier() && D->getIdentifier()->isStr("type") &&
             SemaRef.getSourceManager().isInSystemHeader(D->getBeginLoc()))
           // Fold it to the (non-reference) type which g++ would have produced.
    -      DI = SemaRef.Context.getTrivialTypeSourceInfo(
    -          DI->getType().getNonReferenceType());
    +      TSI = SemaRef.Context.getTrivialTypeSourceInfo(
    +          TSI->getType().getNonReferenceType());
       }
     
       // Create the new typedef
       TypedefNameDecl *Typedef;
       if (IsTypeAlias)
         Typedef = TypeAliasDecl::Create(SemaRef.Context, Owner, D->getBeginLoc(),
    -                                    D->getLocation(), D->getIdentifier(), DI);
    +                                    D->getLocation(), D->getIdentifier(), TSI);
       else
         Typedef = TypedefDecl::Create(SemaRef.Context, Owner, D->getBeginLoc(),
    -                                  D->getLocation(), D->getIdentifier(), DI);
    +                                  D->getLocation(), D->getIdentifier(), TSI);
       if (Invalid)
         Typedef->setInvalidDecl();
     
    @@ -1554,7 +1554,7 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
       if (const TagType *oldTagType = D->getUnderlyingType()->getAs<TagType>()) {
         TagDecl *oldTag = oldTagType->getDecl();
         if (oldTag->getTypedefNameForAnonDecl() == D && !Invalid) {
     -      TagDecl *newTag = DI->getType()->castAs<TagType>()->getDecl();
     +      TagDecl *newTag = TSI->getType()->castAs<TagType>()->getDecl();
           assert(!newTag->hasNameForLinkage());
           newTag->setTypedefNameForAnonDecl(Typedef);
         }
    @@ -1719,15 +1719,15 @@ Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D,
                                                   ArrayRef<BindingDecl *> *Bindings) {
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI = SemaRef.SubstType(
    +  TypeSourceInfo *TSI = SemaRef.SubstType(
           D->getTypeSourceInfo(), TemplateArgs, D->getTypeSpecStartLoc(),
    -      D->getDeclName(), /*AllowDeducedTST*/true);
    -  if (!DI)
    +      D->getDeclName(), /*AllowDeducedTST*/ true);
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(D->getLocation(), diag::err_variable_instantiates_to_function)
    -      << D->isStaticDataMember() << DI->getType();
    +        << D->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
    @@ -1739,12 +1739,12 @@ Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D,
       VarDecl *Var;
       if (Bindings)
         Var = DecompositionDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
    -                                    D->getLocation(), DI->getType(), DI,
    +                                    D->getLocation(), TSI->getType(), TSI,
                                         D->getStorageClass(), *Bindings);
       else
         Var = VarDecl::Create(SemaRef.Context, DC, D->getInnerLocStart(),
    -                          D->getLocation(), D->getIdentifier(), DI->getType(),
    -                          DI, D->getStorageClass());
    +                          D->getLocation(), D->getIdentifier(), TSI->getType(),
    +                          TSI, D->getStorageClass());
     
       // In ARC, infer 'retaining' for variables of retainable type.
       if (SemaRef.getLangOpts().ObjCAutoRefCount &&
    @@ -1810,15 +1810,15 @@ Decl *TemplateDeclInstantiator::VisitAccessSpecDecl(AccessSpecDecl *D) {
     
     Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    -  if (DI->getType()->isInstantiationDependentType() ||
    -      DI->getType()->isVariablyModifiedType())  {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    -      DI = D->getTypeSourceInfo();
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
    +  if (TSI->getType()->isInstantiationDependentType() ||
    +      TSI->getType()->isVariablyModifiedType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
    +      TSI = D->getTypeSourceInfo();
           Invalid = true;
    -    } else if (DI->getType()->isFunctionType()) {
    +    } else if (TSI->getType()->isFunctionType()) {
           // C++ [temp.arg.type]p3:
           //   If a declaration acquires a function type through a type
           //   dependent on a template-parameter and this causes a
    @@ -1826,11 +1826,11 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
           //   function declarator to have function type, the program is
           //   ill-formed.
           SemaRef.Diag(D->getLocation(), diag::err_field_instantiates_to_function)
    -        << DI->getType();
    +          << TSI->getType();
           Invalid = true;
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       Expr *BitWidth = D->getBitWidth();
    @@ -1850,16 +1850,10 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
           BitWidth = InstantiatedBitWidth.getAs<Expr>();
       }
     
    -  FieldDecl *Field = SemaRef.CheckFieldDecl(D->getDeclName(),
    -                                            DI->getType(), DI,
     -                                            cast<RecordDecl>(Owner),
    -                                            D->getLocation(),
    -                                            D->isMutable(),
    -                                            BitWidth,
    -                                            D->getInClassInitStyle(),
    -                                            D->getInnerLocStart(),
    -                                            D->getAccess(),
    -                                            nullptr);
    +  FieldDecl *Field = SemaRef.CheckFieldDecl(
     +      D->getDeclName(), TSI->getType(), TSI, cast<RecordDecl>(Owner),
    +      D->getLocation(), D->isMutable(), BitWidth, D->getInClassInitStyle(),
    +      D->getInnerLocStart(), D->getAccess(), nullptr);
       if (!Field) {
         cast<Decl>(Owner)->setInvalidDecl();
         return nullptr;
    @@ -1892,19 +1886,19 @@ Decl *TemplateDeclInstantiator::VisitFieldDecl(FieldDecl *D) {
     
     Decl *TemplateDeclInstantiator::VisitMSPropertyDecl(MSPropertyDecl *D) {
       bool Invalid = false;
    -  TypeSourceInfo *DI = D->getTypeSourceInfo();
    +  TypeSourceInfo *TSI = D->getTypeSourceInfo();
     
    -  if (DI->getType()->isVariablyModifiedType()) {
    +  if (TSI->getType()->isVariablyModifiedType()) {
         SemaRef.Diag(D->getLocation(), diag::err_property_is_variably_modified)
           << D;
         Invalid = true;
    -  } else if (DI->getType()->isInstantiationDependentType())  {
    -    DI = SemaRef.SubstType(DI, TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI) {
    -      DI = D->getTypeSourceInfo();
    +  } else if (TSI->getType()->isInstantiationDependentType()) {
    +    TSI = SemaRef.SubstType(TSI, TemplateArgs, D->getLocation(),
    +                            D->getDeclName());
    +    if (!TSI) {
    +      TSI = D->getTypeSourceInfo();
           Invalid = true;
    -    } else if (DI->getType()->isFunctionType()) {
    +    } else if (TSI->getType()->isFunctionType()) {
           // C++ [temp.arg.type]p3:
           //   If a declaration acquires a function type through a type
           //   dependent on a template-parameter and this causes a
    @@ -1912,16 +1906,17 @@ Decl *TemplateDeclInstantiator::VisitMSPropertyDecl(MSPropertyDecl *D) {
           //   function declarator to have function type, the program is
           //   ill-formed.
           SemaRef.Diag(D->getLocation(), diag::err_field_instantiates_to_function)
    -      << DI->getType();
    +          << TSI->getType();
           Invalid = true;
         }
       } else {
    -    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), DI->getType());
    +    SemaRef.MarkDeclarationsReferencedInType(D->getLocation(), TSI->getType());
       }
     
       MSPropertyDecl *Property = MSPropertyDecl::Create(
    -      SemaRef.Context, Owner, D->getLocation(), D->getDeclName(), DI->getType(),
    -      DI, D->getBeginLoc(), D->getGetterId(), D->getSetterId());
    +      SemaRef.Context, Owner, D->getLocation(), D->getDeclName(),
    +      TSI->getType(), TSI, D->getBeginLoc(), D->getGetterId(),
    +      D->getSetterId());
     
       SemaRef.InstantiateAttrs(TemplateArgs, D, Property, LateAttrs,
                                StartingScope);
    @@ -3584,7 +3579,7 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
       SmallVector<TypeSourceInfo *, 4> ExpandedParameterPackTypesAsWritten;
       SmallVector<QualType, 4> ExpandedParameterPackTypes;
       bool IsExpandedParameterPack = false;
    -  TypeSourceInfo *DI;
    +  TypeSourceInfo *TSI;
       QualType T;
       bool Invalid = false;
     
    @@ -3594,24 +3589,24 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         ExpandedParameterPackTypes.reserve(D->getNumExpansionTypes());
         ExpandedParameterPackTypesAsWritten.reserve(D->getNumExpansionTypes());
         for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) {
    -      TypeSourceInfo *NewDI =
    +      TypeSourceInfo *NewTSI =
               SemaRef.SubstType(D->getExpansionTypeSourceInfo(I), TemplateArgs,
                                 D->getLocation(), D->getDeclName());
    -      if (!NewDI)
    +      if (!NewTSI)
             return nullptr;
     
           QualType NewT =
    -          SemaRef.CheckNonTypeTemplateParameterType(NewDI, D->getLocation());
    +          SemaRef.CheckNonTypeTemplateParameterType(NewTSI, D->getLocation());
           if (NewT.isNull())
             return nullptr;
     
    -      ExpandedParameterPackTypesAsWritten.push_back(NewDI);
    +      ExpandedParameterPackTypesAsWritten.push_back(NewTSI);
           ExpandedParameterPackTypes.push_back(NewT);
         }
     
         IsExpandedParameterPack = true;
    -    DI = D->getTypeSourceInfo();
    -    T = DI->getType();
    +    TSI = D->getTypeSourceInfo();
    +    T = TSI->getType();
       } else if (D->isPackExpansion()) {
         // The non-type template parameter pack's type is a pack expansion of types.
         // Determine whether we need to expand this parameter pack into separate
    @@ -3637,18 +3632,17 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         if (Expand) {
           for (unsigned I = 0; I != *NumExpansions; ++I) {
             Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I);
    -        TypeSourceInfo *NewDI = SemaRef.SubstType(Pattern, TemplateArgs,
    -                                                  D->getLocation(),
    -                                                  D->getDeclName());
    -        if (!NewDI)
    +        TypeSourceInfo *NewTSI = SemaRef.SubstType(
    +            Pattern, TemplateArgs, D->getLocation(), D->getDeclName());
    +        if (!NewTSI)
               return nullptr;
     
             QualType NewT =
    -            SemaRef.CheckNonTypeTemplateParameterType(NewDI, D->getLocation());
    +            SemaRef.CheckNonTypeTemplateParameterType(NewTSI, D->getLocation());
             if (NewT.isNull())
               return nullptr;
     
    -        ExpandedParameterPackTypesAsWritten.push_back(NewDI);
    +        ExpandedParameterPackTypesAsWritten.push_back(NewTSI);
             ExpandedParameterPackTypes.push_back(NewT);
           }
     
    @@ -3656,8 +3650,8 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
           // expanded parameter pack is the original expansion type, but callers
           // will end up using the expanded parameter pack types for type-checking.
           IsExpandedParameterPack = true;
    -      DI = D->getTypeSourceInfo();
    -      T = DI->getType();
    +      TSI = D->getTypeSourceInfo();
    +      T = TSI->getType();
         } else {
           // We cannot fully expand the pack expansion now, so substitute into the
           // pattern and create a new pack expansion type.
    @@ -3669,22 +3663,22 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
             return nullptr;
     
           SemaRef.CheckNonTypeTemplateParameterType(NewPattern, D->getLocation());
    -      DI = SemaRef.CheckPackExpansion(NewPattern, Expansion.getEllipsisLoc(),
    -                                      NumExpansions);
    -      if (!DI)
    +      TSI = SemaRef.CheckPackExpansion(NewPattern, Expansion.getEllipsisLoc(),
    +                                       NumExpansions);
    +      if (!TSI)
             return nullptr;
     
    -      T = DI->getType();
    +      T = TSI->getType();
         }
       } else {
         // Simple case: substitution into a parameter that is not a parameter pack.
    -    DI = SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
    -                           D->getLocation(), D->getDeclName());
    -    if (!DI)
    +    TSI = SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
    +                            D->getLocation(), D->getDeclName());
    +    if (!TSI)
           return nullptr;
     
         // Check that this type is acceptable for a non-type template parameter.
    -    T = SemaRef.CheckNonTypeTemplateParameterType(DI, D->getLocation());
    +    T = SemaRef.CheckNonTypeTemplateParameterType(TSI, D->getLocation());
         if (T.isNull()) {
           T = SemaRef.Context.IntTy;
           Invalid = true;
    @@ -3696,20 +3690,20 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
         Param = NonTypeTemplateParmDecl::Create(
             SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
             D->getDepth() - TemplateArgs.getNumSubstitutedLevels(),
    -        D->getPosition(), D->getIdentifier(), T, DI, ExpandedParameterPackTypes,
    -        ExpandedParameterPackTypesAsWritten);
    +        D->getPosition(), D->getIdentifier(), T, TSI,
    +        ExpandedParameterPackTypes, ExpandedParameterPackTypesAsWritten);
       else
         Param = NonTypeTemplateParmDecl::Create(
             SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
             D->getDepth() - TemplateArgs.getNumSubstitutedLevels(),
    -        D->getPosition(), D->getIdentifier(), T, D->isParameterPack(), DI);
    +        D->getPosition(), D->getIdentifier(), T, D->isParameterPack(), TSI);
     
    -  if (AutoTypeLoc AutoLoc = DI->getTypeLoc().getContainedAutoTypeLoc())
    +  if (AutoTypeLoc AutoLoc = TSI->getTypeLoc().getContainedAutoTypeLoc())
         if (AutoLoc.isConstrained()) {
           SourceLocation EllipsisLoc;
           if (IsExpandedParameterPack)
             EllipsisLoc =
     -            DI->getTypeLoc().getAs<PackExpansionTypeLoc>().getEllipsisLoc();
     +            TSI->getTypeLoc().getAs<PackExpansionTypeLoc>().getEllipsisLoc();
           else if (auto *Constraint = dyn_cast_if_present<CXXFoldExpr>(
                        D->getPlaceholderTypeConstraint()))
             EllipsisLoc = Constraint->getEllipsisLoc();
    @@ -4642,22 +4636,22 @@ TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl(
         VarTemplateSpecializationDecl *PrevDecl) {
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI =
    +  TypeSourceInfo *TSI =
           SemaRef.SubstType(D->getTypeSourceInfo(), TemplateArgs,
                             D->getTypeSpecStartLoc(), D->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(D->getLocation(), diag::err_variable_instantiates_to_function)
    -        << D->isStaticDataMember() << DI->getType();
    +        << D->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
       // Build the instantiated declaration
       VarTemplateSpecializationDecl *Var = VarTemplateSpecializationDecl::Create(
           SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(),
    -      VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted);
    +      VarTemplate, TSI->getType(), TSI, D->getStorageClass(), Converted);
       if (!PrevDecl) {
         void *InsertPos = nullptr;
         VarTemplate->findSpecialization(Converted, InsertPos);
    @@ -5005,16 +4999,16 @@ TemplateDeclInstantiator::InstantiateVarTemplatePartialSpecialization(
                                                  InstParams, InsertPos);
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI = SemaRef.SubstType(
    +  TypeSourceInfo *TSI = SemaRef.SubstType(
           PartialSpec->getTypeSourceInfo(), TemplateArgs,
           PartialSpec->getTypeSpecStartLoc(), PartialSpec->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
    -  if (DI->getType()->isFunctionType()) {
    +  if (TSI->getType()->isFunctionType()) {
         SemaRef.Diag(PartialSpec->getLocation(),
                      diag::err_variable_instantiates_to_function)
    -        << PartialSpec->isStaticDataMember() << DI->getType();
    +        << PartialSpec->isStaticDataMember() << TSI->getType();
         return nullptr;
       }
     
    @@ -5022,8 +5016,8 @@ TemplateDeclInstantiator::InstantiateVarTemplatePartialSpecialization(
       VarTemplatePartialSpecializationDecl *InstPartialSpec =
           VarTemplatePartialSpecializationDecl::Create(
               SemaRef.Context, Owner, PartialSpec->getInnerLocStart(),
    -          PartialSpec->getLocation(), InstParams, VarTemplate, DI->getType(),
    -          DI, PartialSpec->getStorageClass(), CTAI.CanonicalConverted);
    +          PartialSpec->getLocation(), InstParams, VarTemplate, TSI->getType(),
    +          TSI, PartialSpec->getStorageClass(), CTAI.CanonicalConverted);
     
       InstPartialSpec->setTemplateArgsAsWritten(InstTemplateArgs);
     
    @@ -5322,6 +5316,7 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
         return;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Decl,
                                  InstantiatingTemplate::ExceptionSpecification());
       if (Inst.isInvalid()) {
    @@ -5389,6 +5384,7 @@ TemplateDeclInstantiator::InitFunctionInstantiation(FunctionDecl *New,
       if (ActiveInst.Kind == ActiveInstType::ExplicitTemplateArgumentSubstitution ||
           ActiveInst.Kind == ActiveInstType::DeducedTemplateArgumentSubstitution) {
         if (isa(ActiveInst.Entity)) {
    +      SemaRef.CurrentSFINAEContext = nullptr;
           atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, ActiveInst);
           ActiveInst.Kind = ActiveInstType::TemplateInstantiation;
           ActiveInst.Entity = New;
    @@ -5469,7 +5465,7 @@ TemplateDeclInstantiator::InitMethodInstantiation(CXXMethodDecl *New,
     bool TemplateDeclInstantiator::SubstDefaultedFunction(FunctionDecl *New,
                                                           FunctionDecl *Tmpl) {
       // Transfer across any unqualified lookups.
    -  if (auto *DFI = Tmpl->getDefalutedOrDeletedInfo()) {
    +  if (auto *DFI = Tmpl->getDefaultedOrDeletedInfo()) {
         SmallVector Lookups;
         Lookups.reserve(DFI->getUnqualifiedLookups().size());
         bool AnyChanged = false;
    @@ -5499,8 +5495,7 @@ FunctionDecl *Sema::InstantiateFunctionDeclaration(
         SourceLocation Loc, CodeSynthesisContext::SynthesisKind CSC) {
       FunctionDecl *FD = FTD->getTemplatedDecl();
     
    -  sema::TemplateDeductionInfo Info(Loc);
    -  InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC, Info);
    +  InstantiatingTemplate Inst(*this, Loc, FTD, Args->asArray(), CSC);
       if (Inst.isInvalid())
         return nullptr;
     
    @@ -5690,6 +5685,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
         }
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Function);
       if (Inst.isInvalid())
         return;
    @@ -5980,6 +5976,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation(
       if (FromVar->isInvalidDecl())
         return nullptr;
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, FromVar);
       if (Inst.isInvalid())
         return nullptr;
    @@ -6026,14 +6023,14 @@ VarTemplateSpecializationDecl *Sema::CompleteVarTemplateSpecializationDecl(
              "don't have a definition to instantiate from");
     
       // Do substitution on the type of the declaration
    -  TypeSourceInfo *DI =
    +  TypeSourceInfo *TSI =
           SubstType(PatternDecl->getTypeSourceInfo(), TemplateArgs,
                     PatternDecl->getTypeSpecStartLoc(), PatternDecl->getDeclName());
    -  if (!DI)
    +  if (!TSI)
         return nullptr;
     
       // Update the type of this variable template specialization.
    -  VarSpec->setType(DI->getType());
    +  VarSpec->setType(TSI->getType());
     
       // Convert the declaration into a definition now.
       VarSpec->setCompleteDefinition();
    @@ -6201,6 +6198,10 @@ void Sema::InstantiateVariableInitializer(
       currentEvaluationContext().RebuildDefaultArgOrDefaultInit =
           parentEvaluationContext().RebuildDefaultArgOrDefaultInit;
     
    +  // Set DeclForInitializer for this variable so DiagIfReachable can properly
    +  // suppress runtime diagnostics for constexpr/static member variables
    +  currentEvaluationContext().DeclForInitializer = Var;
    +
       if (OldVar->getInit()) {
         // Instantiate the initializer.
         ExprResult Init =
    @@ -6287,6 +6288,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
             !Var->hasInit()) {
           // FIXME: Factor out the duplicated instantiation context setup/tear down
           // code here.
    +      NonSFINAEContext _(*this);
           InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
           if (Inst.isInvalid())
             return;
    @@ -6391,6 +6393,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
         return;
       }
     
    +  NonSFINAEContext _(*this);
       InstantiatingTemplate Inst(*this, PointOfInstantiation, Var);
       if (Inst.isInvalid())
         return;
    @@ -6468,6 +6471,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation,
         PassToConsumerRAII.Var = Var;
         Var->setTemplateSpecializationKind(OldVar->getTemplateSpecializationKind(),
                                            OldVar->getPointOfInstantiation());
    +    // Emit any deferred warnings for the variable's initializer
    +    AnalysisWarnings.issueWarningsForRegisteredVarDecl(Var);
       }
     
       // This variable may have local implicit instantiations that need to be
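// Editorial note (not part of the patch): DeclForInitializer, set in the hunk
// above, lets DiagIfReachable suppress runtime-only diagnostics while a
// constexpr or static member initializer is instantiated. A minimal sketch:
template <int N> struct Cfg {
  static constexpr int Safe = (N == 0) ? 0 : 100 / N; // constant-evaluated
};
constexpr int Zero = Cfg<0>::Safe; // instantiation should not emit runtime
                                   // diagnostics for the never-taken 100 / N arm.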
    diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
    index 0f72d6a13ae06..5b1aad3fa8470 100644
    --- a/clang/lib/Sema/SemaTemplateVariadic.cpp
    +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
    @@ -844,7 +844,7 @@ bool Sema::CheckParameterPacksForExpansion(
         ArrayRef<UnexpandedParameterPack> Unexpanded,
         const MultiLevelTemplateArgumentList &TemplateArgs,
         bool FailOnPackProducingTemplates, bool &ShouldExpand,
    -    bool &RetainExpansion, UnsignedOrNone &NumExpansions) {
    +    bool &RetainExpansion, UnsignedOrNone &NumExpansions, bool Diagnose) {
       ShouldExpand = true;
       RetainExpansion = false;
       IdentifierLoc FirstPack;
    @@ -874,6 +874,9 @@ bool Sema::CheckParameterPacksForExpansion(
           if (!FailOnPackProducingTemplates)
             continue;
     
    +      if (!Diagnose)
    +        return true;
    +
           // It is not yet supported in certain contexts.
           return Diag(PatternRange.getBegin().isValid() ? PatternRange.getBegin()
                                                         : EllipsisLoc,
    @@ -1015,7 +1018,9 @@ bool Sema::CheckParameterPacksForExpansion(
           // C++0x [temp.variadic]p5:
           //   All of the parameter packs expanded by a pack expansion shall have
           //   the same number of arguments specified.
    -      if (HaveFirstPack)
    +      if (!Diagnose)
    +        ;
    +      else if (HaveFirstPack)
             Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict)
                 << FirstPack.getIdentifierInfo() << Name << *NumExpansions
                 << (LeastNewPackSize != NewPackSize) << LeastNewPackSize
    @@ -1041,6 +1046,8 @@ bool Sema::CheckParameterPacksForExpansion(
         if (NumExpansions && *NumExpansions < *NumPartialExpansions) {
           NamedDecl *PartialPack =
               CurrentInstantiationScope->getPartiallySubstitutedPack();
    +      if (!Diagnose)
    +        return true;
           Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_partial)
               << PartialPack << *NumPartialExpansions << *NumExpansions
               << SourceRange(PartiallySubstitutedPackLoc);
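
The conflict that err_pack_expansion_length_conflict (and its _partial variant) reports, and which the new Diagnose=false mode now detects without emitting diagnostics, looks roughly like this (illustrative code, not from the patch):

    template <typename... Ts> struct TypeList {};

    template <typename... As, typename... Bs>
    void zip(TypeList<As...>, TypeList<Bs...>) {
      // Both packs are expanded by the same expansion, so their lengths
      // must match.
      using Pairs = TypeList<TypeList<As, Bs>...>;
      (void)Pairs{};
    }

    void test() {
      zip(TypeList<int, long>{}, TypeList<float>{}); // lengths 2 vs. 1: error
    }
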
    diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
    index 280b3c92cce14..eb8b1352d1be1 100644
    --- a/clang/lib/Sema/SemaType.cpp
    +++ b/clang/lib/Sema/SemaType.cpp
    @@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
         return QualType();
       }
     
    +  if (VecSize->isNegative()) {
    +    Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
    +    return QualType();
    +  }
    +
       if (CurType->isDependentType())
         return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
                                               VectorKind::Generic);
    @@ -2394,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
                                    VectorKind::Generic);
     }
     
    -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
    +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr,
                                       SourceLocation AttrLoc) {
       // Unlike gcc's vector_size attribute, we do not allow vectors to be defined
       // in conjunction with complex types (pointers, arrays, functions, etc.).
    @@ -2417,35 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
           BIT && CheckBitIntElementType(*this, AttrLoc, BIT))
         return QualType();
     
    -  if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) {
     -    std::optional<llvm::APSInt> vecSize =
    -        ArraySize->getIntegerConstantExpr(Context);
    -    if (!vecSize) {
    +  if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) {
     +    std::optional<llvm::APSInt> VecSize =
    +        SizeExpr->getIntegerConstantExpr(Context);
    +    if (!VecSize) {
           Diag(AttrLoc, diag::err_attribute_argument_type)
    -        << "ext_vector_type" << AANT_ArgumentIntegerConstant
    -        << ArraySize->getSourceRange();
    +          << "ext_vector_type" << AANT_ArgumentIntegerConstant
    +          << SizeExpr->getSourceRange();
    +      return QualType();
    +    }
    +
    +    if (VecSize->isNegative()) {
    +      Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
           return QualType();
         }
     
    -    if (!vecSize->isIntN(32)) {
    +    if (!VecSize->isIntN(32)) {
           Diag(AttrLoc, diag::err_attribute_size_too_large)
    -          << ArraySize->getSourceRange() << "vector";
    +          << SizeExpr->getSourceRange() << "vector";
           return QualType();
         }
         // Unlike gcc's vector_size attribute, the size is specified as the
         // number of elements, not the number of bytes.
     -    unsigned vectorSize = static_cast<unsigned>(vecSize->getZExtValue());
     +    unsigned VectorSize = static_cast<unsigned>(VecSize->getZExtValue());
     
    -    if (vectorSize == 0) {
    +    if (VectorSize == 0) {
           Diag(AttrLoc, diag::err_attribute_zero_size)
    -          << ArraySize->getSourceRange() << "vector";
    +          << SizeExpr->getSourceRange() << "vector";
           return QualType();
         }
     
    -    return Context.getExtVectorType(T, vectorSize);
    +    return Context.getExtVectorType(T, VectorSize);
       }
     
    -  return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc);
    +  return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc);
     }
     
     QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols,
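
Code the two new isNegative() checks are meant to reject up front; the diagnostic name err_attribute_vec_negative_size comes from the patch, while the attribute spellings below are the usual GCC/Clang ones and serve only as illustration:

    // Previously a negative size could reach the element-count math; now both
    // paths emit err_attribute_vec_negative_size and return early.
    typedef int BadVec __attribute__((vector_size(-16)));         // rejected
    typedef float BadExtVec __attribute__((ext_vector_type(-4))); // rejected
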
    @@ -2785,13 +2795,14 @@ QualType Sema::GetTypeFromParser(ParsedType Ty, TypeSourceInfo **TInfo) {
         return QualType();
       }
     
    -  TypeSourceInfo *DI = nullptr;
    +  TypeSourceInfo *TSI = nullptr;
       if (const LocInfoType *LIT = dyn_cast<LocInfoType>(QT)) {
         QT = LIT->getType();
    -    DI = LIT->getTypeSourceInfo();
    +    TSI = LIT->getTypeSourceInfo();
       }
     
    -  if (TInfo) *TInfo = DI;
    +  if (TInfo)
    +    *TInfo = TSI;
       return QT;
     }
     
    diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
    index 850bcb17bece1..2f61bdd9a6540 100644
    --- a/clang/lib/Sema/SemaX86.cpp
    +++ b/clang/lib/Sema/SemaX86.cpp
    @@ -489,14 +489,6 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
       case X86::BI__builtin_ia32_tileloaddrst164:
       case X86::BI__builtin_ia32_tilestored64:
       case X86::BI__builtin_ia32_tilezero:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0t1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1t1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rs:
    -  case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
    -  case X86::BI__builtin_ia32_t2rpntlvwz0rs:
       case X86::BI__builtin_ia32_tcvtrowps2bf16h:
       case X86::BI__builtin_ia32_tcvtrowps2bf16l:
       case X86::BI__builtin_ia32_tcvtrowps2phh:
    @@ -516,17 +508,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
       case X86::BI__builtin_ia32_tdpbhf8ps:
       case X86::BI__builtin_ia32_tdphbf8ps:
       case X86::BI__builtin_ia32_tdphf8ps:
    -  case X86::BI__builtin_ia32_ttdpbf16ps:
    -  case X86::BI__builtin_ia32_ttdpfp16ps:
    -  case X86::BI__builtin_ia32_ttcmmimfp16ps:
    -  case X86::BI__builtin_ia32_ttcmmrlfp16ps:
    -  case X86::BI__builtin_ia32_tconjtcmmimfp16ps:
       case X86::BI__builtin_ia32_tmmultf32ps:
    -  case X86::BI__builtin_ia32_ttmmultf32ps:
         return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
    -  case X86::BI__builtin_ia32_ttransposed:
    -  case X86::BI__builtin_ia32_tconjtfp16:
    -    return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
       }
     }
     static bool isX86_32Builtin(unsigned BuiltinID) {
    diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
    index 8c20078e97a13..c2491489f40d2 100644
    --- a/clang/lib/Sema/TreeTransform.h
    +++ b/clang/lib/Sema/TreeTransform.h
    @@ -371,7 +371,7 @@ class TreeTransform {
       /// may override this function (to take over all type
       /// transformations) or some set of the TransformXXXType functions
       /// to alter the transformation.
    -  TypeSourceInfo *TransformType(TypeSourceInfo *DI);
    +  TypeSourceInfo *TransformType(TypeSourceInfo *TSI);
     
       /// Transform the given type-with-location into a new
       /// type, collecting location information in the given builder
    @@ -387,7 +387,7 @@ class TreeTransform {
       /// template arguments.
       /// @{
       QualType TransformTypeWithDeducedTST(QualType T);
    -  TypeSourceInfo *TransformTypeWithDeducedTST(TypeSourceInfo *DI);
    +  TypeSourceInfo *TransformTypeWithDeducedTST(TypeSourceInfo *TSI);
       /// @}
     
       /// The reason why the value of a statement is not discarded, if any.
    @@ -2463,6 +2463,19 @@ class TreeTransform {
                                                                  LParenLoc, EndLoc);
       }
     
    +  /// Build a new OpenMP 'dyn_groupprivate' clause.
    +  ///
    +  /// By default, performs semantic analysis to build the new OpenMP clause.
    +  /// Subclasses may override this routine to provide different behavior.
    +  OMPClause *RebuildOMPDynGroupprivateClause(
    +      OpenMPDynGroupprivateClauseModifier M1,
    +      OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
    +      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
    +      SourceLocation M2Loc, SourceLocation EndLoc) {
    +    return getSema().OpenMP().ActOnOpenMPDynGroupprivateClause(
    +        M1, M2, Size, StartLoc, LParenLoc, M1Loc, M2Loc, EndLoc);
    +  }
    +
       /// Build a new OpenMP 'ompx_attribute' clause.
       ///
       /// By default, performs semantic analysis to build the new OpenMP clause.
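
A hypothetical usage sketch of the clause this hook rebuilds; only the clause name, its size expression, and its two modifier slots are visible in the patch, so the enclosing directive and the bare-size form are assumptions for illustration:

    void kernel(int n) {
      // The size expression is what TransformExpr and
      // RebuildOMPDynGroupprivateClause carry through template instantiation.
      #pragma omp target dyn_groupprivate(n * 64)
      { /* ... */ }
    }
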
     @@ -4995,15 +5008,15 @@ bool TreeTransform<Derived>::TransformTemplateArgument(
       }
     
       case TemplateArgument::Type: {
    -    TypeSourceInfo *DI = Input.getTypeSourceInfo();
    -    if (!DI)
    -      DI = InventTypeSourceInfo(Input.getArgument().getAsType());
    +    TypeSourceInfo *TSI = Input.getTypeSourceInfo();
    +    if (!TSI)
    +      TSI = InventTypeSourceInfo(Input.getArgument().getAsType());
     
    -    DI = getDerived().TransformType(DI);
    -    if (!DI)
    +    TSI = getDerived().TransformType(TSI);
    +    if (!TSI)
           return true;
     
    -    Output = TemplateArgumentLoc(TemplateArgument(DI->getType()), DI);
    +    Output = TemplateArgumentLoc(TemplateArgument(TSI->getType()), TSI);
         return false;
       }
     
     @@ -5360,28 +5373,28 @@ QualType TreeTransform<Derived>::TransformType(QualType T) {
     
       // Temporary workaround.  All of these transformations should
       // eventually turn into transformations on TypeLocs.
    -  TypeSourceInfo *DI = getSema().Context.getTrivialTypeSourceInfo(T,
    -                                                getDerived().getBaseLocation());
    +  TypeSourceInfo *TSI = getSema().Context.getTrivialTypeSourceInfo(
    +      T, getDerived().getBaseLocation());
     
    -  TypeSourceInfo *NewDI = getDerived().TransformType(DI);
    +  TypeSourceInfo *NewTSI = getDerived().TransformType(TSI);
     
    -  if (!NewDI)
    +  if (!NewTSI)
         return QualType();
     
    -  return NewDI->getType();
    +  return NewTSI->getType();
     }
     
     -template<typename Derived>
     -TypeSourceInfo *TreeTransform<Derived>::TransformType(TypeSourceInfo *DI) {
     +template <typename Derived>
     +TypeSourceInfo *TreeTransform<Derived>::TransformType(TypeSourceInfo *TSI) {
       // Refine the base location to the type's location.
    -  TemporaryBase Rebase(*this, DI->getTypeLoc().getBeginLoc(),
    +  TemporaryBase Rebase(*this, TSI->getTypeLoc().getBeginLoc(),
                            getDerived().getBaseEntity());
    -  if (getDerived().AlreadyTransformed(DI->getType()))
    -    return DI;
    +  if (getDerived().AlreadyTransformed(TSI->getType()))
    +    return TSI;
     
       TypeLocBuilder TLB;
     
    -  TypeLoc TL = DI->getTypeLoc();
    +  TypeLoc TL = TSI->getTypeLoc();
       TLB.reserve(TL.getFullDataSize());
     
       QualType Result = getDerived().TransformType(TLB, TL);
     @@ -5413,27 +5426,27 @@ QualType TreeTransform<Derived>::TransformTypeWithDeducedTST(QualType T) {
     
       if (getDerived().AlreadyTransformed(T))
         return T;
    -  TypeSourceInfo *DI = getSema().Context.getTrivialTypeSourceInfo(T,
    -                                                getDerived().getBaseLocation());
    -  TypeSourceInfo *NewDI = getDerived().TransformTypeWithDeducedTST(DI);
    -  return NewDI ? NewDI->getType() : QualType();
    +  TypeSourceInfo *TSI = getSema().Context.getTrivialTypeSourceInfo(
    +      T, getDerived().getBaseLocation());
    +  TypeSourceInfo *NewTSI = getDerived().TransformTypeWithDeducedTST(TSI);
    +  return NewTSI ? NewTSI->getType() : QualType();
     }
     
     -template<typename Derived>
     +template <typename Derived>
      TypeSourceInfo *
     -TreeTransform<Derived>::TransformTypeWithDeducedTST(TypeSourceInfo *DI) {
     -  if (!isa<DeducedTemplateSpecializationType>(DI->getType()))
     -    return TransformType(DI);
     +TreeTransform<Derived>::TransformTypeWithDeducedTST(TypeSourceInfo *TSI) {
     +  if (!isa<DeducedTemplateSpecializationType>(TSI->getType()))
     +    return TransformType(TSI);
     
       // Refine the base location to the type's location.
    -  TemporaryBase Rebase(*this, DI->getTypeLoc().getBeginLoc(),
    +  TemporaryBase Rebase(*this, TSI->getTypeLoc().getBeginLoc(),
                            getDerived().getBaseEntity());
    -  if (getDerived().AlreadyTransformed(DI->getType()))
    -    return DI;
    +  if (getDerived().AlreadyTransformed(TSI->getType()))
    +    return TSI;
     
       TypeLocBuilder TLB;
     
    -  TypeLoc TL = DI->getTypeLoc();
    +  TypeLoc TL = TSI->getTypeLoc();
       TLB.reserve(TL.getFullDataSize());
     
       auto QTL = TL.getAs<QualifiedTypeLoc>();
     @@ -6258,17 +6271,17 @@ template <typename Derived>
      ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
         ParmVarDecl *OldParm, int indexAdjustment, UnsignedOrNone NumExpansions,
         bool ExpectParameterPack) {
    -  TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo();
    -  TypeSourceInfo *NewDI = nullptr;
    +  TypeSourceInfo *OldTSI = OldParm->getTypeSourceInfo();
    +  TypeSourceInfo *NewTSI = nullptr;
     
     -  if (NumExpansions && isa<PackExpansionType>(OldDI->getType())) {
     +  if (NumExpansions && isa<PackExpansionType>(OldTSI->getType())) {
         // If we're substituting into a pack expansion type and we know the
         // length we want to expand to, just substitute for the pattern.
    -    TypeLoc OldTL = OldDI->getTypeLoc();
    +    TypeLoc OldTL = OldTSI->getTypeLoc();
         PackExpansionTypeLoc OldExpansionTL = OldTL.castAs<PackExpansionTypeLoc>();
     
         TypeLocBuilder TLB;
    -    TypeLoc NewTL = OldDI->getTypeLoc();
    +    TypeLoc NewTL = OldTSI->getTypeLoc();
         TLB.reserve(NewTL.getFullDataSize());
     
         QualType Result = getDerived().TransformType(TLB,
     @@ -6286,24 +6299,20 @@ ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
         PackExpansionTypeLoc NewExpansionTL
           = TLB.push<PackExpansionTypeLoc>(Result);
         NewExpansionTL.setEllipsisLoc(OldExpansionTL.getEllipsisLoc());
    -    NewDI = TLB.getTypeSourceInfo(SemaRef.Context, Result);
    +    NewTSI = TLB.getTypeSourceInfo(SemaRef.Context, Result);
       } else
    -    NewDI = getDerived().TransformType(OldDI);
    -  if (!NewDI)
    +    NewTSI = getDerived().TransformType(OldTSI);
    +  if (!NewTSI)
         return nullptr;
     
    -  if (NewDI == OldDI && indexAdjustment == 0)
    +  if (NewTSI == OldTSI && indexAdjustment == 0)
         return OldParm;
     
    -  ParmVarDecl *newParm = ParmVarDecl::Create(SemaRef.Context,
    -                                             OldParm->getDeclContext(),
    -                                             OldParm->getInnerLocStart(),
    -                                             OldParm->getLocation(),
    -                                             OldParm->getIdentifier(),
    -                                             NewDI->getType(),
    -                                             NewDI,
    -                                             OldParm->getStorageClass(),
    -                                             /* DefArg */ nullptr);
    +  ParmVarDecl *newParm = ParmVarDecl::Create(
    +      SemaRef.Context, OldParm->getDeclContext(), OldParm->getInnerLocStart(),
    +      OldParm->getLocation(), OldParm->getIdentifier(), NewTSI->getType(),
    +      NewTSI, OldParm->getStorageClass(),
    +      /* DefArg */ nullptr);
       newParm->setScopeInfo(OldParm->getFunctionScopeDepth(),
                             OldParm->getFunctionScopeIndex() + indexAdjustment);
       getDerived().transformedLocalDecl(OldParm, {newParm});
     @@ -8080,14 +8089,13 @@ TreeTransform<Derived>::TransformCompoundStmt(CompoundStmt *S,
         getSema().resetFPOptions(
             S->getStoredFPFeatures().applyOverrides(getSema().getLangOpts()));
     
    -  const Stmt *ExprResult = S->getStmtExprResult();
       bool SubStmtInvalid = false;
       bool SubStmtChanged = false;
       SmallVector<Stmt *, 8> Statements;
       for (auto *B : S->body()) {
         StmtResult Result = getDerived().TransformStmt(
    -        B, IsStmtExpr && B == ExprResult ? StmtDiscardKind::StmtExprResult
    -                                         : StmtDiscardKind::Discarded);
    +        B, IsStmtExpr && B == S->body_back() ? StmtDiscardKind::StmtExprResult
    +                                             : StmtDiscardKind::Discarded);
     
         if (Result.isInvalid()) {
           // Immediately fail if this was a DeclStmt, since it's very
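
The rewrite relies on a GNU statement-expression invariant: the value of the whole expression is produced by the last statement of its compound body, so S->body_back() identifies the result statement without caching it up front. For example (GCC/Clang extension):

    int g(int x) {
      // 'y * 2;' is the final statement of the compound body, i.e. the
      // StmtExprResult, and becomes the value of the ({ ... }) expression.
      return ({ int y = x + 1; y * 2; });
    }
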
     @@ -11730,6 +11738,19 @@ OMPClause *TreeTransform<Derived>::TransformOMPXDynCGroupMemClause(
           Size.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
     }
     
     +template <typename Derived>
     +OMPClause *TreeTransform<Derived>::TransformOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  ExprResult Size = getDerived().TransformExpr(C->getSize());
    +  if (Size.isInvalid())
    +    return nullptr;
    +  return getDerived().RebuildOMPDynGroupprivateClause(
    +      C->getDynGroupprivateModifier(), C->getDynGroupprivateFallbackModifier(),
    +      Size.get(), C->getBeginLoc(), C->getLParenLoc(),
    +      C->getDynGroupprivateModifierLoc(),
    +      C->getDynGroupprivateFallbackModifierLoc(), C->getEndLoc());
    +}
    +
      template <typename Derived>
      OMPClause *
      TreeTransform<Derived>::TransformOMPDoacrossClause(OMPDoacrossClause *C) {
     @@ -15828,16 +15849,20 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
           Sema::ExpressionEvaluationContext::PotentiallyEvaluated,
           E->getCallOperator());
     
    -  Sema::CodeSynthesisContext C;
    -  C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
    -  C.PointOfInstantiation = E->getBody()->getBeginLoc();
    -  getSema().pushCodeSynthesisContext(C);
    +  StmtResult Body;
    +  {
    +    Sema::NonSFINAEContext _(getSema());
    +    Sema::CodeSynthesisContext C;
    +    C.Kind = clang::Sema::CodeSynthesisContext::LambdaExpressionSubstitution;
    +    C.PointOfInstantiation = E->getBody()->getBeginLoc();
    +    getSema().pushCodeSynthesisContext(C);
     
    -  // Instantiate the body of the lambda expression.
    -  StmtResult Body =
    -      Invalid ? StmtError() : getDerived().TransformLambdaBody(E, E->getBody());
    +    // Instantiate the body of the lambda expression.
    +    Body = Invalid ? StmtError()
    +                   : getDerived().TransformLambdaBody(E, E->getBody());
     
    -  getSema().popCodeSynthesisContext();
    +    getSema().popCodeSynthesisContext();
    +  }
     
       // ActOnLambda* will pop the function scope for us.
       FuncScopeCleanup.disable();
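
The NonSFINAEContext scope matches the language rule that a lambda body is outside the immediate context of substitution: errors produced while instantiating it are hard errors, never deduction failures. A minimal illustration (not from the patch):

    template <typename T>
    auto make() {
      // If T lacks missing_member(), instantiating make<T> is a hard error;
      // it is not converted into a SFINAE failure for overload resolution.
      return [](T t) { return t.missing_member(); };
    }
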
    diff --git a/clang/lib/Sema/TypeLocBuilder.h b/clang/lib/Sema/TypeLocBuilder.h
    index 0c27088a1748b..e84e79aee8f0d 100644
    --- a/clang/lib/Sema/TypeLocBuilder.h
    +++ b/clang/lib/Sema/TypeLocBuilder.h
    @@ -113,9 +113,9 @@ class TypeLocBuilder {
     #endif
     
         size_t FullDataSize = Capacity - Index;
    -    TypeSourceInfo *DI = Context.CreateTypeSourceInfo(T, FullDataSize);
    -    memcpy(DI->getTypeLoc().getOpaqueData(), &Buffer[Index], FullDataSize);
    -    return DI;
    +    TypeSourceInfo *TSI = Context.CreateTypeSourceInfo(T, FullDataSize);
    +    memcpy(TSI->getTypeLoc().getOpaqueData(), &Buffer[Index], FullDataSize);
    +    return TSI;
       }
     
       /// Copies the type-location information to the given AST context and
    diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
    index e3106f8d8e13c..634bf991b2aee 100644
    --- a/clang/lib/Serialization/ASTReader.cpp
    +++ b/clang/lib/Serialization/ASTReader.cpp
    @@ -225,7 +225,7 @@ bool ChainedASTReaderListener::ReadPreprocessorOptions(
     }
     
     void ChainedASTReaderListener::ReadCounter(const serialization::ModuleFile &M,
    -                                           unsigned Value) {
    +                                           uint32_t Value) {
       First->ReadCounter(M, Value);
       Second->ReadCounter(M, Value);
     }
    @@ -973,7 +973,7 @@ bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts,
           PP.getPreprocessorOpts());
     }
     
    -void PCHValidator::ReadCounter(const ModuleFile &M, unsigned Value) {
    +void PCHValidator::ReadCounter(const ModuleFile &M, uint32_t Value) {
       PP.setCounterValue(Value);
     }
     
    @@ -4087,10 +4087,14 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
                 std::errc::illegal_byte_sequence,
                 "Invalid PENDING_IMPLICIT_INSTANTIATIONS block");
     
    -      for (unsigned I = 0, N = Record.size(); I != N; /* in loop */) {
    -        PendingInstantiations.push_back(
    -            {ReadDeclID(F, Record, I),
    -             ReadSourceLocation(F, Record, I).getRawEncoding()});
     +      // For a standard C++20 module, we only read the pending
     +      // instantiations when the module file is the main file.
    +      if (!F.StandardCXXModule || F.Kind == MK_MainFile) {
    +        for (unsigned I = 0, N = Record.size(); I != N; /* in loop */) {
    +          PendingInstantiations.push_back(
    +              {ReadDeclID(F, Record, I),
    +               ReadSourceLocation(F, Record, I).getRawEncoding()});
    +        }
           }
           break;
     
    @@ -6438,10 +6442,13 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
         case SUBMODULE_INITIALIZERS: {
           if (!ContextObj)
             break;
     -      SmallVector<GlobalDeclID, 16> Inits;
    -      for (unsigned I = 0; I < Record.size(); /*in loop*/)
    -        Inits.push_back(ReadDeclID(F, Record, I));
    -      ContextObj->addLazyModuleInitializers(CurrentModule, Inits);
     +      // Standard C++ modules have their own way of initializing variables.
     +      if (!F.StandardCXXModule || F.Kind == MK_MainFile) {
     +        SmallVector<GlobalDeclID, 16> Inits;
    +        for (unsigned I = 0; I < Record.size(); /*in loop*/)
    +          Inits.push_back(ReadDeclID(F, Record, I));
    +        ContextObj->addLazyModuleInitializers(CurrentModule, Inits);
    +      }
           break;
         }
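
Roughly the situation both module-file guards address (a sketch under assumed file names): pending instantiations and module initializers recorded while building a BMI only need to be replayed when that module file is loaded as the main file, not on every transitive import.

    // a.cppm -> a.pcm records the initializer of 'cached' and any pending
    // implicit instantiations of 'twice'.
    export module a;
    export template <typename T> T twice(T x) { return x + x; }
    export inline int cached = twice(21);

    // consumer.cpp: loading a.pcm as a dependency should skip those records;
    // they matter only when a.pcm itself is the main file being compiled.
    import a;
    int use = cached;
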
     
    @@ -11544,6 +11551,9 @@ OMPClause *OMPClauseReader::readClause() {
       case llvm::omp::OMPC_ompx_dyn_cgroup_mem:
         C = new (Context) OMPXDynCGroupMemClause();
         break;
    +  case llvm::omp::OMPC_dyn_groupprivate:
    +    C = new (Context) OMPDynGroupprivateClause();
    +    break;
       case llvm::omp::OMPC_doacross: {
         unsigned NumVars = Record.readInt();
         unsigned NumLoops = Record.readInt();
    @@ -12736,6 +12746,19 @@ void OMPClauseReader::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
       C->setLParenLoc(Record.readSourceLocation());
     }
     
    +void OMPClauseReader::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
     +  C->setDynGroupprivateModifier(
     +      Record.readEnum<OpenMPDynGroupprivateClauseModifier>());
     +  C->setDynGroupprivateFallbackModifier(
     +      Record.readEnum<OpenMPDynGroupprivateClauseFallbackModifier>());
    +  C->setSize(Record.readSubExpr());
    +  C->setLParenLoc(Record.readSourceLocation());
    +  C->setDynGroupprivateModifierLoc(Record.readSourceLocation());
    +  C->setDynGroupprivateFallbackModifierLoc(Record.readSourceLocation());
    +}
    +
     void OMPClauseReader::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
       C->setLParenLoc(Record.readSourceLocation());
       C->setDependenceType(
    diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
    index 3ac338e013deb..e4618d60a8acb 100644
    --- a/clang/lib/Serialization/ASTWriter.cpp
    +++ b/clang/lib/Serialization/ASTWriter.cpp
    @@ -3247,7 +3247,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule, ASTContext *Context) {
     
         // Emit the reachable initializers.
         // The initializer may only be unreachable in reduced BMI.
    -    if (Context) {
    +    if (Context && !GeneratingReducedBMI) {
           RecordData Inits;
           for (Decl *D : Context->getModuleInitializers(Mod))
             if (wasDeclEmitted(D))
    @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait
         // parent of parent. We DON'T remove the enum constant from its parent. So
         // we don't need to care about merging problems here.
         if (auto *ECD = dyn_cast<EnumConstantDecl>(D);
    -        ECD && DC.isFileContext() && ECD->getOwningModule() &&
    -        ECD->getTopLevelOwningNamedModule()->isNamedModule()) {
    +        ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) {
           if (llvm::all_of(
                   DC.noload_lookup(
                      cast<EnumDecl>(ECD->getDeclContext())->getDeclName()),
    @@ -5828,17 +5827,19 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
         Stream.EmitRecord(UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES,
                           UnusedLocalTypedefNameCandidates);
     
    -  // Write the record containing pending implicit instantiations.
    -  RecordData PendingInstantiations;
    -  for (const auto &I : SemaRef.PendingInstantiations) {
    -    if (!wasDeclEmitted(I.first))
    -      continue;
    +  if (!GeneratingReducedBMI) {
    +    // Write the record containing pending implicit instantiations.
    +    RecordData PendingInstantiations;
    +    for (const auto &I : SemaRef.PendingInstantiations) {
    +      if (!wasDeclEmitted(I.first))
    +        continue;
     
    -    AddDeclRef(I.first, PendingInstantiations);
    -    AddSourceLocation(I.second, PendingInstantiations);
    +      AddDeclRef(I.first, PendingInstantiations);
    +      AddSourceLocation(I.second, PendingInstantiations);
    +    }
    +    if (!PendingInstantiations.empty())
    +      Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
       }
    -  if (!PendingInstantiations.empty())
    -    Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
     
       // Write the record containing declaration references of Sema.
       RecordData SemaDeclRefs;
    @@ -8652,6 +8653,17 @@ void OMPClauseWriter::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
       Record.AddSourceLocation(C->getLParenLoc());
     }
     
    +void OMPClauseWriter::VisitOMPDynGroupprivateClause(
    +    OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  Record.push_back(C->getDynGroupprivateModifier());
    +  Record.push_back(C->getDynGroupprivateFallbackModifier());
    +  Record.AddStmt(C->getSize());
    +  Record.AddSourceLocation(C->getLParenLoc());
    +  Record.AddSourceLocation(C->getDynGroupprivateModifierLoc());
    +  Record.AddSourceLocation(C->getDynGroupprivateFallbackModifierLoc());
    +}
    +
     void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
       Record.push_back(C->varlist_size());
       Record.push_back(C->getNumLoops());
    diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
    index a8c487005f6ec..c9f8797ab973f 100644
    --- a/clang/lib/Serialization/ASTWriterDecl.cpp
    +++ b/clang/lib/Serialization/ASTWriterDecl.cpp
    @@ -903,7 +903,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
       Record.push_back(D->getODRHash());
     
       if (D->isDefaulted() || D->isDeletedAsWritten()) {
    -    if (auto *FDI = D->getDefalutedOrDeletedInfo()) {
    +    if (auto *FDI = D->getDefaultedOrDeletedInfo()) {
           // Store both that there is an DefaultedOrDeletedInfo and whether it
           // contains a DeletedMessage.
           StringLiteral *DeletedMessage = FDI->getDeletedMessage();
    diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    index 3ddd6590fcbb0..68bee710e5ce5 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
    @@ -270,8 +270,7 @@ REGISTER_LIST_WITH_PROGRAMSTATE(ActiveCritSections, CritSectionMarker)
     // TODO: Move these to llvm::ImmutableList when overhauling immutable data
     // structures for proper iterator concept support.
     template <>
     -struct std::iterator_traits<
     -    typename llvm::ImmutableList<CritSectionMarker>::iterator> {
     +struct std::iterator_traits<llvm::ImmutableList<CritSectionMarker>::iterator> {
       using iterator_category = std::forward_iterator_tag;
       using value_type = CritSectionMarker;
       using difference_type = std::ptrdiff_t;
    diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    index 70baab54df563..ec7ef237b7c31 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
    @@ -6,41 +6,45 @@
     //
     //===----------------------------------------------------------------------===//
     //
    -// This file defines a variety of memory management related checkers, such as
    +// This file defines checkers that report memory management errors such as
     // leak, double free, and use-after-free.
     //
    -// The following checkers are defined here:
    +// The logic for modeling memory allocations is implemented in the checker
    +// family which is called 'MallocChecker' for historical reasons. (This name is
    +// inaccurate, something like 'DynamicMemory' would be more precise.)
     //
    -//   * MallocChecker
    -//       Despite its name, it models all sorts of memory allocations and
    -//       de- or reallocation, including but not limited to malloc, free,
    -//       relloc, new, delete. It also reports on a variety of memory misuse
    -//       errors.
    -//       Many other checkers interact very closely with this checker, in fact,
    -//       most are merely options to this one. Other checkers may register
    -//       MallocChecker, but do not enable MallocChecker's reports (more details
    -//       to follow around its field, ChecksEnabled).
    -//       It also has a boolean "Optimistic" checker option, which if set to true
    -//       will cause the checker to model user defined memory management related
    -//       functions annotated via the attribute ownership_takes, ownership_holds
    -//       and ownership_returns.
    +// The reports produced by this backend are exposed through several frontends:
    +//  *   MallocChecker: reports all misuse of dynamic memory allocated by
    +//      malloc, related functions (like calloc, realloc etc.) and the functions
    +//      annotated by ownership_returns. (Here the name "MallocChecker" is
    +//      reasonably accurate; don't confuse this checker frontend with the whole
    +//      misnamed family.)
    +//  *   NewDeleteChecker: reports most misuse (anything but memory leaks) of
    +//      memory managed by the C++ operators new and new[].
    +//  *   NewDeleteLeaksChecker: reports leaks of dynamic memory allocated by
    +//      the C++ operators new and new[].
    +//  *   MismatchedDeallocatorChecker: reports situations where the allocation
    +//      and deallocation is mismatched, e.g. memory allocated via malloc is
    +//      passed to operator delete.
    +//  *   InnerPointerChecker: reports use of pointers to the internal buffer of
    +//      a std::string instance after operations that invalidate them.
    +//  *   TaintedAllocChecker: reports situations where the size argument of a
    +//      memory allocation function or array new operator is tainted (i.e. comes
    +//      from an untrusted source and can be controlled by an attacker).
     //
    -//   * NewDeleteChecker
    -//       Enables the modeling of new, new[], delete, delete[] in MallocChecker,
    -//       and checks for related double-free and use-after-free errors.
     +// In addition to these frontends, this file also defines the registration
     +// functions for "unix.DynamicMemoryModeling". This registers the callbacks of
     +// the checker family MallocChecker without enabling any of the frontends,
     +// and handles two checker options which are attached to this "modeling
    +// checker" because they affect multiple checker frontends.
     //
    -//   * NewDeleteLeaksChecker
    -//       Checks for leaks related to new, new[], delete, delete[].
    -//       Depends on NewDeleteChecker.
    -//
    -//   * MismatchedDeallocatorChecker
    -//       Enables checking whether memory is deallocated with the corresponding
    -//       allocation function in MallocChecker, such as malloc() allocated
    -//       regions are only freed by free(), new by delete, new[] by delete[].
    -//
    -//  InnerPointerChecker interacts very closely with MallocChecker, but unlike
    -//  the above checkers, it has it's own file, hence the many InnerPointerChecker
    -//  related headers and non-static functions.
    +// Note that what the users see as the checker "cplusplus.InnerPointer" is a
    +// combination of the frontend InnerPointerChecker (within this family) which
    +// emits the bug reports and a separate checker class (also named
    +// InnerPointerChecker) which is defined in InnerPointerChecker.cpp and does a
    +// significant part of the modeling. This cooperation is enabled by several
    +// non-static helper functions that are defined within this translation unit
    +// and used in InnerPointerChecker.cpp.
     //
     //===----------------------------------------------------------------------===//
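
Two of the report categories described above, as they would appear in analyzed user code (illustrative):

    #include <cstdlib>
    #include <string>

    void mismatched() {
      int *p = static_cast<int *>(std::malloc(sizeof(int)));
      delete p; // MismatchedDeallocatorChecker: malloc'd memory freed by delete
    }

    void innerPointer() {
      std::string s = "hello";
      const char *c = s.c_str();
      s += " world"; // may reallocate, invalidating c
      (void)*c;      // InnerPointerChecker: use of an invalidated inner pointer
    }
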
     
    diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    index d3d1f13ab1c78..5cd894af1fd65 100644
    --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
    @@ -578,6 +578,10 @@ class TrivialFunctionAnalysisVisitor
         return WithCachedResult(CS, [&]() { return VisitChildren(CS); });
       }
     
    +  bool VisitCoroutineBodyStmt(const CoroutineBodyStmt *CBS) {
    +    return WithCachedResult(CBS, [&]() { return VisitChildren(CBS); });
    +  }
    +
       bool VisitReturnStmt(const ReturnStmt *RS) {
         // A return statement is allowed as long as the return value is trivial.
         if (auto *RV = RS->getRetValue())
    diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    index 63f0d70238992..0ba3c05d2d163 100644
    --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
    @@ -3254,9 +3254,6 @@ bool ConditionBRVisitor::printValue(const Expr *CondVarExpr, raw_ostream &Out,
       return true;
     }
     
    -constexpr llvm::StringLiteral ConditionBRVisitor::GenericTrueMessage;
    -constexpr llvm::StringLiteral ConditionBRVisitor::GenericFalseMessage;
    -
     bool ConditionBRVisitor::isPieceMessageGeneric(
         const PathDiagnosticPiece *Piece) {
       return Piece->getString() == GenericTrueMessage ||
    diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
    index 62460cc6f5b19..d04c827ce1391 100644
    --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
     @@ -230,13 +230,11 @@ static void findPtrToConstParams(llvm::SmallSet<unsigned, 4> &PreserveArgs,
     }
     
     ProgramStateRef CallEvent::invalidateRegions(unsigned BlockCount,
    -                                             ProgramStateRef Orig) const {
    -  ProgramStateRef Result = (Orig ? Orig : getState());
    -
    +                                             ProgramStateRef State) const {
       // Don't invalidate anything if the callee is marked pure/const.
    -  if (const Decl *callee = getDecl())
     -    if (callee->hasAttr<PureAttr>() || callee->hasAttr<ConstAttr>())
     -      return Result;
     +  if (const Decl *Callee = getDecl())
     +    if (Callee->hasAttr<PureAttr>() || Callee->hasAttr<ConstAttr>())
    +      return State;
     
       SmallVector<SVal, 8> ValuesToInvalidate;
       RegionAndSymbolInvalidationTraits ETraits;
    @@ -278,10 +276,10 @@ ProgramStateRef CallEvent::invalidateRegions(unsigned BlockCount,
       // Invalidate designated regions using the batch invalidation API.
       // NOTE: Even if RegionsToInvalidate is empty, we may still invalidate
       //  global variables.
    -  return Result->invalidateRegions(ValuesToInvalidate, getCFGElementRef(),
    -                                   BlockCount, getLocationContext(),
    -                                   /*CausedByPointerEscape*/ true,
    -                                   /*Symbols=*/nullptr, this, &ETraits);
    +  return State->invalidateRegions(ValuesToInvalidate, getCFGElementRef(),
    +                                  BlockCount, getLocationContext(),
    +                                  /*CausedByPointerEscape*/ true,
    +                                  /*Symbols=*/nullptr, this, &ETraits);
     }
     
     ProgramPoint CallEvent::getProgramPoint(bool IsPreVisit,
    diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    index 75d7e265af0f3..00e3ef8311919 100644
    --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
    @@ -1013,7 +1013,7 @@ void ExprEngine::VisitCXXNewExpr(const CXXNewExpr *CNE, ExplodedNode *Pred,
         // FIXME: Once we figure out how we want allocators to work,
         // we should be using the usual pre-/(default-)eval-/post-call checkers
         // here.
    -    State = Call->invalidateRegions(blockCount);
    +    State = Call->invalidateRegions(blockCount, State);
         if (!State)
           return;
     
    diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    index 2838533c1a406..4f4824a3616ce 100644
    --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
    @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager {
         return getBinding(getRegionBindings(S), L, T);
       }
     
     -  std::optional<SVal> getUniqueDefaultBinding(RegionBindingsConstRef B,
     -                                              const TypedValueRegion *R) const;
     -  std::optional<SVal>
    -  getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const;
    -
       std::optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override {
         RegionBindingsRef B = getRegionBindings(S);
         // Default bindings are always applied over a base region so look up the
    @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B,
       // behavior doesn't depend on the struct layout.
       // This way even an empty struct can carry taint, no matter if creduce drops
       // the last field member or not.
    -
    -  // Try to avoid creating a LCV if it would anyways just refer to a single
    -  // default binding.
     -  if (std::optional<SVal> Val = getUniqueDefaultBinding(B, R))
    -    return *Val;
       return createLazyBinding(B, R);
     }
     
    @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B,
       return NewB;
     }
     
     -std::optional<SVal>
    -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B,
    -                                            const TypedValueRegion *R) const {
    -  if (R != R->getBaseRegion())
    -    return std::nullopt;
    -
    -  const auto *Cluster = B.lookup(R);
    -  if (!Cluster || !llvm::hasSingleElement(*Cluster))
    -    return std::nullopt;
    -
    -  const auto [Key, Value] = *Cluster->begin();
     -  return Key.isDirect() ? std::optional<SVal>{} : Value;
    -}
    -
     -std::optional<SVal>
    -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const {
    -  auto B = getRegionBindings(LCV.getStore());
    -  return getUniqueDefaultBinding(B, LCV.getRegion());
    -}
    -
     std::optional RegionStoreManager::tryBindSmallStruct(
         LimitedRegionBindingsConstRef B, const TypedValueRegion *R,
         const RecordDecl *RD, nonloc::LazyCompoundVal LCV) {
       if (B.hasExhaustedBindingLimit())
         return B.withValuesEscaped(LCV);
     
    -  // If we try to copy a Conjured value representing the value of the whole
    -  // struct, don't try to element-wise copy each field.
    -  // That would unnecessarily bind Derived symbols slicing off the subregion for
    -  // the field from the whole Conjured symbol.
    -  //
    -  //   struct Window { int width; int height; };
    -  //   Window getWindow(); <-- opaque fn.
    -  //   Window w = getWindow(); <-- conjures a new Window.
    -  //   Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct"
    -  //
    -  // We should not end up with a new Store for "w2" like this:
    -  //   Direct [ 0..31]: Derived{Conj{}, w.width}
    -  //   Direct [32..63]: Derived{Conj{}, w.height}
    -  // Instead, we should just bind that Conjured value instead.
     -  if (std::optional<SVal> Val = getUniqueDefaultBinding(LCV)) {
    -    return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value());
    -  }
    -
       FieldVector Fields;
     
       if (const CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(RD))
    diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt
    index fc1f1f9f9d367..faaa53276d0e6 100644
    --- a/clang/lib/Tooling/CMakeLists.txt
    +++ b/clang/lib/Tooling/CMakeLists.txt
    @@ -40,6 +40,7 @@ add_clang_library(clangTooling
       clangASTMatchers
       clangBasic
       clangDriver
    +  clangOptions
       clangFormat
       clangFrontend
       clangLex
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    index 42f52d0ff6241..4178d1fd352c3 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp
    @@ -1,4 +1,4 @@
    -//===- DependencyScanner.cpp - Performs module dependency scanning --------===//
    +//===- DependencyScannerImpl.cpp - Implements module dependency scanning --===//
     //
     // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
     // See https://llvm.org/LICENSE.txt for license information.
    @@ -12,6 +12,7 @@
     #include "clang/Driver/Driver.h"
     #include "clang/Frontend/FrontendActions.h"
     #include "clang/Tooling/DependencyScanning/DependencyScanningWorker.h"
    +#include "llvm/ADT/ScopeExit.h"
     #include "llvm/TargetParser/Host.h"
     
     using namespace clang;
    @@ -350,7 +351,7 @@ void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) {
       //       See `test/ClangScanDeps/diagnostic-pragmas.c` for an example.
       llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) {
         return llvm::StringSwitch<bool>(Warning)
    -        .Cases("pch-vfs-diff", "error=pch-vfs-diff", false)
    +        .Cases({"pch-vfs-diff", "error=pch-vfs-diff"}, false)
             .StartsWith("no-error=", false)
             .Default(true);
       });
    @@ -456,7 +457,8 @@ initVFSForTUBuferScanning(IntrusiveRefCntPtr BaseFS,
       return std::make_pair(ModifiedFS, ModifiedCommandLine);
     }
     
    -std::pair, std::vector>
    +std::pair,
    +          std::vector>
     initVFSForByNameScanning(IntrusiveRefCntPtr BaseFS,
                              ArrayRef CommandLine,
                              StringRef WorkingDirectory, StringRef ModuleName) {
    @@ -588,7 +590,7 @@ computePrebuiltModulesASTMap(CompilerInstance &ScanInstance,
     }
     
      std::unique_ptr<DependencyOutputOptions>
    -takeDependencyOutputOptionsFrom(CompilerInstance &ScanInstance) {
    +takeAndUpdateDependencyOutputOptionsFrom(CompilerInstance &ScanInstance) {
       // This function moves the existing dependency output options from the
       // invocation to the collector. The options in the invocation are reset,
       // which ensures that the compiler won't create new dependency collectors,
    @@ -675,7 +677,7 @@ bool DependencyScanningAction::runInvocation(
       if (!MaybePrebuiltModulesASTMap)
         return false;
     
    -  auto DepOutputOpts = takeDependencyOutputOptionsFrom(ScanInstance);
    +  auto DepOutputOpts = takeAndUpdateDependencyOutputOptionsFrom(ScanInstance);
     
       MDC = initializeScanInstanceDependencyCollector(
           ScanInstance, std::move(DepOutputOpts), WorkingDirectory, Consumer,
    @@ -686,8 +688,6 @@ bool DependencyScanningAction::runInvocation(
     
       if (Service.getFormat() == ScanningOutputFormat::P1689)
          Action = std::make_unique<PreprocessOnlyAction>();
     -  else if (ModuleName)
     -    Action = std::make_unique<GetDependenciesByModuleNameAction>(*ModuleName);
        else
          Action = std::make_unique<ReadPCHAndPreprocessAction>();
     
    @@ -704,3 +704,175 @@ bool DependencyScanningAction::runInvocation(
     
       return Result;
     }
    +
    +bool CompilerInstanceWithContext::initialize(DiagnosticConsumer *DC) {
    +  if (DC) {
    +    DiagConsumer = DC;
    +  } else {
    +    DiagPrinterWithOS =
    +        std::make_unique(CommandLine);
    +    DiagConsumer = &DiagPrinterWithOS->DiagPrinter;
    +  }
    +
    +  std::tie(OverlayFS, CommandLine) = initVFSForByNameScanning(
    +      Worker.BaseFS, CommandLine, CWD, "ScanningByName");
    +
    +  DiagEngineWithCmdAndOpts = std::make_unique(
    +      CommandLine, OverlayFS, *DiagConsumer);
    +
    +  std::tie(Driver, Compilation) = buildCompilation(
    +      CommandLine, *DiagEngineWithCmdAndOpts->DiagEngine, OverlayFS, Alloc);
    +
    +  if (!Compilation)
    +    return false;
    +
    +  assert(Compilation->getJobs().size() &&
    +         "Must have a job list of non-zero size");
    +  const driver::Command &Command = *(Compilation->getJobs().begin());
    +  const auto &CommandArgs = Command.getArguments();
    +  assert(!CommandArgs.empty() && "Cannot have a command with 0 args");
    +  assert(StringRef(CommandArgs[0]) == "-cc1" && "Requires a cc1 job.");
     +  OriginalInvocation = std::make_unique<CompilerInvocation>();
    +
    +  if (!CompilerInvocation::CreateFromArgs(*OriginalInvocation, CommandArgs,
    +                                          *DiagEngineWithCmdAndOpts->DiagEngine,
    +                                          Command.getExecutable())) {
    +    DiagEngineWithCmdAndOpts->DiagEngine->Report(
    +        diag::err_fe_expected_compiler_job)
    +        << llvm::join(CommandLine, " ");
    +    return false;
    +  }
    +
    +  if (any(Worker.Service.getOptimizeArgs() & ScanningOptimizations::Macros))
    +    canonicalizeDefines(OriginalInvocation->getPreprocessorOpts());
    +
    +  // Create the CompilerInstance.
     +  IntrusiveRefCntPtr<ModuleCache> ModCache =
     +      makeInProcessModuleCache(Worker.Service.getModuleCacheEntries());
     +  CIPtr = std::make_unique<CompilerInstance>(
     +      std::make_shared<CompilerInvocation>(*OriginalInvocation),
    +      Worker.PCHContainerOps, ModCache.get());
    +  auto &CI = *CIPtr;
    +
    +  if (!initializeScanCompilerInstance(
    +          CI, OverlayFS, DiagEngineWithCmdAndOpts->DiagEngine->getClient(),
    +          Worker.Service, Worker.DepFS))
    +    return false;
    +
    +  StableDirs = getInitialStableDirs(CI);
    +  auto MaybePrebuiltModulesASTMap =
    +      computePrebuiltModulesASTMap(CI, StableDirs);
    +  if (!MaybePrebuiltModulesASTMap)
    +    return false;
    +
    +  PrebuiltModuleASTMap = std::move(*MaybePrebuiltModulesASTMap);
    +  OutputOpts = takeAndUpdateDependencyOutputOptionsFrom(CI);
    +
    +  // We do not create the target in initializeScanCompilerInstance because
     +  // setting it here is unique to by-name lookups. We create the target only
    +  // once here, and the information is reused for all computeDependencies calls.
    +  // We do not need to call createTarget explicitly if we go through
    +  // CompilerInstance::ExecuteAction to perform scanning.
    +  CI.createTarget();
    +
    +  return true;
    +}
    +
    +bool CompilerInstanceWithContext::computeDependencies(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  assert(CIPtr && "CIPtr must be initialized before calling this method");
    +  auto &CI = *CIPtr;
    +
    +  // We create this cleanup object because computeDependencies may exit
    +  // early with errors.
    +  auto CleanUp = llvm::make_scope_exit([&]() {
    +    CI.clearDependencyCollectors();
     +    // The preprocessor may not have been created yet when this method is
     +    // entered, but it will have been created by the time it returns,
     +    // whether or not scanning produced errors.
    +    CI.getPreprocessor().removePPCallbacks();
    +  });
    +
    +  auto MDC = initializeScanInstanceDependencyCollector(
     +      CI, std::make_unique<DependencyOutputOptions>(*OutputOpts), CWD, Consumer,
    +      Worker.Service,
    +      /* The MDC's constructor makes a copy of the OriginalInvocation, so
    +      we can pass it in without worrying that it might be changed across
    +      invocations of computeDependencies. */
    +      *OriginalInvocation, Controller, PrebuiltModuleASTMap, StableDirs);
    +
    +  if (!SrcLocOffset) {
    +    // When SrcLocOffset is zero, we are at the beginning of the fake source
    +    // file. In this case, we call BeginSourceFile to initialize.
    +    std::unique_ptr Action =
    +        std::make_unique();
    +    auto InputFile = CI.getFrontendOpts().Inputs.begin();
    +    bool ActionBeginSucceeded = Action->BeginSourceFile(CI, *InputFile);
    +    assert(ActionBeginSucceeded && "Action BeginSourceFile must succeed");
    +    (void)ActionBeginSucceeded;
    +  }
    +
    +  Preprocessor &PP = CI.getPreprocessor();
    +  SourceManager &SM = PP.getSourceManager();
    +  FileID MainFileID = SM.getMainFileID();
    +  SourceLocation FileStart = SM.getLocForStartOfFile(MainFileID);
    +  SourceLocation IDLocation = FileStart.getLocWithOffset(SrcLocOffset);
    +  PPCallbacks *CB = nullptr;
    +  if (!SrcLocOffset) {
    +    // We need to call EnterSourceFile when SrcLocOffset is zero to initialize
    +    // the preprocessor.
    +    bool PPFailed = PP.EnterSourceFile(MainFileID, nullptr, SourceLocation());
     +    assert(!PPFailed && "Preprocessor must be able to enter the main file.");
    +    (void)PPFailed;
    +    CB = MDC->getPPCallbacks();
    +  } else {
    +    // When SrcLocOffset is non-zero, the preprocessor has already been
    +    // initialized through a previous call of computeDependencies. We want to
    +    // preserve the PP's state, hence we do not call EnterSourceFile again.
    +    MDC->attachToPreprocessor(PP);
    +    CB = MDC->getPPCallbacks();
    +
    +    FileID PrevFID;
    +    SrcMgr::CharacteristicKind FileType = SM.getFileCharacteristic(IDLocation);
    +    CB->LexedFileChanged(MainFileID,
    +                         PPChainedCallbacks::LexedFileChangeReason::EnterFile,
    +                         FileType, PrevFID, IDLocation);
    +  }
    +
    +  SrcLocOffset++;
     +  SmallVector<IdentifierLoc, 2> Path;
    +  IdentifierInfo *ModuleID = PP.getIdentifierInfo(ModuleName);
    +  Path.emplace_back(IDLocation, ModuleID);
    +  auto ModResult = CI.loadModule(IDLocation, Path, Module::Hidden, false);
    +
    +  assert(CB && "Must have PPCallbacks after module loading");
    +  CB->moduleImport(SourceLocation(), Path, ModResult);
    +  // Note that we are calling the CB's EndOfMainFile function, which
    +  // forwards the results to the dependency consumer.
    +  // It does not indicate the end of processing the fake file.
    +  CB->EndOfMainFile();
    +
    +  if (!ModResult)
    +    return false;
    +
    +  CompilerInvocation ModuleInvocation(*OriginalInvocation);
    +  MDC->applyDiscoveredDependencies(ModuleInvocation);
    +  Consumer.handleBuildCommand(
    +      {CommandLine[0], ModuleInvocation.getCC1CommandLine()});
    +
    +  return true;
    +}
    +
    +bool CompilerInstanceWithContext::finalize() {
    +  DiagConsumer->finish();
    +  return true;
    +}
    +
    +llvm::Error CompilerInstanceWithContext::handleReturnStatus(bool Success) {
    +  assert(DiagPrinterWithOS && "Must use the default DiagnosticConsumer.");
    +  return Success ? llvm::Error::success()
     +                 : llvm::make_error<llvm::StringError>(
    +                       DiagPrinterWithOS->DiagnosticsOS.str(),
    +                       llvm::inconvertibleErrorCode());
    +}
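
A minimal sketch of the intended call sequence; it mirrors the use in DependencyScanningTool.cpp below, and every name other than the class's own methods is illustrative:

    bool scanModulesByName(CompilerInstanceWithContext &CI,
                           llvm::ArrayRef<llvm::StringRef> Modules,
                           DependencyConsumer &Consumer,
                           DependencyActionController &Controller) {
      // initialize() builds the driver job, CompilerInvocation, and
      // CompilerInstance once; nullptr selects the built-in diagnostic printer.
      if (!CI.initialize(/*DC=*/nullptr))
        return false;
      bool Ok = true;
      for (llvm::StringRef M : Modules)
        // Each call replays one fake module import against the shared
        // preprocessor state.
        Ok &= CI.computeDependencies(M, Consumer, Controller);
      CI.finalize(); // flush the diagnostic consumer
      return Ok;
    }
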
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    index 5657317565e8d..54166dabebb05 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h
    @@ -1,4 +1,4 @@
    -//===- DependencyScanner.h - Performs module dependency scanning *- C++ -*-===//
    +//===- DependencyScannerImpl.h - Implements dependency scanning *- C++ -*--===//
     //
     // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
     // See https://llvm.org/LICENSE.txt for license information.
    @@ -23,6 +23,8 @@ class DiagnosticConsumer;
     namespace tooling {
     namespace dependencies {
     class DependencyScanningService;
    +class DependencyScanningWorker;
    +
     class DependencyConsumer;
     class DependencyActionController;
     class DependencyScanningWorkerFilesystem;
    @@ -35,8 +37,7 @@ class DependencyScanningAction {
           IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS,
           std::optional<StringRef> ModuleName = std::nullopt)
           : Service(Service), WorkingDirectory(WorkingDirectory),
    -        Consumer(Consumer), Controller(Controller), DepFS(std::move(DepFS)),
    -        ModuleName(ModuleName) {}
    +        Consumer(Consumer), Controller(Controller), DepFS(std::move(DepFS)) {}
       bool runInvocation(std::unique_ptr<CompilerInvocation> Invocation,
                          IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
                          std::shared_ptr<PCHContainerOperations> PCHContainerOps,
    @@ -66,7 +67,6 @@ class DependencyScanningAction {
       DependencyConsumer &Consumer;
       DependencyActionController &Controller;
       IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> DepFS;
     -  std::optional<StringRef> ModuleName;
       std::optional<CompilerInstance> ScanInstanceStorage;
       std::shared_ptr<ModuleDepCollector> MDC;
       std::vector<std::string> LastCC1Arguments;
    @@ -118,7 +118,8 @@ initVFSForTUBuferScanning(IntrusiveRefCntPtr BaseFS,
                               StringRef WorkingDirectory,
                               llvm::MemoryBufferRef TUBuffer);
     
    -std::pair, std::vector>
    +std::pair,
    +          std::vector>
     initVFSForByNameScanning(IntrusiveRefCntPtr BaseFS,
                              ArrayRef CommandLine,
                              StringRef WorkingDirectory, StringRef ModuleName);
    @@ -137,7 +138,7 @@ computePrebuiltModulesASTMap(CompilerInstance &ScanInstance,
                                   SmallVector<StringRef> &StableDirs);
     
      std::unique_ptr<DependencyOutputOptions>
    -takeDependencyOutputOptionsFrom(CompilerInstance &ScanInstance);
    +takeAndUpdateDependencyOutputOptionsFrom(CompilerInstance &ScanInstance);
     
     /// Create the dependency collector that will collect the produced
     /// dependencies. May return the created ModuleDepCollector depending
    @@ -150,6 +151,60 @@ std::shared_ptr<ModuleDepCollector> initializeScanInstanceDependencyCollector(
         DependencyActionController &Controller,
         PrebuiltModulesAttrsMap PrebuiltModulesASTMap,
         llvm::SmallVector<StringRef> &StableDirs);
    +
    +class CompilerInstanceWithContext {
    +  // Context
    +  DependencyScanningWorker &Worker;
    +  llvm::StringRef CWD;
    +  std::vector<std::string> CommandLine;
    +
    +  // Context - file systems
    +  llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFS;
    +
    +  // Context - Diagnostics engine.
    +  std::unique_ptr<TextDiagnosticsPrinterWithOutput> DiagPrinterWithOS;
    +  // DiagConsumer may point to DiagPrinterWithOS->DiagPrinter, or to a custom
    +  // DiagnosticConsumer passed in from initialize.
    +  DiagnosticConsumer *DiagConsumer = nullptr;
    +  std::unique_ptr<DignosticsEngineWithDiagOpts> DiagEngineWithCmdAndOpts;
    +
    +  // Context - compiler invocation
    +  // The compilation command's arguments may be owned by Alloc when expanded
    +  // from response files, so we need to keep Alloc alive in the context.
    +  llvm::BumpPtrAllocator Alloc;
    +  std::unique_ptr<driver::Driver> Driver;
    +  std::unique_ptr<driver::Compilation> Compilation;
    +  std::unique_ptr<CompilerInvocation> OriginalInvocation;
    +
    +  // Context - output options
    +  std::unique_ptr<DependencyOutputOptions> OutputOpts;
    +
    +  // Context - stable directory handling
    +  llvm::SmallVector<StringRef> StableDirs;
    +  PrebuiltModulesAttrsMap PrebuiltModuleASTMap;
    +
    +  // Compiler Instance
    +  std::unique_ptr<CompilerInstance> CIPtr;
    +
    +  // Source location offset.
    +  int32_t SrcLocOffset = 0;
    +
    +public:
    +  CompilerInstanceWithContext(DependencyScanningWorker &Worker, StringRef CWD,
    +                              const std::vector<std::string> &CMD)
    +      : Worker(Worker), CWD(CWD), CommandLine(CMD) {}
    +
    +  // The three methods below return false when they fail, with the details
    +  // accumulated in DiagConsumer.
    +  bool initialize(DiagnosticConsumer *DC);
    +  bool computeDependencies(StringRef ModuleName, DependencyConsumer &Consumer,
    +                           DependencyActionController &Controller);
    +  bool finalize();
    +
    +  // The method below turns the return status from the above methods
    +  // into an llvm::Error using a default DiagnosticConsumer.
    +  llvm::Error handleReturnStatus(bool Success);
    +};
     } // namespace dependencies
     } // namespace tooling
     } // namespace clang
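
The class above splits by-name dependency scanning into three phases: initialize() once per command line, computeDependencies() once per module name, and finalize() to flush diagnostics, with handleReturnStatus() converting the captured diagnostics into an llvm::Error. A minimal usage sketch, assuming Worker, Cmd, Consumer, and Controller are already set up and that passing a null DiagnosticConsumer selects the built-in printer (both assumptions, not spelled out by the patch):

    // Sketch only: the intended lifecycle of CompilerInstanceWithContext.
    CompilerInstanceWithContext CI(Worker, /*CWD=*/"/work", Cmd);
    if (!CI.initialize(/*DC=*/nullptr))            // assumed: null selects the
      return CI.handleReturnStatus(false);         // default printer consumer
    for (StringRef Name : {"ModA", "ModB"})        // reuse one scan instance
      if (!CI.computeDependencies(Name, Consumer, Controller))
        return CI.handleReturnStatus(false);
    CI.finalize();
    return llvm::Error::success();
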
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    index 27734ffd0e20b..a1f2db7a471be 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
    @@ -162,13 +162,45 @@ DependencyScanningTool::getModuleDependencies(
         LookupModuleOutputCallback LookupModuleOutput) {
       FullDependencyConsumer Consumer(AlreadySeen);
       CallbackActionController Controller(LookupModuleOutput);
    -  llvm::Error Result = Worker.computeDependencies(CWD, CommandLine, Consumer,
    -                                                  Controller, ModuleName);
    +  if (auto Error =
    +          Worker.initializeCompilerInstanceWithContextOrError(CWD, CommandLine))
    +    return std::move(Error);
    +
    +  auto Result = Worker.computeDependenciesByNameWithContextOrError(
    +      ModuleName, Consumer, Controller);
    +
    +  if (auto Error = Worker.finalizeCompilerInstanceWithContextOrError())
    +    return std::move(Error);
    +
       if (Result)
         return std::move(Result);
    +
       return Consumer.takeTranslationUnitDeps();
     }
     
    +llvm::Error DependencyScanningTool::initializeCompilerInstanceWithContext(
    +    StringRef CWD, const std::vector<std::string> &CommandLine) {
    +  return Worker.initializeCompilerInstanceWithContextOrError(CWD, CommandLine);
    +}
    +
    +llvm::Expected<TranslationUnitDeps>
    +DependencyScanningTool::computeDependenciesByNameWithContext(
    +    StringRef ModuleName, const llvm::DenseSet<ModuleID> &AlreadySeen,
    +    LookupModuleOutputCallback LookupModuleOutput) {
    +  FullDependencyConsumer Consumer(AlreadySeen);
    +  CallbackActionController Controller(LookupModuleOutput);
    +  llvm::Error Result = Worker.computeDependenciesByNameWithContextOrError(
    +      ModuleName, Consumer, Controller);
    +  if (Result)
    +    return std::move(Result);
    +
    +  return Consumer.takeTranslationUnitDeps();
    +}
    +
    +llvm::Error DependencyScanningTool::finalizeCompilerInstanceWithContext() {
    +  return Worker.finalizeCompilerInstanceWithContextOrError();
    +}
    +
     TranslationUnitDeps FullDependencyConsumer::takeTranslationUnitDeps() {
       TranslationUnitDeps TU;
     
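
At the tool level the same lifecycle is exposed as three llvm::Error-returning entry points, so callers can scan many modules against one initialized instance instead of paying the setup cost per module. A hedged sketch (Tool, CWD, Cmd, ModuleNames, AlreadySeen, and LookupOutput are assumed to exist, as in getModuleDependencies above):

    // Sketch only: one initialize, N by-name scans, one finalize.
    if (llvm::Error E = Tool.initializeCompilerInstanceWithContext(CWD, Cmd))
      return E;
    for (StringRef Name : ModuleNames) {
      llvm::Expected<TranslationUnitDeps> Deps =
          Tool.computeDependenciesByNameWithContext(Name, AlreadySeen,
                                                    LookupOutput);
      if (!Deps)
        return Deps.takeError();
      // ... record *Deps ...
    }
    return Tool.finalizeCompilerInstanceWithContext();
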
    diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    index 0a1cf6b18b11c..dc408b10542c3 100644
    --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
    @@ -43,6 +43,9 @@ DependencyScanningWorker::DependencyScanningWorker(
       }
     }
     
    +DependencyScanningWorker::~DependencyScanningWorker() = default;
    +DependencyActionController::~DependencyActionController() = default;
    +
     llvm::Error DependencyScanningWorker::computeDependencies(
         StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
         DependencyConsumer &Consumer, DependencyActionController &Controller,
    @@ -58,21 +61,6 @@ llvm::Error DependencyScanningWorker::computeDependencies(
           DiagPrinterWithOS.DiagnosticsOS.str(), llvm::inconvertibleErrorCode());
     }
     
    -llvm::Error DependencyScanningWorker::computeDependencies(
    -    StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
    -    DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    StringRef ModuleName) {
    -  // Capture the emitted diagnostics and report them to the client
    -  // in the case of a failure.
    -  TextDiagnosticsPrinterWithOutput DiagPrinterWithOS(CommandLine);
    -
    -  if (computeDependencies(WorkingDirectory, CommandLine, Consumer, Controller,
    -                          DiagPrinterWithOS.DiagPrinter, ModuleName))
    -    return llvm::Error::success();
    -  return llvm::make_error<llvm::StringError>(
    -      DiagPrinterWithOS.DiagnosticsOS.str(), llvm::inconvertibleErrorCode());
    -}
    -
     static bool forEachDriverJob(
         ArrayRef<std::string> ArgStrs, DiagnosticsEngine &Diags,
         IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
    @@ -113,11 +101,11 @@ static bool createAndRunToolInvocation(
     bool DependencyScanningWorker::scanDependencies(
         StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
         DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    DiagnosticConsumer &DC, llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
    -    std::optional<StringRef> ModuleName) {
    +    DiagnosticConsumer &DC,
    +    llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS) {
       DignosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC);
       DependencyScanningAction Action(Service, WorkingDirectory, Consumer,
    -                                  Controller, DepFS, ModuleName);
    +                                  Controller, DepFS);
     
       bool Success = false;
       if (CommandLine[1] == "-cc1") {
    @@ -172,24 +160,51 @@ bool DependencyScanningWorker::computeDependencies(
         auto [FinalFS, FinalCommandLine] = initVFSForTUBuferScanning(
             BaseFS, CommandLine, WorkingDirectory, *TUBuffer);
         return scanDependencies(WorkingDirectory, FinalCommandLine, Consumer,
    -                            Controller, DC, FinalFS,
    -                            /*ModuleName=*/std::nullopt);
    +                            Controller, DC, FinalFS);
       } else {
         BaseFS->setCurrentWorkingDirectory(WorkingDirectory);
         return scanDependencies(WorkingDirectory, CommandLine, Consumer, Controller,
    -                            DC, BaseFS, /*ModuleName=*/std::nullopt);
    +                            DC, BaseFS);
       }
     }
     
    -bool DependencyScanningWorker::computeDependencies(
    -    StringRef WorkingDirectory, const std::vector<std::string> &CommandLine,
    -    DependencyConsumer &Consumer, DependencyActionController &Controller,
    -    DiagnosticConsumer &DC, StringRef ModuleName) {
    -  auto [OverlayFS, ModifiedCommandLine] = initVFSForByNameScanning(
    -      BaseFS, CommandLine, WorkingDirectory, ModuleName);
    +llvm::Error
    +DependencyScanningWorker::initializeCompilerInstanceWithContextOrError(
    +    StringRef CWD, const std::vector<std::string> &CommandLine) {
    +  bool Success = initializeCompilerInstanceWithContext(CWD, CommandLine);
    +  return CIWithContext->handleReturnStatus(Success);
    +}
    +
    +llvm::Error
    +DependencyScanningWorker::computeDependenciesByNameWithContextOrError(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  bool Success =
    +      computeDependenciesByNameWithContext(ModuleName, Consumer, Controller);
    +  return CIWithContext->handleReturnStatus(Success);
    +}
    +
    +llvm::Error
    +DependencyScanningWorker::finalizeCompilerInstanceWithContextOrError() {
    +  bool Success = finalizeCompilerInstance();
    +  return CIWithContext->handleReturnStatus(Success);
    +}
     
    -  return scanDependencies(WorkingDirectory, ModifiedCommandLine, Consumer,
    -                          Controller, DC, OverlayFS, ModuleName);
    +bool DependencyScanningWorker::initializeCompilerInstanceWithContext(
    +    StringRef CWD, const std::vector<std::string> &CommandLine,
    +    DiagnosticConsumer *DC) {
    +  CIWithContext =
    +      std::make_unique<CompilerInstanceWithContext>(*this, CWD, CommandLine);
    +  return CIWithContext->initialize(DC);
     }
     
    -DependencyActionController::~DependencyActionController() {}
    +bool DependencyScanningWorker::computeDependenciesByNameWithContext(
    +    StringRef ModuleName, DependencyConsumer &Consumer,
    +    DependencyActionController &Controller) {
    +  assert(CIWithContext && "CompilerInstance with context required!");
    +  return CIWithContext->computeDependencies(ModuleName, Consumer, Controller);
    +}
    +
    +bool DependencyScanningWorker::finalizeCompilerInstance() {
    +  return CIWithContext->finalize();
    +}
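
The worker keeps two flavors of each phase: a bool-returning form that accepts a caller-supplied DiagnosticConsumer, and an *OrError wrapper that funnels failures through CIWithContext->handleReturnStatus() and the default diagnostics printer. A sketch of the custom-consumer path (MyDiagConsumer is a hypothetical DiagnosticConsumer subclass, not part of the patch):

    // Sketch only: bool-returning flavor with a caller-owned consumer.
    MyDiagConsumer DC; // collects diagnostics however the caller wants
    if (!Worker.initializeCompilerInstanceWithContext(CWD, Cmd, &DC)) {
      // Failure details were delivered to DC. Note handleReturnStatus()
      // asserts the default consumer, so report through DC here instead.
    }
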
    diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    index a117bec0d656e..0022597348a82 100644
    --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
    @@ -471,82 +471,13 @@ static bool isSafeToIgnoreCWD(const CowCompilerInvocation &CI) {
       // Check if the command line input uses relative paths.
       // It is not safe to ignore the current working directory if any of the
       // command line inputs use relative paths.
    -#define IF_RELATIVE_RETURN_FALSE(PATH)                                         \
    -  do {                                                                         \
    -    if (!PATH.empty() && !llvm::sys::path::is_absolute(PATH))                  \
    -      return false;                                                            \
    -  } while (0)
    -
    -#define IF_ANY_RELATIVE_RETURN_FALSE(PATHS)                                    \
    -  do {                                                                         \
    -    if (llvm::any_of(PATHS, [](const auto &P) {                                \
    -          return !P.empty() && !llvm::sys::path::is_absolute(P);               \
    -        }))                                                                    \
    -      return false;                                                            \
    -  } while (0)
    -
    -  // Header search paths.
    -  const auto &HeaderSearchOpts = CI.getHeaderSearchOpts();
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.Sysroot);
    -  for (auto &Entry : HeaderSearchOpts.UserEntries)
    -    if (Entry.IgnoreSysRoot)
    -      IF_RELATIVE_RETURN_FALSE(Entry.Path);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ResourceDir);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleCachePath);
    -  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleUserBuildPath);
    -  for (auto I = HeaderSearchOpts.PrebuiltModuleFiles.begin(),
    -            E = HeaderSearchOpts.PrebuiltModuleFiles.end();
    -       I != E;) {
    -    auto Current = I++;
    -    IF_RELATIVE_RETURN_FALSE(Current->second);
    -  }
    -  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.PrebuiltModulePaths);
    -  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.VFSOverlayFiles);
    -
    -  // Preprocessor options.
    -  const auto &PPOpts = CI.getPreprocessorOpts();
    -  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.MacroIncludes);
    -  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.Includes);
    -  IF_RELATIVE_RETURN_FALSE(PPOpts.ImplicitPCHInclude);
    -
    -  // Frontend options.
    -  const auto &FrontendOpts = CI.getFrontendOpts();
    -  for (const FrontendInputFile &Input : FrontendOpts.Inputs) {
    -    if (Input.isBuffer())
    -      continue; // FIXME: Can this happen when parsing command-line?
    -
    -    IF_RELATIVE_RETURN_FALSE(Input.getFile());
    -  }
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.CodeCompletionAt.FileName);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleMapFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModulesEmbedFiles);
    -  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ASTMergeFiles);
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.OverrideRecordLayoutsFile);
    -  IF_RELATIVE_RETURN_FALSE(FrontendOpts.StatsFile);
    -
    -  // Filesystem options.
    -  const auto &FileSystemOpts = CI.getFileSystemOpts();
    -  IF_RELATIVE_RETURN_FALSE(FileSystemOpts.WorkingDir);
    -
    -  // Codegen options.
    -  const auto &CodeGenOpts = CI.getCodeGenOpts();
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.DebugCompilationDir);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.CoverageCompilationDir);
    -
    -  // Sanitizer options.
    -  IF_ANY_RELATIVE_RETURN_FALSE(CI.getLangOpts().NoSanitizeFiles);
    -
    -  // Coverage mappings.
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileInstrumentUsePath);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.SampleProfileFile);
    -  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileRemappingFile);
    -
    -  // Dependency output options.
    -  for (auto &ExtraDep : CI.getDependencyOutputOpts().ExtraDeps)
    -    IF_RELATIVE_RETURN_FALSE(ExtraDep.first);
    -
    -  return true;
    +  bool AnyRelative = false;
    +  CI.visitPaths([&](StringRef Path) {
    +    assert(!AnyRelative && "Continuing path visitation despite returning true");
    +    AnyRelative |= !Path.empty() && !llvm::sys::path::is_absolute(Path);
    +    return AnyRelative;
    +  });
    +  return !AnyRelative;
     }
     
     static std::string getModuleContextHash(const ModuleDeps &MD,
    @@ -965,7 +896,9 @@ ModuleDepCollector::ModuleDepCollector(
               makeCommonInvocationForModuleBuild(std::move(OriginalCI))) {}
     
     void ModuleDepCollector::attachToPreprocessor(Preprocessor &PP) {
    -  PP.addPPCallbacks(std::make_unique<ModuleDepCollectorPP>(*this));
    +  auto CollectorPP = std::make_unique<ModuleDepCollectorPP>(*this);
    +  CollectorPPPtr = CollectorPP.get();
    +  PP.addPPCallbacks(std::move(CollectorPP));
     }
     
     void ModuleDepCollector::attachToASTReader(ASTReader &R) {}
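
The deleted IF_RELATIVE_RETURN_FALSE machinery in isSafeToIgnoreCWD amounted to one predicate applied to every path recorded in the invocation; CompilerInvocation::visitPaths now walks those paths and stops as soon as the callback returns true. The predicate in standalone form, for reference (anyRelative is an illustrative helper, not part of the patch; needs llvm/ADT/ArrayRef.h and llvm/Support/Path.h):

    // Sketch only: what each macro expansion checked per option.
    static bool anyRelative(llvm::ArrayRef<std::string> Paths) {
      for (const std::string &P : Paths)
        if (!P.empty() && !llvm::sys::path::is_absolute(P))
          return true; // a relative path makes the CWD unsafe to ignore
      return false;
    }
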
    diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    index 28568426a6c48..e9b72388ae4df 100644
    --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
    @@ -44,8 +44,8 @@
     
     #include "clang/Basic/LangStandard.h"
     #include "clang/Driver/Driver.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Types.h"
    +#include "clang/Options/Options.h"
     #include "clang/Tooling/CompilationDatabase.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/DenseMap.h"
    @@ -164,11 +164,11 @@ struct TransferableCommand {
         // We parse each argument individually so that we can retain the exact
         // spelling of each argument; re-rendering is lossy for aliased flags.
         // E.g. in CL mode, /W4 maps to -Wall.
    -    auto &OptTable = clang::driver::getDriverOptTable();
    +    auto &OptTable = getDriverOptTable();
         if (!OldArgs.empty())
           Cmd.CommandLine.emplace_back(OldArgs.front());
         for (unsigned Pos = 1; Pos < OldArgs.size();) {
    -      using namespace driver::options;
    +      using namespace options;
     
           const unsigned OldPos = Pos;
           std::unique_ptr<llvm::opt::Arg> Arg(OptTable.ParseOneArg(
    @@ -296,14 +296,14 @@ struct TransferableCommand {
       // Try to interpret the argument as a type specifier, e.g. '-x'.
       std::optional<types::ID> tryParseTypeArg(const llvm::opt::Arg &Arg) {
         const llvm::opt::Option &Opt = Arg.getOption();
    -    using namespace driver::options;
    +    using namespace options;
         if (ClangCLMode) {
           if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc))
             return types::TY_C;
           if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp))
             return types::TY_CXX;
         } else {
    -      if (Opt.matches(driver::options::OPT_x))
    +      if (Opt.matches(options::OPT_x))
             return types::lookupTypeForTypeSpecifier(Arg.getValue());
         }
         return std::nullopt;
    @@ -311,7 +311,7 @@ struct TransferableCommand {
     
       // Try to interpret the argument as '-std='.
       std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
    -    using namespace driver::options;
    +    using namespace options;
         if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) {
           // "c++latest" is not a recognized LangStandard, but it's accepted by
           // the clang driver in CL mode.
    diff --git a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    index a06f7e2900d47..3d63d4ab506ab 100644
    --- a/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    +++ b/clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
    @@ -10,8 +10,6 @@
     
     namespace clang {
     namespace syntax {
    -constexpr llvm::StringLiteral syntax::TokenBufferTokenManager::Kind;
    -
     std::pair<FileID, ArrayRef<syntax::Token>>
     syntax::TokenBufferTokenManager::lexBuffer(
         std::unique_ptr<llvm::MemoryBuffer> Input) {
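
The out-of-line definition of Kind is deleted because a static constexpr data member has been implicitly inline since C++17, so the in-class initializer is already a definition. A minimal illustration (TokenManagerExample is hypothetical):

    struct TokenManagerExample {
      static constexpr llvm::StringLiteral Kind = "TokenBuffer"; // a definition
    };
    // Pre-C++17, odr-using Kind also required the out-of-line
    //   constexpr llvm::StringLiteral TokenManagerExample::Kind;
    // since C++17 that redeclaration is redundant (and deprecated).
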
    diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
    index e8eef5ed9c9fa..1f6a5c94601fc 100644
    --- a/clang/lib/Tooling/Tooling.cpp
    +++ b/clang/lib/Tooling/Tooling.cpp
    @@ -21,7 +21,6 @@
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/Driver.h"
     #include "clang/Driver/Job.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/Tool.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Frontend/ASTUnit.h"
    @@ -32,6 +31,7 @@
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Lex/HeaderSearchOptions.h"
     #include "clang/Lex/PreprocessorOptions.h"
    +#include "clang/Options/Options.h"
     #include "clang/Tooling/ArgumentsAdjusters.h"
     #include "clang/Tooling/CompilationDatabase.h"
     #include "llvm/ADT/ArrayRef.h"
    @@ -270,17 +270,15 @@ void addTargetAndModeForProgramName(std::vector<std::string> &CommandLine,
                                         StringRef InvokedAs) {
       if (CommandLine.empty() || InvokedAs.empty())
         return;
    -  const auto &Table = driver::getDriverOptTable();
    +  const auto &Table = getDriverOptTable();
       // --target=X
    -  StringRef TargetOPT =
    -      Table.getOption(driver::options::OPT_target).getPrefixedName();
    +  StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName();
       // -target X
       StringRef TargetOPTLegacy =
    -      Table.getOption(driver::options::OPT_target_legacy_spelling)
    -          .getPrefixedName();
    +      Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName();
       // --driver-mode=X
       StringRef DriverModeOPT =
    -      Table.getOption(driver::options::OPT_driver_mode).getPrefixedName();
    +      Table.getOption(options::OPT_driver_mode).getPrefixedName();
       auto TargetMode =
           driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs);
       // No need to search for target args if we don't have a target/mode to insert.
    diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp
    index 171c786bc366f..b4bdec1fcdd69 100644
    --- a/clang/lib/Tooling/Transformer/RangeSelector.cpp
    +++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp
    @@ -205,8 +205,12 @@ RangeSelector transformer::name(std::string ID) {
           // `foo` for which this range will be too short.  Doing so will
           // require subcasing `NamedDecl`, because it doesn't provide virtual
           // access to the \c DeclarationNameInfo.
    -      if (tooling::getText(R, *Result.Context) != D->getName())
    -        return CharSourceRange();
    +      StringRef Text = tooling::getText(R, *Result.Context);
    +      if (Text != D->getName())
    +        return llvm::make_error<llvm::StringError>(
    +            llvm::errc::not_supported,
    +            "range selected by name(node id=" + ID + "): '" + Text +
    +                "' is different from decl name '" + D->getName() + "'");
           return R;
         }
       if (const auto *E = Node.get<Expr>()) {
    diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c
    index 3360d4f725b24..bffd557ff77a6 100644
    --- a/clang/test/AST/ByteCode/c.c
    +++ b/clang/test/AST/ByteCode/c.c
    @@ -387,3 +387,8 @@ void bar2(void) {
       int a[2][3][4][5]; // all-note {{array 'a' declared here}}
       foo2(&a[0][4]); // all-warning {{array index 4 is past the end of the array}}
     }
    +
    +void plainComplex(void) {
    +  _Complex cd; // all-warning {{_Complex double}}
    +  cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}}
    +}
    diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp
    index 427d3a106656b..95615350f5142 100644
    --- a/clang/test/AST/ByteCode/cxx11.cpp
    +++ b/clang/test/AST/ByteCode/cxx11.cpp
    @@ -374,8 +374,27 @@ namespace GH150709 {
     namespace DiscardedAddrLabel {
       void foo(void) {
       L:
    -    *&&L; // both-error {{indirection not permitted}} \
    +    *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \
               // both-warning {{expression result unused}}
       }
     }
     
    +struct Counter {
    +  int copies;
    +  constexpr Counter(int copies) : copies(copies) {}
    +  constexpr Counter(const Counter& other) : copies(other.copies + 1) {}
    +};
    +// Passing an lvalue by value makes a non-elidable copy.
    +constexpr int PassByValue(Counter c) { return c.copies; }
    +static_assert(PassByValue(Counter(0)) == 0, "expect no copies");
    +
    +namespace PointerCast {
    +  /// The two interpreters disagree here.
    +  struct S { int x, y; } s;
    +  constexpr S* sptr = &s;
    +  struct U {};
    +  struct Str {
    +    int e : (Str*)(sptr) == (Str*)(sptr); // expected-error {{not an integral constant expression}} \
    +                                          // expected-note {{cast that performs the conversions of a reinterpret_cast}}
    +  };
    +}
    diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp
    index 9622311e100cb..57cb42ea4a98b 100644
    --- a/clang/test/AST/ByteCode/cxx14.cpp
    +++ b/clang/test/AST/ByteCode/cxx14.cpp
    @@ -7,3 +7,24 @@ constexpr int(*null_ptr)() = nullptr;
     constexpr int test4 = (*null_ptr)(); // both-error {{must be initialized by a constant expression}} \
                                          // both-note {{evaluates to a null function pointer}}
     
    +struct E {
    +  int n = 0;
    +  struct {
    +    void *x = this;
    +  };
    +  void *y = this;
    +};
    +constexpr E e1 = E();
    +static_assert(e1.x != e1.y, "");
    +constexpr E e2 = E{0};
    +static_assert(e2.x != e2.y, "");
    +
    +struct S {
    +  int &&a = 2;
    +  int b[1]{a};
    +};
    +constexpr int foo() {
    +  S s{12};
    +  return s.b[0];
    +}
    +static_assert(foo() == 12, "");
    diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp
    index 00db27419e36b..115c8663079a1 100644
    --- a/clang/test/AST/ByteCode/invalid.cpp
    +++ b/clang/test/AST/ByteCode/invalid.cpp
    @@ -66,3 +66,44 @@ struct S {
     S s;
     S *sp[2] = {&s, &s};
     S *&spp = sp[1];
    +
    +namespace InvalidBitCast {
    +  void foo() {
    +    const long long int i = 1; // both-note {{declared const here}}
    +    if (*(double *)&i == 2) {
    +      i = 0; // both-error {{cannot assign to variable}}
    +    }
    +  }
    +
    +  struct S2 {
    +    void *p;
    +  };
    +  struct T {
    +    S2 s;
    +  };
    +  constexpr T t = {{nullptr}};
    +  constexpr void *foo2() { return ((void **)&t)[0]; } // both-error {{never produces a constant expression}} \
    +                                                      // both-note 2{{cast that performs the conversions of a reinterpret_cast}}
    +  constexpr auto x = foo2(); // both-error {{must be initialized by a constant expression}} \
    +                             // both-note {{in call to}}
    +
    +
    +  struct sockaddr
    +  {
    +    char sa_data[8];
    +  };
    +  struct in_addr
    +  {
    +    unsigned int s_addr;
    +  };
    +  struct sockaddr_in
    +  {
    +    unsigned short int sin_port;
    +    struct in_addr sin_addr;
    +  };
    +  /// Bitcast from sockaddr to sockaddr_in. Used to crash.
    +  unsigned int get_addr(sockaddr addr) {
    +      return ((sockaddr_in *)&addr)->sin_addr.s_addr;
    +  }
    +
    +}
    diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp
    index 83f32c97c50c7..4799ebe25dde1 100644
    --- a/clang/test/AST/ByteCode/records.cpp
    +++ b/clang/test/AST/ByteCode/records.cpp
    @@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody {
       }
       int n = f(0); // both-note {{instantiation of}}
     }
    +
    +namespace StaticRedecl {
    +  struct T {
    +    static T tt;
    +    constexpr T() : p(&tt) {}
    +    T *p;
    +  };
    +  T T::tt;
    +  constexpr T t;
    +  static_assert(t.p == &T::tt, "");
    +}
    diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl
    index f3c6636232798..b0b5b989e36c2 100644
    --- a/clang/test/AST/HLSL/cbuffer.hlsl
    +++ b/clang/test/AST/HLSL/cbuffer.hlsl
    @@ -153,7 +153,7 @@ cbuffer CB {
       static float SV;
       // CHECK: VarDecl {{.*}} s7 'EmptyStruct' callinit
       EmptyStruct s7;
    -  // CHECK: VarDecl {{.*}} Buf 'RWBuffer<float>':'hlsl::RWBuffer<float>' static callinit
    +  // CHECK: VarDecl {{.*}} Buf 'RWBuffer<float>':'hlsl::RWBuffer<float>' callinit
       RWBuffer<float> Buf;
       // CHECK: VarDecl {{.*}} ea 'EmptyArrayTypedef':'float[10][0]'
       EmptyArrayTypedef ea;
    diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl
    index 0a2f03c7c0fac..e1a9c53e2c602 100644
    --- a/clang/test/AST/HLSL/matrix-constructors.hlsl
    +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl
    @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4)));
     
     [numthreads(1,1,1)]
     void ok() {
    -
     // CHECK: VarDecl 0x{{[0-9a-fA-F]+}}  col:{{[0-9]+}} A 'float2x3':'matrix' cinit
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x3':'matrix' functional cast to float2x3 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x3':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 A = float2x3(1,2,3,4,5,6);
     
    @@ -57,6 +56,8 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' xvalue
    @@ -68,12 +69,10 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6);
     
    @@ -97,9 +96,9 @@ void ok() {
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' functional cast to float2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
    @@ -107,10 +106,12 @@ void ok() {
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' functional cast to float2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' xvalue
    @@ -120,9 +121,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6);
    @@ -158,7 +157,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float4':'vector' xvalue
    @@ -172,7 +171,9 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'float4':'vector' xvalue
    @@ -186,9 +187,7 @@ void ok() {
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
       float2x3 F = float2x3(float4(1,2,3,4), 5, 6);
    @@ -202,10 +201,10 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    @@ -215,41 +214,41 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' matrixcomponent
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' matrixcomponent
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 5
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 6
     float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);  
    @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
       float2 Vec2 = float2(1.0, 2.0);   
       float2x2 H = float2x2(Vec2,3,4);
    @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int'
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int'
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int'
    @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
    +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
    -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue .a 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S'
       struct S { float2 f;  float a;} s;
       float2x2 J = float2x2(s.f, s.a, s.a);
    @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' functional cast to float2x2 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix'
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 1.000000e+00
    -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 3.000000e+00
    +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 4.000000e+00
       typedef float2x2 second_level_of_typedefs;
       second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f);
    @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6);
     // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}}  'second_level_of_typedefs':'matrix' functional cast to second_level_of_typedefs 
     // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}}  'second_level_of_typedefs':'matrix'
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 1.000000e+00
    -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 3.000000e+00
    +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 2.000000e+00
     // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}}  'float' 4.000000e+00
       float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f);
     
    @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2);
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue vectorcomponent
     // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}}  'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}}
    diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    index 14c950acb7baf..1a631113eb0f0 100644
    --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl
    @@ -26,14 +26,6 @@ void ok() {
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    @@ -42,20 +34,8 @@ void ok() {
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'int' x
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
     // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    @@ -66,10 +46,30 @@ void ok() {
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'int' x
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 3
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
    +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}}  'vector' xvalue
    +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xx
    +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 4
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     
    @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -138,7 +138,7 @@ S s = {m};
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    @@ -146,7 +146,7 @@ S s = {m};
     // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}}  'vector' xxxx
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'vector' 
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}}  'int' xvalue vectorcomponent
    @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
     // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
    @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx};
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
     // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}}  'float' lvalue matrixcomponent
      // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}}  'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>'
    -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 0
    +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}}  'int' 1
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'int' 
     // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}}  'float' 
    diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl
    index 4d18a9ca631f1..05b927279e198 100644
    --- a/clang/test/AST/HLSL/packoffset.hlsl
    +++ b/clang/test/AST/HLSL/packoffset.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -ast-dump  -x hlsl %s | FileCheck %s
    +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.3-library -S -finclude-default-header -fnative-half-type -fnative-int16-type -ast-dump  -x hlsl %s | FileCheck %s
     
     
     // CHECK: HLSLBufferDecl {{.*}} cbuffer A
    diff --git a/clang/test/AST/HLSL/private.hlsl b/clang/test/AST/HLSL/private.hlsl
    index e00afb8f5cbd8..ba7380ec3cfda 100644
    --- a/clang/test/AST/HLSL/private.hlsl
    +++ b/clang/test/AST/HLSL/private.hlsl
    @@ -3,7 +3,7 @@
     // CHECK: VarDecl {{.*}} global_scalar 'hlsl_private int' static cinit
     static int global_scalar = 0;
     
     -// CHECK: VarDecl {{.*}} global_buffer 'RWBuffer<float>':'hlsl::RWBuffer<float>' static callinit
     +// CHECK: VarDecl {{.*}} global_buffer 'RWBuffer<float>':'hlsl::RWBuffer<float>' callinit
      RWBuffer<float> global_buffer;
     
     class A {
    diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    index 733c4e2ee5a36..5654974b26d2d 100644
    --- a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    +++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -fnative-int16-type -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
     
     // CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
     // CHECK-NEXT: CallExpr {{.*}} 'bool'
    diff --git a/clang/test/AST/ast-dump-arm-attr.c b/clang/test/AST/ast-dump-arm-attr.c
    index 78f557d4eb0b1..d26a77d067e97 100644
    --- a/clang/test/AST/ast-dump-arm-attr.c
    +++ b/clang/test/AST/ast-dump-arm-attr.c
    @@ -2,7 +2,7 @@
     // RUN: %clang_cc1 -triple arm-apple-darwin -ast-dump -ast-dump-filter Test %s \
     // RUN: | FileCheck --strict-whitespace %s
     //
    -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \
    +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \
     // RUN: | FileCheck --strict-whitespace %s --check-prefix=CHECK-CMSE
     //
     // Tests with serialization:
    @@ -11,8 +11,8 @@
      // RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
     // RUN: | FileCheck --strict-whitespace %s
     //
    -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -emit-pch -o %t %s
    -// RUN: %clang_cc1 -x c -triple armv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
    +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -emit-pch -o %t %s
    +// RUN: %clang_cc1 -x c -triple thumbv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
      // RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
     // RUN: | FileCheck --strict-whitespace %s
     
    diff --git a/clang/test/AST/ast-dump-stmt.c b/clang/test/AST/ast-dump-stmt.c
    index 5c44fea2df6e7..6fb01a4b159fa 100644
    --- a/clang/test/AST/ast-dump-stmt.c
    +++ b/clang/test/AST/ast-dump-stmt.c
    @@ -400,7 +400,7 @@ void TestMiscStmts(void) {
       // CHECK-NEXT: ImplicitCastExpr
       // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}}  'int' lvalue Var 0x{{[^ ]*}} 'a' 'int'
       ({int a = 10; a;;; });
    -  // CHECK-NEXT: StmtExpr 0x{{[^ ]*}}  'int'
    +  // CHECK-NEXT: StmtExpr 0x{{[^ ]*}}  'void'
       // CHECK-NEXT: CompoundStmt
       // CHECK-NEXT: DeclStmt
       // CHECK-NEXT: VarDecl 0x{{[^ ]*}}  col:9 used a 'int' cinit
    diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp
    new file mode 100644
    index 0000000000000..a67f45700cd10
    --- /dev/null
    +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-co_await-assertion-failure.cpp
    @@ -0,0 +1,11 @@
    +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.UncountedLambdaCapturesChecker -std=c++20 -verify %s
    +// expected-no-diagnostics
    +
     +template <typename Arg>
     +void foo(Arg&& arg)
    +{
    +    [&]{
    +        co_await [&](auto&&... args) {
    +        }(arg);
    +    }();
    +}
    diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    index ce37a29655668..2f9c2ac247497 100644
    --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt
    @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS
     set(LIFETIME_BENCHMARK_OUTPUT_DIR
       "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results")
     
    +if(WIN32)
    +  set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE
    +    "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python")
    +else()
    +  set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE
    +    "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python")
    +endif()
     
     if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS})
     
    @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME
       add_custom_command(
         OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg
         COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR}
    -    COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS}
    +    COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS}
         DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS}
         COMMENT "Creating Python virtual environment and installing dependencies for benchmark..."
       )
    @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME
     
       # Main benchmark target
       add_custom_target(benchmark_lifetime_safety_analysis
    -    COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT}
    +    COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT}
                 --clang-binary ${LLVM_BINARY_DIR}/bin/clang
                 --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR}
     
    diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp
    index c417b9c2ac97e..fd831cc0985cc 100644
    --- a/clang/test/Analysis/NewDelete-checker-test.cpp
    +++ b/clang/test/Analysis/NewDelete-checker-test.cpp
    @@ -3,13 +3,13 @@
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete
     //
    -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \
    +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
     // RUN:   -verify=expected,newdelete,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
     //
    -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
    +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
     // RUN:   -verify=expected,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
    @@ -19,13 +19,13 @@
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete
     //
    -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \
    +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \
     // RUN:   -verify=expected,newdelete,leak \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDelete \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
     //
    -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \
    +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \
     // RUN:   -verify=expected,leak,inspection \
     // RUN:   -analyzer-checker=core \
     // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
    @@ -503,3 +503,75 @@ namespace optional_union {
         custom_union_t a;
       } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}}
     }
    +
    +namespace gh153782 {
    +
    +// Ensure we do not regress on the following use case.
    +
    +namespace mutually_exclusive_test_case_1 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}}
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +void test_non_trivial_struct_assignment() {
    +  StorageWrapper* object = new StorageWrapper[]{StorageWrapper()};
    +  object[0] = StorageWrapper(); // This assignment leads to the double-free.
    +}
    +} // mutually_exclusive_test_case_1
    +
    +namespace mutually_exclusive_test_case_2 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; }
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +void test_non_trivial_struct_assignment() {
    +  StorageWrapper* object = new StorageWrapper[]{StorageWrapper()};
     +  // object[0] = StorageWrapper(); // Removing the source of the double-free lets the potential leak surface instead.
    +} // leak-warning{{Potential leak of memory pointed to by 'object'}}
    +} // mutually_exclusive_test_case_2
    +
    +namespace mutually_exclusive_test_case_3 {
    +struct StorageWrapper {
    +  // Imagine the destructor and copy constructor both call a reset() function (among other things).
    +  ~StorageWrapper() { delete parts; }
    +  StorageWrapper(StorageWrapper const&) = default;
    +
    +  // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find.
    +  void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}}
    +
    +  // Not provided, typically would do `parts = new long`.
    +  StorageWrapper();
    +
    +  long* parts;
    +};
    +
    +struct TestDoubleFreeWithInitializerList {
    +  StorageWrapper* Object;
    +  TestDoubleFreeWithInitializerList()
    +  : Object(new StorageWrapper[]{StorageWrapper()}) {
    +    Object[0] = StorageWrapper(); // This assignment leads to the double-free.
    +  }
    +};
    +} // mutually_exclusive_test_case_3
    +
    +} // namespace gh153782
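
The three gh153782 cases above all hinge on the same defect: a move assignment that deletes `parts` without re-seating it. For contrast, here is a minimal sketch of the corrected class -- illustrative only, not part of the test, and one the checker should stay quiet on:

```cpp
struct FixedStorageWrapper {
  ~FixedStorageWrapper() { delete parts; }
  FixedStorageWrapper() : parts(new long) {}

  FixedStorageWrapper &operator=(FixedStorageWrapper &&other) {
    if (this != &other) {
      delete parts;          // release our own allocation first
      parts = other.parts;   // the re-seating assignment the buggy version omits
      other.parts = nullptr; // the donor's destructor now deletes null: a no-op
    }
    return *this;
  }

  long *parts;
};
```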
    diff --git a/clang/test/Analysis/analyzeOneFunction.cpp b/clang/test/Analysis/analyzeOneFunction.cpp
    index 3a362dfd9a08c..b2257570f6052 100644
    --- a/clang/test/Analysis/analyzeOneFunction.cpp
    +++ b/clang/test/Analysis/analyzeOneFunction.cpp
    @@ -5,9 +5,9 @@
     // RUN:   -analyze-function="c:@S@Window@F@overloaded#I#"
     
     // RUN: %clang_extdef_map %s | FileCheck %s
    -// CHECK:      27:c:@S@Window@F@overloaded#I#
    -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#C#
    -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#d#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#I#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#C#
    +// CHECK-DAG: 27:c:@S@Window@F@overloaded#d#
     
     void clang_analyzer_warnIfReached();
     
    diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp
    index 940ff9ba3ed9c..44990fc631d6d 100644
    --- a/clang/test/Analysis/ctor-trivial-copy.cpp
    +++ b/clang/test/Analysis/ctor-trivial-copy.cpp
    @@ -5,8 +5,6 @@
     void clang_analyzer_printState();
      template <typename T> void clang_analyzer_dump_lref(T& param);
      template <typename T> void clang_analyzer_dump_val(T param);
     -template <typename T> void clang_analyzer_denote(T param, const char *name);
     -template <typename T> void clang_analyzer_express(T param);
      template <typename T> T conjure();
      template <typename... Ts> void nop(const Ts &... args) {}
     
    @@ -42,10 +40,16 @@ void test_assign_return() {
     namespace trivial_struct_copy {
     
     void _01_empty_structs() {
     -  clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{conj_$}}
     +  clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{lazyCompoundVal}}
        empty Empty = conjure<empty>();
       empty Empty2 = Empty;
       empty Empty3 = Empty2;
    +  // All of these should refer to the exact same LCV, because all of
    +  // these trivial copies refer to the original conjured value.
    +  // There were Unknown before:
    +  clang_analyzer_dump_val(Empty);  // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}}
     
       // We only have binding for the original Empty object, because copying empty
       // objects is a no-op in the performTrivialCopy. This is fine, because empty
    @@ -67,20 +71,18 @@ void _01_empty_structs() {
     }
     
     void _02_structs_with_members() {
     -  clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{conj_$}}
     +  clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{lazyCompoundVal}}
        aggr Aggr = conjure<aggr>();
       aggr Aggr2 = Aggr;
       aggr Aggr3 = Aggr2;
    -  // All of these should refer to the exact same symbol, because all of
    +  // All of these should refer to the exact same LCV, because all of
       // these trivial copies refer to the original conjured value.
    -  clang_analyzer_denote(Aggr, "$Aggr");
    -  clang_analyzer_express(Aggr);  // expected-warning {{$Aggr}}
    -  clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}}
    -  clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}}
    -
    -  // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3".
    -  // We used to have Derived symbols for the individual fields that were
    -  // copied as part of copying the whole struct.
    +  clang_analyzer_dump_val(Aggr);  // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}}
    +  clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}}
    +
    +  // We have fields in the struct we copy, thus we also have the entries for the copies
    +  // (and for all of their fields).
       clang_analyzer_printState();
       // CHECK:       "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [
       // CHECK-NEXT:    { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [
    @@ -93,10 +95,12 @@ void _02_structs_with_members() {
       // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" }
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" }
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" },
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" }
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" }
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" },
    +  // CHECK-NEXT:      { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" }
       // CHECK-NEXT:    ]}
       // CHECK-NEXT:  ]},
     
    @@ -113,3 +117,31 @@ void entrypoint() {
     }
     
     } // namespace trivial_struct_copy
    +
    +namespace gh153782 {
    +
    +// Ensure we do not regress on the following use cases.
    +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`.
    +struct Status { int error; };
    +Status getError();
    +
    +Status setPtr(int **outptr, int* ptr) {
    +  Status e = getError();
    +  if (e.error != 0) return e; // When assuming the error field is non-zero,
    +  *outptr = ptr;              // this is not executed
    +  return e;
    +}
    +
    +int func() {
    +  int *ptr = nullptr;
    +  int x = 42;
    +  if (setPtr(&ptr, &x).error == 0) {
     +    // The assumption made in setPtr() SHOULD match the assumption about
    +    // the returned value, hence the engine SHOULD NOT assume ptr is null.
    +    clang_analyzer_dump_val(ptr); // expected-warning {{&x}}
    +    return *ptr;
    +  }
    +  return 0;
    +}
    +
    +} // namespace gh153782
    diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp
    index dfc650223c9e7..9474aa7c7dbb1 100644
    --- a/clang/test/Analysis/explain-svals.cpp
    +++ b/clang/test/Analysis/explain-svals.cpp
    @@ -99,7 +99,7 @@ class C {
     } // end of anonymous namespace
     
     void test_6() {
    -  clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}}
    +  clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}}
       clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}}
     }
     
    diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp
    index 78882da4431fd..f1538839d06c8 100644
    --- a/clang/test/Analysis/iterator-modeling.cpp
    +++ b/clang/test/Analysis/iterator-modeling.cpp
     @@ -2035,7 +2035,6 @@ void print_state(std::vector<int> &V) {
       // CHECK:      "checker_messages": [
       // CHECK:   { "checker": "alpha.cplusplus.IteratorModeling", "messages": [
       // CHECK-NEXT:     "Iterator Positions :",
    -  // CHECK-NEXT:     "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}",
       // CHECK-NEXT:     "i0 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}"
       // CHECK-NEXT:   ]}
     
     @@ -2046,7 +2045,6 @@ void print_state(std::vector<int> &V) {
       // CHECK:      "checker_messages": [
       // CHECK:   { "checker": "alpha.cplusplus.IteratorModeling", "messages": [
       // CHECK-NEXT:     "Iterator Positions :",
    -  // CHECK-NEXT:     "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}",
       // CHECK-NEXT:     "i1 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}"
       // CHECK-NEXT:   ]}
     
    diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    index 191af95cd2b9c..98301cf7274fc 100644
    --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp
    @@ -4,16 +4,6 @@
     // RUN:  -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\
     // RUN:  -verify
     
    -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because
    -// these tests started failing after we just directly copy the symbol
    -// representing the value of a variable instead of creating a LazyCompoundVal
    -// of that single conjured value.
    -// In theory, it shouldn't matter if we eagerly copy the value that we would
    -// "load" from the LCV once requested or just directly binding the backing symbol.
    -// Yet, these tests fail, so there is likely messed up how/what the checker
    -// metadata is associated with.
    -// XFAIL: *
    -
     #include "Inputs/system-header-simulator-cxx.h"
     
     void clang_analyzer_eval(bool);
    diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp
    index f7029c79b0942..5549c24a8c220 100644
    --- a/clang/test/Analysis/stl-algorithm-modeling.cpp
    +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp
    @@ -3,16 +3,6 @@
     // RUN:  -analyzer-config aggressive-binary-operation-simplification=true\
     // RUN:  -verify
     
    -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because
    -// these tests started failing after we just directly copy the symbol
    -// representing the value of a variable instead of creating a LazyCompoundVal
    -// of that single conjured value.
    -// In theory, it shouldn't matter if we eagerly copy the value that we would
    -// "load" from the LCV once requested or just directly binding the backing symbol.
    -// Yet, these tests fail, so there is likely messed up how/what the checker
    -// metadata is associated with.
    -// XFAIL: *
    -
     #include "Inputs/system-header-simulator-cxx.h"
     
     void clang_analyzer_eval(bool);
    diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp
    index dbe93f1c5183a..d99f581f00fe1 100644
    --- a/clang/test/Analysis/store-dump-orders.cpp
    +++ b/clang/test/Analysis/store-dump-orders.cpp
    @@ -41,7 +41,7 @@ void test_output(int n) {
       // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "conj_$
       // CHECK-NEXT:    ]},
       // CHECK-NEXT:    { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [
    -  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "conj_$
    +  // CHECK-NEXT:      { "kind": "Default", "offset": 0, "value": "lazyCompoundVal
       // CHECK-NEXT:      { "kind": "Direct", "offset": 320, "value": "1 S32b" },
       // CHECK-NEXT:      { "kind": "Direct", "offset": 352, "value": "2 S32b" },
       // CHECK-NEXT:      { "kind": "Direct", "offset": 384, "value": "3 S32b" }
    diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp
    index fc7c37300d3fc..4b8d9ab68ff84 100644
    --- a/clang/test/Analysis/taint-generic.cpp
    +++ b/clang/test/Analysis/taint-generic.cpp
    @@ -158,7 +158,11 @@ void top() {
       clang_analyzer_isTainted(E); // expected-warning {{NO}}
     
       Aggr A = mySource1();
    -  clang_analyzer_isTainted(A);      // expected-warning {{YES}}
    +  // FIXME Ideally, both A and A.data should be tainted. However, the
    +  //       implementation used by e5ac9145ba29 ([analyzer][taint] Recognize
    +  //       tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and
    +  //       FNs in various scenarios and had to be reverted to fix #153782.
    +  clang_analyzer_isTainted(A);      // expected-warning {{NO}}
       clang_analyzer_isTainted(A.data); // expected-warning {{YES}}
     }
     } // namespace gh114270
    diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp
    index b065f8756d4d8..dde95fa62cb65 100644
    --- a/clang/test/Analysis/template-param-objects.cpp
    +++ b/clang/test/Analysis/template-param-objects.cpp
    @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) {
       return lhs.value == rhs.value;
     }
      template <Box V> void dumps() {
    -  clang_analyzer_dump(V);        // expected-warning {{Unknown}}
    +  clang_analyzer_dump(V);        // expected-warning {{lazyCompoundVal}}
       clang_analyzer_dump(&V);       // expected-warning {{Unknown}}
       clang_analyzer_dump(V.value);  // expected-warning {{Unknown}} FIXME: It should be '6 S32b'.
       clang_analyzer_dump(&V.value); // expected-warning {{Unknown}}
    diff --git a/clang/test/C/C2y/n3348.c b/clang/test/C/C2y/n3348.c
    new file mode 100644
    index 0000000000000..e20c9f74883f9
    --- /dev/null
    +++ b/clang/test/C/C2y/n3348.c
    @@ -0,0 +1,44 @@
    +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
    +
    +/* WG14 N3348: No
    + * Matching of Multi-Dimensional Arrays in Generic Selection Expressions
    + *
    + * This allows use of * in a _Generic association as a placeholder for any size
    + * value.
    + *
    + * FIXME: Clang doesn't yet implement this paper. When we do implement it, we
    + * should expose the functionality in earlier language modes (C89) for
    + * compatibility with GCC.
    + */
    +
    +void test(int n, int m) {
    +  static_assert(1 == _Generic(int[3][2], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int[3][2], int[*][2]: 1, int[*][3]: 0));  // expected-error {{star modifier used outside of function prototype}}
    +  static_assert(1 == _Generic(int[3][n], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int[n][m], int[*][*]: 1, char[*][*]: 0)); /* expected-error 2 {{star modifier used outside of function prototype}}
    +                                                                           expected-error {{array has incomplete element type 'int[]'}}
    +                                                                         */
    +  static_assert(1 == _Generic(int(*)[2], int(*)[*]: 1));                // expected-error {{star modifier used outside of function prototype}}
    +}
    +
    +void questionable() {
    +  // GCC accepts this despite the * appearing outside of a generic association,
    +  // but it's not clear whether that's intentionally supported or an oversight.
    +  // It gives a warning about * being used outside of a declaration, but not
    +  // with an associated warning group.
    +  static_assert(1 == _Generic(int[*][*], int[2][100]: 1)); /* expected-error 2 {{star modifier used outside of function prototype}}
    +                                                              expected-error {{array has incomplete element type 'int[]'}}
    +                                                            */
    +  // GCC claims this matches multiple associations, so the functionality seems
    +  // like it may be intended to work?
    +  (void)_Generic(int[*][*], /* expected-error 2 {{star modifier used outside of function prototype}}
    +                               expected-error {{array has incomplete element type 'int[]'}}
    +                             */
    +    int[2][100]: 1,
    +    int[3][1000]: 2,
    +  );
    +}
    diff --git a/clang/test/C/C2y/n3457.c b/clang/test/C/C2y/n3457.c
    new file mode 100644
    index 0000000000000..d71a3f37e1343
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457.c
    @@ -0,0 +1,38 @@
    +// RUN: %clang_cc1 -verify=ext -std=c23 -pedantic %s
    +// RUN: %clang_cc1 -verify=ext -pedantic -x c++ %s
    +// RUN: %clang_cc1 -verify=pre -std=c2y -pedantic -Wpre-c2y-compat %s
    +
    +/* WG14 N3457: Clang 22
    + * The __COUNTER__ predefined macro
    + *
    + * This predefined macro was supported as an extension in earlier versions of
    + * Clang, but the required diagnostics for the limits were not added until 22.
    + */
    +
    +// Ensure that __COUNTER__ starts from 0.
    +static_assert(__COUNTER__ == 0); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                  */
    +
    +// Ensure that the produced value can be used with token concatenation.
    +#define CAT_IMPL(a, b) a ## b
    +#define CAT(a, b) CAT_IMPL(a, b)
    +#define NAME_WITH_COUNTER(a) CAT(a, __COUNTER__)
    +void test() {
    +  // Because this is the 2nd expansion, this defines test1.
    +  int NAME_WITH_COUNTER(test); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                  pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                */
    +  int other_test = test1;      // Ok
    +}
    +
    +// Ensure that __COUNTER__ increments each time you mention it.
    +static_assert(__COUNTER__ == 2); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
    +static_assert(__COUNTER__ == 3); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
    +static_assert(__COUNTER__ == 4); /* ext-warning {{'__COUNTER__' is a C2y extension}}
    +                                    pre-warning {{'__COUNTER__' is incompatible with standards before C2y}}
    +                                 */
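
As the comments above note, the main consumer of `__COUNTER__` is unique-identifier generation via token pasting. A minimal self-contained sketch of that idiom (the macro and variable names here are invented for illustration, not taken from the test):

```cpp
#include <cstdio>

// Two-level concatenation so __COUNTER__ expands *before* the tokens are
// pasted, mirroring the CAT/NAME_WITH_COUNTER pattern in the test above.
#define UID_CAT_IMPL(a, b) a##b
#define UID_CAT(a, b) UID_CAT_IMPL(a, b)
#define UNIQUE_NAME(prefix) UID_CAT(prefix, __COUNTER__)

int main() {
  // Assuming nothing earlier in the translation unit consumed __COUNTER__,
  // these expand to the distinct names 'tmp0' and 'tmp1', so there is no
  // redefinition error.
  int UNIQUE_NAME(tmp) = 1;
  int UNIQUE_NAME(tmp) = 2;
  std::printf("%d\n", tmp0 + tmp1); // prints 3
  return 0;
}
```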
    diff --git a/clang/test/C/C2y/n3457_1.c b/clang/test/C/C2y/n3457_1.c
    new file mode 100644
    index 0000000000000..76c5a0b9a700f
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457_1.c
    @@ -0,0 +1,20 @@
    +// RUN: %clang_cc1 -verify -std=c2y -finitial-counter-value=2147483646 %s
    +
     +// The value produced needs to be representable by a 'signed long'. However,
     +// the actual type it expands to does *not* need to be forced to 'signed
     +// long', because that would generally mean suffixing the value with L, which
     +// would be very surprising for folks using this to generate unique ids.
     +// We'll test this by ensuring the largest value can be expanded properly,
     +// relying on the fact that 'signed long' is always at least four bytes wide
     +// (which is what's required to represent that maximal value).
     +//
     +// So we set the initial counter value to 2147483646, validate that value,
     +// increment once to reach the maximal value and ensure there's no
     +// diagnostic, then increment once more to ensure we get the constraint violation.
    +
    +static_assert(__COUNTER__ == 2147483646); // Test and increment
    +static_assert(__COUNTER__ == 2147483647); // Test and increment
    +
    +// This one should fail.
    +signed long i = __COUNTER__; // expected-error {{'__COUNTER__' value cannot exceed 2'147'483'647}}
    +
    diff --git a/clang/test/C/C2y/n3457_2.c b/clang/test/C/C2y/n3457_2.c
    new file mode 100644
    index 0000000000000..018c8f4390767
    --- /dev/null
    +++ b/clang/test/C/C2y/n3457_2.c
    @@ -0,0 +1,10 @@
    +// RUN: %clang_cc1 -verify=good -std=c2y -finitial-counter-value=2147483648 %s
    +// RUN: %clang_cc1 -verify -std=c2y -finitial-counter-value=2147483648 -DEXPAND_IT %s
    +// good-no-diagnostics
    +
     +// This sets the initial __COUNTER__ value to something that's too big. Setting
     +// the value too large is fine; expanding __COUNTER__ to a too-large value is not.
    +#ifdef EXPAND_IT
    +  // This one should fail.
    +  signed long i = __COUNTER__; // expected-error {{'__COUNTER__' value cannot exceed 2'147'483'647}}
    +#endif
    diff --git a/clang/test/C/C2y/n3525.c b/clang/test/C/C2y/n3525.c
    new file mode 100644
    index 0000000000000..428df23c79ba2
    --- /dev/null
    +++ b/clang/test/C/C2y/n3525.c
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
    +// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s
    +
    +/* WG14 N3525: Yes
    + * static_assert without UB
    + *
    + * Ensures that a static_assert declaration cannot defer to runtime; it must
    + * take an integer constant expression that is resolved at compile time.
    + *
    + * Note: implementations are free to extend what is a valid integer constant
    + * expression, and Clang (and GCC) does so. So this test is validating that
     + * we quietly accept a passing assertion, loudly reject a failing assertion, and
    + * issue a pedantic diagnostic for the extension case.
    + */
    +
    +static_assert(1); // Okay
    +
    +static_assert(0); // expected-error {{static assertion failed}}
    +
    +extern int a;
    +static_assert(1 || a); // expected-warning {{expression is not an integer constant expression; folding it to a constant is a GNU extension}}
    +
    +static_assert(a);      // expected-error {{static assertion expression is not an integral constant expression}}
    +static_assert(0 || a); // expected-error {{static assertion expression is not an integral constant expression}}
    +
    +// Note, there is no CodeGen test for this; we have existing tests for the ICE
    +// extension, so the pedantic warning is sufficient to verify we're not
    +// emitting code which reads 'a' in '1 || a' because of the folding, and
    +// there's no way to generate code for reading 'a' in '0 || a' because of the
    +// error.
    diff --git a/clang/test/CIR/CodeGen/agg-expr-lvalue.c b/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    index c826f8fa829d0..509f0218e9912 100644
    --- a/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    +++ b/clang/test/CIR/CodeGen/agg-expr-lvalue.c
    @@ -95,16 +95,13 @@ void test_string_array_in_array(void) {
     }
       
     // CIR-LABEL: cir.func{{.*}} @test_string_array_in_array
     -// CIR:   cir.alloca !cir.array<!cir.array<!s8i x 6> x 2>, {{.*}}, ["matrix", init]
    -// CIR:   cir.get_global
    -// CIR:   cir.copy
    -// CIR:   cir.get_global
    -// CIR:   cir.copy
     +// CIR:   %[[MATRIX:.*]] = cir.alloca !cir.array<!cir.array<!s8i x 6> x 2>, {{.*}}, ["matrix", init]
     +// CIR:   %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<104> : !s8i, #cir.int<101> : !s8i, #cir.int<108> : !s8i, #cir.int<108> : !s8i, #cir.int<111> : !s8i, #cir.int<0> : !s8i]> : !cir.array<!s8i x 6>, #cir.const_array<[#cir.int<119> : !s8i, #cir.int<111> : !s8i, #cir.int<114> : !s8i, #cir.int<108> : !s8i, #cir.int<100> : !s8i, #cir.int<0> : !s8i]> : !cir.array<!s8i x 6>]>
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[MATRIX]]
     
     // LLVM-LABEL: define{{.*}} @test_string_array_in_array
    -// LLVM:   alloca [2 x [6 x i8]]
    -// LLVM:   call void @llvm.memcpy
    -// LLVM:   call void @llvm.memcpy
    +// LLVM:   %[[MATRIX:.*]] = alloca [2 x [6 x i8]]
    +// LLVM:   store [2 x [6 x i8]] {{\[}}[6 x i8] c"hello\00", [6 x i8] c"world\00"], ptr %[[MATRIX]]
     
     // OGCG-LABEL: define{{.*}} @test_string_array_in_array
     // OGCG:   alloca [2 x [6 x i8]]
    diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
    index 82add4b347e72..5e873810d494b 100644
    --- a/clang/test/CIR/CodeGen/array.cpp
    +++ b/clang/test/CIR/CodeGen/array.cpp
    @@ -151,50 +151,12 @@ void func2() {
     }
     
      // CIR: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>> -> !cir.ptr<!s32i>
     -// CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[FIVE]], %[[ARR_0]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride %[[ARR_0]], %[[OFFSET_0]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[TWO]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
     -// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
     -// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !s32i, !cir.ptr<!s32i>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!s32i>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<5> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR2]] : !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>
     
     // LLVM: define{{.*}} void @_Z5func2v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// LLVM:   store i32 5, ptr %[[ARR_PTR]], align 4
    -// LLVM:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// LLVM:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store i32 0, ptr %[[CUR]], align 4
    -// LLVM:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [2 x i32] [i32 5, i32 0], ptr %[[ARR]], align 4
     
     // OGCG: %[[ARR:.*]] = alloca [2 x i32], align 4
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN2_ARR]], i64 8, i1 false)
    @@ -209,13 +171,8 @@ void func3() {
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
      // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
      // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>> -> !cir.ptr<!s32i>
     -// CIR: %[[V0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V0]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride %[[ARR_PTR]], %[[OFFSET_0]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
     -// CIR: %[[V1:.*]] = cir.const #cir.int<6> : !s32i
     -// CIR: cir.store{{.*}} %[[V1]], %[[ELE_1_PTR]] : !s32i, !cir.ptr<!s32i>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<5> : !s32i, #cir.int<6> : !s32i]> : !cir.array<!s32i x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>
      // CIR: %[[IDX_V:.*]] = cir.const #cir.int<1> : !s32i
      // CIR: cir.store{{.*}} %[[IDX_V]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
      // CIR: %[[TMP_IDX:.*]] = cir.load{{.*}} %[[IDX]] : !cir.ptr<!s32i>, !s32i
    @@ -228,10 +185,7 @@ void func3() {
     // LLVM:  %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
     // LLVM:  %[[IDX:.*]] = alloca i32, i64 1, align 4
     // LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// LLVM:  store i32 5, ptr %[[ARR_PTR]], align 4
    -// LLVM:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// LLVM:  store i32 6, ptr %[[ELE_1_PTR]], align 4
    +// LLVM:  store [2 x i32] [i32 5, i32 6], ptr %[[ARR]], align 4
     // LLVM:  store i32 1, ptr %[[IDX]], align 4
     // LLVM:  %[[TMP1:.*]] = load i32, ptr %[[IDX]], align 4
     // LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    @@ -258,15 +212,8 @@ void func4() {
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
      // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %[[ARR_PTR]], %[[OFFSET]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_1_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i
     -// CIR: cir.store{{.*}} %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr<!s32i>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<5> : !s32i]> : !cir.array<!s32i x 1>, #cir.const_array<[#cir.int<6> : !s32i]> : !cir.array<!s32i x 1>]> : !cir.array<!cir.array<!s32i x 1> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>
      // CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i
      // CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i
      // CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
    @@ -279,12 +226,7 @@ void func4() {
     // LLVM: define{{.*}} void @_Z5func4v(){{.*}}
     // LLVM:  %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
     // LLVM:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// LLVM:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// LLVM:  store i32 5, ptr %[[ARR_0_0]], align 4
    -// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    -// LLVM:  store i32 6, ptr %[[ARR_1_0]], align 4
    +// LLVM:  store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] [i32 6]], ptr %[[ARR]], align 4
     // LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
     // LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
     // LLVM:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    @@ -305,52 +247,12 @@ void func5() {
     }
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %0 : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>> -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>> -> !cir.ptr<!s32i>
     -// CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
     -// CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
     -// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %[[ARR_0]], %[[OFFSET]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: cir.store{{.*}} %[[ARR_1]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
     -// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[TWO]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 1>
     -// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!cir.array<!s32i x 1>>, !s64i) -> !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.array<!s32i x 1>>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_array<[#cir.int<5> : !s32i]> : !cir.array<!s32i x 1>, #cir.zero : !cir.array<!s32i x 1>]> : !cir.array<!cir.array<!s32i x 1> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>
     
     // LLVM: define{{.*}} void @_Z5func5v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// LLVM:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// LLVM:   store i32 5, ptr %[[ARR_0]], align 4
    -// LLVM:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// LLVM:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
    -// LLVM:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] zeroinitializer], ptr %[[ARR]], align 4
     
     // ORGC: %[[ARR:.*]] = alloca [2 x [1 x i32]], align 4
     // ORGC: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN5_ARR]], i64 8, i1 false)
    @@ -395,44 +297,12 @@ void func7() {
     }
     
      // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
     -// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>> -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: cir.store{{.*}} %[[ARR_0]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
     -// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_END:.*]] = cir.ptr_stride %[[ARR_0]], %[[ONE]] : (!cir.ptr<!cir.ptr<!s32i>>, !s64i) -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR: cir.do {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
     -// CIR:   cir.store{{.*}} %[[NULL_PTR]], %[[ARR_CUR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride %[[ARR_CUR]], %[[ONE]] : (!cir.ptr<!cir.ptr<!s32i>>, !s64i) -> !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
     -// CIR:   cir.yield
     -// CIR: } while {
     -// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
     -// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.ptr<!s32i>>, !cir.bool
     -// CIR:   cir.condition(%[[CMP]])
     -// CIR: }
     +// CIR: %[[CONST:.*]] = cir.const #cir.zero : !cir.array<!cir.ptr<!s32i> x 1>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR]] : !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>
     
     // LLVM: define{{.*}} void @_Z5func7v(){{.*}}
     // LLVM:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
    -// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// LLVM:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
    -// LLVM:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
    -// LLVM:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
    -// LLVM:   br label %[[LOOP_BODY:.*]]
    -// LLVM: [[LOOP_NEXT:.*]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// LLVM: [[LOOP_BODY]]:
    -// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// LLVM:   store ptr null, ptr %[[CUR]], align 8
    -// LLVM:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
    -// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// LLVM:   br label %[[LOOP_NEXT:.*]]
    -// LLVM: [[LOOP_END]]:
    -// LLVM:   ret void
    +// LLVM:   store [1 x ptr] zeroinitializer, ptr %[[ARR]], align 8
     
     // OGCG: %[[ARR:.*]] = alloca [1 x ptr], align 8
     // OGCG: call void @llvm.memset.p0.i64(ptr align 8 %[[ARR]], i8 0, i64 8, i1 false)
    @@ -581,19 +451,11 @@ void array_with_complex_elements() {
     }
     
      // CIR: %[[ARR_ADDR:.*]] = cir.alloca !cir.array<!cir.complex<!cir.float> x 2>, !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>>, ["arr", init]
     -// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>> -> !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[CONST_COMPLEX_0:.*]] = cir.const #cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex<!cir.float>
     -// CIR: cir.store{{.*}} %[[CONST_COMPLEX_0]], %[[ARR_0]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s64i
     -// CIR: %[[ARR_1:.*]] = cir.ptr_stride %1, %[[IDX_1]] : (!cir.ptr<!cir.complex<!cir.float>>, !s64i) -> !cir.ptr<!cir.complex<!cir.float>>
     -// CIR: %[[CONST_COMPLEX_1:.*]] = cir.const #cir.const_complex<#cir.fp<3.300000e+00> : !cir.float, #cir.fp<4.400000e+00> : !cir.float> : !cir.complex<!cir.float>
     -// CIR: cir.store{{.*}} %[[CONST_COMPLEX_1]], %[[ARR_1]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
     +// CIR: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex<!cir.float>, #cir.const_complex<#cir.fp<3.300000e+00> : !cir.float, #cir.fp<4.400000e+00> : !cir.float> : !cir.complex<!cir.float>]> : !cir.array<!cir.complex<!cir.float> x 2>
     +// CIR: cir.store{{.*}} %[[CONST]], %[[ARR_ADDR]] : !cir.array<!cir.complex<!cir.float> x 2>, !cir.ptr<!cir.array<!cir.complex<!cir.float> x 2>>
     
     // LLVM: %[[ARR_ADDR:.*]] = alloca [2 x { float, float }], i64 1, align 16
    -// LLVM: %[[ARR_0:.*]] = getelementptr { float, float }, ptr %[[ARR_ADDR]], i32 0
    -// LLVM: store { float, float } { float 0x3FF19999A0000000, float 0x40019999A0000000 }, ptr %[[ARR_0]], align 8
    -// LLVM: %[[ARR_1:.*]] = getelementptr { float, float }, ptr %[[ARR_0]], i64 1
    -// LLVM: store { float, float } { float 0x400A666660000000, float 0x40119999A0000000 }, ptr %[[ARR_1]], align 8
    +// LLVM: store [2 x { float, float }] [{ float, float } { float 0x3FF19999A0000000, float 0x40019999A0000000 }, { float, float } { float 0x400A666660000000, float 0x40119999A0000000 }], ptr %[[ARR_ADDR]], align 16
     
     // OGCG: %[[ARR_ADDR:.*]] = alloca [2 x { float, float }], align 16
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 16 %[[ARR_ADDR]], ptr align 16 @__const._Z27array_with_complex_elementsv.arr, i64 16, i1 false)
    diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c
    index 65799881a0cbe..d5bea8446d730 100644
    --- a/clang/test/CIR/CodeGen/atomic.c
    +++ b/clang/test/CIR/CodeGen/atomic.c
    @@ -46,6 +46,32 @@ void f2(void) {
     // OGCG-NEXT:    store i32 42, ptr %[[SLOT]], align 4
     // OGCG:       }
     
    +void f3(_Atomic(int) *p) {
    +  *p = 42;
    +}
    +
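+// A plain assignment through a pointer to an _Atomic type must lower to a
+// seq_cst atomic store at every stage: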
    +// CIR-LABEL: @f3
    +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr
    +
    +// LLVM-LABEL: @f3
    +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4
    +
    +// OGCG-LABEL: @f3
    +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4
    +
    +void f4(_Atomic(float) *p) {
    +  *p = 3.14;
    +}
    +
    +// CIR-LABEL: @f4
    +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr
    +
    +// LLVM-LABEL: @f4
    +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4
    +
    +// OGCG-LABEL: @f4
    +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4
    +
     void load(int *ptr) {
       int x;
       __atomic_load(ptr, &x, __ATOMIC_RELAXED);
    diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c
    index 44c54b4a2969a..4520063c56ee6 100644
    --- a/clang/test/CIR/CodeGen/binassign.c
    +++ b/clang/test/CIR/CodeGen/binassign.c
    @@ -100,3 +100,107 @@ void binary_assign_struct() {
     // OGCG:   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false)
     // OGCG:   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true)
     // OGCG:   ret void
    +
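+// Assignment results that are chained, used as an array index or branch
+// condition, or simply discarded must still emit each store exactly once.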
    +int ignore_result_assign() {
    +  int arr[10];
    +  int i, j;
    +  j = i = 123, 0;
    +  j = arr[i = 5];
    +  int *p, *q = 0;
    +  if(p = q)
    +    return 1;
    +  return 0;
    +}
    +
    +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i
    +// CIR:         %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"]
    +// CIR:         %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr"]
    +// CIR:         %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i"]
    +// CIR:         %[[J:.*]] = cir.alloca !s32i, !cir.ptr, ["j"]
    +// CIR:         %[[P:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p"]
    +// CIR:         %[[Q:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["q", init]
    +// CIR:         %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i
    +// CIR:         cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr
    +// CIR:         cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr
    +// CIR:         %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i
    +// CIR:         cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr
    +// CIR:         %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr
    +// CIR:         %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr, !s32i) -> !cir.ptr
    +// CIR:         %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr, !s32i
    +// CIR:         cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : !s32i, !cir.ptr
    +// CIR:         %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr
    +// CIR:         cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr, !cir.ptr>
    +// CIR:         cir.scope {
    +// CIR:           %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr>, !cir.ptr
    +// CIR:           cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr, !cir.ptr>
    +// CIR:           %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr -> !cir.bool
    +// CIR:           cir.if %[[COND]] {
    +// CIR:             %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:             cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr
    +// CIR:             %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i
    +// CIR:             cir.return
    +// CIR:           }
    +// CIR:         }
    +// CIR:         %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr
    +// CIR:         %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i
    +// CIR:         cir.return
    +
    +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign()
    +// LLVM:         %[[RETVAL_PTR:.*]] = alloca i32
    +// LLVM:         %[[ARR_PTR:.*]] = alloca [10 x i32]
    +// LLVM:         %[[I_PTR:.*]] = alloca i32
    +// LLVM:         %[[J_PTR:.*]] = alloca i32
    +// LLVM:         %[[P_PTR:.*]] = alloca ptr
    +// LLVM:         %[[Q_PTR:.*]] = alloca ptr
    +// LLVM:         store i32 123, ptr %[[I_PTR]]
    +// LLVM:         store i32 123, ptr %[[J_PTR]]
    +// LLVM:         store i32 5, ptr %[[I_PTR]]
    +// LLVM:         %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    +// LLVM:         %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5
    +// LLVM:         %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]]
    +// LLVM:         store i32 %[[ARR_VAL]], ptr %[[J_PTR]]
    +// LLVM:         store ptr null, ptr %[[Q_PTR]]
    +// LLVM:         br label
    +// LLVM:         %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]]
    +// LLVM:         store ptr %[[Q_VAL]], ptr %[[P_PTR]]
    +// LLVM:         %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null
    +// LLVM:         br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]]
    +// LLVM:       [[THEN]]:
    +// LLVM:         store i32 1, ptr %[[RETVAL_PTR]]
    +// LLVM:         %{{.*}} = load i32, ptr %[[RETVAL_PTR]]
    +// LLVM:         ret i32
    +// LLVM:       [[ELSE]]:
    +// LLVM:         br label
    +// LLVM:         store i32 0, ptr %[[RETVAL_PTR]]
    +// LLVM:         %{{.*}} = load i32, ptr %[[RETVAL_PTR]]
    +// LLVM:         ret i32
    +
    +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign()
    +// OGCG:         %[[RETVAL:.*]] = alloca i32
    +// OGCG:         %[[ARR:.*]] = alloca [10 x i32]
    +// OGCG:         %[[I:.*]] = alloca i32
    +// OGCG:         %[[J:.*]] = alloca i32
    +// OGCG:         %[[P:.*]] = alloca ptr
    +// OGCG:         %[[Q:.*]] = alloca ptr
    +// OGCG:         store i32 123, ptr %[[I]]
    +// OGCG:         store i32 123, ptr %[[J]]
    +// OGCG:         store i32 5, ptr %[[I]]
    +// OGCG:         %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5
    +// OGCG:         %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]]
    +// OGCG:         store i32 %[[ARR_VAL]], ptr %[[J]]
    +// OGCG:         store ptr null, ptr %[[Q]]
    +// OGCG:         %[[Q_VAL:.*]] = load ptr, ptr %[[Q]]
    +// OGCG:         store ptr %[[Q_VAL]], ptr %[[P]]
    +// OGCG:         %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null
    +// OGCG:         br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
    +// OGCG:       [[IF_THEN]]:
    +// OGCG:         store i32 1, ptr %[[RETVAL]]
    +// OGCG:         br label %[[RETURN:.*]]
    +// OGCG:       [[IF_END]]:
    +// OGCG:         store i32 0, ptr %[[RETVAL]]
    +// OGCG:         br label %[[RETURN]]
    +// OGCG:       [[RETURN]]:
    +// OGCG:         %{{.*}} = load i32, ptr %[[RETVAL]]
    +// OGCG:         ret i32
    diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c
    index 193cc172d37d2..1b7de650662c7 100644
    --- a/clang/test/CIR/CodeGen/builtins-floating-point.c
    +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c
    @@ -7,14 +7,42 @@
     
     float cosf(float f) {
       return __builtin_cosf(f);
    -  // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float
    +  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float
       // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
       // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
     }
     
     double cos(double f) {
       return __builtin_cos(f);
    -  // CIR: {{.+}} = cir.cos {{.+}} : !cir.double
    +  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double
       // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
       // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
     }
    +
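+// Each builtin below should lower to the corresponding CIR floating-point op
+// and then to the matching LLVM intrinsic.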
    +float ceil(float f) {
    +  return __builtin_ceilf(f);
    +  // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float
    +  // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
    +  // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
    +}
    +
    +float expf(float f) {
    +  return __builtin_expf(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float
    +  // LLVM: %{{.*}} = call float @llvm.exp.f32(float %{{.*}})
    +  // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}})
    +}
    +
    +double exp(double f) {
    +  return __builtin_exp(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double
    +  // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}})
    +  // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}})
    +}
    +
    +long double expl(long double f) {
    +  return __builtin_expl(f);
    +  // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double
    +  // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}})
    +  // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}})
    +}
    diff --git a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    index a5070f51fad63..f2dbb3cc76ad2 100644
    --- a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    +++ b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
    @@ -237,18 +237,18 @@ void foo4() {
     // CXX_CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
     // CXX_CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
     // CXX_CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init]
    -// CXX_CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    -// CXX_CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: %[[TMP_A:.*]] = cir.load volatile {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
     // CXX_CIR: %[[RESULT:.*]] = cir.complex.add %[[TMP_B]], %[[TMP_A]] : !cir.complex
    -// CXX_CIR: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    -// CXX_CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CXX_CIR: cir.store volatile {{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CXX_CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
     // CXX_CIR: cir.store{{.*}} %[[TMP_B]], %[[C_ADDR]] : !cir.complex, !cir.ptr
     
     // CXX_LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
     // CXX_LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
     // CXX_LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    -// CXX_LLVM: %[[TMP_A:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4
    -// CXX_LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_A:.*]] = load volatile { i32, i32 }, ptr %[[A_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_B:.*]] = load volatile { i32, i32 }, ptr %[[B_ADDR]], align 4
     // CXX_LLVM: %[[B_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 0
     // CXX_LLVM: %[[B_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 1
     // CXX_LLVM: %[[A_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 0
    @@ -257,8 +257,8 @@ void foo4() {
     // CXX_LLVM: %[[ADD_IMAG:.*]] = add i32 %[[B_IMAG]], %[[A_IMAG]]
     // CXX_LLVM: %[[TMP_RESULT:.*]] = insertvalue { i32, i32 } poison, i32 %[[ADD_REAL]], 0
     // CXX_LLVM: %[[RESULT:.*]] = insertvalue { i32, i32 } %[[TMP_RESULT]], i32 %[[ADD_IMAG]], 1
    -// CXX_LLVM: store { i32, i32 } %[[RESULT]], ptr %[[B_ADDR]], align 4
    -// CXX_LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: store volatile { i32, i32 } %[[RESULT]], ptr %[[B_ADDR]], align 4
    +// CXX_LLVM: %[[TMP_B:.*]] = load volatile { i32, i32 }, ptr %[[B_ADDR]], align 4
     // CXX_LLVM: store { i32, i32 } %[[TMP_B]], ptr %[[C_ADDR]], align 4
     
     // CXX_OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4
    diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
    index 3fb78dc871904..82c9f2d7aaf26 100644
    --- a/clang/test/CIR/CodeGen/complex.cpp
    +++ b/clang/test/CIR/CodeGen/complex.cpp
    @@ -1495,3 +1495,185 @@ void calling_function_that_return_complex() {
     // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
     // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4
     // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4
    +
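+// The GNU imaginary-constant suffix yields a complex literal whose real part
+// is zero.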
    +void imag_literal_gnu_extension() {
    +  float _Complex a = 3.0fi;
    +  double _Complex b = 3.0i;
    +  int _Complex c = 3i;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init]
    +// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex
    +// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4
    +// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8
    +// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4
    +// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8
    +// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4
    +
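+// Copies between two volatile complex lvalues must use volatile loads and
+// volatile stores.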
    +void load_store_volatile() {
    +  volatile double _Complex a;
    +  volatile double _Complex b;
    +  a = b;
    +
    +  volatile int _Complex c;
    +  volatile int _Complex d;
    +  c = d;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"]
    +// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["d"]
    +// CIR: %[[TMP_B:.*]] = cir.load volatile {{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_B]], %[[A_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_D:.*]] = cir.load volatile {{.*}} %[[D_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_D]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[D_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[TMP_B:.*]] = load volatile { double, double }, ptr %[[B_ADDR]], align 8
    +// LLVM: store volatile { double, double } %[[TMP_B]], ptr %[[A_ADDR]], align 8
    +// LLVM: %[[TMP_D:.*]] = load volatile { i32, i32 }, ptr %[[D_ADDR]], align 4
    +// LLVM: store volatile { i32, i32 } %[[TMP_D]], ptr %[[C_ADDR]], align 4
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[D_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_REAL:.*]] = load volatile double, ptr %[[B_REAL_PTR]], align 8
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: %[[B_IMAG:.*]] = load volatile double, ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: store volatile double %[[B_REAL]], ptr %[[A_REAL_PTR]], align 8
    +// OGCG: store volatile double %[[B_IMAG]], ptr %[[A_IMAG_PTR]], align 8
    +// OGCG: %[[D_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 0
    +// OGCG: %[[D_REAL:.*]] = load volatile i32, ptr %[[D_REAL_PTR]], align 4
    +// OGCG: %[[D_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 1
    +// OGCG: %[[D_IMAG:.*]] = load volatile i32, ptr %[[D_IMAG_PTR]], align 4
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store volatile i32 %[[D_REAL]], ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store volatile i32 %[[D_IMAG]], ptr %[[C_IMAG_PTR]], align 4
    +
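+// When only one side of the copy is volatile, only that side's access is
+// marked volatile.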
    +void load_store_volatile_2() {
    +  volatile double _Complex av;
    +  double _Complex a;
    +  av = a;
    +
    +  double _Complex b;
    +  volatile double _Complex bv;
    +  b = bv;
    +
    +  int _Complex c;
    +  volatile int _Complex cv;
    +  c = cv;
    +
    +  volatile int _Complex dv;
    +  int _Complex d;
    +  dv = d;
    +}
    +
    +// CIR: %[[AV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["av"]
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"]
    +// CIR: %[[BV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["bv"]
    +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"]
    +// CIR: %[[CV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["cv"]
    +// CIR: %[[DV_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["dv"]
    +// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["d"]
    +// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_A]], %[[AV_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_BV:.*]] = cir.load volatile {{.*}} %[[BV_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store {{.*}} %[[TMP_BV]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_CV:.*]] = cir.load volatile {{.*}} %[[CV_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store {{.*}} %[[TMP_CV]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
    +// CIR: %[[TMP_D:.*]] = cir.load {{.*}} %[[D_ADDR]] : !cir.ptr>, !cir.complex
    +// CIR: cir.store volatile {{.*}} %[[TMP_D]], %[[DV_ADDR]] : !cir.complex, !cir.ptr>
    +
    +// LLVM: %[[AV_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[A_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[BV_ADDR:.*]] = alloca { double, double }, i64 1, align 8
    +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[CV_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[DV_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[D_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
    +// LLVM: %[[TMP_A:.*]] = load { double, double }, ptr %[[A_ADDR]], align 8
    +// LLVM: store volatile { double, double } %[[TMP_A]], ptr %[[AV_ADDR]], align 8
    +// LLVM: %[[TMP_BV:.*]] = load volatile { double, double }, ptr %[[BV_ADDR]], align 8
    +// LLVM: store { double, double } %[[TMP_BV]], ptr %[[B_ADDR]], align 8
    +// LLVM: %[[TMP_CV:.*]] = load volatile { i32, i32 }, ptr %[[CV_ADDR]], align 4
    +// LLVM: store { i32, i32 } %[[TMP_CV]], ptr %[[C_ADDR]], align 4
    +// LLVM: %[[TMP_D:.*]] = load { i32, i32 }, ptr %[[D_ADDR]], align 4
    +// LLVM: store volatile { i32, i32 } %[[TMP_D]], ptr %[[DV_ADDR]], align 4
    +
    +// OGCG: %[[AV_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[A_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[BV_ADDR:.*]] = alloca { double, double }, align 8
    +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[CV_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[DV_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[D_ADDR:.*]] = alloca { i32, i32 }, align 4
    +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 0
    +// OGCG: %[[A_REAL:.*]] = load double, ptr %[[A_REAL_PTR]], align 8
    +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[A_ADDR]], i32 0, i32 1
    +// OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8
    +// OGCG: %[[AV_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[AV_ADDR]], i32 0, i32 0
    +// OGCG: %[[AV_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[AV_ADDR]], i32 0, i32 1
    +// OGCG: store volatile double %[[A_REAL]], ptr %[[AV_REAL_PTR]], align 8
    +// OGCG: store volatile double %[[A_IMAG]], ptr %[[AV_IMAG_PTR]], align 8
    +// OGCG: %[[BV_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[BV_ADDR]], i32 0, i32 0
    +// OGCG: %[[BV_REAL:.*]] = load volatile double, ptr %[[BV_REAL_PTR]], align 8
    +// OGCG: %[[BV_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[BV_ADDR]], i32 0, i32 1
    +// OGCG: %[[BV_IMAG:.*]] = load volatile double, ptr %[[BV_IMAG_PTR]], align 8
    +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
    +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
    +// OGCG: store double %[[BV_REAL]], ptr %[[B_REAL_PTR]], align 8
    +// OGCG: store double %[[BV_IMAG]], ptr %[[B_IMAG_PTR]], align 8
    +// OGCG: %[[CV_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CV_ADDR]], i32 0, i32 0
    +// OGCG: %[[CV_REAL:.*]] = load volatile i32, ptr %[[CV_REAL_PTR]], align 4
    +// OGCG: %[[CV_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CV_ADDR]], i32 0, i32 1
    +// OGCG: %[[CV_IMAG:.*]] = load volatile i32, ptr %[[CV_IMAG_PTR]], align 4
    +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
    +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
    +// OGCG: store i32 %[[CV_REAL]], ptr %[[C_REAL_PTR]], align 4
    +// OGCG: store i32 %[[CV_IMAG]], ptr %[[C_IMAG_PTR]], align 4
    +// OGCG: %[[D_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 0
    +// OGCG: %[[D_REAL:.*]] = load i32, ptr %[[D_REAL_PTR]], align 4
    +// OGCG: %[[D_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[D_ADDR]], i32 0, i32 1
    +// OGCG: %[[D_IMAG:.*]] = load i32, ptr %[[D_IMAG_PTR]], align 4
    +// OGCG: %[[DV_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[DV_ADDR]], i32 0, i32 0
    +// OGCG: %[[DV_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[DV_ADDR]], i32 0, i32 1
    +// OGCG: store volatile i32 %[[D_REAL]], ptr %[[DV_REAL_PTR]], align 4
    +// OGCG: store volatile i32 %[[D_IMAG]], ptr %[[DV_IMAG_PTR]], align 4
    diff --git a/clang/test/CIR/CodeGen/compound_literal.cpp b/clang/test/CIR/CodeGen/compound_literal.cpp
    index 30a1dc03c449b..5219710d3e8bc 100644
    --- a/clang/test/CIR/CodeGen/compound_literal.cpp
    +++ b/clang/test/CIR/CodeGen/compound_literal.cpp
    @@ -79,17 +79,17 @@ void foo3() {
     }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init]
    -// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, [".compoundliteral", init]
     // CIR: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<10> : !s32i, #cir.int<20> : !s32i, #cir.int<30> : !s32i, #cir.int<40> : !s32i]> : !cir.vector<4 x !s32i>
    -// CIR: cir.store{{.*}} %[[VEC]], %[[CL_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
    -// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i>
    -// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
    +// CIR: cir.store{{.*}} %[[VEC]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr>
     
     // LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
    -// LLVM: %[[CL_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
-// LLVM: store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, ptr %[[CL_ADDR]], align 16
    -// LLVM: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16
    -// LLVM: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> <i32 10, i32 20, i32 30, i32 40>, ptr %[[A_ADDR]], align 16
    +
+// FIXME: OGCG emits a temporary compound literal in this case because
+//        EmitAutoVarAlloca omits vector types from its check for aggregate
+//        constants. This looks like an oversight in OGCG, since
+//        EmitStoresForConstant specifically handles vector types.
     
     // OGCG:  %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
     // OGCG: %[[CL_ADDR:.*]] = alloca <4 x i32>, align 16
    @@ -107,19 +107,12 @@ void foo4() {
     
     // CIR-LABEL: @_Z4foo4v
     // CIR:   %[[P:.*]] = cir.alloca !rec_Point, !cir.ptr, ["p", init]
    -// CIR:   %[[P_X:.*]] = cir.get_member %[[P]][0] {name = "x"}
    -// CIR:   %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i
    -// CIR:   cir.store{{.*}} %[[FIVE]], %[[P_X]]
    -// CIR:   %[[P_Y:.*]] = cir.get_member %[[P]][1] {name = "y"}
    -// CIR:   %[[TEN:.*]] = cir.const #cir.int<10> : !s32i
    -// CIR:   cir.store{{.*}} %[[TEN]], %[[P_Y]]
    +// CIR:   %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<5> : !s32i, #cir.int<10> : !s32i}> : !rec_Point
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[P]] : !rec_Point, !cir.ptr
     
     // LLVM-LABEL: @_Z4foo4v
     // LLVM:   %[[P:.*]] = alloca %struct.Point
    -// LLVM:   %[[P_X:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 0
    -// LLVM:   store i32 5, ptr %[[P_X]]
    -// LLVM:   %[[P_Y:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 1
    -// LLVM:   store i32 10, ptr %[[P_Y]]
    +// LLVM:   store %struct.Point { i32 5, i32 10 }, ptr %[[P]], align 4
     
     // OGCG-LABEL: @_Z4foo4v
     // OGCG:   %[[P:.*]] = alloca %struct.Point
    diff --git a/clang/test/CIR/CodeGen/coro-task.cpp b/clang/test/CIR/CodeGen/coro-task.cpp
    index 265325f82d7f7..5738c815909ea 100644
    --- a/clang/test/CIR/CodeGen/coro-task.cpp
    +++ b/clang/test/CIR/CodeGen/coro-task.cpp
    @@ -36,6 +36,12 @@ struct suspend_never {
       void await_resume() noexcept {}
     };
     
    +struct string {
    +  int size() const;
    +  string();
    +  string(char const *s);
    +};
    +
     } // namespace std
     
     namespace folly {
    @@ -101,7 +107,10 @@ co_invoke_fn co_invoke;
     }} // namespace folly::coro
     
// CIR-DAG: ![[VoidTask:.*]] = !cir.record<struct "folly::coro::Task<void>" padded {!u8i}>
    -
+// CIR-DAG: ![[IntTask:.*]] = !cir.record<struct "folly::coro::Task<int>" padded {!u8i}>
+// CIR-DAG: ![[VoidPromise:.*]] = !cir.record<struct "folly::coro::Task<void>::promise_type" padded {!u8i}>
+// CIR-DAG: ![[IntPromise:.*]] = !cir.record<struct "folly::coro::Task<int>::promise_type" padded {!u8i}>
+// CIR-DAG: ![[StdString:.*]] = !cir.record<struct "std::string" padded {!u8i}>
     // CIR: module {{.*}} {
     // CIR-NEXT: cir.global external @_ZN5folly4coro9co_invokeE = #cir.zero : !rec_folly3A3Acoro3A3Aco_invoke_fn
     
    @@ -119,6 +128,7 @@ VoidTask silly_task() {
     // CIR: cir.func coroutine dso_local @_Z10silly_taskv() -> ![[VoidTask]]
     // CIR: %[[VoidTaskAddr:.*]] = cir.alloca ![[VoidTask]], {{.*}}, ["__retval"]
     // CIR: %[[SavedFrameAddr:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__coro_frame_addr"]
+// CIR: %[[VoidPromiseAddr:.*]] = cir.alloca ![[VoidPromise]], {{.*}}, ["__promise"]
     
     // Get coroutine id with __builtin_coro_id.
     
    @@ -138,3 +148,27 @@ VoidTask silly_task() {
     // CIR: }
     // CIR: %[[Load0:.*]] = cir.load{{.*}} %[[SavedFrameAddr]] : !cir.ptr>, !cir.ptr
     // CIR: %[[CoroFrameAddr:.*]] = cir.call @__builtin_coro_begin(%[[CoroId]], %[[Load0]])
    +
    +// Call promise.get_return_object() to retrieve the task object.
    +
+// CIR: %[[RetObj:.*]] = cir.call @_ZN5folly4coro4TaskIvE12promise_type17get_return_objectEv(%[[VoidPromiseAddr]]) nothrow : {{.*}} -> ![[VoidTask]]
    +// CIR: cir.store{{.*}} %[[RetObj]], %[[VoidTaskAddr]] : ![[VoidTask]]
    +
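+// A coroutine taking its argument by reference: the reference parameter is
+// copied into the coroutine frame before the promise is used.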
+folly::coro::Task<int> byRef(const std::string& s) {
    +  co_return s.size();
    +}
    +
    +// CIR:  cir.func coroutine dso_local @_Z5byRefRKSt6string(%[[ARG:.*]]: !cir.ptr {{.*}}) -> ![[IntTask]]
    +// CIR:    %[[AllocaParam:.*]] = cir.alloca !cir.ptr, {{.*}}, ["s", init, const]
    +// CIR:    %[[IntTaskAddr:.*]] = cir.alloca ![[IntTask]], {{.*}}, ["__retval"]
    +// CIR:    %[[SavedFrameAddr:.*]]  = cir.alloca !cir.ptr, !cir.ptr>, ["__coro_frame_addr"]
    +// CIR:    %[[AllocaFnUse:.*]] = cir.alloca !cir.ptr, {{.*}}, ["s", init, const]
+// CIR:    %[[IntPromiseAddr:.*]] = cir.alloca ![[IntPromise]], {{.*}}, ["__promise"]
    +// CIR:    cir.store %[[ARG]], %[[AllocaParam]] : !cir.ptr, {{.*}}
    +
    +// Call promise.get_return_object() to retrieve the task object.
    +
    +// CIR:    %[[LOAD:.*]] = cir.load %[[AllocaParam]] : !cir.ptr>, !cir.ptr
    +// CIR:    cir.store {{.*}} %[[LOAD]], %[[AllocaFnUse]] : !cir.ptr, !cir.ptr>
+// CIR:    %[[RetObj:.*]] = cir.call @_ZN5folly4coro4TaskIiE12promise_type17get_return_objectEv(%[[IntPromiseAddr]]) nothrow : {{.*}} -> ![[IntTask]]
    +// CIR:    cir.store {{.*}} %[[RetObj]], %[[IntTaskAddr]] : ![[IntTask]]
    diff --git a/clang/test/CIR/CodeGen/ctor-null-init.cpp b/clang/test/CIR/CodeGen/ctor-null-init.cpp
    new file mode 100644
    index 0000000000000..4324b329c8b41
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/ctor-null-init.cpp
    @@ -0,0 +1,31 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir --check-prefix=CIR %s
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll --check-prefix=OGCG %s
    +
    +struct A {
    +  A() = default;
    +  A(int); // This constructor triggers the null base class initialization.
    +};
    +
    +struct B : A {
    +};
    +
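+// B{} null-initializes the empty A base, which should produce no stores.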
    +void test_empty_base_null_init() {
    +  B{};
    +}
    +
    +// CIR: cir.func {{.*}} @_Z25test_empty_base_null_initv()
    +// CIR-NEXT:   %[[B_ADDR:.*]] = cir.alloca !rec_B, !cir.ptr, ["agg.tmp.ensured"]
    +// CIR-NEXT:   %[[A_ADDR:.*]] = cir.base_class_addr %[[B_ADDR]] : !cir.ptr nonnull [0] -> !cir.ptr
    +
    +// LLVM: define{{.*}} @_Z25test_empty_base_null_initv()
    +// LLVM-NEXT:   %[[B:.*]] = alloca %struct.B
    +// LLVM-NEXT:   ret void
    +
    +// OGCG: define{{.*}} @_Z25test_empty_base_null_initv()
    +// OGCG-NEXT: entry:
    +// OGCG-NEXT:   %[[B:.*]] = alloca %struct.B
    +// OGCG-NEXT:   ret void
    diff --git a/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
    new file mode 100644
    index 0000000000000..ac4cac429cb0f
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
    @@ -0,0 +1,39 @@
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
    +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
    +
    +struct HasOpEq {
    +  bool operator==(const HasOpEq &) const;
    +};
    +
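+// In C++20, a != b is a rewritten operator: the call to operator== is
+// followed by a negation of its result.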
    +void cxx_rewritten_binary_operator_scalar_expr() {
    +  HasOpEq a;
    +  HasOpEq b;
    +  bool neq = a != b;
    +}
    +
    +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr, ["a"]
    +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr, ["b"]
    +// CIR: %[[NEQ_ADDR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["neq", init]
    +// CIR: %[[EQ:.*]] = cir.call @_ZNK7HasOpEqeqERKS_(%[[A_ADDR]], %[[B_ADDR]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
    +// CIR: %[[NEQ:.*]] = cir.unary(not, %[[EQ]]) : !cir.bool, !cir.bool
    +// CIR: cir.store{{.*}} %[[NEQ]], %[[NEQ_ADDR]] : !cir.bool, !cir.ptr
    +
    +// LLVM: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1
    +// LLVM: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1
    +// LLVM: %[[NEQ_ADDR:.*]] = alloca i8, i64 1, align 1
    +// LLVM: %[[EQ:.*]] = call i1 @_ZNK7HasOpEqeqERKS_(ptr %[[A_ADDR]], ptr %[[B_ADDR]])
    +// LLVM: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true
    +// LLVM: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8
    +// LLVM: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1
    +
    +// OGCG: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, align 1
    +// OGCG: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, align 1
    +// OGCG: %[[NEQ_ADDR:.*]] = alloca i8, align 1
    +// OGCG: %[[EQ:.*]] = call {{.*}} zeroext i1 @_ZNK7HasOpEqeqERKS_(ptr {{.*}} %[[A_ADDR]], ptr {{.*}} %[[B_ADDR]])
    +// OGCG: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true
    +// OGCG: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8
    +// OGCG: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1
    diff --git a/clang/test/CIR/CodeGen/derived-to-base.cpp b/clang/test/CIR/CodeGen/derived-to-base.cpp
    new file mode 100644
    index 0000000000000..13acb47022c65
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/derived-to-base.cpp
    @@ -0,0 +1,129 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
    +// TODO(cir): The constructors in this test case are only here because we don't
    +//            have support for zero-initialization of base classes yet. We should
    +//            fix that soon.
    +
    +struct Base {
    +  Base();
    +  void f();
    +  int a;
    +};
    +
    +struct Derived : Base {
    +  Derived();
    +  double b;
    +};
    +
    +void f() {
    +  Derived d;
    +  d.f();
    +}
    +
    +// CIR: cir.func {{.*}} @_Z1fv()
    +// CIR:   %[[D:.*]] = cir.alloca !rec_Derived, !cir.ptr, ["d", init]
    +// CIR:   cir.call @_ZN7DerivedC1Ev(%[[D]]) : (!cir.ptr) -> ()
    +// CIR:   %[[D_BASE:.*]] = cir.base_class_addr %[[D]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.call @_ZN4Base1fEv(%[[D_BASE]]) : (!cir.ptr) -> ()
    +
    +// LLVM: define {{.*}}void @_Z1fv()
    +// LLVM:   %[[D:.*]] = alloca %struct.Derived
    +// LLVM:   call void @_ZN7DerivedC1Ev(ptr %[[D]])
    +// LLVM:   call void @_ZN4Base1fEv(ptr %[[D]])
    +
    +// OGCG: define {{.*}}void @_Z1fv()
    +// OGCG:   %[[D:.*]] = alloca %struct.Derived
    +// OGCG:   call void @_ZN7DerivedC1Ev(ptr {{.*}} %[[D]])
    +// OGCG:   call void @_ZN4Base1fEv(ptr {{.*}} %[[D]])
    +
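+// Passing a Derived* where a Base* is expected inserts an implicit
+// derived-to-base conversion at the call site.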
    +void useBase(Base *base);
    +void callBaseUsingDerived(Derived *derived) {
    +  useBase(derived);
    +}
+
    +// CIR: cir.func {{.*}} @_Z20callBaseUsingDerivedP7Derived(%[[DERIVED_ARG:.*]]: !cir.ptr {{.*}})
    +// CIR:   %[[DERIVED_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["derived", init]
    +// CIR:   cir.store %[[DERIVED_ARG]], %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED:.*]] = cir.load{{.*}} %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED_BASE:.*]] = cir.base_class_addr %[[DERIVED]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.call @_Z7useBaseP4Base(%[[DERIVED_BASE]]) : (!cir.ptr) -> ()
    +
    +// LLVM: define {{.*}} void @_Z20callBaseUsingDerivedP7Derived(ptr %[[DERIVED_ARG:.*]])
    +// LLVM:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// LLVM:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// LLVM:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +// LLVM:   call void @_Z7useBaseP4Base(ptr %[[DERIVED]])
    +
    +// OGCG: define {{.*}} void @_Z20callBaseUsingDerivedP7Derived(ptr {{.*}} %[[DERIVED_ARG:.*]])
    +// OGCG:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// OGCG:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// OGCG:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +// OGCG:   call void @_Z7useBaseP4Base(ptr {{.*}} %[[DERIVED]])
    +
    +Base *returnBaseFromDerived(Derived* derived) {
    +  return derived;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z21returnBaseFromDerivedP7Derived(%[[DERIVED_ARG:.*]]: !cir.ptr {{.*}}) -> !cir.ptr
    +// CIR:   %[[DERIVED_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["derived", init]
    +// CIR:   %[[BASE_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__retval"]
    +// CIR:   cir.store %[[DERIVED_ARG]], %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED:.*]] = cir.load{{.*}} %[[DERIVED_ADDR]]
    +// CIR:   %[[DERIVED_BASE:.*]] = cir.base_class_addr %[[DERIVED]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   cir.store %[[DERIVED_BASE]], %[[BASE_ADDR]]
    +// CIR:   %[[BASE:.*]] = cir.load{{.*}} %[[BASE_ADDR]]
    +// CIR:   cir.return %[[BASE]] : !cir.ptr
    +
    +// LLVM: define {{.*}} ptr @_Z21returnBaseFromDerivedP7Derived(ptr %[[DERIVED_ARG:.*]])
    +// LLVM:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// LLVM:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// LLVM:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +
    +// OGCG: define {{.*}} ptr @_Z21returnBaseFromDerivedP7Derived(ptr {{.*}} %[[DERIVED_ARG:.*]])
    +// OGCG:   %[[DERIVED_ADDR:.*]] = alloca ptr
    +// OGCG:   store ptr %[[DERIVED_ARG]], ptr %[[DERIVED_ADDR]]
    +// OGCG:   %[[DERIVED:.*]] = load ptr, ptr %[[DERIVED_ADDR]]
    +
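+// Member accesses that go through a base-class subobject of a volatile
+// object must remain volatile.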
    +volatile Derived derivedObj;
    +
    +void test_volatile_store() {
    +  derivedObj.a = 0;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z19test_volatile_storev()
    +// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:   %[[DERIVED_OBJ:.*]] = cir.get_global @derivedObj : !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_BASE:.*]] = cir.base_class_addr %[[DERIVED_OBJ]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_A:.*]] = cir.get_member %[[DERIVED_OBJ_BASE]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR:   cir.store volatile {{.*}} %[[ZERO]], %[[DERIVED_OBJ_A]] : !s32i, !cir.ptr
    +
    +// LLVM: define {{.*}} void @_Z19test_volatile_storev()
    +// LLVM:   store volatile i32 0, ptr @derivedObj
    +
    +// OGCG: define {{.*}} void @_Z19test_volatile_storev()
    +// OGCG:   store volatile i32 0, ptr @derivedObj
    +
    +void test_volatile_load() {
    +  [[maybe_unused]] int val = derivedObj.a;
    +}
    +
    +// CIR: cir.func {{.*}} @_Z18test_volatile_loadv()
    +// CIR:   %[[DERIVED_OBJ:.*]] = cir.get_global @derivedObj : !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_BASE:.*]] = cir.base_class_addr %[[DERIVED_OBJ]] : !cir.ptr nonnull [0] -> !cir.ptr
    +// CIR:   %[[DERIVED_OBJ_A:.*]] = cir.get_member %[[DERIVED_OBJ_BASE]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR:   %[[VAL:.*]] = cir.load volatile {{.*}} %[[DERIVED_OBJ_A]] : !cir.ptr, !s32i
    +
    +// LLVM: define {{.*}} void @_Z18test_volatile_loadv()
    +// LLVM:   %[[VAL_ADDR:.*]] = alloca i32
    +// LLVM:   %[[DERIVED_OBJ:.*]] = load volatile i32, ptr @derivedObj
    +
    +// OGCG: define {{.*}} void @_Z18test_volatile_loadv()
    +// OGCG:   %[[VAL_ADDR:.*]] = alloca i32
    +// OGCG:   %[[DERIVED_OBJ:.*]] = load volatile i32, ptr @derivedObj
    +// OGCG:   store i32 %[[DERIVED_OBJ]], ptr %[[VAL_ADDR]]
    diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp
    index 3d286664bba85..463434c38a1af 100644
    --- a/clang/test/CIR/CodeGen/loop.cpp
    +++ b/clang/test/CIR/CodeGen/loop.cpp
    @@ -312,23 +312,10 @@ void l5() {
     // CIR:     %[[BEGIN_ADDR:.*]] = cir.alloca {{.*}} ["__begin1", init]
     // CIR:     %[[END_ADDR:.*]] = cir.alloca {{.*}} ["__end1", init]
     // CIR:     %[[X_ADDR:.*]] = cir.alloca {{.*}} ["x", init]
    -// CIR:     %[[ARR_CAST:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : {{.*}}
    -// CIR:     %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CIR:     cir.store{{.*}} %[[ONE]], %[[ARR_CAST]]
    -// CIR:     %[[OFFSET1:.*]] = cir.const #cir.int<1> : !s64i
    -// CIR:     %[[STRIDE:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET1]] : ({{.*}}, {{.*}})
    -// CIR:     %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
    -// CIR:     cir.store{{.*}} %[[TWO]], %[[STRIDE]]
    -// CIR:     %[[OFFSET2:.*]] = cir.const #cir.int<2> : !s64i
    -// CIR:     %[[STRIDE2:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET2]] : ({{.*}}, {{.*}})
    -// CIR:     %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
    -// CIR:     cir.store{{.*}} %[[THREE]], %[[STRIDE2]]
    -// CIR:     %[[OFFSET3:.*]] = cir.const #cir.int<3> : !s64i
    -// CIR:     %[[STRIDE3:.*]] = cir.ptr_stride %[[ARR_CAST]], %[[OFFSET3]] : ({{.*}}, {{.*}})
    -// CIR:     %[[FOUR:.*]] = cir.const #cir.int<4> : !s32i
    -// CIR:     cir.store{{.*}} %[[FOUR]], %[[STRIDE3]]
    +// CIR:     %[[ARR_INIT:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i]>
    +// CIR:     cir.store{{.*}} %[[ARR_INIT]], %[[ARR_ADDR]]
     // CIR:     cir.store{{.*}} %[[ARR_ADDR]], %[[RANGE_ADDR]]
    -// CIR:     %[[RANGE_LOAD:.*]] = cir.load{{.*}} %[[RANGE_ADDR]]
    +// CIR:     %[[RANGE_LOAD:.*]] = cir.load %[[RANGE_ADDR]]
     // CIR:     %[[RANGE_CAST:.*]] = cir.cast array_to_ptrdecay %[[RANGE_LOAD]] : {{.*}}
     // CIR:     cir.store{{.*}} %[[RANGE_CAST]], %[[BEGIN_ADDR]]
     // CIR:     %[[BEGIN:.*]] = cir.load{{.*}} %[[RANGE_ADDR]]
    @@ -363,14 +350,7 @@ void l5() {
     // LLVM:   %[[X_ADDR:.*]] = alloca i32
     // LLVM:   br label %[[SETUP:.*]]
     // LLVM: [[SETUP]]:
    -// LLVM:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_ADDR]], i32 0
    -// LLVM:   store i32 1, ptr %[[ARR_0]]
    -// LLVM:   %[[ARR_1:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 1
    -// LLVM:   store i32 2, ptr %[[ARR_1]]
    -// LLVM:   %[[ARR_2:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 2
    -// LLVM:   store i32 3, ptr %[[ARR_2]]
    -// LLVM:   %[[ARR_3:.*]] = getelementptr i32, ptr %[[ARR_0]], i64 3
    -// LLVM:   store i32 4, ptr %[[ARR_3]]
    +// LLVM:   store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr %[[ARR_ADDR]]
     // LLVM:   store ptr %[[ARR_ADDR]], ptr %[[RANGE_ADDR]]
     // LLVM:   %[[BEGIN:.*]] = load ptr, ptr %[[RANGE_ADDR]]
     // LLVM:   %[[BEGIN_CAST:.*]] = getelementptr i32, ptr %[[BEGIN]], i32 0
    diff --git a/clang/test/CIR/CodeGen/object-size-flex-array.c b/clang/test/CIR/CodeGen/object-size-flex-array.c
    new file mode 100644
    index 0000000000000..74229fd1fac6c
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size-flex-array.c
    @@ -0,0 +1,317 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR --check-prefix=CIR-NO-STRICT
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -emit-llvm -disable-llvm-passes %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM --check-prefix=LLVM-NO-STRICT
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm -disable-llvm-passes %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG --check-prefix=OGCG-NO-STRICT
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-cir %s -o %t-strict-0.cir
    +// RUN: FileCheck --input-file=%t-strict-0.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-0
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-0.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-0.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-0
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=0 -emit-llvm -disable-llvm-passes %s -o %t-strict-0.ll
    +// RUN: FileCheck --input-file=%t-strict-0.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-0
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-cir %s -o %t-strict-1.cir
    +// RUN: FileCheck --input-file=%t-strict-1.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-1
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-1.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-1.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-1
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=1 -emit-llvm -disable-llvm-passes %s -o %t-strict-1.ll
    +// RUN: FileCheck --input-file=%t-strict-1.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-1
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-cir %s -o %t-strict-2.cir
    +// RUN: FileCheck --input-file=%t-strict-2.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-2
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-2.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-2.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-2
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=2 -emit-llvm -disable-llvm-passes %s -o %t-strict-2.ll
    +// RUN: FileCheck --input-file=%t-strict-2.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-2
    +
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-cir %s -o %t-strict-3.cir
    +// RUN: FileCheck --input-file=%t-strict-3.cir %s --check-prefix=CIR --check-prefix=CIR-STRICT-3
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fclangir -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-cir-strict-3.ll
    +// RUN: FileCheck --input-file=%t-cir-strict-3.ll %s --check-prefix=LLVM --check-prefix=LLVM-STRICT-3
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -fstrict-flex-arrays=3 -emit-llvm -disable-llvm-passes %s -o %t-strict-3.ll
    +// RUN: FileCheck --input-file=%t-strict-3.ll %s --check-prefix=OGCG --check-prefix=OGCG-STRICT-3
    +
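+// -fstrict-flex-arrays controls which trailing arrays are treated as flexible
+// array members: level 0 accepts any trailing array, level 1 requires size 0
+// or 1, level 2 requires size 0, and level 3 accepts only an incomplete type.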
    +#define OBJECT_SIZE_BUILTIN __builtin_object_size
    +
    +typedef struct {
    +  float f;
    +  double c[];
    +} foo_t;
    +
    +typedef struct {
    +  float f;
    +  double c[0];
    +} foo0_t;
    +
    +typedef struct {
    +  float f;
    +  double c[1];
    +} foo1_t;
    +
    +typedef struct {
    +  float f;
    +  double c[2];
    +} foo2_t;
    +
    +// CIR-LABEL: @bar
    +// LLVM-LABEL: @bar(
    +// OGCG-LABEL: @bar(
    +unsigned bar(foo_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar0
    +// LLVM-LABEL: @bar0(
    +// OGCG-LABEL: @bar0(
    +unsigned bar0(foo0_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-3: ret i32 0
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar1
    +// LLVM-LABEL: @bar1(
    +// OGCG-LABEL: @bar1(
    +unsigned bar1(foo1_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @bar2
    +// LLVM-LABEL: @bar2(
    +// OGCG-LABEL: @bar2(
    +unsigned bar2(foo2_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +#define DYNAMIC_OBJECT_SIZE_BUILTIN __builtin_dynamic_object_size
    +
    +// CIR-LABEL: @dyn_bar
    +// LLVM-LABEL: @dyn_bar(
    +// OGCG-LABEL: @dyn_bar(
    +unsigned dyn_bar(foo_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-3: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar0
    +// LLVM-LABEL: @dyn_bar0(
    +// OGCG-LABEL: @dyn_bar0(
    +unsigned dyn_bar0(foo0_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-3: ret i32 0
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar1
    +// LLVM-LABEL: @dyn_bar1(
    +// OGCG-LABEL: @dyn_bar1(
    +unsigned dyn_bar1(foo1_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @dyn_bar2
    +// LLVM-LABEL: @dyn_bar2(
    +// OGCG-LABEL: @dyn_bar2(
    +unsigned dyn_bar2(foo2_t *f) {
    +  // CIR-NO-STRICT: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-0: cir.objsize max nullunknown dynamic {{.*}} : !cir.ptr -> !u64i
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-0: llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1 true)
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return DYNAMIC_OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// Also checks for non-trailing flex-array-like members.
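    +// A zero- or one-element array that is not the last member can never be
    +// treated as a flexible array member, so every strictness level folds the
    +// result to the declared array size.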
    +
    +typedef struct {
    +  double c[0];
    +  float f;
    +} foofoo0_t;
    +
    +typedef struct {
    +  double c[1];
    +  float f;
    +} foofoo1_t;
    +
    +typedef struct {
    +  double c[2];
    +  float f;
    +} foofoo2_t;
    +
    +// CIR-LABEL: @babar0
    +// LLVM-LABEL: @babar0(
    +// OGCG-LABEL: @babar0(
    +unsigned babar0(foofoo0_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<0>
    +  // CIR-STRICT-0: cir.const #cir.int<0>
    +  // CIR-STRICT-1: cir.const #cir.int<0>
    +  // CIR-STRICT-2: cir.const #cir.int<0>
    +  // CIR-STRICT-3: cir.const #cir.int<0>
    +  // LLVM-NO-STRICT: store i32 0
    +  // LLVM-STRICT-0: store i32 0
    +  // LLVM-STRICT-1: store i32 0
    +  // LLVM-STRICT-2: store i32 0
    +  // LLVM-STRICT-3: store i32 0
    +  // OGCG-NO-STRICT: ret i32 0
    +  // OGCG-STRICT-0: ret i32 0
    +  // OGCG-STRICT-1: ret i32 0
    +  // OGCG-STRICT-2: ret i32 0
    +  // OGCG-STRICT-3: ret i32 0
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @babar1
    +// LLVM-LABEL: @babar1(
    +// OGCG-LABEL: @babar1(
    +unsigned babar1(foofoo1_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<8>
    +  // CIR-STRICT-0: cir.const #cir.int<8>
    +  // CIR-STRICT-1: cir.const #cir.int<8>
    +  // CIR-STRICT-2: cir.const #cir.int<8>
    +  // CIR-STRICT-3: cir.const #cir.int<8>
    +  // LLVM-NO-STRICT: store i32 8
    +  // LLVM-STRICT-0: store i32 8
    +  // LLVM-STRICT-1: store i32 8
    +  // LLVM-STRICT-2: store i32 8
    +  // LLVM-STRICT-3: store i32 8
    +  // OGCG-NO-STRICT: ret i32 8
    +  // OGCG-STRICT-0: ret i32 8
    +  // OGCG-STRICT-1: ret i32 8
    +  // OGCG-STRICT-2: ret i32 8
    +  // OGCG-STRICT-3: ret i32 8
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    +
    +// CIR-LABEL: @babar2
    +// LLVM-LABEL: @babar2(
    +// OGCG-LABEL: @babar2(
    +unsigned babar2(foofoo2_t *f) {
    +  // CIR-NO-STRICT: cir.const #cir.int<16>
    +  // CIR-STRICT-0: cir.const #cir.int<16>
    +  // CIR-STRICT-1: cir.const #cir.int<16>
    +  // CIR-STRICT-2: cir.const #cir.int<16>
    +  // CIR-STRICT-3: cir.const #cir.int<16>
    +  // LLVM-NO-STRICT: store i32 16
    +  // LLVM-STRICT-0: store i32 16
    +  // LLVM-STRICT-1: store i32 16
    +  // LLVM-STRICT-2: store i32 16
    +  // LLVM-STRICT-3: store i32 16
    +  // OGCG-NO-STRICT: ret i32 16
    +  // OGCG-STRICT-0: ret i32 16
    +  // OGCG-STRICT-1: ret i32 16
    +  // OGCG-STRICT-2: ret i32 16
    +  // OGCG-STRICT-3: ret i32 16
    +  return OBJECT_SIZE_BUILTIN(f->c, 1);
    +}
    diff --git a/clang/test/CIR/CodeGen/object-size.c b/clang/test/CIR/CodeGen/object-size.c
    new file mode 100644
    index 0000000000000..1b10fb8b352cf
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size.c
    @@ -0,0 +1,877 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
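    +// In the llvm.objectsize checks below, the i1 arguments are, in order:
    +// min (true = return the minimum remaining size), nullunknown (true = a
    +// null pointer has unknown size), and dynamic (true only for
    +// __builtin_dynamic_object_size). The builtin's type argument uses bit 0
    +// to select the closest enclosing subobject and bit 1 to request a minimum
    +// instead of a maximum.
    +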
    +char gbuf[63];
    +char *gp;
    +int gi, gj;
    +
    +// CIR-LABEL: @test1
    +// LLVM-LABEL: define {{.*}} void @test1
    +// OGCG-LABEL: define {{.*}} void @test1
    +void test1(void) {
    +  // CIR: cir.const #cir.int<59>
    +  // LLVM: store i32 59
    +  // OGCG: store i32 59
    +  gi = __builtin_object_size(&gbuf[4], 1);
    +}
    +
    +// CIR-LABEL: @test2
    +// LLVM-LABEL: define {{.*}} void @test2
    +// OGCG-LABEL: define {{.*}} void @test2
    +void test2(void) {
    +  // CIR: cir.const #cir.int<63>
    +  // LLVM: store i32 63
    +  // OGCG: store i32 63
    +  gi = __builtin_object_size(gbuf, 1);
    +}
    +
    +// CIR-LABEL: @test3
    +// LLVM-LABEL: define {{.*}} void @test3
    +// OGCG-LABEL: define {{.*}} void @test3
    +void test3(void) {
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&gbuf[100], 1);
    +}
    +
    +// CIR-LABEL: @test4
    +// LLVM-LABEL: define {{.*}} void @test4
    +// OGCG-LABEL: define {{.*}} void @test4
    +void test4(void) {
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)(void*)&gbuf[-1], 1);
    +}
    +
    +// CIR-LABEL: @test5
    +// LLVM-LABEL: define {{.*}} void @test5
    +// OGCG-LABEL: define {{.*}} void @test5
    +void test5(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(gp, 0);
    +}
    +
    +// CIR-LABEL: @test6
    +// LLVM-LABEL: define {{.*}} void @test6
    +// OGCG-LABEL: define {{.*}} void @test6
    +void test6(void) {
    +  char buf[57];
    +
    +  // CIR: cir.const #cir.int<53>
    +  // LLVM: store i32 53
    +  // OGCG: store i32 53
    +  gi = __builtin_object_size(&buf[4], 1);
    +}
    +
    +// CIR-LABEL: @test18
    +// LLVM-LABEL: define {{.*}} i32 @test18
    +// OGCG-LABEL: define {{.*}} i32 @test18
    +unsigned test18(int cond) {
    +  int a[4], b[4];
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64
    +  // OGCG: call i64 @llvm.objectsize.i64
    +  return __builtin_object_size(cond ? a : b, 0);
    +}
    +
    +// CIR-LABEL: @test19
    +// LLVM-LABEL: define {{.*}} void @test19
    +// OGCG-LABEL: define {{.*}} void @test19
    +void test19(void) {
    +  struct {
    +    int a, b;
    +  } foo;
    +
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&foo.a, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.a, 1);
    +  
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&foo.a, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.a, 3);
    +
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 1);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&foo.b, 3);
    +}
    +
    +// CIR-LABEL: @test20
    +// LLVM-LABEL: define {{.*}} void @test20
    +// OGCG-LABEL: define {{.*}} void @test20
    +void test20(void) {
    +  struct { int t[10]; } t[10];
    +
    +  // CIR: cir.const #cir.int<380>
    +  // LLVM: store i32 380
    +  // OGCG: store i32 380
    +  gi = __builtin_object_size(&t[0].t[5], 0);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&t[0].t[5], 1);
    +  
    +  // CIR: cir.const #cir.int<380>
    +  // LLVM: store i32 380
    +  // OGCG: store i32 380
    +  gi = __builtin_object_size(&t[0].t[5], 2);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&t[0].t[5], 3);
    +}
    +
    +// CIR-LABEL: @test21
    +// LLVM-LABEL: define {{.*}} void @test21
    +// OGCG-LABEL: define {{.*}} void @test21
    +void test21(void) {
    +  struct { int t; } t;
    +
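    +  // A pointer one past the end of the object has no bytes remaining, so
    +  // every mode folds to zero.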
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t + 1, 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t.t + 1, 3);
    +}
    +
    +// CIR-LABEL: @test22
    +// LLVM-LABEL: define {{.*}} void @test22
    +// OGCG-LABEL: define {{.*}} void @test22
    +void test22(void) {
    +  struct { int t[10]; } t[10];
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[10], 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[9].t[10], 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[0] + sizeof(t), 3);
    +
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 0);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 1);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((char*)&t[9].t[0] + 10*sizeof(t[0].t), 3);
    +}
    +
    +struct Test23Ty { int a; int t[10]; };
    +
    +// CIR-LABEL: @test23
    +// LLVM-LABEL: define {{.*}} void @test23
    +// OGCG-LABEL: define {{.*}} void @test23
    +void test23(struct Test23Ty *p) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(p, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(p, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(p, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(p, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->a, 0);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&p->a, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&p->a, 2);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&p->a, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&p->t[5], 2);
    +  
    +  // CIR: cir.const #cir.int<20>
    +  // LLVM: store i32 20
    +  // OGCG: store i32 20
    +  gi = __builtin_object_size(&p->t[5], 3);
    +}
    +
    +// CIR-LABEL: @test24
    +// LLVM-LABEL: define {{.*}} void @test24
    +// OGCG-LABEL: define {{.*}} void @test24
    +void test24(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size((void*)0, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((void*)0, 3);
    +}
    +
    +// CIR-LABEL: @test25
    +// LLVM-LABEL: define {{.*}} void @test25
    +// OGCG-LABEL: define {{.*}} void @test25
    +void test25(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size((void*)0x1000, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size((void*)0x1000, 3);
    +
    +  // Skipping the (void*)0 + 0x1000 tests: void pointer arithmetic is NYI in CIR.
    +}
    +
    +// CIR-LABEL: @test26
    +// LLVM-LABEL: define {{.*}} void @test26
    +// OGCG-LABEL: define {{.*}} void @test26
    +void test26(void) {
    +  struct { int v[10]; } t[10];
    +
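    +  // The inner indices run past the 40-byte subobject but stay inside the
    +  // 400-byte whole object, so modes 0-2 fold to the bytes remaining from
    +  // the overflowed offset, and mode 3 gives up and folds to zero.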
    +  // CIR: cir.const #cir.int<316>
    +  // LLVM: store i32 316
    +  // OGCG: store i32 316
    +  gi = __builtin_object_size(&t[1].v[11], 0);
    +  
    +  // CIR: cir.const #cir.int<312>
    +  // LLVM: store i32 312
    +  // OGCG: store i32 312
    +  gi = __builtin_object_size(&t[1].v[12], 1);
    +  
    +  // CIR: cir.const #cir.int<308>
    +  // LLVM: store i32 308
    +  // OGCG: store i32 308
    +  gi = __builtin_object_size(&t[1].v[13], 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&t[1].v[14], 3);
    +}
    +
    +struct Test27IncompleteTy;
    +
    +// CIR-LABEL: @test27
    +// LLVM-LABEL: define {{.*}} void @test27
    +// OGCG-LABEL: define {{.*}} void @test27
    +void test27(struct Test27IncompleteTy *t) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(t, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(t, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(t, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(t, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&test27, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(&test27, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr {{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(&test27, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(&test27, 3);
    +}
    +
    +// CIR-LABEL: @test28
    +// LLVM-LABEL: define {{.*}} void @test28
    +// OGCG-LABEL: define {{.*}} void @test28
    +void test28(void) {
    +  struct { int v[10]; } t[10];
    +
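    +  // Casts through unrelated pointer types do not change the underlying
    +  // object: &t[1] leaves nine 40-byte elements (360 bytes) in every mode.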
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 0);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 1);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 2);
    +  
    +  // CIR: cir.const #cir.int<360>
    +  // LLVM: store i32 360
    +  // OGCG: store i32 360
    +  gi = __builtin_object_size((char*)((short*)(&t[1])), 3);
    +
    +  // CIR: cir.const #cir.int<356>
    +  // LLVM: store i32 356
    +  // OGCG: store i32 356
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 0);
    +  
    +  // CIR: cir.const #cir.int<36>
    +  // LLVM: store i32 36
    +  // OGCG: store i32 36
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 1);
    +  
    +  // CIR: cir.const #cir.int<356>
    +  // LLVM: store i32 356
    +  // OGCG: store i32 356
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 2);
    +  
    +  // CIR: cir.const #cir.int<36>
    +  // LLVM: store i32 36
    +  // OGCG: store i32 36
    +  gi = __builtin_object_size((char*)((short*)(&t[1].v[1])), 3);
    +}
    +
    +struct DynStructVar {
    +  char fst[16];
    +  char snd[];
    +};
    +
    +struct DynStruct0 {
    +  char fst[16];
    +  char snd[0];
    +};
    +
    +struct DynStruct1 {
    +  char fst[16];
    +  char snd[1];
    +};
    +
    +struct StaticStruct {
    +  char fst[16];
    +  char snd[2];
    +};
    +
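    +// At the default strictness any trailing array is treated as potentially
    +// flexible, so modes 0-2 must use the intrinsic. Only mode 3 folds: a
    +// flexible array member and a [0] array fold to 0, while [1] and [2] fold
    +// to their declared sizes.
    +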
    +// CIR-LABEL: @test29
    +// LLVM-LABEL: define {{.*}} void @test29
    +// OGCG-LABEL: define {{.*}} void @test29
    +void test29(struct DynStructVar *dv, struct DynStruct0 *d0,
    +            struct DynStruct1 *d1, struct StaticStruct *ss) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(dv->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(dv->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(d0->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(d0->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(d1->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(d1->snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(ss->snd, 2);
    +  
    +  // CIR: cir.const #cir.int<2>
    +  // LLVM: store i32 2
    +  // OGCG: store i32 2
    +  gi = __builtin_object_size(ss->snd, 3);
    +}
    +
    +// CIR-LABEL: @test30
    +// LLVM-LABEL: define {{.*}} void @test30
    +// OGCG-LABEL: define {{.*}} void @test30
    +void test30(void) {
    +  struct { struct DynStruct1 fst, snd; } *nested;
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->fst.snd, 0);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->fst.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(nested->fst.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->fst.snd, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(nested->snd.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(nested->snd.snd, 3);
    +
    +  union { struct DynStruct1 d1; char c[1]; } *u;
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->c, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->c, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(u->c, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(u->c, 3);
    +
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(u->d1.snd, 2);
    +  
    +  // CIR: cir.const #cir.int<1>
    +  // LLVM: store i32 1
    +  // OGCG: store i32 1
    +  gi = __builtin_object_size(u->d1.snd, 3);
    +}
    +
    +// CIR-LABEL: @test32
    +// LLVM-LABEL: define {{.*}} i64 @test32
    +// OGCG-LABEL: define {{.*}} i64 @test32
    +static struct DynStructVar D32 = {
    +  .fst = {},
    +  .snd = { 0, 1, 2, },
    +};
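    +// Three elements of the flexible array member are initialized, so the
    +// object is 16 + 3 = 19 bytes.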
    +unsigned long test32(void) {
    +  // CIR: cir.const #cir.int<19>
    +  // LLVM: store i64 19
    +  // OGCG: ret i64 19
    +  return __builtin_object_size(&D32, 1);
    +}
    +
    +// CIR-LABEL: @test33
    +// LLVM-LABEL: define {{.*}} i64 @test33
    +// OGCG-LABEL: define {{.*}} i64 @test33
    +static struct DynStructVar D33 = {
    +  .fst = {},
    +  .snd = {},
    +};
    +unsigned long test33(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&D33, 1);
    +}
    +
    +// CIR-LABEL: @test34
    +// LLVM-LABEL: define {{.*}} i64 @test34
    +// OGCG-LABEL: define {{.*}} i64 @test34
    +static struct DynStructVar D34 = {
    +  .fst = {},
    +};
    +unsigned long test34(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&D34, 1);
    +}
    +
    +// CIR-LABEL: @test35
    +// LLVM-LABEL: define {{.*}} i64 @test35
    +// OGCG-LABEL: define {{.*}} i64 @test35
    +unsigned long test35(void) {
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i64 16
    +  // OGCG: ret i64 16
    +  return __builtin_object_size(&(struct DynStructVar){}, 1);
    +}
    +
    +// CIR-LABEL: @test37
    +// LLVM-LABEL: define {{.*}} i64 @test37
    +// OGCG-LABEL: define {{.*}} i64 @test37
    +struct Z { struct A { int x, y[]; } z; int a; int b[]; };
    +static struct Z my_z = { .b = {1,2,3} };
    +unsigned long test37(void) {
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i64 4
    +  // OGCG: ret i64 4
    +  return __builtin_object_size(&my_z.z, 1);
    +}
    +
    +// CIR-LABEL: @PR30346
    +// LLVM-LABEL: define {{.*}} void @PR30346
    +// OGCG-LABEL: define {{.*}} void @PR30346
    +void PR30346(void) {
    +  struct sa_family_t {};
    +  struct sockaddr {
    +    struct sa_family_t sa_family;
    +    char sa_data[14];
    +  };
    +
    +  struct sockaddr *sa;
    +  
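    +  // sa_data is a trailing array, so modes 0-2 keep the intrinsic; mode 3
    +  // folds to its declared 14 bytes.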
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1
    +  gi = __builtin_object_size(sa->sa_data, 2);
    +  
    +  // CIR: cir.const #cir.int<14>
    +  // LLVM: store i32 14
    +  // OGCG: store i32 14
    +  gi = __builtin_object_size(sa->sa_data, 3);
    +}
    +
    +extern char incomplete_char_array[];
    +
    +// CIR-LABEL: @incomplete_and_function_types
    +// LLVM-LABEL: define {{.*}} void @incomplete_and_function_types
    +// OGCG-LABEL: define {{.*}} void @incomplete_and_function_types
    +void incomplete_and_function_types(void) {
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 0);
    +  
    +  // CIR: cir.objsize max nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 1);
    +  
    +  // CIR: cir.objsize min nullunknown {{.*}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0
    +  // OGCG: call i64 @llvm.objectsize.i64.p0
    +  gi = __builtin_object_size(incomplete_char_array, 2);
    +  
    +  // CIR: cir.const #cir.int<0>
    +  // LLVM: store i32 0
    +  // OGCG: store i32 0
    +  gi = __builtin_object_size(incomplete_char_array, 3);
    +}
    +
    +// CIR-LABEL: @deeply_nested
    +// LLVM-LABEL: define {{.*}} void @deeply_nested
    +// OGCG-LABEL: define {{.*}} void @deeply_nested
    +void deeply_nested(void) {
    +  struct {
    +    struct {
    +      struct {
    +        struct {
    +          int e[2];
    +          char f;
    +        } d[2];
    +      } c[2];
    +    } b[2];
    +  } *a;
    +
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 1);
    +  
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 3);
    +}
    diff --git a/clang/test/CIR/CodeGen/object-size.cpp b/clang/test/CIR/CodeGen/object-size.cpp
    new file mode 100644
    index 0000000000000..b60e24594388d
    --- /dev/null
    +++ b/clang/test/CIR/CodeGen/object-size.cpp
    @@ -0,0 +1,108 @@
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
    +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
    +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
    +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
    +
    +// C++-specific tests for __builtin_object_size
    +
    +int gi;
    +
    +// CIR-LABEL: @_Z5test1v
    +// LLVM-LABEL: define{{.*}} void @_Z5test1v()
    +// OGCG-LABEL: define{{.*}} void @_Z5test1v()
    +void test1() {
    +  // Guaranteeing that our cast removal logic doesn't break more interesting
    +  // cases.
    +  struct A { int a; };
    +  struct B { int b; };
    +  struct C: public A, public B {};
    +
    +  C c;
    +
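    +  // C is 8 bytes: the A subobject lives at offset 0 and the B subobject at
    +  // offset 4, so pointers to C, A, or their char* casts see 8 bytes while a
    +  // pointer to the B subobject sees only the trailing 4.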
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size(&c, 0);
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((A*)&c, 0);
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size((B*)&c, 0);
    +
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((char*)&c, 0);
    +  // CIR: cir.const #cir.int<8>
    +  // LLVM: store i32 8
    +  // OGCG: store i32 8
    +  gi = __builtin_object_size((char*)(A*)&c, 0);
    +  // CIR: cir.const #cir.int<4>
    +  // LLVM: store i32 4
    +  // OGCG: store i32 4
    +  gi = __builtin_object_size((char*)(B*)&c, 0);
    +}
    +
    +// CIR-LABEL: @_Z5test2v()
    +// LLVM-LABEL: define{{.*}} void @_Z5test2v()
    +// OGCG-LABEL: define{{.*}} void @_Z5test2v()
    +void test2() {
    +  struct A { char buf[16]; };
    +  struct B : A {};
    +  struct C { int i; B bs[1]; } *c;
    +
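    +  // bs is a trailing one-element array, so under the default strictness
    +  // &c->bs[0] may address a larger allocation: modes 0-2 use the intrinsic
    +  // and only mode 3 folds to sizeof(B) == 16.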
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 0);
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 1);
    +  // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0], 2);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0], 3);
    +
    +  // NYI: DerivedToBase cast
    +  // gi = __builtin_object_size((A*)&c->bs[0], 0);
    +
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size((A*)&c->bs[0], 1);
    +
    +  // NYI: DerivedToBase cast 
    +  // gi = __builtin_object_size((A*)&c->bs[0], 2);
    +
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size((A*)&c->bs[0], 3);
    +
    +  // CIR: cir.objsize max nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 0);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 1);
    +  // CIR: cir.objsize min nullunknown %{{.+}} : !cir.ptr -> !u64i
    +  // LLVM: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  // OGCG: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false)
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 2);
    +  // CIR: cir.const #cir.int<16>
    +  // LLVM: store i32 16
    +  // OGCG: store i32 16
    +  gi = __builtin_object_size(&c->bs[0].buf[0], 3);
    +}
    diff --git a/clang/test/CIR/CodeGen/paren-init-list.cpp b/clang/test/CIR/CodeGen/paren-init-list.cpp
    index 0efa36352899e..a5676e2b31667 100644
    --- a/clang/test/CIR/CodeGen/paren-init-list.cpp
    +++ b/clang/test/CIR/CodeGen/paren-init-list.cpp
    @@ -13,18 +13,11 @@ struct CompleteS {
     void cxx_paren_list_init_expr() { CompleteS a(1, 'a'); }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[ELEM_0_VAL:.*]] = cir.const #cir.int<1> : !s32i
    -// CIR: cir.store{{.*}} %[[ELEM_0_VAL]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr
    -// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[ELEM_1_VAL:.*]] = cir.const #cir.int<97> : !s8i
    -// CIR: cir.store{{.*}} %[[ELEM_1_VAL]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<97> : !s8i}> : !rec_CompleteS
    +// CIR: cir.store{{.*}} %[[CONST]], %[[A_ADDR]]
     
     // LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    -// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM: store i8 97, ptr %[[ELEM_1_PTR]], align 4
    +// LLVM: store %struct.CompleteS { i32 1, i8 97 }, ptr %[[A_ADDR]], align 4
     
     // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4
     // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[A_ADDR]], ptr align 4 @__const._Z24cxx_paren_list_init_exprv.a, i64 8, i1 false)
    diff --git a/clang/test/CIR/CodeGen/statement-exprs.c b/clang/test/CIR/CodeGen/statement-exprs.c
    index c784ec9eda7d8..f917334ade829 100644
    --- a/clang/test/CIR/CodeGen/statement-exprs.c
    +++ b/clang/test/CIR/CodeGen/statement-exprs.c
    @@ -6,7 +6,7 @@
     // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
     
     int f19(void) {
    -  return ({ 3;;4;; });
    +  return ({ 3;;4; });
     }
     
     // CIR: cir.func dso_local @f19() -> !s32i
    @@ -42,6 +42,16 @@ int f19(void) {
     // OGCG:   %[[TMP_VAL:.+]] = load i32, ptr %[[TMP]]
     // OGCG:   ret i32 %[[TMP_VAL]]
     
    +// PR166036: The trailing NullStmt should give the statement expression void type.
    +void f20(void) {
    +  return ({ 3;;4;; });
    +}
    +
    +// CIR-LABEL: cir.func dso_local @f20() {{[^-]*}}
    +// CIR: cir.return {{[^%]*}}
    +
    +// LLVM-LABEL: define{{.*}} void @f20
    +// LLVM: ret void
     
     int nested(void) {
       ({123;});
    @@ -223,9 +233,8 @@ int test3() { return ({ struct S s = {1}; s; }).x; }
     // CIR:     %[[TMP:.+]] = cir.alloca !rec_S, !cir.ptr, ["tmp"]
     // CIR:     cir.scope {
     // CIR:       %[[S:.+]] = cir.alloca !rec_S, !cir.ptr, ["s", init]
    -// CIR:       %[[GEP_X_S:.+]] = cir.get_member %[[S]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    -// CIR:       %[[C1:.+]] = cir.const #cir.int<1> : !s32i
    -// CIR:       cir.store {{.*}} %[[C1]], %[[GEP_X_S]] : !s32i, !cir.ptr
    +// CIR:       %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i}> : !rec_S
    +// CIR:       cir.store{{.*}} %[[CONST]], %[[S]] : !rec_S, !cir.ptr
     // CIR:       cir.copy %[[S]] to %[[REF_TMP0]] : !cir.ptr
     // CIR:     }
     // CIR:     %[[GEP_X_TMP:.+]] = cir.get_member %[[REF_TMP0]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    @@ -244,8 +253,7 @@ int test3() { return ({ struct S s = {1}; s; }).x; }
     // LLVM: [[LBL5]]:
     // LLVM:     br label %[[LBL6:.+]]
     // LLVM: [[LBL6]]:
    -// LLVM:     %[[GEP_S:.+]] = getelementptr %struct.S, ptr %[[VAR3]], i32 0, i32 0
    -// LLVM:     store i32 1, ptr %[[GEP_S]]
    +// LLVM:     store %struct.S { i32 1 }, ptr %[[VAR3]]
     // LLVM:     call void @llvm.memcpy.p0.p0.i32(ptr %[[VAR1]], ptr %[[VAR3]], i32 4, i1 false)
     // LLVM:     br label %[[LBL8:.+]]
     // LLVM: [[LBL8]]:
    diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp
    index 79886190616b9..8f146684ffb10 100644
    --- a/clang/test/CIR/CodeGen/struct-init.cpp
    +++ b/clang/test/CIR/CodeGen/struct-init.cpp
    @@ -65,41 +65,16 @@ void init() {
     // CIR: cir.func{{.*}} @_Z4initv()
     // CIR:   %[[S1:.*]] = cir.alloca !rec_S, !cir.ptr, ["s1", init]
     // CIR:   %[[S2:.*]] = cir.alloca !rec_S, !cir.ptr, ["s2", init]
    -// CIR:   %[[S1_A:.*]] = cir.get_member %[[S1]][0] {name = "a"}
    -// CIR:   %[[ONE:.*]] = cir.const #cir.int<1>
    -// CIR:   cir.store{{.*}} %[[ONE]], %[[S1_A]]
    -// CIR:   %[[S1_B:.*]] = cir.get_member %[[S1]][1] {name = "b"}
    -// CIR:   %[[TWO:.*]] = cir.const #cir.int<2>
    -// CIR:   cir.store{{.*}} %[[TWO]], %[[S1_B]]
    -// CIR:   %[[S1_C:.*]] = cir.get_member %[[S1]][2] {name = "c"}
    -// CIR:   %[[THREE:.*]] = cir.const #cir.int<3>
    -// CIR:   cir.store{{.*}} %[[THREE]], %[[S1_C]]
    -// CIR:   %[[S2_A:.*]] = cir.get_member %[[S2]][0] {name = "a"}
    -// CIR:   %[[FOUR:.*]] = cir.const #cir.int<4>
    -// CIR:   cir.store{{.*}} %[[FOUR]], %[[S2_A]]
    -// CIR:   %[[S2_B:.*]] = cir.get_member %[[S2]][1] {name = "b"}
    -// CIR:   %[[FIVE:.*]] = cir.const #cir.int<5>
    -// CIR:   cir.store{{.*}} %[[FIVE]], %[[S2_B]]
    -// CIR:   %[[S2_C:.*]] = cir.get_member %[[S2]][2] {name = "c"}
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0>
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[S2_C]]
    -// CIR:   cir.return
    +// CIR:   %[[CONST_1:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i}> : !rec_S
    +// CIR:   cir.store{{.*}} %[[CONST_1]], %[[S1]]
    +// CIR:   %[[CONST_2:.*]] = cir.const #cir.const_record<{#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<0> : !s32i}> : !rec_S
    +// CIR:   cir.store{{.*}} %[[CONST_2]], %[[S2]]
     
     // LLVM: define{{.*}} void @_Z4initv()
     // LLVM:   %[[S1:.*]] = alloca %struct.S
     // LLVM:   %[[S2:.*]] = alloca %struct.S
    -// LLVM:   %[[S1_A:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 0
    -// LLVM:   store i32 1, ptr %[[S1_A]]
    -// LLVM:   %[[S1_B:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 1
    -// LLVM:   store i32 2, ptr %[[S1_B]]
    -// LLVM:   %[[S1_C:.*]] = getelementptr %struct.S, ptr %[[S1]], i32 0, i32 2
    -// LLVM:   store i32 3, ptr %[[S1_C]]
    -// LLVM:   %[[S2_A:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 0
    -// LLVM:   store i32 4, ptr %[[S2_A]]
    -// LLVM:   %[[S2_B:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 1
    -// LLVM:   store i32 5, ptr %[[S2_B]]
    -// LLVM:   %[[S2_C:.*]] = getelementptr %struct.S, ptr %[[S2]], i32 0, i32 2
    -// LLVM:   store i32 0, ptr %[[S2_C]]
    +// LLVM:   store %struct.S { i32 1, i32 2, i32 3 }, ptr %[[S1]], align 4
    +// LLVM:   store %struct.S { i32 4, i32 5, i32 0 }, ptr %[[S2]], align 4
     
     // OGCG: @__const._Z4initv.s1 = private unnamed_addr constant %struct.S { i32 1, i32 2, i32 3 }
     // OGCG: @__const._Z4initv.s2 = private unnamed_addr constant %struct.S { i32 4, i32 5, i32 0 }
    diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp
    index c8db71498e477..c15e7e7c57b9f 100644
    --- a/clang/test/CIR/CodeGen/struct.cpp
    +++ b/clang/test/CIR/CodeGen/struct.cpp
    @@ -107,21 +107,14 @@ void paren_expr() {
     // CIR: cir.func{{.*}} @_Z10paren_exprv()
     // CIR:   %[[A_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, ["a", init]
     // CIR:   %[[B_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, ["b", init]
    -// CIR:   %[[X_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "x"} : !cir.ptr -> !cir.ptr
    -// CIR:   %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR:   cir.store{{.*}} %[[CONST_0]], %[[X_ELEM_PTR]] : !s32i, !cir.ptr
    -// CIR:   %[[Y_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "y"} : !cir.ptr -> !cir.ptr
    -// CIR:   %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR:   cir.store{{.*}} %[[CONST_0]], %[[Y_ELEM_PTR]] : !s32i, !cir.ptr
    +// CIR:   %[[CONST:.*]] = cir.const #cir.zero : !rec_Point
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_Point, !cir.ptr
     // CIR:   cir.call @_ZZ10paren_exprvEN5PointC1ERKS_(%[[B_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
     
     // LLVM: define{{.*}} void @_Z10paren_exprv()
     // LLVM:   %[[A_ADDR:.*]] = alloca %struct.Point, i64 1, align 4
     // LLVM:   %[[B_ADDR:.*]] = alloca %struct.Point, i64 1, align 4
    -// LLVM:   %[[X_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM:   store i32 0, ptr %[[X_ELEM_PTR]], align 4
    -// LLVM:   %[[Y_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM:   store i32 0, ptr %[[Y_ELEM_PTR]], align 4
    +// LLVM:   store %struct.Point zeroinitializer, ptr %[[A_ADDR]], align 4
     // LLVM:   call void @_ZZ10paren_exprvEN5PointC1ERKS_(ptr %[[B_ADDR]], ptr %[[A_ADDR]])
     
     // OGCG: define{{.*}} void @_Z10paren_exprv()
    @@ -265,16 +258,11 @@ void bin_comma() {
     
     // CIR: cir.func{{.*}} @_Z9bin_commav()
     // CIR:   %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR:   %[[TMP_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp.ensured"]
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[TMP_ADDR]] : !rec_CompleteS, !cir.ptr
    -// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS
    -// CIR:   cir.store{{.*}} %[[ZERO]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
    +// CIR:   %[[CONST:.*]] = cir.const #cir.zero : !rec_CompleteS
    +// CIR:   cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
     
     // LLVM: define{{.*}} void @_Z9bin_commav()
     // LLVM:   %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM:   %[[TMP_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM:   store %struct.CompleteS zeroinitializer, ptr %[[TMP_ADDR]], align 4
     // LLVM:   store %struct.CompleteS zeroinitializer, ptr %[[A_ADDR]], align 4
     
     // OGCG: define{{.*}} void @_Z9bin_commav()
    @@ -284,20 +272,13 @@ void bin_comma() {
     void compound_literal_expr() { CompleteS a = (CompleteS){}; }
     
     // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    -// CIR: %[[A_ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
    -// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_0_PTR]] : !s32i, !cir.ptr
    -// CIR: %[[A_ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s8i
    -// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[CONST:.*]] = cir.const #cir.zero : !rec_CompleteS
    +// CIR: cir.store{{.*}} %[[CONST]], %[[A_ADDR]] : !rec_CompleteS, !cir.ptr
     
     // TODO(cir): zero-initialize the padding
     
     // LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    -// LLVM: %[[A_ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0
    -// LLVM: store i32 0, ptr %[[A_ELEM_0_PTR]], align 4
    -// LLVM: %[[A_ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1
    -// LLVM: store i8 0, ptr %[[A_ELEM_1_PTR]], align 4
    +// LLVM: store %struct.CompleteS zeroinitializer, ptr %[[A_ADDR]], align 4
     
     // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4
     // OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false)
    @@ -344,3 +325,47 @@ void struct_with_const_member_expr() {
     // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0
     // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4
     // OGCG: store i32 0, ptr %[[A_ADDR]], align 4
    +
    +void function_arg_with_default_value(CompleteS a = {1, 2}) {}
    +
    +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init]
    +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr
    +
    +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4
    +
    +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4
    +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4
    +
    +void calling_function_with_default_values() {
    +  function_arg_with_default_value();
    +}
    +
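    +// The default argument is evaluated at each call site; the caller
    +// materializes the {1, 2} aggregate in a local temporary ("agg.tmp0").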
    +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp0"]
    +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr
    +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
    +// CIR: %[[CONST_2_I8:.*]] = cir.cast integral %[[CONST_2]] : !s32i -> !s8i
    +// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr
    +// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr, !rec_CompleteS
    +// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> ()
    +
    +// TODO(CIR): the difference between the CIR-generated LLVM and the OGCG output is due to the lack of calling convention lowering.
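    +// (OGCG coerces the 8-byte CompleteS into a single i64 argument, while the
    +// CIR pipeline still passes the record value directly.)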
    +
    +// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
    +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
    +// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    +// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
    +// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4
    +// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4
    +// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]])
    +
    +// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4
    +// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
    +// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4
    +// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
    +// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4
    +// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4
    +// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 %[[TMP_AGG]])
    diff --git a/clang/test/CIR/CodeGen/switch.cpp b/clang/test/CIR/CodeGen/switch.cpp
    index e13aa8f4f4953..3824be0d08c2f 100644
    --- a/clang/test/CIR/CodeGen/switch.cpp
    +++ b/clang/test/CIR/CodeGen/switch.cpp
    @@ -1183,3 +1183,90 @@ int nested_switch(int a) {
     // OGCG: [[IFEND10]]:
     // OGCG:   br label %[[EPILOG]]
     // OGCG: [[EPILOG]]:
    +
    +int sw_return_multi_cases(int x) {
    +  switch (x) {
    +  case 0:
    +    return 0;
    +  case 1:
    +    return 1;
    +  case 2:
    +    return 2;
    +  default:
    +    return -1;
    +  }
    +}
    +
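    +// Every case returns directly, so ClangIR emits a cir.return inside each
    +// cir.case region; OGCG instead branches each case to a shared return block.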
    +// CIR-LABEL: cir.func{{.*}} @_Z21sw_return_multi_casesi
    +// CIR:       cir.switch (%{{.*}} : !s32i) {
    +// CIR-NEXT:  cir.case(equal, [#cir.int<0> : !s32i]) {
    +// CIR:         %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    +// CIR:         cir.store{{.*}} %[[ZERO]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET0:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET0]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(equal, [#cir.int<1> : !s32i]) {
    +// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:         cir.store{{.*}} %[[ONE]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET1:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET1]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(equal, [#cir.int<2> : !s32i]) {
    +// CIR:         %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
    +// CIR:         cir.store{{.*}} %[[TWO]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RET2:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RET2]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.case(default, []) {
    +// CIR:         %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    +// CIR:         %[[NEG:.*]] = cir.unary(minus, %[[ONE]]) {{.*}} : !s32i, !s32i
    +// CIR:         cir.store{{.*}} %[[NEG]], %{{.*}} : !s32i, !cir.ptr
    +// CIR:         %[[RETDEF:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
    +// CIR-NEXT:    cir.return %[[RETDEF]] : !s32i
    +// CIR-NEXT:  }
    +// CIR-NEXT:  cir.yield
    +
    +// LLVM-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
    +// LLVM:   switch i32 %{{.*}}, label %[[DEFAULT:.*]] [
    +// LLVM-DAG:   i32 0, label %[[CASE0:.*]]
    +// LLVM-DAG:   i32 1, label %[[CASE1:.*]]
    +// LLVM-DAG:   i32 2, label %[[CASE2:.*]]
    +// LLVM:   ]
    +// LLVM: [[CASE0]]:
    +// LLVM:   store i32 0, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[CASE1]]:
    +// LLVM:   store i32 1, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[CASE2]]:
    +// LLVM:   store i32 2, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +// LLVM: [[DEFAULT]]:
    +// LLVM:   store i32 -1, ptr %{{.*}}, align 4
    +// LLVM:   %{{.*}} = load i32, ptr %{{.*}}, align 4
    +// LLVM:   ret i32 %{{.*}}
    +
    +// OGCG-LABEL: define{{.*}} i32 @_Z21sw_return_multi_casesi
    +// OGCG: entry:
    +// OGCG:   %[[RETVAL:.*]] = alloca i32, align 4
    +// OGCG:   %[[X_ADDR:.*]] = alloca i32, align 4
    +// OGCG:   %[[X_VAL:.*]] = load i32, ptr %[[X_ADDR]], align 4
    +// OGCG:   switch i32 %[[X_VAL]], label %[[DEFAULT:.*]] [
    +// OGCG-DAG:   i32 0, label %[[SW0:.*]]
    +// OGCG-DAG:   i32 1, label %[[SW1:.*]]
    +// OGCG-DAG:   i32 2, label %[[SW2:.*]]
    +// OGCG:   ]
    +// OGCG: [[SW0]]:
    +// OGCG:   br label %[[RETURN:.*]]
    +// OGCG: [[SW1]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[SW2]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[DEFAULT]]:
    +// OGCG:   br label %[[RETURN]]
    +// OGCG: [[RETURN]]:
    +// OGCG:   %[[RETVAL_LOAD:.*]] = load i32, ptr %[[RETVAL]], align 4
    +// OGCG:   ret i32 %[[RETVAL_LOAD]]
    diff --git a/clang/test/CIR/CodeGen/variable-decomposition.cpp b/clang/test/CIR/CodeGen/variable-decomposition.cpp
    index ba59109ab625f..f0e19263cd6db 100644
    --- a/clang/test/CIR/CodeGen/variable-decomposition.cpp
    +++ b/clang/test/CIR/CodeGen/variable-decomposition.cpp
    @@ -19,12 +19,8 @@ float function() {
     // CIR-LABEL: cir.func dso_local @_Z8functionv() -> !cir.float
     // CIR:  %[[RETVAL:.+]] = cir.alloca !cir.float, !cir.ptr, ["__retval"]
     // CIR:  %[[STRUCT:.+]] = cir.alloca !rec_some_struct, !cir.ptr, ["", init]
    -// CIR:  %[[MEMBER_A:.+]] = cir.get_member %[[STRUCT]][0] {name = "a"} : !cir.ptr -> !cir.ptr
    -// CIR:  %[[CONST_1:.+]] = cir.const #cir.int<1> : !s32i
    -// CIR:  cir.store{{.*}} %[[CONST_1]], %[[MEMBER_A]]
    -// CIR:  %[[MEMBER_B:.+]] = cir.get_member %[[STRUCT]][1] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CIR:  %[[TWO_FP:.+]] = cir.const #cir.fp<2.000000e+00> : !cir.float
    -// CIR:  cir.store{{.*}} %[[TWO_FP]], %[[MEMBER_B]]
    +// CIR:  %[[CONST:.+]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.fp<2.000000e+00> : !cir.float}> : !rec_some_struct
    +// CIR:  cir.store{{.*}} %[[CONST]], %[[STRUCT]]
     // CIR:  %[[MEMBER_A:.+]] = cir.get_member %[[STRUCT]][0] {name = "a"} : !cir.ptr -> !cir.ptr
     // CIR:  %[[LOAD_A:.+]] = cir.load align(4) %[[MEMBER_A]] : !cir.ptr, !s32i
     // CIR:  %[[CAST_A:.+]] = cir.cast int_to_float %[[LOAD_A]] : !s32i -> !cir.float
    @@ -38,10 +34,7 @@ float function() {
     // LLVM-LABEL: define dso_local float @_Z8functionv()
     // LLVM:  %[[RETVAL:.+]] = alloca float, i64 1
     // LLVM:  %[[STRUCT:.+]] = alloca %struct.some_struct, i64 1
    -// LLVM:  %[[GEP_A:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 0
    -// LLVM:  store i32 1, ptr %[[GEP_A]]
    -// LLVM:  %[[GEP_B:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 1
    -// LLVM:  store float 2.000000e+00, ptr %[[GEP_B]]
    +// LLVM:  store %struct.some_struct { i32 1, float 2.000000e+00 }, ptr %[[STRUCT]]
     // LLVM:  %[[GEP_A:.+]] = getelementptr %struct.some_struct, ptr %[[STRUCT]], i32 0, i32 0
     // LLVM:  %[[LOAD_A:.+]] = load i32, ptr %[[GEP_A]]
     // LLVM:  %[[CAST_A:.+]] = sitofp i32 %[[LOAD_A]] to float
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    index 53eba7bafb312..d289336ccaf8c 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
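    +// (The CIR output is written to %t.cir and checked from that file, so it
    +// remains available for inspection if FileCheck fails.)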
     
     struct DefaultOperators {
       int i;
    @@ -24,21 +25,8 @@ void acc_combined() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_combined() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-//
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-//
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
-// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_combined() {
 // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
-//
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    index 63d69529bee53..f65cd8aa414bd 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
    @@ -1,4 +1,6 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
    +
 template<typename T>
     void acc_combined() {
       T someVar;
    @@ -137,24 +139,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!cir.float>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -191,25 +177,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -246,25 +215,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -308,25 +260,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -370,25 +305,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -435,24 +353,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!cir.float>, !cir.ptr<!cir.ptr<!cir.float>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!cir.float>>, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!cir.float>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    index 78b43ddc8f182..ca6f0ea60dc34 100644
    --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
 template<typename T>
     void acc_combined() {
    @@ -41,7 +42,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(max:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(min:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_combined() {
     #pragma acc parallel loop reduction(&:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
-// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(max:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(min:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_combined() {
     #pragma acc parallel loop reduction(&:someVarArr)
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
-// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
-// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
-// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
-// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.do {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
-// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
-// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CHECK-NEXT: cir.yield
-// CHECK-NEXT: } while {
-// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
-// CHECK-NEXT: cir.condition(%[[CMP]])
-// CHECK-NEXT: }
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    index 6ec1c43ebbe45..cba01cab6d341 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
    @@ -22,22 +23,10 @@ void acc_compute() {
     #pragma acc parallel reduction(+:someVar)
 // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
-// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
-// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
-// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
-// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
-// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
-// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
-// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
+// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct
+// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -81,22 +70,10 @@ void acc_compute() {
     #pragma acc parallel reduction(*:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, {{.*}}, #cir.fp<1{{.*}}> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -140,22 +117,10 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, {{.*}}, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -241,22 +206,10 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, {{.*}}, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -342,16 +295,10 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true, {{.*}}}> : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -383,16 +330,10 @@ void acc_compute() {
     #pragma acc parallel reduction(|:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -424,16 +365,10 @@ void acc_compute() {
     #pragma acc parallel reduction(^:someVarNoFloats)
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!rec_anon_struct1>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct1
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct1, !cir.ptr<!rec_anon_struct1>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -465,22 +400,10 @@ void acc_compute() {
     #pragma acc parallel reduction(&&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, {{.*}}, #cir.fp<1{{.*}}> : !cir.double, #true, {{.*}}}> : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -565,22 +488,10 @@ void acc_compute() {
     #pragma acc parallel reduction(||:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
    -// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init"]
    +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[ALLOCA]] : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!rec_anon_struct>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_anon_struct
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[BITCAST]] : !rec_anon_struct, !cir.ptr<!rec_anon_struct>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -667,24 +578,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -752,96 +647,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -910,96 +717,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1108,96 +827,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1307,66 +938,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1422,24 +995,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1495,24 +1052,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1568,96 +1109,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1767,24 +1220,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    -// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[TEMP_LOAD]]
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
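For context, a minimal sketch of the kind of source that exercises these recipes, assuming the same DefaultOperators/DefaultOperatorsNoFloats records and acc_compute entry point used in the test below (the exact clause list in the real test file is not reproduced here):

    struct DefaultOperators {
      int i;
      unsigned u;
      float f;
      double d;
      bool b;
    };
    struct DefaultOperatorsNoFloats {
      int i;
      unsigned u;
      bool b;
    };

    void acc_compute() {
      DefaultOperators scalar, arr[5];
      DefaultOperatorsNoFloats nf;
      // Hypothetical clause combination: each reduction operator on each
      // variable produces one acc.reduction.recipe; the init regions of those
      // recipes are what the hunks in this patch simplify.
    #pragma acc parallel reduction(+:scalar) reduction(&&:arr) reduction(&:nf)
      ;
    }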
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    index 7bd6f67a9e19e..43b0791250835 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
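Every hunk below follows one pattern: the recipe's init region used to initialize the reduction temporary member-by-member (or element-by-element through an "arrayinit.temp" do/while loop), and now materializes a single aggregate constant (#cir.zero, #cir.const_record, or #cir.const_array) and stores it once. In C++ terms the two shapes are roughly equivalent:

    // Old shape: one store per member.
    DefaultOperators a;
    a.i = 1; a.u = 1u; a.f = 1.0f; a.d = 1.0; a.b = true;

    // New shape: a single constant-aggregate store.
    DefaultOperators b = {1, 1u, 1.0f, 1.0, true};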
    @@ -24,21 +25,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    index 13c335b867044..cd4d2dcb9fa4b 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       float someVar;
    @@ -139,24 +140,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -193,25 +178,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -248,25 +216,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -310,25 +261,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -372,25 +306,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -438,24 +355,8 @@ void acc_compute() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    index 67378210ba83c..c1385ab830f00 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template<typename T>
     void acc_compute() {
    @@ -138,24 +139,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -192,25 +177,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -247,25 +215,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -309,25 +260,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -371,25 +305,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -436,24 +353,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr<!cir.array<!cir.float x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!cir.float x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!cir.float x 5>, !cir.ptr<!cir.array<!cir.float x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    index be7b12350360d..440f8f9f8fbf7 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       int someVar;
    @@ -40,7 +41,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -63,7 +64,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -86,7 +87,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -189,24 +190,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -243,25 +228,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -297,26 +265,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -359,26 +310,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -421,26 +355,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -477,24 +394,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -531,24 +432,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -585,25 +470,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -650,24 +518,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[INIT_VAL:.*]] = cir.const {{.*}} : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[INIT_VAL]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    index fb6984fcd0068..db1b18e3fb8b7 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template<typename T>
     void acc_compute() {
    @@ -41,7 +42,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr<!s32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr<!s32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr<!s32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!s32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr<!s32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARR:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARR]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr<!s32i>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array<!s32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr<!cir.array<!s32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!s32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 5> loc(#loc12)
    +// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!s32i x 5>, !cir.ptr<!cir.array<!s32i x 5>> loc(#loc12)
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    index 9b10a296e99f5..54784f35266d5 100644
    --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     void acc_compute() {
       unsigned int someVar;
    @@ -40,7 +41,7 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSj : !cir.ptr<!u32i> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -63,7 +64,7 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSj : !cir.ptr<!u32i> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -86,7 +87,7 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSj : !cir.ptr<!u32i> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!u32i>{{.*}})
    -// CHECK-NEXT: cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !u32i, !cir.ptr<!u32i>
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_compute() {
     #pragma acc parallel reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store {{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_compute() {
     #pragma acc parallel reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_compute() {
     #pragma acc parallel reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i, #cir.int<4294967295> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !u32i, !cir.ptr<!u32i>
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i, #cir.int<1> : !u32i]> : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -652,24 +520,8 @@ void acc_compute() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_j : !cir.ptr<!cir.array<!u32i x 5>> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!u32i>, !cir.ptr<!cir.ptr<!u32i>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!u32i>>, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!u32i>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!u32i x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!u32i x 5>, !cir.ptr<!cir.array<!u32i x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    index 11ebd7b4c26cb..a6f9e33bc25e0 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     struct DefaultOperators {
       int i;
    @@ -24,21 +25,8 @@ void acc_loop() {
     // CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -83,21 +71,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -142,21 +117,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -240,21 +202,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -338,15 +287,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -379,15 +321,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -420,15 +355,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS24DefaultOperatorsNoFloats : !cir.ptr<!rec_DefaultOperatorsNoFloats> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperatorsNoFloats>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperatorsNoFloats
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperatorsNoFloats, !cir.ptr<!rec_DefaultOperatorsNoFloats>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -461,21 +389,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -558,21 +473,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr<!rec_DefaultOperators> reduction_operator <lor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_DefaultOperators>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[ALLOCA]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[ALLOCA]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[ALLOCA]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[ALLOCA]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !rec_DefaultOperators
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !rec_DefaultOperators, !cir.ptr<!rec_DefaultOperators>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -656,37 +558,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <add> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperators>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -754,96 +627,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <mul> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -912,96 +697,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <max> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LEAST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LEAST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<-2147483648> : !s32i, #cir.int<0> : !u32i, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-1.7{{.*}}E+308> : !cir.double, #false}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1107,96 +804,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <min> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[LARGEST_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[LARGEST_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<1.7{{.*}}E+308> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<2147483647> : !s32i, #cir.int<4294967295> : !u32i, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<1.7{{.*}}E+308> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1302,66 +911,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <iand> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ALL_ONES_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ALL_ONES_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats, #cir.const_record<{#cir.int<-1> : !s32i, #cir.int<4294967295> : !u32i, #true}> : !rec_DefaultOperatorsNoFloats]> : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1417,32 +968,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <ior> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1498,31 +1025,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> reduction_operator <xor> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>> -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "b"} : !cir.ptr<!rec_DefaultOperatorsNoFloats> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr<!rec_DefaultOperatorsNoFloats>, !s64i) -> !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr<!cir.ptr<!rec_DefaultOperatorsNoFloats>>, !cir.ptr<!rec_DefaultOperatorsNoFloats>
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr<!rec_DefaultOperatorsNoFloats>, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!rec_DefaultOperatorsNoFloats x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array<!rec_DefaultOperatorsNoFloats x 5>, !cir.ptr<!cir.array<!rec_DefaultOperatorsNoFloats x 5>>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1578,96 +1082,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <land> init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[DECAY]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[DECAY]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[DECAY]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[DECAY]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -//
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[NEXT_ELT]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr<!s32i>
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[NEXT_ELT]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_U]] : !u32i, !cir.ptr<!u32i>
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[NEXT_ELT]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_F]] : !cir.float, !cir.ptr<!cir.float>
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[NEXT_ELT]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_D]] : !cir.double, !cir.ptr<!cir.double>
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[NEXT_ELT]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #true
    -// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
    -//
    +// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.const_array<[#cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators, #cir.const_record<{#cir.int<1> : !s32i, #cir.int<1> : !u32i, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.double, #true}> : !rec_DefaultOperators]> : !cir.array<!rec_DefaultOperators x 5>
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -1774,38 +1190,8 @@ void acc_loop() {
     // CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[TEMP_LOAD]][0] {name = "i"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_I]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_U:.*]] = cir.get_member %[[TEMP_LOAD]][1] {name = "u"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_U]] : !u32i, !cir.ptr
    -// CHECK-NEXT: %[[GET_F:.*]] = cir.get_member %[[TEMP_LOAD]][2] {name = "f"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_F]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[GET_D:.*]] = cir.get_member %[[TEMP_LOAD]][3] {name = "d"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.double
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_D]] : !cir.double, !cir.ptr
    -// CHECK-NEXT: %[[GET_B:.*]] = cir.get_member %[[TEMP_LOAD]][4] {name = "b"} : !cir.ptr -> !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #false
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[GET_B]] : !cir.bool, !cir.ptr
    -//
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
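Note: the recipe hunks above replace ClangIR's element-by-element array
initialization (array-to-pointer decay, ptr_stride, per-field stores) with a
single constant-aggregate store. A minimal sketch of source that produces the
mul and lor recipes shown here — the struct layout is reconstructed from the
{name = "i"/"u"/"f"/"d"/"b"} CHECK lines, and the loop bodies are placeholders,
not taken from this diff:

    struct DefaultOperators { int i; unsigned u; float f; double d; bool b; };
    DefaultOperators someVarArr[5];
    #pragma acc loop reduction(*:someVarArr)  // init: one #cir.const_array of all-ones records
    for (int n = 0; n < 5; ++n) ;
    #pragma acc loop reduction(||:someVarArr) // init: one #cir.zero store
    for (int n = 0; n < 5; ++n) ;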
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    index 57cc1afec2911..6e5af5c3ae322 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template
     void acc_loop() {
    @@ -138,24 +139,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -192,25 +177,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -247,25 +215,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float, #cir.fp<-3.4{{.*}}E+38> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -309,25 +260,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float, #cir.fp<3.4{{.*}}E+38> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -371,25 +305,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !cir.float, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float, #cir.fp<1{{.*}}> : !cir.float]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -436,24 +353,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_f : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.fp<0{{.*}}> : !cir.float
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !cir.float, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    index f60dff9385412..8baf77966efc1 100644
    --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
    @@ -1,4 +1,5 @@
    -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o %t.cir
    +// RUN: FileCheck --input-file=%t.cir %s
     
     template
     void acc_loop() {
    @@ -41,7 +42,7 @@ void acc_loop() {
     #pragma acc loop reduction(max:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -64,7 +65,7 @@ void acc_loop() {
     #pragma acc loop reduction(min:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -87,7 +88,7 @@ void acc_loop() {
     #pragma acc loop reduction(&:someVar)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSi : !cir.ptr reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
    -// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["openacc.reduction.init", init]
     // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
     // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[ALLOCA]] : !s32i, !cir.ptr
     // CHECK-NEXT: acc.yield
    @@ -190,24 +191,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -244,25 +229,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -298,26 +266,9 @@ void acc_loop() {
     #pragma acc loop reduction(max:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i, #cir.int<-2147483648> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -360,26 +311,9 @@ void acc_loop() {
     #pragma acc loop reduction(min:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i, #cir.int<2147483647> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -422,26 +356,9 @@ void acc_loop() {
     #pragma acc loop reduction(&:someVarArr)
     // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
    -// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -478,24 +395,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -532,24 +433,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]])
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -586,25 +471,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[ONE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[TWO_IDX:.*]] = cir.const #cir.int<2> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[TWO_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[THREE_IDX:.*]] = cir.const #cir.int<3> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[THREE_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[FOUR_IDX:.*]] = cir.const #cir.int<4> : !s64i
    -// CHECK-NEXT: %[[NEXT_ELT:.*]] = cir.ptr_stride %[[DECAY]], %[[FOUR_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
    -// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[NEXT_ELT]] : !s32i, !cir.ptr
    +// CHECK-NEXT: %[[CONST_ARRAY:.*]] = cir.const #cir.const_array<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[CONST_ARRAY]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
    @@ -651,24 +519,8 @@ void acc_loop() {
     // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_i : !cir.ptr> reduction_operator  init {
     // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
     // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
    -// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
    -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
    -// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride %[[DECAY]], %[[LAST_IDX]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.do {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
    -// CHECK-NEXT: cir.store {{.*}} %[[ZERO]], %[[TEMP_LOAD]] : !s32i, !cir.ptr
    -// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
    -// CHECK-NEXT: %[[NEXT_ITEM:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ONE]] : (!cir.ptr, !s64i) -> !cir.ptr
    -// CHECK-NEXT: cir.store {{.*}} %[[NEXT_ITEM]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
    -// CHECK-NEXT: cir.yield
    -// CHECK-NEXT: } while {
    -// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load {{.*}} %[[TEMP_ITR]] : !cir.ptr>, !cir.ptr
    -// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[TEMP_LOAD]], %[[END_ITR]]) : !cir.ptr, !cir.bool
    -// CHECK-NEXT: cir.condition(%[[CMP]]) 
    -// CHECK-NEXT: }
    +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.zero : !cir.array
    +// CHECK-NEXT: cir.store{{.*}} %[[ZERO]], %[[ALLOCA]] : !cir.array, !cir.ptr>
     // CHECK-NEXT: acc.yield
     //
     // CHECK-NEXT: } combiner {
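Note: the init constants these integer recipes materialize are the standard
identity elements of each reduction operator. As a compact recap (standard
algebra, not stated in the diff itself):

    // Identity value stored into each element of the int reduction array:
    //   +, |, ^, ||  ->  0           (now a single #cir.zero store)
    //   *, &&        ->  1           (#cir.const_array of ones)
    //   &            -> -1           (all bits set)
    //   max          -> INT_MIN (-2147483648)
    //   min          -> INT_MAX ( 2147483647)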
    diff --git a/clang/test/CIR/IR/objsize.cir b/clang/test/CIR/IR/objsize.cir
    new file mode 100644
    index 0000000000000..bc24551c446e6
    --- /dev/null
    +++ b/clang/test/CIR/IR/objsize.cir
    @@ -0,0 +1,89 @@
+// Test that the cir.objsize operation parses and prints correctly (round-trips)
+// with all combinations of its optional attributes.
    +
    +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
    +
+!u64i = !cir.int<u, 64>
    +!void = !cir.void
    +
    +module {
    +  cir.func @test_max(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max nullunknown %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min nullunknown %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +
    +  cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +    %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +    cir.return %0 : !u64i
    +  }
    +}
    +
    +// CHECK: cir.func @test_max(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max nullunknown %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_max_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize max nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_nullunknown(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min nullunknown %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
    +
    +// CHECK: cir.func @test_min_nullunknown_dynamic(%arg0: !cir.ptr) -> !u64i {
    +// CHECK:   %0 = cir.objsize min nullunknown dynamic %arg0 : !cir.ptr -> !u64i
    +// CHECK:   cir.return %0 : !u64i
    +// CHECK: }
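Note: cir.objsize models Clang's object-size builtins. A hedged mapping
sketch — the min/max correspondence to the builtin's type argument and the
dynamic/nullunknown spellings are my reading of the builtin semantics, not
something this test states:

    // Type 0 (and 1) request a maximum estimate; type 2 (and 3) a minimum.
    unsigned long max_size(void *p) { return __builtin_object_size(p, 0); }
    unsigned long min_size(void *p) { return __builtin_object_size(p, 2); }
    // "dynamic" presumably corresponds to __builtin_dynamic_object_size,
    // and "nullunknown" to treating a null pointer's size as unknown.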
    diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp
    index 40ad986b7fdfa..de4a77072b930 100644
    --- a/clang/test/CIR/Lowering/array.cpp
    +++ b/clang/test/CIR/Lowering/array.cpp
    @@ -60,24 +60,7 @@ void func2() {
     
     // CHECK: define{{.*}} void @_Z5func2v()
     // CHECK:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
    -// CHECK:   store i32 5, ptr %[[ARR_PTR]], align 4
    -// CHECK:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// CHECK:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store i32 0, ptr %[[CUR]], align 4
    -// CHECK:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [2 x i32] [i32 5, i32 0], ptr %[[ARR]], align 4
     // CHECK:   ret void
     
     void func3() {
    @@ -85,10 +68,7 @@ void func3() {
     }
     // CHECK: define{{.*}} void @_Z5func3v()
     // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
    -// CHECK:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
    -// CHECK:  store i32 5, ptr %[[ARR_PTR]], align 4
    -// CHECK:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
    -// CHECK:  store i32 6, ptr %[[ELE_1_PTR]], align 4
    +// CHECK:  store [2 x i32] [i32 5, i32 6], ptr %[[ARR_ALLOCA]], align 4
     
     void func4() {
       int arr[2][1] = {{5}, {6}};
    @@ -97,12 +77,7 @@ void func4() {
     // CHECK: define{{.*}} void @_Z5func4v()
     // CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
     // CHECK:  %[[INIT:.*]] = alloca i32, i64 1, align 4
    -// CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
    -// CHECK:  %[[ARR_0_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// CHECK:  store i32 5, ptr %[[ARR_0_0]], align 4
    -// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    -// CHECK:  store i32 6, ptr %[[ARR_1_0]], align 4
    +// CHECK:  store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] [i32 6]], ptr %[[ARR_ALLOCA]], align 4
     // CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
     // CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
     // CHECK:  %[[ARR_1_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
    @@ -115,25 +90,7 @@ void func5() {
     }
     // CHECK: define{{.*}} void @_Z5func5v()
     // CHECK:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
    -// CHECK:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
    -// CHECK:   store i32 5, ptr %[[ARR_0]], align 4
    -// CHECK:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
    -// CHECK:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
    -// CHECK:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [2 x [1 x i32]] {{\[}}[1 x i32] [i32 5], [1 x i32] zeroinitializer], ptr %[[ARR]], align 4
     // CHECK:   ret void
     
     void func6() {
    @@ -155,22 +112,7 @@ void func7() {
     }
     // CHECK: define{{.*}} void @_Z5func7v()
     // CHECK:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
    -// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
    -// CHECK:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
    -// CHECK:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
    -// CHECK:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
    -// CHECK:   br label %[[LOOP_BODY:.*]]
    -// CHECK: [[LOOP_NEXT:.*]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
    -// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
    -// CHECK: [[LOOP_BODY]]:
    -// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
    -// CHECK:   store ptr null, ptr %[[CUR]], align 8
    -// CHECK:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
    -// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
    -// CHECK:   br label %[[LOOP_NEXT:.*]]
    -// CHECK: [[LOOP_END]]:
    +// CHECK:   store [1 x ptr] zeroinitializer, ptr %[[ARR]], align 8
     // CHECK:   ret void
     
     void func8(int p[10]) {}
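Note: the lowering hunks above fold constant array initialization into one
aggregate store instead of a GEP-per-element (or loop-based) sequence. A
hedged sketch of the sources involved — func4's body appears in the hunk,
while the others are reconstructed from their CHECK'd store values:

    void func2() { int arr[2] = {5}; }       // store [2 x i32] [i32 5, i32 0]
    void func3() { int arr[2] = {5, 6}; }    // store [2 x i32] [i32 5, i32 6]
    void func5() { int arr[2][1] = {{5}}; }  // store [[i32 5], zeroinitializer]
    void func7() { int *arr[1] = {}; }       // store [1 x ptr] zeroinitializer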
    diff --git a/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp b/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    index 849594307390f..4055a6a44699c 100644
    --- a/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    +++ b/clang/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
    @@ -1,4 +1,5 @@
     // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -fcxx-exceptions %s
    +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -fcxx-exceptions %s -fexperimental-new-constant-interpreter
     
     // An explicitly-defaulted function may be declared constexpr only if it would
     // have been implicitly declared as constexpr.
    diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp
    index 805be67f2dc1a..10a4f1d6add3a 100644
    --- a/clang/test/CXX/drs/cwg0xx.cpp
    +++ b/clang/test/CXX/drs/cwg0xx.cpp
    @@ -90,6 +90,8 @@ namespace cwg5 { // cwg5: 3.1
       const C c = e;
     } // namespace cwg5
     
    +// cwg6 is in cwg6.cpp
    +
     namespace cwg7 { // cwg7: 3.4
       class A { public: ~A(); };
       class B : virtual private A {}; // #cwg7-B
    diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
    index a6b2b99e0c3f1..d0ee191ef23d8 100644
    --- a/clang/test/CXX/drs/cwg28xx.cpp
    +++ b/clang/test/CXX/drs/cwg28xx.cpp
    @@ -61,6 +61,24 @@ namespace cwg2819 { // cwg2819: 19 c++26
     #endif
     } // namespace cwg2819
     
    +namespace cwg2823 { // cwg2823: no
    +#if __cplusplus >= 201103L
    +  constexpr int *p = 0;
    +  constexpr int *q1 = &*p;
    +  // expected-error@-1 {{constexpr variable 'q1' must be initialized by a constant expression}}
    +  //   expected-note@-2 {{dereferencing a null pointer is not allowed in a constant expression}}
    +  // FIXME: invalid: dereferencing a null pointer.
    +  constexpr int *q2 = &p[0];
    +
    +  int arr[32];
    +  constexpr int *r = arr;
    +  // FIXME: invalid: dereferencing a past-the-end pointer.
    +  constexpr int *s1 = &*(r + 32);
    +  // FIXME: invalid: dereferencing a past-the-end pointer.
    +  constexpr int *s2 = &r[32];
    +#endif
+} // namespace cwg2823
    +
     namespace cwg2847 { // cwg2847: 19 review 2024-03-01
     
     #if __cplusplus >= 202002L
    diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
    index 37186e3c3f205..a4995ddc2c588 100644
    --- a/clang/test/CXX/drs/cwg2xx.cpp
    +++ b/clang/test/CXX/drs/cwg2xx.cpp
    @@ -230,6 +230,38 @@ namespace cwg211 { // cwg211: 2.7
       };
     } // namespace cwg211
     
    +namespace cwg212 { // cwg212: 2.7
    +  template struct Base;
    +  template struct Derived;
    +
    +  int *overload(void*);
    +  float *overload(Base*);
    +  double *overload(Base*);
    +
    +  void f(Derived *p) {
    +    // OK, calls void* overload.
    +    int *a = overload(p);
    +
    +    Base *q = p;
    +    // expected-error@-1 {{cannot initialize a variable of type 'Base *' with an lvalue of type 'Derived *'}}
    +  }
    +
    +  template struct Base {};
    +  template struct Derived : Base {};
    +
    +  void g(Derived *p) {
+    // OK, instantiates and calls Base* overload.
    +    double *b = overload(p);
    +    (void)b;
    +  }
    +
    +  void h(Derived *p) {
    +    // OK, instantiates and converts.
    +    Base *q = p;
    +    (void)q;
    +  }
+} // namespace cwg212
    +
     namespace cwg213 { // cwg213: 2.7
       template  struct A : T {
         void h(T t) {
    @@ -593,6 +625,9 @@ namespace cwg231 { // cwg231: 2.7
       }
     } // namespace cwg231
     
    +// 232 is NAD; the desired behavior is described in 2823.
    +// cwg232: dup 2823
    +
     // cwg234: na
     // cwg235: na
     
    diff --git a/clang/test/CXX/drs/cwg6.cpp b/clang/test/CXX/drs/cwg6.cpp
    new file mode 100644
    index 0000000000000..4752e72034c78
    --- /dev/null
    +++ b/clang/test/CXX/drs/cwg6.cpp
    @@ -0,0 +1,51 @@
    +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s --check-prefixes CHECK
    +
    +#if __cplusplus == 199711L
    +#define static_assert(expr) __extension__ _Static_assert(expr)
    +#define noexcept throw()
    +#endif
    +
    +namespace cwg6 { // cwg6: 2.7
    +#if __cplusplus >= 201103L
    +struct Counter {
    +  int copies;
    +  constexpr Counter(int copies) : copies(copies) {}
    +  constexpr Counter(const Counter& other) : copies(other.copies + 1) {}
    +};
    +
    +// Passing an lvalue by value makes a non-elidable copy.
    +constexpr int PassByValue(Counter c) { return c.copies; }
    +constexpr int PassByValue2(Counter c) { return PassByValue(c); }
    +constexpr int PassByValue3(Counter c) { return PassByValue2(c); }
    +static_assert(PassByValue(Counter(0)) == 0, "expect no copies");
    +static_assert(PassByValue2(Counter(0)) == 1, "expect 1 copy");
    +static_assert(PassByValue3(Counter(0)) == 2, "expect 2 copies");
    +#endif
    +
    +struct A {
    +  A() noexcept;
    +  A(const A&) noexcept;
    +  ~A() noexcept;
    +};
    +
    +inline void f(A a) noexcept {}
    +
    +// CHECK-LABEL: define {{.*}} @_ZN4cwg64callEv
    +void call() {
    +  A a;
+  // We copy the parameter here, even though the object is not mutated by f and
    +  // otherwise satisfies the criteria for the proposed CWG6 optimization.
    +  // CHECK: call {{.*}} @_ZN4cwg61AC1ERKS0_(
    +  // CHECK: call {{.*}} @_ZN4cwg61fENS_1AE(
    +  f(a);
    +  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
    +  // CHECK: call {{.*}} @_ZN4cwg61AD1Ev(
    +}
    +
    +} // namespace cwg6
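
The Counter static_asserts in cwg6.cpp count copies compositionally: each by-value parameter layer adds exactly one non-elidable copy of its lvalue argument. A runnable sketch of the same effect outside constant evaluation (not part of the test; assumes C++17, where the prvalue Counter(0) initializes the outermost parameter with no copy):

    #include <cstdio>

    struct Counter {
      int copies;
      Counter(int c) : copies(c) {}
      Counter(const Counter &o) : copies(o.copies + 1) {}
    };

    int depth1(Counter c) { return c.copies; }   // copies its lvalue argument once
    int depth2(Counter c) { return depth1(c); }  // adds one more copy per layer

    int main() {
      std::printf("%d %d\n", depth1(Counter(0)), depth2(Counter(0)));  // prints: 0 1
    }
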
    diff --git a/clang/test/ClangScanDeps/link-libraries.c b/clang/test/ClangScanDeps/link-libraries.c
    index cc2e223102024..3719d713e775c 100644
    --- a/clang/test/ClangScanDeps/link-libraries.c
    +++ b/clang/test/ClangScanDeps/link-libraries.c
    @@ -32,7 +32,7 @@ module transitive {
     }]
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=root > %t/result.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root > %t/result.json
     // RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
     
     // CHECK:      {
    diff --git a/clang/test/ClangScanDeps/modules-full-by-mod-name.c b/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    index c838614d0bfde..edb99636aaf25 100644
    --- a/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    +++ b/clang/test/ClangScanDeps/modules-full-by-mod-name.c
    @@ -25,7 +25,7 @@ module transitive { header "transitive.h" }
     }]
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=root > %t/result.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root > %t/result.json
     // RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
     
     // CHECK:      {
    diff --git a/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c b/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c
    new file mode 100644
    index 0000000000000..030f7f3427810
    --- /dev/null
    +++ b/clang/test/ClangScanDeps/modules-full-by-mult-mod-names.c
    @@ -0,0 +1,108 @@
    +// RUN: rm -rf %t
    +// RUN: split-file %s %t
    +
    +//--- module.modulemap
    +module root { header "root.h" }
    +module direct { header "direct.h" }
    +module transitive { header "transitive.h" }
+module root1 { header "root1.h" }
    +//--- root.h
    +#include "direct.h"
    +#include "root/textual.h"
    +
    +//--- root1.h
    +#include "direct.h"
    +
    +//--- direct.h
    +#include "transitive.h"
    +//--- transitive.h
    +// empty
    +
    +//--- root/textual.h
+// This is here to verify that the "root" directory doesn't clash with the name
    +// the "root" module.
    +
    +//--- cdb.json.template
    +[{
    +  "file": "",
    +  "directory": "DIR",
    +  "command": "clang -fmodules -fmodules-cache-path=DIR/cache -I DIR -x c"
    +}]
    +
    +// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=root,root1,direct > %t/result.json
    +// RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
    +
    +// CHECK:      {
    +// CHECK-NEXT:   "modules": [
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "transitive"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/direct.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "direct"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "direct"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/root.h",
    +// CHECK-NEXT:         "[[PREFIX]]/root/textual.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "root"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [
    +// CHECK-NEXT:         {
    +// CHECK-NEXT:           "context-hash": "{{.*}}",
    +// CHECK-NEXT:           "module-name": "direct"
    +// CHECK-NEXT:         }
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/root1.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "root1"
    +// CHECK-NEXT:     },
    +// CHECK-NEXT:     {
    +// CHECK-NEXT:       "clang-module-deps": [],
    +// CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:       "command-line": [
    +// CHECK:            ],
    +// CHECK-NEXT:       "context-hash": "{{.*}}",
    +// CHECK-NEXT:       "file-deps": [
    +// CHECK-NEXT:         "[[PREFIX]]/module.modulemap",
    +// CHECK-NEXT:         "[[PREFIX]]/transitive.h"
    +// CHECK-NEXT:       ],
    +// CHECK-NEXT:       "link-libraries": [],
    +// CHECK-NEXT:       "name": "transitive"
    +// CHECK-NEXT:     }
    +// CHECK-NEXT:   ],
    +// CHECK-NEXT:   "translation-units": []
    +// CHECK-NEXT: }
    diff --git a/clang/test/ClangScanDeps/strip-codegen-args.m b/clang/test/ClangScanDeps/strip-codegen-args.m
    index 71171f4983386..f2cec6281f7df 100644
    --- a/clang/test/ClangScanDeps/strip-codegen-args.m
    +++ b/clang/test/ClangScanDeps/strip-codegen-args.m
    @@ -16,6 +16,7 @@
     // CHECK-NOT:          "-flto"
     // CHECK-NOT:          "-fno-autolink"
     // CHECK-NOT:          "-mrelax-relocations=no"
    +// CHECK-NOT:          "-mspeculative-load-hardening"
     // CHECK:            ]
     // CHECK:            "name": "A"
     // CHECK:          }
    @@ -39,6 +40,11 @@
         "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto=full -fsyntax-only DIR/t3.m",
         "file": "DIR/t2.m"
-  }
+  },
+  {
    +    "directory": "DIR",
    +    "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -mspeculative-load-hardening -fsyntax-only DIR/t3.m",
    +    "file": "DIR/t3.m"
    +  }
     ]
     
     //--- modules/A/module.modulemap
    diff --git a/clang/test/CodeGen/AArch64/args.cpp b/clang/test/CodeGen/AArch64/args.cpp
    index 3cb62d3119ecf..c284316a5e1b4 100644
    --- a/clang/test/CodeGen/AArch64/args.cpp
    +++ b/clang/test/CodeGen/AArch64/args.cpp
    @@ -17,11 +17,29 @@ struct Empty {};
     
     // DARWIN: define{{.*}} i32 @empty_arg(i32 noundef %a)
     // C: define{{.*}} i32 @empty_arg(i32 noundef %a)
    -// CXX: define{{.*}} i32 @empty_arg(i8 %e.coerce, i32 noundef %a)
    +// CXX: define{{.*}} i32 @empty_arg(i64 %e.coerce, i32 noundef %a)
     EXTERNC int empty_arg(struct Empty e, int a) {
       return a;
     }
     
    +// CXX: define{{.*}} i32 @empty_align8_arg(i64 %a.coerce, i32 noundef %b)
    +struct EmptyAlign8 { int __attribute__((aligned(8))) : 0; };
    +EXTERNC int empty_align8_arg(struct EmptyAlign8 a, int b) {
    +  return b;
    +}
    +
    +// CXX: define{{.*}} i32 @empty_align16_arg(i128 %a.coerce, i32 noundef %b)
    +struct EmptyAlign16 { long long int __attribute__((aligned(16))) : 0; };
    +EXTERNC int empty_align16_arg(struct EmptyAlign16 a, int b) {
    +  return b;
    +}
    +
    +// CXX: define{{.*}} i32 @empty_align32_arg(ptr dead_on_return noundef %a, i32 noundef %b)
    +struct EmptyAlign32 { long long int __attribute__((aligned(32))) : 0; };
    +EXTERNC int empty_align32_arg(struct EmptyAlign32 a, int b) {
    +  return b;
    +}
    +
     // DARWIN: define{{.*}} void @empty_ret()
     // C: define{{.*}} void @empty_ret()
     // CXX: define{{.*}} void @empty_ret()
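
The three EmptyAlign cases above vary only in alignment: the zero-width bit-field contributes no named member, but its aligned attribute still raises the alignment of the enclosing struct, and the chosen coercion (i64, i128, or an indirect pointer) tracks that alignment. A sketch of the alignment side (illustrative only, assuming Clang's handling of aligned zero-width bit-fields):

    struct EmptyAlign8 { int __attribute__((aligned(8))) : 0; };
    struct EmptyAlign16 { long long int __attribute__((aligned(16))) : 0; };
    struct EmptyAlign32 { long long int __attribute__((aligned(32))) : 0; };

    // The bit-field has zero width, so the structs stay empty, yet their
    // alignment requirement grows with the attribute.
    static_assert(alignof(EmptyAlign8) == 8, "passed as i64");
    static_assert(alignof(EmptyAlign16) == 16, "passed as i128");
    static_assert(alignof(EmptyAlign32) == 32, "passed indirectly");
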
    diff --git a/clang/test/CodeGen/AArch64/fmv-detection.c b/clang/test/CodeGen/AArch64/fmv-detection.c
    index e585140a1eb08..a6761ffd4bb1e 100644
    --- a/clang/test/CodeGen/AArch64/fmv-detection.c
    +++ b/clang/test/CodeGen/AArch64/fmv-detection.c
    @@ -437,7 +437,7 @@ int caller() {
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() {{[#0-9]* }}comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c b/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    index dcc5e1c5886e2..a6d6509ca7de0 100644
    --- a/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    +++ b/clang/test/CodeGen/AArch64/fmv-mix-explicit-implicit-default.c
    @@ -107,22 +107,26 @@ int caller6(void) { return no_def_explicit_default_first(); }
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default_decl_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default_decl_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @implicit_default_decl_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default_def_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default_def_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @explicit_default_def_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default_def_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default_def_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @implicit_default_def_first.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default_decl_first.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default_decl_first.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    ret ptr @explicit_default_decl_first.default
     //
    diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c
    index c92e0c4e9c3db..84c84df5a2fa0 100644
    --- a/clang/test/CodeGen/AArch64/fmv-priority.c
    +++ b/clang/test/CodeGen/AArch64/fmv-priority.c
    @@ -2,13 +2,10 @@
     // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
     
 // Priority bitmasks after feature dependency expansion:
    -//
     // MSB                                                    LSB
    -//
     // sme2 | wfxt | sme | bf16 |       |      | fp16 | simd | fp
     // -----+------+-----+------+-------+------+------+------+---
     // sme2 |      | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp
    -//
     // Dependencies should not affect priorities, since a
     // feature can only depend on lower priority features:
     // https://github.com/ARM-software/acle/pull/376
    @@ -32,7 +29,8 @@ int call() { return fn(); }
     // CHECK-NEXT:    ret i32 [[CALL]]
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @fn.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @fn.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/fmv-resolver-emission.c b/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    index 591625d4d0da1..beebbb2166edf 100644
    --- a/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    +++ b/clang/test/CodeGen/AArch64/fmv-resolver-emission.c
    @@ -258,7 +258,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@used_before_default_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@used_before_default_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -272,7 +273,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @used_before_default_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@used_after_default_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@used_after_default_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -286,7 +288,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @used_after_default_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@not_used_with_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@not_used_with_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -300,7 +303,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @not_used_with_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@indirect_use.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@indirect_use.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -328,7 +332,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@internal_func.resolver() {
    +// CHECK-LABEL: define {{[^@]+}}@internal_func.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -356,7 +361,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret void
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@linkonce_func.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@linkonce_func.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -370,7 +376,8 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK-NEXT:    ret ptr @linkonce_func.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@clones_with_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@clones_with_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -383,6 +390,7 @@ __attribute__((target_clones("aes"))) void clones_without_default(void) {}
     // CHECK:       resolver_else:
     // CHECK-NEXT:    ret ptr @clones_with_default.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGen/AArch64/mixed-target-attributes.c b/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    index ef47c8a3bc737..480c010b92d96 100644
    --- a/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    +++ b/clang/test/CodeGen/AArch64/mixed-target-attributes.c
    @@ -127,7 +127,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret i32 0
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@implicit_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@implicit_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -165,7 +166,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret ptr @implicit_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@explicit_default.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@explicit_default.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -203,7 +205,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void
     // CHECK-NEXT:    ret ptr @explicit_default.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    index 670b65070289d..929df94aa60ef 100644
    --- a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    +++ b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
    @@ -26,16 +26,36 @@ int32_t test_vcvtas_s32_f32(float32_t a) {
       return (int32_t)vcvtas_s32_f32(a);
     }
     
    -// CHECK-LABEL: define {{[^@]+}}@test_test_vcvtad_s64_f64
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_s64_f64
     // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
     // CHECK-NEXT:    [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]])
     // CHECK-NEXT:    ret i64 [[VCVTAD_S64_F64_I]]
     //
    -int64_t test_test_vcvtad_s64_f64(float64_t a) {
    +int64_t test_vcvtad_s64_f64(float64_t a) {
       return (int64_t)vcvtad_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTAS_S64_F32_I]]
    +//
    +int64_t test_vcvtas_s64_f32(float32_t a) {
    +  return (int64_t)vcvtas_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTAD_S32_F64_I]]
    +//
    +int32_t test_vcvtad_s32_f64(float64_t a) {
    +  return (int32_t)vcvtad_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -56,6 +76,26 @@ uint64_t test_vcvtad_u64_f64(float64_t a) {
       return (uint64_t)vcvtad_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTAS_U64_F32_I]]
    +//
    +uint64_t test_vcvtas_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtas_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTAD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTAD_U32_F64_I]]
    +//
    +uint32_t test_vcvtad_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtad_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -76,6 +116,26 @@ int64_t test_vcvtmd_s64_f64(float64_t a) {
       return (int64_t)vcvtmd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTMS_S64_F32_I]]
    +//
    +int64_t test_vcvtms_s64_f32(float32_t a) {
    +  return (int64_t)vcvtms_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTMD_S32_F64_I]]
    +//
    +int32_t test_vcvtmd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtmd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -96,6 +156,26 @@ uint64_t test_vcvtmd_u64_f64(float64_t a) {
       return (uint64_t)vcvtmd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTMS_U64_F32_I]]
    +//
    +uint64_t test_vcvtms_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtms_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTMD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTMD_U32_F64_I]]
    +//
    +uint32_t test_vcvtmd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtmd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -116,6 +196,26 @@ int64_t test_vcvtnd_s64_f64(float64_t a) {
       return (int64_t)vcvtnd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTNS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTNS_S64_F32_I]]
    +//
    +int64_t test_vcvtns_s64_f32(float32_t a) {
    +  return (int64_t)vcvtns_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTND_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTND_S32_F64_I]]
    +//
    +int32_t test_vcvtnd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtnd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -136,6 +236,26 @@ uint64_t test_vcvtnd_u64_f64(float64_t a) {
       return (uint64_t)vcvtnd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTNS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTNS_U64_F32_I]]
    +//
    +uint64_t test_vcvtns_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtns_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTND_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTND_U32_F64_I]]
    +//
    +uint32_t test_vcvtnd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtnd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -156,6 +276,26 @@ int64_t test_vcvtpd_s64_f64(float64_t a) {
       return (int64_t)vcvtpd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTPS_S64_F32_I]]
    +//
    +int64_t test_vcvtps_s64_f32(float32_t a) {
    +  return (int64_t)vcvtps_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTPD_S32_F64_I]]
    +//
    +int32_t test_vcvtpd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtpd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -176,6 +316,26 @@ uint64_t test_vcvtpd_u64_f64(float64_t a) {
       return (uint64_t)vcvtpd_u64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTPS_U64_F32_I]]
    +//
    +uint64_t test_vcvtps_u64_f32(float32_t a) {
    +  return (uint64_t)vcvtps_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTPD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTPD_U32_F64_I]]
    +//
    +uint32_t test_vcvtpd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtpd_u32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -196,6 +356,26 @@ int64_t test_vcvtd_s64_f64(float64_t a) {
       return (int64_t)vcvtd_s64_f64(a);
     }
     
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvts_s64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTS_S64_F32_I]]
    +//
    +int64_t test_vcvts_s64_f32(float32_t a) {
    +  return (int64_t)vcvts_s64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTD_S32_F64_I]]
    +//
    +int32_t test_vcvtd_s32_f64(float64_t a) {
    +  return (int32_t)vcvtd_s32_f64(a);
    +}
    +
     // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
     // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
     // CHECK-NEXT:  entry:
    @@ -215,3 +395,24 @@ uint32_t test_vcvts_u32_f32(float32_t a) {
     uint64_t test_vcvtd_u64_f64(float64_t a) {
       return (uint64_t)vcvtd_u64_f64(a);
     }
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvts_u64_f32
    +// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float [[A]])
    +// CHECK-NEXT:    ret i64 [[VCVTS_U64_F32_I]]
    +//
    +uint64_t test_vcvts_u64_f32(float32_t a) {
    +  return (uint64_t)vcvts_u64_f32(a);
    +}
    +
    +// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u32_f64
    +// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  entry:
    +// CHECK-NEXT:    [[VCVTD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double [[A]])
    +// CHECK-NEXT:    ret i32 [[VCVTD_U32_F64_I]]
    +//
    +uint32_t test_vcvtd_u32_f64(float64_t a) {
    +  return (uint32_t)vcvtd_u32_f64(a);
    +}
    +
    diff --git a/clang/test/CodeGen/AArch64/resolver-attributes.c b/clang/test/CodeGen/AArch64/resolver-attributes.c
    index 6e4497cdc8611..b53da46354c34 100644
    --- a/clang/test/CodeGen/AArch64/resolver-attributes.c
    +++ b/clang/test/CodeGen/AArch64/resolver-attributes.c
    @@ -46,17 +46,20 @@ __attribute__((ifunc("ifunc_resolver"))) int ifunc(void);
     // BTI: define internal ptr @static_target_clones.resolver()  #[[ATTR_RESOLVER]]
     // BTI: define internal ptr @static_target_version.resolver() #[[ATTR_RESOLVER]]
     
    -// In NOBTI case, no attribute groups are assigned to the resolver functions:
    -// NOBTI: define weak_odr ptr @global_target_clones.resolver(){{( comdat)?}} {
    -// NOBTI: define weak_odr ptr @global_target_version.resolver(){{( comdat)?}} {
    -// NOBTI: define internal ptr @static_target_clones.resolver() {
    -// NOBTI: define internal ptr @static_target_version.resolver() {
+// In the NOBTI case, only the "disable_sanitizer_instrumentation" attribute is added to the resolvers:
     
    -// HIDDEN: define weak_odr hidden ptr @global_target_clones.resolver(){{( comdat)?}} {
    -// HIDDEN: define weak_odr hidden ptr @global_target_version.resolver(){{( comdat)?}} {
    -// HIDDEN: define internal ptr @static_target_clones.resolver() {
    -// HIDDEN: define internal ptr @static_target_version.resolver() {
    +// NOBTI: define weak_odr ptr @global_target_clones.resolver() [[ATTR_RESOLVER:(#[0-9]+)?]]{{( comdat)?}}
    +// NOBTI: define weak_odr ptr @global_target_version.resolver() [[ATTR_RESOLVER]]{{( comdat)?}}
    +// NOBTI: define internal ptr @static_target_clones.resolver() [[ATTR_RESOLVER]]
    +// NOBTI: define internal ptr @static_target_version.resolver() [[ATTR_RESOLVER]]
    +
    +// HIDDEN: define weak_odr hidden ptr @global_target_clones.resolver() [[ATTR_RESOLVER:(#[0-9]+)?]]{{( comdat)?}}
    +// HIDDEN: define weak_odr hidden ptr @global_target_version.resolver() [[ATTR_RESOLVER]]{{( comdat)?}}
    +// HIDDEN: define internal ptr @static_target_clones.resolver() [[ATTR_RESOLVER]] 
    +// HIDDEN: define internal ptr @static_target_version.resolver() [[ATTR_RESOLVER]] 
     
     // ELF:       attributes #[[ATTR_IFUNC_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} }
     
     // BTI:       attributes #[[ATTR_RESOLVER]] = { {{.*}}"branch-target-enforcement"{{.*}} }
    +//
    +// NOBTI:        attributes [[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
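
The attribute these tests now expect on every generated resolver also has a source-level spelling; a hedged sketch (not from the patch) of a function carrying the same disable_sanitizer_instrumentation attribute by hand:

    // Resolvers run before the sanitizer runtimes are initialized, so they
    // must not be instrumented; this attribute suppresses all sanitizer
    // instrumentation for the function it annotates.
    __attribute__((disable_sanitizer_instrumentation))
    int resolver_like(void) { return 0; }
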
    diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    index f0c9ef28201a5..97fdd0ce56c66 100644
    --- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    +++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
    @@ -575,19 +575,21 @@ void TSpp_align16(SSpp_align16 s) { *s.a.x = 1; }
     struct Sempty {
     };
     // CHECK-A64-LABEL: define dso_local void @_Z6Tempty6Sempty(
    -// CHECK-A64-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] {
    +// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
     // CHECK-A64-NEXT:  [[ENTRY:.*:]]
     // CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1
     // CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0
    -// CHECK-A64-NEXT:    store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1
    +// CHECK-A64-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i8
    +// CHECK-A64-NEXT:    store i8 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 1
     // CHECK-A64-NEXT:    ret void
     //
     // CHECK-A64_32-LABEL: define void @_Z6Tempty6Sempty(
    -// CHECK-A64_32-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] {
    +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
     // CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
     // CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1
     // CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0
    -// CHECK-A64_32-NEXT:    store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1
    +// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i8
    +// CHECK-A64_32-NEXT:    store i8 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 1
     // CHECK-A64_32-NEXT:    ret void
     //
     void Tempty(Sempty s) { }
    diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
    new file mode 100644
    index 0000000000000..89ee9e38bb3fb
    --- /dev/null
    +++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
    @@ -0,0 +1,23 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
    +// RUN: -disable-O0-optnone -emit-llvm -o - %s \
    +// RUN: | opt -S -passes=mem2reg,sroa \
    +// RUN: | FileCheck %s
    +
    +// REQUIRES: aarch64-registered-target
    +
+#include <arm_neon.h>
    +
    +// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
    +// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
    +// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
    +// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
    +// CHECK-NEXT:    [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
    +// CHECK-NEXT:    [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
    +// CHECK-NEXT:    ret <8 x half> [[FMMLA1_I]]
    +//
    +float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
    +  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
    +}
    diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
    new file mode 100644
    index 0000000000000..13db72c2cbdd1
    --- /dev/null
    +++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
    @@ -0,0 +1,21 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
    +// RUN: -disable-O0-optnone -emit-llvm -o - %s \
    +// RUN: | opt -S -passes=mem2reg,sroa \
    +// RUN: | FileCheck %s
    +
    +// REQUIRES: aarch64-registered-target
    +
+#include <arm_neon.h>
    +
    +// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
    +// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[P3]])
    +// CHECK-NEXT:    [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
    +// CHECK-NEXT:    ret <4 x float> [[FMMLA_I]]
    +//
    +float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
    +  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
    +}
    +
    diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    index 329cea9a0adfb..2452ee345fe2f 100644
    --- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    +++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
    @@ -1 +1,3 @@
    -!world
    +v1
    +f world
    +c 0
    diff --git a/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
    new file mode 100644
    index 0000000000000..30b85d24a56fd
    --- /dev/null
    +++ b/clang/test/CodeGen/PowerPC/ppc64-abi-override-datalayout.c
    @@ -0,0 +1,8 @@
    +// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -target-abi elfv2 %s -o - -emit-llvm | FileCheck %s
    +
    +// REQUIRES: powerpc-registered-target
    +
    +// Make sure that overriding the ABI to ELFv2 on a target that defaults to
    +// ELFv1 changes the data layout:
    +
    +// CHECK: target datalayout = "E-m:e-Fn32-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512"
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..a0d5845208529
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,131 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
+//
+vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
+//
+vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
+//
+vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl);
+}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..25d0991fa70cd
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,111 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
+                                       size_t vl) {
+  return __riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
+                                     size_t vl) {
+  return __riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
+  return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl);
+}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..9fc332a1469ff
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,135 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16mf4(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16mf2(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m1(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m2(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2(vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local  @test_sf_vfexp_v_bf16m4(
    +// CHECK-RV64-SAME:  [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call  @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison,  [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret  [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl);
    +}
    +
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..67a9220bd011d
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,234 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
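+// Covers the approximate-exponential intrinsic across all f16 and f32 LMULs;
+// masked (_m) forms are expected to select the .mask intrinsic with the
+// tail-agnostic/mask-agnostic policy (operand value 3).
+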
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..fd6f82db52953
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,90 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
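+// f64 coverage lives in its own file because it needs the xsfvfexpa64e
+// feature rather than plain xsfvfexpa.
+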
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..0e769ed5fc5bc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,131 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
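+// Same lowering checks as the non-overloaded f16 tests, but calling the
+// type-generic __riscv_sf_vfexp, which overload resolution maps to the
+// matching vfloat16* variant.
+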
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..3df1eaa3a0467
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,111 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
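+// Overloaded f32 variants: __riscv_sf_vfexp dispatches on the vfloat32*
+// operand type and should produce the same intrinsic calls as the
+// non-overloaded tests.
+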
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                     size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..6179dbe8d82e4
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,134 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
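+// Overloaded bf16 variants: __riscv_sf_vfexp dispatches on the vbfloat16*
+// operand type; masked forms again expect policy operand 3.
+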
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexp(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..1ddbb0b84520c
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,234 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
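+// Overloaded __riscv_sf_vfexpa for f16 and f32; the expected IR is identical
+// to the non-overloaded sf_vfexpa_v.c checks.
+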
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..165879a8bb589
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,90 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
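+// Overloaded __riscv_sf_vfexpa for f64 under xsfvfexpa64e.
+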
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa(vm, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..aed6d87a4b18a
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,248 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..374f324cc0808
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,208 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..aec0b9f934ab9
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,248 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..b6870264251cc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,448 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..8638dc232cf01
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,167 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  entry:
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c
    new file mode 100644
    index 0000000000000..4ceeb7b35629c
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c
    @@ -0,0 +1,261 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu(
    +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd,
    +                                         vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd,
    +                                         vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd,
    +                                       vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd,
    +                                       vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd,
    +                                       vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd,
    +                                       vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd,
    +                                          vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd,
    +                                          vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd,
    +                                        vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd,
    +                                        vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd,
    +                                        vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd,
    +                                        vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd,
    +                                        vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd,
    +                                        vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd,
    +                                      vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd,
    +                                      vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd,
    +                                      vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd,
    +                                      vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c
    new file mode 100644
    index 0000000000000..e08d6c5b371cc
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c
    @@ -0,0 +1,228 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +xsfvfexp32e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2,
    +                                      size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd,
    +                                         vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd,
    +                                       vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd,
    +                                       vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd,
    +                                       vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd,
    +                                       vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd,
    +                                          vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd,
    +                                        vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd,
    +                                        vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd,
    +                                        vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd,
    +                                        vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd,
    +                                        vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd,
    +                                      vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd,
    +                                      vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd,
    +                                      vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd,
    +                                      vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c
    new file mode 100644
    index 0000000000000..14570d465bea8
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c
    @@ -0,0 +1,272 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \
    +// RUN:   -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
    +#include <sifive_vector.h>
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu(
    +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2,
    +                                          size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu(
    +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2,
    +                                          size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu(
    +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu(
    +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu(
    +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu(
    +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]])
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2,
    +                                        size_t vl) {
    +  return __riscv_sf_vfexp_tu(vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd,
    +                                           vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum(
    +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd,
    +                                           vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum(
    +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd,
    +                                         vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum(
    +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd,
    +                                         vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum(
    +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd,
    +                                         vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum(
    +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
    +// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd,
    +                                         vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tum(vm, vd, vs2, vl);
    +}
    +
    +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu(
    +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
    +// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
    +// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
    +// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd,
    +                                            vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd,
    +                                            vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd,
    +                                          vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd,
    +                                          vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd,
    +                                          vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd,
    +                                          vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd,
    +                                          vbfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
    +//
    +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd,
    +                                          vbfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
    +//
    +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd,
    +                                        vbfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
    +//
    +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd,
    +                                        vbfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
    +//
    +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd,
    +                                        vbfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
    +//
    +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd,
    +                                        vbfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexp_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c
    new file mode 100644
    index 0000000000000..4ac5cfc360551
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c
    @@ -0,0 +1,492 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \
    +// RUN:   -target-feature +xsfvfexpa -disable-O0-optnone  \
    +// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2,
    +                                         size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu(
+// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd,
    +                                          vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd,
    +                                          vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd,
    +                                        vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd,
    +                                        vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd,
    +                                        vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd,
    +                                        vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd,
    +                                          vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd,
    +                                        vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd,
    +                                        vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd,
    +                                        vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd,
    +                                        vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd,
    +                                           vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd,
    +                                           vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd,
    +                                         vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd,
    +                                         vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd,
    +                                         vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd,
    +                                         vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd,
    +                                           vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd,
    +                                         vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd,
    +                                         vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd,
    +                                         vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd,
    +                                         vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x half> [[TMP0]]
    +//
    +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd,
    +                                         vfloat16mf4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x half> [[TMP0]]
    +//
    +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd,
    +                                         vfloat16mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x half> [[TMP0]]
    +//
    +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd,
    +                                       vfloat16m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x half> [[TMP0]]
    +//
    +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd,
    +                                       vfloat16m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x half> [[TMP0]]
    +//
    +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd,
    +                                       vfloat16m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x half> [[TMP0]]
    +//
    +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd,
    +                                       vfloat16m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
    +//
    +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd,
    +                                         vfloat32mf2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
    +//
    +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd,
    +                                       vfloat32m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
    +//
    +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd,
    +                                       vfloat32m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
    +//
    +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd,
    +                                       vfloat32m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
    +//
    +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd,
    +                                       vfloat32m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c
    new file mode 100644
    index 0000000000000..d0faaee571122
    --- /dev/null
    +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c
    @@ -0,0 +1,183 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
    +// REQUIRES: riscv-registered-target
    +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \
    +// RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \
    +// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
    +
+#include <sifive_vector.h>
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu(
+// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu(
+// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu(
+// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2,
    +                                       size_t vl) {
    +  return __riscv_sf_vfexpa_tu(vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd,
    +                                        vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd,
    +                                        vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd,
    +                                        vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd,
    +                                        vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd,
    +                                         vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd,
    +                                         vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd,
    +                                         vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd,
    +                                         vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x double> [[TMP0]]
    +//
    +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd,
    +                                       vfloat64m1_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x double> [[TMP0]]
    +//
    +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd,
    +                                       vfloat64m2_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x double> [[TMP0]]
    +//
    +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd,
    +                                       vfloat64m4_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  entry:
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x double> [[TMP0]]
    +//
    +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd,
    +                                       vfloat64m8_t vs2, size_t vl) {
    +  return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl);
    +}
    diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    index d5d15b4dea966..35fde8733f375 100644
    --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
    @@ -3584,13 +3584,13 @@ void test_integer(void) {
       // CHECK-ASM: vsrlb
     
       vsc = vec_abs(vsc);
    -  // CHECK-ASM: vlcb
    +  // CHECK-ASM: vlpb
       vss = vec_abs(vss);
    -  // CHECK-ASM: vlch
    +  // CHECK-ASM: vlph
       vsi = vec_abs(vsi);
    -  // CHECK-ASM: vlcf
    +  // CHECK-ASM: vlpf
       vsl = vec_abs(vsl);
    -  // CHECK-ASM: vlcg
    +  // CHECK-ASM: vlpg
     
       vsc = vec_max(vsc, vsc);
       // CHECK-ASM: vmxb
    diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    index 6ee9e1ee3a117..cd0fafdb7435f 100644
    --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector5.c
    @@ -246,7 +246,7 @@ void test_integer(void) {
       // CHECK-ASM: vctzq
     
       vslll = vec_abs(vslll);
    -  // CHECK-ASM: vlcq
    +  // CHECK-ASM: vlpq
     
       vslll = vec_avg(vslll, vslll);
       // CHECK: call i128 @llvm.s390.vavgq(i128 %{{.*}}, i128 %{{.*}})
    diff --git a/clang/test/CodeGen/WebAssembly/musttail.c b/clang/test/CodeGen/WebAssembly/musttail.c
    new file mode 100644
    index 0000000000000..37fed70028bbc
    --- /dev/null
    +++ b/clang/test/CodeGen/WebAssembly/musttail.c
    @@ -0,0 +1,20 @@
    +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -target-feature +tail-call -o /dev/null -emit-llvm -verify=tail
    +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -o /dev/null -emit-llvm -verify=notail
    +
    +int foo(int x) {
    +  return x;
    +}
    +
    +#if __has_attribute(musttail)
    +// tail-warning@+1 {{HAS IT}}
    +#warning HAS IT
    +#else
    +// notail-warning@+1 {{DOES NOT HAVE}}
    +#warning DOES NOT HAVE
    +#endif
    +
    +int bar(int x)
    +{
    +  // notail-warning@+1 {{unknown attribute 'clang::musttail' ignored}}
+  [[clang::musttail]] return foo(1);
    +}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
    deleted file mode 100755
    index 192c153835e1e..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_tranpose.c
    +++ /dev/null
    @@ -1,53 +0,0 @@
    -// RUN:  %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN:  -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
    -// RUN:  -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rs_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rst1_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rs_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
    -}
    -
    -// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rst1_internal(
    -// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
    -// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
    -// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
    -void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
    -  _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
    deleted file mode 100755
    index b174cc5067bf3..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
    +++ /dev/null
    @@ -1,81 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
    -// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0rs
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0rs(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0rst1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1rs
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1rs(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1rst1(1, A, B);
    -}
    -
    -void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz0rs
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz1rs
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
    deleted file mode 100755
    index 840b52bbb29bb..0000000000000
    --- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
    +++ /dev/null
    @@ -1,22 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
    -// RUN: -verify
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
    -  _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
    -  _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
    -  _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c
    index 661a9dfbc673b..54ad6bb714933 100644
    --- a/clang/test/CodeGen/X86/amx_tf32.c
    +++ b/clang/test/CodeGen/X86/amx_tf32.c
    @@ -10,8 +10,3 @@ void test_tile_mmultf32ps(void) {
       _tile_mmultf32ps(1, 2, 3);
     }
     
    -void test_tile_tmmultf32ps(void) {
    -  // CHECK-LABEL: @test_tile_tmmultf32ps(
    -  // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
    -  _tile_tmmultf32ps(1, 2, 3);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c
    index 2ac8489e3e0ba..8f574b7bc71dc 100644
    --- a/clang/test/CodeGen/X86/amx_tf32_api.c
    +++ b/clang/test/CodeGen/X86/amx_tf32_api.c
    @@ -18,10 +18,3 @@ void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
       __tile_mmultf32ps(&c, a, b);
     }
     
    -void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tmmultf32ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tmmultf32ps(&c, a, b);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_tf32_errors.c b/clang/test/CodeGen/X86/amx_tf32_errors.c
    index 4502130692115..f0fdd060363cf 100644
    --- a/clang/test/CodeGen/X86/amx_tf32_errors.c
    +++ b/clang/test/CodeGen/X86/amx_tf32_errors.c
    @@ -13,11 +13,3 @@ void test_tile_mmultf32ps() {
       _tile_mmultf32ps(1, 3, 3);  // expected-error {{tile arguments must refer to different tiles}}
     }
     
    -void test_tile_tmmultf32ps() {
    -  _tile_tmmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tmmultf32ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tmmultf32ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tmmultf32ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c
    deleted file mode 100644
    index 7e88fd80592d6..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose.c
    +++ /dev/null
    @@ -1,75 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
    -// RUN: -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
    -// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -void test_tile_2rpntlvwz0(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz0t1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz0t1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1(1, A, B);
    -}
    -
    -void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
    -  // CHECK-LABEL: @test_tile_2rpntlvwz1t1
    -  // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
    -  _tile_2rpntlvwz1t1(1, A, B);
    -}
    -
    -void test_tile_transposed(void)
    -{
    -  // CHECK-LABEL: @test_tile_transposed
    -  // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
    -  _tile_transposed(1, 2);
    -}
    -
    -void test_tile_tdpbf16ps(void)
    -{
    -  // CHECK-LABEL: @test_tile_tdpbf16ps
    -  // CHECK: call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
    -  _tile_tdpbf16ps(1, 2, 3);
    -}
    -
    -void test_tile_tdpfp16ps(void)
    -{
    -  // CHECK-LABEL: @test_tile_tdpfp16ps
    -  // CHECK: call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
    -  _tile_tdpfp16ps(4, 5, 6);
    -}
    -
    -void test_tile_tcmmimfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_tcmmimfp16ps
    -  // CHECK: call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
    -  _tile_tcmmimfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_tcmmrlfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_tcmmrlfp16ps
    -  // CHECK: call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
    -  _tile_tcmmrlfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_conjtcmmimfp16ps(void) {
    -  // CHECK-LABEL: @test_tile_conjtcmmimfp16ps
    -  // CHECK: call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
    -  _tile_conjtcmmimfp16ps(1, 2, 3);
    -}
    -
    -void test_tile_conjtfp16(void) {
    -  // CHECK-LABEL: @test_tile_conjtfp16
    -  // CHECK: call void @llvm.x86.tconjtfp16(i8 1, i8 2)
    -  _tile_conjtfp16(1, 2);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c
    deleted file mode 100644
    index dc3ef5104252c..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose_api.c
    +++ /dev/null
    @@ -1,114 +0,0 @@
    -// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
    -// RUN: -target-feature +amx-transpose -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
    -// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
    -
-#include <immintrin.h>
    -
    -char buf[2048];
    -#define STRIDE 32
    -
    -char buf2[2048];
    -
    -void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz0
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz0t1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) {
    -  //CHECK-LABEL: @test_tile_2rpntlvwz1t1
    -  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
    -  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
    -  __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE);
    -}
    -
    -void test_tile_transposed(__tile1024i dst, __tile1024i src) {
    -  //CHECK-LABEL: @test_tile_transposed
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_transposed(&dst, src);
    -}
    -
    -void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tdpbf16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttdpbf16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tdpbf16ps(&c, a, b);
    -}
    -
    -void test_tile_tdpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tdpfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttdpfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tdpfp16ps(&c, a, b);
    -}
    -
    -void test_tile_tcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tcmmimfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmimfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tcmmimfp16ps(&c, a, b);
    -}
    -
    -void test_tile_tcmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_tcmmrlfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.ttcmmrlfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_tcmmrlfp16ps(&c, a, b);
    -}
    -
    -void test_tile_conjtcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
    -  //CHECK-LABEL: @test_tile_conjtcmmimfp16ps
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_conjtcmmimfp16ps(&c, a, b);
    -}
    -
    -void test_tile_conjtfp16(__tile1024i dst, __tile1024i src) {
    -  //CHECK-LABEL: @test_tile_conjtfp16
    -  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
    -  //CHECK-DAG: call x86_amx @llvm.x86.tconjtfp16.internal
    -  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
    -  __tile_conjtfp16(&dst, src);
    -}
    diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c
    deleted file mode 100644
    index 80368c580c793..0000000000000
    --- a/clang/test/CodeGen/X86/amx_transpose_errors.c
    +++ /dev/null
    @@ -1,75 +0,0 @@
    -// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
    -// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
    -// RUN: -target-feature +avx512f -target-feature +amx-fp16 -target-feature +amx-complex -verify
    -
-#include <immintrin.h>
-#include <stddef.h>
    -
    -// Transpose
    -void test_tile_2rpntlvwz0(const void *A, size_t B) {
    -  _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
    -  _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
    -  _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_tdpbf16ps()
    -{
    -  _tile_tdpbf16ps(8, 2, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 2, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpbf16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpbf16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpbf16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_tdpfp16ps()
    -{
    -  _tile_tdpfp16ps(8, 5, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 8, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 5, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_tdpfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -  _tile_tdpfp16ps(1, 2, 2);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_transposed()
    -{
    -  _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -  _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
    -}
    -
    -void test_tile_tcmmimfp16ps() {
    -  _tile_tcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tcmmimfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_tcmmrlfp16ps() {
    -  _tile_tcmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_tcmmrlfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_conjtcmmimfp16ps() {
    -  _tile_conjtcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
    -  _tile_conjtcmmimfp16ps(1, 2, 1);  // expected-error {{tile arguments must refer to different tiles}}
    -}
    -
    -void test_tile_conjtfp16() {
    -  _tile_conjtfp16(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
    -  _tile_conjtfp16(1, 26); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
    -}
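The four deleted test files above exercised the same two diagnostics for the retired amx-transpose and amx-movrs builtins: a tile immediate outside [0, 7] and, for the dot-product forms, repeated tile operands. For reference, a minimal sketch of the valid usage those checks guarded; the function name is illustrative and it assumes a toolchain that still ships the removed intrinsics:

void sketch_valid_transpose_use(const void *base, size_t stride) {
  _tile_2rpntlvwz0(1, base, stride); // tile-pair immediate within [0, 7]
  _tile_transposed(2, 3);            // distinct dst/src tile registers
}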
    diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
    index de4cb2fd0b055..ce8e2f04e487c 100644
    --- a/clang/test/CodeGen/X86/avx2-builtins.c
    +++ b/clang/test/CodeGen/X86/avx2-builtins.c
    @@ -109,6 +109,9 @@ __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
       return _mm256_alignr_epi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 1, 2, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 17, 18));
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32));
    +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
       // CHECK-LABEL: test2_mm256_alignr_epi8
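The three TEST_CONSTEXPR lines added above pin down the per-lane semantics of _mm256_alignr_epi8: each 128-bit lane reads from concat(a_lane, b_lane) shifted right by imm bytes, so imm == 16 returns a unchanged and imm >= 32 yields all zeros. A scalar model of one lane, with illustrative names that are not part of the patch:

static void alignr_lane_model(const signed char a[16], const signed char b[16],
                              int imm, signed char out[16]) {
  for (int i = 0; i < 16; ++i) {
    int src = i + imm;            // index into the 32-byte concat(b, a)
    if (src < 16)
      out[i] = b[src];            // low 16 bytes come from b
    else if (src < 32)
      out[i] = a[src - 16];       // next 16 bytes come from a
    else
      out[i] = 0;                 // shifted past both operands
  }
}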
    diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
    index be2cd480f7558..2749dc5741b58 100644
    --- a/clang/test/CodeGen/X86/avx512bw-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
    @@ -209,6 +209,10 @@ unsigned char test_kortestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask32_u8(0x0123'4567, 0xFEDC'BA98) == 0);
    +
     unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -222,6 +226,10 @@ unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0000'0000, 0x0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask32_u8(0x0123'4567, 0xFEDC'BA98) == 1);
    +
     unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -242,6 +250,30 @@ unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu16_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_kortest_mask32_u8(unsigned int A, unsigned int B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask32_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
    +void _kortest_mask32_u8() {
    +  constexpr unsigned int A1 = 0x0000'0000;
    +  constexpr unsigned int B1 = 0x0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask32_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000'0000;
    +  constexpr unsigned int B2 = 0x8000'0000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask32_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0x0123'4567;
    +  constexpr unsigned int B3 = 0xFEDC'BA98;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask32_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
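The kortest expectations above, and the packed return of the constexpr helper, reduce to a single OR. A sketch (not the avx512bwintrin.h implementation) of the value the static_asserts check, where bit 4 is the all-zeros (ZF) result and bit 0 the all-ones (CF) result:

static unsigned char kortest32_model(unsigned int a, unsigned int b) {
  unsigned int m = a | b;
  unsigned char zf = (m == 0u);           // _kortestz: OR is all zeros
  unsigned char cf = (m == 0xFFFFFFFFu);  // _kortestc: OR is all ones
  return (unsigned char)((zf << 4) | cf); // same packing as the helper above
}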
     unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestz_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -255,6 +287,10 @@ unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask64_u8(0x0123'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 0);
    +
     unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -268,6 +304,10 @@ unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0023'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 0);
    +TEST_CONSTEXPR(_kortestc_mask64_u8(0x0123'4567'89AB'CDEF, 0xFEDC'BA98'7654'3210) == 1);
    +
     unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -288,6 +328,30 @@ unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu8_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_kortest_mask64_u8(unsigned long long A, unsigned long long B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask64_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
    +void _kortest_mask64_u8() {
    +  constexpr unsigned long long A1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask64_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned long long A2 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B2 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask64_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned long long A3 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned long long B3 = 0xFEDC'BA98'7654'3210;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask64_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -298,6 +362,11 @@ unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0000'0000, 0x8000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0xF000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask32_u8(0x0123'4567, 0x0123'4567) == 0);
    +
     unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -308,6 +377,11 @@ unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu16_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0000'0000, 0x0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0000'0000, 0x8000'0000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0xF000'0000, 0x8000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask32_u8(0x0123'4567, 0x0123'4567) == 1);
    +
     unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask32_u8
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -322,6 +396,34 @@ unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu16_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_ktest_mask32_u8(unsigned int A, unsigned int B) {
    +  unsigned char and_not{};
    +  return (_ktest_mask32_u8(A, B, &and_not) << 4) | and_not;
    +}
    +
    +void _ktest_mask32_u8() {
    +  constexpr unsigned int A1 = 0x0000'0000;
    +  constexpr unsigned int B1 = 0x0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask32_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000'0000;
    +  constexpr unsigned int B2 = 0x8000'0000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask32_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0xF000'0000;
    +  constexpr unsigned int B3 = 0x8000'0000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask32_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned int A4 = 0x0123'4567;
    +  constexpr unsigned int B4 = 0x0123'4567;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask32_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
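The ktest variants differ from kortest by testing AND and ANDN instead of OR: ZF is set when a & b is all zeros, CF when ~a & b is all zeros. A scalar sketch of the packed value asserted above (illustrative only):

static unsigned char ktest32_model(unsigned int a, unsigned int b) {
  unsigned char zf = ((a & b) == 0u);     // _ktestz: AND is all zeros
  unsigned char cf = ((~a & b) == 0u);    // _ktestc: ANDN is all zeros
  return (unsigned char)((zf << 4) | cf);
}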
     unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -332,6 +434,11 @@ unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0xF000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask64_u8(0x0123'4567'89AB'CDEF, 0x0123'4567'89AB'CDEF) == 0);
    +
     unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -342,6 +449,11 @@ unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu8_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0000'0000'0000'0000, 0x0000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0000'0000'0000'0000, 0x8000'0000'0000'0000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0xF000'0000'0000'0000, 0x8000'0000'0000'0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask64_u8(0x0123'4567'89AB'CDEF, 0x0123'4567'89AB'CDEF) == 1);
    +
     unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask64_u8
       // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
    @@ -356,6 +468,34 @@ unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu8_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
    +#if defined(__cplusplus) && (__cplusplus >= 201402L)
    +constexpr unsigned char
    +test_ktest_mask64_u8(unsigned long long A, unsigned long long B) {
    +  unsigned char and_not{};
    +  return (_ktest_mask64_u8(A, B, &and_not) << 4) | and_not;
    +}
    +
    +void _ktest_mask64_u8() {
    +  constexpr unsigned long long A1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B1 = 0x0000'0000'0000'0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask64_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned long long A2 = 0x0000'0000'0000'0000;
    +  constexpr unsigned long long B2 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask64_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned long long A3 = 0xF000'0000'0000'0000;
    +  constexpr unsigned long long B3 = 0x8000'0000'0000'0000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask64_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned long long A4 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned long long B4 = 0x0123'4567'89AB'CDEF;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask64_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     __mmask32 test_kadd_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_kadd_mask32
       // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
    @@ -1065,12 +1205,16 @@ __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_packs_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packs_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-42,0,32767,0,-32768,0,1,0,30000,0,-32768,0,-32768,0,-32768,0,-32768,0,-22222,0,-32768));
    +
     __m512i test_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packs_epi32
       // CHECK: @llvm.x86.avx512.packssdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packs_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,9,-32768,11,-42,13,32767,15,-32768,17,1,19,30000,21,-32768,23,-32768,25,-32768,27,-32768,29,-22222,31,-32768));
    +
     __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packs_epi16
       // CHECK: @llvm.x86.avx512.packsswb.512
    @@ -1083,48 +1227,62 @@ __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packs_epi16((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,17,-128,19,-128,21,-128,23,-90,25,-5,27,-100,29,-128,31,-128,33,-128,35,-1,37,-127,39,-128,41,2,43,127,45,127,47,42,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
    +
     __m512i test_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packs_epi16
       // CHECK: @llvm.x86.avx512.packsswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packs_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-128,0,-128,0,-90,0,-5,0,-100,0,-128,0,-128,0,-128,0,-1,0,-127,0,-128,0,2,0,127,0,127,0,42,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
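Every pack expectation above follows one scalar rule: clamp each element to the narrower signed range, then interleave per 128-bit lane. A sketch of the i16 -> i8 clamp behind the packs_epi16 vectors (the packs_epi32 vectors use the same rule with i32 -> i16 bounds):

static signed char packs_i16_to_i8_model(short v) {
  if (v > 127) return 127;    // saturate toward SCHAR_MAX
  if (v < -128) return -128;  // saturate toward SCHAR_MIN
  return (signed char)v;
}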
     __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       return _mm512_packus_epi32(__A,__B); 
     }
     TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0));
    +
     __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packus_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,-1,0,0,0,1,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
    +
     __m512i test_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packus_epi32
       // CHECK: @llvm.x86.avx512.packusdw.512
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_packus_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packus_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,9,0,11,0,13,-1,15,0,17,1,19,-1,21,0,23,0,25,0,27,0,29,0,31,0));
    +
     __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       return _mm512_packus_epi16(__A,__B); 
     }
     TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
    +
     __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packus_epi16((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,17,42,19,-1,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,127,37,-1,39,0,41,1,43,-1,45,-128,47,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
    +
     __m512i test_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_packus_epi16
       // CHECK: @llvm.x86.avx512.packuswb.512
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packus_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
    +
     __m512i test_mm512_adds_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epi8
       // CHECK: @llvm.sadd.sat.v64i8
    @@ -1138,18 +1296,22 @@ __m512i test_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
      return _mm512_mask_adds_epi8(__W,__U,__A,__B); 
     }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_adds_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,+58,31,+62,33,+66,35,+70,37,+74,39,+78,41,+82,43,+86,45,+90,47,+94,49,+127,51,+127,53,-80,55,-30,57,+30,59,+90,61,+20,63,+10));
    +
     __m512i test_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v64i8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_adds_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,+58,0,+62,0,+66,0,+70,0,+74,0,+78,0,+82,0,+86,0,+90,0,+94,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
    +
     __m512i test_mm512_adds_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epi16
       // CHECK: @llvm.sadd.sat.v32i16
      return _mm512_adds_epi16(__A,__B); 
     }
    -TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +800, -800, -800, +800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200));
    +TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,-32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,-4,+6,-8,+10,-12,+14,-16,+18,-20,+22,-24,+26,-28,+30,-32,+34,-36,+38,-40,+42,-44,+46,-48,+50,-52,+54,+32767,-32768,+31200,-31200));
     
     __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_adds_epi16
    @@ -1157,12 +1319,16 @@ __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_adds_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,-32768,31,+32767));
    +
     __m512i test_mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v32i16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_adds_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,-32768,0,+32767));
    +
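The +32767/-32768 entries in the adds_epi16 vectors above are the saturation cases for the +/-32000 and +/-800 operand pairs; the scalar rule is widen, add, clamp:

static short adds_epi16_model(short a, short b) {
  int sum = (int)a + (int)b;      // widen so the sum cannot wrap
  if (sum > 32767) return 32767;
  if (sum < -32768) return -32768;
  return (short)sum;
}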
     __m512i test_mm512_adds_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
    @@ -1178,7 +1344,7 @@ __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_adds_epu8(__W,__U,__A,__B); 
     }
    -TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
    +TEST_CONSTEXPR(match_v64qu(_mm512_mask_adds_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,17,+127,19,+191,21,+255,23,+255,25,+190,27,+254,29,+255,31,+255,33,+191,35,+255,37,+255,39,+255,41,+254,43,+255,45,+255,47,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
     
     __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epu8
    @@ -1187,12 +1353,16 @@ __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_adds_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+127,0,+191,0,+255,0,+255,0,+190,0,+254,0,+255,0,+255,0,+191,0,+255,0,+255,0,+255,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m512i test_mm512_adds_epu16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
       // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
       return _mm512_adds_epu16(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
    +
     __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
    @@ -1200,6 +1370,8 @@ __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_mask_adds_epu16((__m512i)(__v32hu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,9,+65535,11,+49151,13,+65535,15,+65535,17,+49152,19,+65535,21,+65535,23,+65535,25,+65535,27,+65535,29,+65535,31,+65535));
    +
     __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
    @@ -1207,6 +1379,8 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_adds_epu16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+49151,0,+65535,0,+65535,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535));
    +
     __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_avg_epu8
       // CHECK: @llvm.x86.avx512.pavg.b.512
    @@ -1500,12 +1674,16 @@ __m512i test_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_subs_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,17,-128,19,0,21,-44,23,127,25,-128,27,0,29,-60,31,127,33,-128,35,0,37,-76,39,127,41,-128,43,0,45,-92,47,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
    +
     __m512i test_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v64i8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_subs_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-44,0,127,0,-128,0,0,0,-60,0,127,0,-128,0,0,0,-76,0,127,0,-128,0,0,0,-92,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m512i test_mm512_subs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epi16
       // CHECK: @llvm.ssub.sat.v32i16
    @@ -1518,18 +1696,24 @@ __m512i test_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_subs_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,9,-32768,11,32767,13,-28,15,32,17,-32768,19,32767,21,-44,23,48,25,-32768,27,32767,29,-60,31,64));
    +
     __m512i test_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v32i16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_subs_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-28,0,32,0,-32768,0,32767,0,-44,0,48,0,-32768,0,32767,0,-60,0,64));
    +
     __m512i test_mm512_subs_epu8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
       // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
     return _mm512_subs_epu8(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,0,0,0,0,0,0,0,+63,0,0,0,0,0,0,0,+64,+1,0,0,0,0,0,0,+127,+64,+63,0,0,0,0,0,+128,+65,+64,+1,0,0,0,0,+191,+128,+127,+64,+63,0,0,0,+192,+129,+128,+65,+64,+1,0,0,+255,+192,+191,+128,+127,+64,+63,+0));
    +
     __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
    @@ -1537,7 +1721,7 @@ __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_mask_subs_epu8(__W,__U,__A,__B); 
     }
    -TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, 0, 0, 0, 0, 0, 0, 0, +63, 0, 0, 0, 0, 0, 0, 0, +64, +1, 0, 0, 0, 0, 0, 0, +127, +64, +63, 0, 0, 0, 0, 0, +128, +65, +64, +1, 0, 0, 0, 0, +191, +128, +127, +64, +63, 0, 0, 0, +192, +129, +128, +65, +64, +1, 0, 0, +255, +192, +191, +128, +127, +64, +63, +0));
    +TEST_CONSTEXPR(match_v64qu(_mm512_mask_subs_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,17,200,19,0,21,0,23,254,25,0,27,1,29,1,31,0,33,200,35,0,37,0,39,254,41,0,43,1,45,1,47,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
     
     __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epu8
    @@ -1546,20 +1730,25 @@ __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_subs_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m512i test_mm512_subs_epu16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
       // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
     return _mm512_subs_epu16(__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
    +
     __m512i test_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
       // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_mask_subs_epu16(__W,__U,__A,__B); 
    -TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_mask_subs_epu16((__m512i)(__v32hu){101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000,0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000,0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,109,0,111,0,113,1,115,25000,117,60000,119,0,121,0,123,65534,125,0,127,0,129,1,131,25000));
    +
     __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
    @@ -1567,6 +1756,8 @@ __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
     return _mm512_maskz_subs_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_subs_epu16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){51,65000,0,40000,0,100,0,65535,42,0,0,1000,0,1,0,50000,69,65000,0,40000,0,100,0,65535,71,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){2652,5000,0,40000,0,200,0,1,398,1,0,65535,0,0,0,25000,29625,5000,0,40000,0,200,0,1,25274,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000,0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
    +
     __m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask2_permutex2var_epi16
       // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
    @@ -1591,6 +1782,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i
       return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); 
     }
     
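+// permutex2var indices select from the concatenation {__A, __B}: for 32 x i16 lanes, index bit 5 picks __B; for 64 x i8 lanes, bit 6 picks __B. The low bits pick the element.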
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 400, 10, 410, 20, 420, 30, 430,
    +    40, 440, 50, 450, 60, 460, 70, 470,
    +    80, 480, 90, 490, 100, 500, 110, 510,
    +    120, 520, 130, 530, 140, 540, 150, 550));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            -1, -2, -3, -4, -5, -6, -7, -8,
    +            -9, -10, -11, -12, -13, -14, -15, -16,
    +            -17, -18, -19, -20, -21, -22, -23, -24,
    +            -25, -26, -27, -28, -29, -30, -31, -32},
    +        0xAAAAAAAA,
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    -1, 400, -3, 410, -5, 420, -7, 430,
    +    -9, 440, -11, 450, -13, 460, -15, 470,
    +    -17, 480, -19, 490, -21, 500, -23, 510,
    +    -25, 520, -27, 530, -29, 540, -31, 550));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_maskz_permutex2var_epi16(
    +        0x55555555,
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 0, 10, 0, 20, 0, 30, 0,
    +    40, 0, 50, 0, 60, 0, 70, 0,
    +    80, 0, 90, 0, 100, 0, 110, 0,
    +    120, 0, 130, 0, 140, 0, 150, 0));
    +
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_permutex2var_epi8(
    +        (__m512i)(__v64qu){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 127, 126, 125,
    +            124, 123, 122, 121, 120, 119, 118, 117,
    +            116, 115, 114, 113, 112, 111, 110, 109,
    +            108, 107, 106, 105, 104, 103, 102, 101,
    +            100, 99, 98, 97, 96, 95, 94, 93,
    +            92, 91, 90, 89, 88, 87, 86, 85,
    +            84, 83, 82, 81, 80, 79, 78, 77},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 210, 220, 230, 240, 250, 254, 253,
    +            252, 251, 250, 249, 248, 247, 246, 245,
    +            244, 243, 242, 241, 240, 239, 238, 237,
    +            236, 235, 234, 233, 232, 231, 230, 229,
    +            228, 227, 226, 225, 224, 223, 222, 221,
    +            220, 219, 218, 217, 216, 215, 214, 213,
    +            212, 211, 210, 209, 208, 207, 206, 205,
    +            204, 203, 202, 201, 200, 199, 198, 197}),
    +    0, 200, 10, 210, 20, 220, 30, 230,
    +    40, 240, 50, 250, 60, 254, 70, 253,
    +    80, 252, 90, 251, 100, 250, 110, 249,
    +    120, 248, 127, 247, 126, 246, 125, 245,
    +    124, 244, 123, 243, 122, 242, 121, 241,
    +    120, 240, 119, 239, 118, 238, 117, 237,
    +    116, 236, 115, 235, 114, 234, 113, 233,
    +    112, 232, 111, 231, 110, 230, 109, 229));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask2_permutex2var_epi16(
    +        (__m512i)(__v32hi){
    +            0, 10, 20, 30, 40, 50, 60, 70,
    +            80, 90, 100, 110, 120, 130, 140, 150,
    +            160, 170, 180, 190, 200, 210, 220, 230,
    +            240, 250, 260, 270, 280, 290, 300, 310},
    +        (__m512i)(__v32hi){
    +            0, 32, 1, 33, 2, 34, 3, 35,
    +            4, 36, 5, 37, 6, 38, 7, 39,
    +            8, 40, 9, 41, 10, 42, 11, 43,
    +            12, 44, 13, 45, 14, 46, 15, 47},
    +        0x55555555,
    +        (__m512i)(__v32hi){
    +            400, 410, 420, 430, 440, 450, 460, 470,
    +            480, 490, 500, 510, 520, 530, 540, 550,
    +            560, 570, 580, 590, 600, 610, 620, 630,
    +            640, 650, 660, 670, 680, 690, 700, 710}),
    +    0, 32, 10, 33, 20, 34, 30, 35,
    +    40, 36, 50, 37, 60, 38, 70, 39,
    +    80, 40, 90, 41, 100, 42, 110, 43,
    +    120, 44, 130, 45, 140, 46, 150, 47));
    +
     __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_mulhrs_epi16
       // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
    @@ -1775,6 +2092,7 @@ __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
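+// unpackhi interleaves the high half of each 128-bit lane of __A and __B; unpacklo interleaves the low half.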
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpackhi_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,17,-25,19,-26,21,-27,23,-28,25,-29,27,-30,29,-31,31,-32,33,-41,35,-42,37,-43,39,-44,41,-45,43,-46,45,-47,47,-48,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
     
     __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpackhi_epi8
    @@ -1782,6 +2100,7 @@ __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpackhi_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-25,0,-26,0,-27,0,-28,0,-29,0,-30,0,-31,0,-32,0,-41,0,-42,0,-43,0,-44,0,-45,0,-46,0,-47,0,-48,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
     
     __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpackhi_epi16
    @@ -1797,6 +2116,7 @@ __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpackhi_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,9,214,11,215,13,216,15,217,17,224,19,225,21,226,23,227,25,234,27,235,136,236,137,237));
     
     __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpackhi_epi16
    @@ -1804,6 +2124,7 @@ __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpackhi_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,214,0,215,0,216,0,217,0,224,0,225,0,226,0,227,0,234,0,235,136,236,137,237));
     
     __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpacklo_epi8
    @@ -1818,6 +2139,7 @@ __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpacklo_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,17,20,19,21,21,22,23,23,25,24,27,25,29,26,31,27,33,40,35,41,37,42,39,43,41,44,43,45,45,46,47,47,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
     
     __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpacklo_epi8
    @@ -1825,6 +2147,7 @@ __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpacklo_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
     
     __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_unpacklo_epi16
    @@ -1839,6 +2162,7 @@ __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A,
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpacklo_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,9,210,11,211,13,212,15,213,17,220,19,221,21,222,23,223,25,230,27,231,132,232,133,233));
     
     __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_maskz_unpacklo_epi16
    @@ -1846,6 +2170,7 @@ __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpacklo_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,210,0,211,0,212,0,213,0,220,0,221,0,222,0,223,0,230,0,231,132,232,133,233));
     
     __m512i test_mm512_cvtepi8_epi16(__m256i __A) {
       // CHECK-LABEL: test_mm512_cvtepi8_epi16
    @@ -2233,24 +2558,28 @@ __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_mov_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_mov_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31));
     
     __m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_maskz_mov_epi16
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_mov_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mov_epi16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31));
     
     __m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_mask_mov_epi8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_mov_epi8(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_mov_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31,32,-33,34,-35,36,-37,38,-39,40,-41,42,-43,44,-45,46,-47,48,-49,50,-51,52,-53,54,-55,56,-57,58,-59,60,-61,62,-63));
     
     __m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
       // CHECK-LABEL: test_mm512_maskz_mov_epi8
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_mov_epi8(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_mov_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31,0,-33,0,-35,0,-37,0,-39,0,-41,0,-43,0,-45,0,-47,0,-49,0,-51,0,-53,0,-55,0,-57,0,-59,0,-61,0,-63));
     
     __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
       // CHECK-LABEL: test_mm512_mask_set1_epi8
    @@ -2319,6 +2648,7 @@ __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_set1_epi8(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_set1_epi8((__m512i)(__v64qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,17,42,19,42,21,42,23,42,25,42,27,42,29,42,31,42,33,42,35,42,37,42,39,42,41,42,43,42,45,42,47,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
     
     __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       // CHECK-LABEL: test_mm512_maskz_set1_epi8
    @@ -2389,6 +2719,7 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_set1_epi8(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_set1_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_mm512_kunpackd
    @@ -2564,6 +2895,7 @@ __m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A)
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_mask_broadcastb_epi8(__O, __M, __A);
     }
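+// broadcastb replicates byte 0 of the __m128i source into all 64 lanes; broadcastw does the same with word 0.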
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_broadcastb_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120,32,-120,34,-120,36,-120,38,-120,40,-120,42,-120,44,-120,46,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_maskz_broadcastb_epi8
    @@ -2571,6 +2903,7 @@ __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
       // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
       return _mm512_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_broadcastb_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m512i test_mm512_broadcastw_epi16(__m128i __A) {
       // CHECK-LABEL: test_mm512_broadcastw_epi16
    @@ -2578,6 +2911,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) {
       return _mm512_broadcastw_epi16(__A);
     }
     TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    +                                                  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
    +                              (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
    +                                                  3, 35, 4, 36, 5, 37, 6, 38,
    +                                                  7, 39, 8, 40, 9, 41, 10, 42,
    +                                                  11, 43, 12, 44, 13, 45, 14, 46},
    +                              (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
    +                                                  117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
    +    1, 32, 101, 132, 2, 102, 3, 103,
    +    4, 104, 5, 105, 6, 106, 7, 107,
    +    8, 108, 9, 109, 10, 110, 11, 111,
    +    12, 112, 13, 113, 14, 114, 15, 115));
    +TEST_CONSTEXPR(match_v32hi(
    +    _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
    +                                                        -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32},
    +                                    0xAAAAAAAA,
    +                                    (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
    +                                                        3, 35, 4, 36, 5, 37, 6, 38,
    +                                                        7, 39, 8, 40, 9, 41, 10, 42,
    +                                                        11, 43, 12, 44, 13, 45, 14, 46},
    +                                    (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
    +                                                        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
    +    -1, -32, -3, 132, -5, 102, -7, 103,
    +    -9, 104, -11, 105, -13, 106, -15, 107,
    +    -17, 108, -19, 109, -21, 110, -23, 111,
    +    -25, 112, -27, 113, -29, 114, -31, 115));
     
     __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_mask_broadcastw_epi16
    @@ -2585,6 +2945,7 @@ __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_broadcastw_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120));
     
     __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm512_maskz_broadcastw_epi16
    @@ -2592,6 +2953,7 @@ __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_broadcastw_epi16((__mmask32)0xAAAAAAAAu,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
       // CHECK-LABEL: test_mm512_mask_set1_epi16
    @@ -2630,6 +2992,7 @@ __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_mask_set1_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,-1),1,-1,3,-1,5,-1,7,-1,9,-1,11,-1,13,-1,15,-1,17,-1,19,-1,21,-1,23,-1,25,-1,27,-1,29,-1,31,-1));
     
     __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       // CHECK-LABEL: test_mm512_maskz_set1_epi16
    @@ -2668,6 +3031,8 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
       return _mm512_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_set1_epi16((__mmask32)0xAAAAAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
    +
     __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_permutexvar_epi16
       // CHECK: @llvm.x86.avx512.permvar.hi.512
    @@ -2692,6 +3057,9 @@ __m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
         // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> 
         return _mm512_alignr_epi8(__A, __B, 2); 
     }
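+// alignr concatenates each 128-bit lane of __A above the matching lane of __B and shifts right by the byte count; a count of 16 returns __A unchanged and counts >= 32 return zero.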
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 1, 2, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 17, 18, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 33, 34, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 49, 50));
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64));
    +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){
         // CHECK-LABEL: test_mm512_mask_alignr_epi8
    @@ -2699,6 +3067,7 @@ __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m5
         // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
         return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v64qi(_mm512_mask_alignr_epi8(((__m512i)(__v64qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127));
     
     __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
         // CHECK-LABEL: test_mm512_maskz_alignr_epi8
    @@ -2706,6 +3075,7 @@ __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
         // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
        return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     
     
    diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
    index 9c4ada3a2b7b8..efe983ce5b10c 100644
    --- a/clang/test/CodeGen/X86/avx512dq-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
    @@ -117,6 +117,10 @@ unsigned char test_kortestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
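+// kortestz reports whether (A | B) is all zeros; kortestc reports whether (A | B) is all ones.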
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_kortestz_mask8_u8(0x01, 0xFE) == 0);
    +
     unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -130,6 +134,10 @@ unsigned char test_kortestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x00, 0x00) == 0);
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_kortestc_mask8_u8(0x01, 0xFE) == 1);
    +
     unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -150,6 +158,30 @@ unsigned char test_kortest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu64_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
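+// The wrapper packs ZF into bits 7:4 and CF into bits 3:0 so a single static_assert checks both kortest flags.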
+#if defined(__cplusplus) && (__cplusplus >= 201703L) // single-argument static_assert requires C++17
    +constexpr unsigned char
    +test_kortest_mask8_u8(unsigned char A, unsigned char B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask8_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_kortest_mask8_u8_constexpr() {
    +  constexpr unsigned char A1 = 0x00;
    +  constexpr unsigned char B1 = 0x00;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask8_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned char A2 = 0x00;
    +  constexpr unsigned char B2 = 0x80;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask8_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned char A3 = 0x01;
    +  constexpr unsigned char B3 = 0xFE;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask8_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -160,6 +192,11 @@ unsigned char test_ktestz_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
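+// ktestz reports whether (A & B) is all zeros; ktestc reports whether (~A & B) is all zeros.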
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x00, 0x80) == 1);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0xF0, 0x80) == 0);
    +TEST_CONSTEXPR(_ktestz_mask8_u8(0x01, 0x01) == 0);
    +
     unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -170,6 +207,11 @@ unsigned char test_ktestc_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu64_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x00, 0x00) == 1);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x00, 0x80) == 0);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0xF0, 0x80) == 1);
    +TEST_CONSTEXPR(_ktestc_mask8_u8(0x01, 0x01) == 1);
    +
     unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask8_u8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    @@ -184,6 +226,34 @@ unsigned char test_ktest_mask8_u8(__m512i __A, __m512i __B, __m512i __C, __m512i
                              _mm512_cmpneq_epu64_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
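+// Same ZF/CF packing as the kortest wrapper above.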
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
    +constexpr unsigned char
    +test_ktest_mask8_u8(unsigned char A, unsigned char B) {
    +  unsigned char all_ones{};
    +  return (_ktest_mask8_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_ktest_mask8_u8_constexpr() {
    +  constexpr unsigned char A1 = 0x00;
    +  constexpr unsigned char B1 = 0x00;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask8_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned char A2 = 0x00;
    +  constexpr unsigned char B2 = 0x80;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask8_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned char A3 = 0xF0;
    +  constexpr unsigned char B3 = 0x80;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask8_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned char A4 = 0x01;
    +  constexpr unsigned char B4 = 0x01;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask8_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestz_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -194,6 +264,11 @@ unsigned char test_ktestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0000, 0x8000) == 1);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0xF000, 0x8000) == 0);
    +TEST_CONSTEXPR(_ktestz_mask16_u8(0x0123, 0x0123) == 0);
    +
     unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_ktestc_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -204,6 +279,11 @@ unsigned char test_ktestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m51
                                _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0xF000, 0x8000) == 1);
    +TEST_CONSTEXPR(_ktestc_mask16_u8(0x0123, 0x0123) == 1);
    +
     unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_ktest_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -218,6 +298,34 @@ unsigned char test_ktest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512
                               _mm512_cmpneq_epu32_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
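+// 16-bit variant of the ktest wrapper above, with the same ZF/CF packing.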
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
    +constexpr unsigned char
+test_ktest_mask16_u8(unsigned short A, unsigned short B) {
    +  unsigned char all_ones{};
    +  return (_ktest_mask16_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_ktest_mask16_u8_constexpr() {
    +  constexpr unsigned int A1 = 0x0000;
    +  constexpr unsigned int B1 = 0x0000;
    +  constexpr unsigned char expected_result_1 = 0x11;
    +  static_assert(test_ktest_mask16_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned int A2 = 0x0000;
    +  constexpr unsigned int B2 = 0x8000;
    +  constexpr unsigned char expected_result_2 = 0x10;
    +  static_assert(test_ktest_mask16_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned int A3 = 0xF000;
    +  constexpr unsigned int B3 = 0x8000;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_ktest_mask16_u8(A3, B3) == expected_result_3);
    +  constexpr unsigned int A4 = 0x0123;
    +  constexpr unsigned int B4 = 0x0123;
    +  constexpr unsigned char expected_result_4 = 0x01;
    +  static_assert(test_ktest_mask16_u8(A4, B4) == expected_result_4);
    +}
    +#endif
    +
     __mmask8 test_kadd_mask8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_kadd_mask8
       // CHECK: [[LHS:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
    diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
    index 69599379b6b3d..17778b52d3671 100644
    --- a/clang/test/CodeGen/X86/avx512f-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512f-builtins.c
    @@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _
       // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
       return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
     }
    +
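+// permutex2var indices select from the 32-element concatenation {__A, __B}: for 16 x i32 lanes, index bit 4 picks __B and bits 3:0 pick the element.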
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_permutex2var_epi32(
    +        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 150, 200, 350, 10, 210, 20, 220,
    +    30, 230, 40, 240, 50, 250, 60, 260));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_mask_permutex2var_epi32(
    +        (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8,
    +                           -9, -10, -11, -12, -13, -14, -15, -16},
    +        0xAAAA,
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    -1, -16, -3, 350, -5, 210, -7, 220,
    +    -9, 230, -11, 240, -13, 250, -15, 260));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_maskz_permutex2var_epi32(
    +        0x5555,
    +        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 0, 200, 0, 10, 0, 20, 0,
    +    30, 0, 40, 0, 50, 0, 60, 0));
    +
     __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_testn_epi32_mask
       // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
    @@ -8965,6 +9015,10 @@ int test_mm512_kortestc(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
                              _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_mm512_kortestc(0x0000, 0x0000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestc(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestc(0x0123, 0xFEDC) == 1);
    +
     int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_mm512_kortestz
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -8977,6 +9031,10 @@ int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
                              _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_mm512_kortestz(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_mm512_kortestz(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_mm512_kortestz(0x0123, 0xFEDC) == 0);
    +
     unsigned char test_kortestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestz_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -8990,6 +9048,10 @@ unsigned char test_kortestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0000, 0x0000) == 1);
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_kortestz_mask16_u8(0x0123, 0xFEDC) == 0);
    +
     unsigned char test_kortestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
       // CHECK-LABEL: test_kortestc_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -9003,6 +9065,10 @@ unsigned char test_kortestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m
                                  _mm512_cmpneq_epu32_mask(__C, __D));
     }
     
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0000, 0x0000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0000, 0x8000) == 0);
    +TEST_CONSTEXPR(_kortestc_mask16_u8(0x0123, 0xFEDC) == 1);
    +
     unsigned char test_kortest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
       // CHECK-LABEL: test_kortest_mask16_u8
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -9023,6 +9089,30 @@ unsigned char test_kortest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m5
                                 _mm512_cmpneq_epu32_mask(__C, __D), CF);
     }
     
    +// Test constexpr handling.
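+// The wrapper packs ZF into bits 7:4 and CF into bits 3:0 so a single static_assert checks both kortest flags.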
+#if defined(__cplusplus) && (__cplusplus >= 201703L) // single-argument static_assert requires C++17
    +constexpr unsigned char
    +test_kortest_mask16_u8(unsigned short A, unsigned short B) {
    +  unsigned char all_ones{};
    +  return (_kortest_mask16_u8(A, B, &all_ones) << 4) | all_ones;
    +}
    +
+void test_kortest_mask16_u8_constexpr() {
    +  constexpr unsigned short A1 = 0x0000;
    +  constexpr unsigned short B1 = 0x0000;
    +  constexpr unsigned char expected_result_1 = 0x10;
    +  static_assert(test_kortest_mask16_u8(A1, B1) == expected_result_1);
    +  constexpr unsigned short A2 = 0x0000;
    +  constexpr unsigned short B2 = 0x8000;
    +  constexpr unsigned char expected_result_2 = 0x00;
    +  static_assert(test_kortest_mask16_u8(A2, B2) == expected_result_2);
    +  constexpr unsigned short A3 = 0x0123;
    +  constexpr unsigned short B3 = 0xFEDC;
    +  constexpr unsigned char expected_result_3 = 0x01;
    +  static_assert(test_kortest_mask16_u8(A3, B3) == expected_result_3);
    +}
    +#endif
    +
     __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
       // CHECK-LABEL: test_mm512_kunpackb
       // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
    @@ -11753,3 +11843,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _
       // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
       _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2);
     }
+
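+// permutex2var index bit 3 picks __B for 8 x i64 (and pd) lanes; bit 4 picks __B for 16 x i32 (and ps) lanes.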
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
    +                           (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                           (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
    +                                 0xAA,
    +                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0));
    +TEST_CONSTEXPR(match_m512d(
    +    _mm512_maskz_permutex2var_pd(0x55, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
    +                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
    +    1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0));
    +
    +TEST_CONSTEXPR(match_m512(
    +    _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
    +                                     9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
    +                           (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                               3, 19, 4, 20, 5, 21, 6, 22},
    +                           (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
    +                                    109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
    +    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
    +    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
    +TEST_CONSTEXPR(match_m512(
    +    _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f,
    +                                          -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f},
    +                                0xAAAA,
    +                                (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                    3, 19, 4, 20, 5, 21, 6, 22},
    +                                (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
    +                                         109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
    +    -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f,
    +    -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f));
    +
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                  9, 10, 11, 12, 13, 14, 15, 16},
    +                              (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                  3, 19, 4, 20, 5, 21, 6, 22},
    +                              (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                  109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 16, 101, 116, 2, 102, 3, 103,
    +    4, 104, 5, 105, 6, 106, 7, 107));
    +TEST_CONSTEXPR(match_v16si(
    +    _mm512_maskz_permutex2var_epi32(0x5555,
    +                                     (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                         9, 10, 11, 12, 13, 14, 15, 16},
    +                                     (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
    +                                                         3, 19, 4, 20, 5, 21, 6, 22},
    +                                     (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                         109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 0, 101, 0, 2, 0, 3, 0,
    +    4, 0, 5, 0, 6, 0, 7, 0));
    +
    +TEST_CONSTEXPR(match_v8di(
    +    _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8},
    +                              (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                              (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}),
    +    1, 108, 1, 8, 2, 2, 3, 3));
    +TEST_CONSTEXPR(match_v8di(
    +    _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                    0xAA,
    +                                    (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
    +                                    (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}),
    +    -1, 108, -3, -8, -5, -2, -7, -3));
    diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    index c3b6298a39b59..7d506db92faeb 100644
    --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c
    @@ -3,8 +3,14 @@
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s
     
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +
     
     #include <immintrin.h>
    +#include "builtin_test_helpers.h"
     
     __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) {
       // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8
    @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _
       return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); 
     }
     
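    +// 64-lane byte permute: index bits [5:0] select the lane and bit 6 selects
    +// operand A (0) or B (1).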
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_permutex2var_epi8((__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 200, 1, 201, 2, 202, 3, 203,
    +    4, 204, 5, 205, 6, 206, 7, 207,
    +    8, 208, 9, 209, 10, 210, 11, 211,
    +    12, 212, 13, 213, 14, 214, 15, 215,
    +    16, 216, 17, 217, 18, 218, 19, 219,
    +    20, 220, 21, 221, 22, 222, 23, 223,
    +    24, 224, 25, 225, 26, 226, 27, 227,
    +    28, 228, 29, 229, 30, 230, 31, 231));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){
    +            10, 11, 12, 13, 14, 15, 16, 17,
    +            18, 19, 20, 21, 22, 23, 24, 25,
    +            26, 27, 28, 29, 30, 31, 32, 33,
    +            34, 35, 36, 37, 38, 39, 40, 41,
    +            42, 43, 44, 45, 46, 47, 48, 49,
    +            50, 51, 52, 53, 54, 55, 56, 57,
    +            58, 59, 60, 61, 62, 63, 64, 65,
    +            66, 67, 68, 69, 70, 71, 72, 73},
    +        0xAAAAAAAAAAAAAAAAULL,
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    10, 200, 12, 201, 14, 202, 16, 203,
    +    18, 204, 20, 205, 22, 206, 24, 207,
    +    26, 208, 28, 209, 30, 210, 32, 211,
    +    34, 212, 36, 213, 38, 214, 40, 215,
    +    42, 216, 44, 217, 46, 218, 48, 219,
    +    50, 220, 52, 221, 54, 222, 56, 223,
    +    58, 224, 60, 225, 62, 226, 64, 227,
    +    66, 228, 68, 229, 70, 230, 72, 231));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL,
    +        (__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 0, 1, 0, 2, 0, 3, 0,
    +    4, 0, 5, 0, 6, 0, 7, 0,
    +    8, 0, 9, 0, 10, 0, 11, 0,
    +    12, 0, 13, 0, 14, 0, 15, 0,
    +    16, 0, 17, 0, 18, 0, 19, 0,
    +    20, 0, 21, 0, 22, 0, 23, 0,
    +    24, 0, 25, 0, 26, 0, 27, 0,
    +    28, 0, 29, 0, 30, 0, 31, 0));
    +TEST_CONSTEXPR(match_v64qu(
    +    _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){
    +            0, 1, 2, 3, 4, 5, 6, 7,
    +            8, 9, 10, 11, 12, 13, 14, 15,
    +            16, 17, 18, 19, 20, 21, 22, 23,
    +            24, 25, 26, 27, 28, 29, 30, 31,
    +            32, 33, 34, 35, 36, 37, 38, 39,
    +            40, 41, 42, 43, 44, 45, 46, 47,
    +            48, 49, 50, 51, 52, 53, 54, 55,
    +            56, 57, 58, 59, 60, 61, 62, 63},
    +        (__m512i)(__v64qu){
    +            0, 64, 1, 65, 2, 66, 3, 67,
    +            4, 68, 5, 69, 6, 70, 7, 71,
    +            8, 72, 9, 73, 10, 74, 11, 75,
    +            12, 76, 13, 77, 14, 78, 15, 79,
    +            16, 80, 17, 81, 18, 82, 19, 83,
    +            20, 84, 21, 85, 22, 86, 23, 87,
    +            24, 88, 25, 89, 26, 90, 27, 91,
    +            28, 92, 29, 93, 30, 94, 31, 95},
    +        0x5555555555555555ULL,
    +        (__m512i)(__v64qu){
    +            200, 201, 202, 203, 204, 205, 206, 207,
    +            208, 209, 210, 211, 212, 213, 214, 215,
    +            216, 217, 218, 219, 220, 221, 222, 223,
    +            224, 225, 226, 227, 228, 229, 230, 231,
    +            232, 233, 234, 235, 236, 237, 238, 239,
    +            240, 241, 242, 243, 244, 245, 246, 247,
    +            248, 249, 250, 251, 252, 253, 254, 255,
    +            0, 1, 2, 3, 4, 5, 6, 7}),
    +    0, 64, 1, 65, 2, 66, 3, 67,
    +    4, 68, 5, 69, 6, 70, 7, 71,
    +    8, 72, 9, 73, 10, 74, 11, 75,
    +    12, 76, 13, 77, 14, 78, 15, 79,
    +    16, 80, 17, 81, 18, 82, 19, 83,
    +    20, 84, 21, 85, 22, 86, 23, 87,
    +    24, 88, 25, 89, 26, 90, 27, 91,
    +    28, 92, 29, 93, 30, 94, 31, 95));
    +
     __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
       // CHECK-LABEL: test_mm512_permutexvar_epi8
       // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
    diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    index c4d5fc8fb6977..49b7a1a721195 100644
    --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c
    @@ -3,8 +3,14 @@
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
     // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s
     
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
    +
     
     #include <immintrin.h>
    +#include "builtin_test_helpers.h"
     
     __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_permutexvar_epi8
    @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi8
       // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
    -  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); 
    -}
    +  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B);
    +}
    +
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8,
    +                                             9, 10, 11, 12, 13, 14, 15, 16},
    +                         (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                             4, 20, 5, 21, 6, 22, 7, 23},
    +                         (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                            109, 110, 111, 112, 113, 114, 115, 116}),
    +    1, 101, 2, 102, 3, 103, 4, 104,
    +    5, 105, 6, 106, 7, 107, 8, 108));
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207,
    +                                                   208, 209, 210, 211, 212, 213, 214, 215},
    +                               0xAAAA,
    +                               (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                                   4, 20, 5, 21, 6, 22, 7, 23},
    +                               (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                  109, 110, 111, 112, 113, 114, 115, 116}),
    +    200, 101, 202, 102, 204, 103, 206, 104,
    +    208, 105, 210, 106, 212, 107, 214, 108));
     
     __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi8
    @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8
       // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
    -  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); 
    -}
    +  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B);
    +}
    +
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8,
    +                                                 9, 10, 11, 12, 13, 14, 15, 16,
    +                                                 17, 18, 19, 20, 21, 22, 23, 24,
    +                                                 25, 26, 27, 28, 29, 30, 31, 32},
    +                             (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35,
    +                                                 4, 36, 5, 37, 6, 38, 7, 39,
    +                                                 8, 40, 9, 41, 10, 42, 11, 43,
    +                                                 12, 44, 13, 45, 14, 46, 15, 47},
    +                             (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                109, 110, 111, 112, 113, 114, 115, 116,
    +                                                117, 118, 119, 120, 121, 122, 123, 124,
    +                                                125, 126, 127, 128, 129, 130, 131, 132}),
    +    1, 101, 2, 102, 3, 103, 4, 104,
    +    5, 105, 6, 106, 7, 107, 8, 108,
    +    9, 109, 10, 110, 11, 111, 12, 112,
    +    13, 113, 14, 114, 15, 115, 16, 116));
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207,
    +                                                      208, 209, 210, 211, 212, 213, 214, 215,
    +                                                      216, 217, 218, 219, 220, 221, 222, 223,
    +                                                      224, 225, 226, 227, 228, 229, 230, 231},
    +                                  0xAAAAAAAA,
    +                                  (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35,
    +                                                      4, 36, 5, 37, 6, 38, 7, 39,
    +                                                      8, 40, 9, 41, 10, 42, 11, 43,
    +                                                      12, 44, 13, 45, 14, 46, 15, 47},
    +                                  (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108,
    +                                                     109, 110, 111, 112, 113, 114, 115, 116,
    +                                                     117, 118, 119, 120, 121, 122, 123, 124,
    +                                                     125, 126, 127, 128, 129, 130, 131, 132}),
    +    200, 101, 202, 102, 204, 103, 206, 104,
    +    208, 105, 210, 106, 212, 107, 214, 108,
    +    216, 109, 218, 110, 220, 111, 222, 112,
    +    224, 113, 226, 114, 228, 115, 230, 116));
     
     __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) {
       // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8
    diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
    index 33c43977f72dc..121d5bf8d4adb 100644
    --- a/clang/test/CodeGen/X86/avx512vl-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
    @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); 
     }
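    +// The mask2 variants differ from mask/maskz: lanes whose mask bit is clear
    +// pass through the index operand __I rather than __A.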
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40},
    +                                 (__m128i)(__v4si){0, 3, 4, 6}, 0x05,
    +                                 (__m128i)(__v4si){100, 200, 300, 400}),
    +    10, 3, 100, 6));
     __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                                    (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                    0xA5,
    +                                    (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 7, 100, 15, 1, 110, 2, 120));
     __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
       // CHECK-LABEL: test_mm_mask2_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
    @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20},
    +                                 (__m128i)(__v2di){0, 5}, 0x1,
    +                                 (__m128i)(__v2di){100, 200}),
    +    10, 5));
     __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30},
    +                                    (__m256i)(__v4di){0, 1, 4, 5}, 0x5,
    +                                    (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 1, 100, 5));
     __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       return _mm_permutex2var_epi32(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40},
    +                           (__m128i)(__v4si){0, 3, 4, 6},
    +                           (__m128i)(__v4si){100, 200, 300, 400}),
    +    10, 40, 100, 300));
     __m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A,
    +                                (__m128i)(__v4si){0, 3, 4, 6},
    +                                (__m128i)(__v4si){100, 200, 300, 400}),
    +    -1, -4, -3, 300));
     __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,  __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
       return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4si(
    +    _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40},
    +                                 (__m128i)(__v4si){0, 3, 4, 6},
    +                                 (__m128i)(__v4si){100, 200, 300, 400}),
    +    0, 40, 0, 300));
     __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       return _mm256_permutex2var_epi32(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 70, 100, 170, 10, 110, 20, 120));
     __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA,
    +                                   (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                   (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    -1, -8, -3, 170, -5, 110, -7, 120));
     __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32
       // CHECK: @llvm.x86.avx512.vpermi2var.d.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
       return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70},
    +                                  (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                  (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}),
    +    0, 70, 0, 170, 0, 110, 0, 120));
     __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       return _mm_permutex2var_pd(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    1.0, 10.0));
     __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
       return _mm_mask_permutex2var_pd(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    -1.0, 10.0));
     __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
       return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128d(
    +    _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}),
    +    0.0, 10.0));
     __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
       // CHECK-LABEL: test_mm256_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       return _mm256_permutex2var_pd(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    1.0, 10.0, 2.0, 20.0));
     __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
       return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    -1.0, 10.0, -3.0, -4.0));
     __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,  __m256d __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_pd
       // CHECK: @llvm.x86.avx512.vpermi2var.pd.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
       return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256d(
    +    _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}),
    +    0.0, 10.0, 0.0, 0.0));
     __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       return _mm_permutex2var_ps(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    1.f, 4.f, 10.f, 30.f));
     __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
       return _mm_mask_permutex2var_ps(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    -1.f, -4.f, -3.f, 30.f));
     __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.128
       // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
       return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m128(
    +    _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}),
    +    0.f, 4.f, 0.f, 30.f));
     __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       return _mm256_permutex2var_ps(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f));
     __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
       return _mm256_mask_permutex2var_ps(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f));
     __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_ps
       // CHECK: @llvm.x86.avx512.vpermi2var.ps.256
       // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
       return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_m256(
    +    _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}),
    +    0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f));
     __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       return _mm_permutex2var_epi64(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    10, 200));
     __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    -1, 200));
     __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.128
       // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
       return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v2di(
    +    _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}),
    +    0, 200));
     __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       return _mm256_permutex2var_epi64(__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 10, 100, 110));
     __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    -1, -2, 100, -4));
     __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64
       // CHECK: @llvm.x86.avx512.vpermi2var.q.256
       // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
       return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); 
     }
    +TEST_CONSTEXPR(match_v4di(
    +    _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
    +    0, 0, 100, 0));
     
     __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepi8_epi32
       // CHECK: sext <4 x i8> %{{.*}} to <4 x i32>
    @@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0));
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4));
     TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4));
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8},
    +                              (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                              (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
    +    1, 8, 101, 108, 2, 102, 3, 103));
    +TEST_CONSTEXPR(match_v8si(
    +    _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                    0xAA,
    +                                    (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
    +                                    (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
    +    -1, -8, -3, 108, -5, 102, -7, 103));
     
     __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
       // CHECK-LABEL: test_mm_mask_mov_pd
    diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    index febef46458ae9..7a5af2dc8742f 100644
    --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
    @@ -941,56 +941,28 @@ __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_blend_epi8(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v16qi(
    -  _mm_mask_blend_epi8(
    -    (__mmask16)0x0001,
    -    (__m128i)(__v16qi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m128i)(__v16qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_blend_epi8((__mmask16)0x0001,(__m128i)(__v16qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m128i)(__v16qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) {
       // CHECK-LABEL: test_mm256_mask_blend_epi8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_blend_epi8(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v32qi(
    -  _mm256_mask_blend_epi8(
    -    (__mmask32) 0x00000001,
    -    (__m256i)(__v32qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m256i)(__v32qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_blend_epi8((__mmask32)0x00000001,(__m256i)(__v32qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v32qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) {
       // CHECK-LABEL: test_mm_mask_blend_epi16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_blend_epi16(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v8hi(
    -  _mm_mask_blend_epi16(
    -    (__mmask8)0x01,
    -    (__m128i)(__v8hi){2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m128i)(__v8hi){ 10,11,12,13,14,15,16,17 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_blend_epi16((__mmask8)0x01,(__m128i)(__v8hi){2,2,2,2,2,2,2,2},(__m128i)(__v8hi){10,11,12,13,14,15,16,17}),10,2,2,2,2,2,2,2));
     
     __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {
       // CHECK-LABEL: test_mm256_mask_blend_epi16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_blend_epi16(__U,__A,__W); 
     }
    -TEST_CONSTEXPR(match_v16hi(
    -  _mm256_mask_blend_epi16(
    -    (__mmask16)0x0001,
    -    (__m256i)(__v16hi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
    -    (__m256i)(__v16hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
    -  ),
    -  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    -));
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_blend_epi16((__mmask16)0x0001,(__m256i)(__v16hi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v16hi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
     
     __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_abs_epi8
    @@ -1078,48 +1050,63 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_packs_epi32(__M,__A,__B); 
     }
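    +// packs_epi32 signed-saturates 32-bit lanes to 16 bits; within each 128-bit
    +// half the low results come from __A and the high results from __B.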
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_packs_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),0,-32768,0,-32768,0,32767,0,-32768));
    +
     __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packs_epi32
       // CHECK: @llvm.x86.sse2.packssdw
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_packs_epi32((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),1,-32768,3,-32768,29,32767,31,-32768));
    +
     __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packs_epi32
       // CHECK: @llvm.x86.avx2.packssdw
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_packs_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packs_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-32768,0,-22222,0,-32768));
    +
     __m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packs_epi32
       // CHECK: @llvm.x86.avx2.packssdw
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_packs_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packs_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,25,-32768,27,-32768,29,-22222,31,-32768));
    +
     __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packs_epi16
       // CHECK: @llvm.x86.sse2.packsswb
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_packs_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
     __m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packs_epi16
       // CHECK: @llvm.x86.sse2.packsswb
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_packs_epi16((__m128i)(__v16qi){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-128,57,1,59,127,61,90,63,-128));
    +
     __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packs_epi16
       // CHECK: @llvm.x86.avx2.packsswb
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_packs_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packs_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
    +
     __m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packs_epi16
       // CHECK: @llvm.x86.avx2.packsswb
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_packs_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packs_epi16((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
     
     __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packus_epi32
    @@ -1127,6 +1114,7 @@ __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_packus_epi32(__W,__M,__A,__B); 
     }
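    +// packus_epi32 saturates signed 32-bit inputs to the unsigned 16-bit range
    +// [0, 65535]; negative inputs clamp to 0.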
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_packus_epi32((__m128i)(__v8hu){1,2,3,4,5,6,7,8},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),1,0,3,0,5,1,7,65535));
     
     __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packus_epi32
    @@ -1134,6 +1122,7 @@ __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_packus_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),0,0,0,0,0,1,0,65535));
     
     __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packus_epi32
    @@ -1141,6 +1130,7 @@ __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_packus_epi32(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packus_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,0,0,0));
     
     __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packus_epi32
    @@ -1148,6 +1138,7 @@ __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_packus_epi32(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packus_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,25,0,27,0,29,0,31,0));
     
     __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_packus_epi16
    @@ -1155,6 +1146,7 @@ __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_packus_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),0,0,0,127,0,255,0,0,0,1,0,255,0,128,0,0));
     
     __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_packus_epi16
    @@ -1162,6 +1154,7 @@ __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m12
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_packus_epi16((__m128i)(__v16qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),1,0,3,127,5,255,7,0,9,1,11,255,13,128,15,0));
     
     __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_packus_epi16
    @@ -1169,6 +1162,7 @@ __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_packus_epi16(__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packus_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
     
     __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_packus_epi16
    @@ -1176,6 +1170,7 @@ __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_packus_epi16(__W,__M,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packus_epi16((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
     
     __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epi8
    @@ -1183,48 +1178,64 @@ __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_adds_epi8(__W,__U,__A,__B); 
     }
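    +// adds_epi8 is a signed saturating add, clamping results to [-128, +127].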
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_adds_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),1,+2,3,+6,5,+10,7,+14,57,+30,59,+90,61,+127,63,-128));
    +
     __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v16i8
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_adds_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),0,+2,0,+6,0,+10,0,+14,0,+30,0,+90,0,+127,0,-128));
    +
     __m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epi8
       // CHECK: @llvm.sadd.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_adds_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_adds_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,49,+127,51,+127,53,-80,55,-30,57,+30,59,+90,61,+20,63,+10));
    +
     __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epi8
       // CHECK: @llvm.sadd.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_adds_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_adds_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
    +
     __m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epi16
       // CHECK: @llvm.sadd.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_adds_epi16((__m128i)(__v8hi){9,10,11,12,13,14,15,16,},(__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),9,+50,11,+54,13,-32768,15,+32767));
    +
     __m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_adds_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),0,+50,0,+54,0,-32768,0,+32767));
    +
     __m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epi16
       // CHECK: @llvm.sadd.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_adds_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_adds_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+50,11,+54,13,-32768,15,+32767));
    +
     __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epi16
       // CHECK: @llvm.sadd.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_adds_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_adds_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+50,0,+54,0,-32768,0,+32767));
    +
     __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epu8
       // CHECK-NOT: @llvm.x86.sse2.paddus.b
    @@ -1232,6 +1243,8 @@ __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_adds_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_adds_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,57,+255,59,+255,61,+255,63,+255));
    +
     __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epu8
       // CHECK-NOT: @llvm.x86.sse2.paddus.b
    @@ -1239,6 +1252,8 @@ __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_adds_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epu8
       // CHECK-NOT: @llvm.x86.avx2.paddus.b
    @@ -1246,6 +1261,8 @@ __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_adds_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_mask_adds_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
    +
     __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epu8
       // CHECK-NOT: @llvm.x86.avx2.paddus.b
    @@ -1253,6 +1270,8 @@ __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_adds_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_adds_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
    +
     __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.sse2.paddus.w
    @@ -1260,6 +1279,8 @@ __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_adds_epu16((__m128i)(__v8hu){25,26,27,28,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),25,+32768,27,+49152,29,+65535,31,+65535));
    +
     __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.sse2.paddus.w
    @@ -1267,6 +1288,8 @@ __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_adds_epu16((__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),0,+32768,0,+49152,0,+65535,0,+65535));
    +
     __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_adds_epu16
       // CHECK-NOT: @llvm.x86.avx2.paddus.w
    @@ -1274,6 +1297,8 @@ __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_adds_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_mask_adds_epu16((__m256i)(__v16hu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,25,+65535,27,+65535,29,+65535,31,+65535));
    +
     __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_adds_epu16
       // CHECK-NOT: @llvm.x86.avx2.paddus.w
    @@ -1281,6 +1306,8 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_adds_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_adds_epu16((__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535));
    +
     __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_avg_epu8
       // CHECK: @llvm.x86.sse2.pavg.b
    @@ -1740,48 +1767,64 @@ __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_subs_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,57,-128,59,0,61,-124,63,127));
    +
     __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v16i8
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_subs_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epi8
       // CHECK: @llvm.ssub.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_subs_epi8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_subs_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
    +
     __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epi8
       // CHECK: @llvm.ssub.sat.v32i8
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_subs_epi8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_subs_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
    +
     __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epi16
       // CHECK: @llvm.ssub.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_subs_epi16((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),1,-32768,3,32767,29,-60,31,64));
    +
     __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v8i16
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_subs_epi16((__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),0,-32768,0,32767,0,-60,0,64));
    +
     __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epi16
       // CHECK: @llvm.ssub.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_subs_epi16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_subs_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,25,-32768,27,32767,29,-60,31,64));
    +
     __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epi16
       // CHECK: @llvm.ssub.sat.v16i16
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_subs_epi16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_subs_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-60,0,64));
    +
     __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.sse2.psubus.b
    @@ -1789,6 +1832,8 @@ __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_subs_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_mask_subs_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,57,0,59,1,61,1,63,0));
    +
     __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epu8
       // CHECK-NOT: @llvm.x86.sse2.psubus.b
    @@ -1796,6 +1841,8 @@ __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16qu(_mm_maskz_subs_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epu8
       // CHECK-NOT: @llvm.x86.avx2.psubus.b
    @@ -1803,6 +1850,8 @@ __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_subs_epu8(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_mask_subs_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
    +
     __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epu8
       // CHECK-NOT: @llvm.x86.avx2.psubus.b
    @@ -1810,6 +1859,8 @@ __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_subs_epu8(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_subs_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
    +
     __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.sse2.psubus.w
    @@ -1817,6 +1868,8 @@ __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_subs_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_mask_subs_epu16((__m128i)(__v8hu){101,102,103,104,129,130,131,132},(__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),101,60000,103,0,129,1,131,25000));
    +
     __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.sse2.psubus.w
    @@ -1824,6 +1877,8 @@ __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_subs_epu16(__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v8hu(_mm_maskz_subs_epu16((__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),0,60000,0,0,0,1,0,25000));
    +
     __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_subs_epu16
       // CHECK-NOT: @llvm.x86.avx2.psubus.w
    @@ -1831,6 +1886,8 @@ __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_subs_epu16(__W,__U,__A,__B); 
     }
    +TEST_CONSTEXPR(match_v16hu(_mm256_mask_subs_epu16((__m256i)(__v16hu){101,102,103,104,105,106,107,108,125,126,127,128,129,130,131,132},(__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,125,0,127,0,129,1,131,25000));
    +
     __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_subs_epu16
       // CHECK-NOT: @llvm.x86.avx2.psubus.w
    @@ -1838,7 +1895,7 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_subs_epu16(__U,__A,__B); 
     }
    -
    +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_subs_epu16((__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,10,65535,0,0,0,1000,0,1,10000,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
     
     __m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
       // CHECK-LABEL: test_mm_mask2_permutex2var_epi16
    @@ -1887,6 +1944,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); 
     }
    +
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                           (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                           (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160,
    +                                             170}),
    +    0, 70, 100, 170, 10, 110, 20, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8},
    +                                0xAA,
    +                                (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                  160, 170}),
    +    -1, -8, -3, 170, -5, 110, -7, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_maskz_permutex2var_epi16(0xAA,
    +                                 (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                                 (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                 (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                   160, 170}),
    +    0, 70, 0, 170, 0, 110, 0, 120));
    +TEST_CONSTEXPR(match_v8hi(
    +    _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
    +                                 (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
    +                                 0x55,
    +                                 (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
    +                                                   160, 170}),
    +    0, 7, 100, 15, 10, 9, 20, 10));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_permutex2var_epi16(
    +        (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 150, 200, 350, 10, 210, 20, 220,
    +    30, 230, 40, 240, 50, 250, 60, 260));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_mask_permutex2var_epi16(
    +        (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8,
    +                           -9, -10, -11, -12, -13, -14, -15, -16},
    +        0xAAAA,
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    -1, -16, -3, 350, -5, 210, -7, 220,
    +    -9, 230, -11, 240, -13, 250, -15, 260));
    +TEST_CONSTEXPR(match_v16hi(
    +    _mm256_maskz_permutex2var_epi16(
    +        0x5555,
    +        (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
    +                           80, 90, 100, 110, 120, 130, 140, 150},
    +        (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
    +                           3, 19, 4, 20, 5, 21, 6, 22},
    +        (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
    +                           280, 290, 300, 310, 320, 330, 340, 350}),
    +    0, 0, 200, 0, 10, 0, 20, 0,
    +    30, 0, 40, 0, 50, 0, 60, 0));
    +
     __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
       // CHECK-LABEL: test_mm_mask_maddubs_epi16
       // CHECK: @llvm.x86.ssse3.pmadd.ub.sw
    @@ -2172,6 +2290,7 @@ __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpackhi_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,114,-15,115,-16));
     
     __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpackhi_epi8
    @@ -2179,6 +2298,7 @@ __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpackhi_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,114,-15,115,-16));
     
     __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpackhi_epi8
    @@ -2186,6 +2306,7 @@ __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpackhi_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
     
     __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpackhi_epi8
    @@ -2193,6 +2314,7 @@ __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_unpackhi_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpackhi_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
     
     __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpackhi_epi16
    @@ -2200,6 +2322,7 @@ __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpackhi_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,204,3,205,106,206,107,207));
     
     __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpackhi_epi16
    @@ -2207,6 +2330,7 @@ __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpackhi_epi16((__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,204,0,205,106,206,107,207));
     
     __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpackhi_epi16
    @@ -2214,6 +2338,7 @@ __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A,
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpackhi_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,25,234,27,235,136,236,137,237));
     
     __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpackhi_epi16
    @@ -2221,6 +2346,7 @@ __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B)
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_unpackhi_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpackhi_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,234,0,235,136,236,137,237));
     
     __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpacklo_epi8
    @@ -2228,6 +2354,7 @@ __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpacklo_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,106,-7,107,-8));
     
     __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpacklo_epi8
    @@ -2235,6 +2362,7 @@ __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpacklo_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,106,-7,107,-8));
     
     __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpacklo_epi8
    @@ -2242,6 +2370,7 @@ __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpacklo_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
     
     __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpacklo_epi8
    @@ -2249,6 +2378,7 @@ __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_unpacklo_epi8(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpacklo_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
     
     __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_mask_unpacklo_epi16
    @@ -2256,6 +2386,7 @@ __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpacklo_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,200,3,201,102,202,103,203));
     
     __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_unpacklo_epi16
    @@ -2263,6 +2394,7 @@ __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpacklo_epi16((__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,200,0,201,102,202,103,203));
     
     __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_unpacklo_epi16
    @@ -2270,6 +2402,7 @@ __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A,
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpacklo_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,25,230,27,231,132,232,133,233));
     
     __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_unpacklo_epi16
    @@ -2277,6 +2410,7 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B)
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_unpacklo_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpacklo_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,230,0,231,132,232,133,233));
     
     __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepi8_epi16
    @@ -2284,6 +2418,7 @@ __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_cvtepi8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepi8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,-777,3,-777,-777,-6,-777,-8));
     
     __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_cvtepi8_epi16
    @@ -2291,6 +2426,7 @@ __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_cvtepi8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepi8_epi16((__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,0,3,0,0,-6,0,-8));
     
     __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_cvtepi8_epi16
    @@ -2298,6 +2434,7 @@ __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_cvtepi8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepi8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,-777,3,-777,-777,-6,-777,-8,-777,-777,27,-28,29,-777,-777,-32));
     
     __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_cvtepi8_epi16
    @@ -2305,6 +2442,7 @@ __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_cvtepi8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepi8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,0,3,0,0,-6,0,-8,0,0,27,-28,29,0,0,-32));
     
     __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_cvtepu8_epi16
    @@ -2312,6 +2450,7 @@ __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_cvtepu8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepu8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,-777,27,-777,-777,30,-777,32));
     
     __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_cvtepu8_epi16
    @@ -2319,6 +2458,7 @@ __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_cvtepu8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepu8_epi16((__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,0,27,0,0,30,0,32));
     
     __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_cvtepu8_epi16
    @@ -2326,6 +2466,7 @@ __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_cvtepu8_epi16(__W, __U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepu8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,-777,3,-777,-777,6,-777,8,-777,-777,27,28,29,-777,-777,32));
     
     __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_cvtepu8_epi16
    @@ -2333,6 +2474,7 @@ __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_cvtepu8_epi16(__U, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepu8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,0,3,0,0,6,0,8,0,0,27,28,29,0,0,32));
     
     __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_sllv_epi16
    @@ -2407,6 +2549,7 @@ __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_sll_epi16(__U, __A, __B); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_slli_epi16((__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_slli_epi16
    @@ -2414,6 +2557,7 @@ __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_slli_epi16(__W, __U, __A, 5); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_slli_epi16((__m128i)(__v8hi){100, 101, 102, 103, 104, 105, 106, 107}, (__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 100, 0, 102, 0, 104, 0, 106, 0));
     
     __m128i test_mm_mask_slli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) {
       // CHECK-LABEL: test_mm_mask_slli_epi16_2
    @@ -3091,6 +3235,7 @@ __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_broadcastb_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_broadcastb_epi8((__m128i)(__v16qs){0,1,2,3,4,5,6,7,56,57,58,59,60,61,62,63},(__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_broadcastb_epi8
    @@ -3098,6 +3243,7 @@ __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_broadcastb_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_broadcastb_epi8
    @@ -3105,6 +3251,7 @@ __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A)
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_broadcastb_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_broadcastb_epi8((__m256i)(__v32qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
     
     __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_broadcastb_epi8
    @@ -3112,6 +3259,7 @@ __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_broadcastb_epi8(__M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_broadcastb_epi8((__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
     
     __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_mask_broadcastw_epi16
    @@ -3119,6 +3267,7 @@ __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_broadcastw_epi16((__m128i)(__v8hi){0,1,2,3,4,5,6,7},(__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120));
     
     __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
       // CHECK-LABEL: test_mm_maskz_broadcastw_epi16
    @@ -3126,6 +3275,7 @@ __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_broadcastw_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120));
     
     __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_mask_broadcastw_epi16
    @@ -3133,6 +3283,7 @@ __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_broadcastw_epi16(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_broadcastw_epi16((__m256i)(__v16hi){0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31},(__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,24,-120,26,-120,28,-120,30,-120));
     
     __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
       // CHECK-LABEL: test_mm256_maskz_broadcastw_epi16
    @@ -3140,6 +3291,8 @@ __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_broadcastw_epi16(__M, __A);
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_broadcastw_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
    +
     __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
       // CHECK-LABEL: test_mm_mask_set1_epi8
       // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
    @@ -3161,6 +3314,8 @@ __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_set1_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_set1_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42));
    +
     __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
       // CHECK-LABEL: test_mm_maskz_set1_epi8
       // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
    @@ -3182,6 +3337,7 @@ __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_set1_epi8( __M, __A);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_maskz_set1_epi8((__mmask16)0xAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
       // CHECK-LABEL: test_mm256_mask_set1_epi8
    @@ -3220,6 +3376,7 @@ __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
       // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_set1_epi8(__O, __M, __A);
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_set1_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
     
     __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
       // CHECK-LABEL: test_mm256_maskz_set1_epi8
    @@ -3258,7 +3415,7 @@ __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
       // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_set1_epi8( __M, __A);
     }
    -
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_set1_epi8((__mmask32)0xAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
       // CHECK-LABEL: test_mm256_mask_set1_epi16
    @@ -3281,6 +3438,7 @@ __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
       // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_mask_set1_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,42),1,42,3,42,5,42,7,42,25,42,27,42,29,42,31,42));
     
     __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
       // CHECK-LABEL: test_mm256_maskz_set1_epi16
    @@ -3303,6 +3461,7 @@ __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
       // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
       return _mm256_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_set1_epi16((__mmask16)0xAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
     
     __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
       // CHECK-LABEL: test_mm_mask_set1_epi16
    @@ -3317,6 +3476,7 @@ __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_mask_set1_epi16(__O, __M, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_mask_set1_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xAA,42),1,42,3,42,5,42,7,42));
     
     __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
       // CHECK-LABEL: test_mm_maskz_set1_epi16
    @@ -3331,6 +3491,8 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
       // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
       return _mm_maskz_set1_epi16(__M, __A); 
     }
    +TEST_CONSTEXPR(match_v8hi(_mm_maskz_set1_epi16((__mmask8)0xAA,42),0,42,0,42,0,42,0,42));
    +
     __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_permutexvar_epi16
       // CHECK: @llvm.x86.avx512.permvar.hi.128
    @@ -3376,6 +3538,7 @@ __m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_mask_alignr_epi8(((__m128i)(__v16qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 2), 19, 20, 21, 22, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127));
     
     __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_maskz_alignr_epi8
    @@ -3383,6 +3546,7 @@ __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
       // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
       return _mm_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v16qi( _mm_maskz_alignr_epi8((__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),2), 19, 20, 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_mask_alignr_epi8
    @@ -3390,6 +3554,7 @@ __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_mask_alignr_epi8(((__m256i)(__v32qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 63, 64, 17, 18));
     
     __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK-LABEL: test_mm256_maskz_alignr_epi8
    @@ -3397,6 +3562,7 @@ __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
       // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
       return _mm256_maskz_alignr_epi8(__U, __A, __B, 2); 
     }
    +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 64, 17, 18));
     
     __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
       // CHECK-LABEL: test_mm_dbsad_epu8
    @@ -3596,3 +3762,21 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _
      // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256
      _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A);
     }
    +
    +TEST_CONSTEXPR(match_v16qu(
    +    _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70,
    +                                             80, 90, 100, 110, 120, 127, 126, 125},
    +                         (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19,
    +                                             4, 20, 5, 21, 6, 22, 7, 23},
    +                         (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170,
    +                                            180, 190, 200, 210, 220, 230, 240, 250}),
    +    0, 100, 10, 110, 20, 120, 30, 130,
    +    40, 140, 50, 150, 60, 160, 70, 170));
    +TEST_CONSTEXPR(match_v32qu(
    +    _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109},
    +                             (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47},
    +                             (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}),
    +    0, 200, 10, 201, 20, 202, 30, 203,
    +    40, 204, 50, 205, 60, 206, 70, 207,
    +    80, 208, 90, 209, 100, 210, 110, 211,
    +    120, 212, 127, 213, 126, 214, 125, 215));
    diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
    index a56f8ba1ee385..c7cd9ffdd6966 100644
    --- a/clang/test/CodeGen/X86/math-builtins.c
    +++ b/clang/test/CodeGen/X86/math-builtins.c
    @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_fabs(f);       __builtin_fabsf(f);      __builtin_fabsl(f); __builtin_fabsf128(f);
     
    -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_frexp(f,i);    __builtin_frexpf(f,i);   __builtin_frexpl(f,i); __builtin_frexpf128(f,i);
     
    -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_huge_val();    __builtin_huge_valf();   __builtin_huge_vall(); __builtin_huge_valf128();
     
    @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_ldexp(f,f);    __builtin_ldexpf(f,f);   __builtin_ldexpl(f,f);  __builtin_ldexpf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
    @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
     // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
     // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]]
    -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
    +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]]
     // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
     // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
     // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]]
    @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_pow(f,f);        __builtin_powf(f,f);       __builtin_powl(f,f); __builtin_powf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_powi(f,f);        __builtin_powif(f,f);       __builtin_powil(f,f);
     
    -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]]
     
       /* math */
       __builtin_acos(f);       __builtin_acosf(f);      __builtin_acosl(f); __builtin_acosf128(f);
    @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       __builtin_ceil(f);       __builtin_ceilf(f);      __builtin_ceill(f); __builtin_ceilf128(f);
     
    -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]]
     
       __builtin_cos(f);        __builtin_cosf(f);       __builtin_cosl(f); __builtin_cosf128(f);
     
    -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -362,10 +362,10 @@ __builtin_erfc(f);       __builtin_erfcf(f);      __builtin_erfcl(f); __builtin_
     
     __builtin_exp(f);        __builtin_expf(f);       __builtin_expl(f); __builtin_expf128(f);
     
    -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -373,10 +373,10 @@ __builtin_exp(f);        __builtin_expf(f);       __builtin_expl(f); __builtin_e
     
     __builtin_exp2(f);       __builtin_exp2f(f);      __builtin_exp2l(f); __builtin_exp2f128(f);
     
    -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -384,10 +384,10 @@ __builtin_exp2(f);       __builtin_exp2f(f);      __builtin_exp2l(f); __builtin_
     
     __builtin_exp10(f);       __builtin_exp10f(f);      __builtin_exp10l(f); __builtin_exp10f128(f);
     
    -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -417,22 +417,22 @@ __builtin_fdim(f,f);       __builtin_fdimf(f,f);      __builtin_fdiml(f,f); __bu
     
     __builtin_floor(f);      __builtin_floorf(f);     __builtin_floorl(f); __builtin_floorf128(f);
     
    -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_fma(f,f,f);        __builtin_fmaf(f,f,f);       __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f);  __builtin_fmaf16(f,f,f);
     
    -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]]
    -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -454,25 +454,25 @@ __builtin_fma(f,f,f);        __builtin_fmaf(f,f,f);       __builtin_fmal(f,f,f);
     
     __builtin_fmax(f,f);       __builtin_fmaxf(f,f);      __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_fmin(f,f);       __builtin_fminf(f,f);      __builtin_fminl(f,f); __builtin_fminf128(f,f);
     
    -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_hypot(f,f);      __builtin_hypotf(f,f);     __builtin_hypotl(f,f); __builtin_hypotf128(f,f);
     
    @@ -509,10 +509,10 @@ __builtin_lgamma(f);     __builtin_lgammaf(f);    __builtin_lgammal(f); __builti
     
     __builtin_llrint(f);     __builtin_llrintf(f);    __builtin_llrintl(f); __builtin_llrintf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -520,10 +520,10 @@ __builtin_llrint(f);     __builtin_llrintf(f);    __builtin_llrintl(f); __builti
     
     __builtin_llround(f);    __builtin_llroundf(f);   __builtin_llroundl(f); __builtin_llroundf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -531,10 +531,10 @@ __builtin_llround(f);    __builtin_llroundf(f);   __builtin_llroundl(f); __built
     
     __builtin_log(f);        __builtin_logf(f);       __builtin_logl(f); __builtin_logf128(f);
     
    -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -542,10 +542,10 @@ __builtin_log(f);        __builtin_logf(f);       __builtin_logl(f); __builtin_l
     
     __builtin_log10(f);      __builtin_log10f(f);     __builtin_log10l(f); __builtin_log10f128(f);
     
    -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -564,10 +564,10 @@ __builtin_log1p(f);      __builtin_log1pf(f);     __builtin_log1pl(f); __builtin
     
     __builtin_log2(f);       __builtin_log2f(f);      __builtin_log2l(f); __builtin_log2f128(f);
     
    -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -586,10 +586,10 @@ __builtin_logb(f);       __builtin_logbf(f);      __builtin_logbl(f); __builtin_
     
     __builtin_lrint(f);      __builtin_lrintf(f);     __builtin_lrintl(f); __builtin_lrintf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -597,10 +597,10 @@ __builtin_lrint(f);      __builtin_lrintf(f);     __builtin_lrintl(f); __builtin
     
     __builtin_lround(f);     __builtin_lroundf(f);    __builtin_lroundl(f);  __builtin_lroundf128(f);
     
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -608,14 +608,14 @@ __builtin_lround(f);     __builtin_lroundf(f);    __builtin_lroundl(f);  __built
     
     __builtin_nearbyint(f);  __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f);
     
    -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_nextafter(f,f);  __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f);
     
    @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i);  __builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __
     
     __builtin_rint(f);       __builtin_rintf(f);      __builtin_rintl(f); __builtin_rintf128(f);
     
    -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_round(f);      __builtin_roundf(f);     __builtin_roundl(f); __builtin_roundf128(f);
     
    -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]]
     
     __builtin_scalbln(f,f);    __builtin_scalblnf(f,f);   __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f);
     
    @@ -707,10 +707,10 @@ __builtin_scalbn(f,f);     __builtin_scalbnf(f,f);    __builtin_scalbnl(f,f); __
     
     __builtin_sin(f);        __builtin_sinf(f);       __builtin_sinl(f); __builtin_sinf128(f);
     
    -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l
     
     __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_sqrtf128(f);
     
    -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -791,22 +791,24 @@ __builtin_tgamma(f);     __builtin_tgammaf(f);    __builtin_tgammal(f); __builti
     
     __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
     
    -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]]
     };
     
     // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
    +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
     // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} }
     // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
     
     // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
    +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} }
     // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
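    // Editor's note on the READNONE_INTRINSIC2 split above: FileCheck binds a
    // pattern variable such as [[V:#[0-9]+]] at its first match and requires
    // every later [[V]] to be byte-identical. A sketch of the rule, with
    // hypothetical attribute-group numbers:
    //   declare double @f(double) #7  ; [[A:#[0-9]+]] binds A = "#7"
    //   declare double @g(double) #8  ; [[A]] no longer matches; a second
    //                                 ; capture [[B:#[0-9]+]] is required
    // Presumably the llvm.modf declarations no longer share an attribute group
    // with the other intrinsics, so the test now captures two variables and
    // pins each group's attributes line separately at the end of the file.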
    diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
    index 273138063a1b1..ad8a81c61ad43 100644
    --- a/clang/test/CodeGen/X86/mmx-builtins.c
    +++ b/clang/test/CodeGen/X86/mmx-builtins.c
    @@ -102,6 +102,8 @@ __m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
       // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> 
       return _mm_alignr_pi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 2), 11, 12, 13, 14, 15, 16, 1, 2));
    +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 16), 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m64 test_mm_and_si64(__m64 a, __m64 b) {
       // CHECK-LABEL: test_mm_and_si64
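    // Editor's note: a scalar sketch of the palignr rule the two new constants
    // pin down, under the usual definition (a is the high half of the 16-byte
    // [a:b] concatenation, b the low half); alignr_pi8_byte is a hypothetical
    // helper, not part of the patch.
    static inline unsigned char alignr_pi8_byte(const unsigned char a[8],
                                                const unsigned char b[8],
                                                unsigned shift, unsigned lane) {
      unsigned pos = shift + lane;     // byte position within the 16-byte pair
      if (pos < 8) return b[pos];      // still inside b, the low half
      if (pos < 16) return a[pos - 8]; // crossed into a, the high half
      return 0;                        // shifted past both halves: zero fill
    }
    // With shift == 2 this yields 11..16 followed by 1, 2, and with shift == 16
    // it yields all zeros, matching the TEST_CONSTEXPR expectations above.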
    diff --git a/clang/test/CodeGen/X86/mmx-inline-asm-error.c b/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    index 1e2246176a117..8a2f991a537a2 100644
    --- a/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    +++ b/clang/test/CodeGen/X86/mmx-inline-asm-error.c
    @@ -1,13 +1,15 @@
     // RUN: %clang_cc1 -verify -triple x86_64-unknown-unknown -emit-llvm-only %s
    +// RUN: %clang_cc1 -verify=omp -triple x86_64-unknown-unknown -emit-llvm-only -fopenmp %s
     typedef int vec256 __attribute__((ext_vector_type(8)));
     
     vec256 foo(vec256 in) {
       vec256 out;
     
       asm("something %0" : : "y"(in)); // expected-error {{invalid input size for constraint 'y'}}
    +  // omp-error@+1 {{invalid type 'vec256' (vector of 8 'int' values) in asm input for constraint 'y'}}
       asm("something %0" : "=y"(out)); // expected-error {{invalid output size for constraint '=y'}}
    +  // omp-error@+1 {{invalid type 'vec256' (vector of 8 'int' values) in asm input for constraint 'y'}}
       asm("something %0, %0" : "+y"(out)); // expected-error {{invalid output size for constraint '+y'}}
     
       return out;
     }
    -
    diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
    index b7a4a2fe7ccd7..193fa37f65d14 100644
    --- a/clang/test/CodeGen/X86/ssse3-builtins.c
    +++ b/clang/test/CodeGen/X86/ssse3-builtins.c
    @@ -48,6 +48,8 @@ __m128i test_mm_alignr_epi8(__m128i a, __m128i b) {
       // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> 
       return _mm_alignr_epi8(a, b, 2);
     }
    +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 2), 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2));
    +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
     
     __m128i test2_mm_alignr_epi8(__m128i a, __m128i b) {
       // CHECK-LABEL: test2_mm_alignr_epi8
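    // Editor's note: the 128-bit _mm_alignr_epi8 constants above follow the
    // same rule as the _mm_alignr_pi8 sketch earlier, scaled up: a 32-byte
    // [a:b] concatenation shifted right by the byte count, with counts of 32
    // or more yielding zero.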
    diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c
    index 5acb9f65413a0..000fff632f0b7 100644
    --- a/clang/test/CodeGen/arm-acle-coproc.c
    +++ b/clang/test/CodeGen/arm-acle-coproc.c
    @@ -4,10 +4,10 @@
     // RUN: %clang_cc1 -triple armv5te %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s
     // RUN: %clang_cc1 -triple armv5tej %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s
     // RUN: %clang_cc1 -triple armv6 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6 %s
    -// RUN: %clang_cc1 -triple armv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s
    +// RUN: %clang_cc1 -triple thumbv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s
     // RUN: %clang_cc1 -triple armv7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
     // RUN: %clang_cc1 -triple armv7r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
    -// RUN: %clang_cc1 -triple armv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
    +// RUN: %clang_cc1 -triple thumbv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s
     // RUN: %clang_cc1 -triple armv8a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
     // RUN: %clang_cc1 -triple armv8r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
     // RUN: %clang_cc1 -triple armv8.1a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s
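    // Editor's note: v6-M and v7-M are Thumb-only M-profile architectures, so
    // the thumbv6m/thumbv7m triples describe the intended targets more
    // accurately than the armv6m/armv7m spellings they replace.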
    diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    index a0a81be54325f..f7eb0cc765354 100644
    --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
    @@ -57,7 +57,7 @@ S4 f4() {
     
     // Pass and return from instance method called from instance method.
     // CHECK: define {{.*}} void @{{.*}}bar@Q1{{.*}}(ptr {{[^,]*}} %this, ptr dead_on_unwind inreg noalias writable sret(%class.P1) align 1 %agg.result)
    -// CHECK: call void {{.*}}foo@P1{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P1) align 1 %agg.result, i8 %0)
    +// CHECK: call void {{.*}}foo@P1{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P1) align 1 %agg.result, i64 %coerce.val.ii)
     
     class P1 {
     public:
    @@ -76,7 +76,7 @@ P1 Q1::bar() {
     
     // Pass and return from instance method called from free function.
     // CHECK: define {{.*}} void {{.*}}bar{{.*}}()
    -// CHECK: call void {{.*}}foo@P2{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P2) align 1 %retval, i8 %0)
    +// CHECK: call void {{.*}}foo@P2{{.*}}(ptr noundef{{[^,]*}} %ref.tmp, ptr dead_on_unwind inreg writable sret(%class.P2) align 1 %retval, i64 %coerce.val.ii)
     class P2 {
     public:
       P2 foo(P2 x);
    diff --git a/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c b/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c
    new file mode 100644
    index 0000000000000..e22aad306f60c
    --- /dev/null
    +++ b/clang/test/CodeGen/attr-counted-by-void-ptr-gnu.c
    @@ -0,0 +1,65 @@
    +// RUN: %clang_cc1 -std=gnu11 -triple x86_64-unknown-linux-gnu -O2 -emit-llvm -o - %s | FileCheck %s
    +
    +// Test that counted_by on void* in GNU mode treats void as having size 1 (byte count)
    +
    +#define __counted_by(f)  __attribute__((counted_by(f)))
    +#define __sized_by(f)  __attribute__((sized_by(f)))
    +
    +struct with_counted_by_void {
    +  int count;
    +  void* buf __counted_by(count);
    +};
    +
    +struct with_sized_by_void {
    +  int size;
    +  void* buf __sized_by(size);
    +};
    +
    +struct with_counted_by_int {
    +  int count;
    +  int* buf __counted_by(count);
    +};
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_counted_by_void(
    +// CHECK:         %[[COUNT:.*]] = load i32, ptr %s
    +// CHECK:         %[[NARROW:.*]] = tail call i32 @llvm.smax.i32(i32 %[[COUNT]], i32 0)
    +// CHECK:         %[[ZEXT:.*]] = zext nneg i32 %[[NARROW]] to i64
    +// CHECK:         ret i64 %[[ZEXT]]
    +//
    +// Verify: counted_by on void* returns the count directly (count * 1 byte)
    +long long test_counted_by_void(struct with_counted_by_void *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_sized_by_void(
    +// CHECK:         %[[SIZE:.*]] = load i32, ptr %s
    +// CHECK:         %[[NARROW:.*]] = tail call i32 @llvm.smax.i32(i32 %[[SIZE]], i32 0)
    +// CHECK:         %[[ZEXT:.*]] = zext nneg i32 %[[NARROW]] to i64
    +// CHECK:         ret i64 %[[ZEXT]]
    +//
    +// Verify: sized_by on void* returns the size directly
    +long long test_sized_by_void(struct with_sized_by_void *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local {{.*}}@test_counted_by_int(
    +// CHECK:         %[[COUNT:.*]] = load i32, ptr %s
    +// CHECK:         %[[SEXT:.*]] = sext i32 %[[COUNT]] to i64
    +// CHECK:         %[[SIZE:.*]] = shl nsw i64 %[[SEXT]], 2
    +// CHECK:         ret i64
    +//
    +// Verify: counted_by on int* returns count * sizeof(int) = count * 4
    +long long test_counted_by_int(struct with_counted_by_int *s) {
    +  return __builtin_dynamic_object_size(s->buf, 0);
    +}
    +
    +// CHECK-LABEL: define dso_local ptr @test_void_ptr_arithmetic(
    +// CHECK:         %[[BUF:.*]] = load ptr, ptr
    +// CHECK:         %[[EXT:.*]] = sext i32 %offset to i64
    +// CHECK:         %[[PTR:.*]] = getelementptr inbounds i8, ptr %[[BUF]], i64 %[[EXT]]
    +// CHECK:         ret ptr %[[PTR]]
    +//
    +// Verify: pointer arithmetic on void* uses i8 (byte offsets), not i32 or other sizes
    +void* test_void_ptr_arithmetic(struct with_counted_by_void *s, int offset) {
    +  return s->buf + offset;  // GNU extension: void* arithmetic
    +}
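    // Editor's note: a hypothetical standalone driver (not part of the patch)
    // spelling out the sizes the CHECK lines above encode: with count == 10,
    // the void* field is costed at 10 bytes (void treated as size 1), while
    // the int* field is costed at 10 * sizeof(int) == 40 bytes.
    #include <stdlib.h>
    int driver(void) {
      struct with_counted_by_void v = { 10, malloc(10) };
      struct with_counted_by_int  w = { 10, malloc(40) };
      long long a = test_counted_by_void(&v); // counted_by(count): 10 * 1
      long long b = test_counted_by_int(&w);  // counted_by(count): 10 * 4
      free(v.buf);
      free(w.buf);
      return a == 10 && b == 40;              // both hold under the new rule
    }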
    diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c
    index 44f51887be389..7d086adeef4bf 100644
    --- a/clang/test/CodeGen/attr-cpuspecific.c
    +++ b/clang/test/CodeGen/attr-cpuspecific.c
    @@ -42,7 +42,7 @@ void SingleVersion(void){}
     
     ATTR(cpu_dispatch(ivybridge))
     void SingleVersion(void);
    -// LINUX: define weak_odr ptr @SingleVersion.resolver()
    +// LINUX: define weak_odr ptr @SingleVersion.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 525311
    @@ -51,7 +51,7 @@ void SingleVersion(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @SingleVersion() comdat
    +// WINDOWS: define weak_odr dso_local void @SingleVersion() #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 525311
    @@ -72,7 +72,7 @@ void TwoVersions(void);
     
     ATTR(cpu_dispatch(ivybridge, knl))
     void TwoVersions(void);
    -// LINUX: define weak_odr ptr @TwoVersions.resolver()
    +// LINUX: define weak_odr ptr @TwoVersions.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
    @@ -82,7 +82,7 @@ void TwoVersions(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @TwoVersions() comdat
    +// WINDOWS: define weak_odr dso_local void @TwoVersions() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
     // WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
    @@ -119,13 +119,13 @@ void CpuSpecificNoDispatch(void) {}
     
     ATTR(cpu_dispatch(knl))
     void OrderDispatchUsageSpecific(void);
    -// LINUX: define weak_odr ptr @OrderDispatchUsageSpecific.resolver()
    +// LINUX: define weak_odr ptr @OrderDispatchUsageSpecific.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @OrderDispatchUsageSpecific.Z
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @OrderDispatchUsageSpecific() comdat
    +// WINDOWS: define weak_odr dso_local void @OrderDispatchUsageSpecific() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call void @OrderDispatchUsageSpecific.Z()
     // WINDOWS-NEXT: ret void
    @@ -173,14 +173,14 @@ void usages(void) {
     // has an extra config to emit!
     ATTR(cpu_dispatch(ivybridge, knl, atom))
     void TwoVersionsSameAttr(void);
    -// LINUX: define weak_odr ptr @TwoVersionsSameAttr.resolver()
    +// LINUX: define weak_odr ptr @TwoVersionsSameAttr.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @TwoVersionsSameAttr.Z
     // LINUX: ret ptr @TwoVersionsSameAttr.S
     // LINUX: ret ptr @TwoVersionsSameAttr.O
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @TwoVersionsSameAttr() comdat
    +// WINDOWS: define weak_odr dso_local void @TwoVersionsSameAttr() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @TwoVersionsSameAttr.Z
     // WINDOWS-NEXT: ret void
     // WINDOWS: call void @TwoVersionsSameAttr.S
    @@ -192,7 +192,7 @@ void TwoVersionsSameAttr(void);
     
     ATTR(cpu_dispatch(atom, ivybridge, knl))
     void ThreeVersionsSameAttr(void){}
    -// LINUX: define weak_odr ptr @ThreeVersionsSameAttr.resolver()
    +// LINUX: define weak_odr ptr @ThreeVersionsSameAttr.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @ThreeVersionsSameAttr.Z
     // LINUX: ret ptr @ThreeVersionsSameAttr.S
    @@ -200,7 +200,7 @@ void ThreeVersionsSameAttr(void){}
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @ThreeVersionsSameAttr() comdat
    +// WINDOWS: define weak_odr dso_local void @ThreeVersionsSameAttr() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @ThreeVersionsSameAttr.Z
     // WINDOWS-NEXT: ret void
    @@ -213,10 +213,10 @@ void ThreeVersionsSameAttr(void){}
     
     ATTR(cpu_dispatch(knl))
     void OrderSpecificUsageDispatch(void);
    -// LINUX: define weak_odr ptr @OrderSpecificUsageDispatch.resolver()
    +// LINUX: define weak_odr ptr @OrderSpecificUsageDispatch.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @OrderSpecificUsageDispatch.Z
     
    -// WINDOWS: define weak_odr dso_local void @OrderSpecificUsageDispatch() comdat
    +// WINDOWS: define weak_odr dso_local void @OrderSpecificUsageDispatch() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @OrderSpecificUsageDispatch.Z
     // WINDOWS-NEXT: ret void
    @@ -224,7 +224,7 @@ void OrderSpecificUsageDispatch(void);
     // No Cpu Specific options.
     ATTR(cpu_dispatch(atom, ivybridge, knl))
     void NoSpecifics(void);
    -// LINUX: define weak_odr ptr @NoSpecifics.resolver()
    +// LINUX: define weak_odr ptr @NoSpecifics.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @NoSpecifics.Z
     // LINUX: ret ptr @NoSpecifics.S
    @@ -232,7 +232,7 @@ void NoSpecifics(void);
     // LINUX: call void @llvm.trap
     // LINUX: unreachable
     
    -// WINDOWS: define weak_odr dso_local void @NoSpecifics() comdat
    +// WINDOWS: define weak_odr dso_local void @NoSpecifics() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @NoSpecifics.Z
     // WINDOWS-NEXT: ret void
    @@ -245,7 +245,7 @@ void NoSpecifics(void);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     void HasGeneric(void);
    -// LINUX: define weak_odr ptr @HasGeneric.resolver()
    +// LINUX: define weak_odr ptr @HasGeneric.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasGeneric.Z
     // LINUX: ret ptr @HasGeneric.S
    @@ -253,7 +253,7 @@ void HasGeneric(void);
     // LINUX: ret ptr @HasGeneric.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local void @HasGeneric() comdat
    +// WINDOWS: define weak_odr dso_local void @HasGeneric() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @HasGeneric.Z
     // WINDOWS-NEXT: ret void
    @@ -267,7 +267,7 @@ void HasGeneric(void);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     void HasParams(int i, double d);
    -// LINUX: define weak_odr ptr @HasParams.resolver()
    +// LINUX: define weak_odr ptr @HasParams.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasParams.Z
     // LINUX: ret ptr @HasParams.S
    @@ -275,7 +275,7 @@ void HasParams(int i, double d);
     // LINUX: ret ptr @HasParams.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local void @HasParams(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local void @HasParams(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: call void @HasParams.Z(i32 %0, double %1)
     // WINDOWS-NEXT: ret void
    @@ -289,7 +289,7 @@ void HasParams(int i, double d);
     
     ATTR(cpu_dispatch(atom, generic, ivybridge, knl))
     int HasParamsAndReturn(int i, double d);
    -// LINUX: define weak_odr ptr @HasParamsAndReturn.resolver()
    +// LINUX: define weak_odr ptr @HasParamsAndReturn.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @HasParamsAndReturn.Z
     // LINUX: ret ptr @HasParamsAndReturn.S
    @@ -297,7 +297,7 @@ int HasParamsAndReturn(int i, double d);
     // LINUX: ret ptr @HasParamsAndReturn.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local i32 @HasParamsAndReturn(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @HasParamsAndReturn(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: %[[RET:.+]] = musttail call i32 @HasParamsAndReturn.Z(i32 %0, double %1)
     // WINDOWS-NEXT: ret i32 %[[RET]]
    @@ -311,14 +311,14 @@ int HasParamsAndReturn(int i, double d);
     
     ATTR(cpu_dispatch(atom, generic, pentium))
     int GenericAndPentium(int i, double d);
    -// LINUX: define weak_odr ptr @GenericAndPentium.resolver()
    +// LINUX: define weak_odr ptr @GenericAndPentium.resolver() #[[ATTR_RESOLVER]]
     // LINUX: call void @__cpu_indicator_init
     // LINUX: ret ptr @GenericAndPentium.O
     // LINUX: ret ptr @GenericAndPentium.B
     // LINUX-NOT: ret ptr @GenericAndPentium.A
     // LINUX-NOT: call void @llvm.trap
     
    -// WINDOWS: define weak_odr dso_local i32 @GenericAndPentium(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @GenericAndPentium(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init
     // WINDOWS: %[[RET:.+]] = musttail call i32 @GenericAndPentium.O(i32 %0, double %1)
     // WINDOWS-NEXT: ret i32 %[[RET]]
    @@ -329,11 +329,11 @@ int GenericAndPentium(int i, double d);
     
     ATTR(cpu_dispatch(atom, pentium))
     int DispatchFirst(void);
    -// LINUX: define weak_odr ptr @DispatchFirst.resolver
    +// LINUX: define weak_odr ptr @DispatchFirst.resolver() #[[ATTR_RESOLVER]]
     // LINUX: ret ptr @DispatchFirst.O
     // LINUX: ret ptr @DispatchFirst.B
     
    -// WINDOWS: define weak_odr dso_local i32 @DispatchFirst() comdat
    +// WINDOWS: define weak_odr dso_local i32 @DispatchFirst() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: %[[RET:.+]] = musttail call i32 @DispatchFirst.O()
     // WINDOWS-NEXT: ret i32 %[[RET]]
     // WINDOWS: %[[RET:.+]] = musttail call i32 @DispatchFirst.B()
    @@ -360,6 +360,7 @@ void OrderDispatchUsageSpecific(void) {}
     
     // CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
     // CHECK-SAME: "tune-cpu"="ivybridge"
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
     // CHECK-SAME: "tune-cpu"="knl"
     // CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87"
    diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
    index 57add8b8c8abc..f790273e02aa8 100644
    --- a/clang/test/CodeGen/attr-target-clones-aarch64.c
    +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
    @@ -172,7 +172,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 [[ADD5]]
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_def.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_def.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -194,7 +195,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_def.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup1.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -208,7 +210,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_dup1.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup2.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -230,7 +233,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret ptr @ftc_dup2.default
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_dup3.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_dup3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -273,7 +277,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_inline2.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_inline2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -337,7 +342,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define {{[^@]+}}@ftc_inline3.resolver() comdat {
    +// CHECK-LABEL: define {{[^@]+}}@ftc_inline3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -563,7 +569,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 [[ADD5]]
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -585,7 +592,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_def.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -599,7 +607,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup1.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -621,7 +630,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_dup2.default
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -664,7 +674,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 2
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -728,7 +739,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI-NEXT:    ret i32 3
     //
     //
    -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.resolver() comdat {
    +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.resolver()
    +// CHECK-MTE-BTI-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-MTE-BTI-NEXT:  resolver_entry:
     // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -749,6 +761,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
     // CHECK-MTE-BTI:       resolver_else2:
     // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
    +// CHECK-MTE-BTI: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGen/attr-target-clones-riscv.c b/clang/test/CodeGen/attr-target-clones-riscv.c
    index 642302ba9d229..77e935127313f 100644
    --- a/clang/test/CodeGen/attr-target-clones-riscv.c
    +++ b/clang/test/CodeGen/attr-target-clones-riscv.c
    @@ -53,7 +53,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo1.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -84,7 +85,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo2.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -116,7 +118,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo3.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -141,7 +144,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo4.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo4.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -160,7 +164,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 5
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo5.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo5.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    ret ptr @foo5.default
    @@ -178,7 +183,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo6.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo6.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -215,7 +221,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo7.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo7.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -266,7 +273,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo8.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo8.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -317,7 +325,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7()
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo9.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo9.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGen/attr-target-clones.c b/clang/test/CodeGen/attr-target-clones.c
    index 3256db061f9a2..295b25d6478eb 100644
    --- a/clang/test/CodeGen/attr-target-clones.c
    +++ b/clang/test/CodeGen/attr-target-clones.c
    @@ -44,45 +44,45 @@
     static int __attribute__((target_clones("sse4.2, default"))) internal(void) { return 0; }
     int use(void) { return internal(); }
     /// Internal linkage resolvers do not use comdat.
    -// LINUX: define internal ptr @internal.resolver() {
    -// DARWIN: define internal ptr @internal.resolver() {
    -// WINDOWS: define internal i32 @internal() {
    +// LINUX: define internal ptr @internal.resolver() #[[ATTR_RESOLVER:[0-9]+]] {
    +// DARWIN: define internal ptr @internal.resolver() #[[ATTR_RESOLVER:[0-9]+]] {
    +// WINDOWS: define internal i32 @internal() #[[ATTR_RESOLVER:[0-9]+]] {
     
     int __attribute__((target_clones("sse4.2, default"))) foo(void) { return 0; }
     // LINUX: define {{.*}}i32 @foo.sse4.2.0()
     // LINUX: define {{.*}}i32 @foo.default.1()
    -// LINUX: define weak_odr ptr @foo.resolver() comdat
    +// LINUX: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo.sse4.2.0
     // LINUX: ret ptr @foo.default.1
     
     // DARWIN: define {{.*}}i32 @foo.sse4.2.0()
     // DARWIN: define {{.*}}i32 @foo.default.1()
    -// DARWIN: define weak_odr ptr @foo.resolver() {
    +// DARWIN: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo.sse4.2.0
     // DARWIN: ret ptr @foo.default.1
     
     // WINDOWS: define dso_local i32 @foo.sse4.2.0()
     // WINDOWS: define dso_local i32 @foo.default.1()
    -// WINDOWS: define weak_odr dso_local i32 @foo() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo.sse4.2.0
     // WINDOWS: musttail call i32 @foo.default.1
     
     __attribute__((target_clones("default,default ,sse4.2"))) void foo_dupes(void) {}
     // LINUX: define {{.*}}void @foo_dupes.default.1()
     // LINUX: define {{.*}}void @foo_dupes.sse4.2.0()
    -// LINUX: define weak_odr ptr @foo_dupes.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_dupes.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_dupes.sse4.2.0
     // LINUX: ret ptr @foo_dupes.default.1
     
     // DARWIN: define {{.*}}void @foo_dupes.default.1()
     // DARWIN: define {{.*}}void @foo_dupes.sse4.2.0()
    -// DARWIN: define weak_odr ptr @foo_dupes.resolver() {
    +// DARWIN: define weak_odr ptr @foo_dupes.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_dupes.sse4.2.0
     // DARWIN: ret ptr @foo_dupes.default.1
     
     // WINDOWS: define dso_local void @foo_dupes.default.1()
     // WINDOWS: define dso_local void @foo_dupes.sse4.2.0()
    -// WINDOWS: define weak_odr dso_local void @foo_dupes() comdat
    +// WINDOWS: define weak_odr dso_local void @foo_dupes() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @foo_dupes.sse4.2.0
     // WINDOWS: musttail call void @foo_dupes.default.1
     
    @@ -109,19 +109,19 @@ int bar(void) {
     void __attribute__((target_clones("default, arch=ivybridge"))) unused(void) {}
     // LINUX: define {{.*}}void @unused.default.1()
     // LINUX: define {{.*}}void @unused.arch_ivybridge.0()
    -// LINUX: define weak_odr ptr @unused.resolver() comdat
    +// LINUX: define weak_odr ptr @unused.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @unused.arch_ivybridge.0
     // LINUX: ret ptr @unused.default.1
     
     // DARWIN: define {{.*}}void @unused.default.1()
     // DARWIN: define {{.*}}void @unused.arch_ivybridge.0()
    -// DARWIN: define weak_odr ptr @unused.resolver() {
    +// DARWIN: define weak_odr ptr @unused.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @unused.arch_ivybridge.0
     // DARWIN: ret ptr @unused.default.1
     
     // WINDOWS: define dso_local void @unused.default.1()
     // WINDOWS: define dso_local void @unused.arch_ivybridge.0()
    -// WINDOWS: define weak_odr dso_local void @unused() comdat
    +// WINDOWS: define weak_odr dso_local void @unused() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @unused.arch_ivybridge.0
     // WINDOWS: musttail call void @unused.default.1
     
    @@ -144,34 +144,34 @@ int bar3(void) {
       // WINDOWS: call i32 @foo_inline2()
     }
     
    -// LINUX: define weak_odr ptr @foo_inline.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_inline.arch_sandybridge.0
     // LINUX: ret ptr @foo_inline.sse4.2.1
     // LINUX: ret ptr @foo_inline.default.2
     
    -// DARWIN: define weak_odr ptr @foo_inline.resolver() {
    +// DARWIN: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_inline.arch_sandybridge.0
     // DARWIN: ret ptr @foo_inline.sse4.2.1
     // DARWIN: ret ptr @foo_inline.default.2
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_inline.arch_sandybridge.0
     // WINDOWS: musttail call i32 @foo_inline.sse4.2.1
     // WINDOWS: musttail call i32 @foo_inline.default.2
     
     inline int __attribute__((target_clones("arch=sandybridge,default,sse4.2")))
     foo_inline2(void){ return 0; }
    -// LINUX: define weak_odr ptr @foo_inline2.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_inline2.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_inline2.arch_sandybridge.0
     // LINUX: ret ptr @foo_inline2.sse4.2.1
     // LINUX: ret ptr @foo_inline2.default.2
     
    -// DARWIN: define weak_odr ptr @foo_inline2.resolver() {
    +// DARWIN: define weak_odr ptr @foo_inline2.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_inline2.arch_sandybridge.0
     // DARWIN: ret ptr @foo_inline2.sse4.2.1
     // DARWIN: ret ptr @foo_inline2.default.2
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline2() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline2() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_inline2.arch_sandybridge.0
     // WINDOWS: musttail call i32 @foo_inline2.sse4.2.1
     // WINDOWS: musttail call i32 @foo_inline2.default.2
    @@ -194,15 +194,15 @@ int test_foo_used_no_defn(void) {
     }
     
     
    -// LINUX: define weak_odr ptr @foo_used_no_defn.resolver() comdat
    +// LINUX: define weak_odr ptr @foo_used_no_defn.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX: ret ptr @foo_used_no_defn.sse4.2.0
     // LINUX: ret ptr @foo_used_no_defn.default.1
     
    -// DARWIN: define weak_odr ptr @foo_used_no_defn.resolver() {
    +// DARWIN: define weak_odr ptr @foo_used_no_defn.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN: ret ptr @foo_used_no_defn.sse4.2.0
     // DARWIN: ret ptr @foo_used_no_defn.default.1
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_used_no_defn() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_used_no_defn() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call i32 @foo_used_no_defn.sse4.2.0
     // WINDOWS: musttail call i32 @foo_used_no_defn.default.1
     
    @@ -213,7 +213,7 @@ int isa_level(int) { return 0; }
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
     // LINUX:      define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
    -// LINUX:      define weak_odr ptr @isa_level.resolver() comdat
    +// LINUX:      define weak_odr ptr @isa_level.resolver() #[[ATTR_RESOLVER]] comdat
     // LINUX:        call void @__cpu_indicator_init()
     // LINUX-NEXT:   load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
     // LINUX-NEXT:   and i32 %[[#]], 4
    @@ -234,7 +234,7 @@ int isa_level(int) { return 0; }
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
     // DARWIN:      define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
    -// DARWIN:      define weak_odr ptr @isa_level.resolver() {
    +// DARWIN:      define weak_odr ptr @isa_level.resolver() #[[ATTR_RESOLVER]] {
     // DARWIN:        call void @__cpu_indicator_init()
     // DARWIN-NEXT:   load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
     // DARWIN-NEXT:   and i32 %[[#]], 4
    @@ -288,6 +288,7 @@ int isa_level(int) { return 0; }
     // WINDOWS: declare dso_local i32 @foo_used_no_defn.sse4.2.0()
     
     
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[SSE42]] =
     // CHECK-SAME: "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87"
     // CHECK: attributes #[[SB]] =
    diff --git a/clang/test/CodeGen/attr-target-mv-va-args.c b/clang/test/CodeGen/attr-target-mv-va-args.c
    index dbf5a74205c4c..e8238dac8f310 100644
    --- a/clang/test/CodeGen/attr-target-mv-va-args.c
    +++ b/clang/test/CodeGen/attr-target-mv-va-args.c
    @@ -24,7 +24,7 @@ int bar(void) {
     // IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
     // IFUNC-ELF: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// IFUNC-ELF: define weak_odr ptr @foo.resolver() comdat
    +// IFUNC-ELF: define weak_odr ptr @foo.resolver() #{{[0-9]+}} comdat
     // IFUNC-ELF: ret ptr @foo.arch_sandybridge
     // IFUNC-ELF: ret ptr @foo.arch_ivybridge
     // IFUNC-ELF: ret ptr @foo.sse4.2
    @@ -42,7 +42,7 @@ int bar(void) {
     // IFUNC-MACHO: call i32 (i32, ...) @foo.ifunc(i32 noundef 1, i32 noundef 97, double
     // IFUNC-MACHO: call i32 (i32, ...) @foo.ifunc(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// IFUNC-MACHO: define weak_odr ptr @foo.resolver()
+// IFUNC-MACHO: define weak_odr ptr @foo.resolver() #{{[0-9]+}}
     // IFUNC-MACHO: ret ptr @foo.arch_sandybridge
     // IFUNC-MACHO: ret ptr @foo.arch_ivybridge
     // IFUNC-MACHO: ret ptr @foo.sse4.2
    @@ -55,12 +55,12 @@ int bar(void) {
     // NO-IFUNC: ret i32 1
     // NO-IFUNC: define dso_local i32 @foo(i32 noundef %i, ...)
     // NO-IFUNC: ret i32 2
 // NO-IFUNC: define dso_local i32 @bar()
     // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 1, i32 noundef 97, double
     // NO-IFUNC: call i32 (i32, ...) @foo.resolver(i32 noundef 2, double noundef 2.2{{[0-9Ee+]+}}, ptr noundef
     
    -// WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) comdat
    -// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo.resolver(i32 %0, ...) #{{[0-9]+}} comdat
    +// NO-IFUNC-ELF: define weak_odr i32 @foo.resolver(i32 %0, ...) #{{[0-9]+}} comdat
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_sandybridge
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.arch_ivybridge
     // NO-IFUNC: musttail call i32 (i32, ...) @foo.sse4.2
    diff --git a/clang/test/CodeGen/attr-target-mv.c b/clang/test/CodeGen/attr-target-mv.c
    index b8807dd9171d5..4ab5d6f950ccd 100644
    --- a/clang/test/CodeGen/attr-target-mv.c
    +++ b/clang/test/CodeGen/attr-target-mv.c
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,LINUX
    -// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,DARWIN
    -// RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefix=WINDOWS
    +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ITANIUM,LINUX
    +// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,ITANIUM,DARWIN
    +// RUN: %clang_cc1 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,WINDOWS
     
     int __attribute__((target("sse4.2"))) foo(void) { return 0; }
     int __attribute__((target("arch=sandybridge"))) foo(void);
    @@ -277,7 +277,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local i32 @bar()
     // WINDOWS: call i32 @foo.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo.resolver()
+// ITANIUM: define weak_odr ptr @foo.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @foo.arch_sandybridge
    @@ -285,7 +285,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo.sse4.2
     // ITANIUM: ret ptr @foo
     
    -// WINDOWS: define weak_odr dso_local i32 @foo.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo.resolver() #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @foo.arch_sandybridge
     // WINDOWS: call i32 @foo.arch_ivybridge
    @@ -293,9 +293,9 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: call i32 @foo
     
     /// Internal linkage resolvers do not use comdat.
    -// ITANIUM: define internal ptr @foo_internal.resolver() {
    +// ITANIUM: define internal ptr @foo_internal.resolver() #[[ATTR_RESOLVER]] {
     
    -// WINDOWS: define internal i32 @foo_internal.resolver() {
    +// WINDOWS: define internal i32 @foo_internal.resolver() #[[ATTR_RESOLVER]] {
     
     // ITANIUM: define{{.*}} i32 @bar2()
     // ITANIUM: call i32 @foo_inline.ifunc()
    @@ -303,7 +303,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local i32 @bar2()
     // WINDOWS: call i32 @foo_inline.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo_inline.resolver()
+// ITANIUM: define weak_odr ptr @foo_inline.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @foo_inline.arch_sandybridge
    @@ -311,7 +311,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo_inline.sse4.2
     // ITANIUM: ret ptr @foo_inline
     
    -// WINDOWS: define weak_odr dso_local i32 @foo_inline.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @foo_inline.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @foo_inline.arch_sandybridge
     // WINDOWS: call i32 @foo_inline.arch_ivybridge
    @@ -324,12 +324,12 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local void @bar3()
     // WINDOWS: call void @foo_decls.resolver()
     
    -// ITANIUM: define weak_odr ptr @foo_decls.resolver()
    +// ITANIUM: define weak_odr ptr @foo_decls.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @foo_decls.sse4.2
     // ITANIUM: ret ptr @foo_decls
     
    -// WINDOWS: define weak_odr dso_local void @foo_decls.resolver() comdat
    +// WINDOWS: define weak_odr dso_local void @foo_decls.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @foo_decls.sse4.2
     // WINDOWS: call void @foo_decls
     
    @@ -339,7 +339,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define dso_local void @bar4()
     // WINDOWS: call void @foo_multi.resolver(i32 noundef 1, double noundef 5.{{[0+e]*}})
     
    -// ITANIUM: define weak_odr ptr @foo_multi.resolver()
+// ITANIUM: define weak_odr ptr @foo_multi.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: and i32 %{{.*}}, 4352
     // ITANIUM: icmp eq i32 %{{.*}}, 4352
    @@ -353,7 +353,7 @@ void calls_pr50025c(void) { pr50025c(); }
     // ITANIUM: ret ptr @foo_multi.avx_sse4.2
     // ITANIUM: ret ptr @foo_multi
     
    -// WINDOWS: define weak_odr dso_local void @foo_multi.resolver(i32 %0, double %1) comdat
    +// WINDOWS: define weak_odr dso_local void @foo_multi.resolver(i32 %0, double %1) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: and i32 %{{.*}}, 4352
     // WINDOWS: icmp eq i32 %{{.*}}, 4352
     // WINDOWS: call void @foo_multi.fma4_sse4.2(i32 %0, double %1)
    @@ -392,20 +392,20 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: call i32 @fwd_decl_default.resolver()
     // WINDOWS: call i32 @fwd_decl_avx.resolver()
     
    -// ITANIUM: define weak_odr ptr @fwd_decl_default.resolver()
    +// ITANIUM: define weak_odr ptr @fwd_decl_default.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @fwd_decl_default
    -// ITANIUM: define weak_odr ptr @fwd_decl_avx.resolver()
    +// ITANIUM: define weak_odr ptr @fwd_decl_avx.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: call void @__cpu_indicator_init()
     // ITANIUM: ret ptr @fwd_decl_avx.avx
     // ITANIUM: ret ptr @fwd_decl_avx
     
    -// WINDOWS: define weak_odr dso_local i32 @fwd_decl_default.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @fwd_decl_default.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @fwd_decl_default
    -// WINDOWS: define weak_odr dso_local i32 @fwd_decl_avx.resolver() comdat
    +// WINDOWS: define weak_odr dso_local i32 @fwd_decl_avx.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call void @__cpu_indicator_init()
     // WINDOWS: call i32 @fwd_decl_avx.avx
     // WINDOWS: call i32 @fwd_decl_avx
    @@ -478,12 +478,14 @@ void calls_pr50025c(void) { pr50025c(); }
     // WINDOWS: define linkonce_odr dso_local void @pr50025c() #{{[0-9]*}} comdat
     // WINDOWS: call void @pr50025b.resolver()
     
    -// ITANIUM: define weak_odr ptr @pr50025b.resolver()
+// ITANIUM: define weak_odr ptr @pr50025b.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @pr50025b
     // ITANIUM: define linkonce void @pr50025b()
     // ITANIUM: call void @must_be_emitted()
    -// WINDOWS: define weak_odr dso_local void @pr50025b.resolver() comdat
    +// WINDOWS: define weak_odr dso_local void @pr50025b.resolver() #[[ATTR_RESOLVER]] comdat
     // WINDOWS: musttail call void @pr50025b()
     // WINDOWS: define linkonce_odr dso_local void @pr50025b() #{{[0-9]*}} comdat
     // WINDOWS: call void @must_be_emitted()
    +
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
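
The RUN-line change at the top of this file is what makes the shared check fire: FileCheck only honors directives whose prefix is listed, so the unprefixed `// CHECK:` attributes line would be silently ignored unless CHECK is added to every --check-prefixes list. A minimal sketch of the idiom (hypothetical test, same conventions as this file):

    // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
    // RUN:   | FileCheck %s --check-prefixes=CHECK,LINUX
    // LINUX: define weak_odr ptr @f.resolver()
    // CHECK: attributes #{{[0-9]+}} = { disable_sanitizer_instrumentation }
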
    diff --git a/clang/test/CodeGen/attr-target-version-riscv.c b/clang/test/CodeGen/attr-target-version-riscv.c
    index fbead04caf455..96f0c37e06725 100644
    --- a/clang/test/CodeGen/attr-target-version-riscv.c
    +++ b/clang/test/CodeGen/attr-target-version-riscv.c
    @@ -49,7 +49,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo1.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo1.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -74,7 +75,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo2.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo2.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -112,7 +114,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo3.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo3.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -150,7 +153,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo4.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo4.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -201,7 +205,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo5.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo5.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -252,7 +257,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo6.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo6.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -303,7 +309,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5() + foo6() + foo7();
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @foo7.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @foo7.resolver()
    +// CHECK-SAME: #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c
    index a61b8dd4ac376..0c21a4cb1442c 100644
    --- a/clang/test/CodeGen/basic-block-sections.c
    +++ b/clang/test/CodeGen/basic-block-sections.c
    @@ -30,8 +30,10 @@ int another(int a) {
     //
     // BB_WORLD: .section .text.world,"ax",@progbits{{$}}
     // BB_WORLD: world:
    -// BB_WORLD: .section .text.world,"ax",@progbits,unique
    -// BB_WORLD: world.__part.1:
    +// BB_ALL: .section .text.world,"ax",@progbits,unique
    +// BB_ALL: world.__part.1:
    +// BB_LIST: .section .text.split.world,"ax",@progbits
    +// BB_LIST: world.cold:
     // BB_ALL: .section .text.another,"ax",@progbits
     // BB_ALL: another.__part.1:
     // BB_LIST-NOT: .section .text.another,"ax",@progbits
    diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c
    index 2313a68d2d0e2..3ebf2ac91ccdf 100644
    --- a/clang/test/CodeGen/builtin-sqrt.c
    +++ b/clang/test/CodeGen/builtin-sqrt.c
    @@ -11,5 +11,5 @@ float foo(float X) {
     // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none)
     
     // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]]
    -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
     
    diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
    index e9344d8fe0b8b..2df485f0155c3 100644
    --- a/clang/test/CodeGen/builtins-elementwise-math.c
    +++ b/clang/test/CodeGen/builtins-elementwise-math.c
    @@ -6,6 +6,7 @@ typedef half half2 __attribute__((ext_vector_type(2)));
     typedef float float2 __attribute__((ext_vector_type(2)));
     typedef float float4 __attribute__((ext_vector_type(4)));
     typedef short int si8 __attribute__((ext_vector_type(8)));
    +typedef int int4 __attribute__((ext_vector_type(4)));
     typedef unsigned int u4 __attribute__((ext_vector_type(4)));
     typedef double double2 __attribute__((ext_vector_type(2)));
     typedef double double3 __attribute__((ext_vector_type(3)));
    @@ -729,6 +730,36 @@ void test_builtin_elementwise_exp10(float f1, float f2, double d1, double d2,
       vf2 = __builtin_elementwise_exp10(vf1);
     }
     
    +void test_builtin_elementwise_ldexp(float f1, float f2, double d1, double d2,
    +                                    float4 vf1, float4 vf2, int i1, int4 vi1, short s1, long l1) {
    +  // CHECK-LABEL: define void @test_builtin_elementwise_ldexp(
    +  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[I1:%.+]] = load i32, ptr %i1.addr, align 4
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i32(float [[F1]], i32 [[I1]])
    +  f2 = __builtin_elementwise_ldexp(f1, i1);
    +
    +  // CHECK:      [[F2:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[S1:%.+]] = load i16, ptr %s1.addr, align 2
    +  // CHECK:      [[Ext1:%.+]] = sext i16 [[S1]] to i32
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i32(float [[F2]], i32 [[Ext1]])
    +  f2 = __builtin_elementwise_ldexp(f1, s1);
    +
    +  // CHECK:      [[F3:%.+]] = load float, ptr %f1.addr, align 4
    +  // CHECK:      [[L1:%.+]] = load i64, ptr %l1.addr, align 8
    +  // CHECK-NEXT: call float @llvm.ldexp.f32.i64(float [[F3]], i64 [[L1]])
    +  f2 = __builtin_elementwise_ldexp(f1, l1);
    +
    +  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
    +  // CHECK:      [[I2:%.+]] = load i32, ptr %i1.addr, align 4
    +  // CHECK-NEXT: call double @llvm.ldexp.f64.i32(double [[D1]], i32 [[I2]])
    +  d2 = __builtin_elementwise_ldexp(d1, i1);
    +
    +  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
    +  // CHECK:      [[VI1:%.+]] = load <4 x i32>, ptr %vi1.addr, align 16
    +  // CHECK-NEXT: call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> [[VF1]], <4 x i32> [[VI1]])
    +  vf2 = __builtin_elementwise_ldexp(vf1, vi1);
    +}
    +
     void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                         float4 vf1, float4 vf2) {
       // CHECK-LABEL: define void @test_builtin_elementwise_floor(
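
Besides exercising the new builtin, the added test documents its type rules: the exponent operand keeps its integer type in the intrinsic mangling (`llvm.ldexp.f32.i32` vs. `llvm.ldexp.f32.i64`), with sub-`int` types promoted first, which is why the `short` argument is sign-extended to `i32` above. A minimal caller under those rules (hypothetical function name):

    float scale_by_pow2(float x, int e) {
      // lowers to: call float @llvm.ldexp.f32.i32(float %x, i32 %e)
      return __builtin_elementwise_ldexp(x, e);
    }
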
    diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    index 035c4c6066be2..60a35f4fe0c37 100644
    --- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
    @@ -8,7 +8,7 @@
     typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
     
     // CHECK: call half @llvm.nvvm.ex2.approx.f16(half {{.*}})
    -// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> {{.*}})
    +// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
     // CHECK: call half @llvm.nvvm.fma.rn.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
     // CHECK: call half @llvm.nvvm.fma.rn.ftz.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
     // CHECK: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
    diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    index 01a004efd71e4..1f16c7e54b85d 100644
    --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
    @@ -41,7 +41,7 @@ __device__ void nvvm_ex2_sm75() {
     #if __CUDA_ARCH__ >= 750
       // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
       __nvvm_ex2_approx_f16(0.1f16);
    -  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
    +  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.v2f16
       __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
     #endif
       // CHECK: ret void
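
The only change in these two NVPTX tests is the intrinsic's type suffix: the two-element half overload now uses LLVM's standard vector mangling, `v2f16`, rather than the legacy `f16x2` spelling, while the source-level builtin keeps its old name:

    // CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
    __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
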
    diff --git a/clang/test/CodeGen/exprs.c b/clang/test/CodeGen/exprs.c
    index 5cca9722dcb3a..93015da074bf2 100644
    --- a/clang/test/CodeGen/exprs.c
    +++ b/clang/test/CodeGen/exprs.c
    @@ -196,10 +196,17 @@ void f18(void) {
     
     // Ensure the right stmt is returned
     int f19(void) {
    -  return ({ 3;;4;; });
    +  return ({ 3;;4; });
     }
     // CHECK-LABEL: define{{.*}} i32 @f19()
     // CHECK: [[T:%.*]] = alloca i32
     // CHECK: store i32 4, ptr [[T]]
     // CHECK: [[L:%.*]] = load i32, ptr [[T]]
     // CHECK: ret i32 [[L]]
    +
+// PR166036: The trailing NullStmt should give the statement expression void type.
    +void f20(void) {
    +  return ({ 3;;4;; });
    +}
    +// CHECK-LABEL: define{{.*}} void @f20()
    +// CHECK: ret void
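
The f19/f20 pair pins down the GNU statement-expression rule: the value and type of `({ ... })` come from its last statement, so an expression statement there yields that expression's value, while a trailing extra `;` is a null statement and makes the whole construct void (the PR166036 fix above). A self-contained illustration:

    int  f(void) { return ({ 3;; 4; });  } // last statement is `4;`, value is 4
    void g(void) { return ({ 3;; 4;; }); } // trailing `;` is a NullStmt, type is void
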
    diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c
    index 1e4b06e34aaf9..923719f3ec8e4 100644
    --- a/clang/test/CodeGen/libcalls.c
    +++ b/clang/test/CodeGen/libcalls.c
    @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) {
     // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef)
     // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef)
     // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef)
    -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]]
    -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]]
    -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RNI]]
    +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]]
     
     // Just checking to make sure these library functions are marked readnone
     void test_builtins(double d, float f, long double ld) {
    @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) {
       double atan_ = atan(d);
       long double atanl_ = atanl(ld);
       float atanf_ = atanf(f);
    -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]]
    -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]]
    -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]]
    +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]]
     // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]]
     // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]]
     // CHECK-YES: declare float @atanf(float noundef) [[NUW]]
    @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) {
       double atan2_ = atan2(d, 2);
       long double atan2l_ = atan2l(ld, ld);
       float atan2f_ = atan2f(f, f);
    -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]]
    -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]]
    -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]]
    +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]]
    +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]]
    +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]]
     // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]]
     // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]]
     // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]]
    @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) {
     }
     
     // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" }
    -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
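
The capture swap above (`NUW_RN2` vs. `NUW_RNI`) follows from how FileCheck variables bind: `[[NAME:#[0-9]+]]` binds at the line that defines it, and later bare `[[NAME]]` uses must match that binding. With `nocreateundeforpoison` added to some intrinsic attribute sets, the attribute groups no longer line up as before, so each definition has to sit on the first `declare` line that introduces its group, e.g.:

    // CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[G:#[0-9]+]]
    // CHECK-NO-DAG: attributes [[G]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
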
    diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
    index ad297828f48ed..d4cd6f86b3c51 100644
    --- a/clang/test/CodeGen/math-libcalls.c
    +++ b/clang/test/CodeGen/math-libcalls.c
    @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       copysign(f,f); copysignf(f,f);copysignl(f,f);
     
    -  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -  // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]]
    -  // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +  // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]]
    +  // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       fabs(f);       fabsf(f);      fabsl(f);
     
    -  // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]]
    -  // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +  // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]]
    +  // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     
       frexp(f,i);    frexpf(f,i);   frexpl(f,i);
     
    @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
       // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
       // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
       // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -  // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]]
    +  // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]]
       // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]]
       // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]]
       // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]]
    @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       pow(f,f);        powf(f,f);       powl(f,f);
     
    -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       ceil(f);       ceilf(f);      ceill(f);
     
    -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80(
     
       cos(f);        cosf(f);       cosl(f);
     
    -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       exp(f);        expf(f);       expl(f);
     
    -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       exp2(f);       exp2f(f);      exp2l(f);
     
    -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       floor(f);      floorf(f);     floorl(f);
     
    -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80(
     
       fma(f,f,f);        fmaf(f,f,f);       fmal(f,f,f);
     
    -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
    @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       fmax(f,f);       fmaxf(f,f);      fmaxl(f,f);
     
    -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80(
     
       fmin(f,f);       fminf(f,f);      fminl(f,f);
     
    -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80(
     
       fmaximum_num(*d,*d);       fmaximum_numf(f,f);      fmaximum_numl(*l,*l);
     
    -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       fminimum_num(*d,*d);       fminimum_numf(f,f);      fminimum_numl(*l,*l);
     
    -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]]
    -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]]
    -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]]
    +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]]
    +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]]
    +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]]
     
       hypot(f,f);      hypotf(f,f);     hypotl(f,f);
     
    @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       llrint(f);     llrintf(f);    llrintl(f);
     
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       llround(f);    llroundf(f);   llroundl(f);
     
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log(f);        logf(f);       logl(f);
     
    -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log10(f);      log10f(f);     log10l(f);
     
    -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       log2(f);       log2f(f);      log2l(f);
     
    -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       lrint(f);      lrintf(f);     lrintl(f);
     
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       lround(f);     lroundf(f);    lroundl(f);
     
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       nearbyint(f);  nearbyintf(f); nearbyintl(f);
     
    -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80(
    @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       rint(f);       rintf(f);      rintl(f);
     
    -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80(
     
       round(f);      roundf(f);     roundl(f);
     
    -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64(
     // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32(
     // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80(
    @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
     
       sin(f);        sinf(f);       sinl(f);
     
    -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -674,9 +674,9 @@ sincos(f, d, d);       sincosf(f, fp, fp);        sincosl(f, l, l);
     
       sqrt(f);       sqrtf(f);      sqrtl(f);
     
    -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]]
     // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]]
    @@ -722,20 +722,22 @@ sincos(f, d, d);       sincosf(f, fp, fp);        sincosl(f, l, l);
     
       trunc(f);      truncf(f);     truncl(f);
     
    -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]]
    -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]]
    +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]]
    +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]]
     };
     
     // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
    +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
     // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
     // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} }
     
     // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
    +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
     // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} }
     // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} }
    diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c
    new file mode 100644
    index 0000000000000..6c1c87b756f9a
    --- /dev/null
    +++ b/clang/test/CodeGen/ms-empty-enum.c
    @@ -0,0 +1,7 @@
    +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
    +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s
    +
    +typedef enum tag1 {} A;
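    +// Under -fms-extensions an empty enum is still a complete type whose
    +// underlying type is int, so an 'A' argument is expected to be passed
    +// directly as i32 on both triples (checked below).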
    +
    +// CHECK: void @empty_enum(i32 noundef %a)
    +void empty_enum(A a) {}
    diff --git a/clang/test/CodeGen/pr45476.cpp b/clang/test/CodeGen/pr45476.cpp
    index c95f7fb8cd9c3..3a67904a8e568 100644
    --- a/clang/test/CodeGen/pr45476.cpp
    +++ b/clang/test/CodeGen/pr45476.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple armv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s
    +// RUN: %clang_cc1 -triple thumbv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s
     // RUN: %clang_cc1 -triple armv8-eabi -emit-llvm %s -o - | FileCheck -check-prefix=NATIVE %s
     // PR45476
     
    diff --git a/clang/test/CodeGenCXX/aarch64-arguments.cpp b/clang/test/CodeGenCXX/aarch64-arguments.cpp
    index ffb0cafa8882d..3206e38ad0090 100644
    --- a/clang/test/CodeGenCXX/aarch64-arguments.cpp
    +++ b/clang/test/CodeGenCXX/aarch64-arguments.cpp
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -triple arm64-none-linux -emit-llvm -w -o - %s | FileCheck -check-prefix=PCS %s
     
    -// PCS: define{{.*}} void @{{.*}}(i8 %a
    +// PCS: define{{.*}} void @{{.*}}(i64 %a.coerce)
     struct s0 {};
     void f0(s0 a) {}
    diff --git a/clang/test/CodeGenCXX/arm64-darwinpcs.cpp b/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    index a0b0d9efdd4c4..ef0e2da3effac 100644
    --- a/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    +++ b/clang/test/CodeGenCXX/arm64-darwinpcs.cpp
    @@ -7,7 +7,7 @@ void test_extensions(bool a, char b, short c) {}
     
     struct Empty {};
     void test_empty(Empty e) {}
    -// CHECK: define{{.*}} void @_Z10test_empty5Empty(i8
    +// CHECK: define{{.*}} void @_Z10test_empty5Empty(i64 %e.coerce)
     // CHECK-DARWIN: define{{.*}} void @_Z10test_empty5Empty()
     
     struct HFA {
    diff --git a/clang/test/CodeGenCXX/attr-callback.cpp b/clang/test/CodeGenCXX/attr-callback.cpp
    index c3456d6c430ff..efa705b9d06dc 100644
    --- a/clang/test/CodeGenCXX/attr-callback.cpp
    +++ b/clang/test/CodeGenCXX/attr-callback.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -triple i386-unknown-unknown -std=c++23 %s -emit-llvm -o - | FileCheck %s
     
     struct Base {
     
    @@ -47,9 +47,30 @@ struct Derived_2 : public Base {
     // CHECK-NOT: !callback
     void Derived_2::virtual_1(void (*callback)(void)) {}
     
    +class ExplicitParameterObject {
    +  __attribute__((callback(1, 0))) void implicit_this_idx(void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(1, this))) void implicit_this_identifier(void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +};
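    +
    +// With C++23 explicit object parameters the attribute's positional indices
    +// count 'self' as parameter 1, so callback(2, 1) and callback(2, self)
    +// should normalize to the same !callback metadata as the implicit-this
    +// forms above; all four definitions share ![[cid3]].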
    +
    +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject17implicit_this_idxEPFvPS_E({{[^!]*!callback}} ![[cid3:[0-9]+]]
    +void ExplicitParameterObject::implicit_this_idx(void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZN23ExplicitParameterObject24implicit_this_identifierEPFvPS_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::implicit_this_identifier(void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject17explicit_this_idxEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
    +
    +// CHECK-DAG: define{{.*}} void @_ZNH23ExplicitParameterObject24explicit_this_identifierEPS_PFvS0_E({{[^!]*!callback}} ![[cid3]]
    +void ExplicitParameterObject::explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)) {}
    +
     // CHECK-DAG: ![[cid0]] = !{![[cid0b:[0-9]+]]}
     // CHECK-DAG: ![[cid0b]] = !{i64 1, i1 false}
     // CHECK-DAG: ![[cid1]] = !{![[cid1b:[0-9]+]]}
     // CHECK-DAG: ![[cid1b]] = !{i64 2, i1 false}
     // CHECK-DAG: ![[cid2]] = !{![[cid2b:[0-9]+]]}
     // CHECK-DAG: ![[cid2b]] = !{i64 1, i64 0, i64 -1, i64 0, i1 false}
    +// CHECK-DAG: ![[cid3]] = !{![[cid3b:[0-9]+]]}
    +// CHECK-DAG: ![[cid3b]] = !{i64 1, i64 0, i1 false}
    diff --git a/clang/test/CodeGenCXX/attr-cpuspecific.cpp b/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    index 225c6a5c742a5..fc0e1da6edb95 100644
    --- a/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    +++ b/clang/test/CodeGenCXX/attr-cpuspecific.cpp
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=LINUX
    -// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm -o - %s | FileCheck %s --check-prefix=LINUX
    -// RUN: %clang_cc1 -triple x86_64-windows-pc -fms-compatibility -emit-llvm -o - %s | FileCheck %s --check-prefix=WINDOWS
    +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LINUX
    +// RUN: %clang_cc1 -triple x86_64-apple-macos -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LINUX
    +// RUN: %clang_cc1 -triple x86_64-windows-pc -fms-compatibility -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WINDOWS
     
     struct S {
       __attribute__((cpu_specific(atom)))
    @@ -16,14 +16,16 @@ void foo() {
     
     // LINUX: @_ZN1S4FuncEv = weak_odr alias void (ptr), ptr @_ZN1S4FuncEv.ifunc
     // LINUX: @_ZN1S4FuncEv.ifunc = weak_odr ifunc void (ptr), ptr @_ZN1S4FuncEv.resolver
    -// LINUX: define weak_odr ptr @_ZN1S4FuncEv.resolver
    +// LINUX: define weak_odr ptr @_ZN1S4FuncEv.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX: ret ptr @_ZN1S4FuncEv.S
     // LINUX: ret ptr @_ZN1S4FuncEv.O
     // LINUX: declare void @_ZN1S4FuncEv.S
     // LINUX: define linkonce_odr void @_ZN1S4FuncEv.O
     
    -// WINDOWS: define weak_odr dso_local void @"?Func@S@@QEAAXXZ"(ptr %0) comdat
    +// WINDOWS: define weak_odr dso_local void @"?Func@S@@QEAAXXZ"(ptr %0) #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: musttail call void @"?Func@S@@QEAAXXZ.S"(ptr %0)
     // WINDOWS: musttail call void @"?Func@S@@QEAAXXZ.O"(ptr %0)
     // WINDOWS: declare dso_local void @"?Func@S@@QEAAXXZ.S"
     // WINDOWS: define linkonce_odr dso_local void @"?Func@S@@QEAAXXZ.O"
    +
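    +// Resolvers may run before the sanitizer runtime is initialized (e.g.
    +// during ifunc resolution at load time), so they are expected to be
    +// exempted via disable_sanitizer_instrumentation.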
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
    diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    index a502d24f17880..1992f08d89a87 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp
    @@ -106,7 +106,8 @@ void run_foo_tml() {
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -147,7 +148,8 @@ void run_foo_tml() {
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    diff --git a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    index 7e57b1437e2e1..693fec04e1e1c 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones-riscv.cpp
    @@ -52,7 +52,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -83,7 +84,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -115,7 +117,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 3
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -140,7 +143,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 4
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -159,7 +163,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 5
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    ret ptr @_Z4foo5v.default
    @@ -177,7 +182,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -214,7 +220,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -265,7 +272,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo8v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo8v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -316,7 +324,8 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     // CHECK-NEXT:    ret i32 2
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo9v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo9v.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -367,6 +376,7 @@ int bar() { return foo1() + foo2() + foo3() + foo4() + foo5()+ foo6() + foo7() +
     //
     //.
     // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zmmul" }
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     // CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+i,+m,+zbb,+zmmul" }
     // CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+c,+i,+m,+zbb,+zca,+zmmul" }
     // CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+i,+m,+v,+zbb,+zicsr,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
    diff --git a/clang/test/CodeGenCXX/attr-target-clones.cpp b/clang/test/CodeGenCXX/attr-target-clones.cpp
    index 0814df312f4d8..5cc9c61134cea 100644
    --- a/clang/test/CodeGenCXX/attr-target-clones.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-clones.cpp
    @@ -1,59 +1,39 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,LINUX
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-macos -emit-llvm %s -o - | FileCheck %s --check-prefixes=ITANIUM,DARWIN
     // RUN: %clang_cc1 -std=c++11 -triple x86_64-windows-pc -emit-llvm %s -o - | FileCheck %s --check-prefix=WINDOWS
     
    -// DARWIN-NOT: comdat
     
     // Aliases for ifuncs
    -// ITANIUM: @_Z10overloadedi.ifunc = weak_odr alias i32 (i32), ptr @_Z10overloadedi
    -// ITANIUM: @_Z10overloadedPKc.ifunc = weak_odr alias i32 (ptr), ptr @_Z10overloadedPKc
    -// ITANIUM: @_ZN1CIssE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIssE3fooEv
    -// ITANIUM: @_ZN1CIisE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIisE3fooEv
    -// ITANIUM: @_ZN1CIdfE3fooEv.ifunc = weak_odr alias i32 (ptr), ptr @_ZN1CIdfE3fooEv
     
     // Overloaded ifuncs
    -// ITANIUM: @_Z10overloadedi = weak_odr ifunc i32 (i32), ptr @_Z10overloadedi.resolver
    -// ITANIUM: @_Z10overloadedPKc = weak_odr ifunc i32 (ptr), ptr @_Z10overloadedPKc.resolver
     // struct 'C' ifuncs, note the 'float, U' one doesn't get one.
    -// ITANIUM: @_ZN1CIssE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIssE3fooEv.resolver
    -// ITANIUM: @_ZN1CIisE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIisE3fooEv.resolver
    -// ITANIUM: @_ZN1CIdfE3fooEv = weak_odr ifunc i32 (ptr), ptr @_ZN1CIdfE3fooEv.resolver
     
    +//
     int __attribute__((target_clones("sse4.2", "default"))) overloaded(int) { return 1; }
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedi.sse4.2.0(i32{{.+}})
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedi.default.1(i32{{.+}})
    -// ITANIUM: define weak_odr ptr @_Z10overloadedi.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_Z10overloadedi.sse4.2.0
    -// ITANIUM: ret ptr @_Z10overloadedi.default.1
    -
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHH@Z.sse4.2.0"(i32{{.+}})
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHH@Z.default.1"(i32{{.+}})
    -// WINDOWS: define weak_odr dso_local i32 @"?overloaded@@YAHH@Z"(i32{{.+}}) comdat
    -// WINDOWS: call i32 @"?overloaded@@YAHH@Z.sse4.2.0"
    -// WINDOWS: call i32 @"?overloaded@@YAHH@Z.default.1"
     
    +
    +//
     int __attribute__((target_clones("arch=ivybridge", "default"))) overloaded(const char *) { return 2; }
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedPKc.arch_ivybridge.0(ptr{{.+}})
    -// ITANIUM: define {{.*}}i32 @_Z10overloadedPKc.default.1(ptr{{.+}})
    -// ITANIUM: define weak_odr ptr @_Z10overloadedPKc.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_Z10overloadedPKc.arch_ivybridge.0
    -// ITANIUM: ret ptr @_Z10overloadedPKc.default.1
    -
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHPEBD@Z.arch_ivybridge.0"(ptr{{.+}})
    -// WINDOWS: define dso_local noundef i32 @"?overloaded@@YAHPEBD@Z.default.1"(ptr{{.+}})
    -// WINDOWS: define weak_odr dso_local i32 @"?overloaded@@YAHPEBD@Z"(ptr{{.+}}) comdat
    -// WINDOWS: call i32 @"?overloaded@@YAHPEBD@Z.arch_ivybridge.0"
    -// WINDOWS: call i32 @"?overloaded@@YAHPEBD@Z.default.1"
     
    +
    +// LINUX-LABEL: define dso_local void @_Z14use_overloadedv(
    +// LINUX-SAME: ) #[[ATTR1:[0-9]+]] {
    +// LINUX-NEXT:  [[ENTRY:.*:]]
    +// LINUX-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z10overloadedi(i32 noundef 1)
    +// LINUX-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z10overloadedPKc(ptr noundef null)
    +// LINUX-NEXT:    ret void
    +//
    +// DARWIN-LABEL: define void @_Z14use_overloadedv(
    +// DARWIN-SAME: ) #[[ATTR1:[0-9]+]] {
    +// DARWIN-NEXT:  [[ENTRY:.*:]]
    +// DARWIN-NEXT:    [[CALL:%.*]] = call noundef i32 @_Z10overloadedi(i32 noundef 1)
    +// DARWIN-NEXT:    [[CALL1:%.*]] = call noundef i32 @_Z10overloadedPKc(ptr noundef null)
    +// DARWIN-NEXT:    ret void
    +//
     void use_overloaded() {
       overloaded(1);
    -  // ITANIUM: call noundef i32 @_Z10overloadedi
    -  // WINDOWS: call noundef i32 @"?overloaded@@YAHH@Z"
       overloaded(nullptr);
    -  // ITANIUM: call noundef i32 @_Z10overloadedPKc 
    -  // WINDOWS: call noundef i32 @"?overloaded@@YAHPEBD@Z"
     }
     
     template<typename T, typename U>
    @@ -69,67 +49,56 @@ struct C {
     int foo(){ return 2;}
     };
     template<>
    +//
     struct C<double, float> {
     int __attribute__((target_clones("sse4.2", "default"))) foo(){ return 3;}
     };
     
    +// LINUX-LABEL: define dso_local void @_Z16uses_specializedv(
    +// LINUX-SAME: ) #[[ATTR1]] {
    +// LINUX-NEXT:  [[ENTRY:.*:]]
    +// LINUX-NEXT:    [[C:%.*]] = alloca [[STRUCT_C:%.*]], align 1
    +// LINUX-NEXT:    [[C2:%.*]] = alloca [[STRUCT_C_0:%.*]], align 1
    +// LINUX-NEXT:    [[C3:%.*]] = alloca [[STRUCT_C_1:%.*]], align 1
    +// LINUX-NEXT:    [[C4:%.*]] = alloca [[STRUCT_C_2:%.*]], align 1
    +// LINUX-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN1CIssE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C]])
    +// LINUX-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZN1CIisE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C2]])
    +// LINUX-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZN1CIfsE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C3]])
    +// LINUX-NEXT:    [[CALL3:%.*]] = call noundef i32 @_ZN1CIdfE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C4]])
    +// LINUX-NEXT:    ret void
    +//
    +// DARWIN-LABEL: define void @_Z16uses_specializedv(
    +// DARWIN-SAME: ) #[[ATTR1]] {
    +// DARWIN-NEXT:  [[ENTRY:.*:]]
    +// DARWIN-NEXT:    [[C:%.*]] = alloca [[STRUCT_C:%.*]], align 1
    +// DARWIN-NEXT:    [[C2:%.*]] = alloca [[STRUCT_C_0:%.*]], align 1
    +// DARWIN-NEXT:    [[C3:%.*]] = alloca [[STRUCT_C_1:%.*]], align 1
    +// DARWIN-NEXT:    [[C4:%.*]] = alloca [[STRUCT_C_2:%.*]], align 1
    +// DARWIN-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN1CIssE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C]])
    +// DARWIN-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZN1CIisE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C2]])
    +// DARWIN-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZN1CIfsE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C3]])
    +// DARWIN-NEXT:    [[CALL3:%.*]] = call noundef i32 @_ZN1CIdfE3fooEv(ptr noundef nonnull align 1 dereferenceable(1) [[C4]])
    +// DARWIN-NEXT:    ret void
    +//
     void uses_specialized() {
       C<short, short> c;
       c.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIssE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@FF@@QEAAHXZ"(ptr
       C<int, short> c2;
       c2.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIisE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@HF@@QEAAHXZ"(ptr
       C<float, short> c3;
       c3.foo();
       // Note this is not an ifunc/mv
    -  // ITANIUM: call noundef i32 @_ZN1CIfsE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@MF@@QEAAHXZ"(ptr
       C<double, float> c4;
       c4.foo();
    -  // ITANIUM: call noundef i32 @_ZN1CIdfE3fooEv(ptr
    -  // WINDOWS: call noundef i32 @"?foo@?$C@NM@@QEAAHXZ"(ptr
     }
     
    -// ITANIUM: define weak_odr ptr @_ZN1CIssE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIssE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIssE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@FF@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@FF@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define weak_odr ptr @_ZN1CIisE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIisE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIisE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@HF@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@HF@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define weak_odr ptr @_ZN1CIdfE3fooEv.resolver()
    -// LINUX-SAME: comdat
    -// ITANIUM: ret ptr @_ZN1CIdfE3fooEv.sse4.2.0
    -// ITANIUM: ret ptr @_ZN1CIdfE3fooEv.default.1
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ"(ptr
    -// WINDOWS: call i32 @"?foo@?$C@NM@@QEAAHXZ.sse4.2.0"
    -// WINDOWS: call i32 @"?foo@?$C@NM@@QEAAHXZ.default.1"
    -
    -// ITANIUM: define {{.*}}i32 @_ZN1CIssE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIssE3fooEv.default.1(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIisE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIisE3fooEv.default.1(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIdfE3fooEv.sse4.2.0(ptr
    -// ITANIUM: define {{.*}}i32 @_ZN1CIdfE3fooEv.default.1(ptr
    -
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@FF@@QEAAHXZ.default.1"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@HF@@QEAAHXZ.default.1"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ.sse4.2.0"(ptr
    -// WINDOWS: define {{.*}}i32 @"?foo@?$C@NM@@QEAAHXZ.default.1"(ptr
    +
    +
    +
    +
    +
    +
    +
    +//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
    +// ITANIUM: {{.*}}
    +// WINDOWS: {{.*}}
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp b/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    index 8f2fb5ef0df7e..a7681e559f53a 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-diff-ns.cpp
    @@ -60,27 +60,27 @@ int bar() {
     // WINDOWS: call noundef i32 @"?foo@@YAHH@Z.resolver"(i32 noundef 1)
     // WINDOWS: call noundef i32 @"?foo@ns@@YAHH@Z.resolver"(i32 noundef 2)
     
    -// ITANIUM: define weak_odr ptr @_Z3fooi.resolver()
    +// ITANIUM: define weak_odr ptr @_Z3fooi.resolver() #[[ATTR_RESOLVER:[0-9]+]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @_Z3fooi.arch_sandybridge
     // ITANIUM: ret ptr @_Z3fooi.arch_ivybridge
     // ITANIUM: ret ptr @_Z3fooi.sse4.2
     // ITANIUM: ret ptr @_Z3fooi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@@YAHH@Z.resolver"(i32 %0) #[[ATTR_RESOLVER:[0-9]+]] comdat
     // WINDOWS: call i32 @"?foo@@YAHH@Z.arch_sandybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z.arch_ivybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z.sse4.2"(i32 %0)
     // WINDOWS: call i32 @"?foo@@YAHH@Z"(i32 %0)
     
    -// ITANIUM: define weak_odr ptr @_ZN2ns3fooEi.resolver()
    +// ITANIUM: define weak_odr ptr @_ZN2ns3fooEi.resolver() #[[ATTR_RESOLVER]]
     // LINUX-SAME: comdat
     // ITANIUM: ret ptr @_ZN2ns3fooEi.arch_sandybridge
     // ITANIUM: ret ptr @_ZN2ns3fooEi.arch_ivybridge
     // ITANIUM: ret ptr @_ZN2ns3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN2ns3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@ns@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@ns@@YAHH@Z.resolver"(i32 %0) #[[ATTR_RESOLVER]] comdat
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.arch_sandybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.arch_ivybridge"(i32 %0)
     // WINDOWS: call i32 @"?foo@ns@@YAHH@Z.sse4.2"(i32 %0)
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp b/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    index f956890cf706e..59581b40e8b19 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-member-funcs.cpp
    @@ -180,7 +180,8 @@ int templ_use() {
     // ITANIUM: ret ptr @_ZN5templIdE3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN5templIdE3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@?$templ@N@@QEAAHH@Z.resolver"(ptr %0, i32 %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@?$templ@N@@QEAAHH@Z.resolver"(ptr %0, i32 %1) 
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.arch_sandybridge"
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.arch_ivybridge"
     // WINDOWS: call i32 @"?foo@?$templ@N@@QEAAHH@Z.sse4.2"
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp b/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    index 3c56cad3af914..8d6b178d45a25 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-out-of-line-defs.cpp
    @@ -54,7 +54,8 @@ int bar() {
     // ITANIUM: ret ptr @_ZN1S3fooEi.sse4.2
     // ITANIUM: ret ptr @_ZN1S3fooEi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo@S@@QEAAHH@Z.resolver"(ptr %0, i32 %1) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo@S@@QEAAHH@Z.resolver"(ptr %0, i32 %1) 
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.arch_sandybridge"(ptr %0, i32 %1)
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.arch_ivybridge"(ptr %0, i32 %1)
     // WINDOWS: call i32 @"?foo@S@@QEAAHH@Z.sse4.2"(ptr %0, i32 %1)
    diff --git a/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp b/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    index e30fbf4ef5027..5f0008313f54f 100644
    --- a/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-mv-overloads.cpp
    @@ -61,7 +61,8 @@ int bar2() {
     // ITANIUM: ret ptr @_Z12foo_overloadv.sse4.2
     // ITANIUM: ret ptr @_Z12foo_overloadv
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHXZ.resolver"() comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHXZ.resolver"()
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.arch_sandybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.arch_ivybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHXZ.sse4.2"
    @@ -74,7 +75,8 @@ int bar2() {
     // ITANIUM: ret ptr @_Z12foo_overloadi.sse4.2
     // ITANIUM: ret ptr @_Z12foo_overloadi
     
    -// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHH@Z.resolver"(i32 %0) comdat
    +// WINDOWS: define weak_odr dso_local i32 @"?foo_overload@@YAHH@Z.resolver"(i32 %0)
    +// WINDOWS-SAME: comdat
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.arch_sandybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.arch_ivybridge"
     // WINDOWS: call i32 @"?foo_overload@@YAHH@Z.sse4.2"
    diff --git a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    index ffb4576b3cd30..dd0e6822e7e06 100644
    --- a/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-version-riscv.cpp
    @@ -49,7 +49,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo1v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -74,7 +75,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo2v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -112,7 +114,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo3v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -150,7 +153,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo4v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -201,7 +205,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo5v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -252,7 +257,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo6v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    @@ -303,7 +309,8 @@ int bar() { return foo1() + foo2() + foo3(); }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z4foo7v.resolver()
    +// CHECK-SAME: comdat {
     // CHECK-NEXT:  resolver_entry:
     // CHECK-NEXT:    call void @__init_riscv_feature_bits(ptr null)
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8
    diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp
    index b6ba07ed29504..c62b0266f32c9 100644
    --- a/clang/test/CodeGenCXX/attr-target-version.cpp
    +++ b/clang/test/CodeGenCXX/attr-target-version.cpp
    @@ -231,7 +231,8 @@ int bar() {
     // CHECK-NEXT:    ret i32 [[ADD3]]
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -245,7 +246,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_Z3fooi.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -259,7 +261,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_Z3foov.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -281,7 +284,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass3gooEi.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -295,7 +299,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass23unused_with_default_defEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -309,7 +314,8 @@ int bar() {
     // CHECK-NEXT:    ret ptr @_ZN7MyClass32unused_with_implicit_default_defEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -322,6 +328,7 @@ int bar() {
     // CHECK:       [[RESOLVER_ELSE]]:
     // CHECK-NEXT:    ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
    index 75f29e1c77975..4680b3954121b 100644
    --- a/clang/test/CodeGenCXX/fmv-namespace.cpp
    +++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
    @@ -72,7 +72,8 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK-NEXT:    ret i32 1
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER:[0-9]+]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -86,7 +87,8 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK-NEXT:    ret ptr @_ZN4Name3fooEv.default
     //
     //
    -// CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver() comdat {
    +// CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver(
    +// CHECK-SAME: ) #[[ATTR_RESOLVER]] comdat {
     // CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
     // CHECK-NEXT:    call void @__init_cpu_features_resolver()
     // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
    @@ -99,6 +101,7 @@ __attribute((target_version("mops"))) int bar() { return 1; }
     // CHECK:       [[RESOLVER_ELSE]]:
     // CHECK-NEXT:    ret ptr @_ZN3Foo3barEv.default
     //
    +// CHECK: attributes #[[ATTR_RESOLVER]] = { disable_sanitizer_instrumentation }
     //.
     // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
     // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl
    new file mode 100644
    index 0000000000000..a7c01015b2015
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl
    @@ -0,0 +1,95 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v(
    +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    ret <6 x float> <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00>
    +//
    +float3x2 case1() {
    +  // vec[0] = 0
    +  // vec[1] = 2
    +  // vec[2] = 4
    +  // vec[3] = 1
    +  // vec[4] = 3
    +  // vec[5] = 5
    +  return float3x2(0, 1, 
    +                  2, 3,
    +                  4, 5);
    +}
    +
    +
    +RWStructuredBuffer<float> In;
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v(
    +// CHECK-SAME: ) #[[ATTR0]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]]
    +// CHECK-NEXT:    [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]]
    +// CHECK-NEXT:    [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]]
    +// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[CALL]], align 4
    +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0
    +// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4
    +// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1
    +// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4
    +// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2
    +// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4
    +// CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3
    +// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4
    +// CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4
    +// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4
    +// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5
    +// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
    +//
    +float3x2 case2() {
    +  // vec[0] = Call
    +  // vec[1] = Call2
    +  // vec[2] = Call4
    +  // vec[3] = Call1
    +  // vec[4] = Call3
    +  // vec[5] = Call5
    +  return float3x2(In[0], In[1], 
    +                  In[2], In[3],
    +                  In[4], In[5]);
    +}
    +
    +
    +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_(
    +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] {
    +// CHECK-NEXT:  [[ENTRY:.*:]]
    +// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x float>, align 16
    +// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <3 x float>, align 16
    +// CHECK-NEXT:    store <3 x float> [[A]], ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    store <3 x float> [[B]], ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0
    +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0
    +// CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
    +// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1
    +// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1
    +// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2
    +// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 1
    +// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3
    +// CHECK-NEXT:    [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0
    +// CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4
    +// CHECK-NEXT:    [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
    +// CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2
    +// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5
    +// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
    +//
    +float3x2 case3(float3 a, float3 b) {
    +  // vec[0] = A[0]
    +  // vec[1] = A[2]
    +  // vec[2] = B[1]
    +  // vec[3] = A[1]
    +  // vec[4] = B[0]
    +  // vec[5] = B[2]
    +  return float3x2(a, b);
    +}
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    index 4e29994afd27e..bd9a62f4db359 100644
    --- a/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/StructElementwiseCast.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
     
     struct S {
       int X;
    diff --git a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    index edc28c5c80b51..393efcc360d08 100644
    --- a/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    +++ b/clang/test/CodeGenHLSL/BasicFeatures/frem_modulo.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s
     
      half2 half_vec_mod_by_int(half2 p1) {
    diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    index aa13b27581850..6737cd3ee78ba 100644
    --- a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
     
     // CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]])
     // CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
    diff --git a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    index 0f9d0677d8610..d5130ab88ea64 100644
    --- a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    +++ b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -fnative-int16-type -o - %s | FileCheck %s
     
     // CHECK-LABEL: case1
     // CHECK: [[ToBool:%.*]] = icmp ne <2 x i32> {{.*}}, zeroinitializer
    diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
    index 37fb5195e9768..8836126934957 100644
    --- a/clang/test/CodeGenHLSL/basic_types.hlsl
    +++ b/clang/test/CodeGenHLSL/basic_types.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    index df530a9cee561..f499fc97f43fc 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    index 87bb1dee01905..3655cdb443fa9 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    index 8c787a42618ac..da6cbc40a79bb 100644
    --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
    index 6abe2f816c844..45cc907c0ada9 100644
    --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/acos.hlsl b/clang/test/CodeGenHLSL/builtins/acos.hlsl
    index 8152339a34e87..f710d1f738a48 100644
    --- a/clang/test/CodeGenHLSL/builtins/acos.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/acos.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl
    index 391fad0ef33f5..bfa3b903d66a8 100644
    --- a/clang/test/CodeGenHLSL/builtins/all.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/all.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
    diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl
    index e4837876e2693..fa2cd2698b392 100644
    --- a/clang/test/CodeGenHLSL/builtins/any.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/any.hlsl
    @@ -1,19 +1,19 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
     // RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
     
    diff --git a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    index 59fc15fa60b1e..72802e8ef09be 100644
    --- a/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asfloat.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}} 
     // CHECK: bitcast i32 [[VAL]] to float
    diff --git a/clang/test/CodeGenHLSL/builtins/asin.hlsl b/clang/test/CodeGenHLSL/builtins/asin.hlsl
    index 16efbba79670e..ccf704834116c 100644
    --- a/clang/test/CodeGenHLSL/builtins/asin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asin.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/asint.hlsl b/clang/test/CodeGenHLSL/builtins/asint.hlsl
    index e1d80df5015c9..587d2bdc657d8 100644
    --- a/clang/test/CodeGenHLSL/builtins/asint.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asint.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_int{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
     // CHECK-NOT: bitcast
    diff --git a/clang/test/CodeGenHLSL/builtins/asint16.hlsl b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    index 8a1513012fd99..fd2cb8d10ee6b 100644
    --- a/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asint16.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     //CHECK-LABEL: define {{.*}}test_ints
     //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
    diff --git a/clang/test/CodeGenHLSL/builtins/asuint.hlsl b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    index 252a434ccce0d..5fd1e62d66ddb 100644
    --- a/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asuint.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     // CHECK: define {{.*}}test_uint{{.*}}(i32 {{.*}} [[VAL:%.*]]){{.*}}
     // CHECK-NOT: bitcast
    diff --git a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    index 6d44377df2ffb..31e151e210d7e 100644
    --- a/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/asuint16.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
     
     //CHECK-LABEL: define {{.*}}test_ints
     //CHECK-SAME: {{.*}}(i16 {{.*}} [[VAL:%.*]]){{.*}}
    diff --git a/clang/test/CodeGenHLSL/builtins/atan.hlsl b/clang/test/CodeGenHLSL/builtins/atan.hlsl
    index 437835a863703..91fe139ddf05b 100644
    --- a/clang/test/CodeGenHLSL/builtins/atan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/atan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    index 6c93f57be6b3d..512b44a5780db 100644
    --- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    index 1a9c630b60e57..d87d56edd9443 100644
    --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    index 356836b40e9c0..56a2b090bdeaf 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_clamp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    index eaedfb419c195..8044047c5ef40 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    @@ -7,7 +7,7 @@
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
     
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    index 58db4423799be..10570e9b6ddb4 100644
    --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    @@ -7,7 +7,7 @@
     // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
    diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    index aaeb2f026449b..0baf0db9bd0b6 100644
    --- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // CHECK:      define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
     // CHECK:      [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4
    diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl
    index e067828c38bf6..bb21f084deba5 100644
    --- a/clang/test/CodeGenHLSL/builtins/clip.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple spirv-vulkan-pixel %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV
     
     
     void test_scalar(float Buf) {
    diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
    index 79f9e1e6fbec2..1f8970096a349 100644
    --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/cosh.hlsl b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    index 07c64206412db..80474d459fcbd 100644
    --- a/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cosh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/countbits.hlsl b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    index 218d8dcd10f8d..87524ae58a0d6 100644
    --- a/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/countbits.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl
    index 873cb6db30425..e53b34bb9dc42 100644
    --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    index 2e639f5577d20..3098ed242a492 100644
    --- a/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/degrees-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_degrees_half
    diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    index f0fb12855e5f6..645e44eba3d95 100644
    --- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl
    index 0c24fbb9f1859..bf015415a7d2f 100644
    --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    index 716704a1bfdad..cbbf38aba3504 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_dot_half
    diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
    index c1fdb0740adc3..a496842281d6d 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    index e80ffba2bcfdb..3165c24f2a60e 100644
    --- a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
    @@ -1,7 +1,7 @@
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   dxil-pc-shadermodel6.4-compute %s -emit-llvm -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
    -// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
    +// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -fnative-int16-type -triple \
     // RUN:   spirv-pc-vulkan-compute %s -emit-llvm -o - | \
     // RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
     
    diff --git a/clang/test/CodeGenHLSL/builtins/dst.hlsl b/clang/test/CodeGenHLSL/builtins/dst.hlsl
    index a0840c66e5da9..d8292d31fba7c 100644
    --- a/clang/test/CodeGenHLSL/builtins/dst.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/dst.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: define {{.*}} <4 x float> @{{[A-Za-z1-9_]+}}dst_impl{{[A-Za-z1-9_]*}}(
    diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl
    index 5a8f60528a84c..d50ef021eecb8 100644
    --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    index a9bbcb0d9bff9..ed8cfcf47b04b 100644
    --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl
    new file mode 100644
    index 0000000000000..65dba664bb5ea
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   -o - | FileCheck %s
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) float
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0)
    +// CHECK: ret float %hlsl.f16tof32
    +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32)
    +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float>
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0)
    +// CHECK: ret <2 x float> %hlsl.f16tof32
    +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>)
    +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0)
    +// CHECK: ret <3 x float> %hlsl.f16tof32
    +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>)
    +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0)
    +// CHECK: ret <4 x float> %hlsl.f16tof32
    +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>)
    +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); }
    +
    +
    +
    diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl
    new file mode 100644
    index 0000000000000..b68bc197f16c5
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl
    @@ -0,0 +1,30 @@
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
    +// RUN:   -o - | FileCheck %s
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) float
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0)
    +// CHECK: ret float %hlsl.f16tof32
    +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32)
    +float test_scalar(uint p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float>
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0)
    +// CHECK: ret <2 x float> %hlsl.f16tof32
    +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>)
    +float2 test_uint2(uint2 p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0)
    +// CHECK: ret <3 x float> %hlsl.f16tof32
    +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>)
    +float3 test_uint3(uint3 p0) { return f16tof32(p0); }
    +
    +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 {
    +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0)
    +// CHECK: ret <4 x float> %hlsl.f16tof32
    +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>)
    +float4 test_uint4(uint4 p0) { return f16tof32(p0); }
    +
    +
    +
    diff --git a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    index d2ece57aba4ae..70459d81685a1 100644
    --- a/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/faceforward.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: test_faceforward_half
    diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    index a71b1878f8b55..51b0f81bea06a 100644
    --- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
    @@ -1,161 +1,260 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    -// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
    +// RUN:   -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=dx \
    +// RUN:   --check-prefixes=CHECK,DXCHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    -// RUN: -emit-llvm -disable-llvm-passes \
    -// RUN:   -o - | FileCheck %s -DTARGET=spv
    +// RUN:   -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s -DTARGET=spv
     
     #ifdef __HLSL_ENABLE_16_BIT
     // CHECK-LABEL: test_firstbithigh_ushort
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_ushort(uint16_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_ushort2(uint16_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_ushort3(uint16_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ushort4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_ushort4(uint16_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 15, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_short(int16_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_short2(int16_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_short3(int16_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_short4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i16
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 15), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_short4(int16_t4 p0) {
       return firstbithigh(p0);
     }
     #endif // __HLSL_ENABLE_16_BIT
     
     // CHECK-LABEL: test_firstbithigh_uint
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
    +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_uint(uint p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_uint2(uint2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_uint3(uint3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_uint4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
    +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_uint4(uint4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong
    -// CHECK: call i32 @llvm.[[TARGET]].firstbituhigh.i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbituhigh.i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_ulong(uint64_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbituhigh.v2i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_ulong2(uint64_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbituhigh.v3i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_ulong3(uint64_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_ulong4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_ulong4(uint64_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 31, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_int(int p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_int2(int2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_int3(int3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_int4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i32
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_int4(int4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long
    -// CHECK: call i32 @llvm.[[TARGET]].firstbitshigh.i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}i32 @llvm.[[TARGET]].firstbitshigh.i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub i32 63, [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[FBH]], -1
     +// DXCHECK-NEXT: select i1 [[ICMP]], i32 -1, i32 [[SUB]]
    +// CHECK-NEXT: ret i32
     uint test_firstbithigh_long(int64_t p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long2
    -// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<2 x i32> @llvm.[[TARGET]].firstbitshigh.v2i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <2 x i1> [[ICMP]], <2 x i32> splat (i32 -1), <2 x i32> [[SUB]]
    +// CHECK-NEXT: ret <2 x i32>
     uint2 test_firstbithigh_long2(int64_t2 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long3
    -// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<3 x i32> @llvm.[[TARGET]].firstbitshigh.v3i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <3 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <3 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <3 x i1> [[ICMP]], <3 x i32> splat (i32 -1), <3 x i32> [[SUB]]
    +// CHECK-NEXT: ret <3 x i32>
     uint3 test_firstbithigh_long3(int64_t3 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_long4
    -// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbitshigh.v4i64
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 63), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: ret <4 x i32>
     uint4 test_firstbithigh_long4(int64_t4 p0) {
       return firstbithigh(p0);
     }
     
     // CHECK-LABEL: test_firstbithigh_upcast
    -// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
    -// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64>
    -// CHECK: ret <4 x i64> [[CONV]]
    +// CHECK: [[FBH:%.*]] = tail call {{.*}}<4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
    +// DXCHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> splat (i32 31), [[FBH]]
    +// DXCHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[FBH]], splat (i32 -1)
     +// DXCHECK-NEXT: select <4 x i1> [[ICMP]], <4 x i32> splat (i32 -1), <4 x i32> [[SUB]]
    +// CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> {{.*}} to <4 x i64>
    +// CHECK-NEXT: ret <4 x i64> [[ZEXT]]
     uint64_t4 test_firstbithigh_upcast(uint4 p0) {
       return firstbithigh(p0);
     }
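     +
     +// A sketch of what the DXCHECK pattern above encodes (illustrative comment
     +// only, not matched by FileCheck): on the DXIL path the raw intrinsic
     +// result fbh is remapped as r = (fbh == -1) ? -1 : (BitWidth - 1) - fbh,
     +// with BitWidth - 1 being 15, 31, or 63 for 16-, 32-, and 64-bit operands.
     +// Assuming the intrinsic reports the bit position relative to the MSB,
     +// firstbithigh(0x00000010u) would see fbh == 27 and return 31 - 27 = 4,
     +// while an input of 0 keeps the -1 "no bit set" sentinel via the select.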
    diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    index 007db0c9c2ad5..a1d2a1b31c99a 100644
    --- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s -DTARGET=spv
     
    diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
    index b3ff58317981a..4763e54f92b8e 100644
    --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    index cc91c0b67f6cc..527eb6020469e 100644
    --- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
    @@ -3,7 +3,7 @@
     // ---------- Native Half support test -----------
     //
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
     // RUN:   -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK
     
    @@ -21,7 +21,7 @@
     // ---------- Native Half support test -----------
     //
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half
     
    diff --git a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    index 9f144f470ed90..e41fd856c6a42 100644
    --- a/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/frac-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_frac_half
    diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
    index d8397407cd013..3b61c482e86ad 100644
    --- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    index dc869a64a65b7..b778df38bc9b6 100644
    --- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/isnan.hlsl b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    index ce7dbe1aedea4..cca3863557229 100644
    --- a/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/isnan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    index f8fa06c39f2a1..012adc588ddfa 100644
    --- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
     // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}})
    diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
    index 9297c35abfd16..95edb20dacdac 100644
    --- a/clang/test/CodeGenHLSL/builtins/length.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
    @@ -1,10 +1,10 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    index 96bcf2b49bf25..cb8634c9234e3 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_lerp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    index 3b13e43873c77..20f758b18218e 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
     
     // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled(
    diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    index d7a7113de4878..02cf14c0e1772 100644
    --- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl b/clang/test/CodeGenHLSL/builtins/lit.hlsl
    index 44b3e96ef88bf..c0b109a75906b 100644
    --- a/clang/test/CodeGenHLSL/builtins/lit.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s
     
     // CHECK-LABEL: test_lit_half
     // CHECK: %cmp.i = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000
    diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
    index 0136c1a052ed4..20e62120b64a6 100644
    --- a/clang/test/CodeGenHLSL/builtins/log.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
    index 6a75444143b18..feeccf7cd7ab3 100644
    --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
    index 84d73c1810890..a57fc44e09b70 100644
    --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl
    index e764e20748d58..1116c1419997d 100644
    --- a/clang/test/CodeGenHLSL/builtins/mad.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    @@ -7,7 +7,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF
     
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    index cd7013ba75825..a5ef87a822dd5 100644
    --- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm  -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
     
    diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
    index fab53a160c856..9c621e62b5336 100644
    --- a/clang/test/CodeGenHLSL/builtins/max.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    index f81fa128ce9c7..c0e06b0d204b3 100644
    --- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
     
    diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
    index b3e8fedff9b1b..44d2063229cdb 100644
    --- a/clang/test/CodeGenHLSL/builtins/min.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    index 3db64604a1319..46bfb44c9b2a1 100644
    --- a/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/normalize-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_normalize_half
    diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    index 85937346ead65..bbea11a8b432f 100644
    --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
    index fcde755e15fcc..b11ded8c1d173 100644
    --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    index 0c86357d5ecad..1f7e19055ee6b 100644
    --- a/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/radians-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_radians_half
    diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl
    index f281747fbf298..6521606a25c05 100644
    --- a/clang/test/CodeGenHLSL/builtins/radians.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
    diff --git a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    index d81a49b8c6048..2cc38203bd060 100644
    --- a/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rcp-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_rcp_half
    diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    index cdfaa3c5f1ee3..c9c47c737114d 100644
    --- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
    @@ -1,12 +1,12 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
     // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    index 65fefd801ffed..feb5a5b2ea78f 100644
    --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl
    index eda256451ee2b..ffeb2a78b2517 100644
    --- a/clang/test/CodeGenHLSL/builtins/refract.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl
    @@ -1,8 +1,8 @@
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    index 91375c8f4eb8f..5fd8de9c95df8 100644
    --- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
    index 755f2e86fb116..0d4afee6ba9a8 100644
    --- a/clang/test/CodeGenHLSL/builtins/round.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    index 43ad9d0d0b844..d45f8cbbb5cf1 100644
    --- a/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rsqrt-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_rsqrt_half
    diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    index 9c398fd6f06cb..de2a222ae78d1 100644
    --- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    index 7dbba72f3abb5..c407362c1c85f 100644
    --- a/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/saturate-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     
     // CHECK-LABEL: builtin_saturate_half
    diff --git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    index 3304073d9b501..c583013d4b245 100644
    --- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl
    @@ -1,12 +1,12 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx
     
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv
     // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl
    index 7590b4a881259..e5169844cb3f2 100644
    --- a/clang/test/CodeGenHLSL/builtins/select.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl
    @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) {
       return select(cond0, tVal, fVal);
     }
     
    -// CHECK-LABEL: test_select_infer_array
    -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4
    -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4
    -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]]
    -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4
    -// CHECK: ret void
    -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] {
    -  return select(cond, tVal, fVal);
    -}
    -
     // CHECK-LABEL: test_select_bool_vector
     // CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}}
     // CHECK: ret <2 x i32> [[SELECT]]
    @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) {
     }
     
     // CHECK-LABEL: test_select_vector_1
    -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}}
    +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}}
     // CHECK: ret <1 x i32> [[SELECT]]
     int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
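     +
     +// Note the updated expectation above: a bool1 condition now lowers to a
     +// scalar i1 select rather than a <1 x i1> one, so the one-element vector
     +// case matches the plain scalar-condition form in the emitted IR.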
     
     // CHECK-LABEL: test_select_vector_2
     // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}}
     // CHECK: ret <2 x i32> [[SELECT]]
     int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
     
     // CHECK-LABEL: test_select_vector_3
     // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}}
     // CHECK: ret <3 x i32> [[SELECT]]
     int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 fVals) {
    -  return select(cond0, tVals, fVals);
    +  return select(cond0, tVals, fVals);
     }
     
     // CHECK-LABEL: test_select_vector_4
    @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) {
     // CHECK-LABEL: test_select_vector_scalar_scalar
     // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
     // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer
    -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0
    +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
     // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer
     // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
     // CHECK: ret <4 x i32> [[SELECT]]
     int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) {
       return select(cond0, tVal, fVal);
     }
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_4
    +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}}
    +// CHECK: ret <4 x i1> [[SELECT]]
    +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector
    +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}}
    +// CHECK: ret <3 x i32> [[SELECT]]
    +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar
    +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]]
    +// CHECK: ret <2 x i32> [[SELECT]]
    +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) {
    +  return select(cond0, tVal, fVal);
    +}
    +
    +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar
    +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16
    +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
    +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer
    +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0
    +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer
    +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
    +// CHECK: ret <4 x i32> [[SELECT]]
    +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) {
    +  return select(cond0, tVal, fVal);
    +}
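     +
     +// The non-bool-condition overloads above all share one shape: each lane of
     +// the integer condition is first tested against zero (the icmp ne
     +// zeroinitializer lines), then drives the vector select. For example,
     +// select(int3(0, 2, 0), tVal, fVal) would yield (fVal.x, tVal.y, fVal.z),
     +// since only the middle lane is nonzero.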
    diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
    index cbdb929388934..ef8f7168b1002 100644
    --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
    diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
    index 9bbe97997aa33..5a900972c7ac9 100644
    --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/sinh.hlsl b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    index d55d60515418c..ab0f814ecd694 100644
    --- a/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sinh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    index bef64ce77d470..dcf9013045c07 100644
    --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
    @@ -1,9 +1,9 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s
     // RUN: %clang_cc1 -finclude-default-header -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
     
     // CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
    diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    index aeb2b79e90291..53f4f6aa2cb5f 100644
    --- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O1 -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -fnative-int16-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
     
     
     
    diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    index 31839f6bc177d..ce77459c77c41 100644
    --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
    index 6f6588a026a45..5061f8126d7e2 100644
    --- a/clang/test/CodeGenHLSL/builtins/step.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
    @@ -8,7 +8,7 @@
     // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
     // RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
     // RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
    diff --git a/clang/test/CodeGenHLSL/builtins/tan.hlsl b/clang/test/CodeGenHLSL/builtins/tan.hlsl
    index c8c948624a613..2a108bf97bd1f 100644
    --- a/clang/test/CodeGenHLSL/builtins/tan.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/tan.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/tanh.hlsl b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    index f947c7f53b110..91345caad84c9 100644
    --- a/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/tanh.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
     // RUN:   --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    diff --git a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    index 86aa7cd6985dd..ef282fc355b23 100644
    --- a/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/transpose-builtin.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
      // NOTE: This test is only to confirm we can do codegen with the matrix alias.
     
    diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    index c1c6ee4119f0d..58cc78ed03596 100644
    --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
    -// RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
    +// RUN:  -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | \
     // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF
     // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
     // RUN:  -emit-llvm -disable-llvm-passes -o - | \
    diff --git a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    index 690404c4fde24..9e92eb04ada5b 100644
    --- a/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    +++ b/clang/test/CodeGenHLSL/enable-16bit-types.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
    +// RUN: %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
     // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=FLAG
     // RUN: %clang_cc1 -std=hlsl202x -triple dxilv1.3-unknown-shadermodel6.3-library \
     // RUN:  -finclude-default-header -emit-llvm -o - %s 2>&1 | FileCheck %s --check-prefix=NOFLAG
    diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl
    index 4f03464586bf0..4abd18713e718 100644
    --- a/clang/test/CodeGenHLSL/float3.hlsl
    +++ b/clang/test/CodeGenHLSL/float3.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
     
     // Make sure float3 is not changed into float4.
    diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    index b4ffcb477f1ba..adea165c1c864 100644
    --- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    +++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
    @@ -1,5 +1,5 @@
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   dxil-pc-shadermodel6.3-library %s -D__HLSL_ENABLE_16_BIT \
    +// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type \
     // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
     
     // FIXME: add test for char/int8_t/uint8_t when these types are supported in HLSL.
    diff --git a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    index c97ad4237000f..843f14474a23f 100644
    --- a/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/RasterizerOrderedStructuredBuffer-elementtype.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL
     
     struct MyStruct {
       float4 a;
    diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    index 2b286bde88468..43f2e9cb7f333 100644
    --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-elementtype.hlsl
    @@ -1,25 +1,25 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=DXIL-RO
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=StructuredBuffer %s | FileCheck %s -DRESOURCE=StructuredBuffer -check-prefixes=SPV-RO
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWStructuredBuffer %s | FileCheck %s -DRESOURCE=RWStructuredBuffer -check-prefixes=SPV-RW
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=AppendStructuredBuffer %s | FileCheck %s -DRESOURCE=AppendStructuredBuffer -check-prefixes=SPV-RW
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=DXIL-RW
     
    -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=ConsumeStructuredBuffer %s | FileCheck %s -DRESOURCE=ConsumeStructuredBuffer -check-prefixes=SPV-RW
     
     // DXIL-RO: %"class.hlsl::[[RESOURCE]]" = type { target("dx.RawBuffer", i16, 0, 0) }
    diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    index d3dba8a69cc72..7d59bc5fed5ea 100644
    --- a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
    @@ -1,13 +1,13 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
     +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=0 -check-prefixes=DXIL
     
    -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=1 -check-prefixes=SPV-RO
     
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
     // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=1 -check-prefixes=DXIL
     
    -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
    +// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -fnative-int16-type \
      // RUN: -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=2 -check-prefixes=SPV-RW
     
     // DXIL: %"class.hlsl::[[RESOURCE]]" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 1) }
    diff --git a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    index 8dcff5dad9d13..c8efe0d64c985 100644
    --- a/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    +++ b/clang/test/CodeGenHLSL/resources/cbuffer.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
     
     // CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }>
     // CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }>
    diff --git a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    index 7aeb877072d87..b0abaeddff422 100644
    --- a/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    +++ b/clang/test/CodeGenHLSL/semantics/DispatchThreadID.hlsl
    @@ -24,4 +24,3 @@ void foo(uint Idx : SV_DispatchThreadID) {}
     [shader("compute")]
     [numthreads(8,8,1)]
     void bar(uint2 Idx : SV_DispatchThreadID) {}
    -
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl
    new file mode 100644
    index 0000000000000..96d5b995fa74a
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.arbitrary.hlsl
    @@ -0,0 +1,36 @@
    +// RUN: %clang_cc1 -triple spirv-unknown-vulkan-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +
    +// CHECK-SPIRV-DAG:  @AAA0 = external hidden thread_local addrspace(7) externally_initialized constant float, !spirv.Decorations ![[#METADATA_0:]]
    +// CHECK-SPIRV-DAG:    @B0 = external hidden thread_local addrspace(7) externally_initialized constant i32, !spirv.Decorations ![[#METADATA_2:]]
    +// CHECK-SPIRV-DAG:   @CC0 = external hidden thread_local addrspace(7) externally_initialized constant <2 x float>, !spirv.Decorations ![[#METADATA_4:]]
    +
    +
     +// FIXME: replace `float2 c` with a matrix when available.
    +void main(float a : AAA, int b : B, float2 c : CC) {
    +  float tmp = a + b + c.x + c.y;
    +}
    +// CHECK-SPIRV: define internal spir_func void @_Z4mainfiDv2_f(float noundef nofpclass(nan inf) %a, i32 noundef %b, <2 x float> noundef nofpclass(nan inf) %c) #0 {
    +
    +// CHECK: define void @main()
    +
    +// CHECK-DXIL: %AAA0 = call float @llvm.dx.load.input.f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL:   %B0 = call i32 @llvm.dx.load.input.i32(i32 4, i32 0, i32 0, i8 0, i32 poison)
     +// CHECK-DXIL:  %CC0 = call <2 x float> @llvm.dx.load.input.v2f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL:         call void @_Z4mainfiDv2_f(float %AAA0, i32 %B0, <2 x float> %CC0)
    +
    +// CHECK-SPIRV: %[[#AAA0:]] = load float, ptr addrspace(7) @AAA0, align 4
    +// CHECK-SPIRV:   %[[#B0:]] = load i32, ptr addrspace(7) @B0, align 4
    +// CHECK-SPIRV:  %[[#CC0:]] = load <2 x float>, ptr addrspace(7) @CC0, align 8
    +// CHECK-SPIRV:               call spir_func void @_Z4mainfiDv2_f(float %[[#AAA0]], i32 %[[#B0]], <2 x float> %[[#CC0]]) [ "convergencectrl"(token %0) ]
    +
    +
    +// CHECK-SPIRV-DAG: ![[#METADATA_0]] = !{![[#METADATA_1:]]}
    +// CHECK-SPIRV-DAG: ![[#METADATA_2]] = !{![[#METADATA_3:]]}
    +// CHECK-SPIRV-DAG: ![[#METADATA_4]] = !{![[#METADATA_5:]]}
    +
    +// CHECK-SPIRV-DAG: ![[#METADATA_1]] = !{i32 30, i32 0}
    +// CHECK-SPIRV-DAG: ![[#METADATA_3]] = !{i32 30, i32 1}
    +// CHECK-SPIRV-DAG: ![[#METADATA_5]] = !{i32 30, i32 2}
    +//                                            |      `- Location index
    +//                                            `-> Decoration "Location"
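     The trailing comment gives the key to the metadata: in each !{i32 30, i32 N}
     pair, 30 is the SPIR-V "Location" decoration and N is the assigned location.
     A minimal sketch of the mapping this test pins down, assuming locations are
     handed out in parameter order (the shader and names below are illustrative,
     not part of the test):

         // One external global per non-system-value input, named after its
         // semantic plus the semantic's starting index (AAA0, B0, CC0):
         void vsmain(float a : AAA, int b : B, float2 c : CC) {}
         //   a -> @AAA0, !{i32 30, i32 0}   (Location 0)
         //   b -> @B0,   !{i32 30, i32 1}   (Location 1)
         //   c -> @CC0,  !{i32 30, i32 2}   (a float2 still fits in one location)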
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
    new file mode 100644
    index 0000000000000..b2cb3dad9f0ce
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.array.hlsl
    @@ -0,0 +1,37 @@
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
     +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +
    +struct S0 {
    +  float4 position[2];
    +  float4 color;
    +};
    +
    +// CHECK: %struct.S0 = type { [2 x <4 x float>], <4 x float> }
    +
    +// CHECK-SPIRV: @A0 = external hidden thread_local addrspace(7) externally_initialized constant [2 x <4 x float>], !spirv.Decorations ![[#MD_0:]]
    +// CHECK-SPIRV: @A2 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_2:]]
    +
    +// CHECK:       define void @main0()
    +// CHECK-DXIL:          %A0 = call [2 x <4 x float>] @llvm.dx.load.input.a2v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %A0, 0
    +// CHECK-DXIL:         %A2 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison)
    +// CHECK-DXIL: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %A2, 1
    +
    +// CHECK-SPIRV:   %[[#A0:]] = load [2 x <4 x float>], ptr addrspace(7) @A0, align 16
    +// CHECK-SPIRV: %[[#TMP0:]] = insertvalue %struct.S0 poison, [2 x <4 x float>] %[[#A0]], 0
    +// CHECK-SPIRV:   %[[#A2:]] = load <4 x float>, ptr addrspace(7) @A2, align 16
    +// CHECK-SPIRV: %[[#TMP1:]] = insertvalue %struct.S0 %[[#TMP0]], <4 x float> %[[#A2]], 1
    +
    +// CHECK:        %[[#ARG:]] = alloca %struct.S0, align 16
    +// CHECK:                     store %struct.S0 %[[#TMP1]], ptr %[[#ARG]], align 16
    +// CHECK-DXIL:                call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +[shader("pixel")]
    +void main0(S0 p : A) {
    +  float tmp = p.position[0] + p.position[1] + p.color;
    +}
    +
    +// CHECK-SPIRV: ![[#MD_0]] = !{![[#MD_1:]]}
    +// CHECK-SPIRV: ![[#MD_1]] = !{i32 30, i32 0}
    +// CHECK-SPIRV: ![[#MD_2]] = !{![[#MD_3:]]}
    +// CHECK-SPIRV: ![[#MD_3]] = !{i32 30, i32 2}
    diff --git a/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
    new file mode 100644
    index 0000000000000..733cf3a1a7b9d
    --- /dev/null
    +++ b/clang/test/CodeGenHLSL/semantics/semantic.struct.hlsl
    @@ -0,0 +1,77 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
    +
    +struct S0 {
    +  uint Idx : SV_DispatchThreadID;
    +};
    +
    +// CHECK:       define void @main0()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:        %[[#TMP:]] = insertvalue %struct.S0 poison, i32 %[[#ID:]], 0
    +// CHECK:        %[[#ARG:]] = alloca %struct.S0, align 8
    +// CHECK:                     store %struct.S0 %[[#TMP]], ptr %[[#ARG]], align 4
    +// CHECK-DXIL:                call void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main0{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main0(S0 p) {}
    +
    +struct S1 {
    +  uint2  a : SV_DispatchThreadID;
    +  uint2  b : SV_GroupThreadID;
    +};
    +
    +// CHECK:                     define void @main1()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:        %[[#AX_:]] = insertelement <2 x i32> poison, i32 %[[#ID]], i64 0
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 1)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 1)
    +// CHECK:        %[[#AXY:]] = insertelement <2 x i32> %[[#AX_]], i32 %[[#ID]], i64 1
    +// CHECK:       %[[#S1A_:]] = insertvalue %struct.S1 poison, <2 x i32> %[[#AXY]], 0
    +// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
    +// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
    +// CHECK:      %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
    +// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
    +// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
    +// CHECK:      %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
    +// CHECK:       %[[#S1AB:]] = insertvalue %struct.S1 %[[#S1A_]], <2 x i32> %[[#ID_XYZ:]], 1
    +// CHECK:        %[[#ARG:]] = alloca %struct.S1, align 8
    +// CHECK:                     store %struct.S1 %[[#S1AB]], ptr %[[#ARG]], align 8
    +// CHECK-DXIL:                call void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main1{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main1(S1 p) {}
    +
    +struct S2C {
    +  uint2 b : SV_GroupThreadID;
    +};
    +
    +struct S2 {
    +  uint  a : SV_DispatchThreadID;
    +  S2C child;
    +};
    +
    +// CHECK:                     define void @main2()
    +// CHECK-DXIL:    %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0)
    +// CHECK-SPIRV:   %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0)
    +// CHECK:       %[[#S2A_:]] = insertvalue %struct.S2 poison, i32 %[[#ID:]], 0
    +
    +// CHECK-DXIL:  %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 0)
    +// CHECK-SPIRV: %[[#ID_X:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 0)
    +// CHECK:      %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
    +// CHECK-DXIL:  %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group(i32 1)
    +// CHECK-SPIRV: %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].thread.id.in.group.i32(i32 1)
    +// CHECK:      %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
    +// CHECK:        %[[#S2C:]] = insertvalue %struct.S2C poison, <2 x i32> %[[#ID_XY:]], 0
    +
    +// CHECK:       %[[#S2AB:]] = insertvalue %struct.S2 %[[#S2A_]], %struct.S2C %[[#S2V:]], 1
    +// CHECK:        %[[#ARG:]] = alloca %struct.S2, align 8
    +// CHECK:                     store %struct.S2 %[[#S2AB]], ptr %[[#ARG]], align 1
    +// CHECK-DXIL:                call void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
    +// CHECK-SPIRV:               call spir_func void @{{.*}}main2{{.*}}(ptr %[[#ARG]])
    +[shader("compute")]
    +[numthreads(8,8,1)]
    +void main2(S2 p) {}
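     main2 is the interesting case here: semantic lowering recurses into nested
     structs, each leaf field is materialized with its own intrinsic calls, and
     the aggregate is rebuilt innermost-first with insertvalue before being
     spilled to an alloca for the by-pointer call. A rough outline of the order
     the CHECK lines above encode (HLSL comments; value names are illustrative):

         // 1. a    <- thread.id(0)                      (S2.a  : SV_DispatchThreadID)
         // 2. bvec <- <thread.id.in.group(0), thread.id.in.group(1)>
         // 3. s2c  <- insertvalue S2C poison, bvec, 0   (S2C.b : SV_GroupThreadID)
         // 4. s2   <- insertvalue S2 {a, ...}, s2c, 1
         // 5. store s2 to an alloca, then call main2(ptr)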
    diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    index 15c54beb03d38..3f7c59916316d 100644
    --- a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    +++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
    @@ -1,6 +1,6 @@
     // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
     // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
    -// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
    +// RUN:   spirv-unknown-vulkan-compute %s -fnative-int16-type -emit-llvm -disable-llvm-passes \
     // RUN:   -o - | FileCheck %s
     
     [[vk::constant_id(1)]]
    diff --git a/clang/test/CodeGenObjC/nontrivial-c-struct.m b/clang/test/CodeGenObjC/nontrivial-c-struct.m
    new file mode 100644
    index 0000000000000..fa4fa223bc2d9
    --- /dev/null
    +++ b/clang/test/CodeGenObjC/nontrivial-c-struct.m
    @@ -0,0 +1,59 @@
    +// RUN: %clang_cc1 -triple arm64e-apple-ios18 -fptrauth-calls -fptrauth-intrinsics -fobjc-arc -emit-llvm -o - %s | FileCheck %s
    +
    +// CHECK: %[[STRUCT_S0:.*]] = type { i32, i32, ptr }
    +// CHECK: %[[STRUCT_S1:.*]] = type { ptr, ptr }
    +
    +// This struct isn't POD because it has an address-discriminated ptrauth
    +// field.
    +typedef struct {
    +  int f0, f1;
    +  int * __ptrauth(1,1,50) f2;
    +} S0;
    +
    +// This struct isn't POD because it has an address-discriminated ptrauth
    +// field and an ARC ObjC pointer field.
    +typedef struct {
    +  id f0;
    +  int * __ptrauth(1,1,50) f1;
    +} S1;
    +
    +// CHECK: define void @compound_literal_assignment0(ptr noundef %[[P:.*]])
    +// CHECK: %[[P_ADDR:.*]] = alloca ptr, align 8
    +// CHECK-NEXT: %[[_COMPOUNDLITERAL:.*]] = alloca %[[STRUCT_S0]], align 8
    +// CHECK-NEXT: store ptr %[[P]], ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F0:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 0
    +// CHECK-NEXT: %[[V1:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F1:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[V1]], i32 0, i32 1
    +// CHECK-NEXT: %[[V2:.*]] = load i32, ptr %[[F1]], align 4
    +// CHECK-NEXT: store i32 %[[V2]], ptr %[[F0]], align 8
    +// CHECK-NEXT: %[[F11:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 1
    +// CHECK-NEXT: %[[V3:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F02:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[V3]], i32 0, i32 0
    +// CHECK-NEXT: %[[V4:.*]] = load i32, ptr %[[F02]], align 8
    +// CHECK-NEXT: store i32 %[[V4]], ptr %[[F11]], align 4
    +// CHECK-NEXT: %[[F2:.*]] = getelementptr inbounds nuw %[[STRUCT_S0]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 2
    +// CHECK-NEXT: store ptr null, ptr %[[F2]], align 8
    +// CHECK-NEXT: call void @__copy_assignment_8_8_t0w8_pa1_50_8(ptr %[[V0]], ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: ret void
    +
    +void compound_literal_assignment0(S0 *p) {
    +  *p = (S0){.f0 = p->f1, .f1 = p->f0};
    +}
    +
    +// CHECK: define void @compound_literal_assignment1(ptr noundef %[[P:.*]])
    +// CHECK: %[[P_ADDR:.*]] = alloca ptr, align 8
    +// CHECK-NEXT: %[[_COMPOUNDLITERAL:.*]] = alloca %[[STRUCT_S1]], align 8
    +// CHECK-NEXT: store ptr %[[P]], ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[V0:.*]] = load ptr, ptr %[[P_ADDR]], align 8
    +// CHECK-NEXT: %[[F0:.*]] = getelementptr inbounds nuw %[[STRUCT_S1]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 0
    +// CHECK-NEXT: store ptr null, ptr %[[F0]], align 8
    +// CHECK-NEXT: %[[F1:.*]] = getelementptr inbounds nuw %[[STRUCT_S1]], ptr %[[_COMPOUNDLITERAL]], i32 0, i32 1
    +// CHECK-NEXT: store ptr null, ptr %[[F1]], align 8
    +// CHECK-NEXT: call void @__copy_assignment_8_8_s0_pa1_50_8(ptr %[[V0]], ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: call void @__destructor_8_s0(ptr %[[_COMPOUNDLITERAL]])
    +// CHECK-NEXT: ret void
    +
    +void compound_literal_assignment1(S1 *p) {
    +  *p = (S1){};
    +}
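     Both assignments funnel through outlined helpers rather than a plain memcpy
     because neither struct can be copied bitwise: the ARC id field needs
     retain/release bookkeeping, and an address-discriminated __ptrauth field
     must be re-signed for its destination, since the signature incorporates the
     field's own address. A minimal C sketch of the qualifier involved (operands
     per the Clang pointer-authentication docs: key, address-discrimination
     flag, extra discriminator):

         // Signed with key 1, address-discriminated, extra discriminator 50.
         // Copying such a field means authenticating against the source address
         // and re-signing against the destination address, which is what the
         // emitted __copy_assignment_* helper does field by field.
         int * __ptrauth(1, 1, 50) f;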
    diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
    deleted file mode 100644
    index d0bcd1fccb7ce..0000000000000
    --- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl
    +++ /dev/null
    @@ -1,633 +0,0 @@
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
    -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON
    -
    -typedef struct {
    -  private char *p1;
    -  local char *p2;
    -  constant char *p3;
    -  global char *p4;
    -  generic char *p5;
    -} StructTy1;
    -
    -typedef struct {
    -  constant char *p3;
    -  global char *p4;
    -  generic char *p5;
    -} StructTy2;
    -
    -// Test 0 as initializer.
    -
    -// CHECK: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *private_p = 0;
    -
    -// CHECK: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *local_p = 0;
    -
    -// CHECK: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *global_p = 0;
    -
    -// CHECK: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *constant_p = 0;
    -
    -// CHECK: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *generic_p = 0;
    -
    -// Test NULL as initializer.
    -
    -// CHECK: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *private_p_NULL = NULL;
    -
    -// CHECK: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *local_p_NULL = NULL;
    -
    -// CHECK: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *global_p_NULL = NULL;
    -
    -// CHECK: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *constant_p_NULL = NULL;
    -
    -// CHECK: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *generic_p_NULL = NULL;
    -
    -// Test constant folding of null pointer.
    -// A null pointer should be folded to a null pointer in the target address space.
    -
    -// CHECK: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic int *fold_generic = (global int*)(generic float*)(private char*)0;
    -
    -// CHECK: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
    -private short *fold_priv = (private short*)(generic int*)(global void*)0;
    -
    -// CHECK: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
    -private char *fold_priv_arith = (private char*)0 + 10;
    -
    -// CHECK: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
    -local char *fold_local_arith = (local char*)0 + 10;
    -
    -// CHECK: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    -int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
    -
    -// CHECK: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    -int fold_int2 = (int) ((private void*)0 + 13);
    -
    -// CHECK: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    -int fold_int3 = (int) ((private int*)0);
    -
    -// CHECK: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    -int fold_int4 = (int) &((private int*)0)[2];
    -
    -// CHECK: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    -int fold_int5 = (int) &((private StructTy1*)0)->p2;
    -
    -
    -// CHECK: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    -int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
    -
    -// CHECK: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    -int fold_int2_local = (int) ((local void*)0 + 13);
    -
    -// CHECK: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    -int fold_int3_local = (int) ((local int*)0);
    -
    -// CHECK: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    -int fold_int4_local = (int) &((local int*)0)[2];
    -
    -// CHECK: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    -int fold_int5_local = (int) &((local StructTy1*)0)->p2;
    -
    -
    -// Test static variable initialization.
    -
    -// NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
    -// NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -
    -void test_static_var_private(void) {
    -  static private char *sp1 = 0;
    -  static private char *sp2 = NULL;
    -  static private char *sp3;
    -  static private char *sp4 = (private char*)((void)0, 0);
    -  const int x = 0;
    -  static private char *sp5 = (private char*)x;
    -  static StructTy1 SS1;
    -  static StructTy2 SS2;
    -}
    -
    -// NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
    -// NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -void test_static_var_local(void) {
    -  static local char *sp1 = 0;
    -  static local char *sp2 = NULL;
    -  static local char *sp3;
    -  static local char *sp4 = (local char*)((void)0, 0);
    -  const int x = 0;
    -  static local char *sp5 = (local char*)x;
    -  static StructTy1 SS1;
    -  static StructTy2 SS2;
    -}
    -
    -// Test function-scope variable initialization.
    -// NOOPT-LABEL: @test_func_scope_var_private(
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
    -// NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
    -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
    -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    -void test_func_scope_var_private(void) {
    -  private char *sp1 = 0;
    -  private char *sp2 = NULL;
    -  private char *sp3 = (private char*)((void)0, 0);
    -  const int x = 0;
    -  private char *sp4 = (private char*)x;
    -  StructTy1 SS1 = {0, 0, 0, 0, 0};
    -  StructTy2 SS2 = {0, 0, 0};
    -}
    -
    -// Test function-scope variable initialization.
    -// NOOPT-LABEL: @test_func_scope_var_local(
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
    -// NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
    -// NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
    -// NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    -void test_func_scope_var_local(void) {
    -  local char *sp1 = 0;
    -  local char *sp2 = NULL;
    -  local char *sp3 = (local char*)((void)0, 0);
    -  const int x = 0;
    -  local char *sp4 = (local char*)x;
    -  StructTy1 SS1 = {0, 0, 0, 0, 0};
    -  StructTy2 SS2 = {0, 0, 0};
    -}
    -
    -
    -// Test default initialization of pointers.
    -
    -// Tentative definition of global variables with non-zero initializer
    -// cannot have common linkage since common linkage requires zero initialization
    -// and does not have explicit section.
    -
    -// CHECK: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -// COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    -private char *p1;
    -
    -// CHECK: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -// COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    -local char *p2;
    -
    -// CHECK: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -// COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    -constant char *p3;
    -
    -// CHECK: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -// COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    -global char *p4;
    -
    -// CHECK: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    -// COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
    -generic char *p5;
    -
    -// Test default initialization of structure.
    -
    -// CHECK: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    -StructTy1 S1;
    -
    -// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    -StructTy2 S2;
    -
    -// Test default initialization of array.
    -// CHECK: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
    -StructTy1 A1[2];
    -
    -// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
    -StructTy2 A2[2];
    -
    -// Test comparison with 0.
    -
    -// CHECK-LABEL: cmp_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cmp_private(private char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cmp_local(local char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cmp_global(global char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cmp_constant(constant char* p) {
    -  if (p != 0)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cmp_generic
    -// CHECK: icmp eq ptr %p, null
    -void cmp_generic(generic char* p) {
    -  if (p != 0)
    -    *p = 0;
    -}
    -
    -// Test comparison with NULL.
    -
    -// CHECK-LABEL: cmp_NULL_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cmp_NULL_private(private char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cmp_NULL_local(local char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cmp_NULL_global(global char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cmp_NULL_constant(constant char* p) {
    -  if (p != NULL)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cmp_NULL_generic
    -// CHECK: icmp eq ptr %p, null
    -void cmp_NULL_generic(generic char* p) {
    -  if (p != NULL)
    -    *p = 0;
    -}
    -
    -// Test storage 0 as null pointer.
    -// CHECK-LABEL: test_storage_null_pointer
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    -// CHECK: store ptr addrspace(1) null, ptr %arg_global
    -// CHECK: store ptr addrspace(4) null, ptr %arg_constant
    -// CHECK: store ptr null, ptr %arg_generic
    -void test_storage_null_pointer(private char** arg_private,
    -                               local char** arg_local,
    -                               global char** arg_global,
    -                               constant char** arg_constant,
    -                               generic char** arg_generic) {
    -   *arg_private = 0;
    -   *arg_local = 0;
    -   *arg_global = 0;
    -   *arg_constant = 0;
    -   *arg_generic = 0;
    -}
    -
    -// Test storage NULL as null pointer.
    -// CHECK-LABEL: test_storage_null_pointer_NULL
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    -// CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    -// CHECK: store ptr addrspace(1) null, ptr %arg_global
    -// CHECK: store ptr addrspace(4) null, ptr %arg_constant
    -// CHECK: store ptr null, ptr %arg_generic
    -void test_storage_null_pointer_NULL(private char** arg_private,
    -                                    local char** arg_local,
    -                                    global char** arg_global,
    -                                    constant char** arg_constant,
    -                                    generic char** arg_generic) {
    -   *arg_private = NULL;
    -   *arg_local = NULL;
    -   *arg_global = NULL;
    -   *arg_constant = NULL;
    -   *arg_generic = NULL;
    -}
    -
    -// Test pass null pointer to function as argument.
    -void test_pass_null_pointer_arg_calee(private char* arg_private,
    -                                      local char* arg_local,
    -                                      global char* arg_global,
    -                                      constant char* arg_constant,
    -                                      generic char* arg_generic);
    -
    -// CHECK-LABEL: test_pass_null_pointer_arg
    -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    -// CHECK: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    -void test_pass_null_pointer_arg(void) {
    -  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
    -  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
    -}
    -
    -// Test cast null pointer to size_t.
    -void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
    -                                           size_t arg_local,
    -                                           size_t arg_global,
    -                                           size_t arg_constant,
    -                                           size_t arg_generic);
    -
    -// CHECK-LABEL: test_cast_null_pointer_to_sizet
    -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    -// CHECK: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    -void test_cast_null_pointer_to_sizet(void) {
    -  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
    -                                        (size_t)((local char*)0),
    -                                        (size_t)((global char*)0),
    -                                        (size_t)((constant char*)0),
    -                                        (size_t)((generic char*)0));
    -  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
    -                                        (size_t)((local char*)NULL),
    -                                        (size_t)((global char*)NULL),
    -                                        (size_t)((constant char*)0), // NULL cannot be casted to constant pointer since it is defined as a generic pointer
    -                                        (size_t)((generic char*)NULL));
    -}
    -
    -// Test comparison between null pointers.
    -#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
    -#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    -#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
    -#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    -#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
    -#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    -#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
    -#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    -#define TEST(addr1, addr2) \
    -        TEST_EQ00(addr1, addr2) \
    -        TEST_EQ0N(addr1, addr2) \
    -        TEST_EQN0(addr1, addr2) \
    -        TEST_EQNN(addr1, addr2) \
    -        TEST_NE00(addr1, addr2) \
    -        TEST_NE0N(addr1, addr2) \
    -        TEST_NEN0(addr1, addr2) \
    -        TEST_NENN(addr1, addr2)
    -
    -// CHECK-LABEL: test_eq00_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_private
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_private
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_private
    -// CHECK: ret i32 0
    -TEST(generic, private)
    -
    -// CHECK-LABEL: test_eq00_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_local
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_local
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_local
    -// CHECK: ret i32 0
    -TEST(generic, local)
    -
    -// CHECK-LABEL: test_eq00_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_global
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_global
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_global
    -// CHECK: ret i32 0
    -TEST(generic, global)
    -
    -// CHECK-LABEL: test_eq00_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eq0N_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqN0_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_eqNN_generic_generic
    -// CHECK: ret i32 1
    -// CHECK-LABEL: test_ne00_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_ne0N_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neN0_generic_generic
    -// CHECK: ret i32 0
    -// CHECK-LABEL: test_neNN_generic_generic
    -// CHECK: ret i32 0
    -TEST(generic, generic)
    -
    -// CHECK-LABEL: test_eq00_constant_constant
    -// CHECK: ret i32 1
    -TEST_EQ00(constant, constant)
    -
    -// Test cast to bool.
    -
    -// CHECK-LABEL: cast_bool_private
    -// CHECK: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -void cast_bool_private(private char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_local
    -// CHECK: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -void cast_bool_local(local char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_global
    -// CHECK: icmp eq ptr addrspace(1) %p, null
    -void cast_bool_global(global char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_constant
    -// CHECK: icmp eq ptr addrspace(4) %p, null
    -char cast_bool_constant(constant char* p) {
    -  if (p)
    -    return *p;
    -  else
    -    return 0;
    -}
    -
    -// CHECK-LABEL: cast_bool_generic
    -// CHECK: icmp eq ptr %p, null
    -void cast_bool_generic(generic char* p) {
    -  if (p)
    -    *p = 0;
    -}
    -
    -// Test initialize a struct using memset.
    -// For large structures which is mostly zero, clang generats llvm.memset for
    -// the zero part and store for non-zero members.
    -typedef struct {
    -  long a, b, c, d;
    -  private char *p;
    -} StructTy3;
    -
    -// CHECK-LABEL: test_memset_private
    -// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
    -// CHECK: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
    -// CHECK: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
    -// CHECK: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
    -// CHECK: store i32 0, ptr addrspace(5) [[GEP1]], align 4
    -void test_memset_private(private StructTy3 *ptr) {
    -  StructTy3 S3 = {0, 0, 0, 0, 0};
    -  *ptr = S3;
    -}
    -
    -// Test casting literal 0 to pointer.
    -// A 0 literal casted to pointer should become a null pointer.
    -
    -// CHECK-LABEL: test_cast_0_to_local_ptr
    -// CHECK: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
    -local int* test_cast_0_to_local_ptr(void) {
    -  return (local int*)0;
    -}
    -
    -// CHECK-LABEL: test_cast_0_to_private_ptr
    -// CHECK: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
    -private int* test_cast_0_to_private_ptr(void) {
    -  return (private int*)0;
    -}
    -
    -// Test casting non-literal integer with 0 value to pointer.
    -// A non-literal integer expression with 0 value is casted to a pointer with
    -// zero value.
    -
    -// CHECK-LABEL: test_cast_int_to_ptr1_private
    -// CHECK: ret ptr addrspace(5) null
    -private int* test_cast_int_to_ptr1_private(void) {
    -  return (private int*)((void)0, 0);
    -}
    -
    -// CHECK-LABEL: test_cast_int_to_ptr1_local
    - // CHECK: ret ptr addrspace(3) null
    -local int* test_cast_int_to_ptr1_local(void) {
    -  return (local int*)((void)0, 0);
    -}
    -
    -// CHECK-LABEL: test_cast_int_to_ptr2
    -// CHECK: ret ptr addrspace(5) null
    -private int* test_cast_int_to_ptr2(void) {
    -  int x = 0;
    -  return (private int*)x;
    -}
    -
    -// Test logical operations.
    -// CHECK-LABEL: test_not_nullptr
    -// CHECK: ret i32 1
    -int test_not_nullptr(void) {
    -  return !(private char*)NULL;
    -}
    -
    -// CHECK-LABEL: test_and_nullptr
    -// CHECK: ret i32 0
    -int test_and_nullptr(int a) {
    -  return a && ((private char*)NULL);
    -}
    -
    -// CHECK-LABEL: test_not_private_ptr
    -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    -// CHECK: ret i32 %[[lnot_ext]]
    -int test_not_private_ptr(private char* p) {
    -  return !p;
    -}
    -
    -// CHECK-LABEL: test_not_local_ptr
    -// CHECK: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    -// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    -// CHECK: ret i32 %[[lnot_ext]]
    -int test_not_local_ptr(local char* p) {
    -  return !p;
    -}
    -
    -
    -// CHECK-LABEL: test_and_ptr
    -// CHECK: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
    -// CHECK: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
    -// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
    -// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
    -// CHECK: ret i32 %[[land_ext]]
    -int test_and_ptr(private char* p1, local char* p2) {
    -  return p1 && p2;
    -}
    -
    -// Test folding of null pointer in function scope.
    -// NOOPT-LABEL: test_fold_private
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
    -// NOOPT: %{{.*}} = add nsw i64 %1, %[[SEXT]]
    -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    -void test_fold_callee(void);
    -void test_fold_private(void) {
    -  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    -  long x = glob - (global int*)(generic char*)0;
    -  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
    -  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
    -}
    -
    -// NOOPT-LABEL: test_fold_local
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    -// NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    -// NOOPT: call void @test_fold_callee
    -// NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
    -// NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    -// NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    -void test_fold_local(void) {
    -  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    -  long x = glob - (global int*)(generic char*)0;
    -  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
    -  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
    -}
    diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    index ea1f734391614..5cbf6452d4c85 100644
    --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl
    @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) {
     // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
     // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
     // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
    -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
    +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
     // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
     // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" }
     //.
    diff --git a/clang/test/CodeGenOpenCL/nullptr.cl b/clang/test/CodeGenOpenCL/nullptr.cl
    new file mode 100644
    index 0000000000000..976e12c0bef47
    --- /dev/null
    +++ b/clang/test/CodeGenOpenCL/nullptr.cl
    @@ -0,0 +1,735 @@
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck %s --check-prefixes=CHECK,SPIR64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple spir64 -emit-llvm -o - -Wno-void-pointer-to-int-cast -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast | FileCheck --check-prefixes=CHECK-NOOPT,SPIR64-NOOPT %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,AMDGCN
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefixes=CHECK-NOOPT,AMDGCN-NOOPT %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN
    +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=AMDGCN-COMMON
    +
    +typedef struct {
    +  private char *p1;
    +  local char *p2;
    +  constant char *p3;
    +  global char *p4;
    +  generic char *p5;
    +} StructTy1;
    +
    +typedef struct {
    +  constant char *p3;
    +  global char *p4;
    +  generic char *p5;
    +} StructTy2;
    +
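    +// Note: on AMDGCN, private (addrspace(5)) and local (addrspace(3))
    +// pointers are 32 bits wide, which is why the checks below expect
    +// "align 4" for them while SPIR64 expects "align 8".
    +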
    +// Test 0 as initializer.
    +
    +// SPIR64: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @private_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *private_p = 0;
    +
    +// SPIR64: @local_p = local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @local_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *local_p = 0;
    +
    +// SPIR64: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @global_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *global_p = 0;
    +
    +// SPIR64: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @constant_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *constant_p = 0;
    +
    +// SPIR64: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @generic_p ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *generic_p = 0;
    +
    +// Test NULL as initializer.
    +
    +// SPIR64: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @private_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *private_p_NULL = NULL;
    +
    +// SPIR64: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @local_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *local_p_NULL = NULL;
    +
    +// SPIR64: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @global_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *global_p_NULL = NULL;
    +
    +// SPIR64: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @constant_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *constant_p_NULL = NULL;
    +
    +// SPIR64: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @generic_p_NULL ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *generic_p_NULL = NULL;
    +
    +// Test constant folding of null pointer.
    +// A null pointer should be folded to a null pointer in the target address space.
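    +// Note: on AMDGCN the private and local null pointers have the integer
    +// value -1 (all ones), which is why several folded constants below differ
    +// by one from the SPIR64 values (e.g. 9 vs 10 and 13 vs 14).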
    +
    +// SPIR64: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @fold_generic ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic int *fold_generic = (global int*)(generic float*)(private char*)0;
    +
    +// SPIR64: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @fold_priv ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr addrspace(1) null to ptr addrspace(5)), align 4
    +private short *fold_priv = (private short*)(generic int*)(global void*)0;
    +
    +// SPIR64: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr inttoptr (i64 10 to ptr), align 8
    +// AMDGCN: @fold_priv_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) inttoptr (i32 9 to ptr addrspace(5)), align 4
    +private char *fold_priv_arith = (private char*)0 + 10;
    +
    +// SPIR64: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i64 10 to ptr addrspace(3)), align 8
    +// AMDGCN: @fold_local_arith ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) inttoptr (i32 9 to ptr addrspace(3)), align 4
    +local char *fold_local_arith = (local char*)0 + 10;
    +
    +// SPIR64: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
    +// AMDGCN: @fold_int ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +int fold_int = (int)(private void*)(generic char*)(global int*)0 + 14;
    +
    +// SPIR64: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +// AMDGCN: @fold_int2 ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    +int fold_int2 = (int) ((private void*)0 + 13);
    +
    +// SPIR64: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
    +// AMDGCN: @fold_int3 ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    +int fold_int3 = (int) ((private int*)0);
    +
    +// SPIR64: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int4 ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    +int fold_int4 = (int) &((private int*)0)[2];
    +
    +// SPIR64: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int5 ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    +int fold_int5 = (int) &((private StructTy1*)0)->p2;
    +
    +// SPIR64: @fold_int_local ={{.*}} local_unnamed_addr addrspace(1) global i32 14, align 4
    +// AMDGCN: @fold_int_local = local_unnamed_addr addrspace(1) global i32 13, align 4
    +int fold_int_local = (int)(local void*)(generic char*)(global int*)0 + 14;
    +
    +// SPIR64: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 13, align 4
    +// AMDGCN: @fold_int2_local ={{.*}} local_unnamed_addr addrspace(1) global i32 12, align 4
    +int fold_int2_local = (int) ((local void*)0 + 13);
    +
    +// SPIR64: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 0, align 4
    +// AMDGCN: @fold_int3_local ={{.*}} local_unnamed_addr addrspace(1) global i32 -1, align 4
    +int fold_int3_local = (int) ((local int*)0);
    +
    +// SPIR64: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int4_local ={{.*}} local_unnamed_addr addrspace(1) global i32 7, align 4
    +int fold_int4_local = (int) &((local int*)0)[2];
    +
    +// SPIR64: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 8, align 4
    +// AMDGCN: @fold_int5_local ={{.*}} local_unnamed_addr addrspace(1) global i32 3, align 4
    +int fold_int5_local = (int) &((local StructTy1*)0)->p2;
    +
    +
    +// Test static variable initialization.
    +
    +// SPIR64-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// SPIR64-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN-NOOPT: @test_static_var_private.sp1 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp2 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global ptr addrspace(5) null, align 4
    +// AMDGCN-NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +// CHECK-NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +
    +void test_static_var_private(void) {
    +  static private char *sp1 = 0;
    +  static private char *sp2 = NULL;
    +  static private char *sp3;
    +  static private char *sp4 = (private char*)((void)0, 0);
    +  const int x = 0;
    +  static private char *sp5 = (private char*)x;
    +  static StructTy1 SS1;
    +  static StructTy2 SS2;
    +}
    +
    +// SPIR64-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// SPIR64-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN-NOOPT: @test_static_var_local.sp1 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp2 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global ptr addrspace(3) null, align 4
    +// AMDGCN-NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +// CHECK-NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +void test_static_var_local(void) {
    +  static local char *sp1 = 0;
    +  static local char *sp2 = NULL;
    +  static local char *sp3;
    +  static local char *sp4 = (local char*)((void)0, 0);
    +  const int x = 0;
    +  static local char *sp5 = (local char*)x;
    +  static StructTy1 SS1;
    +  static StructTy2 SS2;
    +}
    +
    +// Test function-scope variable initialization of private pointers.
    +// CHECK-NOOPT-LABEL: @test_func_scope_var_private(
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp1{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp2{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr null, ptr %sp3{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr %sp4{{.*}}, align 8
    +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
    +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_private.SS2, i64 24, i1 false)
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp1{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp2{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) null, ptr addrspace(5) %sp3{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) %sp4{{.*}}, align 4
    +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_private.SS1, i64 32, i1 false)
    +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    +void test_func_scope_var_private(void) {
    +  private char *sp1 = 0;
    +  private char *sp2 = NULL;
    +  private char *sp3 = (private char*)((void)0, 0);
    +  const int x = 0;
    +  private char *sp4 = (private char*)x;
    +  StructTy1 SS1 = {0, 0, 0, 0, 0};
    +  StructTy2 SS2 = {0, 0, 0};
    +}
    +
    +// Test function-scope variable initialization of local pointers.
    +// CHECK-NOOPT-LABEL: @test_func_scope_var_local(
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp1{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp2{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) null, ptr %sp3{{.*}}, align 8
    +// SPIR64-NOOPT: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr %sp4{{.*}}, align 8
    +// SPIR64-NOOPT: call void @llvm.memset.p0.i64(ptr align 8 %SS1{{.*}}, i8 0, i64 40, i1 false)
    +// SPIR64-NOOPT: call void @llvm.memcpy.p0.p2.i64(ptr align 8 %SS2{{.*}}, ptr addrspace(2) align 8 @__const.test_func_scope_var_local.SS2, i64 24, i1 false)
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp1{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp2{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) null, ptr addrspace(5) %sp3{{.*}}, align 4
    +// AMDGCN-NOOPT: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(5) %sp4{{.*}}, align 4
    +// AMDGCN-NOOPT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 %SS1{{.*}}, ptr addrspace(4) align 8 @__const.test_func_scope_var_local.SS1, i64 32, i1 false)
    +// AMDGCN-NOOPT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 %SS2{{.*}}, i8 0, i64 24, i1 false)
    +void test_func_scope_var_local(void) {
    +  local char *sp1 = 0;
    +  local char *sp2 = NULL;
    +  local char *sp3 = (local char*)((void)0, 0);
    +  const int x = 0;
    +  local char *sp4 = (local char*)x;
    +  StructTy1 SS1 = {0, 0, 0, 0, 0};
    +  StructTy2 SS2 = {0, 0, 0};
    +}
    +
    +
    +// Test default initialization of pointers.
    +
    +// Tentative definitions of global variables with a non-zero initializer
    +// cannot have common linkage, since common linkage requires zero
    +// initialization and does not allow an explicit section.
    +
    +// SPIR64: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspacecast (ptr addrspace(4) null to ptr), align 8
    +// AMDGCN: @p1 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +// AMDGCN-COMMON: @p1 = weak local_unnamed_addr addrspace(1) global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), align 4
    +private char *p1;
    +
    +// SPIR64: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), align 8
    +// AMDGCN: @p2 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +// AMDGCN-COMMON: @p2 = weak local_unnamed_addr addrspace(1) global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), align 4
    +local char *p2;
    +
    +// SPIR64: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(2) null, align 8
    +// AMDGCN: @p3 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN-COMMON: @p3 = common local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +constant char *p3;
    +
    +// SPIR64: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), align 8
    +// AMDGCN: @p4 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +// AMDGCN-COMMON: @p4 = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 8
    +global char *p4;
    +
    +// SPIR64: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr addrspace(4) null, align 8
    +// AMDGCN: @p5 ={{.*}} local_unnamed_addr addrspace(1) global ptr null, align 8
    +// AMDGCN-COMMON: @p5 = common local_unnamed_addr addrspace(1) global ptr null, align 8
    +generic char *p5;
    +
    +// Test default initialization of structure.
    +
    +// SPIR64: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 zeroinitializer, align 8
    +// AMDGCN: @S1 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, align 8
    +StructTy1 S1;
    +
    +// CHECK: @S2 ={{.*}} local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8
    +StructTy2 S2;
    +
    +// Test default initialization of array.
    +// SPIR64: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] zeroinitializer, align 8
    +// AMDGCN: @A1 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }, %struct.StructTy1 { ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(4) null, ptr addrspace(1) null, ptr null }], align 8
    +StructTy1 A1[2];
    +
    +// CHECK: @A2 ={{.*}} local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8
    +StructTy2 A2[2];
    +
    +// Test comparison with 0.
    +
    +// CHECK-LABEL: cmp_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cmp_private(private char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cmp_local(local char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cmp_global(global char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cmp_constant(constant char* p) {
    +  if (p != 0)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cmp_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cmp_generic(generic char* p) {
    +  if (p != 0)
    +    *p = 0;
    +}
    +
    +// Test comparison with NULL.
    +
    +// CHECK-LABEL: cmp_NULL_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cmp_NULL_private(private char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cmp_NULL_local(local char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cmp_NULL_global(global char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cmp_NULL_constant(constant char* p) {
    +  if (p != NULL)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cmp_NULL_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cmp_NULL_generic(generic char* p) {
    +  if (p != NULL)
    +    *p = 0;
    +}
    +
    +// Test storing 0 as a null pointer.
    +// CHECK-LABEL: test_storage_null_pointer
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
    +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
    +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
    +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
    +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
    +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
    +// AMDGCN: store ptr null, ptr %arg_generic
    +void test_storage_null_pointer(private char** arg_private,
    +                               local char** arg_local,
    +                               global char** arg_global,
    +                               constant char** arg_constant,
    +                               generic char** arg_generic) {
    +   *arg_private = 0;
    +   *arg_local = 0;
    +   *arg_global = 0;
    +   *arg_constant = 0;
    +   *arg_generic = 0;
    +}
    +
    +// Test storing NULL as a null pointer.
    +// CHECK-LABEL: test_storage_null_pointer_NULL
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(4) %arg_private
    +// SPIR64: store ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(4) %arg_local
    +// SPIR64: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(4) %arg_global
    +// SPIR64: store ptr addrspace(2) null, ptr addrspace(4) %arg_constant
    +// SPIR64: store ptr addrspace(4) null, ptr addrspace(4) %arg_generic
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr %arg_private
    +// AMDGCN: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr %arg_local
    +// AMDGCN: store ptr addrspace(1) null, ptr %arg_global
    +// AMDGCN: store ptr addrspace(4) null, ptr %arg_constant
    +// AMDGCN: store ptr null, ptr %arg_generic
    +void test_storage_null_pointer_NULL(private char** arg_private,
    +                                    local char** arg_local,
    +                                    global char** arg_global,
    +                                    constant char** arg_constant,
    +                                    generic char** arg_generic) {
    +   *arg_private = NULL;
    +   *arg_local = NULL;
    +   *arg_global = NULL;
    +   *arg_constant = NULL;
    +   *arg_generic = NULL;
    +}
    +
    +// Test passing a null pointer to a function as an argument.
    +void test_pass_null_pointer_arg_calee(private char* arg_private,
    +                                      local char* arg_local,
    +                                      global char* arg_global,
    +                                      constant char* arg_constant,
    +                                      generic char* arg_generic);
    +
    +// CHECK-LABEL: test_pass_null_pointer_arg
    +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
    +// SPIR64: call spir_func void @test_pass_null_pointer_arg_calee(ptr addrspacecast (ptr addrspace(4) null to ptr), ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)), ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr addrspace(2) null, ptr addrspace(4) null)
    +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    +// AMDGCN: call void @test_pass_null_pointer_arg_calee(ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr addrspace(1) null, ptr addrspace(4) null, ptr null)
    +void test_pass_null_pointer_arg(void) {
    +  test_pass_null_pointer_arg_calee(0, 0, 0, 0, 0);
    +  test_pass_null_pointer_arg_calee(NULL, NULL, NULL, NULL, NULL);
    +}
    +
    +// Test casting a null pointer to size_t.
    +void test_cast_null_pointer_to_sizet_calee(size_t arg_private,
    +                                           size_t arg_local,
    +                                           size_t arg_global,
    +                                           size_t arg_constant,
    +                                           size_t arg_generic);
    +
    +// CHECK-LABEL: test_cast_null_pointer_to_sizet
    +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
    +// SPIR64: call spir_func void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i64), i64 ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64), i64 0, i64 0)
    +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    +// AMDGCN: call void @test_cast_null_pointer_to_sizet_calee(i64 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i64), i64 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i64), i64 0, i64 0, i64 0)
    +void test_cast_null_pointer_to_sizet(void) {
    +  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)0),
    +                                        (size_t)((local char*)0),
    +                                        (size_t)((global char*)0),
    +                                        (size_t)((constant char*)0),
    +                                        (size_t)((generic char*)0));
    +  test_cast_null_pointer_to_sizet_calee((size_t)((private char*)NULL),
    +                                        (size_t)((local char*)NULL),
    +                                        (size_t)((global char*)NULL),
    +                                        (size_t)((constant char*)0), // NULL cannot be cast to a constant pointer since it is defined as a generic pointer
    +                                        (size_t)((generic char*)NULL));
    +}
    +
    +// Test comparison between null pointers.
    +#define TEST_EQ00(addr1, addr2) int test_eq00_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)0; }
    +#define TEST_EQ0N(addr1, addr2) int test_eq0N_##addr1##_##addr2(void) { return (addr1 char*)0 == (addr2 char*)NULL; }
    +#define TEST_EQN0(addr1, addr2) int test_eqN0_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)0; }
    +#define TEST_EQNN(addr1, addr2) int test_eqNN_##addr1##_##addr2(void) { return (addr1 char*)NULL == (addr2 char*)NULL; }
    +#define TEST_NE00(addr1, addr2) int test_ne00_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)0; }
    +#define TEST_NE0N(addr1, addr2) int test_ne0N_##addr1##_##addr2(void) { return (addr1 char*)0 != (addr2 char*)NULL; }
    +#define TEST_NEN0(addr1, addr2) int test_neN0_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)0; }
    +#define TEST_NENN(addr1, addr2) int test_neNN_##addr1##_##addr2(void) { return (addr1 char*)NULL != (addr2 char*)NULL; }
    +#define TEST(addr1, addr2) \
    +        TEST_EQ00(addr1, addr2) \
    +        TEST_EQ0N(addr1, addr2) \
    +        TEST_EQN0(addr1, addr2) \
    +        TEST_EQNN(addr1, addr2) \
    +        TEST_NE00(addr1, addr2) \
    +        TEST_NE0N(addr1, addr2) \
    +        TEST_NEN0(addr1, addr2) \
    +        TEST_NENN(addr1, addr2)
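    +// For illustration, TEST_EQ00(generic, private) expands to
    +//   int test_eq00_generic_private(void) { return (generic char*)0 == (private char*)0; }
    +// so each TEST() invocation below defines eight such comparison functions.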
    +
    +// CHECK-LABEL: test_eq00_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_private
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_private
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_private
    +// CHECK: ret i32 0
    +TEST(generic, private)
    +
    +// CHECK-LABEL: test_eq00_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_local
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_local
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_local
    +// CHECK: ret i32 0
    +TEST(generic, local)
    +
    +// CHECK-LABEL: test_eq00_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_global
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_global
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_global
    +// CHECK: ret i32 0
    +TEST(generic, global)
    +
    +// CHECK-LABEL: test_eq00_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eq0N_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqN0_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_eqNN_generic_generic
    +// CHECK: ret i32 1
    +// CHECK-LABEL: test_ne00_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_ne0N_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neN0_generic_generic
    +// CHECK: ret i32 0
    +// CHECK-LABEL: test_neNN_generic_generic
    +// CHECK: ret i32 0
    +TEST(generic, generic)
    +
    +// CHECK-LABEL: test_eq00_constant_constant
    +// CHECK: ret i32 1
    +TEST_EQ00(constant, constant)
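    +// (Only the 0 == 0 form is tested for the constant address space since,
    +// as noted above, NULL cannot be cast to a constant pointer.)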
    +
    +// Test casting to bool.
    +
    +// CHECK-LABEL: cast_bool_private
    +// SPIR64: icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +void cast_bool_private(private char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_local
    +// SPIR64: icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +void cast_bool_local(local char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_global
    +// SPIR64: icmp eq ptr addrspace(1) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(1))
    +// AMDGCN: icmp eq ptr addrspace(1) %p, null
    +void cast_bool_global(global char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_constant
    +// SPIR64: icmp eq ptr addrspace(2) %p, null
    +// AMDGCN: icmp eq ptr addrspace(4) %p, null
    +char cast_bool_constant(constant char* p) {
    +  if (p)
    +    return *p;
    +  else
    +    return 0;
    +}
    +
    +// CHECK-LABEL: cast_bool_generic
    +// SPIR64: icmp eq ptr addrspace(4) %p, null
    +// AMDGCN: icmp eq ptr %p, null
    +void cast_bool_generic(generic char* p) {
    +  if (p)
    +    *p = 0;
    +}
    +
    +// Test initializing a struct using memset.
    +// For large structures that are mostly zero, clang generates llvm.memset
    +// for the zero part and ordinary stores for the non-zero members.
    +typedef struct {
    +  long a, b, c, d;
    +  private char *p;
    +} StructTy3;
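    +// (The four longs occupy 32 bytes that are zero-filled with one memset;
    +// the private pointer member is stored separately because its null value
    +// need not be the all-zero bit pattern.)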
    +
    +// CHECK-LABEL: test_memset_private
    +// SPIR64: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %ptr, i8 0, i64 32, i1 false)
    +// SPIR64: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr %ptr, i64 32
    +// SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr [[GEP]], align 8
    +// AMDGCN: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false)
    +// AMDGCN: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32
    +// AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]]
    +// AMDGCN: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36
    +// AMDGCN: store i32 0, ptr addrspace(5) [[GEP1]], align 4
    +void test_memset_private(private StructTy3 *ptr) {
    +  StructTy3 S3 = {0, 0, 0, 0, 0};
    +  *ptr = S3;
    +}
    +
    +// Test casting literal 0 to a pointer.
    +// A 0 literal cast to a pointer should become a null pointer.
    +
    +// CHECK-LABEL: test_cast_0_to_local_ptr
    +// SPIR64: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
    +local int* test_cast_0_to_local_ptr(void) {
    +  return (local int*)0;
    +}
    +
    +// CHECK-LABEL: test_cast_0_to_private_ptr
    +// SPIR64: ptr addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
    +private int* test_cast_0_to_private_ptr(void) {
    +  return (private int*)0;
    +}
    +
    +// Test casting a non-literal integer with value 0 to a pointer.
    +// A non-literal integer expression with value 0 is cast to a pointer with
    +// zero value, i.e. the all-zero bit pattern rather than the target's
    +// null pointer representation.
    +
    +// CHECK-LABEL: test_cast_int_to_ptr1_private
    +// SPIR64: ret ptr null
    +// AMDGCN: ret ptr addrspace(5) null
    +private int* test_cast_int_to_ptr1_private(void) {
    +  return (private int*)((void)0, 0);
    +}
    +
    +// CHECK-LABEL: test_cast_int_to_ptr1_local
    +// CHECK: ret ptr addrspace(3) null
    +local int* test_cast_int_to_ptr1_local(void) {
    +  return (local int*)((void)0, 0);
    +}
    +
    +// CHECK-LABEL: test_cast_int_to_ptr2
    +// SPIR64: ret ptr null
    +// AMDGCN: ret ptr addrspace(5) null
    +private int* test_cast_int_to_ptr2(void) {
    +  int x = 0;
    +  return (private int*)x;
    +}
    +
    +// Test logical operations.
    +// CHECK-LABEL: test_not_nullptr
    +// CHECK: ret i32 1
    +int test_not_nullptr(void) {
    +  return !(private char*)NULL;
    +}
    +
    +// CHECK-LABEL: test_and_nullptr
    +// CHECK: ret i32 0
    +int test_and_nullptr(int a) {
    +  return a && ((private char*)NULL);
    +}
    +
    +// CHECK-LABEL: test_not_private_ptr
    +// SPIR64: %[[lnot:.*]] = icmp eq ptr %p, addrspacecast (ptr addrspace(4) null to ptr)
    +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(5) %p, addrspacecast (ptr null to ptr addrspace(5))
    +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    +// CHECK: ret i32 %[[lnot_ext]]
    +int test_not_private_ptr(private char* p) {
    +  return !p;
    +}
    +
    +// CHECK-LABEL: test_not_local_ptr
    +// SPIR64: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: %[[lnot:.*]] = icmp eq ptr addrspace(3) %p, addrspacecast (ptr null to ptr addrspace(3))
    +// CHECK: %[[lnot_ext:.*]] = zext i1 %[[lnot]] to i32
    +// CHECK: ret i32 %[[lnot_ext]]
    +int test_not_local_ptr(local char* p) {
    +  return !p;
    +}
    +
    +
    +// CHECK-LABEL: test_and_ptr
    +// SPIR64: %[[tobool:.*]] = icmp ne ptr %p1, addrspacecast (ptr addrspace(4) null to ptr)
    +// SPIR64: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr addrspace(4) null to ptr addrspace(3))
    +// AMDGCN: %[[tobool:.*]] = icmp ne ptr addrspace(5) %p1, addrspacecast (ptr null to ptr addrspace(5))
    +// AMDGCN: %[[tobool1:.*]] = icmp ne ptr addrspace(3) %p2, addrspacecast (ptr null to ptr addrspace(3))
    +// CHECK: %[[res:.*]] = select i1 %[[tobool]], i1 %[[tobool1]], i1 false
    +// CHECK: %[[land_ext:.*]] = zext i1 %[[res]] to i32
    +// CHECK: ret i32 %[[land_ext]]
    +int test_and_ptr(private char* p1, local char* p2) {
    +  return p1 && p2;
    +}
    +
    +// Test folding of null pointer in function scope.
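    +// The comma expressions keep the calls to test_fold_callee from being
    +// folded away, while the null-pointer casts around them still fold to
    +// constants, as the NOOPT checks below verify.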
    +// CHECK-NOOPT-LABEL: test_fold_private
    +// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT:  store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
    +// SPIR64-NOOPT:  %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
    +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    +// SPIR64-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT:  %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i32) to i64
    +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)) to i32) to i64
    +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    +void test_fold_callee(void);
    +void test_fold_private(void) {
    +  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    +  long x = glob - (global int*)(generic char*)0;
    +  x = x + (int)(test_fold_callee(), (private int*)(generic char*)(global short*)0);
    +  x = x - (int)((private int*)0 == (private int*)(generic char*)0);
    +}
    +
    +// CHECK-NOOPT-LABEL: test_fold_local
    +// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT: store ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)), ptr %glob{{.*}}, align 8
    +// SPIR64-NOOPT: %{{.*}} = sub i64 %{{.*}}, ptrtoint (ptr addrspace(1) addrspacecast (ptr addrspace(4) null to ptr addrspace(1)) to i64)
    +// AMDGCN-NOOPT: store ptr addrspace(1) null, ptr addrspace(5) %glob{{.*}}, align 8
    +// AMDGCN-NOOPT: %{{.*}} = sub i64 %{{.*}}, 0
    +// CHECK-NOOPT:  call{{.*}} void @test_fold_callee
    +// SPIR64-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr addrspace(4) null to ptr addrspace(3)) to i32) to i64
    +// AMDGCN-NOOPT: %[[SEXT:.*]] = sext i32 ptrtoint (ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)) to i32) to i64
    +// CHECK-NOOPT: %{{.*}} = add nsw i64 %{{.*}}, %[[SEXT]]
    +// CHECK-NOOPT: %{{.*}} = sub nsw i64 %{{.*}}, 1
    +void test_fold_local(void) {
    +  global int* glob = (test_fold_callee(), (global int*)(generic char*)0);
    +  long x = glob - (global int*)(generic char*)0;
    +  x = x + (int)(test_fold_callee(), (local int*)(generic char*)(global short*)0);
    +  x = x - (int)((local int*)0 == (local int*)(generic char*)0);
    +}
    diff --git a/clang/test/CodeGenSPIRV/spirv-intel.c b/clang/test/CodeGenSPIRV/spirv-intel.c
    index 997cd6f10b90c..f00fc97adaec7 100644
    --- a/clang/test/CodeGenSPIRV/spirv-intel.c
    +++ b/clang/test/CodeGenSPIRV/spirv-intel.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-64 %s
    -// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-32 %s
    +// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-64 %s
    +// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-32 %s
     // RUN: %clang_cc1 -triple spir-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s
     // RUN: %clang_cc1 -triple spir64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s
     
    @@ -9,3 +9,11 @@
     // CHECK-WITHOUT: spir_func void @foo(ptr noundef %param) #0 {
     void foo(int *param) {
     }
    +
    +typedef __attribute__((address_space(9))) void * FnPtrTy;
    +
    +// CHECK-WITH: %{{.*}} = icmp eq ptr addrspace(9) %{{.*}}, null
    +int bar() {
    +  FnPtrTy FnPtr = (FnPtrTy)foo;
    +  return FnPtr == 0;
    +}
    diff --git a/clang/test/DebugInfo/CXX/decl-member-call.cpp b/clang/test/DebugInfo/CXX/decl-member-call.cpp
    new file mode 100644
    index 0000000000000..95758a2985c0c
    --- /dev/null
    +++ b/clang/test/DebugInfo/CXX/decl-member-call.cpp
    @@ -0,0 +1,25 @@
    +// RUN: %clang_cc1 -O1 -triple x86_64-unknown_unknown -emit-llvm \
    +// RUN:   -debug-info-kind=standalone -dwarf-version=5 %s -o - | FileCheck %s
    +
    +// Ensure both nonmember and member calls to declared functions
    +// have attached `DISubprogram`s.
    +
    +int nonmember(int n);
    +
    +struct S {
    +  int x;
    +  int member(int n);
    +};
    +
    +int main(int argc, char** argv) {
    +  struct S s = {};
    +  int a = s.member(argc);
    +  int b = nonmember(argc);
    +  return a + b;
    +}
    +
    +// CHECK: declare !dbg ![[SP1:[0-9]+]] noundef i32 @_ZN1S6memberEi(
    +// CHECK: declare !dbg ![[SP2:[0-9]+]] noundef i32 @_Z9nonmemberi(
    +
    +// CHECK: ![[SP1]] = !DISubprogram(name: "member", linkageName: "_ZN1S6memberEi"
    +// CHECK: ![[SP2]] = !DISubprogram(name: "nonmember", linkageName: "_Z9nonmemberi"
    diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp
    index 6aeeed664135e..4a4a5c4c142a7 100644
    --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp
    +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp
    @@ -8,6 +8,9 @@
     
     // KEY-INSTRUCTIONS: "-gkey-instructions"
     // NO-KEY-INSTRUCTIONS-NOT: key-instructions
    +
    +// Only expect one dwarf-related flag.
    +// NO-DEBUG: -fdwarf2-cfi-asm
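    +// (Positively matching -fdwarf2-cfi-asm consumes it, so the NO-DEBUG-NOT:
    +// dwarf check below does not trip over that flag.)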
     // NO-DEBUG-NOT: debug-info-kind
     // NO-DEBUG-NOT: dwarf
     
    diff --git a/clang/test/Driver/HLSL/wconversion.hlsl b/clang/test/Driver/HLSL/wconversion.hlsl
    new file mode 100644
    index 0000000000000..1857a3dfe386e
    --- /dev/null
    +++ b/clang/test/Driver/HLSL/wconversion.hlsl
    @@ -0,0 +1,7 @@
    +// RUN: %clang_dxc -T lib_6_7 %s -### %s 2>&1 | FileCheck %s --check-prefixes=CONV
    +// RUN: %clang_dxc -T lib_6_7 -Wno-conversion %s -### %s 2>&1 | FileCheck %s --check-prefixes=NOCONV
    +
    +// make sure we generate -Wconversion by default
    +// CONV: "-Wconversion"
    +// make sure -Wno-conversion still works
    +// NOCONV: "-Wno-conversion"
    diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1250.bc
    new file mode 100644
    index 0000000000000..e69de29bb2d1d
    diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc b/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1251.bc
    new file mode 100644
    index 0000000000000..e69de29bb2d1d
    diff --git a/clang/test/Driver/aarch64-vfat.c b/clang/test/Driver/aarch64-vfat.c
    new file mode 100644
    index 0000000000000..bd5eed275489f
    --- /dev/null
    +++ b/clang/test/Driver/aarch64-vfat.c
    @@ -0,0 +1,7 @@
    +// ===== Features supported on aarch64 =====
    +
    +// FAT features (Future Architecture Technologies)
    +
    +// RUN: %clang -target aarch64 -march=armv9.7a+mops-go -### -c %s 2>&1 | FileCheck -check-prefix=VFAT-MOPS-GO %s
    +// RUN: %clang -target aarch64 -march=armv9.7-a+mops-go -### -c %s 2>&1 | FileCheck -check-prefix=VFAT-MOPS-GO %s
    +// VFAT-MOPS-GO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mops-go"
    diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    index 914e01873089c..10d64984918e6 100644
    --- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c
    @@ -22,10 +22,14 @@
     // RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \
     // RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
     
    -// ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
    +// GPU ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+]
     // RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
     // RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
     
    +// GPU ASan enabled for amdgpu-arch [gfx1250,gfx1251]
    +// RUN:   %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx1250,gfx1251 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \
    +// RUN:   | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s
    +
     // GPU ASan Disabled Test Cases
     
     // GPU ASan disabled through '-fsanitize=address' without '-fgpu-sanitize' flag for amdgpu-arch [gfx908]
    @@ -56,9 +60,9 @@
     
     // HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}}
     
    -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
    +// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900|gfx1250|gfx1251)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}}
     // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}}
     
    -// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
    +// SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=(gfx908|gfx1250|gfx1251)(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}}
     // SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}}
     // SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}}
    diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
    index 89526744c0a49..4dae49aab7ac7 100644
    --- a/clang/test/Driver/cl-x86-flags.c
    +++ b/clang/test/Driver/cl-x86-flags.c
    @@ -133,6 +133,28 @@
     // tune: "-target-cpu" "sandybridge"
     // tune-SAME: "-tune-cpu" "haswell"
     
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=512 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=vlen512 %s
    +// vlen512: "-mprefer-vector-width=512"
    +
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=vlen256 %s
    +// vlen256: "-mprefer-vector-width=256"
    +
    +// RUN: %clang_cl -m64 -arch:AVX512 -vlen=512 -vlen --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=novlen %s
    +// novlen-NOT: -mprefer-vector-width
    +
    +// RUN: %clang_cl -m64 -arch:AVX2 -vlen=512 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2vlen512 %s
    +// avx2vlen512: invalid argument '/vlen=512' not allowed with '/arch:AVX2'
    +
    +// RUN: %clang_cl -m64 -arch:AVX2 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2vlen256 %s
    +// avx2vlen256-NOT: invalid argument
    +
    +// RUN: %clang_cl -m32 -arch:SSE2 -vlen=256 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse2vlen256 %s
    +// RUN: %clang_cl -m64 -vlen=256 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=sse2vlen256 %s
    +// sse2vlen256: invalid argument '/vlen=256' not allowed with '/arch:SSE2'
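    +// (With no /arch given, x86_64 defaults to SSE2 above, and i386 defaults
    +// to IA32 below, so the same diagnostic applies.)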
    +
    +// RUN: %clang_cl -m32 -vlen=256 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=ia32vlen256 %s
    +// ia32vlen256: invalid argument '/vlen=256' not allowed with '/arch:IA32'
    +
     void f(void) {
     }
     
    diff --git a/clang/test/Driver/dxc_enable16bittypes.hlsl b/clang/test/Driver/dxc_enable16bittypes.hlsl
    new file mode 100644
    index 0000000000000..4cd1d2fd402b3
    --- /dev/null
    +++ b/clang/test/Driver/dxc_enable16bittypes.hlsl
    @@ -0,0 +1,7 @@
    +// RUN: %clang_dxc -enable-16bit-types -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
    +
    +// Make sure the -enable-16bit-types flag translates into '-fnative-half-type' and '-fnative-int16-type'
    +// CHECK: "-fnative-half-type"
    +// CHECK-SAME: "-fnative-int16-type"
    +
    +// expected-no-diagnostics
    diff --git a/clang/test/Driver/dxc_fcgl.hlsl b/clang/test/Driver/dxc_fcgl.hlsl
    index fe65124c197bc..4db7ada9622c5 100644
    --- a/clang/test/Driver/dxc_fcgl.hlsl
    +++ b/clang/test/Driver/dxc_fcgl.hlsl
    @@ -1,9 +1,5 @@
    -// RUN: not %clang_dxc -fcgl -T lib_6_7 foo.hlsl -### %s 2>&1 | FileCheck %s
    -// RUN: %clang_dxc -fcgl -T lib_6_7 %s -Xclang -verify
    +// RUN: %clang_dxc -fcgl -T lib_6_7 %s -### %s 2>&1 | FileCheck %s
     
     // Make sure fcgl option flag which translated into "-emit-llvm" "-disable-llvm-passes".
     // CHECK: "-emit-llvm"
     // CHECK-SAME: "-disable-llvm-passes"
    -
    -// Make sure fcgl option not generate any diagnostics.
    -// expected-no-diagnostics
    diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c
    index 6d719828c6a06..e68fbf529643e 100644
    --- a/clang/test/Driver/frame-pointer-elim.c
    +++ b/clang/test/Driver/frame-pointer-elim.c
    @@ -2,6 +2,8 @@
     // KEEP-ALL:      "-mframe-pointer=all"
     // KEEP-NON-LEAF-NOT: warning: argument unused
     // KEEP-NON-LEAF: "-mframe-pointer=non-leaf"
    +// KEEP-NON-LEAF-NO-RESERVE-NOT: warning: argument unused
    +// KEEP-NON-LEAF-NO-RESERVE: "-mframe-pointer=non-leaf-no-reserve"
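    +// (Presumably, non-leaf-no-reserve keeps the frame pointer in non-leaf
    +// functions without reserving the frame-pointer register elsewhere;
    +// -mreserve-frame-pointer-reg, tested below, restores the reserving mode.)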
     // KEEP-NONE-NOT: warning: argument unused
     // KEEP-NONE:     "-mframe-pointer=none"
     // KEEP-RESERVED-NOT: warning: argument unused
    @@ -24,19 +26,27 @@
     // -momit-leaf-frame-pointer omits leaf frame pointer.
     // -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer.
     // RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
    +// -momit-leaf-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved
    +// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +
    +// -fomit-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved
    +// RUN: %clang -### --target=i386 -S -fomit-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \
    +// RUN:   FileCheck --check-prefix=KEEP-RESERVED %s
    +
     // fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by
     // fomit-frame-pointer later on the command line without a warning
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
     // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer.
     // RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
    @@ -68,45 +78,45 @@
     // RUN:   FileCheck --check-prefix=KEEP-NONE %s
     
     // RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-7S %s
     // WARN-OMIT-7S: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7s'
    -// WARN-OMIT-7S: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-7S: "-mframe-pointer=non-leaf-no-reserve"
     
     // RUN: %clang -### -target armv7k-apple-watchos -fomit-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-7K %s
     // WARN-OMIT-7K: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7k'
    -// WARN-OMIT-7K: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-7K: "-mframe-pointer=non-leaf-no-reserve"
     
     // RUN: %clang -### -target armv7s-apple-ios8.0 -momit-leaf-frame-pointer %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=WARN-OMIT-LEAF-7S %s
     // WARN-OMIT-LEAF-7S-NOT: warning: optimization flag '-momit-leaf-frame-pointer' is not supported for target 'armv7s'
    -// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf"
    +// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf-no-reserve"
     
     // On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf
     // functions
     // RUN: %clang -### --target=aarch64 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S -O2 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-ALL %s
    @@ -161,9 +171,9 @@
     // RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
     // RUN:   FileCheck --check-prefix=KEEP-ALL %s
     // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // On ARM backend bare metal targets, frame pointer is omitted
     // RUN: %clang -### --target=arm-arm-none-eabi -S %s 2>&1 | \
    @@ -191,21 +201,21 @@
     
     // Check that for Apple bare metal targets, we're keeping frame pointers by default
     // RUN: %clang -### --target=armv6m-apple-none-macho -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -O1 %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=arm-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // RUN: %clang --target=armv7-apple-macho -### -S %s 2>&1	\
     // RUN:         -fomit-frame-pointer \
    @@ -221,17 +231,22 @@
     
     // AArch64 bare metal targets behave like hosted targets
     // RUN: %clang -### --target=aarch64-none-elf -S %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -O1 %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -fno-omit-frame-pointer %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     // RUN: %clang -### --target=aarch64-none-elf -S -O1 -fno-omit-frame-pointer %s 2>&1 |  \
    -// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
    +// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
     
     // AArch64 Windows requires that the frame pointer be reserved
     // RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer %s 2>&1 |  \
     // RUN:   FileCheck --check-prefix=KEEP-RESERVED %s
     
    +// -mno-reserve-frame-pointer-reg overrides platform defaults:
    +// it wins even where the target (AArch64 Windows) reserves the frame pointer by default
    +// RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer -mno-reserve-frame-pointer-reg %s 2>&1 |  \
    +// RUN:   FileCheck --check-prefix=KEEP-NONE %s
    +
     void f0() {}
     void f1() { f0(); }
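
The leaf/non-leaf distinction above is about call behavior:
-momit-leaf-frame-pointer omits the frame pointer only in functions that make
no calls, which is what the f0/f1 pair exercises. A minimal C sketch (names
hypothetical):

    /* Leaf: makes no calls, so -momit-leaf-frame-pointer lets the
       compiler skip setting up a frame pointer here. */
    int leaf(int x) { return x + 1; }

    /* Non-leaf: calls leaf(), so under -mframe-pointer=non-leaf a
       frame pointer is still established on entry. */
    int non_leaf(int x) { return leaf(x) * 2; }
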
    diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
    index 99e5018117924..0cf7535d14bd5 100644
    --- a/clang/test/Driver/fuchsia.c
    +++ b/clang/test/Driver/fuchsia.c
    @@ -77,7 +77,7 @@
     // RUN: %clang -### %s --target=aarch64-unknown-fuchsia -O3 2>&1 \
     // RUN:     | FileCheck %s -check-prefix=CHECK-FP-NONE
     // CHECK-FP-ALL: "-mframe-pointer=all"
    -// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf"
    +// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf-no-reserve"
     // CHECK-FP-NONE: "-mframe-pointer=none"
     
     // RUN: not %clang -### %s --target=x86_64-unknown-fuchsia -rtlib=libgcc 2>&1 \
    diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip
    index 0c9c15b61fdc9..490385173a4cb 100644
    --- a/clang/test/Driver/hip-sanitize-options.hip
    +++ b/clang/test/Driver/hip-sanitize-options.hip
    @@ -3,6 +3,11 @@
     // RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
     // RUN:   %s 2>&1 | FileCheck -check-prefixes=NORDC %s
     
    +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx1250,gfx1251 \
    +// RUN:   -fsanitize=address \
    +// RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s 2>&1 | FileCheck -check-prefixes=NORDC %s
    +
     // RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \
     // RUN:   -fsanitize=address -fno-gpu-sanitize \
     // RUN:   -nogpuinc --rocm-path=%S/Inputs/rocm \
    diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
    new file mode 100644
    index 0000000000000..67d894e2eb506
    --- /dev/null
    +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
    @@ -0,0 +1,9 @@
    +// The --offload-new-driver option used to crash with -save-temps due to a failure in
    +// clang-linker-wrapper: its input and output files must not be the same.
    +
    +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \
    +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \
    +// RUN: | FileCheck %s
    +
    +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]"
    +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}}
    diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c
    index 82d8a6084e5e0..b7458a08698d3 100644
    --- a/clang/test/Driver/mg.c
    +++ b/clang/test/Driver/mg.c
    @@ -1,5 +1,7 @@
    -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s
    +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s
     // CHECK: nonexistent-preinclude.h
     // CHECK: nonexistent-ppinclude.h
    +// CHECK: nonexistent-embed
     
     #include "nonexistent-ppinclude.h"
    +#embed "nonexistent-embed"
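
The updated RUN line checks that -MG also reports the missing #embed resource
as a dependency in C23 mode. For context, #embed splices a file's bytes into an
initializer much as #include splices in text; a minimal sketch, assuming a
hypothetical data.bin:

    /* C23: the directive expands to a comma-separated list of the
       bytes of data.bin (a hypothetical file, for illustration only). */
    static const unsigned char blob[] = {
    #embed "data.bin"
    };
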
    diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip
    new file mode 100644
    index 0000000000000..fc93640dc4b90
    --- /dev/null
    +++ b/clang/test/Driver/no-gpu-bundle-respected.hip
    @@ -0,0 +1,24 @@
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefix=BUNDLE
    +
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefix=BUNDLE
    +
    +// RUN: %clang -ccc-print-phases -c -emit-llvm \
    +// RUN:   --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \
    +// RUN:   2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE
    +
    +// BUNDLE: clang-offload-bundler
    +// NOBUNDLE-NOT: clang-offload-bundler
    +
    +// COM: sanity checks
    +// COMPILER: compiler
    +// GFX1030: (device-hip, gfx1030)
    +// GFX900: (device-hip, gfx900)
    +// OFFLOAD: offload
    +
    +int square(int num) {
    +    return num * num;
    +}
    diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c
    index 7294c33959e7e..f2da680b68d70 100644
    --- a/clang/test/Driver/print-supported-extensions-aarch64.c
    +++ b/clang/test/Driver/print-supported-extensions-aarch64.c
    @@ -49,6 +49,7 @@
     // CHECK-NEXT:     lsui                FEAT_LSUI                                              Enable Armv9.6-A unprivileged load/store instructions
     // CHECK-NEXT:     lut                 FEAT_LUT                                               Enable Lookup Table instructions
     // CHECK-NEXT:     mops                FEAT_MOPS                                              Enable Armv8.8-A memcpy and memset acceleration instructions
    +// CHECK-NEXT:     mops-go             FEAT_MOPS_GO                                           Enable memset acceleration granule only
     // CHECK-NEXT:     mpamv2              FEAT_MPAMv2                                            Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions
     // CHECK-NEXT:     memtag              FEAT_MTE, FEAT_MTE2                                    Enable Memory Tagging Extension
     // CHECK-NEXT:     mtetc               FEAT_MTETC                                             Enable Virtual Memory Tagging Extension
    diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
    index cb812736786a9..681c912bd1612 100644
    --- a/clang/test/Driver/print-supported-extensions-riscv.c
    +++ b/clang/test/Driver/print-supported-extensions-riscv.c
    @@ -227,6 +227,7 @@
     // CHECK-NEXT:     zvfofp8min           0.2       'Zvfofp8min' (Vector OFP8 Converts)
     // CHECK-NEXT:     zvkgs                0.7       'Zvkgs' (Vector-Scalar GCM instructions for Cryptography)
     // CHECK-NEXT:     zvqdotq              0.0       'Zvqdotq' (Vector quad widening 4D Dot Product)
    +// CHECK-NEXT:     smpmpmt              0.6       'Smpmpmt' (PMP-based Memory Types Extension)
     // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
     // CHECK-NEXT:     xqccmp               0.3       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
     // CHECK-NEXT:     xqcia                0.7       'Xqcia' (Qualcomm uC Arithmetic Extension)
    diff --git a/clang/test/Driver/ps4-sdk-root.c b/clang/test/Driver/ps4-sdk-root.c
    index 6e5f1e28958ad..791b96ac12ae6 100644
    --- a/clang/test/Driver/ps4-sdk-root.c
    +++ b/clang/test/Driver/ps4-sdk-root.c
    @@ -11,6 +11,9 @@
     ///
     /// The default for both headers and libraries is taken from the
     /// SCE_ORBIS_SDK_DIR environment variable.
    +///
    +/// In ThinLTO code generation mode (-fthinlto-index=), SDK files are not required,
    +/// so all warnings are suppressed.
     
     // RUN: echo "-### -Winvalid-or-nonexistent-directory -target x86_64-scei-ps4" > %t.rsp
     
    @@ -31,6 +34,10 @@
     /// headers and libraries.
     // RUN: env SCE_ORBIS_SDK_DIR=.. %clang @%t.rsp %s 2>&1 | FileCheck -check-prefixes=WARN-SYS-HEADERS,WARN-SYS-LIBS,NO-WARN %s
     
    +/// -fthinlto-index= warning suppression.
    +// RUN: touch %t_dummy.o
    +// RUN: env SCE_ORBIS_SDK_DIR=.. %clang @%t.rsp %t_dummy.o -fthinlto-index=ignored -c 2>&1 | FileCheck -check-prefixes=NO-WARN %s
    +
     /// If `-c`, `-S`, `-E` or `-emit-ast` is supplied, the existence check for SDK
     /// libraries is skipped because no linking will be performed. We only expect
     /// warnings about missing headers.
    diff --git a/clang/test/Driver/ps5-sdk-root.c b/clang/test/Driver/ps5-sdk-root.c
    index 16ef2cc01f5e7..a337ce3801456 100644
    --- a/clang/test/Driver/ps5-sdk-root.c
    +++ b/clang/test/Driver/ps5-sdk-root.c
    @@ -13,6 +13,9 @@
     ///
     /// The default for both headers and libraries is taken from the
     /// SCE_PROSPERO_SDK_DIR environment variable.
    +///
    +/// In ThinLTO code generation mode (-fthinlto-index=), SDK files are not required,
    +/// so all warnings are suppressed.
     
     // RUN: echo "-### -Winvalid-or-nonexistent-directory -target x86_64-sie-ps5" > %t.rsp
     
    @@ -33,6 +36,10 @@
     /// headers and libraries.
     // RUN: env SCE_PROSPERO_SDK_DIR=.. %clang @%t.rsp %s 2>&1 | FileCheck -check-prefixes=WARN-SYS-HEADERS,WARN-SYS-LIBS,NO-WARN %s
     
    +/// -fthinlto-index= warning suppression.
    +// RUN: touch %t_dummy.o
    +// RUN: env SCE_PROSPERO_SDK_DIR=.. %clang @%t.rsp %t_dummy.o -fthinlto-index=ignored -c 2>&1 | FileCheck -check-prefixes=NO-WARN %s
    +
     /// If `-c`, `-S`, `-E` or `-emit-ast` is supplied, the existence check for SDK
     /// libraries is skipped because no linking will be performed. We only expect
     /// warnings about missing headers.
    diff --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
    index f9766e6fa4d99..649dc8562a1b2 100644
    --- a/clang/test/Driver/rocm-device-libs.cl
    +++ b/clang/test/Driver/rocm-device-libs.cl
    @@ -138,6 +138,18 @@
     // RUN:   %s \
     // RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
     
    +// RUN: %clang -### -target amdgcn-amd-amdhsa \
    +// RUN:   -x cl -mcpu=gfx1250 -fsanitize=address \
    +// RUN:   --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s \
    +// RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
    +
    +// RUN: %clang -### -target amdgcn-amd-amdhsa \
    +// RUN:   -x cl -mcpu=gfx1251 -fsanitize=address \
    +// RUN:   --rocm-path=%S/Inputs/rocm \
    +// RUN:   %s \
    +// RUN: 2>&1 | FileCheck  --check-prefixes=ASAN,COMMON %s
    +
     // RUN: %clang -### -target amdgcn-amd-amdhsa \
     // RUN:   -x cl -mcpu=gfx908:xnack+ \
     // RUN:   --rocm-path=%S/Inputs/rocm \
    diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
    index 3717c449d6601..f1660b1afb518 100644
    --- a/clang/test/Driver/x86-target-features.c
    +++ b/clang/test/Driver/x86-target-features.c
    @@ -304,13 +304,6 @@
     // AMX-COMPLEX: "-target-feature" "+amx-complex"
     // NO-AMX-COMPLEX: "-target-feature" "-amx-complex"
     
    -// RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-transpose %s \
    -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s
    -// RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-transpose %s \
    -// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
    -// AMX-TRANSPOSE: "-target-feature" "+amx-transpose"
    -// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose"
    -
     // RUN: %clang --target=x86_64-unknown-linux-gnu -mamx-avx512 %s \
     // RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-AVX512 %s
     // RUN: %clang --target=x86_64-unknown-linux-gnu -mno-amx-avx512 %s \
    diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp
    new file mode 100644
    index 0000000000000..80db0873b9e0a
    --- /dev/null
    +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp
    @@ -0,0 +1,30 @@
    +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s
    +
    +// REQUIRES: ansi-escape-sequences
    +
    +int main() {
    +          1 +                                                        + if;
    +  // CHECK: expected expression
    +  // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]];
    +
    +    /*😂*/1 +                                                        + if;
    +  // CHECK: expected expression
    +  // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]];
    +
    +  a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1;
    +  // CHECK: use of undeclared identifier
    +  // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ...
    +
    +
    +  /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1;
    +  // CHECK: use of undeclared identifier
    +  // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ...
    +
    +  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    +  // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]];
    +
    +  "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    +  // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]];
    +}
    +
    +
    diff --git a/clang/test/Index/complete-preprocessor.m b/clang/test/Index/complete-preprocessor.m
    index 1cc2f32b7efa6..bd90a796240c4 100644
    --- a/clang/test/Index/complete-preprocessor.m
    +++ b/clang/test/Index/complete-preprocessor.m
    @@ -80,3 +80,8 @@
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:9:8 %s | FileCheck -check-prefix=CHECK-CC3 %s
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:11:5 %s | FileCheck -check-prefix=CHECK-CC4 %s
     // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_COMPLETION_CACHING=1 c-index-test -code-completion-at=%s:14:5 %s | FileCheck -check-prefix=CHECK-CC5 %s
    +
    +// Test #embed completion in C23 mode
    +// RUN: c-index-test -code-completion-at=%s:4:2 %s -std=c23 | FileCheck -check-prefix=CHECK-EMBED %s
    +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text "}{Placeholder file}{Text "} (40)
    +// CHECK-EMBED: NotImplemented:{TypedText embed}{HorizontalSpace  }{Text <}{Placeholder file}{Text >} (40)
    diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    index 186965177caaf..8ab6ae4779ea9 100644
    --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap
    @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] {
         export *
       }
     
    +  module stdckdint {
    +    header "stdckdint.h"
    +    export *
    +  }
    +
       module stdcountof {
         header "stdcountof.h"
         export *
    diff --git a/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm b/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm
    new file mode 100644
    index 0000000000000..7844344a15427
    --- /dev/null
    +++ b/clang/test/Modules/avoid-specialization-update-in-reduced-bmi.cppm
    @@ -0,0 +1,28 @@
    +// RUN: rm -rf %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/base.cppm -emit-module-interface -o %t/base.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/update.cppm -fmodule-file=base=%t/base.pcm -emit-module-interface -o %t/update.pcm
    +// RUN: llvm-bcanalyzer --dump --disable-histogram %t/update.pcm | FileCheck %t/update.cppm --check-prefix=FULL
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/base.cppm -emit-reduced-module-interface -o %t/base.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/update.cppm -fmodule-file=base=%t/base.pcm -emit-reduced-module-interface -o %t/update.pcm
    +// RUN: llvm-bcanalyzer --dump --disable-histogram %t/update.pcm | FileCheck %t/update.cppm
    +
    +//--- base.cppm
    +export module base;
    +
    +export template <class T>
    +struct base {
    +    T value;
    +};
    +
    +//--- update.cppm
    +export module update;
    +import base;
    +export int update() {
    +    return base<int>().value;
    +}
    +
    +// FULL: TEMPLATE_SPECIALIZATION
    +// CHECK-NOT: TEMPLATE_SPECIALIZATION
    diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm
    index ad2d66ae38dfd..6cd366228172e 100644
    --- a/clang/test/Modules/builtin-headers.mm
    +++ b/clang/test/Modules/builtin-headers.mm
    @@ -17,6 +17,7 @@
     @import _Builtin_stdarg;
     @import _Builtin_stdatomic;
     @import _Builtin_stdbool;
    +@import _Builtin_stdckdint;
     @import _Builtin_stdcountof;
     @import _Builtin_stddef;
     @import _Builtin_stdint;
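
For context, stdckdint.h is the C23 checked-integer-arithmetic header that the
new _Builtin_stdckdint module exposes; a minimal usage sketch (not part of the
test):

    #include <limits.h>
    #include <stdckdint.h>
    #include <stdio.h>

    int main(void) {
      int sum;
      /* ckd_add returns true when the mathematical result does not fit
         in the result type, storing the wrapped value in sum. */
      if (ckd_add(&sum, INT_MAX, 1))
        puts("overflow detected");
      return 0;
    }
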
    diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm
    new file mode 100644
    index 0000000000000..90c57796dcf7e
    --- /dev/null
    +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm
    @@ -0,0 +1,46 @@
    +// Fixes #165445
    +
    +// RUN: rm -rf %t
    +// RUN: mkdir -p %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \
    +// RUN:   -emit-header-unit -o %t/header.pcm
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \
    +// RUN:   -emit-module-interface -o %t/A.pcm
    +// 
    +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \
    +// RUN:   -emit-module-interface -o %t/B.pcm
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \
    +// RUN:   -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm  \
    +// RUN:   -fmodule-file=%t/header.pcm \
    +// RUN:   -verify -fsyntax-only
    +
    +//--- enum.h
    +enum E { Value };
    +
    +//--- header.h
    +#include "enum.h"
    +
    +//--- A.cppm
    +module;
    +#include "enum.h"
    +export module A;
    +
    +auto e = Value;
    +
    +//--- B.cppm
    +export module B;
    +import "header.h";
    +
    +auto e = Value;
    +
    +//--- use.cpp
    +// expected-no-diagnostics
    +import A;
    +import B;
    +#include "enum.h"
    +
    +auto e = Value;
    diff --git a/clang/test/Modules/pr166068.cppm b/clang/test/Modules/pr166068.cppm
    new file mode 100644
    index 0000000000000..b6944b591d264
    --- /dev/null
    +++ b/clang/test/Modules/pr166068.cppm
    @@ -0,0 +1,38 @@
    +// RUN: rm -rf %t
    +// RUN: mkdir -p %t
    +// RUN: split-file %s %t
    +//
    +// RUN: %clang_cc1 -std=c++20 %t/flyweight.cppm -emit-reduced-module-interface -o %t/flyweight.pcm
    +// RUN: %clang_cc1 -std=c++20 %t/account.cppm -emit-reduced-module-interface -o %t/account.pcm -fprebuilt-module-path=%t
    +// RUN: %clang_cc1 -std=c++20 %t/core.cppm -emit-reduced-module-interface -o %t/core.pcm -fprebuilt-module-path=%t
    +// RUN: %clang_cc1 -std=c++20 %t/core.cppm -fprebuilt-module-path=%t -emit-llvm -disable-llvm-passes -o - | FileCheck %t/core.cppm
    +
    +//--- flyweight.cppm
    +module;
    +template <class T> struct flyweight_core {
    +  static bool init() { (void)__builtin_operator_new(2); return true; }
    +  static bool static_initializer;
    +};
    +template <class T> bool flyweight_core<T>::static_initializer = init();
    +export module flyweight;
    +export template <class T> void flyweight() {
    +  (void)flyweight_core<T>::static_initializer;
    +}
    +
    +//--- account.cppm
    +export module account;
    +import flyweight;
    +export void account() {
    +  (void)::flyweight<int>;
    +}
    +
    +//--- core.cppm
    +export module core;
    +import account;
    +
    +extern "C" void core() {}
    +
    +// It is enough to check that this doesn't crash.
    +// CHECK-NOT: init
    +// CHECK-NOT: static_initializer
    +// CHECK: define {{.*}}@core(
    diff --git a/clang/test/Modules/transitive-system.test b/clang/test/Modules/transitive-system.test
    index b1f1558b31742..5f6196cc1d6a3 100644
    --- a/clang/test/Modules/transitive-system.test
    +++ b/clang/test/Modules/transitive-system.test
    @@ -2,9 +2,9 @@
     // RUN: split-file %s %t
     
     // RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=direct > %t/result1.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=direct > %t/result1.json
     // RUN: rm -rf %t/cache
    -// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-name=transitive > %t/result2.json
    +// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full -module-names=transitive > %t/result2.json
     // RUN: %deps-to-rsp %t/result1.json --module-name transitive > %t/1.rsp
     // RUN: %deps-to-rsp %t/result2.json --module-name transitive > %t/2.rsp
     // RUN: diff %t/1.rsp %t/2.rsp
    diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c
    index 638dbae1bc774..75ef5fa26827c 100644
    --- a/clang/test/OpenMP/metadirective_ast_print.c
    +++ b/clang/test/OpenMP/metadirective_ast_print.c
    @@ -2,17 +2,25 @@
     
     // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT
     
    -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
     
    -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU
     
     // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52
     
     // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52
     
    -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
    +
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
     
    -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU
     // expected-no-diagnostics
     
     #ifndef HEADER
    @@ -77,6 +85,12 @@ void foo1(void) {
       for (int i = 0; i < 100; i++)
       ;
     
    +#pragma omp metadirective when(device={arch("spirv64")}: \
    +                                teams distribute parallel for)\
    +                                otherwise(parallel for)
    +  for (int i = 0; i < 100; i++)
    +  ;
    +
     #pragma omp metadirective when(implementation = {extension(match_all)} \
                                    : nothing) otherwise(parallel for)
       for (int i = 0; i < 16; i++)
    @@ -134,8 +148,8 @@ void foo1(void) {
     // OMP52-NEXT: for (int i = 0; i < 16; i++) {
     // OMP52-NEXT: #pragma omp simd
     // OMP52-NEXT: for (int j = 0; j < 16; j++)
    -// OMP52-AMDGCN: #pragma omp teams distribute parallel for
    -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++)
    +// OMP52-GPU: #pragma omp teams distribute parallel for
    +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++)
     // OMP52: for (int i = 0; i < 16; i++)
     // OMP52: for (int i = 0; i < 16; i++)
     
    @@ -198,6 +212,12 @@ void foo2(void) {
       for (int i = 0; i < 100; i++)
       ;
     
    +#pragma omp metadirective when(device={arch("spirv64")}: \
    +                                teams distribute parallel for)\
    +                                default(parallel for)
    +  for (int i = 0; i < 100; i++)
    +  ;
    +
     #pragma omp metadirective when(implementation = {extension(match_all)} \
                                    : nothing) default(parallel for)
       for (int i = 0; i < 16; i++)
    @@ -266,8 +286,8 @@ void foo2(void) {
     // DEFAULT-NEXT: for (int i = 0; i < 16; i++) {
     // DEFAULT-NEXT: #pragma omp simd
     // DEFAULT-NEXT: for (int j = 0; j < 16; j++)
    -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for
    -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++)
    +// DEFAULT-GPU: #pragma omp teams distribute parallel for
    +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++)
     // DEFAULT: for (int i = 0; i < 16; i++)
     // DEFAULT: for (int i = 0; i < 16; i++)
     
    diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    index eecae310d0a77..1d5584de67162 100644
    --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp
    @@ -1,7 +1,7 @@
    -// REQUIRES: amdgpu-registered-target
    -
     // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
     // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s
    +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc
    +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc  -o - | FileCheck %s
     // expected-no-diagnostics
     
     
    @@ -16,6 +16,12 @@ Inspired from SOLLVE tests:
     
     #define N 1024
     
    +#ifdef __AMDGPU__
    +#define GPU "amdgcn"
    +#else
    +#define GPU "spirv64"
    +#endif
    +
     int metadirective1() {
     
        int v1[N], v2[N], v3[N];
    @@ -26,7 +32,7 @@ int metadirective1() {
        #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device)
        {
           #pragma omp metadirective \
    -                   when(device={arch("amdgcn")}: teams distribute parallel for) \
    +                   when(device={arch(GPU)}: teams distribute parallel for) \
                        default(parallel for)
     
              for (int i = 0; i < N; i++) {
    @@ -38,28 +44,28 @@ int metadirective1() {
        return errors;
     }
     
    -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]]
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]]
     // CHECK: entry:
    -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init
    +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init
     // CHECK: user_code.entry:
    -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined
    -// CHECK-NOT: call void @__kmpc_parallel_51
    +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined
    +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51
     // CHECK: ret void
     
     
     // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined
     // CHECK: entry:
    -// CHECK: call void @__kmpc_distribute_static_init
    +// CHECK: call{{.*}} void @__kmpc_distribute_static_init
     // CHECK: omp.loop.exit:
    -// CHECK: call void @__kmpc_distribute_static_fini
    +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini
     
     
     // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined
     // CHECK: entry:
    -// CHECK: call void @__kmpc_for_static_init_4
    +// CHECK: call{{.*}} void @__kmpc_for_static_init_4
     // CHECK: omp.inner.for.body:
     // CHECK: store atomic {{.*}} monotonic
     // CHECK: omp.loop.exit:
    -// CHECK-NEXT: call void @__kmpc_for_static_fini
    +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini
     // CHECK-NEXT: ret void
     
    diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    index fb2810e88c063..6e029fb93644d 100644
    --- a/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    +++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
    @@ -6,12 +6,18 @@
     // CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
     // CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy
     
    +// CHECK: @"_gomp_critical_user_$var" = common addrspace(1) global [8 x i32] zeroinitializer, align 8
    +
     // CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}}
     
    +// CHECK: call spir_func addrspace(9) void @__kmpc_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
    +// CHECK: call spir_func addrspace(9) void @__kmpc_end_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
    +
     int main() {
       int ret = 0;
       #pragma omp target
       for(int i = 0; i < 5; i++)
    +    #pragma omp critical
         ret++;
       return ret;
     }
    diff --git a/clang/test/OpenMP/target_default_codegen.cpp b/clang/test/OpenMP/target_default_codegen.cpp
    new file mode 100644
    index 0000000000000..eadd0e57945b1
    --- /dev/null
    +++ b/clang/test/OpenMP/target_default_codegen.cpp
    @@ -0,0 +1,2020 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
    +// expected-no-diagnostics
    +
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-64
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-32
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  %s --check-prefix CK-32
    +
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-64 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-64 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -verify -Wno-vla  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-32 %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis  -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
    +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla  %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap  --check-prefix SIMD-ONLY-32 %s
    +
    +#ifndef HEADER
    +#define HEADER
    +void foo1(int a) {
    +  double d = (double)a;
    +
    +  #pragma omp target default(private: scalar)
    +  {
    +    d += 1.0;
    +  }
    +}
    +
    +void foo2() {
    +  int pvtArr[10];
    +
    +  #pragma omp target default(private: aggregate)
    +  {
    +    pvtArr[5]++;
    +  }
    +}
    +
    +void foo3() {
    +  int *pa;
    +
    +  #pragma omp target default(private: pointer)
    +  {
    +    pa[50]++;
    +  }
    +}
    +
    +// The specified variable-category doesn't apply to the referenced variable,
    +// so normal implicitly determined data-sharing applies.
    +void foo4() {
    +  int p;
    +
    +  #pragma omp target default(private: pointer)
    +  {
    +    p++;
    +  }
    +}
    +
    +// Verify that a default clause with variable-category 'all' is equivalent to
    +// no variable-category. The IR checks were generated with 'all', but the test
    +// runs without a variable-category.
    +void foo5(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(private)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a default clause with 'shared' DSA is ignored, which makes it
    +// equivalent to a target with no default clause. The IR checks were generated
    +// with no default clause, but the test runs with default 'shared'.
    +void foo6(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(shared)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a default clause with 'firstprivate' DSA is equivalent to
    +// specifying defaultmap with 'firstprivate'. The IR checks were generated with
    +// defaultmap(firstprivate), but the test runs with default(firstprivate).
    +void foo7(int a) {
    +  double d = (double)a;
    +  int pvtArr[10];
    +  int *pa;
    +
    +  #pragma omp target default(firstprivate)
    +  {
    +    d += 1.0;
    +    pvtArr[5]++;
    +    pa[50]++;
    +  }
    +}
    +
    +// Verify that a 'default' clause on a combined 'target' directive is
    +// equivalent to specifying its constituent directives with 'default' clauses.
    +// The IR checks were generated with the constituent directives, but the test
    +// runs with the combined directive.
    +void foo8() {
    +  int x = 0;
    +  #pragma omp target teams distribute parallel for default(firstprivate) firstprivate(x)
    +  for (int i=0; i<10; i++)
    +    x += 1;
    +}
    +#endif // HEADER
    +// CK-64-LABEL: define dso_local void @_Z4foo1i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP9]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP7]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP20]], align 4
    +// CK-64-NEXT:    [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CK-64-NEXT:    br i1 [[TMP22]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(
    +// CK-64-SAME: i64 [[D:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load double, ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP0]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo2v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP0]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP1]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP5]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP6]], align 4
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
    +// CK-64-NEXT:    br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(
    +// CK-64-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19:![0-9]+]], !align [[META20:![0-9]+]]
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo3v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[TMP0]], ptr [[TMP1]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store ptr [[TMP0]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-64-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
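    +// Note: foo3's region below likewise reads through a fresh, uninitialized
    +// local pointer ([[PA1]]) rather than the mapped argument, again matching
    +// privatization of the captured pointer.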
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(
    +// CK-64-SAME: ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA1:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
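    +// Note: foo4 captures a scalar by value: [[P]] is coerced through an i64
    +// slot ([[P_CASTED]]) and that value, not a pointer, is stored into the
    +// offload baseptr/ptr arrays.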
    +// CK-64-LABEL: define dso_local void @_Z4foo4v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[P_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[P_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[P_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-64-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i64 [[TMP1]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(
    +// CK-64-SAME: i64 [[P:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[P_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store i64 [[P]], ptr [[P_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[P_ADDR]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
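    +// Note: foo5 maps three captures (the coerced double, the array, and the
    +// pointer value), so the offload arrays are [3 x ptr] and kernel-args
    +// field 1 records 3 arguments.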
    +// CK-64-LABEL: define dso_local void @_Z4foo5i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
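    +// Note: foo5's region privatizes all three captures: fresh [[D1]],
    +// [[PVTARR2]], and [[PA3]] allocas are used and the incoming values are
    +// ignored by the body.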
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA3:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// CK-64-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo6i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
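    +// Note: unlike foo5, foo6's region below operates on the incoming storage
    +// itself ([[D_ADDR]], the mapped array through [[TMP0]], and [[PA_ADDR]]);
    +// no private copies are created.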
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// CK-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define dso_local void @_Z4foo7i(
    +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[D_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// CK-64-NEXT:    store double [[TMP1]], ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP2]], ptr [[TMP5]], align 8
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP7]], align 8
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[TMP8]], align 8
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CK-64-NEXT:    store ptr null, ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP3]], ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CK-64-NEXT:    store ptr null, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP15]], align 4
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 3, ptr [[TMP16]], align 4
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP13]], ptr [[TMP17]], align 8
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP14]], ptr [[TMP18]], align 8
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP19]], align 8
    +// CK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP20]], align 8
    +// CK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP21]], align 8
    +// CK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP22]], align 8
    +// CK-64-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 0, ptr [[TMP23]], align 8
    +// CK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP24]], align 8
    +// CK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4
    +// CK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4
    +// CK-64-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP27]], align 4
    +// CK-64-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CK-64-NEXT:    br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
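    +// Note: foo7's region shows firstprivate-style initialization of the array:
    +// a 40-byte @llvm.memcpy seeds the fresh local copy [[PVTARR1]] from the
    +// mapped data before the body modifies it.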
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(
    +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-64-NEXT:    store i64 [[D]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]]
    +// CK-64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[PVTARR1]], ptr align 4 [[TMP0]], i64 40, i1 false)
    +// CK-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-64-NEXT:    store double [[ADD]], ptr [[D_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// CK-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8
    +// CK-64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
    +// CK-64-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-64-NEXT:    store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4
    +// CK-64-NEXT:    ret void
    +//
    +//
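    +// Note: foo8 is a teams-based construct: the loop trip count (10) is stored
    +// in kernel-args field 8, the offload entry calls @__kmpc_fork_teams, and
    +// the teams-outlined function forks a nested parallel region that runs the
    +// statically scheduled loop.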
    +// CK-64-LABEL: define dso_local void @_Z4foo8v(
    +// CK-64-SAME: ) #[[ATTR0]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[X]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP2]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 8
    +// CK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CK-64-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-64-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-64-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 8
    +// CK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-64-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 8
    +// CK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-64-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP11]], align 8
    +// CK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-64-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP12]], align 8
    +// CK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-64-NEXT:    store ptr null, ptr [[TMP13]], align 8
    +// CK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-64-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-64-NEXT:    store i64 10, ptr [[TMP15]], align 8
    +// CK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-64-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-64-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-64-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-64-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]])
    +// CK-64-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-64-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-64:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-64-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i64 [[TMP1]]) #[[ATTR2]]
    +// CK-64-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-64:       [[OMP_OFFLOAD_CONT]]:
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(
    +// CK-64-SAME: i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i64 [[TMP1]])
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined(
    +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[X_CASTED:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
    +// CK-64-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
    +// CK-64-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-64:       [[COND_TRUE]]:
    +// CK-64-NEXT:    br label %[[COND_END:.*]]
    +// CK-64:       [[COND_FALSE]]:
    +// CK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    br label %[[COND_END]]
    +// CK-64:       [[COND_END]]:
    +// CK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
    +// CK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-64:       [[OMP_INNER_FOR_COND]]:
    +// CK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
    +// CK-64-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-64:       [[OMP_INNER_FOR_BODY]]:
    +// CK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
    +// CK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
    +// CK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    store i32 [[TMP11]], ptr [[X_CASTED]], align 4
    +// CK-64-NEXT:    [[TMP12:%.*]] = load i64, ptr [[X_CASTED]], align 8
    +// CK-64-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]])
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-64:       [[OMP_INNER_FOR_INC]]:
    +// CK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
    +// CK-64-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-64:       [[OMP_INNER_FOR_END]]:
    +// CK-64-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-64:       [[OMP_LOOP_EXIT]]:
    +// CK-64-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
    +// CK-64-NEXT:    ret void
    +//
    +//
    +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined(
    +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] {
    +// CK-64-NEXT:  [[ENTRY:.*:]]
    +// CK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[X_ADDR:%.*]] = alloca i64, align 8
    +// CK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CK-64-NEXT:    store i64 [[X]], ptr [[X_ADDR]], align 8
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
    +// CK-64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
    +// CK-64-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
    +// CK-64-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
    +// CK-64-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-64:       [[COND_TRUE]]:
    +// CK-64-NEXT:    br label %[[COND_END:.*]]
    +// CK-64:       [[COND_FALSE]]:
    +// CK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    br label %[[COND_END]]
    +// CK-64:       [[COND_END]]:
    +// CK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ]
    +// CK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CK-64-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-64:       [[OMP_INNER_FOR_COND]]:
    +// CK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
    +// CK-64-NEXT:    br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-64:       [[OMP_INNER_FOR_BODY]]:
    +// CK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
    +// CK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
    +// CK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
    +// CK-64-NEXT:    store i32 [[ADD3]], ptr [[X_ADDR]], align 4
    +// CK-64-NEXT:    br label %[[OMP_BODY_CONTINUE:.*]]
    +// CK-64:       [[OMP_BODY_CONTINUE]]:
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-64:       [[OMP_INNER_FOR_INC]]:
    +// CK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1
    +// CK-64-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
    +// CK-64-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-64:       [[OMP_INNER_FOR_END]]:
    +// CK-64-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-64:       [[OMP_LOOP_EXIT]]:
    +// CK-64-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
    +// CK-64-NEXT:    ret void
    +//
    +//
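    +// Note: the CK-32 checks below cover the 32-bit variant of the same test;
    +// they follow the CK-64 pattern with 4-byte pointer slots (align 4) and
    +// their own metadata numbering ([[META20]]/[[META21]]).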
    +// CK-32-LABEL: define dso_local void @_Z4foo1i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-32-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(ptr [[D]]) #[[ATTR2:[0-9]+]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20:![0-9]+]], !align [[META21:![0-9]+]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo2v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP0]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP13]], align 8
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
    +// CK-32-NEXT:    br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo3v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[TMP0]], ptr [[TMP1]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[TMP0]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP14]], align 8
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CK-32-NEXT:    br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(
    +// CK-32-SAME: ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA1:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo4v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[P_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[P_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.5, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-32-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i32 [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(
    +// CK-32-SAME: i32 [[P:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[P_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store i32 [[P]], ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[P_ADDR]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo5i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA3:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP3]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[PA3]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 50
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// CK-32-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP5]], 1
    +// CK-32-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo6i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.9, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.10, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
    +// CK-32-NEXT:    store double [[TMP2]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[TMP3:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
    +// CK-32-NEXT:    [[INC3:%.*]] = add nsw i32 [[TMP6]], 1
    +// CK-32-NEXT:    store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo7i(
    +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CK-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// CK-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr [[D]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP5]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[TMP6]], align 4
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CK-32-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP1]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr null, ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 3, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP11]], ptr [[TMP15]], align 4
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP12]], ptr [[TMP16]], align 4
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.11, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.12, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP20]], align 4
    +// CK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 0, ptr [[TMP21]], align 8
    +// CK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP22]], align 8
    +// CK-32-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4
    +// CK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4
    +// CK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP25]], align 4
    +// CK-32-NEXT:    [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
    +// CK-32-NEXT:    br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(
    +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[D_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PVTARR_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[PA_ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// CK-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// CK-32-NEXT:    store ptr [[D]], ptr [[D_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[PA]], ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]]
    +// CK-32-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
    +// CK-32-NEXT:    store double [[TMP2]], ptr [[D1]], align 8
    +// CK-32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[PVTARR2]], ptr align 4 [[TMP1]], i32 40, i1 false)
    +// CK-32-NEXT:    [[TMP3:%.*]] = load double, ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00
    +// CK-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// CK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
    +// CK-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4
    +// CK-32-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
    +// CK-32-NEXT:    [[INC4:%.*]] = add nsw i32 [[TMP6]], 1
    +// CK-32-NEXT:    store i32 [[INC4]], ptr [[ARRAYIDX3]], align 4
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define dso_local void @_Z4foo8v(
    +// CK-32-SAME: ) #[[ATTR0]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CK-32-NEXT:    store i32 0, ptr [[X]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP2]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[TMP3]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CK-32-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CK-32-NEXT:    store i32 3, ptr [[TMP7]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CK-32-NEXT:    store i32 1, ptr [[TMP8]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP9]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CK-32-NEXT:    store ptr [[TMP6]], ptr [[TMP10]], align 4
    +// CK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CK-32-NEXT:    store ptr @.offload_sizes.13, ptr [[TMP11]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CK-32-NEXT:    store ptr @.offload_maptypes.14, ptr [[TMP12]], align 4
    +// CK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CK-32-NEXT:    store ptr null, ptr [[TMP13]], align 4
    +// CK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CK-32-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CK-32-NEXT:    store i64 10, ptr [[TMP15]], align 8
    +// CK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CK-32-NEXT:    store i64 0, ptr [[TMP16]], align 8
    +// CK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
    +// CK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CK-32-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
    +// CK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CK-32-NEXT:    store i32 0, ptr [[TMP19]], align 4
    +// CK-32-NEXT:    [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]])
    +// CK-32-NEXT:    [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
    +// CK-32-NEXT:    br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]]
    +// CK-32:       [[OMP_OFFLOAD_FAILED]]:
    +// CK-32-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i32 [[TMP1]]) #[[ATTR2]]
    +// CK-32-NEXT:    br label %[[OMP_OFFLOAD_CONT]]
    +// CK-32:       [[OMP_OFFLOAD_CONT]]:
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(
    +// CK-32-SAME: i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i32 [[TMP1]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined(
    +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_CASTED:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
    +// CK-32-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
    +// CK-32-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-32:       [[COND_TRUE]]:
    +// CK-32-NEXT:    br label %[[COND_END:.*]]
    +// CK-32:       [[COND_FALSE]]:
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    br label %[[COND_END]]
    +// CK-32:       [[COND_END]]:
    +// CK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
    +// CK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-32:       [[OMP_INNER_FOR_COND]]:
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
    +// CK-32-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-32:       [[OMP_INNER_FOR_BODY]]:
    +// CK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP9]], ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_CASTED]], align 4
    +// CK-32-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]], i32 [[TMP10]])
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-32:       [[OMP_INNER_FOR_INC]]:
    +// CK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
    +// CK-32-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-32:       [[OMP_INNER_FOR_END]]:
    +// CK-32-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-32:       [[OMP_LOOP_EXIT]]:
    +// CK-32-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined(
    +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] {
    +// CK-32-NEXT:  [[ENTRY:.*:]]
    +// CK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CK-32-NEXT:    store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CK-32-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
    +// CK-32-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
    +// CK-32-NEXT:    br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
    +// CK-32:       [[COND_TRUE]]:
    +// CK-32-NEXT:    br label %[[COND_END:.*]]
    +// CK-32:       [[COND_FALSE]]:
    +// CK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    br label %[[COND_END]]
    +// CK-32:       [[COND_END]]:
    +// CK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ]
    +// CK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CK-32-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND:.*]]
    +// CK-32:       [[OMP_INNER_FOR_COND]]:
    +// CK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
    +// CK-32-NEXT:    br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
    +// CK-32:       [[OMP_INNER_FOR_BODY]]:
    +// CK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
    +// CK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
    +// CK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
    +// CK-32-NEXT:    store i32 [[ADD2]], ptr [[X_ADDR]], align 4
    +// CK-32-NEXT:    br label %[[OMP_BODY_CONTINUE:.*]]
    +// CK-32:       [[OMP_BODY_CONTINUE]]:
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_INC:.*]]
    +// CK-32:       [[OMP_INNER_FOR_INC]]:
    +// CK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP11]], 1
    +// CK-32-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
    +// CK-32-NEXT:    br label %[[OMP_INNER_FOR_COND]]
    +// CK-32:       [[OMP_INNER_FOR_END]]:
    +// CK-32-NEXT:    br label %[[OMP_LOOP_EXIT:.*]]
    +// CK-32:       [[OMP_LOOP_EXIT]]:
    +// CK-32-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
    +// CK-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo1i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo2v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo3v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[PA1:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo4v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[P]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo5i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA3:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo6i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo7i(
    +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-64-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-64-NEXT:    [[PA:%.*]] = alloca ptr, align 8
    +// SIMD-ONLY-64-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-64-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-64-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-64-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8
    +// SIMD-ONLY-64-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50
    +// SIMD-ONLY-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo8v(
    +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-64-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-64-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-64-NEXT:    store i32 0, ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    store i32 0, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_COND:.*]]
    +// SIMD-ONLY-64:       [[FOR_COND]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
    +// SIMD-ONLY-64-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
    +// SIMD-ONLY-64:       [[FOR_BODY]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[ADD]], ptr [[X]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_INC:.*]]
    +// SIMD-ONLY-64:       [[FOR_INC]]:
    +// SIMD-ONLY-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-64-NEXT:    store i32 [[INC]], ptr [[I]], align 4
    +// SIMD-ONLY-64-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
    +// SIMD-ONLY-64:       [[FOR_END]]:
    +// SIMD-ONLY-64-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo1i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo2v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PVTARR1:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo3v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[PA1:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo4v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[P:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP0]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[P]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo5i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    [[D1:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR2:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA3:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D1]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC5:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo6i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo7i(
    +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[D:%.*]] = alloca double, align 8
    +// SIMD-ONLY-32-NEXT:    [[PVTARR:%.*]] = alloca [10 x i32], align 4
    +// SIMD-ONLY-32-NEXT:    [[PA:%.*]] = alloca ptr, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
    +// SIMD-ONLY-32-NEXT:    store double [[CONV]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load double, ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00
    +// SIMD-ONLY-32-NEXT:    store double [[ADD]], ptr [[D]], align 8
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX]], align 4
    +// SIMD-ONLY-32-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50
    +// SIMD-ONLY-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC2:%.*]] = add nsw i32 [[TMP4]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//
    +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo8v(
    +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] {
    +// SIMD-ONLY-32-NEXT:  [[ENTRY:.*:]]
    +// SIMD-ONLY-32-NEXT:    [[X:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// SIMD-ONLY-32-NEXT:    store i32 0, ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    store i32 0, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_COND:.*]]
    +// SIMD-ONLY-32:       [[FOR_COND]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
    +// SIMD-ONLY-32-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
    +// SIMD-ONLY-32:       [[FOR_BODY]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[ADD]], ptr [[X]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_INC:.*]]
    +// SIMD-ONLY-32:       [[FOR_INC]]:
    +// SIMD-ONLY-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP2]], 1
    +// SIMD-ONLY-32-NEXT:    store i32 [[INC]], ptr [[I]], align 4
    +// SIMD-ONLY-32-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
    +// SIMD-ONLY-32:       [[FOR_END]]:
    +// SIMD-ONLY-32-NEXT:    ret void
    +//
    +//.
    +// CK-64: [[META19]] = !{}
    +// CK-64: [[META20]] = !{i64 4}
    +//.
    +// CK-32: [[META20]] = !{}
    +// CK-32: [[META21]] = !{i64 4}
    +//.
    +// SIMD-ONLY-64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
    +// SIMD-ONLY-64: [[META3]] = !{!"llvm.loop.mustprogress"}
    +//.
    +// SIMD-ONLY-32: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
    +// SIMD-ONLY-32: [[META4]] = !{!"llvm.loop.mustprogress"}
    +//.
    diff --git a/clang/test/OpenMP/target_default_messages.cpp b/clang/test/OpenMP/target_default_messages.cpp
    index be677dffa21ca..6a1a1f99360b5 100644
    --- a/clang/test/OpenMP/target_default_messages.cpp
    +++ b/clang/test/OpenMP/target_default_messages.cpp
    @@ -24,6 +24,8 @@ int main(int argc, char **argv) {
       for (int i=0; i<200; i++) foo();
     #pragma omp target  default(x) // expected-error {{expected 'none', 'shared', 'private' or 'firstprivate' in OpenMP clause 'default'}}
       for (int i=0; i<200; i++) foo();
    +#pragma omp target default(none) // expected-note {{explicit data sharing attribute, data mapping attribute, or is_device_ptr clause requested here}}
    +  x++; // expected-error {{variable 'x' must have explicitly specified data sharing attributes, data mapping attributes, or in an is_device_ptr clause}}
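+// With default(none), any variable referenced in the region needs an explicit
+// data-sharing attribute, data-mapping attribute, or an is_device_ptr clause;
+// e.g. adding firstprivate(x) or map(x) to the directive above would silence
+// the error checked here.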
     #endif 
     
     #ifdef OMP52
    diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
    new file mode 100644
    index 0000000000000..758f35d629ace
    --- /dev/null
    +++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
    @@ -0,0 +1,2633 @@
    +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
    +// Test host codegen.
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +
    +// Test target codegen - host bc file has to be created first.
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
    +
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
    +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
    +
    +
    +// expected-no-diagnostics
    +#ifndef HEADER
    +#define HEADER
    +
    +
    +
    +
    +// We have 6 target regions
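+// (two in ftemplate, two in fstatic, and two in S1::r1 below).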
    +
    +
    +
+// Check that target registration is emitted as a Ctor.
    +
    +
+template <typename tx>
    +tx ftemplate(int n) {
    +  tx a = 0;
    +
    +  #pragma omp target teams dyn_groupprivate(tx(20))
    +  {
    +  }
    +
    +  short b = 1;
    +  #pragma omp target teams num_teams(b) dyn_groupprivate(1024)
    +  {
    +    a += b;
    +  }
    +
    +  return a;
    +}
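+// A constant dyn_groupprivate size is forwarded through the last (i32) field
+// of the __tgt_kernel_arguments struct; see the "store i32 20" check in the
+// CHECK1 lines for this function below.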
    +
    +static
    +int fstatic(int n) {
    +
    +  #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32)
    +  for (int i = 0; i < n ; ++i) {
    +  }
    +
    +  #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait
    +  {
    +  }
    +
    +  return n+1;
    +}
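+// The nowait region above is expected to be emitted as an offloading task
+// (see the __kmpc_omp_target_task_alloc call in the CHECK1 lines for fstatic).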
    +
    +struct S1 {
    +  double a;
    +
    +  int r1(int n){
    +    int b = 1;
    +
    +    #pragma omp target teams dyn_groupprivate(fallback(null): n-b)
    +    {
    +      this->a = (double)b + 1.5;
    +    }
    +
    +    #pragma omp target dyn_groupprivate(fallback(abort): 1024)
    +    {
    +      this->a = 2.5;
    +    }
    +
    +    return (int)a;
    +  }
    +};
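+// In r1, the n-b size expression is captured (.capture_expr.) and stored into
+// field 12 of __tgt_kernel_arguments before the first kernel launch (see the
+// CHECK1 lines for _ZN2S12r1Ei).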
    +
    +int bar(int n){
    +  int a = 0;
    +
    +  S1 S;
    +  a += S.r1(n);
    +
    +  a += fstatic(n);
    +
+  a += ftemplate<int>(n);
    +
    +  return a;
    +}
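+// The clause forms exercised above follow the pattern
+//   dyn_groupprivate([fallback(default_mem|null|abort):] <size-expression>)
+// covering the plain form and all three fallback modifiers used in this test.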
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +// Check that the offloading functions are emitted and that the parallel function
    +// is appropriately guarded.
    +
    +
    +
    +
    +
    +
    +#endif
    +
    +// CHECK1-LABEL: define {{[^@]+}}@_Z3bari
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]])
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]])
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
    +// CHECK1-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]])
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
    +// CHECK1-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    ret i32 [[TMP6]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
    +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[B:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i32 1, ptr [[B]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
    +// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[A]], ptr [[TMP7]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP12]], align 8
    +// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP13]], align 8
    +// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
    +// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 8
    +// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 8
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 8
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP25]], align 8
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP26]], align 8
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 4, ptr [[TMP27]], align 8
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
    +// CHECK1-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
    +// CHECK1-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 8
    +// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK1-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 8
    +// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP35]], align 8
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 1, ptr [[TMP39]], align 4
    +// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 8
    +// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 8
    +// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 8
    +// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8
    +// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP44]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP46]], align 8
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP47]], align 8
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
    +// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
    +// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 1024, ptr [[TMP50]], align 4
    +// CHECK1-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
    +// CHECK1-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
    +// CHECK1-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
    +// CHECK1:       omp_offload.failed7:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
    +// CHECK1:       omp_offload.cont8:
    +// CHECK1-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 8
    +// CHECK1-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
    +// CHECK1-NEXT:    ret i32 [[CONV]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8
    +// CHECK1-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
    +// CHECK1-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP12]], align 8
    +// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP13]], align 8
    +// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
    +// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP15]], align 8
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP16]], align 8
    +// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP17]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
    +// CHECK1-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
    +// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 8
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 8
    +// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 8
    +// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8
    +// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP33]], align 8
    +// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP34]], align 8
    +// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP36]], align 8
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
    +// CHECK1-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
    +// CHECK1-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
    +// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK1-NEXT:    [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP46]], align 8
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP47]], align 8
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
    +// CHECK1-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1)
    +// CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i64 4, i1 false)
    +// CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
    +// CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
    +// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false)
    +// CHECK1-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
    +// CHECK1-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
    +// CHECK1-NEXT:    ret i32 [[ADD12]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
    +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[B:%.*]] = alloca i16, align 2
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
    +// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP2]], align 8
    +// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP3]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP4]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP5]], align 8
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP7]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP9]], align 8
    +// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
    +// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
    +// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 20, ptr [[TMP12]], align 4
    +// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK1-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
    +// CHECK1-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK1:       omp_offload.failed:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK1:       omp_offload.cont:
    +// CHECK1-NEXT:    store i16 1, ptr [[B]], align 2
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP22]], align 8
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP23]], align 8
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP25]], align 8
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP26]], align 8
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8
    +// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP28]], align 8
    +// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP29]], align 8
    +// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP30]], align 8
    +// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
    +// CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
    +// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
    +// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
    +// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 8
    +// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 8
    +// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 8
    +// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8
    +// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP42]], align 8
    +// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP43]], align 8
    +// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP44]], align 8
    +// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 8, ptr [[TMP45]], align 8
    +// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
    +// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
    +// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 1024, ptr [[TMP48]], align 4
    +// CHECK1-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
    +// CHECK1-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
    +// CHECK1-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
    +// CHECK1:       omp_offload.failed2:
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
    +// CHECK1:       omp_offload.cont3:
    +// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK1-NEXT:    ret i32 [[TMP51]]
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK1-NEXT:    store double [[ADD]], ptr [[A]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK1-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK1:       omp.precond.then:
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK1-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK1:       cond.true:
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    br label [[COND_END:%.*]]
    +// CHECK1:       cond.false:
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    br label [[COND_END]]
    +// CHECK1:       cond.end:
    +// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK1:       omp.inner.for.cond:
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK1:       omp.inner.for.body:
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK1:       omp.inner.for.inc:
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
    +// CHECK1:       omp.inner.for.end:
    +// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK1:       omp.loop.exit:
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
    +// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
    +// CHECK1-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK1:       .omp.final.then:
    +// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
    +// CHECK1-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK1-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK1:       .omp.final.done:
    +// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK1:       omp.precond.end:
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[I4:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK1:       omp.precond.then:
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK1-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK1-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
    +// CHECK1-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK1-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK1:       cond.true:
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK1-NEXT:    br label [[COND_END:%.*]]
    +// CHECK1:       cond.false:
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    br label [[COND_END]]
    +// CHECK1:       cond.end:
    +// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK1:       omp.inner.for.cond:
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK1:       omp.inner.for.body:
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK1:       omp.body.continue:
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK1:       omp.inner.for.inc:
    +// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
    +// CHECK1:       omp.inner.for.end:
    +// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK1:       omp.loop.exit:
    +// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
    +// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK1-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK1-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK1:       .omp.final.then:
    +// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK1-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK1-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
    +// CHECK1-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
    +// CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
    +// CHECK1-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
    +// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK1:       .omp.final.done:
    +// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK1:       omp.precond.end:
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
    +// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
    +// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 8
    +// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
    +// CHECK1-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry.
    +// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
    +// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
    +// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
    +// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
    +// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
    +// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]]
    +// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
    +// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
    +// CHECK1-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
    +// CHECK1-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
    +// CHECK1-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
    +// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
    +// CHECK1-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
    +// CHECK1-NEXT:    store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
    +// CHECK1-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
    +// CHECK1-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
    +// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
    +// CHECK1-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
    +// CHECK1-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CHECK1-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
    +// CHECK1:       omp_offload.failed.i:
    +// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]]
    +// CHECK1-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]]
    +// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]]
    +// CHECK1-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
    +// CHECK1:       .omp_outlined..exit:
    +// CHECK1-NEXT:    ret i32 0
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK1-SAME: () #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK1-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK1-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK1-NEXT:  entry:
    +// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK1-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_Z3bari
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]])
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]])
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
    +// CHECK3-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]])
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
    +// CHECK3-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret i32 [[TMP6]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
    +// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[B]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
    +// CHECK3-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[A]], ptr [[TMP7]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP8]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP13]], align 4
    +// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 4
    +// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 4
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP25]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP26]], align 8
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 4, ptr [[TMP27]], align 8
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
    +// CHECK3-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 4
    +// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 4
    +// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP35]], align 4
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 1, ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 4
    +// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 4
    +// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP44]], align 4
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP45]], align 4
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP46]], align 8
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP47]], align 8
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
    +// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
    +// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 1024, ptr [[TMP50]], align 4
    +// CHECK3-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
    +// CHECK3-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
    +// CHECK3-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
    +// CHECK3:       omp_offload.failed7:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
    +// CHECK3:       omp_offload.cont8:
    +// CHECK3-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 4
    +// CHECK3-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
    +// CHECK3-NEXT:    ret i32 [[CONV]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4
    +// CHECK3-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
    +// CHECK3-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP13]], align 4
    +// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
    +// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP15]], align 4
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP16]], align 4
    +// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP17]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
    +// CHECK3-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
    +// CHECK3-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 4
    +// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4
    +// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP33]], align 4
    +// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP34]], align 4
    +// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP36]], align 8
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
    +// CHECK3-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
    +// CHECK3-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP45]], align 4
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP46]], align 4
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP47]], align 4
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
    +// CHECK3-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1)
    +// CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
    +// CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false)
    +// CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
    +// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false)
    +// CHECK3-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
    +// CHECK3-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
    +// CHECK3-NEXT:    ret i32 [[ADD12]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
    +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[B:%.*]] = alloca i16, align 2
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
    +// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP2]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP3]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP4]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP5]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP7]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP8]], align 8
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP9]], align 8
    +// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 20, ptr [[TMP12]], align 4
    +// CHECK3-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
    +// CHECK3-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
    +// CHECK3-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
    +// CHECK3:       omp_offload.failed:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
    +// CHECK3:       omp_offload.cont:
    +// CHECK3-NEXT:    store i16 1, ptr [[B]], align 2
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP22]], align 4
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP23]], align 4
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP25]], align 4
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP26]], align 4
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4
    +// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP28]], align 4
    +// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
    +// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP29]], align 4
    +// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP30]], align 4
    +// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
    +// CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
    +// CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
    +// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
    +// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
    +// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 4
    +// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 4
    +// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 4
    +// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4
    +// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP42]], align 4
    +// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP43]], align 4
    +// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP44]], align 8
    +// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 8, ptr [[TMP45]], align 8
    +// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
    +// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
    +// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 1024, ptr [[TMP48]], align 4
    +// CHECK3-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
    +// CHECK3-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
    +// CHECK3-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
    +// CHECK3:       omp_offload.failed2:
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
    +// CHECK3:       omp_offload.cont3:
    +// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret i32 [[TMP51]]
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK3-NEXT:    store double [[ADD]], ptr [[A]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK3-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK3:       omp.precond.then:
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK3:       cond.true:
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    br label [[COND_END:%.*]]
    +// CHECK3:       cond.false:
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    br label [[COND_END]]
    +// CHECK3:       cond.end:
    +// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK3:       omp.inner.for.cond:
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]]
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK3:       omp.inner.for.body:
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK3:       omp.inner.for.inc:
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
    +// CHECK3:       omp.inner.for.end:
    +// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK3:       omp.loop.exit:
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CHECK3-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK3:       .omp.final.then:
    +// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
    +// CHECK3-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK3-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK3-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK3:       .omp.final.done:
    +// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK3:       omp.precond.end:
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK3:       omp.precond.then:
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK3:       cond.true:
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK3-NEXT:    br label [[COND_END:%.*]]
    +// CHECK3:       cond.false:
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    br label [[COND_END]]
    +// CHECK3:       cond.end:
    +// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK3:       omp.inner.for.cond:
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK3:       omp.inner.for.body:
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK3:       omp.body.continue:
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK3:       omp.inner.for.inc:
    +// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK3-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
    +// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
    +// CHECK3:       omp.inner.for.end:
    +// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK3:       omp.loop.exit:
    +// CHECK3-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
    +// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK3-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK3-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK3:       .omp.final.then:
    +// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK3-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK3-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
    +// CHECK3-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
    +// CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
    +// CHECK3-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
    +// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK3:       .omp.final.done:
    +// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK3:       omp.precond.end:
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map.
    +// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
    +// CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 4
    +// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry.
    +// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
    +// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
    +// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
    +// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
    +// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
    +// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]]
    +// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
    +// CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
    +// CHECK3-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
    +// CHECK3-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
    +// CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
    +// CHECK3-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
    +// CHECK3-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
    +// CHECK3-NEXT:    store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
    +// CHECK3-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
    +// CHECK3-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
    +// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
    +// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
    +// CHECK3-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
    +// CHECK3-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
    +// CHECK3:       omp_offload.failed.i:
    +// CHECK3-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
    +// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]]
    +// CHECK3-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
    +// CHECK3:       .omp_outlined..exit:
    +// CHECK3-NEXT:    ret i32 0
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK3-SAME: () #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
    +// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK3-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK3-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK3-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
    +// CHECK3-NEXT:  entry:
    +// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK3-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK3-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK9:       omp.precond.then:
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK9-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK9:       cond.true:
    +// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    br label [[COND_END:%.*]]
    +// CHECK9:       cond.false:
    +// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    br label [[COND_END]]
    +// CHECK9:       cond.end:
    +// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK9:       omp.inner.for.cond:
    +// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
    +// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK9-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK9:       omp.inner.for.body:
    +// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
    +// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
    +// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK9:       omp.inner.for.inc:
    +// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
    +// CHECK9:       omp.inner.for.end:
    +// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK9:       omp.loop.exit:
    +// CHECK9-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]])
    +// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
    +// CHECK9-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK9:       .omp.final.then:
    +// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
    +// CHECK9-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK9-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK9-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK9:       .omp.final.done:
    +// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK9:       omp.precond.end:
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    [[I4:%.*]] = alloca i32, align 4
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK9:       omp.precond.then:
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
    +// CHECK9-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
    +// CHECK9-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
    +// CHECK9-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK9-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK9:       cond.true:
    +// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK9-NEXT:    br label [[COND_END:%.*]]
    +// CHECK9:       cond.false:
    +// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    br label [[COND_END]]
    +// CHECK9:       cond.end:
    +// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK9:       omp.inner.for.cond:
    +// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]]
    +// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK9-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK9:       omp.inner.for.body:
    +// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK9:       omp.body.continue:
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK9:       omp.inner.for.inc:
    +// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK9-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
    +// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]]
    +// CHECK9:       omp.inner.for.end:
    +// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK9:       omp.loop.exit:
    +// CHECK9-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
    +// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK9-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK9-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK9:       .omp.final.then:
    +// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK9-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK9-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
    +// CHECK9-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
    +// CHECK9-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
    +// CHECK9-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
    +// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK9:       .omp.final.done:
    +// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK9:       omp.precond.end:
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
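For orientation while reading the `_ZN2S12r1Ei` checks below (wrappers at source lines 88 and 93): a hedged sketch of the member-function pattern they appear to correspond to. The real test source is elided from this diff; the struct layout (a single `double` at offset 0) and the arithmetic are read off the IR, while the `dyn_groupprivate` clause spelling and the name `n` are assumptions, not quotes from the test.

```cpp
// Hedged sketch only; the clause and 'n' are inferred from the wrapper's
// .capture_expr. argument, not taken from the elided test source.
struct S1 {
  double a;
  int r1(int b) {
    int n = 1024; // assumed captured size expression passed to the l88 wrapper
    #pragma omp target teams dyn_groupprivate(n) // wrapper at source line 88
    a = b + 1.5;  // lowered as: store (double)b + 1.5 into this->a
    #pragma omp target                           // wrapper at source line 93
    a = 2.5;      // lowered as: store double 2.5 into this->a
    return (int)a;
  }
};
```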
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK9-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK9-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK9-NEXT:    store double [[ADD]], ptr [[A]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
    +// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK9-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
    +// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK9-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
    +// CHECK9-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK9-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK9-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
    +// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK9-NEXT:  entry:
    +// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
    +// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
    +// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
    +// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
    +// CHECK9-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK9-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK9-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK9-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK11:       omp.precond.then:
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
    +// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK11:       cond.true:
    +// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    br label [[COND_END:%.*]]
    +// CHECK11:       cond.false:
    +// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    br label [[COND_END]]
    +// CHECK11:       cond.end:
    +// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
    +// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
    +// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK11:       omp.inner.for.cond:
    +// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
    +// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
    +// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK11:       omp.inner.for.body:
    +// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK11:       omp.inner.for.inc:
    +// CHECK11-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
    +// CHECK11:       omp.inner.for.end:
    +// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK11:       omp.loop.exit:
    +// CHECK11-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]])
    +// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
    +// CHECK11-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK11:       .omp.final.then:
    +// CHECK11-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
    +// CHECK11-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
    +// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
    +// CHECK11-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK11-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
    +// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK11:       .omp.final.done:
    +// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK11:       omp.precond.end:
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
    +// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
    +// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
    +// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
    +// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
    +// CHECK11:       omp.precond.then:
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
    +// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
    +// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
    +// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
    +// CHECK11:       cond.true:
    +// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
    +// CHECK11-NEXT:    br label [[COND_END:%.*]]
    +// CHECK11:       cond.false:
    +// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    br label [[COND_END]]
    +// CHECK11:       cond.end:
    +// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
    +// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
    +// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
    +// CHECK11:       omp.inner.for.cond:
    +// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
    +// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
    +// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
    +// CHECK11:       omp.inner.for.body:
    +// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
    +// CHECK11:       omp.body.continue:
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
    +// CHECK11:       omp.inner.for.inc:
    +// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
    +// CHECK11-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
    +// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
    +// CHECK11:       omp.inner.for.end:
    +// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
    +// CHECK11:       omp.loop.exit:
    +// CHECK11-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
    +// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
    +// CHECK11-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
    +// CHECK11-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
    +// CHECK11-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
    +// CHECK11:       .omp.final.then:
    +// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
    +// CHECK11-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
    +// CHECK11-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
    +// CHECK11-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
    +// CHECK11-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
    +// CHECK11-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
    +// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
    +// CHECK11:       .omp.final.done:
    +// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
    +// CHECK11:       omp.precond.end:
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
    +// CHECK11-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
    +// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK11-NEXT:    store double [[ADD]], ptr [[A]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
    +// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
    +// CHECK11-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
    +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
    +// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
    +// CHECK11-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
    +// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
    +// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
    +// CHECK11-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK11-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
    +// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
    +// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
    +// CHECK11-NEXT:    ret void
    +//
    +//
    +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
    +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
    +// CHECK11-NEXT:  entry:
    +// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
    +// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
    +// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
    +// CHECK11-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
    +// CHECK11-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
    +// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
    +// CHECK11-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
    +// CHECK11-NEXT:    ret void
    +//
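Before the new parser tests, a hedged sketch of the construct behind the `ftemplate` l60 checks above (the test source itself is elided from this diff): a `target teams` region whose `num_teams` value has type `short`, which accounts for the `load i16`, the `sext` to `i32`, and the `__kmpc_push_num_teams` call that precedes `__kmpc_fork_teams`.

```cpp
// Hedged sketch; names and the initializer of 'b' are illustrative only.
template <typename T>
T ftemplate(int n) {
  T a = T();
  short b = 1;
  #pragma omp target teams num_teams(b) // b: load i16, sext to i32, then
                                        // __kmpc_push_num_teams(..., b, 0)
  {
    a += b; // outlined body: a = a + (int)b
  }
  return a;
}
```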
    diff --git a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..385bd5e89829d
    --- /dev/null
    +++ b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
     +template <class T, class S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
     +  constexpr int n = -1;
     +  int z;
    +  #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
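    A note on accepted forms: the negative tests above pin down the clause syntax by rejection. The sketch below collects uses the diagnostics imply are well-formed; it is inferred from this patch's tests (dyn_groupprivate is new in OpenMP 6.1), and the function name is illustrative.

    void sketch(int n) {
      // a plain non-negative integral size expression
      #pragma omp target dyn_groupprivate(n)
      { }
      // the cgroup modifier (each modifier may appear at most once)
      #pragma omp target dyn_groupprivate(cgroup: n)
      { }
      // a single fallback modifier: abort, null, or default_mem
      #pragma omp target dyn_groupprivate(fallback(default_mem): n)
      { }
    }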
    diff --git a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..ac2cc0dde5073
    --- /dev/null
    +++ b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
    +template <typename T, typename S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
    +  constexpr int n = -1;
    +  int z;
    +  #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
    diff --git a/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
    new file mode 100644
    index 0000000000000..701ebfb43eec6
    --- /dev/null
    +++ b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
    +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
    +
    +void foo() {
    +}
    +
    +bool foobool(int argc) {
    +  return argc;
    +}
    +
    +struct S1; // expected-note {{declared here}}
    +
    +template <typename T, typename S> // expected-note {{declared here}}
    +int tmain(T argc, S **argv) {
    +  T z;
    +  #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(argc+z)
    +  foo();
    +  return 0;
    +}
    +
    +int main(int argc, char **argv) {
    +  constexpr int n = -1;
    +  int z;
    +  #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
    +  foo();
    +  #pragma omp teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
    +  foo();
    +
    +  return tmain(argc, argv);
    +}
    +
    diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c
    deleted file mode 100644
    index f884eeb73c3ff..0000000000000
    --- a/clang/test/OpenMP/thread_limit_amdgpu.c
    +++ /dev/null
    @@ -1,34 +0,0 @@
    -// Test target codegen - host bc file has to be created first.
    -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
    -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
    -// expected-no-diagnostics
    -
    -#ifndef HEADER
    -#define HEADER
    -
    -void foo(int N) {
    -#pragma omp target teams distribute parallel for simd
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd thread_limit(4)
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
    -  for (int i = 0; i < N; ++i)
    -    ;
    -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
    -  for (int i = 0; i < N; ++i)
    -    ;
    -}
    -
    -#endif
    -
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] {
    -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] {
    -
    -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} }
    -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} }
    -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c
    new file mode 100644
    index 0000000000000..4bcc14d070c22
    --- /dev/null
    +++ b/clang/test/OpenMP/thread_limit_gpu.c
    @@ -0,0 +1,41 @@
    +// Test target codegen - host bc file has to be created first.
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc
    +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s
    +// expected-no-diagnostics
    +
    +#ifndef HEADER
    +#define HEADER
    +
    +void foo(int N) {
    +#pragma omp target teams distribute parallel for simd
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd thread_limit(4)
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
    +  for (int i = 0; i < N; ++i)
    +    ;
    +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
    +  for (int i = 0; i < N; ++i)
    +    ;
    +}
    +
    +#endif
    +
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] {
    +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] {
    +
    +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} }
    +
    +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} }
    +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} }
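    As a usage sketch of the mapping these CHECK lines verify (all names illustrative): thread_limit caps the upper bound of the flat work-group size on AMDGPU and becomes omp_target_thread_limit on SPIR-V, and a tighter num_threads lowers that bound further.

    void vec_add(float *a, float *b, float *c, int n) {
      // expected kernel attributes, per the CHECKs above:
      //   AMDGPU: "amdgpu-flat-work-group-size"="1,4"
      //   SPIR-V: "omp_target_thread_limit"="4"
      #pragma omp target teams distribute parallel for thread_limit(4) \
          map(to: a[0:n], b[0:n]) map(tofrom: c[0:n])
      for (int i = 0; i < n; ++i)
        c[i] = a[i] + b[i];
    }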
    diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    index f37d00503fe57..6a507b0990df5 100644
    --- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    +++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
    @@ -1,7 +1,9 @@
    -// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV
    -// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
    +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-half-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-HALF
    +// RUN: not %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2016 -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=SPIRV-INT
    +// RUN: %clang_cc1 -internal-isystem D:\llvm-project\build\x64-Release\lib\clang\19\include -nostdsysteminc -triple spirv-vulkan-library -x hlsl -std=hlsl2021 -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes  -o - %s 2>&1 | FileCheck %s --check-prefix=valid
     
    -// SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
    +// SPIRV-HALF: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
    +// SPIRV-INT: error: '-fnative-int16-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
     
     // valid: "spirv-unknown-vulkan-library"
     // valid: define hidden spir_func void @{{.*main.*}}() #0 {
    diff --git a/clang/test/Parser/lambda-misplaced-capture-default.cpp b/clang/test/Parser/lambda-misplaced-capture-default.cpp
    index d65b875102da7..4f5bd6d7fa5e9 100644
    --- a/clang/test/Parser/lambda-misplaced-capture-default.cpp
    +++ b/clang/test/Parser/lambda-misplaced-capture-default.cpp
    @@ -36,3 +36,12 @@ template  void Test(Args... args) {
       [... xs = &args, &] {};  // expected-error {{capture default must be first}}
     }
     } // namespace misplaced_capture_default_pack
    +
    +namespace GH163498 {
    +struct S {
    +  template  S(T) {}
    +};
    +void t() {
    +  S s{[a(42), &] {}}; // expected-error {{capture default must be first}}
    +}
    +}
    diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c
    new file mode 100644
    index 0000000000000..790547af88bab
    --- /dev/null
    +++ b/clang/test/Parser/ms-empty-enum.c
    @@ -0,0 +1,6 @@
    +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions
    +
    +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}}
    +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}}
    +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\
    +                               // expected-warning {{empty enumeration types are a Microsoft extension}}
    diff --git a/clang/test/ParserHLSL/semantic_parsing.hlsl b/clang/test/ParserHLSL/semantic_parsing.hlsl
    index 726deadb7c44c..bff7bd03189e7 100644
    --- a/clang/test/ParserHLSL/semantic_parsing.hlsl
    +++ b/clang/test/ParserHLSL/semantic_parsing.hlsl
    @@ -12,30 +12,33 @@ void Pony(int GI : SV_IWantAPony) { }
     // expected-note@+1 {{to match this '('}}
     void SuperPony(int GI : 0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic '_'}}
    +// '_' is a valid C++ identifier.
     void MegaPony(int GI : _) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A0A'}}
    +void GargantuanPony(int GI : _1) { }
    +
     void CoolPony(int GI : A0A0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A_'}}
     void NicePony(int GI : A_0) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'A'}}
     void CutePony(int GI : A00) { }
     
    -// expected-error@+3 {{unknown HLSL semantic 'A'}}
     // expected-error@+2 {{expected ')'}}
     // expected-note@+1 {{to match this '('}}
     void DoublePony(int GI : A00 B) { }
     
    -// expected-error@+1 {{unknown HLSL semantic 'é'}}
    -void BigPony(int GI : é) { }
    +// Unicode can be used:
    +// https://timsong-cpp.github.io/cppwp/n3337/charname.allowed
    +void FrenchPony(int GI : garçon_de_café) { }
    +void UnicodePony(int GI : ℮) { }
    +
    +// Since P1949, it seems emojis are not allowed, even if they fall in the
    +// range mentioned in N3337.
    +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1949r7.html
     
     // expected-error@+2 {{unexpected character <U+1F60A>}}
     // expected-error@+1 {{expected HLSL Semantic identifier}}
     void UTFPony(int GI : 😊) { }
     
    -// expected-error@+2 {{character <U+1F60A> not allowed in an identifier}}
    -// expected-error@+1 {{unknown HLSL semantic 'PonyWithA😊'}}
    +// expected-error@+1 {{character <U+1F60A> not allowed in an identifier}}
     void SmilingPony(int GI : PonyWithA😊) { }
    diff --git a/clang/test/Preprocessor/init-riscv.c b/clang/test/Preprocessor/init-riscv.c
    new file mode 100644
    index 0000000000000..4eeecccff4378
    --- /dev/null
    +++ b/clang/test/Preprocessor/init-riscv.c
    @@ -0,0 +1,10 @@
    +// RUN: %clang_cc1 -E -dM -triple=riscv32 < /dev/null | \
    +// RUN:     FileCheck -match-full-lines -check-prefixes=RV32 %s
    +// RUN: %clang_cc1 -E -dM -triple=riscv64 < /dev/null | \
    +// RUN:     FileCheck -match-full-lines -check-prefixes=RV64 %s
    +
    +// RV32: #define __GCC_CONSTRUCTIVE_SIZE 64
    +// RV32: #define __GCC_DESTRUCTIVE_SIZE 64
    +
    +// RV64: #define __GCC_CONSTRUCTIVE_SIZE 64
    +// RV64: #define __GCC_DESTRUCTIVE_SIZE 64
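    These macros are what GCC-compatible C++ standard libraries use to derive std::hardware_constructive_interference_size and std::hardware_destructive_interference_size. A minimal C sketch of the usual false-sharing application, assuming only the values checked above (struct name illustrative):

    struct padded_counters {
      /* 64-byte slots on riscv32/riscv64, per the CHECK lines above */
      _Alignas(__GCC_DESTRUCTIVE_SIZE) long hot;
      _Alignas(__GCC_DESTRUCTIVE_SIZE) long cold;
    };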
    diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
    index cdb46326c2838..cf2cd4a10b056 100644
    --- a/clang/test/Preprocessor/predefined-arch-macros.c
    +++ b/clang/test/Preprocessor/predefined-arch-macros.c
    @@ -1841,7 +1841,6 @@
     // CHECK_DMR_M32: #define __AMX_MOVRS__ 1
     // CHECK_DMR_M32: #define __AMX_TF32__ 1
     // CHECK_GNR_M32: #define __AMX_TILE__ 1
    -// CHECK_DMR_M32: #define __AMX_TRANSPOSE__ 1
     // CHECK_DMR_M32: #define __AVX10_2_512__ 1
     // CHECK_DMR_M32: #define __AVX10_2__ 1
     // CHECK_GNR_M32: #define __AVX2__ 1
    @@ -1947,7 +1946,6 @@
     // CHECK_DMR_M64: #define __AMX_MOVRS__ 1
     // CHECK_DMR_M64: #define __AMX_TF32__ 1
     // CHECK_GNR_M64: #define __AMX_TILE__ 1
    -// CHECK_DMR_M64: #define __AMX_TRANSPOSE__ 1
     // CHECK_DMR_M64: #define __AVX10_2_512__ 1
     // CHECK_DMR_M64: #define __AVX10_2__ 1
     // CHECK_GNR_M64: #define __AVX2__ 1
    diff --git a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    index 26bda6b7be167..f10c79cc9c2d4 100644
    --- a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    +++ b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
    @@ -7,7 +7,7 @@
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH,NOHALF
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL,NOHALF
     // RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,NOHALF
    -// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
    +// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type -fnative-int16-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
     
     // RUN: %clang_cc1 %s -E -dM -o - -triple spirv-unknown-vulkan-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE,NOHALF,SPIRV
     
    diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
    index 77731a9776be8..56c738bc007fb 100644
    --- a/clang/test/Preprocessor/riscv-target-features.c
    +++ b/clang/test/Preprocessor/riscv-target-features.c
    @@ -40,6 +40,7 @@
     // CHECK-NOT: __riscv_smepmp {{.*$}}
     // CHECK-NOT: __riscv_smmpm{{.*$}}
     // CHECK-NOT: __riscv_smnpm{{.*$}}
    +// CHECK-NOT: __riscv_smpmpmt {{.*$}}
     // CHECK-NOT: __riscv_smrnmi {{.*$}}
     // CHECK-NOT: __riscv_smstateen {{.*$}}
     // CHECK-NOT: __riscv_ssaia {{.*$}}
    @@ -1333,6 +1334,14 @@
     // RUN:   -o - | FileCheck --check-prefix=CHECK-SMEPMP-EXT %s
     // CHECK-SMEPMP-EXT: __riscv_smepmp  1000000{{$}}
     
    +// RUN: %clang --target=riscv32 -menable-experimental-extensions \
    +// RUN: -march=rv32ismpmpmt0p6 -x c -E -dM %s \
    +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s
    +// RUN: %clang --target=riscv64 -menable-experimental-extensions \
    +// RUN: -march=rv64ismpmpmt0p6 -x c -E -dM %s \
    +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s
    +// CHECK-SMPMPMT: __riscv_smpmpmt  6000{{$}}
    +
     // RUN: %clang --target=riscv32 \
     // RUN:   -march=rv32ismrnmi1p0 -E -dM %s \
     // RUN:   -o - | FileCheck --check-prefix=CHECK-SMRNMI-EXT %s
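    The expected value encodes the extension version as major * 1000000 + minor * 1000, so experimental Smpmpmt 0.6 yields 6000 (compare __riscv_smepmp, where 1.0 yields 1000000). A small feature probe under that encoding (the HAVE_ macro name is illustrative):

    #if defined(__riscv_smpmpmt) && __riscv_smpmpmt >= 6000
    #  define HAVE_SMPMPMT 1
    #else
    #  define HAVE_SMPMPMT 0
    #endif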
    diff --git a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c
    index 0a863d79adbf6..5ff990d0c40a6 100644
    --- a/clang/test/Preprocessor/unwind-tables.c
    +++ b/clang/test/Preprocessor/unwind-tables.c
    @@ -1,11 +1,13 @@
     // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO
    +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO
     
     // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s
     // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s
     // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s
    +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s
     
     // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM
     // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1
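    __GCC_HAVE_DWARF2_CFI_ASM advertises that the compiler will emit DWARF CFI (.cfi_*) assembler directives, which is what the -f[no-]dwarf2-cfi-asm RUN lines above toggle. A minimal sketch of the common guard used by hand-written assembly (the CFI macro name is illustrative):

    #ifdef __GCC_HAVE_DWARF2_CFI_ASM
    #  define CFI(directive) directive "\n"
    #else
    #  define CFI(directive) /* omit unwind annotations */
    #endif
    /* e.g. inside an asm() statement: CFI(".cfi_def_cfa_offset 16") */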
    diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
    index 5f17641878761..78f8b19459c2f 100644
    --- a/clang/test/Preprocessor/x86_target_features.c
    +++ b/clang/test/Preprocessor/x86_target_features.c
    @@ -526,18 +526,6 @@
     
     // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1
     
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \
    -// RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-TRANSPOSE %s
    -
    -// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1
    -
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \
    -// RUN: -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
    -// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \
    -// RUN: -x c -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-TRANSPOSE %s
    -
    -// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1
    -
     // RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -x c \
     // RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-AVX512 %s
     
    diff --git a/clang/test/Profile/Inputs/c-counter-overflows.proftext b/clang/test/Profile/Inputs/c-counter-overflows.proftext
    index 4d0287c787051..8633060507014 100644
    --- a/clang/test/Profile/Inputs/c-counter-overflows.proftext
    +++ b/clang/test/Profile/Inputs/c-counter-overflows.proftext
    @@ -1,5 +1,5 @@
     main
    -7779561829442898616
    +862032801801816760
     8
     1
     68719476720
    diff --git a/clang/test/Profile/Inputs/c-general.profdata.v12 b/clang/test/Profile/Inputs/c-general.profdata.v12
    new file mode 100644
    index 0000000000000..57a72faaecc85
    Binary files /dev/null and b/clang/test/Profile/Inputs/c-general.profdata.v12 differ
    diff --git a/clang/test/Profile/Inputs/c-general.proftext b/clang/test/Profile/Inputs/c-general.proftext
    index 08280ef39a89d..72e1be6e8846f 100644
    --- a/clang/test/Profile/Inputs/c-general.proftext
    +++ b/clang/test/Profile/Inputs/c-general.proftext
    @@ -7,7 +7,7 @@ simple_loops
     75
     
     conditionals
    -4904767535850050386
    +293081517422662482
     13
     1
     100
    @@ -24,7 +24,7 @@ conditionals
     1
     
     early_exits
    -2880354649761471549
    +574511640547777597
     9
     1
     0
    @@ -37,7 +37,7 @@ early_exits
     0
     
     jumps
    -15051420506203462683
    +63440946314451995
     22
     1
     1
    @@ -86,7 +86,7 @@ switches
     0
     
     big_switch
    -13144136522122330070
    +461999971447013334
     17
     1
     32
    @@ -125,7 +125,7 @@ boolean_operators
     33
     
     boolop_loops
    -12402604614320574815
    +873389568252105055
     13
     1
     50
    @@ -149,7 +149,7 @@ conditional_operator
     1
     
     do_fallthrough
    -8714614136504380050
    +644163604256451218
     4
     1
     10
    diff --git a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    index d880663fed32d..7af509715f8f7 100644
    --- a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    +++ b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
    @@ -1,5 +1,5 @@
     never_called
    -6820425066224770721
    +1055817543190535841
     9
     0
     0
    @@ -17,7 +17,7 @@ main
     1
     
     dead_code
    -5254464978620792806
    +642778960193404902
     10
     1
     0
    diff --git a/clang/test/Profile/Inputs/cxx-rangefor.proftext b/clang/test/Profile/Inputs/cxx-rangefor.proftext
    index d41205bbde147..cfc88da8f9726 100644
    --- a/clang/test/Profile/Inputs/cxx-rangefor.proftext
    +++ b/clang/test/Profile/Inputs/cxx-rangefor.proftext
    @@ -1,5 +1,5 @@
     _Z9range_forv
    -8789831523895825398
    +719380991647896566
     5
     1
     4
    diff --git a/clang/test/Profile/Inputs/cxx-throws.proftext b/clang/test/Profile/Inputs/cxx-throws.proftext
    index 043dea08c728f..92b0eab396844 100644
    --- a/clang/test/Profile/Inputs/cxx-throws.proftext
    +++ b/clang/test/Profile/Inputs/cxx-throws.proftext
    @@ -1,5 +1,5 @@
     _Z6throwsv
    -18172607911962830854
    +878785342860126214
     9
     1
     100
    diff --git a/clang/test/Profile/Inputs/misexpect-switch-default.proftext b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    index 533da91765234..112426e0c7b57 100644
    --- a/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    +++ b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
    @@ -1,6 +1,6 @@
     main
     # Func Hash:
    -8734802134600123338
    +664351602352194506
     # Num Counters:
     9
     # Counter Values:
    diff --git a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    index 0da9379357ae7..99d067c57f16f 100644
    --- a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    +++ b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
    @@ -1,6 +1,6 @@
     main
     # Func Hash:
    -3721743393642630379
    +262978879822089451
     # Num Counters:
     10
     # Counter Values:
    diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
    index 6c779c6facaa2..f35ba1bfb7627 100644
    --- a/clang/test/Profile/c-collision.c
    +++ b/clang/test/Profile/c-collision.c
    @@ -2,8 +2,8 @@
     // RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
     // RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
     
    -// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
    -// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
    +// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 238543884830405146,
    +// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 228238610311337869,
     
     extern int bar;
     void foo(void) {
    diff --git a/clang/test/Profile/c-general.c b/clang/test/Profile/c-general.c
    index ee36a43dac081..6c865e608a037 100644
    --- a/clang/test/Profile/c-general.c
    +++ b/clang/test/Profile/c-general.c
    @@ -4,6 +4,7 @@
     
     // RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%t.profdata | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
    +// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v12 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v5 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use=clang -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
     // Also check compatibility with older profiles.
    diff --git a/clang/test/Sema/PR166843.cpp b/clang/test/Sema/PR166843.cpp
    new file mode 100644
    index 0000000000000..5a6223bccc27e
    --- /dev/null
    +++ b/clang/test/Sema/PR166843.cpp
    @@ -0,0 +1,7 @@
    +// RUN: %clang_cc1 -fsyntax-only %s -verify
    +namespace a {
    +template <typename T>
    +void c() {
    +  ((::c::x)); // expected-error {{'c' is not a class, namespace, or enumeration}}
    +}
    +}
    diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    index 8d4e0c510603a..443ccbbae66db 100644
    --- a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by(f)  __attribute__((counted_by(f)))
     
    @@ -29,7 +29,9 @@ struct on_member_pointer_const_incomplete_ty {
     };
     
     struct on_member_pointer_void_ty {
    -  void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
    +  void* buf __counted_by(count);
       int count;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-or-null-last-field.c b/clang/test/Sema/attr-counted-by-or-null-last-field.c
    index 60a1f571b19e9..d0c50a733acef 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-last-field.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-last-field.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s
    -// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify=expected,immediate %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -fexperimental-late-parse-attributes -verify=expected,late %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     
    @@ -128,7 +128,9 @@ struct on_member_ptr_incomplete_const_ty_ty_pos {
     
     struct on_member_ptr_void_ty_ty_pos {
       int count;
    -  void * ptr __counted_by_or_null(count); // expected-error {{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void * ptr __counted_by_or_null(count);
     };
     
     typedef void(fn_ty)(int);
    diff --git a/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    index 2150c81f9e9be..233b729f87ccd 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-late-parsed-struct-ptrs.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     #define __counted_by(f)  __attribute__((counted_by(f)))
    @@ -30,7 +30,9 @@ struct on_member_pointer_const_incomplete_ty {
     };
     
     struct on_member_pointer_void_ty {
    -  void* buf __counted_by_or_null(count); // expected-error{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void* buf __counted_by_or_null(count);
       int count;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    index 0bb09059c97f9..0fd739ca7d4c3 100644
    --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify %s
    +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -Wpointer-arith -verify %s
     
     #define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
     #define __counted_by(f)  __attribute__((counted_by(f)))
    @@ -32,7 +32,8 @@ struct on_member_pointer_const_incomplete_ty {
     
     struct on_member_pointer_void_ty {
       int count;
    -  // expected-error@+1{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
       void* buf __counted_by_or_null(count);
     };
     
    @@ -124,7 +125,8 @@ struct on_member_pointer_const_incomplete_ty_ty_pos {
     
     struct on_member_pointer_void_ty_ty_pos {
       int count;
    -  // expected-error@+1{{'counted_by_or_null' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-note@+1{{use '__sized_by_or_null' to suppress this warning}}
       void *__counted_by_or_null(count) buf;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
    index c05d18262e2b7..a42f3895695a3 100644
    --- a/clang/test/Sema/attr-counted-by-struct-ptrs.c
    +++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -Wpointer-arith -fexperimental-late-parse-attributes %s -verify
     
     #define __counted_by(f)  __attribute__((counted_by(f)))
     
    @@ -31,7 +31,8 @@ struct on_member_pointer_const_incomplete_ty {
     
     struct on_member_pointer_void_ty {
       int count;
    -  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
       void* buf __counted_by(count);
     };
     
    @@ -123,7 +124,8 @@ struct on_member_pointer_const_incomplete_ty_ty_pos {
     
     struct on_member_pointer_void_ty_ty_pos {
       int count;
    -  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
    +  // expected-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-note@+1{{use '__sized_by' to suppress this warning}}
       void *__counted_by(count) buf;
     };
     
    diff --git a/clang/test/Sema/attr-counted-by-void-ptr-gnu.c b/clang/test/Sema/attr-counted-by-void-ptr-gnu.c
    new file mode 100644
    index 0000000000000..c1ed5f84cf935
    --- /dev/null
    +++ b/clang/test/Sema/attr-counted-by-void-ptr-gnu.c
    @@ -0,0 +1,101 @@
    +// RUN: %clang_cc1 -fsyntax-only -verify=expected-nowarn %s
    +// RUN: %clang_cc1 -Wpointer-arith -fsyntax-only -verify=expected-warn %s
    +// RUN: %clang_cc1 -fexperimental-bounds-safety -fsyntax-only -verify=expected-bounds %s
    +
    +// expected-nowarn-no-diagnostics
    +// expected-bounds-no-diagnostics
    +
    +#define NULL (void*)0
    +#define __counted_by(f)  __attribute__((counted_by(f)))
    +#define __counted_by_or_null(f)  __attribute__((counted_by_or_null(f)))
    +#define __sized_by(f)  __attribute__((sized_by(f)))
    +
    +//==============================================================================
    +// Test: counted_by on void* is allowed (warns with -Wpointer-arith)
    +//==============================================================================
    +
    +struct test_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  void* buf __counted_by(count);
    +};
    +
    +struct test_const_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  const void* buf __counted_by(count);
    +};
    +
    +struct test_volatile_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  volatile void* buf __counted_by(count);
    +};
    +
    +struct test_const_volatile_void_ptr_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by' on a pointer to void is a GNU extension, treated as 'sized_by'}}
    +  // expected-warn-note@+1{{use '__sized_by' to suppress this warning}}
    +  const volatile void* buf __counted_by(count);
    +};
    +
    +// Verify sized_by still works the same way (always allowed, no warning)
    +struct test_sized_by_void_ptr {
    +  int size;
    +  void* buf __sized_by(size);  // OK in both modes, no warning
    +};
    +
    +//==============================================================================
    +// Test: counted_by_or_null on void* behaves the same
    +//==============================================================================
    +
    +struct test_void_ptr_or_null_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-warn-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  void* buf __counted_by_or_null(count);
    +};
    +
    +struct test_const_void_ptr_or_null_gnu {
    +  int count;
    +  // expected-warn-warning@+2{{'counted_by_or_null' on a pointer to void is a GNU extension, treated as 'sized_by_or_null'}}
    +  // expected-warn-note@+1{{use '__sized_by_or_null' to suppress this warning}}
    +  const void* buf __counted_by_or_null(count);
    +};
    +
    +//==============================================================================
    +// Test: Using void* __counted_by(...) pointers (not just declaring them)
    +//==============================================================================
    +
    +// Verify that void* __counted_by pointers can be used as rvalues, assigned to,
    +// passed to functions, etc.
    +
    +void* use_as_rvalue(struct test_void_ptr_gnu* t) {
    +  return t->buf;
    +}
    +
    +void assign_to_pointer(struct test_void_ptr_gnu* t) {
    +  t->buf = NULL;
    +  t->count = 0;
    +}
    +
    +extern void* my_allocator(unsigned long);
    +
    +void assign_from_allocator(struct test_void_ptr_gnu* t) {
    +  t->buf = my_allocator(100);
    +  t->count = 100;
    +}
    +
    +void takes_void_ptr(void* p);
    +
    +void pass_to_function(struct test_void_ptr_gnu* t) {
    +  takes_void_ptr(t->buf);
    +}
    +
    +void* pointer_arithmetic(struct test_void_ptr_gnu* t) {
    +  // expected-warn-warning@+1{{arithmetic on a pointer to void is a GNU extension}}
    +  return t->buf + 10;
    +}
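    Putting the new rule together: under the GNU extension exercised above, a count attached to a void pointer is byte-granular, so __counted_by(len) on void* behaves as __sized_by(len). A minimal sketch (struct and function names illustrative):

    #define __counted_by(f) __attribute__((counted_by(f)))

    struct blob {
      int len;                      /* a byte count, not an element count */
      void *data __counted_by(len); /* treated as __sized_by(len) */
    };

    void blob_set(struct blob *b, void *mem, int nbytes) {
      b->data = mem;
      b->len = nbytes;
    }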
    diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
    index b26a945843696..012c017798a1f 100644
    --- a/clang/test/Sema/attr-nonblocking-constraints.cpp
    +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
    @@ -104,6 +104,25 @@ void nb8c()
     	};
     }
     
    +void nb8d() [[clang::nonblocking]]
    +{
    +	// Blocking methods of a local CXXRecordDecl do not generate diagnostics
    +	// for the outer function.
    +	struct F1 {
    +		void method() { void* ptr = new int; }
    +	};
    +
    +	// Skipping the CXXRecordDecl does not skip a following VarDecl.
    +	struct F2 {
    +		F2() { void* ptr = new int; } // expected-note {{constructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
    +	} f2; // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' constructor 'nb8d()::F2::F2'}}
    +
    +	// Nonblocking methods of a local CXXRecordDecl are verified independently.
    +	struct F3 {
    +		void method() [[clang::nonblocking]] { void* ptr = new int; }// expected-warning {{function with 'nonblocking' attribute must not allocate or deallocate memory}}
    +	};
    +}
    +
     // Make sure template expansions are found and verified.
     	template <typename T>
     	struct Adder {
    @@ -235,16 +254,35 @@ void nb13() [[clang::nonblocking]] { nb12(); }
     // C++ member function pointers
     struct PTMFTester {
     	typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]];
    -
    -	void convert() [[clang::nonblocking]];
    +	typedef void (PTMFTester::*BlockingFunction)();
     
     	ConvertFunction mConvertFunc;
    -};
     
    -void PTMFTester::convert() [[clang::nonblocking]]
    -{
    -	(this->*mConvertFunc)();
    -}
    +	void convert() [[clang::nonblocking]]
    +	{
    +		(this->*mConvertFunc)(); // This should not generate a warning.
    +	}
    +
    +	template <typename T>
    +	struct Holder {
    +		T value;
    +
    +		T& operator*() { return value; }
    +	};
    +
    +	void ptmfInExpr(Holder<ConvertFunction>& holder) [[clang::nonblocking]]
    +	{
    +		(this->*(*holder))();   // Should not generate a warning.
    +		((*this).*(*holder))(); // Should not generate a warning.
    +	}
    +
    +	void ptmfInExpr(Holder<BlockingFunction>& holder) [[clang::nonblocking]]
    +	{
    +		(this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}}
    +		((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}}
    +	}
    +};
     
     // Allow implicit conversion from array to pointer.
     void nb14(unsigned idx) [[clang::nonblocking]]
    @@ -354,6 +392,33 @@ struct Unsafe {
       Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
     };
     
    +// Exercise cases of a temporary with a safe constructor and unsafe destructor.
    +void nb23()
    +{
    +	struct X {
    +		int *ptr = nullptr;
    +		X() {}
    +		~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
    +	};
    +
    +	auto inner = []() [[clang::nonblocking]] {
    +		X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}}
    +	};
    +
    +	auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}}
    +	};
    +
    +}
    +
    +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}}
    +void nb24() {
    +    S2 s;
    +    [&]() [[clang::nonblocking]] {
    +        [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}}
    +        [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}}
    +    }();
    +}
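     +
     +// Note on nb24: each inner lambda captures 's' by copy, so destroying the
     +// temporary closure at the end of the call runs S2::~S2 on that copy; with no
     +// definition of ~S2 visible, it cannot be inferred nonblocking, hence both
     +// diagnostics above.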
    +
     struct DerivedFromUnsafe : public Unsafe {
       DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
       DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}}
    diff --git a/clang/test/Sema/builtins-arm-exclusive-124.c b/clang/test/Sema/builtins-arm-exclusive-124.c
    index b35ac181f0887..93540879a01ba 100644
    --- a/clang/test/Sema/builtins-arm-exclusive-124.c
    +++ b/clang/test/Sema/builtins-arm-exclusive-124.c
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple armv7m -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -triple armv8m.main -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv7m -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv8m.main -fsyntax-only -verify %s
     // RUN: %clang_cc1 -triple armv8.1m.main -fsyntax-only -verify %s
     
     // All these architecture versions provide 1-, 2- or 4-byte exclusive accesses,
    diff --git a/clang/test/Sema/builtins-arm-exclusive-none.c b/clang/test/Sema/builtins-arm-exclusive-none.c
    index 2ef910dd99aaf..25a71e18935a6 100644
    --- a/clang/test/Sema/builtins-arm-exclusive-none.c
    +++ b/clang/test/Sema/builtins-arm-exclusive-none.c
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -triple armv6m -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -triple thumbv6m -fsyntax-only -verify %s
     
     // Armv6-M does not support exclusive loads/stores at all, so all uses of
      // __builtin_arm_ldrex[d] and __builtin_arm_strex[d] are forbidden.
    diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
    index f9df4a6f93e05..37be0e4ebbd28 100644
    --- a/clang/test/Sema/builtins-elementwise-math.c
    +++ b/clang/test/Sema/builtins-elementwise-math.c
    @@ -645,6 +645,42 @@ void test_builtin_elementwise_exp10(int i, float f, double d, float4 v, int3 iv,
       // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned4' (vector of 4 'unsigned int' values))}}
     }
     
    +void test_builtin_elementwise_ldexp(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
    +
    +  struct Foo s = __builtin_elementwise_ldexp(f, i);
    +  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
    +
    +  f = __builtin_elementwise_ldexp();
    +  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
    +
    +  f = __builtin_elementwise_ldexp(f);
    +  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
    +
    +  f = __builtin_elementwise_ldexp(f, i, i);
    +  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
    +
    +  f = __builtin_elementwise_ldexp(i, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int')}}
    +
    +  f = __builtin_elementwise_ldexp(f, f);
    +  // expected-error@-1 {{2nd argument must be a scalar or vector of integer types (was 'float')}}
    +
    +  f = __builtin_elementwise_ldexp(v, iv);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float4' (vector of 4 'float' values) and 'int3' (vector of 3 'int' values))}}
    +
    +  v = __builtin_elementwise_ldexp(v, i);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float4' (vector of 4 'float' values) and 'int')}}
    +
    +  v = __builtin_elementwise_ldexp(f, iv);
    +  // expected-error@-1 {{vector operands do not have the same number of elements ('float' and 'int3' (vector of 3 'int' values))}}
    +
    +  f = __builtin_elementwise_ldexp(u, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned int')}}
    +
    +  f = __builtin_elementwise_ldexp(uv, i);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned4' (vector of 4 'unsigned int' values))}}
    +}
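     +
     +// For illustration (hypothetical, not part of the test): the accepted forms
     +// mirror the other elementwise builtins -- floating-point first operand,
     +// integer second operand, scalar/scalar or vector/vector with matching
     +// element counts:
     +//   typedef int int4 __attribute__((ext_vector_type(4))); // hypothetical
     +//   int4 iv4 = {0, 1, 2, 3};                              // hypothetical
     +//   float  r1 = __builtin_elementwise_ldexp(f, i);   // scalar: f * 2^i
     +//   float4 r2 = __builtin_elementwise_ldexp(v, iv4); // elementwise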
    +
     void test_builtin_elementwise_floor(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
     
       struct Foo s = __builtin_elementwise_floor(f);
    diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c
    index 78f81c484c3d5..6b4adc23dca8d 100644
    --- a/clang/test/Sema/labeled-break-continue.c
    +++ b/clang/test/Sema/labeled-break-continue.c
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s
    -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s
    -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s
    +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s
    +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s
    +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s
     
     void f1() {
       l1: while (true) {
    @@ -159,3 +159,15 @@ void f7() {
         continue d; // expected-error {{'continue' label does not name an enclosing loop}}
       }
     }
    +
    +void f8() {
    +  l1: // no-warning
    +  while (true) {
    +    break l1;
    +  }
    +
    +  l2: // no-warning
    +  while (true) {
    +    continue l2;
    +  }
    +}
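     +
     +// The -Wunused RUN lines above verify that a label is considered used when a
     +// named 'break' or 'continue' refers to it, so neither l1 nor l2 is reported
     +// as unused.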
    diff --git a/clang/test/Sema/statements.c b/clang/test/Sema/statements.c
    index d44ab5a65d5af..28740fa295768 100644
    --- a/clang/test/Sema/statements.c
    +++ b/clang/test/Sema/statements.c
    @@ -119,14 +119,15 @@ void test_pr22849(void) {
       };
     }
     
    -// GCC ignores empty statements at the end of compound expressions where the
    -// result type is concerned.
     +// An empty statement at the end of a compound expression gives it result type 'void'.
     void test13(void) {
       int a;
       a = ({ 1; });
    -  a = ({1;; });
    +  a = ({ 1; 2; }); // expected-warning {{expression result unused}}
    +  a = ({ 1;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    +                 // expected-warning@-1 {{expression result unused}}
       a = ({int x = 1; (void)x; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    -  a = ({int x = 1; (void)x;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
    +  a = ({int x = 1;; }); // expected-error {{assigning to 'int' from incompatible type 'void'}}
     }
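     +
     +// In short: a statement expression takes its type from its last statement, so
     +//   ({ 1; })  has type 'int' and value 1, while
     +//   ({ 1;; }) ends in an empty statement and therefore has type 'void'.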
     
     void test14(void) { return ({}); }
    diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    index 31148b990d6bd..e9515b5d61006 100644
    --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
    @@ -414,3 +414,20 @@ void test_use_lifetimebound_call() {
     // CHECK:   Expire ([[L_Y]] (Path: y))
     // CHECK:   Expire ([[L_X]] (Path: x))
     }
    +// CHECK-LABEL: Function: test_conditional_operator
    +void test_conditional_operator(bool cond) {
    +  MyObj x, y;
    +  MyObj *p = cond ? &x : &y;
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   Issue ([[L_X:[0-9]+]] (Path: x), ToOrigin: [[O_DRE_X:[0-9]+]] (Expr: DeclRefExpr))
    +// CHECK:   OriginFlow (Dest: [[O_ADDR_X:[0-9]+]] (Expr: UnaryOperator), Src: [[O_DRE_X]] (Expr: DeclRefExpr))
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   Issue ([[L_Y:[0-9]+]] (Path: y), ToOrigin: [[O_DRE_Y:[0-9]+]] (Expr: DeclRefExpr))
    +// CHECK:   OriginFlow (Dest: [[O_ADDR_Y:[0-9]+]] (Expr: UnaryOperator), Src: [[O_DRE_Y]] (Expr: DeclRefExpr))
    +// CHECK: Block B{{[0-9]+}}:
    +// CHECK:   OriginFlow (Dest: [[O_COND_OP:[0-9]+]] (Expr: ConditionalOperator), Src: [[O_ADDR_X]] (Expr: UnaryOperator))
    +// CHECK:   OriginFlow (Dest: [[O_COND_OP]] (Expr: ConditionalOperator), Src: [[O_ADDR_Y]] (Expr: UnaryOperator), Merge)
    +// CHECK:   OriginFlow (Dest: [[O_P:[0-9]+]] (Decl: p), Src: [[O_COND_OP]] (Expr: ConditionalOperator))
    +// CHECK:   Expire ([[L_Y]] (Path: y))
    +// CHECK:   Expire ([[L_X]] (Path: x))
    +}
    diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp
    index 4f234f0ac6e2d..b9368db550805 100644
    --- a/clang/test/Sema/warn-lifetime-safety.cpp
    +++ b/clang/test/Sema/warn-lifetime-safety.cpp
    @@ -440,6 +440,7 @@ void no_error_loan_from_current_iteration(bool cond) {
     //===----------------------------------------------------------------------===//
     
     View Identity(View v [[clang::lifetimebound]]);
    +MyObj* Identity(MyObj* v [[clang::lifetimebound]]);
     View Choose(bool cond, View a [[clang::lifetimebound]], View b [[clang::lifetimebound]]);
     MyObj* GetPointer(const MyObj& obj [[clang::lifetimebound]]);
     
    @@ -582,3 +583,106 @@ void lifetimebound_ctor() {
       }
       (void)v;
     }
    +
    +// Conditional operator.
    +void conditional_operator_one_unsafe_branch(bool cond) {
    +  MyObj safe;
    +  MyObj* p = &safe;
    +  {
    +    MyObj temp;
    +    p = cond ? &temp  // expected-warning {{object whose reference is captured may not live long enough}}
    +             : &safe;
    +  }  // expected-note {{destroyed here}}
    +
     +  // This is not a use-after-free for any value of `cond`, but the analysis
     +  // cannot prove that, so the warning above is a false positive. The
     +  // reassignment below ensures safety regardless of cond's value.
    +  if (cond) 
    +    p = &safe;
    +  (void)*p;  // expected-note {{later used here}}
    +}
    +
    +void conditional_operator_two_unsafe_branches(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = cond ? &a   // expected-warning {{object whose reference is captured does not live long enough}}
    +             : &b;  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
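     +
     +// Loans from both arms of the conditional flow into 'p', so each arm is
     +// diagnosed independently.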
    +
    +void conditional_operator_nested(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b, c, d;
     +    p = cond ? cond ? &a    // expected-warning {{object whose reference is captured does not live long enough}}
     +                    : &b    // expected-warning {{object whose reference is captured does not live long enough}}
     +             : cond ? &c    // expected-warning {{object whose reference is captured does not live long enough}}
     +                    : &d;   // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = Identity(cond ? &a    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : &b);  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound_nested(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b;
    +    p = Identity(cond ? Identity(&a)    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : Identity(&b));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 2 {{destroyed here}}
    +  (void)*p;  // expected-note 2 {{later used here}}
    +}
    +
    +void conditional_operator_lifetimebound_nested_deep(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a, b, c, d;
    +    p = Identity(cond ? Identity(cond ? &a     // expected-warning {{object whose reference is captured does not live long enough}}
    +                                      : &b)    // expected-warning {{object whose reference is captured does not live long enough}}
    +                      : Identity(cond ? &c     // expected-warning {{object whose reference is captured does not live long enough}}
    +                                      : &d));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    +
    +void parentheses(bool cond) {
    +  MyObj* p;
    +  {
    +    MyObj a;
    +    p = &((((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }                  // expected-note {{destroyed here}}
    +  (void)*p;          // expected-note {{later used here}}
    +
    +  {
    +    MyObj a;
    +    p = ((GetPointer((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }                           // expected-note {{destroyed here}}
    +  (void)*p;                   // expected-note {{later used here}}
    +
    +  {
    +    MyObj a, b, c, d;
     +    p = &(cond ? (cond ? a     // expected-warning {{object whose reference is captured does not live long enough}}
     +                       : b)    // expected-warning {{object whose reference is captured does not live long enough}}
     +               : (cond ? c     // expected-warning {{object whose reference is captured does not live long enough}}
     +                       : d));  // expected-warning {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +
    +  {
    +    MyObj a, b, c, d;
     +    p = ((cond ? (((cond ? &a : &b)))   // expected-warning 2 {{object whose reference is captured does not live long enough}}
     +              : &(((cond ? c : d)))));  // expected-warning 2 {{object whose reference is captured does not live long enough}}
    +  }  // expected-note 4 {{destroyed here}}
    +  (void)*p;  // expected-note 4 {{later used here}}
    +}
    diff --git a/clang/test/Sema/warn-unreachable-file-scope.c b/clang/test/Sema/warn-unreachable-file-scope.c
    new file mode 100644
    index 0000000000000..64a6918cbcf77
    --- /dev/null
    +++ b/clang/test/Sema/warn-unreachable-file-scope.c
    @@ -0,0 +1,37 @@
    +// RUN: %clang_cc1 -fsyntax-only -verify %s
    +
    +typedef unsigned char u8;
    +
    +u8 a1 = (0 ? 0xffff : 0xff);
    +u8 a2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +u8 a3 = (1 ? 0xff : 0xffff);
    +u8 a4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
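     +
     +// Only the arm selected by the constant condition is checked, so the
     +// truncating conversion is diagnosed in the live arm only, even at file scope.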
    +
    +unsigned long long b1 = 1 ? 0 : 1ULL << 64;
    +unsigned long long b2 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}}
    +unsigned long long b3 = 1 ? 1ULL << 64 : 0; // expected-warning {{shift count >= width of type}}
    +
    +#define M(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
    +unsigned long long c1 = M(64);
    +unsigned long long c2 = M(32);
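     +
     +// Rationale: '1ULL << 64' would be UB (shift count >= width), so M(n)
     +// special-cases n == 64. With the dead arm skipped, both uses fold cleanly:
     +// M(64) == ~0ULL == 0xFFFFFFFFFFFFFFFF and M(32) == 0xFFFFFFFF.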
    +
    +static u8 d1 = (0 ? 0xffff : 0xff);
    +static u8 d2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +
    +int a = 1 ? 6 : (1,2);
    +int b = 0 ? 6 : (1,2); // expected-warning {{left operand of comma operator has no effect}}
    +
    +void f(void) {
    +  u8 e1 = (0 ? 0xffff : 0xff);
    +  u8 e2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +
    +  unsigned long long e3 = 1 ? 0 : 1ULL << 64;
    +  unsigned long long e4 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}}
    +}
    +
    +void statics(void) {
    +  static u8 f1 = (0 ? 0xffff : 0xff);
    +  static u8 f2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +  static u8 f3 = (1 ? 0xff : 0xffff);
    +  static u8 f4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}}
    +}
    diff --git a/clang/test/SemaCUDA/error-includes-mode.cu b/clang/test/SemaCUDA/error-includes-mode.cu
    index 257fdeceef654..f775e656b07a1 100644
    --- a/clang/test/SemaCUDA/error-includes-mode.cu
    +++ b/clang/test/SemaCUDA/error-includes-mode.cu
    @@ -1,7 +1,16 @@
     // RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck --check-prefix HOST %s
     // RUN: not %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 \
     // RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix SM35 %s
    +// RUN: not %clang_cc1 -triple spirv64-unknown-unknown \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix SPIRV %s
    +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix AMDGCNSPIRV %s
    +// RUN: not %clang_cc1 -triple spirv64-intel-unknown \
    +// RUN:   -fcuda-is-device -fsyntax-only %s 2>&1 | FileCheck --check-prefix INTELSPIRV %s
     
     // HOST: 1 error generated when compiling for host
     // SM35: 1 error generated when compiling for sm_35
    +// SPIRV: 1 error generated when compiling for spirv64-unknown-unknown
    +// AMDGCNSPIRV: 1 error generated when compiling for spirv64-amd-amdhsa
    +// INTELSPIRV: 1 error generated when compiling for spirv64-intel-unknown
     error;
    diff --git a/clang/test/SemaCXX/attr-callback-broken.cpp b/clang/test/SemaCXX/attr-callback-broken.cpp
    index a5469b22ba350..53b331a49251b 100644
    --- a/clang/test/SemaCXX/attr-callback-broken.cpp
    +++ b/clang/test/SemaCXX/attr-callback-broken.cpp
    @@ -1,7 +1,12 @@
    -// RUN: %clang_cc1 %s -verify -fsyntax-only
    +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
     
     class C_in_class {
     #define HAS_THIS
     #include "../Sema/attr-callback-broken.c"
     #undef HAS_THIS
     };
    +
    +class ExplicitParameterObject {
    +  __attribute__((callback(2, 0))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));           // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
    +  __attribute__((callback(2, this))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*)); // expected-error {{'callback' argument at position 2 references unavailable implicit 'this'}}
    +};
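     +
     +// With an explicit object parameter there is no implicit 'this', so the
     +// callback attribute can name it neither by index 0 nor as 'this'.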
    diff --git a/clang/test/SemaCXX/attr-callback.cpp b/clang/test/SemaCXX/attr-callback.cpp
    index ee02f7d3d24f7..ff5a241e92f74 100644
    --- a/clang/test/SemaCXX/attr-callback.cpp
    +++ b/clang/test/SemaCXX/attr-callback.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 %s -verify -fsyntax-only
    +// RUN: %clang_cc1 %s -std=c++23 -verify -fsyntax-only
     
     // expected-no-diagnostics
     
    @@ -6,6 +6,11 @@ class C_in_class {
     #include "../Sema/attr-callback.c"
     };
     
    +class ExplicitParameterObject {
    +  __attribute__((callback(2, 1))) void explicit_this_idx(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +  __attribute__((callback(2, self))) void explicit_this_identifier(this ExplicitParameterObject* self, void (*callback)(ExplicitParameterObject*));
    +};
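     +
     +// Unlike the implicit-'this' case, the explicit object parameter is an
     +// ordinary parameter, so the callback attribute can reference it by its
     +// index (1) or by its name ('self').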
    +
     struct Base {
     
       void no_args_1(void (*callback)(void));
    diff --git a/clang/test/SemaCXX/attr-format.cpp b/clang/test/SemaCXX/attr-format.cpp
    index adc05fc46776c..c0aeb5d07dfe9 100644
    --- a/clang/test/SemaCXX/attr-format.cpp
    +++ b/clang/test/SemaCXX/attr-format.cpp
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -fsyntax-only -Wformat-nonliteral -verify %s
    +// RUN: %clang_cc1 -fsyntax-only -std=c++23 -Wformat-nonliteral -verify %s
      #include <stdarg.h>
     
     int printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
    @@ -11,6 +11,10 @@ struct S {
       // the format argument is argument 2 here.
       void g(const char*, ...) __attribute__((format(printf, 2, 3)));
       const char* g2(const char*) __attribute__((format_arg(2)));
     +  // Since C++23, 'this' can also be specified explicitly.
    +  void g3(this S&, const char *, ...) __attribute__((format(printf, 2, 3)));
    +  void g4(this const char* s, ...) __attribute__((format(printf, 1, 2)));
    +  consteval operator const char*() const { return "%f"; } // #g4_fmt_string
     
       void h(const char*, ...) __attribute__((format(printf, 1, 4))); // \
           expected-error{{implicit this argument as the format string}}
    @@ -18,10 +22,17 @@ struct S {
           expected-error{{out of bounds}}
       const char* h3(const char*) __attribute__((format_arg(1))); // \
           expected-error{{invalid for the implicit this argument}}
    +  void h4(this S&, const char *, ...) __attribute__((format(printf, 1, 3))); // \
    +      expected-error {{format argument not a string type}}
     
       void operator() (const char*, ...) __attribute__((format(printf, 2, 3)));
     };
     
    +void s() {
    +  S().g4(4); // expected-warning {{format specifies type 'double' but the argument has type 'int'}}
    +             // expected-note@#g4_fmt_string {{format string is defined here}}
    +}
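     +
     +// In S().g4(4) the object itself is the format string: S converts to "%f" via
     +// the consteval conversion operator, so the 'int' argument 4 mismatches the
     +// 'double' conversion specifier.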
    +
     // PR5521
     struct A { void a(const char*,...) __attribute((format(printf,2,3))); };
     void b(A x) {
    diff --git a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    index 70a5fe5a45376..8606592c6b771 100644
    --- a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    +++ b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp
    @@ -44,4 +44,7 @@ struct T {
       {
         s.captureInt(x);
       }
    +
    +  void explicit_this1(this T& self, const int &x [[clang::lifetime_capture_by(self)]]);
    +  void explicit_this2(this T& self, const int &x [[clang::lifetime_capture_by(this)]]); // expected-error {{argument references unavailable implicit 'this'}}
     };
    diff --git a/clang/test/SemaCXX/attr-mode-tmpl.cpp b/clang/test/SemaCXX/attr-mode-tmpl.cpp
    index f665b1ba49123..3a1da3b358af4 100644
    --- a/clang/test/SemaCXX/attr-mode-tmpl.cpp
    +++ b/clang/test/SemaCXX/attr-mode-tmpl.cpp
    @@ -45,7 +45,7 @@ void CheckMachineMode() {
     
     // Check attributes on function parameters.
      template <class T1, class T2>
    -void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note{{ignored: substitution failure}} expected-note-re{{not viable: no known conversion from '{{.*}}' (vector of 4 '{{.*}}' values) to 'EnumType' for 2nd argument}}
    +void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note{{ignored: substitution failure}} expected-note{{ignored: substitution failure [with T1 = int, T2 = int]: type of machine mode does not match type of base type}}
                          T1 __attribute__((mode(V4DI))) paramV4DI,   // expected-warning{{deprecated}}
                          T2 __attribute__((mode(SF)))   paramSF,
                          T2 __attribute__((mode(V4DF))) paramV4DF) { // expected-warning{{deprecated}}
    diff --git a/clang/test/SemaCXX/attr-nonnull.cpp b/clang/test/SemaCXX/attr-nonnull.cpp
    index 6f9119b519d09..0fba6b50cb354 100644
    --- a/clang/test/SemaCXX/attr-nonnull.cpp
    +++ b/clang/test/SemaCXX/attr-nonnull.cpp
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -fsyntax-only -verify %s
    -// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
    +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
    +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
     struct S {
       S(const char *) __attribute__((nonnull(2)));
     
    @@ -11,6 +11,13 @@ struct S {
     
       void h(const char*) __attribute__((nonnull(1))); // \
           expected-error{{invalid for the implicit this argument}}
    +
    +  void i(this S* self, const char*) __attribute__((nonnull(1)));
    +
    +  void j(this S* self, const char*) __attribute__((nonnull(2)));
    +
    +  void k(this S* self, const char*) __attribute__((nonnull(3))); // \
    +      expected-error{{'nonnull' attribute parameter 1 is out of bounds}}
     };
     
     void test() {
    diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
    index bea90ff7eaf8a..1fc6e5ec4cc55 100644
    --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
    +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
    @@ -1450,3 +1450,9 @@ namespace GH149500 {
       unsigned int * p = &(*(unsigned int *)0x400);
       static const void *q = &(*(const struct sysrq_key_op *)0);
     }
    +
    +constexpr bool missingCase() {
    +  switch (1) {
    +    1u: return false; // expected-error {{expected 'case' keyword before expression}}
    +  }
    +}
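     +
     +// The interesting part is the recovery: '1u:' inside a switch is diagnosed as
     +// a missing 'case' keyword instead of producing a pile of unrelated parse
     +// errors.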
    diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
    index ce862666aa48f..a594a1a44337b 100644
    --- a/clang/test/SemaCXX/cxx23-assume.cpp
    +++ b/clang/test/SemaCXX/cxx23-assume.cpp
    @@ -108,7 +108,8 @@ constexpr bool f4() {
      template <typename T>
      concept C = f4<T>(); // expected-note 3 {{in instantiation of}}
                          // expected-note@-1 3 {{while substituting}}
    -                     // expected-error@-2 2 {{resulted in a non-constant expression}}
    +                     // expected-error@-2 {{resulted in a non-constant expression}}
    +                     // expected-note@-3 {{because substituted constraint expression is ill-formed: substitution into constraint expression resulted in a non-constant expression}}
     
     struct D {
       int x;
    @@ -130,13 +131,13 @@ constexpr int f5() requires C { return 1; } // expected-note {{while checking
                                                    // expected-note@-1 {{candidate template ignored}}
     
      template <typename T>
     -constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while checking the satisfaction}} \
     -                                                  // expected-note 4 {{while substituting template arguments}} \
     +constexpr int f5() requires (!C<T>) { return 2; } // expected-note 3 {{while checking the satisfaction}} \
    +                                                  // expected-note 3 {{while substituting template arguments}} \
                                                       // expected-note {{candidate template ignored}}
     
      static_assert(f5<int>() == 1);
     -static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}}
     -                             // expected-note@-1 3 {{while substituting deduced template arguments}}
     +static_assert(f5<D>() == 1); // expected-note 2 {{while checking constraint satisfaction}}
    +                             // expected-note@-1 2 {{while substituting deduced template arguments}}
                                  // expected-error@-2 {{no matching function for call}}
     
      static_assert(f5<double>() == 2);
    @@ -170,7 +171,7 @@ foo (int x, int y)
     
     // Do not crash when assumptions are unreachable.
     namespace gh106898 {
    -int foo () { 
    +int foo () {
         while(1);
         int a = 0, b = 1;
         __attribute__((assume (a < b)));
    diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    index 331fe8387e1c7..ff104243a9735 100644
    --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
    @@ -1,5 +1,7 @@
     // RUN: %clang_cc1 -std=c++2a -Wno-unused-value %s -verify
    +// RUN: %clang_cc1 -std=c++2a -Wno-unused-value %s -verify -fexperimental-new-constant-interpreter
     // RUN: %clang_cc1 -std=c++2b -Wno-unused-value %s -verify
    +// RUN: %clang_cc1 -std=c++2b -Wno-unused-value %s -verify -fexperimental-new-constant-interpreter
     
     consteval int id(int i) { return i; }
     constexpr char id(char c) { return c; }
    diff --git a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    index 76866c4269474..9ce0c5a7434f5 100644
    --- a/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    +++ b/clang/test/SemaCXX/cxx2b-warn-shadow.cpp
    @@ -11,3 +11,29 @@ struct Foo {
       }
     };
     } // namespace GH95707
    +
    +namespace GH163731 {
    +struct S1 {
    +  int a;
    +  void m(this S1 &self) {
    +    auto lambda = [](int a) { return a; };
    +  }
    +};
    +
    +struct S2 {
    +  int a;
    +  void m(this S2 &self) {
    +    int a = 1;                // expected-note {{previous declaration is here}}
    +    auto lambda = [](int a) { // expected-warning {{declaration shadows a local variable}}
    +      return a;
    +    };
    +  }
    +};
    +
    +struct S3 {
    +  int a;
    +  void m(this S3 &self) {
    +    auto lambda = [self](int a) { return a + self.a; };
    +  }
    +};
    +}
    diff --git a/clang/test/SemaCXX/dependent-switch-case.cpp b/clang/test/SemaCXX/dependent-switch-case.cpp
    new file mode 100644
    index 0000000000000..bbeab3a650f4d
    --- /dev/null
    +++ b/clang/test/SemaCXX/dependent-switch-case.cpp
    @@ -0,0 +1,6 @@
    +// RUN: %clang_cc1 -std=c++20 %s -verify
    +// RUN: %clang_cc1 -std=c++20 %s -verify -fexperimental-new-constant-interpreter
    +
    +constexpr bool e(int){switch(0)0=0:return t(;} // expected-error {{expression is not assignable}} \
    +                                               // expected-error {{expected 'case' keyword before expression}} \
    +                                               // expected-error {{expected expression}}
    diff --git a/clang/test/SemaCXX/dllexport.cpp b/clang/test/SemaCXX/dllexport.cpp
    index f503e2fc311d1..169af5cacc6c7 100644
    --- a/clang/test/SemaCXX/dllexport.cpp
    +++ b/clang/test/SemaCXX/dllexport.cpp
    @@ -1,13 +1,13 @@
    -// RUN: %clang_cc1 -triple i686-win32             -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DMS  %s
    -// RUN: %clang_cc1 -triple x86_64-win32           -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DMS  %s
    -// RUN: %clang_cc1 -triple i686-mingw32           -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple x86_64-mingw32         -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple i686-pc-cygwin         -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple x86_64-pc-cygwin       -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s
    -// RUN: %clang_cc1 -triple i686-windows-itanium   -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DWI  %s
    -// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DWI  %s
    -// RUN: %clang_cc1 -triple x86_64-scei-ps4        -fsyntax-only -fdeclspec      -verify -std=c++11 -Wunsupported-dll-base-class-template -DPS  %s
    -// RUN: %clang_cc1 -triple x86_64-sie-ps5         -fsyntax-only -fdeclspec      -verify -std=c++1y -Wunsupported-dll-base-class-template -DPS  %s
    +// RUN: %clang_cc1 -triple i686-win32             -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps        -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-win32           -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps        -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-mingw32           -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-mingw32         -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-pc-cygwin         -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-pc-cygwin       -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu      -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple i686-windows-itanium   -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu  -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu  -std=c++1y -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-scei-ps4        -fsyntax-only -fdeclspec      -verify=expected,non-ms,non-gnu,ms-ps    -std=c++11 -Wunsupported-dll-base-class-template %s
    +// RUN: %clang_cc1 -triple x86_64-sie-ps5         -fsyntax-only -fdeclspec      -verify=expected,non-ms,non-gnu,ms-ps    -std=c++1y -Wunsupported-dll-base-class-template %s
     
     // Helper structs to make templates more expressive.
     struct ImplicitInst_Exported {};
    @@ -75,9 +75,7 @@ __declspec(dllexport) extern int GlobalRedecl4; // expected-warning{{redeclarati
     // External linkage is required.
     __declspec(dllexport) static int StaticGlobal; // expected-error{{'StaticGlobal' must have external linkage when declared 'dllexport'}}
     __declspec(dllexport) Internal InternalTypeGlobal; // expected-error{{'InternalTypeGlobal' must have external linkage when declared 'dllexport'}}
    -#ifndef MS
    -namespace    { __declspec(dllexport) int InternalGlobal; } // expected-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}}
    -#endif
    +namespace    { __declspec(dllexport) int InternalGlobal; } // non-ms-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}}
     namespace ns { __declspec(dllexport) int ExternalGlobal; }
     
     __declspec(dllexport) auto InternalAutoTypeGlobal = Internal(); // expected-error{{'InternalAutoTypeGlobal' must have external linkage when declared 'dllexport'}}
    @@ -132,9 +130,7 @@ template __declspec(dllexport) extern int VarTmplRedecl3; // expecte
     // External linkage is required.
      template <typename T> __declspec(dllexport) static int StaticVarTmpl; // expected-error{{'StaticVarTmpl' must have external linkage when declared 'dllexport'}}
      template <typename T> __declspec(dllexport) Internal InternalTypeVarTmpl; // expected-error{{'InternalTypeVarTmpl' must have external linkage when declared 'dllexport'}}
     -#ifndef MS
     -namespace    { template <typename T> __declspec(dllexport) int InternalVarTmpl; } // expected-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}}
     -#endif
     +namespace    { template <typename T> __declspec(dllexport) int InternalVarTmpl; } // non-ms-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}}
      namespace ns { template <typename T> __declspec(dllexport) int ExternalVarTmpl = 1; }
      
      template <typename T> __declspec(dllexport) auto InternalAutoTypeVarTmpl = Internal(); // expected-error{{'InternalAutoTypeVarTmpl' must have external linkage when declared 'dllexport'}}
    @@ -355,11 +351,8 @@ class __declspec(dllexport) ClassDecl;
     
     class __declspec(dllexport) ClassDef {};
     
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-warning@+3{{'dllexport' attribute ignored}}
    -#endif
      template <typename T> struct PartiallySpecializedClassTemplate {};
     -template <typename T> struct __declspec(dllexport) PartiallySpecializedClassTemplate<T*> { void f() {} };
     +template <typename T> struct __declspec(dllexport) PartiallySpecializedClassTemplate<T*> { void f() {} }; // non-gnu-warning {{'dllexport' attribute ignored}}
      
      template <typename T> struct ExpliciallySpecializedClassTemplate {};
      template <> struct __declspec(dllexport) ExpliciallySpecializedClassTemplate<int> { void f() {} };
    @@ -373,16 +366,11 @@ ImplicitlyInstantiatedExportedTemplate implicitlyInstantiatedExp
     
     // Don't instantiate class members of templates with explicit instantiation declarations, even if they are exported.
     struct IncompleteType2;
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-note@+2{{attribute is here}}
    -#endif
     -template <typename T> struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl {
     +
     +template <typename T> struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl { // non-gnu-note {{attribute is here}}
        int f() { return sizeof(T); } // no-error
      };
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}}
     -#endif
     -extern template struct ExportedTemplateWithExplicitInstantiationDecl<IncompleteType2>;
     +extern template struct ExportedTemplateWithExplicitInstantiationDecl<IncompleteType2>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}}
     
     // Instantiate class members for explicitly instantiated exported templates.
     struct IncompleteType3; // expected-note{{forward declaration of 'IncompleteType3'}}
    @@ -392,16 +380,9 @@ template  struct __declspec(dllexport) ExplicitlyInstantiatedExporte
      template struct ExplicitlyInstantiatedExportedTemplate<IncompleteType3>; // expected-note{{in instantiation of member function 'ExplicitlyInstantiatedExportedTemplate<IncompleteType3>::f' requested here}}
     
     // In MS mode, instantiate members of class templates that are base classes of exported classes.
    -#if defined(MS) || defined(PS)
    -  // expected-note@+3{{forward declaration of 'IncompleteType4'}}
    -  // expected-note@+3{{in instantiation of member function 'BaseClassTemplateOfExportedClass::f' requested here}}
    -#endif
    -struct IncompleteType4;
     -template <typename T> struct BaseClassTemplateOfExportedClass {
     -#if defined(MS) || defined(PS)
     -  // expected-error@+2{{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}}
     -#endif
     -  int f() { return sizeof(T); };
     +struct IncompleteType4; // ms-ps-note {{forward declaration of 'IncompleteType4'}}
     +template <typename T> struct BaseClassTemplateOfExportedClass { // ms-ps-note {{in instantiation of member function 'BaseClassTemplateOfExportedClass<IncompleteType4>::f' requested here}}
     +  int f() { return sizeof(T); }; // ms-ps-error {{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}}
      };
      struct __declspec(dllexport) ExportedBaseClass : public BaseClassTemplateOfExportedClass<IncompleteType4> {};
     
    @@ -414,17 +395,11 @@ struct __declspec(dllexport) ExportedBaseClass2 : public ExportedBaseClassTempla
     
     // Warn about explicit instantiation declarations of dllexport classes.
      template <typename T> struct ExplicitInstantiationDeclTemplate {};
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}} expected-note@+2{{attribute is here}}
     -#endif
     -extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate<int>;
     +extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate<int>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}} \
     +                                                                                        non-gnu-note {{attribute is here}}
      
     -template <typename T> struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {};
     -#if defined(MS) || defined (WI) || defined(PS)
     -// expected-note@-2{{attribute is here}}
     -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}}
     -#endif
     -extern template struct ExplicitInstantiationDeclExportedTemplate<int>;
     +template <typename T> struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {}; // non-gnu-note {{attribute is here}}
     +extern template struct ExplicitInstantiationDeclExportedTemplate<int>; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}}
     
     namespace { struct InternalLinkageType {}; }
     struct __declspec(dllexport) PR23308 {
    @@ -440,35 +415,23 @@ class __declspec(dllexport) ExportedClass {};
     class __declspec(dllimport) ImportedClass {};
     
      template <typename T> class ClassTemplate {};
     -#if not defined(MS) && not defined(PS)
     -// expected-error@+2{{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}}
     -#endif
     -template <typename T> class __declspec(dllexport) ExportedClassTemplate {};
     +template <typename T> class __declspec(dllexport) ExportedClassTemplate {}; // win-gnu-error {{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}}
      template <typename T> class __declspec(dllimport) ImportedClassTemplate {};
      
      template <typename T> struct ExplicitlySpecializedTemplate { void func() {} };
     -#if defined(MS) || defined(PS)
     -// expected-note@+2{{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}}
     -#endif
     -template <> struct ExplicitlySpecializedTemplate<int> { void func() {} };
     +template <> struct ExplicitlySpecializedTemplate<int> { void func() {} }; // ms-ps-note {{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}}
      template <typename T> struct ExplicitlyExportSpecializedTemplate { void func() {} };
      template <> struct __declspec(dllexport) ExplicitlyExportSpecializedTemplate<int> { void func() {} };
      template <typename T> struct ExplicitlyImportSpecializedTemplate { void func() {} };
      template <> struct __declspec(dllimport) ExplicitlyImportSpecializedTemplate<int> { void func() {} };
      
      template <typename T> struct ExplicitlyInstantiatedTemplate { void func() {} };
     -#if defined(MS) || defined(PS)
     -// expected-note@+2{{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}}
     -#endif
     -template struct ExplicitlyInstantiatedTemplate<int>;
     +template struct ExplicitlyInstantiatedTemplate<int>; // ms-ps-note {{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}}
      template <typename T> struct ExplicitlyExportInstantiatedTemplate { void func() {} };
      template struct __declspec(dllexport) ExplicitlyExportInstantiatedTemplate<int>;
      template <typename T> struct ExplicitlyExportDeclaredInstantiatedTemplate { void func() {} };
      extern template struct ExplicitlyExportDeclaredInstantiatedTemplate<int>;
     -#if not defined(MS) && not defined (WI) && not defined(PS)
     -// expected-warning@+2{{'dllexport' attribute ignored on explicit instantiation definition}}
     -#endif
     -template struct __declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate<int>;
     +template struct __declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate<int>; // gnu-warning {{'dllexport' attribute ignored on explicit instantiation definition}}
      template <typename T> struct ExplicitlyImportInstantiatedTemplate { void func() {} };
      template struct __declspec(dllimport) ExplicitlyImportInstantiatedTemplate<int>;
     
    @@ -496,11 +459,8 @@ class __declspec(dllexport) DerivedFromTemplateB : public ClassTemplate {}
     // The second derived class doesn't change anything, the attribute that was propagated first wins.
      class __declspec(dllimport) DerivedFromTemplateB2 : public ClassTemplate<bool> {};
     
    -#if defined(MS) || defined(PS)
    -// expected-warning@+3{{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}}
    -// expected-note@+2{{attribute is here}}
    -#endif
     -struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate<int> {};
     +struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate<int> {}; // ms-ps-warning {{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}} \
    +                                                                                                                         ms-ps-note {{attribute is here}}
     
     // Base class alredy specialized with export attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : public ExplicitlyExportSpecializedTemplate<int> {};
    @@ -508,11 +468,8 @@ struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : pu
     // Base class already specialized with import attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyImportSpecializedTemplate : public ExplicitlyImportSpecializedTemplate<int> {};
     
    -#if defined(MS) || defined(PS)
    -// expected-warning@+3{{propagating dll attribute to already instantiated base class template without dll attribute is not supported}}
    -// expected-note@+2{{attribute is here}}
    -#endif
     -struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate<int> {};
     +struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate<int> {}; // ms-ps-warning {{propagating dll attribute to already instantiated base class template without dll attribute is not supported}} \
    +                                                                                                                           ms-ps-note {{attribute is here}}
     
     // Base class already instantiated with export attribute.
      struct __declspec(dllexport) DerivedFromExplicitlyExportInstantiatedTemplate : public ExplicitlyExportInstantiatedTemplate<int> {};
    @@ -528,10 +485,7 @@ void func() {
       // MSVC allows deriving from exported template classes in local contexts.
       class LocalDerivedFromExportedClass : public ExportedClass {};
        class LocalDerivedFromExportedTemplate : public ExportedClassTemplate<int> {};
    -#if not defined(MS) && not defined (PS)
    -  // expected-note@+2{{in instantiation of template class 'ExportedClassTemplate' requested here}}
    -#endif
     -  class LocalCRTP : public ExportedClassTemplate<LocalCRTP> {};
     +  class LocalCRTP : public ExportedClassTemplate<LocalCRTP> {}; // win-gnu-note {{in instantiation of template class 'ExportedClassTemplate<LocalCRTP>' requested here}}
     }
     
     //===----------------------------------------------------------------------===//
    @@ -778,46 +732,40 @@ __declspec(dllexport)        void MemberRedecl::staticInlineDecl() {}  // expect
     
     __declspec(dllexport)        int  MemberRedecl::StaticField = 1;       // expected-error{{redeclaration of 'MemberRedecl::StaticField' cannot add 'dllexport' attribute}}
     __declspec(dllexport) const  int  MemberRedecl::StaticConstField = 1;  // expected-error{{redeclaration of 'MemberRedecl::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
    -__declspec(dllexport) constexpr int MemberRedecl::ConstexprField;
     
    -#ifdef MS
    +__declspec(dllexport) constexpr int MemberRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                     non-ms-error {{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    +
     struct __declspec(dllexport) ClassWithMultipleDefaultCtors {
    -  ClassWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    -  ClassWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}}
    +  ClassWithMultipleDefaultCtors(int = 40) {} // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    +  ClassWithMultipleDefaultCtors(int = 30, ...) {} // ms-note{{declared here}}
     };
      template <typename T>
     struct ClassTemplateWithMultipleDefaultCtors {
    -  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {}      // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    -  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}}
    +  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {}      // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}}
    +  __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // ms-note{{declared here}}
     };
     
      template <typename T> struct HasDefaults {
     -  HasDefaults(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +  HasDefaults(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
      template struct __declspec(dllexport) HasDefaults<char>;
      
      template struct
     -__declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults<void>::HasDefaults' required here}}
     -HasDefaults<void>; // expected-note {{in instantiation of member function 'HasDefaults<void>::HasDefaults' requested here}}
     +__declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults<void>::HasDefaults' required here}}
     +HasDefaults<void>; // ms-note {{in instantiation of member function 'HasDefaults<void>::HasDefaults' requested here}}
      
      template <typename T> struct HasDefaults2 {
     -  __declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults2<void>::HasDefaults2' required here}}
     -  HasDefaults2(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +  __declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults2<void>::HasDefaults2' required here}}
     +  HasDefaults2(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
     -template struct HasDefaults2<void>; // expected-note {{in instantiation of member function 'HasDefaults2<void>::HasDefaults2' requested here}}
     +template struct HasDefaults2<void>; // ms-note {{in instantiation of member function 'HasDefaults2<void>::HasDefaults2' requested here}}
      
     -template <typename T> struct __declspec(dllexport) HasDefaults3 { // expected-note{{in instantiation of default function argument expression for 'HasDefaults3<void>::HasDefaults3' required here}}
     -  HasDefaults3(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}}
     +template <typename T> struct __declspec(dllexport) HasDefaults3 { // ms-note{{in instantiation of default function argument expression for 'HasDefaults3<void>::HasDefaults3' required here}}
     +  HasDefaults3(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}}
      };
      template <> HasDefaults3<void>::HasDefaults3(int) {};
     
    -#endif
    -
     //===----------------------------------------------------------------------===//
     // Class member templates
     //===----------------------------------------------------------------------===//
    @@ -887,12 +835,8 @@ template __declspec(dllexport)        void MemTmplRedecl::staticInli
      template <typename T> __declspec(dllexport)        int  MemTmplRedecl::StaticField = 1;      // expected-error{{redeclaration of 'MemTmplRedecl::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> __declspec(dllexport) const  int  MemTmplRedecl::StaticConstField = 1; // expected-error{{redeclaration of 'MemTmplRedecl::StaticConstField' cannot add 'dllexport' attribute}}
     
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField;
     +template <typename T> __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                           non-ms-error {{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}}
     #endif // __has_feature(cxx_variable_templates)
     
     
    @@ -1097,20 +1041,13 @@ template __declspec(dllexport)        void CTMR::staticInlineDecl
     
      template <typename T> __declspec(dllexport)        int  CTMR<T>::StaticField = 1;       // expected-error{{redeclaration of 'CTMR::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> __declspec(dllexport) const  int  CTMR<T>::StaticConstField = 1;  // expected-error{{redeclaration of 'CTMR::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> __declspec(dllexport) constexpr int CTMR<T>::ConstexprField;
     +template <typename T> __declspec(dllexport) constexpr int CTMR<T>::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                     non-ms-error {{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}}
     
     // MSVC exports explicit specialization of exported class template member
     // function, and errors on such definitions. MinGW does not treat them as
     // dllexport.
    -#if !defined(GNU)
    -// expected-error@+2{{attribute 'dllexport' cannot be applied to a deleted function}}
    -#endif
     -template <> void ExportClassTmplMembers<int>::normalDecl() = delete;
     +template <> void ExportClassTmplMembers<int>::normalDecl() = delete; // non-gnu-error {{attribute 'dllexport' cannot be applied to a deleted function}}
     
     
     //===----------------------------------------------------------------------===//
    @@ -1183,12 +1120,8 @@ template template __declspec(dllexport)        void CTMT
     #if __has_feature(cxx_variable_templates)
      template <typename T> template <typename U> __declspec(dllexport)        int  CTMTR<T>::StaticField = 1;       // expected-error{{redeclaration of 'CTMTR::StaticField' cannot add 'dllexport' attribute}}
      template <typename T> template <typename U> __declspec(dllexport) const  int  CTMTR<T>::StaticConstField = 1;  // expected-error{{redeclaration of 'CTMTR::StaticConstField' cannot add 'dllexport' attribute}}
    -#ifdef MS
    -// expected-warning@+4{{attribute declaration must precede definition}}
    -#else
    -// expected-error@+2{{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}}
    -#endif
     -template <typename T> template <typename U> __declspec(dllexport) constexpr int CTMTR<T>::ConstexprField;
     +template <typename T> template <typename U> __declspec(dllexport) constexpr int CTMTR<T>::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \
    +                                                                                                           non-ms-error {{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}}
     #endif // __has_feature(cxx_variable_templates)
     
     // FIXME: Precedence rules seem to be different for classes.
    @@ -1197,7 +1130,4 @@ template template __declspec(dllexport) constexpr int CT
     // Lambdas
     //===----------------------------------------------------------------------===//
     // The MS ABI doesn't provide a stable mangling for lambdas, so they can't be imported or exported.
    -#if defined(MS) || defined (WI) || defined(PS)
    -// expected-error@+2{{lambda cannot be declared 'dllexport'}}
    -#endif
    -auto Lambda = []() __declspec(dllexport) -> bool { return true; };
    +auto Lambda = []() __declspec(dllexport) -> bool { return true; }; // non-gnu-error {{lambda cannot be declared 'dllexport'}}
    diff --git a/clang/test/SemaCXX/statements.cpp b/clang/test/SemaCXX/statements.cpp
    index 48f178dd9a8b3..426e9fa1e585b 100644
    --- a/clang/test/SemaCXX/statements.cpp
    +++ b/clang/test/SemaCXX/statements.cpp
    @@ -43,8 +43,6 @@ T test7(T v) {
       return ({ // expected-warning{{use of GNU statement expression extension}}
         T a = v;
         a;
    -    ;
    -    ;
       });
     }
     
    @@ -53,6 +51,21 @@ void test8() {
       double b = test7(2.0);
     }
     
+template <typename T>
+T test9(T v) {
    +  return ({ // expected-warning {{use of GNU statement expression extension}}
    +    T a = v;
    +    a; // expected-warning {{expression result unused}}
    +    ;
    +    ;
    +  });
    +}
    +
    +void test10() {
+  int a = test9(1);  // expected-note {{in instantiation of function template specialization 'test9<int>' requested here}}
    +  // expected-error@-10 {{cannot initialize return object of type 'int' with an rvalue of type 'void'}}
    +}
    +
     namespace GH48405 {
     void foo() {
       struct S {
    diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp
    index 808bdb679b09c..06195f039cd92 100644
    --- a/clang/test/SemaCXX/vector.cpp
    +++ b/clang/test/SemaCXX/vector.cpp
    @@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand
     double f = a - e;       // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}}
     int h = c - e;          // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}}
     }
    +
    +typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}}
    +typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}}
    +
    +
    +#if __cplusplus >= 201103L
    +
+template <int N> using templated_v_size = int  __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}}
    +templated_v_size<-8> templated_v_neg_size; //expected-note{{in instantiation of template type alias 'templated_v_size' requested here}}
    +
    +#endif
    diff --git a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    index 8ee64d486f4f4..fea86162c801d 100644
    --- a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    +++ b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
    @@ -14,5 +14,9 @@ __device__ void test_raw_ptr_atomics(__amdgpu_buffer_rsrc_t rsrc, int i32, float
     __device__ void test_raw_ptr_atomics_err(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
       i32 = __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32(i32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
       f32 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32(f32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
    -  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4);
    +  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
    +}
    +
    +__device__ void test_raw_ptr_atomics_f16_retty(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
    +  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0);
     }
    diff --git a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    index b4ef0550bf88a..553db49231ae0 100644
    --- a/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/AddUint64-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     uint2 test_too_few_arg() {
       return __builtin_hlsl_adduint64();
    diff --git a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    index 4afd799f8539e..5e00428de0c82 100644
    --- a/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/all-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_all();
    diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    index e42fd97b40219..6210c998d8e2d 100644
    --- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_any();
    diff --git a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    index f5f223943b4cd..9872f39ebcfba 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asfloat-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     float4 test_float_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    index 815a0c35cb04c..52f2cd224a13c 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asint-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     int4 test_asint_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    index fee1c2eb87b11..5f3d5c9772d84 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asint16-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     int16_t4 test_asint16_too_many_arg(uint16_t p0, uint16_t p1)
    diff --git a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    index 9d0c206a3b3ad..3bb6cc0094926 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     uint4 test_asuint_too_many_arg(float p0, float p1) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    index 024fd406fe8ef..709d2067d9df2 100644
    --- a/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/asuint16-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.2-library %s -fnative-half-type -fnative-int16-type -verify
     
     uint16_t test_asuint16_less_argument()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    index 7a6341659493b..40910bc9108ed 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    index 93e37075773f5..bbe567b6d6ac1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_elementwise_clamp(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    index 2cb401601f7eb..f47468897312c 100644
    --- a/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/clip-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     
     void test_arg_missing() {
    diff --git a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    index 5704165e1a450..8949324ec69f6 100644
    --- a/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/countbits-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     
     double test_int_builtin(double p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    index 4f73dad79f21f..2c3e8d1560c87 100644
    --- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    index e7521c7251432..4ec1bcef2b6fc 100644
    --- a/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/distance-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return distance(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    index 606194692931f..f514a04eb9f49 100644
    --- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float test_no_second_arg(float2 p0) {
       return __builtin_hlsl_dot(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    index 5933faeae2aac..84333ba08b9b8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return dot2add();
    diff --git a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    index 1435232cbfbc5..f0076ac4e5881 100644
    --- a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl
    @@ -1,7 +1,7 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp10
     float test_too_few_arg() {
       return TEST_FUNC();
       // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
    diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl
    new file mode 100644
    index 0000000000000..8f2f9308ed966
    --- /dev/null
    +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl
    @@ -0,0 +1,134 @@
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
    +
    +float builtin_f16tof32_too_few_arg() {
    +  return __builtin_hlsl_elementwise_f16tof32();
    +  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
    +  // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}}
    +}
    +
    +float builtin_f16tof32_too_many_arg(uint p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0, p0);
    +  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
    +  // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}}
    +}
    +
    +float builtin_f16tof32_bool(bool p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}}
    +}
    +
    +float builtin_f16tof32_bool4(bool4 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector<bool, 4>')}}
    +}
    +
    +float builtin_f16tof32_short(short p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}}
    +}
    +
    +float builtin_f16tof32_unsigned_short(unsigned short p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}}
    +}
    +
    +float builtin_f16tof32_int(int p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}}
    +}
    +
    +float builtin_f16tof32_int64_t(long p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}}
    +}
    +
    +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector<int, 2>'))}}
    +}
    +
    +float builtin_f16tof32_half(half p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}}
    +}
    +
    +float builtin_f16tof32_half4(half4 p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector<half, 4>'))}}
    +}
    +
    +float builtin_f16tof32_float(float p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}}
    +}
    +
    +float builtin_f16tof32_double(double p0) {
    +  return __builtin_hlsl_elementwise_f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}}
    +}
    +
    +float f16tof32_too_few_arg() {
    +  return f16tof32();
    +  // expected-error@-1 {{no matching function for call to 'f16tof32'}}
    +}
    +
    +float f16tof32_too_many_arg(uint p0) {
    +  return f16tof32(p0, p0);
    +  // expected-error@-1 {{no matching function for call to 'f16tof32'}}
    +}
    +
    +float f16tof32_bool(bool p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}}
    +}
    +
    +float f16tof32_bool3(bool3 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector<bool, 3>'))}}
    +}
    +
    +
    +float f16tof32_int16_t(short p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}}
    +}
    +
    +float f16tof32_int16_t(unsigned short p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}}
    +}
    +
    +float f16tof32_int(int p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}}
    +}
    +
    +float f16tof32_int64_t(long p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}}
    +}
    +
    +float2 f16tof32_int2_to_float2_promotion(int3 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector<int, 3>'))}}
    +}
    +
    +float f16tof32_half(half p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}}
    +}
    +
    +float f16tof32_half2(half2 p0) {
    +  return f16tof32(p0);
+  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector<half, 2>'))}}
    +}
    +
    +float f16tof32_float(float p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}}
    +}
    +
    +float f16tof32_double(double p0) {
    +  return f16tof32(p0);
    +  // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}}
    +}
    diff --git a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    index 469d55995f966..01261a00295b1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/faceforward-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_double_inputs(double p0, double p1, double p2) {
       return faceforward(p0, p1, p2);
    diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    index 8badaf0b99a20..1f70186c78ad9 100644
    --- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     int test_too_few_arg() {
       return firstbithigh();
    @@ -12,7 +12,7 @@ int test_too_many_arg(int p0) {
     
     double test_int_builtin(double p0) {
       return firstbithigh(p0);
    -  // expected-error@-1 {{call to 'firstbithigh' is ambiguous}}
    +  // expected-error@-1 {{no matching function for call to 'firstbithigh'}}
     }
     
     double2 test_int_builtin_2(double2 p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    index b12afe65a863e..37090796577fc 100644
    --- a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     int test_too_few_arg() {
       return firstbitlow();
    diff --git a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    index fc931139e523d..eceac9be8d7d1 100644
    --- a/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/fmod-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return fmod(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    index 1e277186f22c4..cdf2b61c45207 100644
    --- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_frac();
    diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    index bf044797c3acb..e9cc0ed338e3e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl
    @@ -1,25 +1,25 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_acos
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_asin
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cosh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sinh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians
     
     double test_double_builtin(double p0) {
         return TEST_FUNC(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    index c264617558261..9e10e1afa9385 100644
    --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
    @@ -1,6 +1,6 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
     
     double test_double_builtin(double p0, double p1) {
         return TEST_FUNC(p0, p1);
    diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    index 8d14df91f1409..a32bc9628a295 100644
    --- a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_isinf();
    diff --git a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    index a6be28117af4f..625c415f91de2 100644
    --- a/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/isnan-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_isnan();
    diff --git a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    index 0bc7f7e40f5d3..fa146a5bce525 100644
    --- a/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/ldexp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_double_inputs(double p0, double p1) {
       return ldexp(p0, p1);
    diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    index 3aaafa37e8e82..8c5c9a4a0d22a 100644
    --- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    index 9592d8766dada..22720a4a37d02 100644
    --- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_lerp(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    index 5dec0f68d71fa..0e9dda7055f98 100644
    --- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected=note
     
     float2 test_no_second_arg(float2 p0) {
       return __builtin_hlsl_mad(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    index 5ad1d6aefde38..6a6f14b52cb16 100644
    --- a/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/matrix-basic_types-errors.hlsl
    @@ -3,7 +3,7 @@
     uint64_t5x5 mat;
     // expected-error@-1  {{unknown type name 'uint64_t5x5'}}
     
    -// Note: this one only fails because -fnative-half-type is not set
+// Note: this one only fails because -fnative-half-type and -fnative-int16-type are not set
     uint16_t4x4 mat2;
     // expected-error@-1  {{unknown type name 'uint16_t4x4'}}
     
    diff --git a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    index 32a4bbd42e5ec..71c14efa60b0f 100644
    --- a/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/max-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    index eb0066835689a..c2cffa18892d5 100644
    --- a/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/min-errors-16bit.hlsl
    @@ -1,8 +1,8 @@
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=half
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=int16_t
    -// RUN: not %clang_cc1 -fnative-half-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
    +// RUN: not %clang_cc1 -fnative-half-type -fnative-int16-type -std=hlsl202x -triple dxilv1.0-unknown-shadermodel6.0-compute \
     // RUN:  -finclude-default-header -S -o - %s 2>&1 | FileCheck %s -DTEST_TYPE=uint16_t
     
     // check we error on 16 bit type if shader model is too old
    diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    index 6ec32257a370f..377c2d5e41a73 100644
    --- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    index dbffce226b54e..70e5b671bb3c9 100644
    --- a/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/radians-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_radians();
    diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    index 01876240e82d0..79076b4815a6e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_rcp();
    diff --git a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    index 9934a3e525d38..b0ae770f49f20 100644
    --- a/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/reflect-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return reflect(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    index 6cb3e56c20f0e..fce41a4a46d38 100644
    --- a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float3 p0) {
       return refract(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    index 1ac275beba642..5b33b89cb8eb8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     
     double2 test_int_builtin(double2 p0) {
    diff --git a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    index 45f86450b37c2..54feed35379d7 100644
    --- a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl
    @@ -1,5 +1,5 @@
     
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     float test_too_few_arg() {
       return __builtin_elementwise_round();
    diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    index 1f81c51207bc3..cedfcca35225e 100644
    --- a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_too_few_arg() {
       return __builtin_hlsl_elementwise_rsqrt();
    diff --git a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    index 721b28f86f950..4054ebfb3f649 100644
    --- a/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/saturate-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -Werror
     
     float2 test_no_arg() {
       return saturate();
    diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    index 12c818acec035..b2f45051a9bd8 100644
    --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl
    @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0,
     }
     
     int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) {
-  return select(p0, t0, f0); // expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector<bool, 2>') to 'vector<bool, 1>' (vector of 1 'bool' value)}}
    +  return select(p0, t0, f0); // No diagnostic expected.
     }
     
     int test_select_no_args() {
    diff --git a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    index b67725fc77e52..68583d10d1287 100644
    --- a/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/sign-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
     
     bool test_too_few_arg() {
       return __builtin_hlsl_elementwise_sign();
    diff --git a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    index e5e902d6ab887..4c6bea8f02411 100644
    --- a/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/smoothstep-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify
     
     float test_no_second_arg(float2 p0) {
       return smoothstep(p0);
    diff --git a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    index 312230a2d6aff..e2ef0f796c166 100644
    --- a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -verify
     
     void test_no_second_arg(double D) {
       __builtin_hlsl_elementwise_splitdouble(D);
    diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    index 5346f217b83aa..993450a17ebfb 100644
    --- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    +++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -disable-llvm-passes -verify
     
     void test_too_few_arg()
     {
    diff --git a/clang/test/SemaHLSL/Operators/logical-not.hlsl b/clang/test/SemaHLSL/Operators/logical-not.hlsl
    index d06ca3982be05..bd1a4be84c47f 100644
    --- a/clang/test/SemaHLSL/Operators/logical-not.hlsl
    +++ b/clang/test/SemaHLSL/Operators/logical-not.hlsl
    @@ -1,4 +1,4 @@
    -// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -ast-dump -ast-dump-filter=case | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple  dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -ast-dump -ast-dump-filter=case | FileCheck %s
     
     // CHECK-LABEL: FunctionDecl {{.*}} used case1 'uint32_t2 (uint32_t2)'
     // CHECK-NEXT: ParmVarDecl {{.*}} used b 'uint32_t2':'vector'
    diff --git a/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl
    new file mode 100644
    index 0000000000000..fdba6f624d289
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Semantics/semantics-invalid.hlsl
    @@ -0,0 +1,17 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -hlsl-entry main -verify %s
    +
    +typedef float t_f : SEMANTIC; // expected-warning{{'SEMANTIC' attribute only applies to parameters, non-static data members, and functions}}
    +
    +struct semantic_on_struct : SEMANTIC { // expected-error{{expected class name}}
    +  float a;
    +};
    +
    +struct s_fields_multiple_semantics {
    +  float a : semantic_a : semantic_c; // expected-error{{use of undeclared identifier 'semantic_c'}}
    +  float b : semantic_b;
    +};
    +
    +[numthreads(1, 1, 1)]
    +void main() {
    +  float a : SEM_A; // expected-warning{{'SEM_A' attribute only applies to parameters, non-static data members, and functions}}
    +}
    diff --git a/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl
    new file mode 100644
    index 0000000000000..1e6bae4fcbca5
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Semantics/semantics-valid.hlsl
    @@ -0,0 +1,33 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -hlsl-entry CSMain -x hlsl  -finclude-default-header  -ast-dump -o - %s | FileCheck %s
    +
    +struct s_fields {
    +  float a : semantic_a;
    +  float b : semantic_b;
    +// CHECK: |-CXXRecordDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-3]]:8 struct s_fields definition
    +// CHECK: | |-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 a 'float'
    +// CHECK: | | `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} 
    +// CHECK: | `-FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:9 b 'float'
    +// CHECK: |   `-HLSLUserSemanticAttr 0x{{[0-9a-fA-F]+}} 
    +};
    +
    +float fn_foo1(float a : a, float b : b) : sem_ret { return 1.0f; }
    +// CHECK:      |-FunctionDecl {{.*}} <{{.*}}> col:7 fn_foo1 'float (float, float)'
    +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float'
    +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float'
    +// CHECK-NEXT: | | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |-CompoundStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT: | | `-ReturnStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT: | |   `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00
    +// CHECK-NEXT: | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +float fn_foo2(float a : a, float b : b) : sem_ret : also_ret { return 1.0f; }
    +// CHECK:      `-FunctionDecl {{.*}} <{{.*}}> col:7 fn_foo2 'float (float, float)'
    +// CHECK-NEXT:   |-ParmVarDecl {{.*}} <{{.*}}> col:21 a 'float'
    +// CHECK-NEXT:   | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |-ParmVarDecl {{.*}} <{{.*}}> col:34 b 'float'
    +// CHECK-NEXT:   | `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |-CompoundStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT:   | `-ReturnStmt {{.*}} <{{.*}}>
    +// CHECK-NEXT:   |   `-FloatingLiteral {{.*}} <{{.*}}> 'float' 1.000000e+00
    +// CHECK-NEXT:   |-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    +// CHECK-NEXT:   `-HLSLUserSemanticAttr {{.*}} <{{.*}}>
    diff --git a/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
    new file mode 100644
    index 0000000000000..630acd8297642
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/AggregateSplatConstantExpr.hlsl
    @@ -0,0 +1,89 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
    +
    +// expected-no-diagnostics
    +
    +struct Base {
    +  double D;
    +  uint64_t2 U;
    +  int16_t I : 5;
    +  uint16_t I2: 5;
    +};
    +
    +struct R : Base {
    +  int G : 10;
    +  int : 30;
    +  float F;
    +};
    +
    +struct B1 {
    +  float A;
    +  float B;
    +};
    +
    +struct B2 : B1 {
    +  int C;
    +  int D;
    +  bool BB;
    +};
    +
    +// tests for HLSLAggregateSplatCast
    +export void fn() {
    +  // result type vector
    +  // splat from a vector of size 1
    +  
    +  constexpr float1 Y = {1.0};
    +  constexpr float4 F4 = (float4)Y;
    +  _Static_assert(F4[0] == 1.0, "Woo!");
    +  _Static_assert(F4[1] == 1.0, "Woo!");
    +  _Static_assert(F4[2] == 1.0, "Woo!");
    +  _Static_assert(F4[3] == 1.0, "Woo!");
    +
    +  // result type array
    +  // splat from a scalar
    +  constexpr float F = 3.33;
    +  constexpr int B6[6] = (int[6])F;
    +  _Static_assert(B6[0] == 3, "Woo!");
    +  _Static_assert(B6[1] == 3, "Woo!");
    +  _Static_assert(B6[2] == 3, "Woo!");
    +  _Static_assert(B6[3] == 3, "Woo!");
    +  _Static_assert(B6[4] == 3, "Woo!");
    +  _Static_assert(B6[5] == 3, "Woo!");
    +
    +  // splat from a vector of size 1
    +  constexpr int1 A1 = {1};
    +  constexpr uint64_t2 A7[2] = (uint64_t2[2])A1;
    +  _Static_assert(A7[0][0] == 1, "Woo!");
    +  _Static_assert(A7[0][1] == 1, "Woo!");
    +  _Static_assert(A7[1][0] == 1, "Woo!");
    +  _Static_assert(A7[1][1] == 1, "Woo!");
    +
    +  // result type struct
    +  // splat from a scalar
    +  constexpr double D = 97.6789;
    +  constexpr R SR = (R)(D + 3.0);
    +  _Static_assert(SR.D == 100.6789, "Woo!");
    +  _Static_assert(SR.U[0] == 100, "Woo!");
    +  _Static_assert(SR.U[1] == 100, "Woo!");
    +  _Static_assert(SR.I == 4, "Woo!");
    +  _Static_assert(SR.I2 == 4, "Woo!");
    +  _Static_assert(SR.G == 100, "Woo!");
    +  _Static_assert(SR.F == 100.6789, "Woo!");
    +
    +  // splat from a vector of size 1
    +  constexpr float1 A100 = {1000.1111};
    +  constexpr B2 SB2 = (B2)A100;
    +  _Static_assert(SB2.A == 1000.1111, "Woo!");
    +  _Static_assert(SB2.B == 1000.1111, "Woo!");
    +  _Static_assert(SB2.C == 1000, "Woo!");
    +  _Static_assert(SB2.D == 1000, "Woo!");
    +  _Static_assert(SB2.BB == true, "Woo!");
    +
    +  // splat from a bool to an int and float etc
    +  constexpr bool B = true;
    +  constexpr B2 SB3 = (B2)B;
    +  _Static_assert(SB3.A == 1.0, "Woo!");
    +  _Static_assert(SB3.B == 1.0, "Woo!");
    +  _Static_assert(SB3.C == 1, "Woo!");
    +  _Static_assert(SB3.D == 1, "Woo!");
    +  _Static_assert(SB3.BB == true, "Woo!");
    +}
    diff --git a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    index 7de4674699930..22e18769a2fe4 100644
    --- a/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    +++ b/clang/test/SemaHLSL/Types/Arithmetic/half_size.hlsl
    @@ -1,7 +1,7 @@
     // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify %s
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -verify -fnative-half-type -fnative-int16-type %s
     // RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify %s
    -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -verify -fnative-half-type -fnative-int16-type %s
     
     // expected-no-diagnostics
     #ifdef __HLSL_ENABLE_16_BIT
    diff --git a/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
    new file mode 100644
    index 0000000000000..c9963c36ce23a
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/ElementwiseCastConstantExpr.hlsl
    @@ -0,0 +1,90 @@
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -fnative-int16-type -std=hlsl202x -verify %s
    +
    +// expected-no-diagnostics
    +
    +struct Base {
    +  double D;
    +  uint64_t2 U;
    +  int16_t I : 5;
    +  uint16_t I2: 5;
    +};
    +
    +struct R : Base {
    +  int G : 10;
    +  int : 30;
    +  float F;
    +};
    +
    +struct B1 {
    +  float A;
    +  float B;
    +};
    +
    +struct B2 : B1 {
    +  int C;
    +  int D;
    +  bool BB;
    +};
    +
    +export void fn() {
    +  _Static_assert(((float4)(int[6]){1,2,3,4,5,6}).x == 1.0, "Woo!");
    +
    +  // This compiling successfully verifies that the array constant expression
    +  // gets truncated to an int at compile time for instantiation via the
    +  // flat cast.
    +  _Static_assert(((int)(int[2]){1,2}) == 1, "Woo!");
    +
    +  // truncation tests
    +  // result type int
    +  // truncate from struct
    +  constexpr B1 SB1 = {1.0, 3.0};
    +  constexpr int X = (int)SB1;
    +  _Static_assert(X == 1, "Woo!");
    +
    +  // result type float
    +  // truncate from array
    +  constexpr B1 Arr[2] = {4.0, 3.0, 2.0, 1.0};
    +  constexpr float F = (float)Arr;
    +  _Static_assert(F == 4.0, "Woo!");
    +
    +  // result type vector
    +  // truncate from array of vector
    +  constexpr int2 Arr2[2] = {5,6,7,8};
    +  constexpr int2 I2 = (int2)Arr2;
    +  _Static_assert(I2[0] == 5, "Woo!");
    +  _Static_assert(I2[1] == 6, "Woo!");
    +
    +  // lhs and rhs are same "size" tests
    +  
    +  // result type vector from  array
    +  constexpr int4 I4 = (int4)Arr;
    +  _Static_assert(I4[0] == 4, "Woo!");
    +  _Static_assert(I4[1] == 3, "Woo!");
    +  _Static_assert(I4[2] == 2, "Woo!");
    +  _Static_assert(I4[3] == 1, "Woo!");
    +
    +  // result type array from vector
    +  constexpr double3 D3 = {100.11, 200.11, 300.11};
    +  constexpr float FArr[3] = (float[3])D3;
    +  _Static_assert(FArr[0] == 100.11, "Woo!");
    +  _Static_assert(FArr[1] == 200.11, "Woo!");
    +  _Static_assert(FArr[2] == 300.11, "Woo!");
    +
    +  // result type struct from struct
    +  constexpr B2 SB2 = {5.5, 6.5, 1000, 5000, false};
    +  constexpr Base SB = (Base)SB2;
    +  _Static_assert(SB.D == 5.5, "Woo!");
    +  _Static_assert(SB.U[0] == 6, "Woo!");
    +  _Static_assert(SB.U[1] == 1000, "Woo!");
    +  _Static_assert(SB.I == 8, "Woo!");
    +  _Static_assert(SB.I2 == 0, "Woo!");
    +
    +  // Make sure we read bitfields correctly
    +  constexpr Base BB = {222.22, {100, 200}, -2, 7};
    +  constexpr int Arr3[5] = (int[5])BB;
    +  _Static_assert(Arr3[0] == 222, "Woo!");
    +  _Static_assert(Arr3[1] == 100, "Woo!");
    +  _Static_assert(Arr3[2] == 200, "Woo!");
    +  _Static_assert(Arr3[3] == -2, "Woo!");
    +  _Static_assert(Arr3[4] == 7, "Woo!");
    +}
    diff --git a/clang/test/SemaHLSL/Types/short-errors.hlsl b/clang/test/SemaHLSL/Types/short-errors.hlsl
    new file mode 100644
    index 0000000000000..93250084e300b
    --- /dev/null
    +++ b/clang/test/SemaHLSL/Types/short-errors.hlsl
    @@ -0,0 +1,21 @@
    +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
    +
    +void asArg(inout short F) { F + 1;}
    +// expected-error@-1 {{unknown type name 'short'}}
    +
    +export void asVarDecl() {
    +  short A = 1;
    +  // expected-error@-1 {{unknown type name 'short'}}
    +  fn(A);
    +}
    +
    +export short asReturnType() {
    +// expected-error@-1 {{unknown type name 'short'}}
    +  return 1;
    +}
    +
    +struct S {
    +  short A;
    +  // expected-error@-1 {{unknown type name 'short'}}
    +};
    diff --git a/clang/test/SemaHLSL/Types/typedefs.hlsl b/clang/test/SemaHLSL/Types/typedefs.hlsl
    index fd72b1ae8a47f..c9c8ff2fc02de 100644
    --- a/clang/test/SemaHLSL/Types/typedefs.hlsl
    +++ b/clang/test/SemaHLSL/Types/typedefs.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type %s
    -// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type %s
    +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.4-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
    +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -finclude-default-header -verify -fnative-half-type -fnative-int16-type %s
     
     // expected-no-diagnostics
     #define SizeCheck(Ty, SizeInBits)                                              \
    diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    index b320abdd81182..756dcb4034e4e 100644
    --- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    +++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
    @@ -1,5 +1,5 @@
    -// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
    -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
    +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -fnative-int16-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
    +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
     void Fn(double2 D);
     void Fn(half2 H);
     
    diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp
    index 1a575ea527006..60603f0c963a5 100644
    --- a/clang/test/SemaTemplate/ctad.cpp
    +++ b/clang/test/SemaTemplate/ctad.cpp
    @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument {
     
       auto x = C(D());
     }
    +
    +namespace pr165560 {
    +template <class T> struct S {
    +  using A = T;
    +  template <class U> struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \
    +                                // expected-note{{implicit deduction guide declared as 'template <class U> I(pr165560::S<int>::I<U>) -> pr165560::S<int>::I<U>'}}
    +    I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}}
    +  };
    +};
    +S<int>::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S<int>::I'}} \
    +             // expected-note{{while building implicit deduction guide first needed here}}
    +}
    diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp
    index 7d2a010295b47..bd0bf3cfdbc59 100644
    --- a/clang/test/SemaTemplate/temp_arg_nontype.cpp
    +++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp
    @@ -173,8 +173,7 @@ namespace pr6249 {
     }
     
     namespace PR6723 {
    -  template<unsigned char C> void f(int (&a)[C]); // expected-note 3{{candidate template ignored: substitution failure [with C = '\x00']}}
    -  // expected-note@-1 {{not viable: no known conversion from 'int[512]' to 'int (&)[0]'}}
    +  template<unsigned char C> void f(int (&a)[C]); // expected-note 4{{candidate template ignored: substitution failure [with C = '\x00']}}
       void g() {
         int arr512[512];
         f(arr512); // expected-error{{no matching function for call}}
    diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    index 5752cbac0291d..45bdb4c623dfe 100644
    --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx11.cpp
    @@ -43,7 +43,7 @@ void TempFunc() {}
     
     void Useage() {
       //expected-error@+2 {{no matching function}}
    -  //expected-note@-4 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'b'}}
    +  //expected-note@-4 {{candidate template ignored: substitution failure [with a = 1, b = 4294967295, c = 1]: non-type template argument evaluates to -1, which cannot be narrowed to type 'unsigned int'}}
       TempFunc<1, -1, 1>();
     }
     }
    diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt
    index 5493aa4237aee..1efcc91fcaec9 100644
    --- a/clang/tools/clang-check/CMakeLists.txt
    +++ b/clang/tools/clang-check/CMakeLists.txt
    @@ -14,6 +14,7 @@ clang_target_link_libraries(clang-check
       clangBasic
       clangDriver
       clangFrontend
    +  clangOptions
       clangRewriteFrontend
       clangSerialization
       clangStaticAnalyzerFrontend
    diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp
    index fa6dd06a1ee58..80255c647b98f 100644
    --- a/clang/tools/clang-check/ClangCheck.cpp
    +++ b/clang/tools/clang-check/ClangCheck.cpp
    @@ -16,9 +16,9 @@
     //===----------------------------------------------------------------------===//
     
     #include "clang/AST/ASTConsumer.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/ASTConsumers.h"
     #include "clang/Frontend/CompilerInstance.h"
    +#include "clang/Options/Options.h"
     #include "clang/Rewrite/Frontend/FixItRewriter.h"
     #include "clang/Rewrite/Frontend/FrontendActions.h"
     #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
    @@ -34,8 +34,8 @@
     #include "llvm/Support/Signals.h"
     #include "llvm/Support/TargetSelect.h"
     
    -using namespace clang::driver;
     using namespace clang::tooling;
    +using namespace clang;
     using namespace llvm;
     
     static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
    diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
    index 9c0d9dff7dc7f..54bc80486472f 100644
    --- a/clang/tools/clang-installapi/CMakeLists.txt
    +++ b/clang/tools/clang-installapi/CMakeLists.txt
    @@ -25,6 +25,7 @@ clang_target_link_libraries(clang-installapi
       clangAST
       clangInstallAPI
       clangBasic
    +  clangOptions
       clangDriver
       clangFrontend
       clangTooling
    diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
    index 4e66485343b89..8bef9690ad855 100644
    --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
    +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
    @@ -35,7 +35,7 @@
     
     using namespace clang;
     using namespace clang::installapi;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm::opt;
     using namespace llvm::MachO;
     
    @@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose,
     static bool run(ArrayRef<const char *> Args, const char *ProgName) {
       // Setup Diagnostics engine.
       DiagnosticOptions DiagOpts;
    -  const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable();
    +  const llvm::opt::OptTable &ClangOpts = getDriverOptTable();
       unsigned MissingArgIndex, MissingArgCount;
       llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs(
           ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount);
    diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
    index 64324a3f8b010..f484d6f33ad8f 100644
    --- a/clang/tools/clang-installapi/Options.cpp
    +++ b/clang/tools/clang-installapi/Options.cpp
    @@ -26,8 +26,6 @@ using namespace llvm;
     using namespace llvm::opt;
     using namespace llvm::MachO;
     
    -namespace drv = clang::driver::options;
    -
     namespace clang {
     namespace installapi {
     
    @@ -109,7 +107,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table,
     
     bool Options::processDriverOptions(InputArgList &Args) {
       // Handle inputs.
    -  for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) {
    +  for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) {
         // Assume any input that is not a directory is a filelist.
         // InstallAPI does not accept multiple directories, so retain the last one.
         if (FM->getOptionalDirectoryRef(Path))
    @@ -120,7 +118,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Handle output.
       SmallString<PATH_MAX> OutputPath;
    -  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
    +  if (auto *Arg = Args.getLastArg(options::OPT_o)) {
         OutputPath = Arg->getValue();
         if (OutputPath != "-")
           FM->makeAbsolutePath(OutputPath);
    @@ -132,10 +130,10 @@ bool Options::processDriverOptions(InputArgList &Args) {
       }
     
       // Do basic error checking first for mixing -target and -arch options.
    -  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
    -  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
    +  auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch);
    +  auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target);
       auto *ArgTargetVariant =
    -      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant);
    +      Args.getLastArgNoClaim(options::OPT_darwin_target_variant);
       if (ArgArch && (ArgTarget || ArgTargetVariant)) {
         Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
             << ArgArch->getAsString(Args)
    @@ -143,7 +141,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
         return false;
       }
     
    -  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
    +  auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ);
       if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
         Diags->Report(clang::diag::err_drv_cannot_mix_options)
             << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
    @@ -152,7 +150,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Capture target triples first.
       if (ArgTarget) {
    -    for (const Arg *A : Args.filtered(drv::OPT_target)) {
    +    for (const Arg *A : Args.filtered(options::OPT_target)) {
           A->claim();
           llvm::Triple TargetTriple(A->getValue());
           Target TAPITarget = Target(TargetTriple);
    @@ -168,7 +166,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     
       // Capture target variants.
       DriverOpts.Zippered = ArgTargetVariant != nullptr;
    -  for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) {
    +  for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) {
         A->claim();
         Triple Variant(A->getValue());
         if (Variant.getVendor() != Triple::Apple) {
    @@ -213,7 +211,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
         DriverOpts.Targets[TAPIVariant] = Variant;
       }
     
    -  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
    +  DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v);
     
       return true;
     }
    @@ -407,7 +405,7 @@ bool Options::processOptionList(InputArgList &Args,
     
     bool Options::processLinkerOptions(InputArgList &Args) {
       // Handle required arguments.
    -  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
    +  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
         LinkerOpts.InstallName = A->getValue();
       if (LinkerOpts.InstallName.empty()) {
         Diags->Report(diag::err_no_install_name);
    @@ -415,28 +413,29 @@ bool Options::processLinkerOptions(InputArgList &Args) {
       }
     
       // Defaulted or optional arguments.
    -  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_current__version))
         LinkerOpts.CurrentVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
         LinkerOpts.CompatVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
    +  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
         LinkerOpts.CompatVersion.parse64(Arg->getValue());
     
    -  if (auto *Arg = Args.getLastArg(drv::OPT_umbrella))
    +  if (auto *Arg = Args.getLastArg(options::OPT_umbrella))
         LinkerOpts.ParentUmbrella = Arg->getValue();
     
    -  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
    +  LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib);
     
    -  for (auto *Arg : Args.filtered(drv::OPT_alias_list)) {
    +  for (auto *Arg : Args.filtered(options::OPT_alias_list)) {
         LinkerOpts.AliasLists.emplace_back(Arg->getValue());
         Arg->claim();
       }
     
    -  LinkerOpts.AppExtensionSafe = Args.hasFlag(
    -      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
    -      /*Default=*/LinkerOpts.AppExtensionSafe);
    +  LinkerOpts.AppExtensionSafe =
    +      Args.hasFlag(options::OPT_fapplication_extension,
    +                   options::OPT_fno_application_extension,
    +                   /*Default=*/LinkerOpts.AppExtensionSafe);
     
       if (::getenv("LD_NO_ENCRYPT") != nullptr)
         LinkerOpts.AppExtensionSafe = true;
    @@ -446,7 +445,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
     
       // Capture library paths.
       PathSeq LibraryPaths;
    -  for (const Arg *A : Args.filtered(drv::OPT_L)) {
    +  for (const Arg *A : Args.filtered(options::OPT_L)) {
         LibraryPaths.emplace_back(A->getValue());
         A->claim();
       }
    @@ -461,7 +460,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
     // invocations.
     bool Options::processFrontendOptions(InputArgList &Args) {
       // Capture language mode.
    -  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
    +  if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) {
         FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                               .Case("c", clang::Language::C)
                               .Case("c++", clang::Language::CXX)
    @@ -475,15 +474,15 @@ bool Options::processFrontendOptions(InputArgList &Args) {
           return false;
         }
       }
    -  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
    -    if (A->getOption().matches(drv::OPT_ObjC))
    +  for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) {
    +    if (A->getOption().matches(options::OPT_ObjC))
           FEOpts.LangMode = clang::Language::ObjC;
         else
           FEOpts.LangMode = clang::Language::ObjCXX;
       }
     
       // Capture Sysroot.
    -  if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) {
    +  if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) {
         SmallString<PATH_MAX> Path(A->getValue());
         FM->makeAbsolutePath(Path);
         if (!FM->getOptionalDirectoryRef(Path)) {
    @@ -502,13 +501,13 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       }
     
       // Capture system frameworks for all platforms.
    -  for (const Arg *A : Args.filtered(drv::OPT_iframework))
    +  for (const Arg *A : Args.filtered(options::OPT_iframework))
         FEOpts.SystemFwkPaths.emplace_back(A->getValue(),
                                            std::optional{});
     
       // Capture framework paths.
       PathSeq FrameworkPaths;
    -  for (const Arg *A : Args.filtered(drv::OPT_F))
    +  for (const Arg *A : Args.filtered(options::OPT_F))
         FrameworkPaths.emplace_back(A->getValue());
     
       if (!FrameworkPaths.empty())
    diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
    index c7879422cd7df..c86a1314ac026 100644
    --- a/clang/tools/clang-repl/ClangRepl.cpp
    +++ b/clang/tools/clang-repl/ClangRepl.cpp
    @@ -309,6 +309,7 @@ int main(int argc, const char **argv) {
       clang::Interpreter::JITConfig Config;
       Config.IsOutOfProcess = !OOPExecutor.empty() || !OOPExecutorConnect.empty();
       Config.OOPExecutor = OOPExecutor;
    +  Config.OrcRuntimePath = OrcRuntimePath;
       auto SizeOrErr = getSlabAllocSize(SlabAllocateSizeString);
       if (!SizeOrErr) {
         llvm::logAllUnhandledErrors(SizeOrErr.takeError(), llvm::errs(), "error: ");
    diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    index c11a34870b204..5f5bf42df5e6b 100644
    --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
    @@ -87,7 +87,7 @@ static std::string ModuleFilesDir;
     static bool EagerLoadModules;
     static unsigned NumThreads = 0;
     static std::string CompilationDB;
    -static std::optional<std::string> ModuleName;
    +static std::optional<std::string> ModuleNames;
     static std::vector<std::string> ModuleDepTargets;
     static std::string TranslationUnitFile;
     static bool DeprecatedDriverCommand;
    @@ -205,8 +205,8 @@ static void ParseArgs(int argc, char **argv) {
       if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ))
         CompilationDB = A->getValue();
     
    -  if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_name_EQ))
    -    ModuleName = A->getValue();
    +  if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_names_EQ))
    +    ModuleNames = A->getValue();
     
       for (const llvm::opt::Arg *A : Args.filtered(OPT_dependency_target_EQ))
         ModuleDepTargets.emplace_back(A->getValue());
    @@ -664,6 +664,16 @@ static bool handleModuleResult(StringRef ModuleName,
       return false;
     }
     
    +static void handleErrorWithInfoString(StringRef Info, llvm::Error E,
    +                                      SharedStream &OS, SharedStream &Errs) {
    +  llvm::handleAllErrors(std::move(E), [&Info, &Errs](llvm::StringError &Err) {
    +    Errs.applyLocked([&](raw_ostream &OS) {
    +      OS << "Error: " << Info << ":\n";
    +      OS << Err.getMessage();
    +    });
    +  });
    +}
    +
     class P1689Deps {
     public:
       void printDependencies(raw_ostream &OS) {
    @@ -1008,7 +1018,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
       };
     
       if (Format == ScanningOutputFormat::Full)
    -    FD.emplace(!ModuleName ? Inputs.size() : 0);
    +    FD.emplace(!ModuleNames ? Inputs.size() : 0);
     
       std::atomic NumStatusCalls = 0;
       std::atomic NumOpenFileForReadCalls = 0;
    @@ -1082,13 +1092,48 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
                                                  MakeformatOS, Errs))
                 HadErrors = true;
             }
    -      } else if (ModuleName) {
    -        auto MaybeModuleDepsGraph = WorkerTool.getModuleDependencies(
    -            *ModuleName, Input->CommandLine, CWD, AlreadySeenModules,
    -            LookupOutput);
    -        if (handleModuleResult(*ModuleName, MaybeModuleDepsGraph, *FD,
    -                               LocalIndex, DependencyOS, Errs))
    -          HadErrors = true;
    +      } else if (ModuleNames) {
    +        StringRef ModuleNameRef(*ModuleNames);
    +        SmallVector<StringRef> Names;
    +        ModuleNameRef.split(Names, ',');
    +
    +        if (Names.size() == 1) {
    +          auto MaybeModuleDepsGraph = WorkerTool.getModuleDependencies(
    +              Names[0], Input->CommandLine, CWD, AlreadySeenModules,
    +              LookupOutput);
    +          if (handleModuleResult(Names[0], MaybeModuleDepsGraph, *FD,
    +                                 LocalIndex, DependencyOS, Errs))
    +            HadErrors = true;
    +        } else {
    +          if (llvm::Error Err =
    +                  WorkerTool.initializeCompilerInstanceWithContext(
    +                      CWD, Input->CommandLine)) {
    +            handleErrorWithInfoString(
    +                "Compiler instance with context setup error", std::move(Err),
    +                DependencyOS, Errs);
    +            HadErrors = true;
    +            continue;
    +          }
    +
    +          for (auto N : Names) {
    +            auto MaybeModuleDepsGraph =
    +                WorkerTool.computeDependenciesByNameWithContext(
    +                    N, AlreadySeenModules, LookupOutput);
    +            if (handleModuleResult(N, MaybeModuleDepsGraph, *FD, LocalIndex,
    +                                   DependencyOS, Errs)) {
    +              HadErrors = true;
    +              break;
    +            }
    +          }
    +
    +          if (llvm::Error Err =
    +                  WorkerTool.finalizeCompilerInstanceWithContext()) {
    +            handleErrorWithInfoString(
    +                "Compiler instance with context finialization error",
    +                std::move(Err), DependencyOS, Errs);
    +            HadErrors = true;
    +          }
    +        }
           } else {
             std::unique_ptr<llvm::MemoryBuffer> TU;
             std::optional<llvm::MemoryBufferRef> TUBuffer;
    diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td
    index 7a63b18f6d462..6ea9d824c9646 100644
    --- a/clang/tools/clang-scan-deps/Opts.td
    +++ b/clang/tools/clang-scan-deps/Opts.td
    @@ -26,7 +26,9 @@ def eager_load_pcm : F<"eager-load-pcm", "Load PCM files eagerly (instead of laz
     def j : Arg<"j", "Number of worker threads to use (default: use all concurrent threads)">;
     
     defm compilation_database : Eq<"compilation-database", "Compilation database">;
    -defm module_name : Eq<"module-name", "the module of which the dependencies are to be computed">;
    +defm module_names
    +    : Eq<"module-names", "A comma separated list of names of modules of which "
    +                         "the dependencies are to be computed">;
     defm dependency_target : Eq<"dependency-target", "The names of dependency targets for the dependency file">;
     
     defm tu_buffer_path: Eq<"tu-buffer-path", "The path to the translation unit for depscan. Not compatible with -module-name">;
    diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
    index d9d36f7a41359..002aaef005253 100644
    --- a/clang/tools/driver/CMakeLists.txt
    +++ b/clang/tools/driver/CMakeLists.txt
    @@ -63,6 +63,7 @@ clang_target_link_libraries(clang
       clangDriver
       clangFrontend
       clangFrontendTool
    +  clangOptions
       clangSerialization
       )
     
    diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
    index 52cffa4ccbe1f..2aef75597fc5f 100644
    --- a/clang/tools/driver/cc1_main.cpp
    +++ b/clang/tools/driver/cc1_main.cpp
    @@ -17,7 +17,6 @@
     #include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
     #include "clang/Config/config.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/CompilerInstance.h"
     #include "clang/Frontend/CompilerInvocation.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
    @@ -25,6 +24,7 @@
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
     #include "clang/FrontendTool/Utils.h"
    +#include "clang/Options/Options.h"
     #include "clang/Serialization/ObjectFilePCHContainerReader.h"
     #include "llvm/ADT/Statistic.h"
     #include "llvm/ADT/StringExtras.h"
    diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
    index 50da2f8449a22..f13812f2a8383 100644
    --- a/clang/tools/driver/cc1as_main.cpp
    +++ b/clang/tools/driver/cc1as_main.cpp
    @@ -14,10 +14,10 @@
     #include "clang/Basic/Diagnostic.h"
     #include "clang/Basic/DiagnosticOptions.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Frontend/FrontendDiagnostic.h"
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/STLExtras.h"
     #include "llvm/ADT/StringExtras.h"
     #include "llvm/ADT/StringSwitch.h"
    @@ -59,8 +59,7 @@
     #include <memory>
     #include <system_error>
     using namespace clang;
    -using namespace clang::driver;
    -using namespace clang::driver::options;
    +using namespace clang::options;
     using namespace llvm;
     using namespace llvm::opt;
     
    @@ -688,8 +687,7 @@ int cc1as_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
         getDriverOptTable().printHelp(
             llvm::outs(), "clang -cc1as [options] file...",
             "Clang Integrated Assembler", /*ShowHidden=*/false,
    -        /*ShowAllAliases=*/false,
    -        llvm::opt::Visibility(driver::options::CC1AsOption));
    +        /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption));
     
         return 0;
       }
    diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
    index 7390d7d610ec0..1e2c9884ba63d 100644
    --- a/clang/tools/driver/driver.cpp
    +++ b/clang/tools/driver/driver.cpp
    @@ -18,13 +18,13 @@
     #include "clang/Config/config.h"
     #include "clang/Driver/Compilation.h"
     #include "clang/Driver/DriverDiagnostic.h"
    -#include "clang/Driver/Options.h"
     #include "clang/Driver/ToolChain.h"
     #include "clang/Frontend/ChainedDiagnosticConsumer.h"
     #include "clang/Frontend/CompilerInvocation.h"
     #include "clang/Frontend/SerializedDiagnosticPrinter.h"
     #include "clang/Frontend/TextDiagnosticPrinter.h"
     #include "clang/Frontend/Utils.h"
    +#include "clang/Options/Options.h"
     #include "llvm/ADT/ArrayRef.h"
     #include "llvm/ADT/SmallString.h"
     #include "llvm/ADT/SmallVector.h"
    diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
    index 08776d9bcabfc..f4d6fa72a1dfe 100644
    --- a/clang/tools/libclang/CIndex.cpp
    +++ b/clang/tools/libclang/CIndex.cpp
    @@ -2757,6 +2757,11 @@ void OMPClauseEnqueue::VisitOMPXDynCGroupMemClause(
       VisitOMPClauseWithPreInit(C);
       Visitor->AddStmt(C->getSize());
     }
    +void OMPClauseEnqueue::VisitOMPDynGroupprivateClause(
    +    const OMPDynGroupprivateClause *C) {
    +  VisitOMPClauseWithPreInit(C);
    +  Visitor->AddStmt(C->getSize());
    +}
     void OMPClauseEnqueue::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
       VisitOMPClauseList(C);
     }
    diff --git a/clang/tools/offload-arch/AMDGPUArchByHIP.cpp b/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    index 11cff4f5ecdbe..ff39a85d15628 100644
    --- a/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    +++ b/clang/tools/offload-arch/AMDGPUArchByHIP.cpp
    @@ -98,8 +98,16 @@ static std::vector<std::string> getSearchPaths() {
     // Custom comparison function for dll name
     static bool compareVersions(StringRef A, StringRef B) {
       auto ParseVersion = [](StringRef S) -> VersionTuple {
    -    size_t Pos = S.find_last_of('_');
    -    StringRef VerStr = (Pos == StringRef::npos) ? S : S.substr(Pos + 1);
    +    StringRef Filename = sys::path::filename(S);
    +    size_t Pos = Filename.find_last_of('_');
    +    if (Pos == StringRef::npos)
    +      return VersionTuple();
    +
    +    StringRef VerStr = Filename.substr(Pos + 1);
    +    size_t DotPos = VerStr.find('.');
    +    if (DotPos != StringRef::npos)
    +      VerStr = VerStr.substr(0, DotPos);
    +
         VersionTuple Vt;
         (void)Vt.tryParse(VerStr);
         return Vt;
    @@ -135,8 +143,6 @@ static std::pair findNewestHIPDLL() {
               Filename.ends_with(HipDLLSuffix))
             DLLNames.push_back(sys::path::convert_to_slash(DirIt->path()));
         }
    -    if (!DLLNames.empty())
    -      break;
       }
     
       if (DLLNames.empty())
    diff --git a/clang/tools/scan-build/bin/set-xcode-analyzer b/clang/tools/scan-build/bin/set-xcode-analyzer
    index 8e4a5794594a6..5d98c0cf2c1e2 100755
    --- a/clang/tools/scan-build/bin/set-xcode-analyzer
    +++ b/clang/tools/scan-build/bin/set-xcode-analyzer
    @@ -5,10 +5,6 @@
     # This one has the scripting bridge enabled.
     
     import sys
    -if sys.version_info < (3, 6):
    -    print "set-xcode-analyzer requires Python 3.6 or later"
    -    sys.exit(1)
    -
     import os
     import subprocess
     import re
    @@ -18,7 +14,7 @@ import stat
     from AppKit import *
     
     def FindClangSpecs(path):
    -  print "(+) Searching for xcspec file in: ", path
    +  print("(+) Searching for xcspec file in: ", path)
       for root, dirs, files in os.walk(path):
         for f in files:
           if f.endswith(".xcspec") and f.startswith("Clang LLVM"):
    @@ -49,14 +45,14 @@ def ModifySpec(path, isBuiltinAnalyzer, pathToChecker):
               foundAnalyzer = False
           t.write(line)
       t.close()
    -  print "(+) processing:", path
    +  print("(+) processing:", path)
       try:
         shutil.copy(t.name, path)
         os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
    -  except IOError, why:
    -    print "    (-) Cannot update file:", why, "\n"
    -  except OSError, why:
    -    print "    (-) Cannot update file:", why, "\n"
    +  except IOError as why:
    +    print("    (-) Cannot update file:", why, "\n")
    +  except OSError as why:
    +    print("    (-) Cannot update file:", why, "\n")
       os.unlink(t.name)
     
     def main():
    @@ -75,7 +71,7 @@ def main():
       # determine if Xcode is running
       for x in NSWorkspace.sharedWorkspace().runningApplications():
         if x.localizedName().find("Xcode") >= 0:
    -      print "(-) You must quit Xcode first before modifying its configuration files."
    +      print("(-) You must quit Xcode first before modifying its configuration files.")
           sys.exit(1)
     
       isBuiltinAnalyzer = False
    @@ -83,12 +79,12 @@ def main():
         # Expand tildes.
         path = os.path.expanduser(options.path)
         if not path.endswith("clang"):
    -      print "(+) Using Clang bundled with checker build:", path
    +      print("(+) Using Clang bundled with checker build:", path)
           path = os.path.join(path, "bin", "clang");
         else:
    -      print "(+) Using Clang located at:", path
    +      print("(+) Using Clang located at:", path)
       else:
    -    print "(+) Using the Clang bundled with Xcode"
    +    print("(+) Using the Clang bundled with Xcode")
         path = options.default
         isBuiltinAnalyzer = True
     
    @@ -108,7 +104,7 @@ def main():
         ModifySpec(x, isBuiltinAnalyzer, path)
     
       if not foundSpec:
    -      print "(-) No compiler configuration file was found.  Xcode's analyzer has not been updated."
    +      print("(-) No compiler configuration file was found.  Xcode's analyzer has not been updated.")
     
     if __name__ == '__main__':
       main()
    diff --git a/clang/tools/scan-view/share/ScanView.py b/clang/tools/scan-view/share/ScanView.py
    index a89bf3f24fc5a..9c110130315ad 100644
    --- a/clang/tools/scan-view/share/ScanView.py
    +++ b/clang/tools/scan-view/share/ScanView.py
    @@ -1,40 +1,19 @@
    -from __future__ import print_function
    -
    -try:
    -    from http.server import HTTPServer, SimpleHTTPRequestHandler
    -except ImportError:
    -    from BaseHTTPServer import HTTPServer
    -    from SimpleHTTPServer import SimpleHTTPRequestHandler
    +from http.server import HTTPServer, SimpleHTTPRequestHandler
     import os
     import sys
    -
    -try:
    -    from urlparse import urlparse
    -    from urllib import unquote
    -except ImportError:
    -    from urllib.parse import urlparse, unquote
    -
    +from urllib.parse import urlparse, unquote
     import posixpath
    -
    -if sys.version_info.major >= 3:
    -    from io import StringIO, BytesIO
    -else:
    -    from io import BytesIO, BytesIO as StringIO
    -
    +from io import StringIO, BytesIO
     import re
     import shutil
     import threading
     import time
     import socket
     import itertools
    +import configparser
     
     import Reporter
     
    -try:
    -    import configparser
    -except ImportError:
    -    import ConfigParser as configparser
    -
     ###
     # Various patterns matched or replaced by server.
     
    diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
    index 4c7ea5e338a13..3cab4c600b1b1 100644
    --- a/clang/unittests/AST/ASTImporterTest.cpp
    +++ b/clang/unittests/AST/ASTImporterTest.cpp
    @@ -3300,6 +3300,72 @@ TEST_P(ImportExpr, ConceptNestedNonInstantiationDependentRequirement) {
                  conceptDecl(has(requiresExpr(has(requiresExprBodyDecl())))));
     }
     
    +TEST_P(ImportExpr, ImportSubstNonTypeTemplateParmPackExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +    template <int, int> struct X {};
    +    template <typename...> struct Z {};
    +
    +    template <int... N> struct E {
    +      template <int... M> using B = Z<X<N, M>...>;
    +      template <int... M> E(B<M...>);
    +    };
    +    using declToImport = E<1, 3>;
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             typedefNameDecl(hasName("declToImport")));
    +}
    +
    +TEST_P(ImportExpr, ImportCXXParenListInitExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +    struct Node {
    +      int val;
    +      double d;
    +    };
    +    Node* declToImport() { return new Node(2, 3.14); }
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             functionDecl(hasName("declToImport")));
    +}
    +
    +TEST_P(ImportExpr, ImportPseudoObjectExpr) {
    +  MatchVerifier Verifier;
    +  const char *Code = R"(
    +  namespace std {
    +    struct strong_ordering {
    +      int n;
    +      constexpr operator int() const { return n; }
    +      static const strong_ordering less, equal, greater;
    +    };
    +    constexpr strong_ordering strong_ordering::less{-1},
    +        strong_ordering::equal{0}, strong_ordering::greater{1};
    +  }
    +
    +  struct A {
    +    std::strong_ordering operator<=>(const A&) const;
    +  };
    +  struct B {
    +    bool operator==(const B&) const;
    +    bool operator<(const B&) const;
    +  };
    +
    +  template <typename T> struct Cmp : T {
    +    std::strong_ordering operator<=>(const Cmp&) const = default;
    +  };
    +
    +  void use(...);
    +  void declToImport() {
    +    use(
    +      Cmp<A>() <=> Cmp<A>(),
    +      Cmp<B>() <=> Cmp<B>()
    +    );
    +  }
    +  )";
    +  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
    +             functionDecl(hasName("declToImport")));
    +}
    +
     class ImportImplicitMethods : public ASTImporterOptionSpecificTestBase {
     public:
       static constexpr auto DefaultCode = R"(
    diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    index 9692d6e6fae97..3fcb5582d3dd7 100644
    --- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    +++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
    @@ -1179,6 +1179,12 @@ TEST_P(ASTMatchersTest, PredefinedExpr) {
                                          has(stringLiteral()))));
     }
     
    +TEST_P(ASTMatchersTest, FileScopeAsmDecl) {
    +  EXPECT_TRUE(matches("__asm(\"nop\");", fileScopeAsmDecl()));
    +  EXPECT_TRUE(
    +      notMatches("void f() { __asm(\"mov al, 2\"); }", fileScopeAsmDecl()));
    +}
    +
     TEST_P(ASTMatchersTest, AsmStatement) {
       EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt()));
     }
    @@ -2442,7 +2448,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
                           "int main() {"
                           "  int a;"
                           "  f(a);"
    -                      "}", matcher));
    +                      "}",
    +                      matcher));
       EXPECT_FALSE(matches("template  void f(T &...args) {"
                            "  [...args = args] () mutable {"
                            "  }();"
    @@ -2450,7 +2457,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
                            "int main() {"
                            "  int a;"
                            "  f(a);"
    -                       "}", matcher));
    +                       "}",
    +                       matcher));
     }
     
     TEST_P(ASTMatchersTest, IsDerivedFromRecursion) {
    @@ -2628,7 +2636,7 @@ TEST(ASTMatchersTestObjC, ObjCStringLiteral) {
                               "    [Test someFunction:@\"Ola!\"]; "
                               "}\n"
                               "@end ";
    -    EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
    +  EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
     }
     
     TEST(ASTMatchersTestObjC, ObjCDecls) {
    diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    index ef229606de0f0..8fc9a66dbda7e 100644
    --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
    @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) {
       }
     }
     
    +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) {
    +  // GH161913
    +  const std::string Code = R"(
    +    struct S { int i; };
    +    void f(S s) {
    +      S *x = &s;
    +      (x->*(&S::i))++;
    +    }
    +  )";
    +  auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"});
    +  auto Results =
    +      match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
    +  EXPECT_TRUE(isPointeeMutated(Results, AST.get()));
    +}
    +
     } // namespace clang
    diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    index 0c051847f4d47..601308c53f9a9 100644
    --- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
    @@ -530,6 +530,7 @@ TEST_F(LifetimeAnalysisTest, PointersInACycle) {
             p1 = p2;
             p2 = p3;
             p3 = temp;
    +        POINT(in_loop);
           }
           POINT(after_loop);
         }
    @@ -543,7 +544,11 @@ TEST_F(LifetimeAnalysisTest, PointersInACycle) {
       EXPECT_THAT(Origin("p1"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
       EXPECT_THAT(Origin("p2"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
       EXPECT_THAT(Origin("p3"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
    -  EXPECT_THAT(Origin("temp"), HasLoansTo({"v1", "v2", "v3"}, "after_loop"));
    +
    +  EXPECT_THAT(Origin("temp"), HasLoansTo({"v1", "v2", "v3"}, "in_loop"));
    +  // 'temp' is a block-local origin and its loans are not tracked outside
    +  // the block.
    +  EXPECT_THAT(Origin("temp"), HasLoansTo({}, "after_loop"));
     }
     
     TEST_F(LifetimeAnalysisTest, PointersAndExpirationInACycle) {
    @@ -684,7 +689,6 @@ TEST_F(LifetimeAnalysisTest, GslPointerConstructFromView) {
       EXPECT_THAT(Origin("q"), HasLoansTo({"a"}, "p1"));
     }
     
    -// FIXME: Handle loans in ternary operator!
     TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) {
       SetupTest(R"(
         void target(bool cond) {
    @@ -693,7 +697,24 @@ TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) {
           POINT(p1);
         }
       )");
    -  EXPECT_THAT(Origin("v"), HasLoansTo({}, "p1"));
    +  EXPECT_THAT(Origin("v"), HasLoansTo({"a", "b"}, "p1"));
    +}
    +
    +TEST_F(LifetimeAnalysisTest, ExtraParenthesis) {
    +  SetupTest(R"(
    +    void target() {
    +      MyObj a;
    +      View x = ((View((((a))))));
    +      View y = ((View{(((x)))}));
    +      View z = ((View(((y)))));
    +      View p = ((View{((x))}));
    +      POINT(p1);
    +    }
    +  )");
    +  EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("y"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("z"), HasLoansTo({"a"}, "p1"));
    +  EXPECT_THAT(Origin("p"), HasLoansTo({"a"}, "p1"));
     }
     
     // FIXME: Handle temporaries.
    diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt
    index f5bcecb0b08a3..d4efb2230a054 100644
    --- a/clang/unittests/CodeGen/CMakeLists.txt
    +++ b/clang/unittests/CodeGen/CMakeLists.txt
    @@ -1,6 +1,7 @@
     add_clang_unittest(ClangCodeGenTests
       BufferSourceTest.cpp
       CodeGenExternalTest.cpp
    +  DemangleTrapReasonInDebugInfo.cpp
       TBAAMetadataTest.cpp
       CheckTargetFeaturesTest.cpp
       CLANG_LIBS
    diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp
    new file mode 100644
    index 0000000000000..17bfe17c31d65
    --- /dev/null
    +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp
    @@ -0,0 +1,67 @@
    +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===//
    +//
    +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
    +// See https://llvm.org/LICENSE.txt for license information.
    +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
    +//
    +//===----------------------------------------------------------------------===//
    +
    +#include "clang/CodeGen/ModuleBuilder.h"
    +#include "llvm/ADT/StringRef.h"
    +#include "gtest/gtest.h"
    +
    +using namespace clang::CodeGen;
    +
    +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory,
    +                      const char *ExpectedMessage) {
    +  auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName);
    +  ASSERT_TRUE(MaybeTrapReason.has_value());
    +  auto [Category, Message] = MaybeTrapReason.value();
    +  ASSERT_STREQ(Category.str().c_str(), ExpectedCategory);
    +  ASSERT_STREQ(Message.str().c_str(), ExpectedMessage);
    +}
    +
    +void CheckInvalidCommon(llvm::StringRef FuncName) {
    +  auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName);
    +  ASSERT_TRUE(!MaybeTrapReason.has_value());
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, Valid) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$trap category$trap message";
    +  CheckValidCommon(FuncName, "trap category", "trap message");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$$trap message";
    +  CheckValidCommon(FuncName, "", "trap message");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) {
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$trap category$";
    +  CheckValidCommon(FuncName, "trap category", "");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) {
    +  // `__builtin_verbose_trap` actually allows this currently. However, we
    +  // should probably disallow this in Sema because having an empty category
    +  // and message completely defeats the point of using the builtin
    +  // (#165981).
    +  std::string FuncName(ClangTrapPrefix);
    +  FuncName += "$$";
    +  CheckValidCommon(FuncName, "", "");
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) {
    +  std::string FuncName(ClangTrapPrefix);
    +  CheckInvalidCommon(FuncName);
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, Invalid) {
    +  std::string FuncName("foo");
    +  CheckInvalidCommon(FuncName);
    +}
    +
    +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); }
    diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
    index 62274235c53f5..e0454f190b35a 100644
    --- a/clang/unittests/Driver/DXCModeTest.cpp
    +++ b/clang/unittests/Driver/DXCModeTest.cpp
    @@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
           TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)};
       EXPECT_NE(TranslatedArgs, nullptr);
       if (TranslatedArgs) {
    -    auto *A = TranslatedArgs->getLastArg(
    -        clang::driver::options::OPT_dxil_validator_version);
    +    auto *A =
    +        TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version);
         EXPECT_NE(A, nullptr);
         if (A) {
           EXPECT_STREQ(A->getValue(), "1.1");
    diff --git a/clang/unittests/Driver/MultilibTest.cpp b/clang/unittests/Driver/MultilibTest.cpp
    index ebb8611d97e1c..277fa266dea9b 100644
    --- a/clang/unittests/Driver/MultilibTest.cpp
    +++ b/clang/unittests/Driver/MultilibTest.cpp
    @@ -144,7 +144,7 @@ TEST(MultilibTest, SetPushback) {
       ASSERT_TRUE(MS.size() == 2);
       for (MultilibSet::const_iterator I = MS.begin(), E = MS.end(); I != E; ++I) {
         ASSERT_TRUE(llvm::StringSwitch(I->gccSuffix())
    -                    .Cases("/one", "/two", true)
    +                    .Cases({"/one", "/two"}, true)
                         .Default(false));
       }
     }
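
The MultilibTest change tracks an `llvm::StringSwitch` API update: `Cases` now takes its candidate strings as an initializer list rather than as separate variadic arguments. A minimal usage sketch of the new form (the wrapper function is illustrative, the `Cases` call mirrors the changed line above):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// True iff Suffix is one of the two multilib suffixes checked in the test.
static bool isKnownSuffix(llvm::StringRef Suffix) {
  return llvm::StringSwitch<bool>(Suffix)
      .Cases({"/one", "/two"}, true)
      .Default(false);
}
```
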
    diff --git a/clang/unittests/Format/AlignBracketsTest.cpp b/clang/unittests/Format/AlignBracketsTest.cpp
    index ea8db51a4d18e..10ca5fb7da1ce 100644
    --- a/clang/unittests/Format/AlignBracketsTest.cpp
    +++ b/clang/unittests/Format/AlignBracketsTest.cpp
    @@ -28,7 +28,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
           "SomeLongVariableName->someFunction(foooooooo(aaaaaaaaaaaaaaa,\n"
           "                                             aaaaaaaaaaaaaaaaaaaaa));");
       FormatStyle Style = getLLVMStyle();
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                    "    aaaaaaaaaaa aaaaaaaa, aaaaaaaaa aaaaaaa) {}",
                    Style);
    @@ -64,7 +64,7 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
                    Style);
       Style.ColumnLimit = 80;
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BinPackArguments = false;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
    @@ -115,7 +115,9 @@ TEST_F(AlignBracketsTest, AlignsAfterOpenBracket) {
           "    XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));",
           Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
       Style.BinPackArguments = false;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
    @@ -254,7 +256,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndent) {
                    "argument5));",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat(Short, Style);
       verifyFormat(
    @@ -378,7 +381,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentIfStatement) {
                    "}",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat("if (foo()) {\n"
                    "  return;\n"
    @@ -440,7 +444,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
                    "}",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
     
       verifyFormat("for (int i = 0; i < 5; ++i) {\n"
                    "  doSomething();\n"
    @@ -457,7 +462,8 @@ TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentForStatement) {
     
     TEST_F(AlignBracketsTest, AlignAfterOpenBracketBlockIndentInitializers) {
       auto Style = getLLVMStyleWithColumns(60);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketBracedList = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
       // Aggregate initialization.
       verifyFormat("int LooooooooooooooooooooooooongVariable[2] = {\n"
                    "    10000000, 20000000\n"
    @@ -611,13 +617,13 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
       StringRef Input = "functionCall(paramA, paramB, paramC);\n"
                         "void functionDecl(int A, int B, int C);";
       Style.AllowAllArgumentsOnNextLine = false;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "    paramC);\n"
                              "void functionDecl(int A, int B,\n"
                              "    int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "             paramC);\n"
                              "void functionDecl(int A, int B,\n"
    @@ -625,13 +631,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    Input, Style);
       // However, BAS_AlwaysBreak and BAS_BlockIndent should take precedence over
       // AllowAllArgumentsOnNextLine.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
                              "    int A, int B, int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
       verifyFormat("functionCall(\n"
                    "    paramA, paramB, paramC\n"
                    ");\n"
    @@ -639,11 +646,12 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    "    int A, int B, int C\n"
                    ");",
                    Input, Style);
    +  Style.BreakBeforeCloseBracketFunction = false;
     
       // When AllowAllArgumentsOnNextLine is set, we prefer breaking before the
       // first argument.
       Style.AllowAllArgumentsOnNextLine = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
    @@ -651,13 +659,14 @@ TEST_F(AlignBracketsTest, AllowAllArgumentsOnNextLineDontAlign) {
                    Input, Style);
       // It wouldn't fit on one line with aligned parameters so this setting
       // doesn't change anything for BAS_Align.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
    +  Style.BreakAfterOpenBracketFunction = false;
       verifyFormat(StringRef("functionCall(paramA, paramB,\n"
                              "             paramC);\n"
                              "void functionDecl(int A, int B,\n"
                              "                  int C);"),
                    Input, Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(StringRef("functionCall(\n"
                              "    paramA, paramB, paramC);\n"
                              "void functionDecl(\n"
    @@ -678,13 +687,14 @@ TEST_F(AlignBracketsTest, FormatsDeclarationBreakAlways) {
     
       // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
       // to BPPS_AlwaysOnePerLine.
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
           "    int b);",
           BreakAlways);
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
    +  BreakAlways.BreakBeforeCloseBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -734,7 +744,7 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
     
       // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set
       // to BPPS_AlwaysOnePerLine.
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -743,7 +753,8 @@ TEST_F(AlignBracketsTest, FormatsDefinitionBreakAlways) {
           "      aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b);\n"
           "}",
           BreakAlways);
    -  BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  BreakAlways.BreakAfterOpenBracketFunction = true;
    +  BreakAlways.BreakBeforeCloseBracketFunction = true;
       verifyFormat(
           "void someLongFunctionName(\n"
           "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n"
    @@ -761,17 +772,17 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_Align;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "          bbbbbbbbbbbbbbbbbbbbbb);",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("int a = f(aaaaaaaaaaaaaaaaaaaaaa &&\n"
                    "    bbbbbbbbbbbbbbbbbbbbbb);",
    @@ -781,7 +792,10 @@ TEST_F(AlignBracketsTest, ParenthesesAndOperandAlignment) {
     TEST_F(AlignBracketsTest, BlockIndentAndNamespace) {
       auto Style = getLLVMStyleWithColumns(120);
       Style.AllowShortNamespacesOnASingleLine = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakAfterOpenBracketBracedList = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
    +  Style.BreakBeforeCloseBracketBracedList = true;
     
       verifyNoCrash(
           "namespace {\n"
    diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
    index 6488e38badee7..d578fa7a1a1e8 100644
    --- a/clang/unittests/Format/ConfigParseTest.cpp
    +++ b/clang/unittests/Format/ConfigParseTest.cpp
    @@ -172,6 +172,16 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
       CHECK_PARSE_BOOL(BinPackLongBracedList);
       CHECK_PARSE_BOOL(BreakAdjacentStringLiterals);
       CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketBracedList);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketFunction);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketIf);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketLoop);
    +  CHECK_PARSE_BOOL(BreakAfterOpenBracketSwitch);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketBracedList);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketFunction);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketIf);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketLoop);
    +  CHECK_PARSE_BOOL(BreakBeforeCloseBracketSwitch);
       CHECK_PARSE_BOOL(BreakBeforeTemplateCloser);
       CHECK_PARSE_BOOL(BreakBeforeTernaryOperators);
       CHECK_PARSE_BOOL(BreakStringLiterals);
    @@ -533,20 +543,23 @@ TEST(ConfigParseTest, ParsesConfiguration) {
       CHECK_PARSE("EnumTrailingComma: Remove", EnumTrailingComma,
                   FormatStyle::ETC_Remove);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    -  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket,
    -              FormatStyle::BAS_Align);
    -  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket,
    -              FormatStyle::BAS_DontAlign);
    +  Style.AlignAfterOpenBracket = false;
    +  CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, true);
    +  CHECK_PARSE("AlignAfterOpenBracket: DontAlign", AlignAfterOpenBracket, false);
    +  // For backward compatibility:
       CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak", AlignAfterOpenBracket,
    -              FormatStyle::BAS_AlwaysBreak);
    +              true);
    +  CHECK_PARSE("AlignAfterOpenBracket: AlwaysBreak\n"
    +              "BreakAfterOpenBracketIf: false",
    +              BreakAfterOpenBracketIf, false);
    +  CHECK_PARSE("BreakAfterOpenBracketLoop: true\n"
    +              "AlignAfterOpenBracket: AlwaysBreak",
    +              BreakAfterOpenBracketLoop, true);
    +  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket, false);
       CHECK_PARSE("AlignAfterOpenBracket: BlockIndent", AlignAfterOpenBracket,
    -              FormatStyle::BAS_BlockIndent);
    -  // For backward compatibility:
    -  CHECK_PARSE("AlignAfterOpenBracket: false", AlignAfterOpenBracket,
    -              FormatStyle::BAS_DontAlign);
    -  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket,
    -              FormatStyle::BAS_Align);
    +              true);
    +  Style.AlignAfterOpenBracket = false;
    +  CHECK_PARSE("AlignAfterOpenBracket: true", AlignAfterOpenBracket, true);
     
       Style.AlignEscapedNewlines = FormatStyle::ENAS_Left;
       CHECK_PARSE("AlignEscapedNewlines: DontAlign", AlignEscapedNewlines,
    @@ -576,20 +589,20 @@ TEST(ConfigParseTest, ParsesConfiguration) {
     
       CHECK_PARSE("AlignTrailingComments: Leave", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Leave, 0}));
    +                  {FormatStyle::TCAS_Leave, 0, true}));
       CHECK_PARSE("AlignTrailingComments: Always", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Always, 0}));
    +                  {FormatStyle::TCAS_Always, 0, true}));
       CHECK_PARSE("AlignTrailingComments: Never", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Never, 0}));
    +                  {FormatStyle::TCAS_Never, 0, true}));
       // For backwards compatibility
       CHECK_PARSE("AlignTrailingComments: true", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Always, 0}));
    +                  {FormatStyle::TCAS_Always, 0, true}));
       CHECK_PARSE("AlignTrailingComments: false", AlignTrailingComments,
                   FormatStyle::TrailingCommentsAlignmentStyle(
    -                  {FormatStyle::TCAS_Never, 0}));
    +                  {FormatStyle::TCAS_Never, 0, true}));
       CHECK_PARSE_NESTED_VALUE("Kind: Always", AlignTrailingComments, Kind,
                                FormatStyle::TCAS_Always);
       CHECK_PARSE_NESTED_VALUE("Kind: Never", AlignTrailingComments, Kind,
    @@ -598,6 +611,7 @@ TEST(ConfigParseTest, ParsesConfiguration) {
                                FormatStyle::TCAS_Leave);
       CHECK_PARSE_NESTED_VALUE("OverEmptyLines: 1234", AlignTrailingComments,
                                OverEmptyLines, 1234u);
    +  CHECK_PARSE_NESTED_BOOL(AlignTrailingComments, AlignPPAndNotPP);
     
       Style.UseTab = FormatStyle::UT_ForIndentation;
       CHECK_PARSE("UseTab: Never", UseTab, FormatStyle::UT_Never);
    diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
    index d45babe1b82ad..c9446fa3ff317 100644
    --- a/clang/unittests/Format/FormatTest.cpp
    +++ b/clang/unittests/Format/FormatTest.cpp
    @@ -5126,7 +5126,8 @@ TEST_F(FormatTest, DesignatedInitializers) {
     TEST_F(FormatTest, BracedInitializerIndentWidth) {
       auto Style = getLLVMStyleWithColumns(60);
       Style.BinPackArguments = true;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakAfterOpenBracketBracedList = true;
       Style.BracedInitializerIndentWidth = 6;
     
       // Non-initializing braces are unaffected by BracedInitializerIndentWidth.
    @@ -5302,7 +5303,8 @@ TEST_F(FormatTest, BracedInitializerIndentWidth) {
                    Style);
     
       // Aligning after open braces unaffected by BracedInitializerIndentWidth.
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_Align;
    +  Style.AlignAfterOpenBracket = true;
    +  Style.BreakAfterOpenBracketBracedList = false;
       verifyFormat("SomeStruct s{\"xxxxxxxxxxxxx\", \"yyyyyyyyyyyyy\",\n"
                    "             \"zzzzzzzzzzzzz\"};",
                    Style);
    @@ -7459,7 +7461,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
       Style.IndentWidth = 4;
       Style.TabWidth = 4;
       Style.UseTab = FormatStyle::UT_Always;
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.AlignOperands = FormatStyle::OAS_DontAlign;
       verifyFormat("return someVeryVeryLongConditionThatBarelyFitsOnALine\n"
                    "\t&& (someOtherLongishConditionPart1\n"
    @@ -7470,7 +7472,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) {
                    Style);
     
       Style = getLLVMStyleWithColumns(20);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BinPackParameters = FormatStyle::BPPS_OnePerLine;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment;
       Style.ContinuationIndentWidth = 2;
    @@ -7632,7 +7634,7 @@ TEST_F(FormatTest, NoOperandAlignment) {
                    "        * cccccccccccccccccccccccccccccccccccc;",
                    Style);
     
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       verifyFormat("return (a > b\n"
                    "    // comment1\n"
                    "    // comment2\n"
    @@ -11248,7 +11250,7 @@ TEST_F(FormatTest, BreakBeforeTemplateCloser) {
     
     TEST_F(FormatTest, WrapsTemplateParameters) {
       FormatStyle Style = getLLVMStyle();
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       verifyFormat(
           "template  struct q {};\n"
    @@ -11256,7 +11258,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
           "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
           "    y;",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_DontAlign;
    +  Style.AlignAfterOpenBracket = false;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
       verifyFormat(
           "template  struct r {};\n"
    @@ -11264,7 +11266,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
           "    aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaa>\n"
           "    y;",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_None;
       verifyFormat("template  struct s {};\n"
                    "extern s<\n"
    @@ -11274,7 +11276,7 @@ TEST_F(FormatTest, WrapsTemplateParameters) {
                    "aaaaaaaaaaaaaaaaaaaaaa>\n"
                    "    y;",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All;
       verifyFormat("template  struct t {};\n"
                    "extern t<\n"
    @@ -14302,7 +14304,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) {
                    "};",
                    NoBinPacking);
     
    -  NoBinPacking.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  NoBinPacking.BreakAfterOpenBracketBracedList = true;
       verifyFormat("static uint8 CddDp83848Reg[] = {\n"
                    "    CDDDP83848_BMCR_REGISTER,\n"
                    "    CDDDP83848_BMSR_REGISTER,\n"
    @@ -15972,13 +15974,14 @@ TEST_F(FormatTest, BreaksStringLiteralOperands) {
       // In a function call with two operands, with AlignAfterOpenBracket enabled,
       // the first must be broken with a line break before it.
       FormatStyle Style = getLLVMStyleWithColumns(25);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat("someFunction(\n"
                    "    \"long long long \"\n"
                    "    \"long\",\n"
                    "    a);",
                    "someFunction(\"long long long long\", a);", Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Style.BreakAfterOpenBracketFunction = true;
    +  Style.BreakBeforeCloseBracketFunction = true;
       verifyFormat("someFunction(\n"
                    "    \"long long long \"\n"
                    "    \"long\",\n"
    @@ -17773,7 +17776,7 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
     
       Spaces.ColumnLimit = 80;
       Spaces.IndentWidth = 4;
    -  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Spaces.BreakAfterOpenBracketFunction = true;
       verifyFormat("void foo( ) {\n"
                    "    size_t foo = (*(function))(\n"
                    "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
    @@ -17798,7 +17801,8 @@ TEST_F(FormatTest, ConfigurableSpacesInParens) {
                    "}",
                    Spaces);
     
    -  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent;
    +  Spaces.BreakAfterOpenBracketFunction = true;
    +  Spaces.BreakBeforeCloseBracketFunction = true;
       verifyFormat("void foo( ) {\n"
                    "    size_t foo = (*(function))(\n"
                    "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
    @@ -20820,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) {
                    "    argument1,\n"
                    "    argument2);",
                    Style);
    +
    +  Style.ColumnLimit = 45;
    +  verifyFormat("auto xxxxxxxx = foo;\n"
    +               "auto x = whatever ? some / long -\n"
    +               "                        computition / stuff\n"
    +               "                  : random;",
    +               Style);
     }
     
     TEST_F(FormatTest, AlignWithInitializerPeriods) {
    @@ -22261,6 +22272,19 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
                    "});",
                    Style);
     
    +  verifyNoCrash(
    +      "PANEL_Ic PANEL_ic[PANEL_IC_NUMBER] =\n"
    +      "    {\n"
    +      "        {PIC(0),   PIC(0),   PIC(99),  PIC(81),  0}, // Backbox\n"
    +      "        {PIC(1),   PIC(83),  PIC(191), PIC(137), 0}, // AK47\n"
    +      "\n"
    +      "#define PICALL1(a, b, c, d) \\\n"
    +      "    { PIC(a), PIC(b), PIC(c), PIC(d), 1 }\n"
    +      "\n"
    +      "        PICALL1(1, 1, 75, 50),\n"
    +      "};",
    +      Style);
    +
       Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign;
       verifyFormat("#define FOO \\\n"
                    "  int foo[][2] = { \\\n"
    @@ -22827,7 +22851,7 @@ TEST_F(FormatTest, ConstructorInitializerIndentWidth) {
           ": aaaaaaaaaaaaa(aaaaaaaaaaaaaa), aaaaaaaaaaaaa(aaaaaaaaaaaaaa),\n"
           "  aaaaaaaaaaaaa(aaaaaaaaaaaaaa) {}",
           Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       verifyFormat(
           "SomeLongTemplateVariableName<\n"
           "    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>",
    @@ -24082,7 +24106,7 @@ TEST_F(FormatTest, FormatsLambdas) {
                    "      return aFunkyFunctionCall(qux);\n"
                    "    }} {}",
                    Style);
    -  Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
    +  Style.BreakAfterOpenBracketFunction = true;
       // FIXME: The following test should pass, but fails at the time of writing.
     #if 0
       // As long as all the non-lambda arguments fit on a single line, AlwaysBreak
    diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp
    index 399f8357692ba..684d3014fa7bb 100644
    --- a/clang/unittests/Format/FormatTestComments.cpp
    +++ b/clang/unittests/Format/FormatTestComments.cpp
    @@ -99,14 +99,6 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
       const auto Style20 = getLLVMStyleWithColumns(20);
     
       verifyFormat("enum A {\n"
    -               "  // line a\n"
    -               "  a,\n"
    -               "  b, // line b\n"
    -               "\n"
    -               "  // line c\n"
    -               "  c\n"
    -               "};",
    -               "enum A {\n"
                    "  // line a\n"
                    "  a,\n"
                    "  b, // line b\n"
    @@ -115,15 +107,11 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                    "  c\n"
                    "};",
                    Style20);
    -  verifyFormat("enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "};",
    -               "enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "};",
    -               Style20);
    +  verifyNoChange("enum A {\n"
    +                 "  a, // line 1\n"
    +                 "  // line 2\n"
    +                 "};",
    +                 Style20);
       verifyFormat("enum A {\n"
                    "  a, // line 1\n"
                    "     // line 2\n"
    @@ -133,17 +121,12 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) {
                    "   // line 2\n"
                    "};",
                    Style20);
    -  verifyFormat("enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "  b\n"
    -               "};",
    -               "enum A {\n"
    -               "  a, // line 1\n"
    -               "  // line 2\n"
    -               "  b\n"
    -               "};",
    -               Style20);
    +  verifyNoChange("enum A {\n"
    +                 "  a, // line 1\n"
    +                 "  // line 2\n"
    +                 "  b\n"
    +                 "};",
    +                 Style20);
       verifyFormat("enum A {\n"
                    "  a, // line 1\n"
                    "     // line 2\n"
    @@ -487,12 +470,9 @@ TEST_F(FormatTestComments, AlignsBlockComments) {
                    " Don't try to outdent if there's not enough indentation.\n"
                    " */");
     
    -  verifyFormat("int i; /* Comment with empty...\n"
    -               "        *\n"
    -               "        * line. */",
    -               "int i; /* Comment with empty...\n"
    -               "        *\n"
    -               "        * line. */");
    +  verifyNoChange("int i; /* Comment with empty...\n"
    +                 "        *\n"
    +                 "        * line. */");
       verifyFormat("int foobar = 0; /* comment */\n"
                    "int bar = 0;    /* multiline\n"
                    "                   comment 1 */\n"
    @@ -555,8 +535,6 @@ TEST_F(FormatTestComments, CommentReflowingCanApplyOnlyToIndents) {
     
     TEST_F(FormatTestComments, CorrectlyHandlesLengthOfBlockComments) {
       verifyFormat("double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
    -               "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */",
    -               "double *x; /* aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
                    "              aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa */");
       verifyFormat(
           "void ffffffffffff(\n"
    @@ -680,17 +658,12 @@ TEST_F(FormatTestComments, SplitsLongCxxComments) {
       verifyFormat("#define XXX // q w e r\n"
                    "            // t y u i",
                    "#define XXX //q w e r t y u i", Style22);
    -  verifyFormat("{\n"
    -               "  //\n"
    -               "  //\\\n"
    -               "  // long 1 2 3 4 5\n"
    -               "}",
    -               "{\n"
    -               "  //\n"
    -               "  //\\\n"
    -               "  // long 1 2 3 4 5\n"
    -               "}",
    -               Style20);
    +  verifyNoChange("{\n"
    +                 "  //\n"
    +                 "  //\\\n"
    +                 "  // long 1 2 3 4 5\n"
    +                 "}",
    +                 Style20);
       verifyFormat("{\n"
                    "  //\n"
                    "  //\\\n"
    @@ -730,19 +703,13 @@ TEST_F(FormatTestComments, PreservesHangingIndentInCxxComments) {
     }
     
     TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) {
    -  verifyFormat("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    -               "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    -  verifyFormat("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    -               "int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    -               "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    -               getLLVMStyleWithColumns(50));
    +  verifyNoChange("// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    +                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\\n"
    +                 "// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    +  verifyNoChange("int a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    +                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    +                 "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
    +                 getLLVMStyleWithColumns(50));
       verifyFormat("double\n"
                    "    a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
                    "       // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n"
    @@ -773,10 +740,8 @@ TEST_F(FormatTestComments, DontIntroduceMultilineComments) {
     TEST_F(FormatTestComments, DontSplitLineCommentsWithPragmas) {
       FormatStyle Pragmas = getLLVMStyleWithColumns(30);
       Pragmas.CommentPragmas = "^ IWYU pragma:";
    -  verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb",
    -               "// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas);
    -  verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */",
    -               "/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas);
    +  verifyFormat("// IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb", Pragmas);
    +  verifyFormat("/* IWYU pragma: aaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbb */", Pragmas);
     }
     
     TEST_F(FormatTestComments, PriorityOfCommentBreaking) {
    @@ -812,26 +777,17 @@ TEST_F(FormatTestComments, PriorityOfCommentBreaking) {
     
     TEST_F(FormatTestComments, MultiLineCommentsInDefines) {
       const auto Style17 = getLLVMStyleWithColumns(17);
    -  verifyFormat("#define A(x) /* \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               "#define A(x) /* \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               Style17);
    -  verifyFormat("#define A(      \\\n"
    -               "    x) /*       \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               "#define A(      \\\n"
    -               "    x) /*       \\\n"
    -               "  a comment     \\\n"
    -               "  inside */     \\\n"
    -               "  f();",
    -               Style17);
    +  verifyNoChange("#define A(x) /* \\\n"
    +                 "  a comment     \\\n"
    +                 "  inside */     \\\n"
    +                 "  f();",
    +                 Style17);
    +  verifyNoChange("#define A(      \\\n"
    +                 "    x) /*       \\\n"
    +                 "  a comment     \\\n"
    +                 "  inside */     \\\n"
    +                 "  f();",
    +                 Style17);
     }
     
     TEST_F(FormatTestComments, LineCommentsInMacrosDoNotGetEscapedNewlines) {
    @@ -865,33 +821,20 @@ TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) {
     TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) {
       // Keep the current level if the comment was originally not aligned with
       // the preprocessor directive.
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "  /* comment */\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "  /* comment */\n"
    -               "\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "  /* comment */\n"
    +                 "\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
       verifyFormat("int f(int i) {\n"
                    "  if (true) {\n"
    @@ -1060,18 +1003,12 @@ TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) {
       // Align with the preprocessor directive if the comment was originally aligned
       // with the preprocessor directive and there is no newline between the comment
       // and the preprocessor directive.
    -  verifyFormat("void f() {\n"
    -               "  int i;\n"
    -               "/* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}",
    -               "void f() {\n"
    -               "  int i;\n"
    -               "/* comment */\n"
    -               "#ifdef A\n"
    -               "  int j;\n"
    -               "}");
    +  verifyNoChange("void f() {\n"
    +                 "  int i;\n"
    +                 "/* comment */\n"
    +                 "#ifdef A\n"
    +                 "  int j;\n"
    +                 "}");
     
       verifyFormat("int f(int i) {\n"
                    "  if (true) {\n"
    @@ -1245,13 +1182,10 @@ TEST_F(FormatTestComments, SplitsLongLinesInComments) {
                    "   wherever_a_space_occurs                             \n"
                    " */",
                    Style20);
    -  verifyFormat("/*\n"
    -               " *    This_comment_can_not_be_broken_into_lines\n"
    -               " */",
    -               "/*\n"
    -               " *    This_comment_can_not_be_broken_into_lines\n"
    -               " */",
    -               Style20);
    +  verifyNoChange("/*\n"
    +                 " *    This_comment_can_not_be_broken_into_lines\n"
    +                 " */",
    +                 Style20);
       verifyFormat("{\n"
                    "  /*\n"
                    "  This is another\n"
    @@ -1445,17 +1379,12 @@ TEST_F(FormatTestComments, AlignsPPElseEndifComments) {
                    "int iiii; // CC\n"
                    "#endif // B",
                    Style20);
    -  verifyFormat("#if A\n"
    -               "#else  // A1\n"
    -               "       // A2\n"
    -               "int ii;\n"
    -               "#endif // B",
    -               "#if A\n"
    -               "#else  // A1\n"
    -               "       // A2\n"
    -               "int ii;\n"
    -               "#endif // B",
    -               Style20);
    +  verifyNoChange("#if A\n"
    +                 "#else  // A1\n"
    +                 "       // A2\n"
    +                 "int ii;\n"
    +                 "#endif // B",
    +                 Style20);
     }
     
     TEST_F(FormatTestComments, CommentsInStaticInitializers) {
    @@ -1526,10 +1455,6 @@ TEST_F(FormatTestComments, CommentsInStaticInitializers) {
     
     TEST_F(FormatTestComments, LineCommentsAfterRightBrace) {
       verifyFormat("if (true) { // comment about branch\n"
    -               "  // comment about f\n"
    -               "  f();\n"
    -               "}",
    -               "if (true) { // comment about branch\n"
                    "  // comment about f\n"
                    "  f();\n"
                    "}");
    @@ -1582,6 +1507,7 @@ TEST_F(FormatTestComments, LineCommentsAfterRightBrace) {
     TEST_F(FormatTestComments, ReflowsComments) {
       const auto Style20 = getLLVMStyleWithColumns(20);
       const auto Style22 = getLLVMStyleWithColumns(22);
    +
       // Break a long line and reflow with the full next line.
       verifyFormat("// long long long\n"
                    "// long long",
    @@ -2149,11 +2075,9 @@ TEST_F(FormatTestComments, ReflowsComments) {
                    Style20);
     
       // Don't break or reflow comments on import lines.
    -  verifyFormat("#include \"t\" /* l l l\n"
    -               "                * l */",
    -               "#include \"t\" /* l l l\n"
    -               "                * l */",
    -               Style20);
    +  verifyNoChange("#include \"t\" /* l l l\n"
    +                 "                * l */",
    +                 Style20);
     
       // Don't reflow between different trailing comment sections.
       verifyFormat("int i; // long long\n"
    @@ -2209,7 +2133,9 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
                    "// some text that reflows\n"
                    "// into   foo",
                    Style);
    +
       Style.ColumnLimit = 21;
    +
       // Given one more column, "// reflows into   foo" does fit the limit, so we
       // do not compress the whitespace.
       verifyFormat("// some text that\n"
    @@ -2228,6 +2154,7 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
                    "// some text that reflows\n"
                    "// into1234567",
                    Style);
    +
       // Secondly, when the next line ends later, but the first word in that line
       // is precisely one column over the limit, do not reflow.
       verifyFormat("// some text that\n"
    @@ -2240,6 +2167,7 @@ TEST_F(FormatTestComments, ReflowsCommentsPrecise) {
     
     TEST_F(FormatTestComments, ReflowsCommentsWithExtraWhitespace) {
       const auto Style16 = getLLVMStyleWithColumns(16);
    +
       // Baseline.
       verifyFormat("// some text\n"
                    "// that re flows",
    @@ -2546,37 +2474,23 @@ TEST_F(FormatTestComments, BlockComments) {
     
       verifyFormat("void f(int * /* unused */) {}");
     
    -  verifyFormat("/*\n"
    -               " **\n"
    -               " */",
    -               "/*\n"
    -               " **\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " *q\n"
    -               " */",
    -               "/*\n"
    -               " *q\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " * q\n"
    -               " */",
    -               "/*\n"
    -               " * q\n"
    -               " */");
    -  verifyFormat("/*\n"
    -               " **/",
    -               "/*\n"
    -               " **/");
    -  verifyFormat("/*\n"
    -               " ***/",
    -               "/*\n"
    -               " ***/");
    +  verifyNoChange("/*\n"
    +                 " **\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " *q\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " * q\n"
    +                 " */");
    +  verifyNoChange("/*\n"
    +                 " **/");
    +  verifyNoChange("/*\n"
    +                 " ***/");
     }
     
     TEST_F(FormatTestComments, BlockCommentsInMacros) {
       const auto Style20 = getLLVMStyleWithColumns(20);
    -
       verifyFormat("#define A          \\\n"
                    "  {                \\\n"
                    "    /* one line */ \\\n"
    @@ -2599,7 +2513,6 @@ TEST_F(FormatTestComments, BlockCommentsInMacros) {
     
     TEST_F(FormatTestComments, BlockCommentsAtEndOfLine) {
       const auto Style15 = getLLVMStyleWithColumns(15);
    -
       verifyFormat("a = {\n"
                    "    1111 /*    */\n"
                    "};",
    @@ -2770,11 +2683,6 @@ TEST_F(FormatTestComments, AlignTrailingComments) {
       // Align comment line sections aligned with the next token with the next
       // token.
       verifyFormat("class A {\n"
    -               "public: // public comment\n"
    -               "  // comment about a\n"
    -               "  int a;\n"
    -               "};",
    -               "class A {\n"
                    "public: // public comment\n"
                    "  // comment about a\n"
                    "  int a;\n"
    @@ -3106,41 +3014,26 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
       FormatStyle Style = getLLVMStyle();
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Leave;
     
    -  verifyFormat("int a;// do not touch\n"
    -               "int b; // any comments\n"
    -               "int c;  // comment\n"
    -               "int d;   // comment",
    -               "int a;// do not touch\n"
    -               "int b; // any comments\n"
    -               "int c;  // comment\n"
    -               "int d;   // comment",
    -               Style);
    +  verifyNoChange("int a;// do not touch\n"
    +                 "int b; // any comments\n"
    +                 "int c;  // comment\n"
    +                 "int d;   // comment",
    +                 Style);
     
    -  verifyFormat("int a;   // do not touch\n"
    -               "int b;  // any comments\n"
    -               "int c; // comment\n"
    -               "int d;// comment",
    -               "int a;   // do not touch\n"
    -               "int b;  // any comments\n"
    -               "int c; // comment\n"
    -               "int d;// comment",
    -               Style);
    +  verifyNoChange("int a;   // do not touch\n"
    +                 "int b;  // any comments\n"
    +                 "int c; // comment\n"
    +                 "int d;// comment",
    +                 Style);
     
    -  verifyFormat("// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               "// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               Style);
    +  verifyNoChange("// do not touch\n"
    +                 "int a;  // any comments\n"
    +                 "\n"
    +                 "   // comment\n"
    +                 "// comment\n"
    +                 "\n"
    +                 "// comment",
    +                 Style);
     
       verifyFormat("// do not touch\n"
                    "int a;  // any comments\n"
    @@ -3178,23 +3071,15 @@ TEST_F(FormatTestComments, AlignTrailingCommentsLeave) {
     
       // Allow to keep 2 empty lines
       Style.MaxEmptyLinesToKeep = 2;
    -  verifyFormat("// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               "// do not touch\n"
    -               "int a;  // any comments\n"
    -               "\n"
    -               "\n"
    -               "   // comment\n"
    -               "// comment\n"
    -               "\n"
    -               "// comment",
    -               Style);
    +  verifyNoChange("// do not touch\n"
    +                 "int a;  // any comments\n"
    +                 "\n"
    +                 "\n"
    +                 "   // comment\n"
    +                 "// comment\n"
    +                 "\n"
    +                 "// comment",
    +                 Style);
       Style.MaxEmptyLinesToKeep = 1;
     
       // Just format comments normally when leaving exceeds the column limit
    @@ -3233,16 +3118,16 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
       Style.NamespaceMacros.push_back("TESTSUITE");
       Style.ShortNamespaceLines = 0;
     
    -  StringRef Input = "namespace A {\n"
    -                    "  TESTSUITE(B) {\n"
    -                    "    namespace C {\n"
    -                    "      namespace D { //\n"
    -                    "      } // namespace D\n"
    -                    "      std::string Foo = Bar; // Comment\n"
    -                    "      std::string BazString = Baz;   // C2\n"
    -                    "    }          // namespace C\n"
    -                    "  }\n"
    -                    "} // NaMeSpAcE A";
    +  constexpr StringRef Input("namespace A {\n"
    +                            "  TESTSUITE(B) {\n"
    +                            "    namespace C {\n"
    +                            "      namespace D { //\n"
    +                            "      } // namespace D\n"
    +                            "      std::string Foo = Bar; // Comment\n"
    +                            "      std::string BazString = Baz;   // C2\n"
    +                            "    }          // namespace C\n"
    +                            "  }\n"
    +                            "} // NaMeSpAcE A");
     
       EXPECT_TRUE(Style.FixNamespaceComments);
       EXPECT_EQ(Style.AlignTrailingComments.Kind, FormatStyle::TCAS_Always);
    @@ -3326,21 +3211,21 @@ TEST_F(FormatTestComments, DontAlignNamespaceComments) {
     
       Style.AlignTrailingComments.Kind = FormatStyle::TCAS_Always;
       Style.FixNamespaceComments = true;
    -  Input = "namespace A {\n"
    -          "  int Foo;\n"
    -          "  int Bar;\n"
    -          "}\n"
    -          "// Comment";
    +  constexpr StringRef Code("namespace A {\n"
    +                           "  int Foo;\n"
    +                           "  int Bar;\n"
    +                           "}\n"
    +                           "// Comment");
     
       verifyFormat("namespace A {\n"
                    "  int Foo;\n"
                    "  int Bar;\n"
                    "} // namespace A\n"
                    "// Comment",
    -               Input, Style);
    +               Code, Style);
     
       Style.FixNamespaceComments = false;
    -  verifyFormat(Input, Style);
    +  verifyFormat(Code, Style);
     }
     
     TEST_F(FormatTestComments, DontAlignOverScope) {
    @@ -3493,15 +3378,73 @@ TEST_F(FormatTestComments, DontAlignOverScope) {
                    "int foobar; // group");
     }
     
    +TEST_F(FormatTestComments, DontAlignOverPPDirective) {
    +  auto Style = getLLVMStyle();
    +  Style.AlignTrailingComments.AlignPPAndNotPP = false;
    +
    +  verifyFormat("int i;    // Aligned\n"
    +               "int long; // with this\n"
    +               "#define FOO    // only aligned\n"
    +               "#define LOOONG // with other pp directives\n"
    +               "int loooong; // new alignment",
    +               "int i;//Aligned\n"
    +               "int long;//with this\n"
    +               "#define FOO //only aligned\n"
    +               "#define LOOONG //with other pp directives\n"
    +               "int loooong; //new alignment",
    +               Style);
    +
    +  verifyFormat("#define A  // Comment\n"
    +               "#define AB // Comment",
    +               Style);
    +
    +  Style.ColumnLimit = 30;
    +  verifyNoChange("#define A // Comment\n"
    +                 "          // Continued\n"
    +                 "int i = 0; // New Stuff\n"
    +                 "           // Continued\n"
    +                 "#define Func(X)              \\\n"
    +                 "  X();                       \\\n"
    +                 "  X(); // Comment\n"
    +                 "       // Continued\n"
    +                 "long loong = 1; // Dont align",
    +                 Style);
    +
    +  verifyFormat("#define A   // Comment that\n"
    +               "            // would wrap\n"
    +               "#define FOO // For the\n"
    +               "            // alignment\n"
    +               "#define B   // Also\n"
    +               "            // aligned",
    +               "#define A // Comment that would wrap\n"
    +               "#define FOO // For the alignment\n"
    +               "#define B // Also\n"
    +               " // aligned",
    +               Style);
    +
    +  Style.AlignTrailingComments.OverEmptyLines = 1;
    +  verifyNoChange("#define A // Comment\n"
    +                 "\n"
    +                 "          // Continued\n"
    +                 "int i = 0; // New Stuff\n"
    +                 "\n"
    +                 "           // Continued\n"
    +                 "#define Func(X)              \\\n"
    +                 "  X();                       \\\n"
    +                 "  X(); // Comment\n"
    +                 "\n"
    +                 "       // Continued\n"
    +                 "long loong = 1; // Dont align",
    +                 Style);
    +}
    +
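
The new `DontAlignOverPPDirective` test above exercises the nested `AlignTrailingComments.AlignPPAndNotPP` option this patch adds (and that ConfigParseTest now round-trips): when false, trailing comments on preprocessor directives align only with other directives, and the surrounding declarations form their own alignment group. Enabling it programmatically looks roughly like this sketch, assuming the new member:

```cpp
#include "clang/Format/Format.h"

// Assumes the AlignPPAndNotPP field introduced by this patch; it defaults
// to true, preserving the old cross-directive alignment behavior.
clang::format::FormatStyle noCrossPPCommentAlignment() {
  auto Style = clang::format::getLLVMStyle();
  Style.AlignTrailingComments.AlignPPAndNotPP = false;
  return Style;
}
```
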
     TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
       verifyFormat("/*\n"
                    " */",
                    "/*\n"
                    "*/");
    -  verifyFormat("/*\n"
    -               " */",
    -               "/*\n"
    -               " */");
    +  verifyNoChange("/*\n"
    +                 " */");
       verifyFormat("/*\n"
                    " */",
                    "/*\n"
    @@ -3512,10 +3455,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    " * line */",
                    "/*\n"
                    "* line */");
    -  verifyFormat("/*\n"
    -               " * line */",
    -               "/*\n"
    -               " * line */");
    +  verifyNoChange("/*\n"
    +                 " * line */");
       verifyFormat("/*\n"
                    " * line */",
                    "/*\n"
    @@ -3528,10 +3469,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    " * line */",
                    "/**\n"
                    "* line */");
    -  verifyFormat("/**\n"
    -               " * line */",
    -               "/**\n"
    -               " * line */");
    +  verifyNoChange("/**\n"
    +                 " * line */");
       verifyFormat("/**\n"
                    " * line */",
                    "/**\n"
    @@ -3566,10 +3505,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    "  */");
     
       // Align two lines.
    -  verifyFormat("/* line 1\n"
    -               " * line 2 */",
    -               "/* line 1\n"
    -               " * line 2 */");
    +  verifyNoChange("/* line 1\n"
    +                 " * line 2 */");
       verifyFormat("/* line 1\n"
                    " * line 2 */",
                    "/* line 1\n"
    @@ -3590,10 +3527,8 @@ TEST_F(FormatTestComments, AlignsBlockCommentDecorations) {
                    "        * line 2 */",
                    "int i; /* line 1\n"
                    "* line 2 */");
    -  verifyFormat("int i; /* line 1\n"
    -               "        * line 2 */",
    -               "int i; /* line 1\n"
    -               "        * line 2 */");
    +  verifyNoChange("int i; /* line 1\n"
    +                 "        * line 2 */");
       verifyFormat("int i; /* line 1\n"
                    "        * line 2 */",
                    "int i; /* line 1\n"
    @@ -3695,6 +3630,7 @@ TEST_F(FormatTestComments, NonTrailingBlockComments) {
     
     TEST_F(FormatTestComments, PythonStyleComments) {
       const auto ProtoStyle20 = getTextProtoStyleWithColumns(20);
    +
       // Keeps a space after '#'.
       verifyFormat("# comment\n"
                    "key: value",
    @@ -3798,8 +3734,6 @@ TEST_F(FormatTestComments, ReflowBackslashCrash) {
     TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
       FormatStyle Style = getGoogleStyle(FormatStyle::LK_Java);
       Style.ColumnLimit = 60;
    -  FormatStyle Style20 = getGoogleStyle(FormatStyle::LK_Java);
    -  Style20.ColumnLimit = 20;
       verifyFormat("/**\n"
                    " * @param x long long long long long long long long long\n"
                    " *     long\n"
    @@ -3827,6 +3761,10 @@ TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
                    "long long long long long long long long long long long\n"
                    " */",
                    Style);
    +
    +  FormatStyle Style20 = getGoogleStyle(FormatStyle::LK_Java);
    +  Style20.ColumnLimit = 20;
    +
       verifyFormat("/**\n"
                    " * Sentence that\n"
                    " * should be broken.\n"
    @@ -3895,7 +3833,6 @@ TEST_F(FormatTestComments, IndentsLongJavadocAnnotatedLines) {
     }
     
     TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
    -  FormatStyle Style = getLLVMStyle();
       constexpr StringRef NoTextInComment(" //       \n"
                                           "\n"
                                           "void foo() {// \n"
    @@ -3907,8 +3844,9 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
                    "void foo() { //\n"
                    "  //\n"
                    "}",
    -               NoTextInComment, Style);
    +               NoTextInComment);
     
    +  auto Style = getLLVMStyle();
       Style.SpacesInLineCommentPrefix.Minimum = 0;
       verifyFormat("//#comment", Style);
       verifyFormat("//\n"
    @@ -3927,7 +3865,6 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
                    "}",
                    NoTextInComment, Style);
     
    -  Style = getLLVMStyle();
       constexpr StringRef Code(
           "//Free comment without space\n"
           "\n"
    @@ -4216,8 +4153,9 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
           "//  vv will only move\n"
           "//  } if the line above does");
     
    -  verifyFormat(Code2, Code, Style);
    +  verifyFormat(Code2, Code);
     
    +  Style = getLLVMStyle();
       Style.SpacesInLineCommentPrefix = {0, 0};
       verifyFormat("//#comment", "//   #comment", Style);
       verifyFormat(Code3, Code, Style);
    @@ -4226,12 +4164,12 @@ TEST_F(FormatTestComments, SpaceAtLineCommentBegin) {
       verifyFormat(Code4, Code, Style);
     
       Style = getLLVMStyleWithColumns(20);
    -  StringRef WrapCode = "//Lorem ipsum dolor sit amet\n"
    -                       "\n"
    -                       "//  Lorem   ipsum   dolor   sit   amet\n"
    -                       "\n"
    -                       "void f() {//Hello World\n"
    -                       "}";
    +  constexpr StringRef WrapCode("//Lorem ipsum dolor sit amet\n"
    +                               "\n"
    +                               "//  Lorem   ipsum   dolor   sit   amet\n"
    +                               "\n"
    +                               "void f() {//Hello World\n"
    +                               "}");
     
       verifyFormat("// Lorem ipsum dolor\n"
                    "// sit amet\n"
    @@ -4506,20 +4444,20 @@ TEST_F(FormatTestComments, SplitCommentIntroducers) {
     }
     
     TEST_F(FormatTestComments, LineCommentsOnStartOfFunctionCall) {
    -  auto Style = getLLVMStyle();
    -
    -  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
       verifyFormat("Type name{// Comment\n"
    -               "          value};",
    -               Style);
    +               "          value};");
     
    +  auto Style = getLLVMStyle();
    +  EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
       Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
    +
       verifyFormat("Type name{ // Comment\n"
                    "           value\n"
                    "};",
                    Style);
     
       Style.Cpp11BracedListStyle = FormatStyle::BLS_FunctionCall;
    +
       verifyFormat("Type name{ // Comment\n"
                    "    value};",
                    Style);
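The FormatTestComments hunks above fold `verifyFormat(Expected, Code, ...)` calls whose two string arguments are identical into `verifyNoChange`. A hypothetical simplification of the relationship (the real helpers live in clang/unittests/Format/FormatTestBase.h and also check formatting stability, which this sketch omits):

```cpp
// Sketch only -- assumes a format() helper as used throughout these tests.
void verifyFormatSketch(llvm::StringRef Expected, llvm::StringRef Code,
                        const clang::format::FormatStyle &Style) {
  EXPECT_EQ(Expected, format(Code, Style));
}
void verifyNoChangeSketch(llvm::StringRef Code,
                          const clang::format::FormatStyle &Style) {
  // Same check with Expected == Code: already-formatted input is a no-op.
  EXPECT_EQ(Code, format(Code, Style));
}
```

Spelling the intent directly also drops the duplicated string literal, which is why each hunk shrinks by two lines.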
    diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
    index 91577b9a49167..4847151c14b33 100644
    --- a/clang/unittests/Format/FormatTestJS.cpp
    +++ b/clang/unittests/Format/FormatTestJS.cpp
    @@ -2883,7 +2883,7 @@ TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) {
     
     TEST_F(FormatTestJS, BreakAfterOpenBracket) {
       auto Style = getGoogleStyle(FormatStyle::LK_JavaScript);
    -  EXPECT_EQ(Style.AlignAfterOpenBracket, FormatStyle::BAS_AlwaysBreak);
    +  EXPECT_EQ(Style.BreakAfterOpenBracketFunction, true);
       verifyFormat("ctrl.onCopy(/** @type {!WizEvent}*/ (\n"
                    "    {event, targetElement: {el: () => selectedElement}}));",
                    Style);
    diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
    index 1416614bae29a..3cc97e2dc0b2e 100644
    --- a/clang/unittests/Format/FormatTestJava.cpp
    +++ b/clang/unittests/Format/FormatTestJava.cpp
    @@ -848,6 +848,19 @@ TEST_F(FormatTestJava, TextBlock) {
                      "              Pat Q. Smith");
     }
     
    +TEST_F(FormatTestJava, BreakAfterRecord) {
    +  auto Style = getLLVMStyle(FormatStyle::LK_Java);
    +  Style.EmptyLineBeforeAccessModifier = FormatStyle::ELBAMS_Never;
    +  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
    +  Style.BraceWrapping.AfterClass = true;
    +  Style.BraceWrapping.SplitEmptyRecord = true;
    +
    +  verifyFormat("public record Foo(int i)\n"
    +               "{\n"
    +               "}",
    +               "public record Foo(int i) {}", Style);
    +}
    +
     } // namespace
     } // namespace test
     } // namespace format
    diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
    index c046142c613b0..815c79e68dac9 100644
    --- a/clang/unittests/Format/TokenAnnotatorTest.cpp
    +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
    @@ -799,6 +799,30 @@ TEST_F(TokenAnnotatorTest, UnderstandsTemplateTemplateParameters) {
       EXPECT_TOKEN(Tokens[23], tok::identifier, TT_ClassHeadName);
     }
     
    +TEST_F(TokenAnnotatorTest, UnderstandsCommonCppTemplates) {
    +  auto Tokens =
    +      annotate("static_assert(std::conditional_t::value);");
    +  ASSERT_EQ(Tokens.size(), 19u) << Tokens;
    +  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
    +
    +  Tokens =
    +      annotate("static_assert(std::conditional::type::value);");
    +  ASSERT_EQ(Tokens.size(), 21u) << Tokens;
    +  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser);
    +
    +  Tokens = annotate("static_assert(fancy_v);");
    +  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
    +  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
    +
    +  Tokens = annotate("static_assert(fancy::value);");
    +  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
    +  EXPECT_TOKEN(Tokens[3], tok::less, TT_TemplateOpener);
    +  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
    +}
    +
     TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) {
       FormatStyle Style = getLLVMStyle();
       Style.WhitespaceSensitiveMacros.push_back("FOO");
    @@ -2686,6 +2710,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
       // precedence.
       std::pair JoinedBinary[] = {
           {prec::Comma, "->"},        {prec::Comma, "<->"},
    +      {prec::Comma, "#-#"},       {prec::Comma, "#=#"},
           {prec::Assignment, "+="},   {prec::Assignment, "-="},
           {prec::Assignment, "*="},   {prec::Assignment, "/="},
           {prec::Assignment, "%="},   {prec::Assignment, "&="},
    diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
    index e544c892635e8..3b18aa8360a6e 100644
    --- a/clang/unittests/Support/TimeProfilerTest.cpp
    +++ b/clang/unittests/Support/TimeProfilerTest.cpp
    @@ -186,7 +186,8 @@ std::string buildTraceGraph(StringRef Json) {
     
     } // namespace
     
    -TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
    +// FIXME: Flaky test. See https://github.com/llvm/llvm-project/pull/138613
    +TEST(TimeProfilerTest, DISABLED_ConstantEvaluationCxx20) {
       std::string Code = R"(
     void print(double value);
     
    diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp
    index adf5e74ea3192..a1fcbb023832f 100644
    --- a/clang/unittests/Tooling/RangeSelectorTest.cpp
    +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp
    @@ -527,6 +527,31 @@ TEST(RangeSelectorTest, NameOpDeclRefError) {
               AllOf(HasSubstr(Ref), HasSubstr("requires property 'identifier'")))));
     }
     
    +TEST(RangeSelectorTest, NameOpDeclInMacroArg) {
    +  StringRef Code = R"cc(
    +  #define MACRO(name) int name;
    +  MACRO(x)
    +  )cc";
    +  const char *ID = "id";
    +  TestMatch Match = matchCode(Code, varDecl().bind(ID));
    +  EXPECT_THAT_EXPECTED(select(name(ID), Match), HasValue("x"));
    +}
    +
    +TEST(RangeSelectorTest, NameOpDeclInMacroBodyError) {
    +  StringRef Code = R"cc(
    +  #define MACRO int x;
    +  MACRO
    +  )cc";
    +  const char *ID = "id";
    +  TestMatch Match = matchCode(Code, varDecl().bind(ID));
    +  EXPECT_THAT_EXPECTED(
    +      name(ID)(Match.Result),
    +      Failed(testing::Property(
    +          &StringError::getMessage,
    +          AllOf(HasSubstr("range selected by name(node id="),
    +                HasSubstr("' is different from decl name 'x'")))));
    +}
    +
     TEST(RangeSelectorTest, CallArgsOp) {
       const StringRef Code = R"cc(
         struct C {
    diff --git a/clang/utils/CmpDriver b/clang/utils/CmpDriver
    index 12ce7a3250f66..0732baa76d01c 100755
    --- a/clang/utils/CmpDriver
    +++ b/clang/utils/CmpDriver
    @@ -5,6 +5,7 @@ A simple utility that compares tool invocations and exit codes issued by
     compiler drivers that support -### (e.g. gcc and clang).
     """
     
    +from itertools import zip_longest
     import subprocess
     
     def splitArgs(s):
    @@ -22,7 +23,7 @@ def splitArgs(s):
             elif inQuote:
                 if c == '\\':
                     current += c
    -                current += it.next()
    +                current += next(it)
                 else:
                     current += c
             elif not c.isspace():
    @@ -135,77 +136,77 @@ def main():
     
         # Compare stdout.
         if infoA.stdout != infoB.stdout:
    -        print '-- STDOUT DIFFERS -'
    -        print 'A OUTPUT: ',infoA.stdout
    -        print 'B OUTPUT: ',infoB.stdout
    -        print
    +        print('-- STDOUT DIFFERS -')
    +        print('A OUTPUT: ',infoA.stdout)
    +        print('B OUTPUT: ',infoB.stdout)
    +        print()
     
             diff = ZipperDiff(infoA.stdout.split('\n'),
                               infoB.stdout.split('\n'))
             for i,(aElt,bElt) in enumerate(diff.getDiffs()):
                 if aElt is None:
    -                print 'A missing: %s' % bElt
    +                print('A missing: %s' % bElt)
                 elif bElt is None:
    -                print 'B missing: %s' % aElt
    +                print('B missing: %s' % aElt)
                 else:
    -                print 'mismatch: A: %s' % aElt
    -                print '          B: %s' % bElt
    +                print('mismatch: A: %s' % aElt)
    +                print('          B: %s' % bElt)
     
             differ = True
     
         # Compare stderr.
         if infoA.stderr != infoB.stderr:
    -        print '-- STDERR DIFFERS -'
    -        print 'A STDERR: ',infoA.stderr
    -        print 'B STDERR: ',infoB.stderr
    -        print
    +        print('-- STDERR DIFFERS -')
    +        print('A STDERR: ',infoA.stderr)
    +        print('B STDERR: ',infoB.stderr)
    +        print()
     
             diff = ZipperDiff(infoA.stderr.split('\n'),
                               infoB.stderr.split('\n'))
             for i,(aElt,bElt) in enumerate(diff.getDiffs()):
                 if aElt is None:
    -                print 'A missing: %s' % bElt
    +                print('A missing: %s' % bElt)
                 elif bElt is None:
    -                print 'B missing: %s' % aElt
    +                print('B missing: %s' % aElt)
                 else:
    -                print 'mismatch: A: %s' % aElt
    -                print '          B: %s' % bElt
    +                print('mismatch: A: %s' % aElt)
    +                print('          B: %s' % bElt)
     
             differ = True
     
         # Compare commands.
    -    for i,(a,b) in enumerate(map(None, infoA.commands, infoB.commands)):
    +    for i,(a,b) in enumerate(zip_longest(infoA.commands, infoB.commands, fillvalue=None)):
             if a is None:
    -            print 'A MISSING:',' '.join(b)
    +            print('A MISSING:',' '.join(b))
                 differ = True
                 continue
             elif b is None:
    -            print 'B MISSING:',' '.join(a)
    +            print('B MISSING:',' '.join(a))
                 differ = True
                 continue
     
             diff = DriverZipperDiff(a,b)
             diffs = list(diff.getDiffs())
             if diffs:
    -            print '-- COMMAND %d DIFFERS -' % i
    -            print 'A COMMAND:',' '.join(a)
    -            print 'B COMMAND:',' '.join(b)
    -            print
    +            print('-- COMMAND %d DIFFERS -' % i)
    +            print('A COMMAND:',' '.join(a))
    +            print('B COMMAND:',' '.join(b))
    +            print()
                 for i,(aElt,bElt) in enumerate(diffs):
                     if aElt is None:
    -                    print 'A missing: %s' % bElt
    +                    print('A missing: %s' % bElt)
                     elif bElt is None:
    -                    print 'B missing: %s' % aElt
    +                    print('B missing: %s' % aElt)
                     else:
    -                    print 'mismatch: A: %s' % aElt
    -                    print '          B: %s' % bElt
    +                    print('mismatch: A: %s' % aElt)
    +                    print('          B: %s' % bElt)
                 differ = True
         
         # Compare result codes.
         if infoA.exitCode != infoB.exitCode:
    -        print '-- EXIT CODES DIFFER -'
    -        print 'A: ',infoA.exitCode
    -        print 'B: ',infoB.exitCode
    +        print('-- EXIT CODES DIFFER -')
    +        print('A: ',infoA.exitCode)
    +        print('B: ',infoB.exitCode)
             differ = True
     
         if differ:
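The key functional fix in the hunk above: Python 2's `map(None, a, b)` zipped to the longer input, padding the shorter one with `None`; Python 3 removed that form, and `itertools.zip_longest` is the direct replacement. For readers following the comparison logic, the same padded-zip idea in C++ (illustrative sketch only, hypothetical names):

```cpp
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

// Pair elements of A and B up to the longer of the two, padding the shorter
// side with std::nullopt -- what zip_longest(..., fillvalue=None) does.
template <typename T>
std::vector<std::pair<std::optional<T>, std::optional<T>>>
zipLongest(const std::vector<T> &A, const std::vector<T> &B) {
  std::vector<std::pair<std::optional<T>, std::optional<T>>> Out;
  for (size_t I = 0, E = std::max(A.size(), B.size()); I != E; ++I)
    Out.emplace_back(I < A.size() ? std::optional<T>(A[I]) : std::nullopt,
                     I < B.size() ? std::optional<T>(B[I]) : std::nullopt);
  return Out;
}
```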
    diff --git a/clang/utils/check_cfc/check_cfc.py b/clang/utils/check_cfc/check_cfc.py
    index 8d42ec532bbb7..7658f6c27009b 100755
    --- a/clang/utils/check_cfc/check_cfc.py
    +++ b/clang/utils/check_cfc/check_cfc.py
    @@ -56,11 +56,7 @@
     import subprocess
     import sys
     import tempfile
    -
    -try:
    -    import configparser
    -except ImportError:
    -    import ConfigParser as configparser
    +import configparser
     import io
     
     import obj_diff
    diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html
    index ae0ec1d4d12cb..3e5e84b5b2ed4 100755
    --- a/clang/www/OpenProjects.html
    +++ b/clang/www/OpenProjects.html
@@ -38,7 +38,7 @@ Open Clang Projects
 documenting diagnostic group flags (adding code examples of what is
 diagnosed, or other relevant information), or
[this hunk's -/+ pair was lost when the patch was rendered as HTML: it
rewords the "documenting ... command line options, or" list item; the exact
markup is unrecoverable]
 help with completing other missing documentation.
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index b8039622fe694..ccf19ea86957a 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -324,12 +324,12 @@ C2y implementation status
[table markup lost in rendering; recoverable changes: "Matching of
Multi-Dimensional Arrays in Generic Selection Expressions" (N3348) moves
from "Unknown" to "No", and "The __COUNTER__ predefined macro" (N3457)
moves from "Unknown" to "Clang 22"; "Chasing Ghosts I: constant expressions
v2" is unchanged context]
@@ -344,7 +344,7 @@ C2y implementation status
[likewise: "static_assert without UB" (N3525) moves from "Unknown" to
"Yes"; "Allow calling static inline within extern inline" is unchanged
context]
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index ae9b28ee625cd..e9fadb2dbd4ac 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -40,2843 +40,3314 @@ C++ defect report implementation status
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
[the regenerated status table survives only as runs of +/- markers; per the
hunk header it grows from 2843 to 3314 lines, consistent with the new
Section column emitted by the make_cxx_dr_status changes below]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 485a9a56267ca..3ba12e13a7354 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
[note: HTML-tag string literals and regex group names in this file's hunks
were partially eaten during rendering; surviving text is kept verbatim]
@@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
 dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
 
 class DR:
-  def __init__(self, section, issue, url, status, title):
-    self.section, self.issue, self.url, self.status, self.title = \
-      section, issue, url, status, title
+  def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title):
+    self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \
+      section_number, section_name, section_link, number, url, status, liaison, title
 
   def __repr__(self):
-    return '%s (%s): %s' % (self.issue, self.status, self.title)
-
-def parse(dr):
-  try:
-    section, issue_link, status, liaison, title = [
-      col.split('>', 1)[1].split('')[0]
-      for col in dr.split('', 1)[0].split('', 1)[1].split('<', 1)[0])
-    title = title.replace('', '').replace('', '').replace('\r\n', '\n').strip()
-    return DR(section, issue, url, status, title)
+    return '%s (%s): %s' % (self.number, self.status, self.title)
+
+  pattern = re.compile('''
+(?P.*) (?P.*)
+
+(?P.*)
+(?P.*)
+(?P.*)
+(?P[\\w\\W]*)</issue_title></TD>
+</TR>''')
+
+  @classmethod
+  def parse_from_html(cls, html_string):
+    match = cls.pattern.match(html_string)
+    if match is None:
+      print(f"Parse error: {html_string}", file=sys.stderr)
+      exit(1)
+    return cls(
+        section_number=match.group('section_number'),
+        section_name=match.group('section_name'),
+        section_link=match.group('section_link'),
+        number=int(match.group('number')),
+        url=match.group('url'),
+        status=match.group('status'),
+        liaison=match.group('liaison'),
+        title=match.group('title').replace('\n', ' ').strip())
 
 def collect_tests():
   status_re = re.compile(r'\bcwg([0-9]+): (.*)')
@@ -68,8 +78,8 @@ def get_issues(path):
     print(ex, file=sys.stderr)
     sys.exit(1)
 
-  return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]),
-                key = lambda dr: dr.issue)
+  return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]),
+                key = lambda dr: dr.number)
 
 issue_list_path = None
@@ -127,9 +137,10 @@ out_html.append('''\
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
 <tr>
   <th>Number</th>
+  <th>Section</th>
   <th>Status</th>
   <th>Issue title</th>
   <th>Available in Clang?</th>
@@ -149,7 +160,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -236,7 +247,7 @@ def availability(issue):
     avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
     _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -254,7 +265,7 @@ for dr in drs:
   elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'):
     row_style = ' class="open"'
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
     except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -267,12 +278,12 @@ for dr in drs:
     if unresolved_status != dr.status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
-            % (dr.issue, unresolved_status, dr.status))
+            % (dr.number, unresolved_status, dr.status))
       continue
   else:
     row_style = ''
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
    except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -281,7 +292,7 @@ for dr in drs:
     if unresolved_status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', even though it is resolved in CWG index" \
-            % (dr.issue, unresolved_status))
+            % (dr.number, unresolved_status))
       continue
 
   if not avail.startswith('Sup') and not avail.startswith('Dup'):
@@ -297,8 +308,9 @@ for dr in drs:
 {details}
 </details>'''
   out_html.append(f'''
-  <tr{row_style} id="{dr.issue}">
-    <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td>
+  <tr{row_style} id="{dr.number}">
+    <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td>
+    <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td>
     <td>{dr.status}</td>
     <td>{dr.title}</td>
     <td{avail_style} align="center">{avail}</td>
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index a9e8899f8ae0c..1ed4e66d5622f 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -605,6 +605,10 @@ string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}
   list(APPEND COMPILER_RT_COMMON_CFLAGS ${stdlib_flag})
   list(APPEND COMPILER_RT_COMMON_LINK_FLAGS ${stdlib_flag})
 
+# Add assembler flags for execute-only code generation. C and C++ flags should
+# have already been added to CMAKE_C_FLAGS and CMAKE_CXX_FLAGS.
+append_string_if(RUNTIMES_EXECUTE_ONLY_CODE -DCOMPILER_RT_EXECUTE_ONLY_CODE CMAKE_ASM_FLAGS)
+
 # TODO: There's a lot of duplication across lib/*/tests/CMakeLists.txt files,
 # move some of the common flags to COMPILER_RT_UNITTEST_CFLAGS.
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 0496f240dc823..46d6bb5bd8896 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 10
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 12
+#define INSTR_PROF_INDEX_VERSION 13
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py
index 8ecd66c745119..091e9bcc9a796 100755
--- a/compiler-rt/lib/asan/scripts/asan_symbolize.py
+++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py
@@ -59,6 +59,7 @@ def is_valid_arch(s):
         "armv7s",
         "armv7k",
         "arm64",
+        "arm64e",
         "powerpc64",
         "powerpc64le",
         "s390x",
diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp
index 2d23a12cc6ae2..06c9fdc9b23db 100644
--- a/compiler-rt/lib/asan/tests/asan_test.cpp
+++ b/compiler-rt/lib/asan/tests/asan_test.cpp
@@ -1115,15 +1115,37 @@ TEST(AddressSanitizer, StressStackReuseTest) {
   LotsOfStackReuse();
 }
 
+// On some platforms (e.g. AIX), the default thread stack size (~96 KB) is
+// insufficient for this test and can lead to stack overflows.
+#define MIN_STACK_SIZE (128 * 1024)  // 128 KB
 TEST(AddressSanitizer, ThreadedStressStackReuseTest) {
   const int kNumThreads = 20;
   pthread_t t[kNumThreads];
+// pthread_attr isn't supported on Windows.
+#ifndef _WIN32
+  size_t curStackSize = 0;
+  pthread_attr_t attr;
+  pthread_attr_init(&attr);
+  // Get the current (default) thread stack size.
+  pthread_attr_getstacksize(&attr, &curStackSize);
+  if (curStackSize < MIN_STACK_SIZE) {
+    int rc = pthread_attr_setstacksize(&attr, MIN_STACK_SIZE);
+    ASSERT_EQ(0, rc);
+  }
+#endif
   for (int i = 0; i < kNumThreads; i++) {
-    PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))LotsOfStackReuse, 0);
+#ifdef _WIN32
+    PTHREAD_CREATE(&t[i], 0, (void* (*)(void* x))LotsOfStackReuse, 0);
+#else
+    PTHREAD_CREATE(&t[i], &attr, (void* (*)(void* x))LotsOfStackReuse, 0);
+#endif
   }
   for (int i = 0; i < kNumThreads; i++) {
     PTHREAD_JOIN(t[i], 0);
   }
+#ifndef _WIN32
+  pthread_attr_destroy(&attr);
+#endif
 }
 
 // pthread_exit tries to perform unwinding stuff that leads to dlopen'ing
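The test change above follows a standard pattern: keep the platform's default stack size unless it falls below a required minimum. Stripped of the test's PTHREAD_CREATE wrappers and _WIN32 guards, a standalone sketch (hypothetical names, plain pthreads):

```cpp
#include <pthread.h>
#include <stddef.h>

static void *Worker(void *) { return nullptr; }

// Create a thread whose stack is at least MinStack bytes, raising the
// platform default only when it is too small.
static int CreateWithMinStack(pthread_t *T, size_t MinStack) {
  pthread_attr_t Attr;
  pthread_attr_init(&Attr);
  size_t Cur = 0;
  pthread_attr_getstacksize(&Attr, &Cur); // query the default stack size
  if (Cur < MinStack)
    pthread_attr_setstacksize(&Attr, MinStack);
  int RC = pthread_create(T, &Attr, Worker, nullptr);
  pthread_attr_destroy(&Attr);
  return RC;
}
```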
diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h
index 368cbaf108d31..2eddbf468c149 100644
--- a/compiler-rt/lib/builtins/assembly.h
+++ b/compiler-rt/lib/builtins/assembly.h
@@ -71,9 +71,24 @@
 
 #endif
 
+#if defined(__aarch64__) && defined(__ELF__) && \
+    defined(COMPILER_RT_EXECUTE_ONLY_CODE)
+// The assembler always creates an implicit '.text' section with default flags
+// (SHF_ALLOC | SHF_EXECINSTR), which is incompatible with the execute-only
+// '.text' section we want to create here because of the missing
+// SHF_AARCH64_PURECODE section flag. To solve this, we use 'unique,0' to
+// differentiate the two sections. The output will therefore have two separate
+// sections named '.text', where code will be placed into the execute-only
+// '.text' section, and the implicitly-created one will be empty.
+#define TEXT_SECTION \
+  .section .text,"axy",@progbits,unique,0
+#else
+#define TEXT_SECTION \
+  .text
+#endif
+
 #if defined(__arm__) || defined(__aarch64__) || defined(__arm64ec__)
 #define FUNC_ALIGN \
-  .text SEPARATOR \
   .balign 16 SEPARATOR
 #else
 #define FUNC_ALIGN
@@ -255,6 +270,7 @@
 #endif
 
 #define DEFINE_COMPILERRT_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -264,6 +280,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_THUMB_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -273,6 +290,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_PRIVATE_FUNCTION(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FILE_LEVEL_DIRECTIVE SEPARATOR \
   .globl FUNC_SYMBOL(SYMBOL_NAME(name)) SEPARATOR \
@@ -282,6 +300,7 @@
 FUNC_SYMBOL(SYMBOL_NAME(name)):
 
 #define DEFINE_COMPILERRT_PRIVATE_FUNCTION_UNMANGLED(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   .globl FUNC_SYMBOL(name) SEPARATOR \
   SYMBOL_IS_FUNC(name) SEPARATOR \
@@ -290,6 +309,7 @@
 FUNC_SYMBOL(name):
 
 #define DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(name) \
+  TEXT_SECTION SEPARATOR \
   DEFINE_CODE_STATE \
   FUNC_ALIGN \
   .globl FUNC_SYMBOL(name) SEPARATOR \
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index c21b2bad1d212..45b7055abf454 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -21,7 +21,9 @@
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 
+#if __STDC_HOSTED__
 #include <assert.h>
+#endif // __STDC_HOSTED__
 
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
 #include <cpuid.h>
@@ -245,8 +247,8 @@ struct __processor_model {
   unsigned int __cpu_features[1];
 } __cpu_model = {0, 0, 0, {0}};
 
-static_assert(sizeof(__cpu_model) == 16,
-              "Wrong size of __cpu_model will result in ABI break");
+_Static_assert(sizeof(__cpu_model) == 16,
+               "Wrong size of __cpu_model will result in ABI break");
 
 // This code is copied from lib/Support/Host.cpp.
 // Changes to either file should be mirrored in the other.
@@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   unsigned Vendor;
   unsigned Model, Family;
   unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
-  static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
-  static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");
+  _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
+  _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");
 
   // This function needs to run just once.
   if (__cpu_model.__cpu_vendor)
@@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
   } else
     __cpu_model.__cpu_vendor = VENDOR_OTHER;
 
+#if __STDC_HOSTED__
   assert(__cpu_model.__cpu_vendor < VENDOR_MAX);
   assert(__cpu_model.__cpu_type < CPU_TYPE_MAX);
   assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX);
+#endif // __STDC_HOSTED__
 
   return 0;
 }
diff --git a/compiler-rt/lib/fuzzer/CMakeLists.txt b/compiler-rt/lib/fuzzer/CMakeLists.txt
index 6db24610df1f0..a57e2fe46245a 100644
--- a/compiler-rt/lib/fuzzer/CMakeLists.txt
+++ b/compiler-rt/lib/fuzzer/CMakeLists.txt
@@ -162,6 +162,7 @@ if(OS_NAME MATCHES "Android|Linux|Fuchsia" AND
       CFLAGS ${TARGET_CFLAGS}
       CMAKE_ARGS -DCMAKE_CXX_COMPILER_WORKS=ON
                  -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                 -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE}
                  -DLIBCXXABI_ENABLE_EXCEPTIONS=OFF
                  -DLIBCXX_ABI_NAMESPACE=__Fuzzer
                  -DLIBCXX_ENABLE_EXCEPTIONS=OFF)
diff --git a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
index b8d98b09ada25..80d680017cfe7 100644
--- a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
@@ -28,7 +28,7 @@
 // stack pointer when compiling a C function.
 // Hence we have to write this function in assembly.
 
-.section .text
+TEXT_SECTION
 .file "hwasan_setjmp_aarch64.S"
 
 .global ASM_WRAPPER_NAME(setjmp)
diff --git a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
index be82475101c8c..1631d3257a26f 100644
--- a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
@@ -70,7 +70,7 @@
 // clobbering the x17 register in error reports, and that the program will have
 // a runtime dependency on the __hwasan_tag_mismatch_v2 symbol therefore it will
 // fail to start up given an older (i.e. incompatible) runtime.
-.section .text
+TEXT_SECTION
 .file "hwasan_tag_mismatch_aarch64.S"
 .global __hwasan_tag_mismatch
 .type __hwasan_tag_mismatch, %function
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 556b9f56a4a4a..2d0ea0b46fe0e 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -93,6 +93,10 @@ using LSanSizeClassMap = DefaultSizeClassMap;
 const uptr kAllocatorSpace = 0x600000000000ULL;
 const uptr kAllocatorSize = 0x40000000000ULL;  // 4T.
 using LSanSizeClassMap = DefaultSizeClassMap;
+# elif SANITIZER_ANDROID && defined(__aarch64__)
+const uptr kAllocatorSpace = 0x3000000000ULL;
+const uptr kAllocatorSize = 0x2000000000ULL;
+using LSanSizeClassMap = VeryCompactSizeClassMap;
 # else
 const uptr kAllocatorSpace = 0x500000000000ULL;
 const uptr kAllocatorSize = 0x40000000000ULL;  // 4T.
diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt
index a8500225337e6..b4848a8d190dc 100644
--- a/compiler-rt/lib/msan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/msan/tests/CMakeLists.txt
@@ -139,6 +139,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND
     add_custom_libcxx(libcxx_msan_${arch} ${LIBCXX_PREFIX}
       DEPS ${MSAN_RUNTIME_LIBRARIES}
       CFLAGS ${MSAN_LIBCXX_CFLAGS} ${TARGET_CFLAGS}
+      CMAKE_ARGS -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE}
       USE_TOOLCHAIN)
 
     set(MSAN_LIBCXX_DIR ${LIBCXX_PREFIX}/lib/)
diff --git a/compiler-rt/lib/orc/elfnix_tls.aarch64.S b/compiler-rt/lib/orc/elfnix_tls.aarch64.S
index 8dcdd535be8ae..25d97e6593dc0 100644
--- a/compiler-rt/lib/orc/elfnix_tls.aarch64.S
+++ b/compiler-rt/lib/orc/elfnix_tls.aarch64.S
@@ -13,9 +13,11 @@
 // The content of this file is aarch64-only
 #if defined(__arm64__) || defined(__aarch64__)
 
+#include "builtins/assembly.h"
+
 #define REGISTER_SAVE_SPACE_SIZE 32 * 24
 
-  .text
+  TEXT_SECTION
 
 // returns address of TLV in x0, all other registers preserved
 // TODO: add fast-path for repeat access
diff --git a/compiler-rt/lib/orc/sysv_reenter.arm64.S b/compiler-rt/lib/orc/sysv_reenter.arm64.S
index 74941c459d6ac..61e58e50c97c7 100644
--- a/compiler-rt/lib/orc/sysv_reenter.arm64.S
+++ b/compiler-rt/lib/orc/sysv_reenter.arm64.S
@@ -13,7 +13,9 @@
 // The content of this file is arm64-only
 #if defined(__arm64__) || defined(__aarch64__)
 
-  .text
+#include "builtins/assembly.h"
+
+  TEXT_SECTION
 
 // Saves GPRs, calls __orc_rt_resolve
 .globl __orc_rt_sysv_reenter
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index ba85a0eb5a35e..b515b15b327d8 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -737,6 +737,7 @@ enum ModuleArch {
   kModuleArchARMV7S,
   kModuleArchARMV7K,
   kModuleArchARM64,
+  kModuleArchARM64E,
   kModuleArchLoongArch64,
   kModuleArchRISCV64,
   kModuleArchHexagon
@@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) {
       return "armv7k";
     case kModuleArchARM64:
       return "arm64";
+    case kModuleArchARM64E:
+      return "arm64e";
     case kModuleArchLoongArch64:
       return "loongarch64";
     case kModuleArchRISCV64:
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
index 5066953980af7..c5c2180e0de93 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
@@ -5,6 +5,7 @@
 
 ASM_HIDDEN(COMMON_INTERCEPTOR_SPILL_AREA)
 
+TEXT_SECTION
 .comm _ZN14__interception10real_vforkE,8,8
 .globl ASM_WRAPPER_NAME(vfork)
 ASM_TYPE_FUNCTION(ASM_WRAPPER_NAME(vfork))
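All of the .S conversions above route through the TEXT_SECTION macro added to builtins/assembly.h earlier in this patch. These files are preprocessed assembly, so the effect is a plain macro expansion; a hypothetical skeleton, not part of the patch (AArch64 ELF assumed):

```c
// hypothetical_example.S
#include "builtins/assembly.h"

// Expands to `.section .text,"axy",@progbits,unique,0` when the build passes
// -DCOMPILER_RT_EXECUTE_ONLY_CODE to the assembler ("a" = SHF_ALLOC,
// "x" = SHF_EXECINSTR, "y" = SHF_AARCH64_PURECODE; `unique,0` keeps the
// section distinct from the implicit plain .text), and to `.text` otherwise.
TEXT_SECTION

.globl example_func
example_func:
  ret
```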
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index b0a29db908639..90c0b66f81b5b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() {
       RTLD_DEFAULT, "task_set_exc_guard_behavior");
   if (set_behavior == nullptr) return;
   const task_exc_guard_behavior_t task_exc_guard_none = 0;
-  set_behavior(mach_task_self(), task_exc_guard_none);
+  kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none);
+  if (res != KERN_SUCCESS) {
+    Report(
+        "WARN: task_set_exc_guard_behavior returned %d (%s), "
+        "mmap may fail unexpectedly.\n",
+        res, mach_error_string(res));
+    if (res == KERN_DENIED)
+      Report(
+          "HINT: Check that task_set_exc_guard_behavior is allowed by "
+          "sandbox.\n");
+  }
 }
 
 static void VerifyInterceptorsWorking();
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
index a9533d6fc04ca..a5ec85ae16460 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp
@@ -20,18 +20,21 @@
 #include <mach/mach.h>
 
 // These are not available in older macOS SDKs.
-#ifndef CPU_SUBTYPE_X86_64_H
-#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */
-#endif
-#ifndef CPU_SUBTYPE_ARM_V7S
-#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */
-#endif
-#ifndef CPU_SUBTYPE_ARM_V7K
-#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12)
-#endif
-#ifndef CPU_TYPE_ARM64
-#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64)
-#endif
+# ifndef CPU_SUBTYPE_X86_64_H
+#  define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */
+# endif
+# ifndef CPU_SUBTYPE_ARM_V7S
+#  define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */
+# endif
+# ifndef CPU_SUBTYPE_ARM_V7K
+#  define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12)
+# endif
+# ifndef CPU_TYPE_ARM64
+#  define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64)
+# endif
+# ifndef CPU_SUBTYPE_ARM64E
+#  define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2)
+# endif
 
 namespace __sanitizer {
 
@@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) {
     case CPU_TYPE_I386:
       return kModuleArchI386;
     case CPU_TYPE_X86_64:
-      if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64;
-      if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H;
+      if (cpusubtype == CPU_SUBTYPE_X86_64_ALL)
+        return kModuleArchX86_64;
+      if (cpusubtype == CPU_SUBTYPE_X86_64_H)
+        return kModuleArchX86_64H;
       CHECK(0 && "Invalid subtype of x86_64");
       return kModuleArchUnknown;
     case CPU_TYPE_ARM:
-      if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S;
-      if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V6)
+        return kModuleArchARMV6;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7)
+        return kModuleArchARMV7;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7S)
+        return kModuleArchARMV7S;
+      if (cpusubtype == CPU_SUBTYPE_ARM_V7K)
+        return kModuleArchARMV7K;
       CHECK(0 && "Invalid subtype of ARM");
       return kModuleArchUnknown;
     case CPU_TYPE_ARM64:
+      if (cpusubtype == CPU_SUBTYPE_ARM64E)
+        return kModuleArchARM64E;
       return kModuleArchARM64;
     default:
       CHECK(0 && "Invalid CPU type");
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index f8d821e125b7a..7eb0c9756d64a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list,
   }
 
 # if SANITIZER_APPLE
+  if (list->empty()) {
+    Report(
+        "WARN: No external symbolizers found. Symbols may be missing or "
+        "unreliable.\n");
+    Report(
+        "HINT: Is PATH set? Does sandbox allow file-read of /usr/bin/atos?\n");
+  }
   VReport(2, "Using dladdr symbolizer.\n");
   list->push_back(new (*allocator) DlAddrSymbolizer());
 # endif  // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
index 00542b944f516..c18e5bd9f3194 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp
@@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) {
       EXPECT_EQ(arch, kModuleArchI386);
     } else if (SANITIZER_WORDSIZE == 64) {
       EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H ||
-                  arch == kModuleArchARM64);
+                  arch == kModuleArchARM64 || arch == kModuleArchARM64E);
     }
     const u8 *uuid = modules[i].uuid();
     u8 null_uuid[kModuleUUIDSize] = {0};
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
index 748530820cd64..0aea7b8f2fb9a 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.def
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false)
 // Disable the quarantine code.
 BASE_OPTIONAL(const bool, QuarantineDisabled, false)
 
+// If set to true, malloc_usable_size returns the exact size of the allocation.
+// If set to false, return the total available size in the allocation.
+BASE_OPTIONAL(const bool, ExactUsableSize, true)
+
 // PRIMARY_REQUIRED_TYPE(NAME)
 //
 // SizeClassMap to use with the Primary.
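The new ExactUsableSize option above changes what malloc_usable_size reports when Scudo rounds a request up to a size class. A sketch of the observable difference, with illustrative numbers only (the exact byte count depends on the chunk header and alignment offset):

```cpp
#include <malloc.h> // malloc_usable_size on glibc/Bionic
#include <stdlib.h>

int main() {
  void *P = malloc(100); // suppose this lands in a 128-byte size class
  size_t N = malloc_usable_size(P);
  // ExactUsableSize == true (the default): N == 100, the requested size.
  // ExactUsableSize == false: N == every writable byte in the block, i.e.
  // roughly 128 minus the chunk header/offset, as computed by the new
  // getUsableSize(Ptr, Header) overload in combined.h below.
  free(P);
  return N == 0; // exit 0 if a usable size was reported
}
```

The old behavior deliberately lied (per the removed comment in combined.h below) to stop callers such as sqlite3 from writing out to the reported usable size; the option makes that policy configurable.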
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 329ec4596482b..ffe9554203241 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -706,19 +706,26 @@ class Allocator {
       if (!getChunkFromBlock(Block, &Chunk, &Header) &&
           !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
         return;
-    } else {
-      if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header))
-        return;
+    } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) {
+      return;
     }
-    if (Header.State == Chunk::State::Allocated) {
-      uptr TaggedChunk = Chunk;
-      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
-        TaggedChunk = untagPointer(TaggedChunk);
-      if (useMemoryTagging<AllocatorConfig>(Primary.Options.load()))
-        TaggedChunk = loadTag(Chunk);
-      Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header),
-               Arg);
+
+    if (Header.State != Chunk::State::Allocated)
+      return;
+
+    uptr TaggedChunk = Chunk;
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      TaggedChunk = untagPointer(TaggedChunk);
+    uptr Size;
+    if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) {
+      TaggedChunk = loadTag(Chunk);
+      Size = getSize(reinterpret_cast<void *>(Chunk), &Header);
+    } else if (AllocatorConfig::getExactUsableSize()) {
+      Size = getSize(reinterpret_cast<void *>(Chunk), &Header);
+    } else {
+      Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header);
     }
+    Callback(TaggedChunk, Size, Arg);
   };
   Primary.iterateOverBlocks(Lambda);
   Secondary.iterateOverBlocks(Lambda);
@@ -759,16 +766,50 @@ class Allocator {
     return false;
   }
 
-  // Return the usable size for a given chunk. Technically we lie, as we just
-  // report the actual size of a chunk. This is done to counteract code actively
-  // writing past the end of a chunk (like sqlite3) when the usable size allows
-  // for it, which then forces realloc to copy the usable size of a chunk as
-  // opposed to its actual size.
+  ALWAYS_INLINE uptr getUsableSize(const void *Ptr,
+                                   Chunk::UnpackedHeader *Header) {
+    void *BlockBegin = getBlockBegin(Ptr, Header);
+    if (LIKELY(Header->ClassId)) {
+      return SizeClassMap::getSizeByClassId(Header->ClassId) -
+             (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin));
+    }
+
+    uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr);
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>()) {
+      UntaggedPtr = untagPointer(UntaggedPtr);
+      BlockBegin = untagPointer(BlockBegin);
+    }
+    return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr;
+  }
+
+  // Return the usable size for a given chunk. If MTE is enabled or if the
+  // ExactUsableSize config parameter is true, we report the exact size of
+  // the original allocation. Otherwise, we return the total actual usable
+  // size.
   uptr getUsableSize(const void *Ptr) {
     if (UNLIKELY(!Ptr))
       return 0;
 
-    return getAllocSize(Ptr);
+    if (AllocatorConfig::getExactUsableSize() ||
+        UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load())))
+      return getAllocSize(Ptr);
+
+    initThreadMaybe();
+
+#ifdef GWP_ASAN_HOOKS
+    if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr)))
+      return GuardedAlloc.getSize(Ptr);
+#endif // GWP_ASAN_HOOKS
+
+    Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr));
+    Chunk::UnpackedHeader Header;
+    Chunk::loadHeader(Cookie, Ptr, &Header);
+
+    // Getting the alloc size of a chunk only makes sense if it's allocated.
+    if (UNLIKELY(Header.State != Chunk::State::Allocated))
+      reportInvalidChunkState(AllocatorAction::Sizing, Ptr);
+
+    return getUsableSize(Ptr, &Header);
   }
 
   uptr getAllocSize(const void *Ptr) {
@@ -951,6 +992,19 @@ class Allocator {
                       MemorySize, 2, 16);
   }
 
+  uptr getBlockBeginTestOnly(const void *Ptr) {
+    Chunk::UnpackedHeader Header;
+    Chunk::loadHeader(Cookie, Ptr, &Header);
+    DCHECK(Header.State == Chunk::State::Allocated);
+
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      Ptr = untagPointer(const_cast<void *>(Ptr));
+    void *Begin = getBlockBegin(Ptr, &Header);
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
+      Begin = untagPointer(Begin);
+    return reinterpret_cast<uptr>(Begin);
+  }
+
 private:
   typedef typename PrimaryT::SizeClassMap SizeClassMap;
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 5fdfd1e7c55cc..4837ac96b9b26 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) {
   EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos);
 }
 
+struct UsableSizeClassConfig {
+  static const scudo::uptr NumBits = 1;
+  static const scudo::uptr MinSizeLog = 10;
+  static const scudo::uptr MidSizeLog = 10;
+  static const scudo::uptr MaxSizeLog = 13;
+  static const scudo::u16 MaxNumCachedHint = 8;
+  static const scudo::uptr MaxBytesCachedLog = 12;
+  static const scudo::uptr SizeDelta = 0;
+};
+
+struct TestExactUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = false;
+  static const bool QuarantineDisabled = true;
+
+  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
+
+  struct Primary {
+    // In order to properly test the usable size, this Primary config has
+    // four real size classes: 1024, 2048, 4096, 8192.
+    using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>;
+    static const scudo::uptr RegionSizeLog = 21U;
+    static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
+    static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+    typedef scudo::uptr CompactPtrT;
+    static const scudo::uptr CompactPtrScale = 0;
+    static const bool EnableRandomOffset = true;
+    static const scudo::uptr MapSizeIncrement = 1UL << 18;
+    static const scudo::uptr GroupSizeLog = 18;
+  };
+  template <typename Config>
+  using PrimaryT = scudo::SizeClassAllocator64<Config>;
+
+  struct Secondary {
+    template <typename Config>
+    using CacheT = scudo::MapAllocatorNoCache<Config>;
+  };
+
+  template <typename Config> using SecondaryT = scudo::MapAllocator<Config>;
+};
+
+template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) {
+  // Scan through all sizes up to 10000 then some larger sizes.
+  for (scudo::uptr Size = 1; Size < 10000; Size++) {
+    void *P = Allocator.allocate(Size, Origin);
+    EXPECT_EQ(Size, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << Size;
+    Allocator.deallocate(P, Origin);
+  }
+
+  // Verify that aligned allocations also return the exact size allocated.
+  const scudo::uptr AllocSize = 313;
+  for (scudo::uptr Align = 1; Align <= 8; Align++) {
+    void *P = Allocator.allocate(AllocSize, Origin, 1U << Align);
+    EXPECT_EQ(AllocSize, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << AllocSize
+        << " at align " << (1U << Align);
+    Allocator.deallocate(P, Origin);
+  }
+
+  // Verify an explicitly large allocation.
+  const scudo::uptr LargeAllocSize = 1000000;
+  void *P = Allocator.allocate(LargeAllocSize, Origin);
+  EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P));
+  Allocator.deallocate(P, Origin);
+
+  // Now do the same for aligned large allocations.
+  for (scudo::uptr Align = 1; Align <= 8; Align++) {
+    void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align);
+    EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P))
+        << "Failed usable size at allocation size " << LargeAllocSize
+        << " at align " << (1U << Align);
+    Allocator.deallocate(P, Origin);
+  }
+}
+
+template <class AllocatorT>
+void VerifyIterateOverUsableSize(AllocatorT &Allocator) {
+  // This will not verify if the size is the exact size or the size of the
+  // size class. Instead verify that the size matches the usable size and
+  // assume the other tests have verified getUsableSize.
+  std::unordered_map<void *, size_t> Pointers;
+  Pointers.insert({Allocator.allocate(128, Origin), 0U});
+  Pointers.insert({Allocator.allocate(128, Origin, 32), 0U});
+  Pointers.insert({Allocator.allocate(2000, Origin), 0U});
+  Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U});
+  Pointers.insert({Allocator.allocate(8000, Origin), 0U});
+  Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U});
+  Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U});
+
+  Allocator.disable();
+  Allocator.iterateOverChunks(
+      0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1),
+      [](uintptr_t Base, size_t Size, void *Arg) {
+        std::unordered_map<void *, size_t> *Pointers =
+            reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg);
+        (*Pointers)[reinterpret_cast<void *>(Base)] = Size;
+      },
+      reinterpret_cast<void *>(&Pointers));
+  Allocator.enable();
+
+  for (auto [Ptr, IterateSize] : Pointers) {
+    EXPECT_NE(0U, IterateSize)
+        << "Pointer " << Ptr << " not found in iterateOverChunks call.";
+    EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr))
+        << "Pointer " << Ptr
+        << " mismatch between iterate size and usable size.";
+    Allocator.deallocate(Ptr, Origin);
+  }
+}
+
+TEST(ScudoCombinedTest, ExactUsableSize) {
+  using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = true;
+};
+
+TEST(ScudoCombinedTest, ExactUsableSizeMTE) {
+  if (!scudo::archSupportsMemoryTagging() ||
+      !scudo::systemDetectsMemoryTagFaultsTestOnly())
+    TEST_SKIP("Only supported on systems that can enable MTE.");
+
+  scudo::enableSystemMemoryTaggingTestOnly();
+
+  using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+template <class AllocatorT>
+void VerifyUsableSizePrimary(AllocatorT &Allocator) {
+  std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U};
+  for (size_t I = 0; I < SizeClasses.size(); I++) {
+    scudo::uptr SizeClass = SizeClasses[I];
+    scudo::uptr StartSize;
+    if (I == 0)
+      StartSize = 1;
+    else
+      StartSize = SizeClasses[I - 1];
+    scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize();
+    for (scudo::uptr Size = StartSize; Size < UsableSize; Size++) {
+      void *P = Allocator.allocate(Size, Origin);
+      EXPECT_EQ(UsableSize, Allocator.getUsableSize(P))
+          << "Failed usable size at allocation size " << Size
+          << " for size class " << SizeClass;
+      memset(P, 0xff, UsableSize);
+      EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass,
+                reinterpret_cast<scudo::uptr>(P) + UsableSize);
+      Allocator.deallocate(P, Origin);
+    }
+
+    StartSize = UsableSize + 1;
+  }
+
+  std::vector<scudo::uptr> Alignments = {32U, 128U};
+  for (size_t I = 0; I < SizeClasses.size(); I++) {
+    scudo::uptr SizeClass = SizeClasses[I];
+    scudo::uptr AllocSize;
+    if (I == 0)
+      AllocSize = 1;
+    else
+      AllocSize = SizeClasses[I - 1] + 1;
+
+    for (auto Alignment : Alignments) {
+      void *P = Allocator.allocate(AllocSize, Origin, Alignment);
+      scudo::uptr UsableSize = Allocator.getUsableSize(P);
+      memset(P, 0xff, UsableSize);
+      EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass,
+                reinterpret_cast<scudo::uptr>(P) + UsableSize)
+          << "Failed usable size at allocation size " << AllocSize
+          << " for size class " << SizeClass << " at alignment " << Alignment;
+      Allocator.deallocate(P, Origin);
+    }
+  }
+}
+
+template <class AllocatorT>
+void VerifyUsableSizeSecondary(AllocatorT &Allocator) {
+  const scudo::uptr LargeAllocSize = 996780;
+  const scudo::uptr PageSize = scudo::getPageSizeCached();
+  void *P = Allocator.allocate(LargeAllocSize, Origin);
+  scudo::uptr UsableSize = Allocator.getUsableSize(P);
+  memset(P, 0xff, UsableSize);
+  // Assumes that the secondary always rounds up allocations to a page boundary.
+  EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize,
+                           PageSize),
+            reinterpret_cast<scudo::uptr>(P) + UsableSize);
+  Allocator.deallocate(P, Origin);
+
+  // Check aligned allocations now.
+  for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) {
+    void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment);
+    scudo::uptr UsableSize = Allocator.getUsableSize(P);
+    EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize,
+                             PageSize),
+              reinterpret_cast<scudo::uptr>(P) + UsableSize)
+        << "Failed usable size at allocation size " << LargeAllocSize
+        << " at alignment " << Alignment;
+    Allocator.deallocate(P, Origin);
+  }
+}
+
+struct TestFullUsableSizeConfig : TestExactUsableSizeConfig {
+  static const bool ExactUsableSize = false;
+};
+
+TEST(ScudoCombinedTest, FullUsableSize) {
+  using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  VerifyUsableSizePrimary<AllocatorT>(*Allocator);
+  VerifyUsableSizeSecondary<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
+
+struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig {
+  static const bool MaySupportMemoryTagging = true;
+};
+
+TEST(ScudoCombinedTest, FullUsableSizeMTE) {
+  if (!scudo::archSupportsMemoryTagging() ||
+      !scudo::systemDetectsMemoryTagFaultsTestOnly())
+    TEST_SKIP("Only supported on systems that can enable MTE.");
+
+  scudo::enableSystemMemoryTaggingTestOnly();
+
+  using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  // When MTE is enabled, you get exact sizes.
+  VerifyExactUsableSize<AllocatorT>(*Allocator);
+  VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
+}
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); diff --git a/compiler-rt/lib/tsan/CMakeLists.txt b/compiler-rt/lib/tsan/CMakeLists.txt index 7928116879c09..3319855521bd5 100644 --- a/compiler-rt/lib/tsan/CMakeLists.txt +++ b/compiler-rt/lib/tsan/CMakeLists.txt @@ -30,6 +30,7 @@ if(COMPILER_RT_LIBCXX_PATH AND add_custom_libcxx(libcxx_tsan_${arch} ${LIBCXX_PREFIX} DEPS ${TSAN_RUNTIME_LIBRARIES} CFLAGS ${TARGET_CFLAGS} -fsanitize=thread + CMAKE_ARGS -DRUNTIMES_EXECUTE_ONLY_CODE=${RUNTIMES_EXECUTE_ONLY_CODE} USE_TOOLCHAIN) list(APPEND libcxx_tsan_deps libcxx_tsan_${arch}-install-cmake326-workaround) endforeach() diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S index f1d11a3e7f54f..124bd59a91f08 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S @@ -4,10 +4,8 @@ #include "sanitizer_common/sanitizer_asm.h" #include "builtins/assembly.h" -#if !defined(__APPLE__) -.section .text -#else -.section __TEXT,__text +TEXT_SECTION +#if defined(__APPLE__) .align 3 #endif diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 4fa8166986d76..1c67adeba0fc5 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -22,6 +22,7 @@ #include "tysan/tysan.h" +#include <stdint.h> #include <string.h> using namespace __sanitizer; @@ -254,10 +255,68 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD, } } +ALWAYS_INLINE +static void SetShadowType(tysan_type_descriptor *td, + tysan_type_descriptor **shadowData, + uint64_t AccessSize) { + *shadowData = td; + uint64_t shadowDataInt = (uint64_t)shadowData; + + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t dataOffset = i << PtrShift(); + int64_t *badShadowData = (int64_t *)(shadowDataInt + dataOffset); + int64_t badTD = int64_t(i) * -1; + *badShadowData = badTD; + } +} + +ALWAYS_INLINE +static bool GetNotAllBadTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t **unkShadowData = (int64_t **)(ShadowDataInt + (i << PtrShift())); + int64_t *ILdTD = *unkShadowData; + notAllBadTD = notAllBadTD || (ILdTD != nullptr); + } + return notAllBadTD; +} + +ALWAYS_INLINE +static bool GetNotAllUnkTD(uint64_t ShadowDataInt, uint64_t AccessSize) { + bool notAllBadTD = false; + for (uint64_t i = 1; i < AccessSize; ++i) { + int64_t *badShadowData = (int64_t *)(ShadowDataInt + (i << PtrShift())); + int64_t ILdTD = *badShadowData; + 
notAllBadTD = notAllBadTD || (ILdTD >= 0); + } + return notAllBadTD; +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE void -__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { - GET_CALLER_PC_BP_SP; +__tysan_instrument_mem_inst(char *dest, char *src, uint64_t size, + bool needsMemMove) { + tysan_type_descriptor **destShadowDataPtr = shadow_for(dest); + + if (!src) { + internal_memset((char *)destShadowDataPtr, 0, size << PtrShift()); + return; + } + + uint64_t srcInt = (uint64_t)src; + uint64_t srcShadowInt = ((srcInt & AppMask()) << PtrShift()) + ShadowAddr(); + uint64_t *srcShadow = (uint64_t *)srcShadowInt; + if (needsMemMove) { + internal_memmove((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } else { + internal_memcpy((char *)destShadowDataPtr, srcShadow, size << PtrShift()); + } +} + +ALWAYS_INLINE +static void __tysan_check_internal(void *addr, int size, + tysan_type_descriptor *td, int flags, + uptr pc, uptr bp, uptr sp) { bool IsRead = flags & 1; bool IsWrite = flags & 2; const char *AccessStr; @@ -300,6 +359,64 @@ __tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { } } +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(addr, size, td, flags, pc, bp, sp); +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_instrument_with_shadow_update(void *ptr, tysan_type_descriptor *td, + bool sanitizeFunction, + uint64_t accessSize, int flags) { + tysan_type_descriptor **shadowData = shadow_for(ptr); + tysan_type_descriptor *loadedTD = *shadowData; + bool shadowIsNull = loadedTD == nullptr; + + // TODO: sanitizeFunction is known at compile time, so maybe this should be + // split into two different functions + if (sanitizeFunction) { + + if (td != loadedTD) { + + // We now know that the types did not match (we're on the slow path). If + // the type is unknown, then set it. + if (shadowIsNull) { + // We're about to set the type. Make sure that all bytes in the value + // are also of unknown type. + bool notAllUnknownTD = GetNotAllUnkTD((uint64_t)shadowData, accessSize); + if (notAllUnknownTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + SetShadowType(td, shadowData, accessSize); + } else { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } else { + // We appear to have the right type. Make sure that all other bytes in + // the type are still marked as interior bytes. If not, call the runtime. + bool isNotAllBadTD = GetNotAllBadTD((uint64_t)shadowData, accessSize); + if (isNotAllBadTD) { + GET_CALLER_PC_BP_SP; + __tysan_check_internal(ptr, accessSize, td, flags, pc, bp, sp); + } + } + } else if (shadowIsNull) { + SetShadowType(td, shadowData, accessSize); + } +} + +extern "C" SANITIZER_INTERFACE_ATTRIBUTE void +__tysan_set_shadow_type(void *ptr, tysan_type_descriptor *td, + uint64_t accessSize) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type.
+ tysan_type_descriptor **shadow = shadow_for(ptr); + SetShadowType(td, shadow, accessSize); +} + Flags __tysan::flags_data; SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_shadow_memory_address; diff --git a/compiler-rt/lib/tysan/tysan_platform.h b/compiler-rt/lib/tysan/tysan_platform.h index f01392885d939..19f77f0cace6b 100644 --- a/compiler-rt/lib/tysan/tysan_platform.h +++ b/compiler-rt/lib/tysan/tysan_platform.h @@ -21,24 +21,28 @@ struct Mapping { static const uptr kShadowAddr = 0x010000000000ull; static const uptr kAppAddr = 0x550000000000ull; static const uptr kAppMemMsk = ~0x780000000000ull; + static const uptr kPtrShift = 3; }; #elif defined(__aarch64__) struct Mapping39 { static const uptr kShadowAddr = 0x0800000000ull; static const uptr kAppAddr = 0x5500000000ull; static const uptr kAppMemMsk = ~0x7800000000ull; + static const uptr kPtrShift = 3; }; struct Mapping42 { static const uptr kShadowAddr = 0x10000000000ull; static const uptr kAppAddr = 0x2aa00000000ull; static const uptr kAppMemMsk = ~0x3c000000000ull; + static const uptr kPtrShift = 3; }; struct Mapping48 { static const uptr kShadowAddr = 0x0002000000000ull; static const uptr kAppAddr = 0x0aaaa00000000ull; static const uptr kAppMemMsk = ~0x0fff800000000ull; + static const uptr kPtrShift = 3; }; #define TYSAN_RUNTIME_VMA 1 #else @@ -49,7 +53,12 @@ struct Mapping48 { extern int vmaSize; #endif -enum MappingType { MAPPING_SHADOW_ADDR, MAPPING_APP_ADDR, MAPPING_APP_MASK }; +enum MappingType { + MAPPING_SHADOW_ADDR, + MAPPING_APP_ADDR, + MAPPING_APP_MASK, + MAPPING_PTR_SHIFT +}; template <typename Mapping, int Type> uptr MappingImpl(void) { switch (Type) { @@ -59,6 +68,8 @@ template <typename Mapping, int Type> uptr MappingImpl(void) { return Mapping::kAppAddr; case MAPPING_APP_MASK: return Mapping::kAppMemMsk; + case MAPPING_PTR_SHIFT: + return Mapping::kPtrShift; } } @@ -88,6 +99,9 @@ uptr AppAddr() { return MappingArchImpl<MAPPING_APP_ADDR>(); } ALWAYS_INLINE uptr AppMask() { return MappingArchImpl<MAPPING_APP_MASK>(); } +ALWAYS_INLINE +uptr PtrShift() { return MappingArchImpl<MAPPING_PTR_SHIFT>(); } + } // namespace __tysan #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/compiler-rt/lib/xray/xray_trampoline_AArch64.S index 2586def04cbb1..5d951f3821a50 100644 --- a/compiler-rt/lib/xray/xray_trampoline_AArch64.S +++ b/compiler-rt/lib/xray/xray_trampoline_AArch64.S @@ -37,7 +37,7 @@ #endif .endm -.text +TEXT_SECTION .p2align 2 .global ASM_SYMBOL(__xray_FunctionEntry) ASM_HIDDEN(__xray_FunctionEntry) diff --git a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp index f12e2b2ada50d..651d0c5d05b07 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/suppressions-sandbox.cpp @@ -15,9 +15,6 @@ // sandbox-exec isn't available on iOS // UNSUPPORTED: ios -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include <CoreFoundation/CoreFoundation.h> #if defined(SHARED_LIB) diff --git a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp index c7b9280ea7d8e..c0629260418a3 100644 --- a/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/fread_fwrite.cpp @@ -2,9 +2,6 @@ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-FWRITE // RUN: not %run %t 1 2>&1 | FileCheck %s 
--check-prefix=CHECK-FREAD -// Symbolizer fails to find test functions on current macOS bot version -// XFAIL: system-darwin && target=arm{{.*}} - #include <stdio.h> #include <stdlib.h> diff --git a/compiler-rt/test/asan/TestCases/log-path_test.cpp b/compiler-rt/test/asan/TestCases/log-path_test.cpp index 6875d57c43cc0..22f077fb54680 100644 --- a/compiler-rt/test/asan/TestCases/log-path_test.cpp +++ b/compiler-rt/test/asan/TestCases/log-path_test.cpp @@ -1,6 +1,5 @@ // FIXME: https://code.google.com/p/address-sanitizer/issues/detail?id=316 -// XFAIL: android -// UNSUPPORTED: ios +// UNSUPPORTED: ios, android // // The for loop in the backticks below requires bash. // REQUIRES: shell diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } diff --git a/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp b/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp index 53166ccded390..f4781a7d47647 100644 --- a/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp +++ b/compiler-rt/test/asan/TestCases/verbose-log-path_test.cpp @@ -9,8 +9,8 @@ // RUN: FileCheck %s --check-prefix=CHECK-ERROR < %t-dir/asan.log.verbose-log-path_test-binary.* // FIXME: only FreeBSD, NetBSD and Linux have verbose log paths now. -// XFAIL: target={{.*windows-msvc.*}},android -// UNSUPPORTED: ios +// XFAIL: target={{.*windows-msvc.*}} +// UNSUPPORTED: ios, android #include <stdlib.h> #include <string.h> diff --git a/compiler-rt/test/fuzzer/coverage.test b/compiler-rt/test/fuzzer/coverage.test index cf36784ce21da..a4af2648d61e1 100644 --- a/compiler-rt/test/fuzzer/coverage.test +++ b/compiler-rt/test/fuzzer/coverage.test @@ -2,6 +2,8 @@ UNSUPPORTED: target={{.*windows.*}} # FIXME: CreatePCArray() emits PLT stub addresses for entry blocks, which are ignored by TracePC::PrintCoverage(). 
UNSUPPORTED: target=s390x{{.*}} +UNSUPPORTED: darwin + RUN: mkdir -p %t.dir && cd %t.dir RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/NullDerefTest.cpp -o %t.dir/NullDerefTest RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -O0 -shared -o %dynamiclib1 diff --git a/compiler-rt/test/fuzzer/exit_on_src_pos.test b/compiler-rt/test/fuzzer/exit_on_src_pos.test index 020424e2d9fdd..ba4fb01780ce2 100644 --- a/compiler-rt/test/fuzzer/exit_on_src_pos.test +++ b/compiler-rt/test/fuzzer/exit_on_src_pos.test @@ -8,6 +8,7 @@ UNSUPPORTED: target=thumb{{.*}} # Timeout on loongarch64 machine UNSUPPORTED: target=loongarch64{{.*}} +UNSUPPORTED: darwin RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest.exe -mllvm -use-unknown-locations=Disable RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest.exe diff --git a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c index 421d233957830..353c5fb21a009 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c +++ b/compiler-rt/test/hwasan/TestCases/Linux/fixed-shadow.c @@ -3,17 +3,17 @@ // Default compiler instrumentation works with any shadow base (dynamic or fixed). // RUN: %clang_hwasan %s -o %t // RUN: %run %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=17592186044416 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t // // If -hwasan-mapping-offset is set, then the fixed_shadow_base needs to match. -// RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=263878495698944 -o %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 %run %t 2>%t.out || (cat %t.out | FileCheck %s) -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t +// RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=17592186044416 -o %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=17592186044416 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 not %run %t // RUN: %clang_hwasan %s -mllvm -hwasan-mapping-offset=4398046511104 -o %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t -// RUN: HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=4398046511104 %run %t +// RUN: env HWASAN_OPTIONS=fixed_shadow_base=263878495698944 not %run %t // // Note: if fixed_shadow_base is not set, compiler-rt will dynamically choose a // shadow base, which has a tiny but non-zero probability of matching the @@ -26,8 +26,6 @@ // // UNSUPPORTED: android -// CHECK: FATAL: HWAddressSanitizer: Shadow range {{.*}} is not available - #include <assert.h> #include <sanitizer/allocator_interface.h> #include <sanitizer/hwasan_interface.h> diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 9d2f02189b8bd..3f7dd8e402b78 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -195,16 +195,14 @@ def push_dynamic_library_lookup_path(config, new_path): # Normalize the path for comparison if test_cc_resource_dir is not None: test_cc_resource_dir = os.path.realpath(test_cc_resource_dir) -if lit_config.debug: - lit_config.note(f"Resource dir for {config.clang} is {test_cc_resource_dir}") +lit_config.dbg(f"Resource dir for {config.clang} is {test_cc_resource_dir}") local_build_resource_dir = 
os.path.realpath(config.compiler_rt_output_dir) if test_cc_resource_dir != local_build_resource_dir and config.test_standalone_build_libs: if config.compiler_id == "Clang": - if lit_config.debug: - lit_config.note( - f"Overriding test compiler resource dir to use " - f'libraries in "{config.compiler_rt_libdir}"' - ) + lit_config.dbg( + f"Overriding test compiler resource dir to use " + f'libraries in "{config.compiler_rt_libdir}"' + ) # Ensure that we use the just-built static libraries when linking by # overriding the Clang resource directory. Additionally, we want to use # the builtin headers shipped with clang (e.g. stdint.h), so we diff --git a/compiler-rt/test/memprof/TestCases/log_path_test.cpp b/compiler-rt/test/memprof/TestCases/log_path_test.cpp index 664ab79393195..683ca67122c31 100644 --- a/compiler-rt/test/memprof/TestCases/log_path_test.cpp +++ b/compiler-rt/test/memprof/TestCases/log_path_test.cpp @@ -18,7 +18,8 @@ // RUN: %env_memprof_opts=print_text=true:log_path=/dev/null/INVALID not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-DIR --dump-input=always // Too long log_path. -// RUN: %env_memprof_opts=print_text=true:log_path=`for((i=0;i<10000;i++)); do echo -n $i; done` \ +// RUN: %python -c "for i in range(0, 10000): print(i, end='')" > %t.long_log_path +// RUN: %env_memprof_opts=print_text=true:log_path=%{readfile:%t.long_log_path} \ // RUN: not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-LONG --dump-input=always // Specifying the log name via the __memprof_profile_filename variable. diff --git a/compiler-rt/test/msan/allocator_mapping.cpp b/compiler-rt/test/msan/allocator_mapping.cpp index e7a12da489152..6eaba7e16a5be 100644 --- a/compiler-rt/test/msan/allocator_mapping.cpp +++ b/compiler-rt/test/msan/allocator_mapping.cpp @@ -3,7 +3,8 @@ // mapping the heap early, in __msan_init. // // RUN: %clangxx_msan -O0 %s -o %t_1 -// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_msan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 // // This test only makes sense for the 64-bit allocator. The 32-bit allocator // does not have a fixed mapping. Exclude platforms that use the 32-bit diff --git a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp index 3a3e655e259d0..a92962e16d9d2 100644 --- a/compiler-rt/test/nsan/Posix/allocator_mapping.cpp +++ b/compiler-rt/test/nsan/Posix/allocator_mapping.cpp @@ -2,7 +2,8 @@ /// Test that a module constructor can not map memory over the NSan heap /// (without MAP_FIXED, of course). 
// RUN: %clangxx_nsan -O0 %s -o %t_1 -// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=$(%run %t_1) %s -o %t_2 && %run %t_2 +// RUN: %run %t_1 > %t.heap_address +// RUN: %clangxx_nsan -O0 -DHEAP_ADDRESS=%{readfile:%t.heap_address} %s -o %t_2 && %run %t_2 #include <assert.h> #include <stdio.h> diff --git a/compiler-rt/test/profile/instrprof-hostname.c b/compiler-rt/test/profile/instrprof-hostname.c index b77cf8df158bd..c0b3426eeaa84 100644 --- a/compiler-rt/test/profile/instrprof-hostname.c +++ b/compiler-rt/test/profile/instrprof-hostname.c @@ -1,7 +1,7 @@ // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%h.%t-%h.profraw_%h %run %t -// RUN: %run uname -n > %t.n -// RUN: llvm-profdata merge -o %t.profdata `cat %t.n`.%t-`cat %t.n`.profraw_`cat %t.n` +// RUN: %run uname -n | tr -d '\n' > %t.n +// RUN: llvm-profdata merge -o %t.profdata %{readfile:%t.n}.%t-%{readfile:%t.n}.profraw_%{readfile:%t.n} // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s // REQUIRES: shell diff --git a/compiler-rt/test/profile/instrprof-tmpdir.c b/compiler-rt/test/profile/instrprof-tmpdir.c index 7206df3c2eb0c..9d4b3d35e94e7 100644 --- a/compiler-rt/test/profile/instrprof-tmpdir.c +++ b/compiler-rt/test/profile/instrprof-tmpdir.c @@ -1,3 +1,8 @@ +// AIX does not support env -u. +// TODO(boomanaiden154): Reenable AIX support once we use the internal shell by +// default. +// UNSUPPORTED: system-aix + // RUN: rm -rf %t // RUN: mkdir -p %t // RUN: cd %t @@ -12,8 +17,7 @@ // RUN: llvm-profdata show ./raw2.profraw | FileCheck %s -check-prefix TMPDIR // // Check that we fall back to the default path if TMPDIR is missing. -// RUN: %if system-aix %{ unset TMPDIR %} -// RUN: env %if !system-aix %{ -u TMPDIR %} LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING +// RUN: env -u TMPDIR LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING // RUN: llvm-profdata show ./default.profraw | FileCheck %s -check-prefix TMPDIR // TMPDIR: Maximum function count: 1 diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp index 6ba7025bf7578..e4064828015aa 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp @@ -7,7 +7,7 @@ // RUN: not %run %t %t-out && FileCheck < %t-out %s // REQUIRES: stable-runtime -// XFAIL: android && asan +// UNSUPPORTED: android && asan #include <sanitizer/common_interface_defs.h> #include <stdio.h> diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, and for Thread 2 to then fail + // to acquire the guard, call `OnPotentiallyBlockingRegionBegin`, and print + // "Enter potentially blocking region\n" before Thread 1 manages to reach + // "Enter constructor\n".
This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. // - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); diff --git a/compiler-rt/test/tsan/ignore_lib0.cpp b/compiler-rt/test/tsan/ignore_lib0.cpp index cba58c6177038..9c4919022b512 100644 --- a/compiler-rt/test/tsan/ignore_lib0.cpp +++ b/compiler-rt/test/tsan/ignore_lib0.cpp @@ -4,11 +4,13 @@ // RUN: %clangxx_tsan -O1 -fno-builtin %s -DLIB -fPIC -fno-sanitize=thread -shared -o %t-dir/libignore_lib0.so // RUN: %clangxx_tsan -O1 %s -L%t-dir -lignore_lib0 %link_libcxx_tsan -o %t // RUN: echo running w/o suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP +// RUN: echo -n %t-dir > %t.ld_library_path +// RUN: %python -c "if 'LD_LIBRARY_PATH' in __import__('os').environ: print(':' + __import__('os').environ['LD_LIBRARY_PATH'], end='')" >> %t.ld_library_path +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %deflake %run %t | FileCheck %s --check-prefix=CHECK-NOSUPP // RUN: echo running with suppressions: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=suppressions='%s.supp' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // RUN: echo running with generic suppression of noninstrumented code: -// RUN: env LD_LIBRARY_PATH=%t-dir${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP +// RUN: env LD_LIBRARY_PATH=%{readfile:%t.ld_library_path} %env_tsan_opts=ignore_noninstrumented_modules=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WITHSUPP // Tests that interceptors coming from a library specified in called_from_lib // suppression are ignored. 
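The cxa_guard_acquire.cpp change above swaps two `CHECK-NEXT` lines for `CHECK-DAG` because nothing orders the losing thread's "Enter potentially blocking region" line against the winning thread's "Enter constructor" line. Below is a self-contained toy model of that interleaving; it uses a plain mutex in place of the real `__cxa_guard_*` machinery and the tsan `OnPotentiallyBlockingRegionBegin` callback, and all names in it are illustrative:

```cpp
// Toy model: two threads race to initialize a guarded static. The winner
// runs the "constructor"; the loser announces that it is about to block
// *before* it blocks, so its announcement can precede the winner's
// "Enter constructor" line. Output order depends on scheduling.
#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>

static std::mutex Guard;         // stands in for the __cxa_guard lock
static bool Initialized = false; // stands in for the guard byte

static void InitOnce(bool Slow) {
  if (Guard.try_lock()) {
    if (!Initialized) {
      if (Slow) // widens the race window, like the sleep(1) mentioned above
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
      std::printf("Enter constructor\n");
      Initialized = true;
      std::printf("Exit constructor\n");
    }
    Guard.unlock();
  } else {
    // The losing thread reports first, then blocks until the winner is done.
    std::printf("Enter potentially blocking region\n");
    Guard.lock();
    Guard.unlock();
    std::printf("Exit potentially blocking region\n");
  }
}

int main() {
  std::thread T1([] { InitOnce(/*Slow=*/true); });
  std::thread T2([] { InitOnce(/*Slow=*/false); });
  T1.join();
  T2.join();
  return 0;
}
```

When T2 loses the race while T1 is still inside the sleep, the output begins with "Enter potentially blocking region" followed by "Enter constructor", which is exactly the rare ordering the relaxed `CHECK-DAG` lines now tolerate.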
diff --git a/compiler-rt/test/tsan/target_clones_segfault.c b/compiler-rt/test/tsan/target_clones_segfault.c new file mode 100644 index 0000000000000..b8847ec619813 --- /dev/null +++ b/compiler-rt/test/tsan/target_clones_segfault.c @@ -0,0 +1,11 @@ +// https://github.com/llvm/llvm-project/issues/163369 +// RUN: %clang_tsan %s -o %t && %run %t + +#if __x86_64__ +__attribute__((target_clones("avx,default"))) +#endif +static int has_target_clones(void) { + return 0; +} + +int main(void) { has_target_clones(); } diff --git a/compiler-rt/test/tysan/basic.c b/compiler-rt/test/tysan/basic.c index 8e66e1a721383..28b94c425757e 100644 --- a/compiler-rt/test/tysan/basic.c +++ b/compiler-rt/test/tysan/basic.c @@ -1,6 +1,10 @@ -// RUN: %clang_tysan -O0 %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out.0 2>&1 // RUN: FileCheck %s < %t.out.0 -// RUN: %clang_tysan -O2 %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=false %s -o %t && %run %t 10 >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -O0 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 +// RUN: %clang_tysan -O2 -mllvm -tysan-outline-instrumentation=true %s -o %t && %run %t 10 >%t.out 2>&1 // RUN: FileCheck %s < %t.out #include <stdio.h> diff --git a/compiler-rt/test/tysan/simple_verify_outlines.c b/compiler-rt/test/tysan/simple_verify_outlines.c new file mode 100644 index 0000000000000..0d0730edb0b99 --- /dev/null +++ b/compiler-rt/test/tysan/simple_verify_outlines.c @@ -0,0 +1,22 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true %s -o %t && %run %t >%t.out.0 2>&1 +// RUN: FileCheck %s < %t.out.0 + +#include <stdio.h> + +void printInt(int *i) { printf("%d\n", *i); } + +int main() { + + float value = 5.0f; + printInt((int *)&value); + + return 0; +} + +// CHECK: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} +// CHECK-EMPTY: +// CHECK-NEXT: ERROR: TypeSanitizer: type-aliasing-violation +// CHECK-NEXT: READ of size 4 at {{.*}} with type int accesses an existing object of type float +// CHECK-NEXT: {{#0 0x.* in printInt}} diff --git a/compiler-rt/test/tysan/struct-offset-outline.c b/compiler-rt/test/tysan/struct-offset-outline.c new file mode 100644 index 0000000000000..c84eb2762f669 --- /dev/null +++ b/compiler-rt/test/tysan/struct-offset-outline.c @@ -0,0 +1,32 @@ +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s < %t.out +// RUN: %clang_tysan -mllvm -tysan-outline-instrumentation=true -mllvm -tysan-verify-outlined-instrumentation=true -O0 %s -o %t && %run %t >%t.out 2>&1 +// RUN: FileCheck %s --check-prefixes='CHECK,CHECK-VERIFY' < %t.out + +#include <stdio.h> +#include <stdlib.h> + +struct X { + int i; + int j; +}; + +int foo(struct X *p, struct X *q) { + q->j = 1; + p->i = 0; + // CHECK: ERROR: TypeSanitizer: type-aliasing-violation + // CHECK-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-3]] + // CHECK-VERIFY-EMPTY: + // CHECK-VERIFY-NEXT: ERROR: TypeSanitizer: type-aliasing-violation + // 
CHECK-VERIFY-NEXT: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4) + // CHECK-VERIFY-NEXT: {{#0 0x.* in foo .*struct-offset-outline.c:}}[[@LINE-7]] + return q->j; +} + +int main() { + unsigned char *p = malloc(3 * sizeof(int)); + printf("%i\n", foo((struct X *)(p + sizeof(int)), (struct X *)p)); +} + +// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp index 93c6bd66e127c..2eac710d98085 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/print_stack_trace.cpp @@ -1,5 +1,5 @@ -// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s -// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=1 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx -fsanitize=undefined -O0 %s -o %t && env UBSAN_OPTIONS=stack_trace_format=DEFAULT:fast_unwind_on_fatal=0 %run %t 2>&1 | FileCheck %s // This test is temporarily disabled due to broken unwinding on ARM. // UNSUPPORTED: target={{.*-linux-.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp index b8803aedc8851..36a4e65988f9a 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-single-thread.cpp @@ -1,11 +1,12 @@ // RUN: %clangxx_xray -g -std=c++11 %s -o %t // RUN: rm -f fdr-logging-1thr-* -// RUN: XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ +// RUN: env XRAY_OPTIONS=XRAY_OPTIONS="verbosity=1 patch_premain=true \ // RUN: xray_fdr_log=true \ // RUN: xray_fdr_log_func_duration_threshold_us=0 \ // RUN: xray_logfile_base=fdr-logging-1thr-" %run %t 2>&1 +// RUN: ls fdr-logging-1thr-* | head -n1 | tr -d '\n' > %t.xray_input // RUN: %llvm_xray convert --output-format=yaml --symbolize --instr_map=%t \ -// RUN: "`ls fdr-logging-1thr-* | head -n1`" | FileCheck %s +// RUN: "%{readfile:%t.xray_input}" | FileCheck %s // RUN: rm fdr-logging-1thr-* // UNSUPPORTED: target=arm{{.*}} diff --git a/flang-rt/include/flang-rt/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h index f6a81f7cb8120..3c6bcfec8d0c4 100644 --- a/flang-rt/include/flang-rt/runtime/io-stmt.h +++ b/flang-rt/include/flang-rt/runtime/io-stmt.h @@ -730,8 +730,7 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR>, RT_API_ATTRS bool AdvanceRecord(int = 1); RT_API_ATTRS int EndIoStatement(); RT_API_ATTRS bool CanAdvance() { - return DIR == Direction::Input && - (canAdvance_ || this->mutableModes().inNamelist); + return canAdvance_ || this->mutableModes().inNamelist; } private: diff --git a/flang-rt/lib/runtime/edit-output.cpp b/flang-rt/lib/runtime/edit-output.cpp index f90b6fb10963f..73dba35ff08d9 100644 --- a/flang-rt/lib/runtime/edit-output.cpp +++ b/flang-rt/lib/runtime/edit-output.cpp @@ -175,9 +175,10 @@ bool RT_API_ATTRS EditIntegerOutput(IoStatementState &io, const DataEdit &edit, } if (edit.IsListDirected()) { int total{std::max(leadingSpaces, 1) + subTotal}; - if 
(io.GetConnectionState().NeedAdvance(static_cast<std::size_t>(total)) && - !io.AdvanceRecord()) { - return false; + if (io.GetConnectionState().NeedAdvance(static_cast<std::size_t>(total))) { + if (!io.AdvanceRecord()) { + return false; + } } leadingSpaces = 1; } else if (!edit.width) { diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 97ac56236e799..2a2e19f9f17ec 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -17,6 +17,10 @@ #ifdef _WIN32 extern char **_environ; +#elif defined(__FreeBSD__) +// FreeBSD has environ in crt rather than libc. Using "extern char** environ" +// in the code of a shared library makes it fail to link with -Wl,--no-undefined +// See https://reviews.freebsd.org/D30842#840642 #else extern char **environ; #endif @@ -104,6 +108,11 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], #ifdef _WIN32 envp = _environ; +#elif defined(__FreeBSD__) + auto envpp{reinterpret_cast<char ***>(dlsym(RTLD_DEFAULT, "environ"))}; + if (envpp) { + envp = *envpp; + } #else envp = environ; #endif diff --git a/flang-rt/lib/runtime/io-stmt.cpp b/flang-rt/lib/runtime/io-stmt.cpp index b958f23cf5342..a88fbe605f890 100644 --- a/flang-rt/lib/runtime/io-stmt.cpp +++ b/flang-rt/lib/runtime/io-stmt.cpp @@ -1109,20 +1109,20 @@ ChildListIoStatementState<DIR>::ChildListIoStatementState( ChildIo &child, const char *sourceFile, int sourceLine) : ChildIoStatementState<DIR>{child, sourceFile, sourceLine} { #if !defined(RT_DEVICE_AVOID_RECURSION) - if constexpr (DIR == Direction::Input) { - if (const auto *listInput{child.parent() - .get_if<ListDirectedStatementState<Direction::Input>>()}) { - this->set_eatComma(listInput->eatComma()); - this->namelistGroup_ = listInput->namelistGroup(); - if (auto *childListInput{child.parent() - .get_if<ChildListIoStatementState<Direction::Input>>()}) { - // Child list input whose parent is child list input: can advance - // if the parent can. - this->canAdvance_ = childListInput->CanAdvance(); - } else { - // Child list input of top-level list input: can advance. - this->canAdvance_ = true; - } + if (const auto *listParent{ + child.parent().get_if<ListDirectedStatementState<DIR>>()}) { + if constexpr (DIR == Direction::Input) { + this->set_eatComma(listParent->eatComma()); + this->namelistGroup_ = listParent->namelistGroup(); + } + if (auto *childListParent{ + child.parent().get_if<ChildListIoStatementState<DIR>>()}) { + // Child list I/O whose parent is child list I/O: can advance + // if the parent can. + this->canAdvance_ = childListParent->CanAdvance(); + } else { + // Child list I/O of top-level list I/O: can advance. + this->canAdvance_ = true; } } #else diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt index 53cd54dfd215e..e1ab73d7d9301 100644 --- a/flang-rt/unittests/CMakeLists.txt +++ b/flang-rt/unittests/CMakeLists.txt @@ -22,12 +22,8 @@ if (CMAKE_CROSSCOMPILING) return () endif () -if (NOT TARGET llvm_gtest) - message(WARNING "Flang-RT unittests disabled due to GTest being unavailable; " - "Try LLVM_INSTALL_GTEST=ON for the LLVM build") - return () -endif () - +# Make the targets default_gtest and default_gtest_main available. 
+build_gtest() add_dependencies(flang-rt-test-depends FlangRTUnitTests diff --git a/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp index 8c0a6f29b6967..1a9817cc665de 100644 --- a/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp +++ b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp @@ -9,7 +9,6 @@ #include "flang-rt/runtime/descriptor.h" #include "flang/Common/ISO_Fortran_binding_wrapper.h" #include "flang/Testing/testing.h" -#include "llvm/Support/raw_ostream.h" #include <type_traits> using namespace Fortran::runtime; @@ -73,26 +72,9 @@ static void AddNoiseToCdesc(CFI_cdesc_t *dv, CFI_rank_t rank) { } } -#ifdef VERBOSE -static void DumpTestWorld(const void *bAddr, CFI_attribute_t attr, - CFI_type_t ty, std::size_t eLen, CFI_rank_t rank, - const CFI_index_t *eAddr) { - llvm::outs() << " base_addr: "; - llvm::outs().write_hex(reinterpret_cast<std::intptr_t>(bAddr)) - << " attribute: " << static_cast<int>(attr) - << " type: " << static_cast<int>(ty) << " elem_len: " << eLen - << " rank: " << static_cast<int>(rank) << " extent: "; - llvm::outs().write_hex(reinterpret_cast<std::intptr_t>(eAddr)) << '\n'; - llvm::outs().flush(); -} -#endif - static void check_CFI_establish(CFI_cdesc_t *dv, void *base_addr, CFI_attribute_t attribute, CFI_type_t type, std::size_t elem_len, CFI_rank_t rank, const CFI_index_t extents[]) { -#ifdef VERBOSE - DumpTestWorld(base_addr, attribute, type, elem_len, rank, extent); -#endif // CFI_establish reqs from F2018 section 18.5.5 int retCode{ CFI_establish(dv, base_addr, attribute, type, elem_len, rank, extents)}; @@ -305,9 +287,6 @@ static void check_CFI_allocate(CFI_cdesc_t *dv, const CFI_type_t type{dv->type}; const void *base_addr{dv->base_addr}; const int version{dv->version}; -#ifdef VERBOSE - DumpTestWorld(base_addr, attribute, type, elem_len, rank, nullptr); -#endif int retCode{CFI_allocate(dv, lower_bounds, upper_bounds, elem_len)}; Descriptor *desc = reinterpret_cast<Descriptor *>(dv); if (retCode == CFI_SUCCESS) { diff --git a/flang-rt/unittests/Runtime/AccessTest.cpp b/flang-rt/unittests/Runtime/AccessTest.cpp index d431d0d19bd61..d44f0ec3ced23 100644 --- a/flang-rt/unittests/Runtime/AccessTest.cpp +++ b/flang-rt/unittests/Runtime/AccessTest.cpp @@ -12,8 +12,8 @@ #include "CrashHandlerFixture.h" #include "gtest/gtest.h" #include "flang/Runtime/extensions.h" -#include "llvm/ADT/Twine.h" +#include <cstring> #include <fcntl.h> #include <sys/stat.h> #include <sys/types.h> @@ -82,8 +82,9 @@ static const char *temp_directory_path() { static std::string createTemporaryFile( const char *name, const AccessType &accessType) { - std::string path = - (llvm::Twine{temp_directory_path()} + "/" + addPIDSuffix(name)).str(); + std::ostringstream pathS; + pathS << temp_directory_path() << "/" << addPIDSuffix(name); + std::string path = pathS.str(); // O_CREAT | O_EXCL enforces that this file is newly created by this call. // This feels risky. If we don't have permission to create files in the diff --git a/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp index 8213edd1f9225..34901b5cd2139 100644 --- a/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp +++ b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp @@ -17,12 +17,11 @@ char buffer[1000]; std::vsnprintf(buffer, sizeof buffer, message, ap); va_end(ap); - llvm::errs() - << "Test " - << ::testing::UnitTest::GetInstance()->current_test_info()->name() - << " crashed in file " - << (sourceFile ? 
sourceFile : "unknown source file") << '(' << sourceLine - << "): " << buffer << '\n'; + std::cerr << "Test " + << ::testing::UnitTest::GetInstance()->current_test_info()->name() + << " crashed in file " + << (sourceFile ? sourceFile : "unknown source file") << '(' + << sourceLine << "): " << buffer << '\n'; std::exit(EXIT_FAILURE); } diff --git a/flang-rt/unittests/Runtime/Descriptor.cpp b/flang-rt/unittests/Runtime/Descriptor.cpp index 3a4a7670fc62e..4a7bb43a492af 100644 --- a/flang-rt/unittests/Runtime/Descriptor.cpp +++ b/flang-rt/unittests/Runtime/Descriptor.cpp @@ -32,8 +32,8 @@ TEST(Descriptor, FixedStride) { extent[0] = 8; descriptor.Establish(integer, four, data, 1, extent); ASSERT_EQ(descriptor.rank(), 1); - ASSERT_EQ(descriptor.Elements(), 8); - ASSERT_EQ(descriptor.ElementBytes(), four); + ASSERT_EQ(descriptor.Elements(), 8u); + ASSERT_EQ(descriptor.ElementBytes(), static_cast<unsigned>(four)); ASSERT_EQ(descriptor.GetDimension(0).LowerBound(), 0); ASSERT_EQ(descriptor.GetDimension(0).ByteStride(), four); ASSERT_EQ(descriptor.GetDimension(0).Extent(), 8); diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp index 6c148b1de6f82..6421194f45141 100644 --- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp +++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp @@ -16,7 +16,6 @@ #include "flang/Runtime/io-api.h" #include "flang/Runtime/main.h" #include "flang/Runtime/stop.h" -#include "llvm/Support/raw_ostream.h" #include <cstring> #include <string_view> diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt index 568f942cb4aa6..b183d6add1059 100644 --- a/flang/docs/CMakeLists.txt +++ b/flang/docs/CMakeLists.txt @@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target) endif() get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY) list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}") - list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/") + list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/") clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target}) endfunction() diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 2f16a8d579f8b..128d8f9b6b707 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -32,6 +32,22 @@ A list of non-standard directives supported by Flang end end interface ``` + Note that it is not allowed to pass an array actual argument to an `ignore_tkr(R)` + dummy argument that is a scalar with the `VALUE` attribute, for example: +``` + interface + subroutine s(b) + !dir$ ignore_tkr(r) b + integer, value :: b + end + end interface + integer :: a(5) + call s(a) +``` + The reason for this limitation is that scalars with the `VALUE` attribute can + be passed in registers, so it is not clear how lowering should handle this + case. (Passing a scalar actual argument to an `ignore_tkr(R)` dummy argument + that is a scalar with the `VALUE` attribute is allowed.) * `!dir$ assume_aligned desginator:alignment`, where designator is a variable, maybe with array indices, and alignment is what the compiler should assume the alignment to be. E.g A:64 or B(1,1,1):128. The alignment should be a power of 2, @@ -52,6 +68,9 @@ A list of non-standard directives supported by Flang integer that specifying the unrolling factor. When `N` is `0` or `1`, the loop should not be unrolled at all.
If `N` is omitted the optimizer will selects the number of times to unroll the loop. +* `!dir$ prefetch designator[, designator]...`, where the designator list can be + a variable or an array reference. This directive is used to insert a hint to + the code generator to emit prefetch instructions for the listed memory references. * `!dir$ novector` disabling vectorization on the following loop. * `!dir$ nounroll` disabling unrolling on the following loop. * `!dir$ nounroll_and_jam` disabling unrolling and jamming on the following loop. diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 6d872094811e3..c9cc02703fbc8 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -182,6 +182,13 @@ end Note that internally the main program symbol name is all uppercase, unlike the names of all other symbols, which are usually all lowercase. This may make a difference in testing/debugging. +* A `PROCEDURE()` with no interface name or type may be called as a + subroutine with an implicit interface, F'2023 15.4.3.6 paragraph 4 and + C1525 notwithstanding. + This is a universally portable feature, and it also applies to + `PROCEDURE(), POINTER, NOPASS` derived type components. + Such procedures may *not* be referenced as implicitly typed functions + without first being associated with a function pointer. ## Extensions, deletions, and legacy features supported by default @@ -954,4 +961,3 @@ print *, [(j,j=1,10)] "&GRP A(1:)=1. 2. 3./". This extension is necessarily disabled when the type of the array has an accessible defined formatted READ subroutine. - diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 3286171bb1499..9953f2252218b 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -76,7 +76,7 @@ will ignore it when used without `Xflang`. As hinted above, `flang` and `flang -fc1` are two separate tools. The fact that these tools are accessed through one binary, `flang`, is just an implementation detail. Each tool has a separate list of options, albeit defined -in the same file: `clang/include/clang/Driver/Options.td`. +in the same file: `clang/include/clang/Options/Options.td`. The separation helps us split various tasks and allows us to implement more specialised tools. In particular, `flang` is not aware of various @@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to: as linkers and assemblers. One implication of this dependency on Clang is that all of Flang's compiler options are defined alongside Clang's options in -`clang/include/clang/Driver/Options.td`. For options that are common for both +`clang/include/clang/Options/Options.td`. For options that are common for both Flang and Clang, the corresponding definitions are shared. Internally, a `clangDriver` based compiler driver works by creating actions @@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps: ### Option Definition All of Flang's compiler and frontend driver options are defined in -`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to +`clang/include/clang/Options/Options.td` in Clang. When adding a new option to Flang, you will either: * extend the existing definition for an option that is already available in one of Clang's drivers (e.g. `clang`), but not yet available in Flang, or @@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g.
`ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.: ```cpp - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; ``` diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..bb55a8163d938 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -348,6 +348,7 @@ struct NodeVisitor { READ_FEATURE(TeamValue) READ_FEATURE(ImageSelector) READ_FEATURE(ImageSelectorSpec) + READ_FEATURE(ImageSelectorSpec::Notify) READ_FEATURE(ImageSelectorSpec::Stat) READ_FEATURE(ImageSelectorSpec::Team_Number) READ_FEATURE(ImplicitPart) @@ -445,6 +446,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +543,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +551,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Evaluate/check-expression.h b/flang/include/flang/Evaluate/check-expression.h index 2ff78d75325ef..d11fe22c0be7b 100644 --- a/flang/include/flang/Evaluate/check-expression.h +++ b/flang/include/flang/Evaluate/check-expression.h @@ -163,8 +163,8 @@ extern template bool IsErrorExpr(const Expr<SomeType> &); std::optional<parser::Message> CheckStatementFunction( const Symbol &, const Expr<SomeType> &, FoldingContext &); -bool MayNeedCopy(const ActualArgument *, const characteristics::DummyArgument *, - FoldingContext &, bool forCopyOut); +std::optional<bool> ActualArgNeedsCopy(const ActualArgument *, + const characteristics::DummyArgument *, FoldingContext &, bool forCopyOut); } // namespace Fortran::evaluate #endif diff --git a/flang/include/flang/Evaluate/common.h b/flang/include/flang/Evaluate/common.h index 0263f15d4215e..3d220afa71718 100644 --- a/flang/include/flang/Evaluate/common.h +++ b/flang/include/flang/Evaluate/common.h @@ -303,10 +303,16 @@ class FoldingContext { return common::ScopedSet(analyzingPDTComponentKindSelector_, true); } + common::Restorer<std::string> SetRealFlagWarningContext(std::string str) { + return common::ScopedSet(realFlagWarningContext_, str); + } + parser::CharBlock SaveTempName(std::string &&name) { return {*tempNames_.emplace(std::move(name)).first}; } + void RealFlagWarnings(const RealFlags &, const char *op); + private: parser::ContextualMessages messages_; const common::IntrinsicTypeDefaultKinds &defaults_; @@ -318,8 +324,8 @@ class FoldingContext { std::map<parser::CharBlock, ConstantSubscript> impliedDos_; const common::LanguageFeatureControl &languageFeatures_; std::set<std::string> &tempNames_; + std::string realFlagWarningContext_; }; -void RealFlagWarnings(FoldingContext &, const RealFlags &, const char *op); } // namespace 
Fortran::evaluate #endif // FORTRAN_EVALUATE_COMMON_H_ diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 7f64d230f7348..4248e3a5461f5 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1110,6 +1110,9 @@ bool IsArraySection(const Expr<SomeType> &expr); // Predicate: does an expression contain constant? bool HasConstant(const Expr<SomeType> &); +// Predicate: does an expression contain a structure component? +bool HasStructureComponent(const Expr<SomeType> &expr); + // Utilities for attaching the location of the declaration of a symbol // of interest to a message. Handles the case of USE association gracefully. parser::Message *AttachDeclaration(parser::Message &, const Symbol &); diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h index 48aafa8982559..d63c16f93230a 100644 --- a/flang/include/flang/Evaluate/traverse.h +++ b/flang/include/flang/Evaluate/traverse.h @@ -146,7 +146,7 @@ class Traverse { return Combine(x.base(), x.subscript()); } Result operator()(const CoarrayRef &x) const { - return Combine(x.base(), x.cosubscript(), x.stat(), x.team()); + return Combine(x.base(), x.cosubscript(), x.notify(), x.stat(), x.team()); } Result operator()(const DataRef &x) const { return visitor_(x.u); } Result operator()(const Substring &x) const { diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h index 5c14421fd3a1b..4f64ede3d407d 100644 --- a/flang/include/flang/Evaluate/variable.h +++ b/flang/include/flang/Evaluate/variable.h @@ -260,6 +260,9 @@ class CoarrayRef { // it's TEAM=. std::optional<Expr<SomeType>> team() const; CoarrayRef &set_team(Expr<SomeType> &&); + // When notify() is Expr<SomeType>, it's NOTIFY=. + std::optional<Expr<SomeType>> notify() const; + CoarrayRef &set_notify(Expr<SomeType> &&); int Rank() const; int Corank() const { return 0; } @@ -272,6 +275,7 @@ class CoarrayRef { private: common::CopyableIndirection<DataRef> base_; std::vector<Expr<SubscriptInteger>> cosubscript_; + std::optional<common::CopyableIndirection<Expr<SomeType>>> notify_; std::optional<common::CopyableIndirection<Expr<SomeInteger>>> stat_; std::optional<common::CopyableIndirection<Expr<SomeType>>> team_; }; diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index dc3da7ba5c7f3..d5415faf06f47 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -54,7 +54,7 @@ CODEGENOPT(Underscoring, 1, 1) ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use.
ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codegenoptions::NoDebugInfo) ///< Level of debug info to generate ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 4, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use -ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers +ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 3, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers ENUM_CODEGENOPT(ComplexRange, ComplexRangeKind, 3, ComplexRangeKind::CX_Full) ///< Method for calculating complex number division ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index f8322a50effc4..f93f7ad867b30 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -271,6 +271,12 @@ class AbstractConverter { virtual bool isRegisteredDummySymbol(Fortran::semantics::SymbolRef symRef) const = 0; + /// Get the source-level argument position (1-based) for a dummy symbol. + /// Returns 0 if the symbol is not a registered dummy or position is unknown. + /// Can only be used reliably during the instantiation of variables. + virtual unsigned + getDummyArgPosition(const Fortran::semantics::Symbol &sym) const = 0; + /// Returns the FunctionLikeUnit being lowered, if any. virtual const Fortran::lower::pft::FunctionLikeUnit * getCurrentFunctionUnit() const = 0; @@ -351,6 +357,11 @@ class AbstractConverter { virtual Fortran::lower::StatementContext &getFctCtx() = 0; + /// Generate STAT and ERRMSG from a list of StatOrErrmsg + virtual std::pair<mlir::Value, mlir::Value> + genStatAndErrmsg(mlir::Location loc, + const std::list<Fortran::parser::StatOrErrmsg> &) = 0; + AbstractConverter(const Fortran::lower::LoweringOptions &loweringOptions) : loweringOptions(loweringOptions) {} virtual ~AbstractConverter() = default; diff --git a/flang/include/flang/Lower/DirectivesCommon.h b/flang/include/flang/Lower/DirectivesCommon.h index 2d6906738773a..b564ee1f64423 100644 --- a/flang/include/flang/Lower/DirectivesCommon.h +++ b/flang/include/flang/Lower/DirectivesCommon.h @@ -512,11 +512,19 @@ fir::factory::AddrAndBoundsInfo gatherDataOperandAddrAndBounds( } bool dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray(symRef->get().GetUltimate()); - if (genDefaultBounds && - mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType()))) + if (genDefaultBounds && mlir::isa<fir::SequenceType>( + fir::unwrapRefType(info.addr.getType()))) { bounds = fir::factory::genBaseBoundsOps<BoundsOp, BoundsType>( builder, operandLocation, dataExv, dataExvIsAssumedSize, strideIncludeLowerExtent); + } + if (genDefaultBounds && (fir::characterWithDynamicLen( + fir::unwrapRefType(info.addr.getType())) || + mlir::isa<fir::BoxCharType>( + fir::unwrapRefType(info.addr.getType())))) { + bounds = {fir::factory::genBoundsOpFromBoxChar<BoundsOp, BoundsType>( + builder, operandLocation, dataExv, info)}; + } asFortran << symRef->get().name().ToString(); } else { // Unsupported llvm::report_fatal_error("Unsupported type of OpenACC operand"); diff --git a/flang/include/flang/Lower/Coarray.h b/flang/include/flang/Lower/MultiImageFortran.h similarity index 76% rename from flang/include/flang/Lower/Coarray.h rename to flang/include/flang/Lower/MultiImageFortran.h index
76d6a37b0bd61..d9dc9cf051f4c 100644 --- a/flang/include/flang/Lower/Coarray.h +++ b/flang/include/flang/Lower/MultiImageFortran.h @@ -1,4 +1,4 @@ -//===-- Lower/Coarray.h -- image related lowering ---------------*- C++ -*-===// +//===-- Lower/MultiImageFortran.h -- image related lowering -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef FORTRAN_LOWER_COARRAY_H -#define FORTRAN_LOWER_COARRAY_H +#ifndef FORTRAN_LOWER_MULTIIMAGEFORTRAN_H +#define FORTRAN_LOWER_MULTIIMAGEFORTRAN_H #include "flang/Lower/AbstractConverter.h" #include "flang/Optimizer/Builder/BoxValue.h" @@ -33,6 +33,18 @@ namespace pft { struct Evaluation; } // namespace pft +//===----------------------------------------------------------------------===// +// Synchronization statements +//===----------------------------------------------------------------------===// + +void genSyncAllStatement(AbstractConverter &, const parser::SyncAllStmt &); + +void genSyncImagesStatement(AbstractConverter &, + const parser::SyncImagesStmt &); +void genSyncMemoryStatement(AbstractConverter &, + const parser::SyncMemoryStmt &); +void genSyncTeamStatement(AbstractConverter &, const parser::SyncTeamStmt &); + //===----------------------------------------------------------------------===// // TEAM constructs //===----------------------------------------------------------------------===// @@ -75,4 +87,4 @@ class CoarrayExprHelper { } // namespace lower } // namespace Fortran -#endif // FORTRAN_LOWER_COARRAY_H +#endif // FORTRAN_LOWER_MULTIIMAGEFORTRAN_H diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h index 688d01704370d..3eff90b95a20d 100644 --- a/flang/include/flang/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -204,6 +204,7 @@ using At = tomp::clause::AtT<TypeTy, IdTy, ExprTy>; using Bind = tomp::clause::BindT<TypeTy, IdTy, ExprTy>; using Capture = tomp::clause::CaptureT<TypeTy, IdTy, ExprTy>; using Collapse = tomp::clause::CollapseT<TypeTy, IdTy, ExprTy>; +using Collector = tomp::clause::CollectorT<TypeTy, IdTy, ExprTy>; using Compare = tomp::clause::CompareT<TypeTy, IdTy, ExprTy>; using Contains = tomp::clause::ContainsT<TypeTy, IdTy, ExprTy>; using Copyin = tomp::clause::CopyinT<TypeTy, IdTy, ExprTy>; @@ -239,6 +240,7 @@ using If = tomp::clause::IfT<TypeTy, IdTy, ExprTy>; using Inbranch = tomp::clause::InbranchT<TypeTy, IdTy, ExprTy>; using Inclusive = tomp::clause::InclusiveT<TypeTy, IdTy, ExprTy>; using Indirect = tomp::clause::IndirectT<TypeTy, IdTy, ExprTy>; +using Inductor = tomp::clause::InductorT<TypeTy, IdTy, ExprTy>; using Init = tomp::clause::InitT<TypeTy, IdTy, ExprTy>; using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>; using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>; diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h index f76f398569b54..204093f9a766a 100644 --- a/flang/include/flang/Lower/Runtime.h +++ b/flang/include/flang/Lower/Runtime.h @@ -57,12 +57,6 @@ void genEventWaitStatement(AbstractConverter &, const parser::EventWaitStmt &); void genLockStatement(AbstractConverter &, const parser::LockStmt &); void genFailImageStatement(AbstractConverter &); void genStopStatement(AbstractConverter &, const parser::StopStmt &); -void 
genSyncAllStatement(AbstractConverter &, const parser::SyncAllStmt &); -void genSyncImagesStatement(AbstractConverter &, - const parser::SyncImagesStmt &); -void genSyncMemoryStatement(AbstractConverter &, - const parser::SyncMemoryStmt &); -void genSyncTeamStatement(AbstractConverter &, const parser::SyncTeamStmt &); void genUnlockStatement(AbstractConverter &, const parser::UnlockStmt &); void genPauseStatement(AbstractConverter &, const parser::PauseStmt &); diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..ae7d566920656 --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,94 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value 
genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <mlir::NVVM::MemScopeKind scope> + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 5c56dd6b695f8..6e2442745f9a0 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; +class KindMapping; } // namespace fir namespace cuf { @@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); +int computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure = true); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index 9f7c10c2b06c2..9933e3ed6c308 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -233,7 +233,7 @@ genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope = nullptr, mlir::Value storage = nullptr, std::uint64_t storageOffset = 0, - cuf::DataAttributeAttr dataAttr = {}); + cuf::DataAttributeAttr dataAttr = {}, unsigned dummyArgNo = 0); /// Generate an hlfir.associate to build a variable from an expression value. 
/// The type of the variable must be provided so that scalar logicals are @@ -450,6 +450,41 @@ mlir::Value inlineElementalOp( mlir::IRMapping &mapper, const std::function<bool(hlfir::ElementalOp)> &mustRecursivelyInline); +/// Generate an element-by-element assignment from \p rhs to \p lhs for arrays +/// that are known not to alias. The assignment is performed using a loop nest +/// over the optimal extents deduced from both shapes. If \p emitWorkshareLoop +/// is true, a workshare loop construct may be emitted when available. +/// Allocatable LHS must be allocated with the right shape and parameters. +void genNoAliasArrayAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop = false, + bool temporaryLHS = false, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner = + nullptr); + +/// Generate an assignment from \p rhs to \p lhs when they are known not to +/// alias. Handles both arrays and scalars: for arrays, delegates to +/// genNoAliasArrayAssignment; for scalars, performs load/store for trivial +/// scalar types and falls back to hlfir.assign otherwise. +/// Allocatable LHS must be allocated with the right shape and parameters. +void genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop = false, + bool temporaryLHS = false, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner = + nullptr); +inline void genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> + combiner) { + genNoAliasAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, temporaryLHS, + &combiner); +} + /// Create a new temporary with the shape and parameters of the provided /// hlfir.eval_in_mem operation and clone the body of the hlfir.eval_in_mem /// operating on this new temporary. 
returns the temporary and whether the diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..ce0b26c868701 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,20 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -208,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -234,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -276,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -294,6 +270,7 @@ struct IntrinsicLibrary { void genGetEnvironmentVariable(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genGetGID(mlir::Type resultType, 
llvm::ArrayRef<mlir::Value> args); + mlir::Value genGetTeam(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genGetUID(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args); fir::ExtendedValue genHostnm(std::optional<mlir::Type> resultType, @@ -368,8 +345,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -392,8 +367,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -448,56 +421,27 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genTeamNumber(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue 
genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. There may be an additional KIND arguments that diff --git a/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td b/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td index 04f839386498c..b29228eed1591 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td +++ b/flang/include/flang/Optimizer/Dialect/FIRCG/CGOps.td @@ -229,12 +229,13 @@ def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> { let arguments = (ins AnyRefOrBox:$memref, Variadic<AnyIntegerType>:$shape, Variadic<AnyIntegerType>:$shift, Variadic<AnyIntegerType>:$typeparams, - Optional<fir_DummyScopeType>:$dummy_scope, Builtin_StringAttr:$uniq_name); + Optional<fir_DummyScopeType>:$dummy_scope, Builtin_StringAttr:$uniq_name, + OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyRefOrBox); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`origin` $shift^)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? 
attr-dict `:` functional-type(operands, results) }]; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h index 62ef8b4b502f2..4651f2bb8038e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.h +++ b/flang/include/flang/Optimizer/Dialect/FIROps.h @@ -20,6 +20,7 @@ #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" namespace fir { diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 58a317cf5d691..8d2dc71281ce0 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -17,6 +17,7 @@ include "mlir/Dialect/Arith/IR/ArithBase.td" include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td" include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td" +include "mlir/Interfaces/ViewLikeInterface.td" include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td" include "flang/Optimizer/Dialect/FIRDialect.td" include "flang/Optimizer/Dialect/FIRTypes.td" @@ -79,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint< // Memory SSA operations //===----------------------------------------------------------------------===// -def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, - MemoryEffects<[MemAlloc<AutomaticAllocationScopeResource>]>]> { +def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> { let summary = "allocate storage for a temporary on the stack given a type"; let description = [{ This primitive operation is used to allocate an object on the stack. A @@ -161,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_ReferenceType); + let results = + (outs Res<fir_ReferenceType, + "", [MemAlloc<AutomaticAllocationScopeResource>]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -211,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, }]; } -def fir_AllocMemOp : fir_Op<"allocmem", - [MemoryEffects<[MemAlloc<DefaultResource>]>, AttrSizedOperandSegments]> { +def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> { let summary = "allocate storage on the heap for an object of a given type"; let description = [{ @@ -234,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem", Variadic<AnyIntegerType>:$typeparams, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_HeapType); + let results = (outs Res<fir_HeapType, "", [MemAlloc<DefaultResource>]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -2828,7 +2829,8 @@ def fir_VolatileCastOp : fir_SimpleOneResultOp<"volatile_cast", [Pure]> { let hasFolder = 1; } -def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { +def fir_ConvertOp + : fir_SimpleOneResultOp<"convert", [NoMemoryEffect, ViewLikeOpInterface]> { let summary = "encapsulates all Fortran entity type conversions"; let description = [{ @@ -2866,6 +2868,7 @@ def fir_ConvertOp : fir_SimpleOneResultOp<"convert", [NoMemoryEffect]> { static bool isPointerCompatible(mlir::Type ty); static bool canBeConverted(mlir::Type inType, mlir::Type outType); static bool areVectorsCompatible(mlir::Type inTy, mlir::Type outTy); + mlir::Value getViewSource() { return getValue(); } }]; let hasCanonicalizer = 1; } @@ -3271,13 +3274,14 @@ def fir_DeclareOp DefaultValuedAttr<UI64Attr, "0">:$storage_offset, 
Builtin_StringAttr:$uniq_name, OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs, - OptionalAttr<cuf_DataAttributeAttr>:$data_attr); + OptionalAttr<cuf_DataAttributeAttr>:$data_attr, + OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyRefOrBox); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? (`storage` `(` $storage^ `[` $storage_offset `]` `)`)? attr-dict `:` functional-type(operands, results) }]; diff --git a/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td b/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td index 52471d3702b76..a6c7d0a07b019 100644 --- a/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td +++ b/flang/include/flang/Optimizer/Dialect/MIF/MIFOps.td @@ -21,6 +21,10 @@ include "flang/Optimizer/Dialect/FIRAttr.td" class mif_Op<string mnemonic, list<Trait> traits> : Op<MIFDialect, mnemonic, traits>; +class region_Op<string mnemonic, list<Trait> traits = []> + : mif_Op<mnemonic, !listconcat(traits, [RecursivelySpeculatable, + RecursiveMemoryEffects])> {} + //===----------------------------------------------------------------------===// // Initialization and Finalization //===----------------------------------------------------------------------===// @@ -174,6 +178,18 @@ def mif_SyncMemoryOp : mif_Op<"sync_memory", [AttrSizedOperandSegments]> { }]; } +def mif_SyncTeamOp : mif_Op<"sync_team", [AttrSizedOperandSegments]> { + let summary = "Performs a synchronization of the team identified by `team`"; + + let arguments = (ins AnyRefOrBoxType:$team, Optional<AnyReferenceLike>:$stat, + Optional<AnyRefOrBoxType>:$errmsg); + let assemblyFormat = [{ + $team (`stat` $stat^ )? + (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + //===----------------------------------------------------------------------===// // Collective Operations //===----------------------------------------------------------------------===// @@ -265,4 +281,148 @@ def mif_CoSumOp }]; } +//===----------------------------------------------------------------------===// +// Teams +//===----------------------------------------------------------------------===// + +def mif_FormTeamOp : mif_Op<"form_team", [AttrSizedOperandSegments]> { + let summary = + "Create a set of sibling teams whose parent team is the current team."; + let description = [{ + Create a new team for each unique `team_number` value specified. + Each executing image will belong to the team whose `team_number` is equal + to the value of team-number on that image, and `team_var` becomes defined + with a value that identifies that team. + + If `new_index` is specified, the executing image will have this image + index in its new team. Otherwise, the new image index is processor + dependent. + + Arguments: + - `team_number`: Shall be a positive integer. + - `team_var`: Shall be a variable of type TEAM_TYPE from the intrinsic + module ISO_FORTRAN_ENV. + - `new_index` (optional): Shall be an integer that corresponds to the index + that the calling image will have in the new team. + }]; + + let arguments = (ins AnyIntegerType:$team_number, + Arg<fir_BoxType, "", [MemWrite]>:$team_var, + Optional<AnyIntegerType>:$new_index, + Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + + let assemblyFormat = [{ + `team_number` $team_number `team_var` $team_var + (`new_index` $new_index^ )? + (`stat` $stat^ )?
+ (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +def mif_EndTeamOp : mif_Op<"end_team", [AttrSizedOperandSegments, Terminator, + ParentOneOf<["ChangeTeamOp"]>]> { + let summary = "Changes the current team to the parent team."; + let description = [{ + The END TEAM operation completes the CHANGE TEAM construct and + restores the current team to the team that was current before + the CHANGE TEAM construct. + }]; + + let arguments = (ins Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + let builders = [OpBuilder<(ins), [{ /* do nothing */ }]>]; + + let assemblyFormat = [{ + (`stat` $stat^ )? (`errmsg` $errmsg^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +//===----------------------------------------------------------------------===// +// NOTE: The CHANGE TEAM region will take a coarray association list as an +// argument. However, coarray management and coarray alias creation are not +// yet supported by the dialect. The argument is therefore not yet supported by +// this operation and will be added later. +//===----------------------------------------------------------------------===// +def mif_ChangeTeamOp + : region_Op<"change_team", [AttrSizedOperandSegments, + SingleBlockImplicitTerminator<"EndTeamOp">]> { + let summary = "Changes the current team."; + let description = [{ + The CHANGE TEAM construct changes the current team to the specified new + team, which must be a child team of the current team. + + ``` + mif.change_team %team { + %x = fir.convert %i : (index) -> i32 + ... + mif.end_team + } + ``` + }]; + + let arguments = (ins AnyRefOrBoxType:$team, + Arg<Optional<AnyReferenceLike>, "", [MemWrite]>:$stat, + Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg); + let regions = (region SizedRegion<1>:$region); + + let skipDefaultBuilders = 1; + let builders = + [OpBuilder<(ins "mlir::Value":$team, + CArg<"bool", "true">:$ensureTermination, + CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>, + OpBuilder<(ins "mlir::Value":$team, "mlir::Value":$stat, + "mlir::Value":$errmsg, CArg<"bool", "true">:$ensureTermination, + CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>]; + + let extraClassDeclaration = [{ + /// Get the body of the CHANGE TEAM construct + mlir::Block *getBody() { return &getRegion().front(); } + }]; + + let assemblyFormat = [{ + $team (`stat` $stat^)? + (`errmsg` $errmsg^)? + attr-dict `:` `(` type(operands) `)` + custom<ChangeTeamOpBody>($region) + }]; +} + +def mif_GetTeamOp : mif_Op<"get_team", []> { + let summary = "Get the team value for the current or ancestor team."; + let description = [{ + This operation gets the team value for the current or an ancestor team. + `level` (optional): If provided, must equal one of the following constants: + `INITIAL_TEAM`, `PARENT_TEAM` or `CURRENT_TEAM` from the module ISO_FORTRAN_ENV. + If `level` isn't present or has the value `CURRENT_TEAM`, the returned + value is the current team. + }]; + + let arguments = (ins Optional<AnyIntegerType>:$level); + let results = (outs fir_BoxType:$team); + + let assemblyFormat = [{ + (`level` $level^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + +def mif_TeamNumberOp : mif_Op<"team_number", []> { + let summary = "Get the team number"; + let description = [{ + Argument: `team` is optional and shall be a scalar of type TEAM_TYPE from + module ISO_FORTRAN_ENV whose value identifies the current or an ancestor team.
+ If `team` is absent, the team specified is the current team. + }]; + + let arguments = (ins Optional<AnyRefOrBoxType>:$team); + let results = (outs I64); + + let assemblyFormat = [{ + (`team` $team^ )? + attr-dict `:` functional-type(operands, results) + }]; +} + #endif // FORTRAN_DIALECT_MIF_MIF_OPS diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index b7563a2f752f0..73f6be604a55c 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -103,13 +103,13 @@ def hlfir_DeclareOp Builtin_StringAttr:$uniq_name, OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs, OptionalAttr<cuf_DataAttributeAttr>:$data_attr, - OptionalAttr<UnitAttr>:$skip_rebox); + OptionalAttr<UnitAttr>:$skip_rebox, OptionalAttr<UI32Attr>:$dummy_arg_no); let results = (outs AnyFortranVariable, AnyRefOrBoxLike); let assemblyFormat = [{ $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)? - (`dummy_scope` $dummy_scope^)? + (`dummy_scope` $dummy_scope^ (`arg` $dummy_arg_no^)?)? (`storage` `(` $storage^ `[` $storage_offset `]` `)`)? (`skip_rebox` $skip_rebox^)? attr-dict `:` functional-type(operands, results) @@ -122,7 +122,8 @@ def hlfir_DeclareOp CArg<"mlir::Value", "{}">:$storage, CArg<"std::uint64_t", "0">:$storage_offset, CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs, - CArg<"cuf::DataAttributeAttr", "{}">:$data_attr)>]; + CArg<"cuf::DataAttributeAttr", "{}">:$data_attr, + CArg<"unsigned", "0">:$dummy_arg_no)>]; let extraClassDeclaration = [{ /// Get the variable original base (same as input). It lacks diff --git a/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h new file mode 100644 index 0000000000000..c798681306c10 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h @@ -0,0 +1,51 @@ +//===- FIROpenACCSupportAnalysis.h - FIR OpenACCSupport Analysis ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the FIR-specific implementation of OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H +#define FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Value.h" +#include <string> + +namespace fir { +namespace acc { + +/// FIR-specific implementation for the OpenACCSupport analysis interface. +/// +/// This class provides the custom implementations of the OpenACCSupport +/// interface methods that are tailored to FIR's requirements and +/// can handle FIR dialect operations and types. 
+/// Its primary intent is to be registered with the OpenACCSupport analysis +/// using setImplementation(). +/// +/// Usage: +/// auto &support = getAnalysis<mlir::acc::OpenACCSupport>(); +/// support.setImplementation(fir::acc::FIROpenACCSupportAnalysis()); +/// +class FIROpenACCSupportAnalysis { +public: + FIROpenACCSupportAnalysis() = default; + + std::string getVariableName(mlir::Value v); + + std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type, + mlir::Value var); + + mlir::InFlightDiagnostic emitNYI(mlir::Location loc, + const mlir::Twine &message); +}; + +} // namespace acc +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h index 0627cc8ce4a6d..c27c7ebc3b06f 100644 --- a/flang/include/flang/Optimizer/OpenACC/Passes.h +++ b/flang/include/flang/Optimizer/OpenACC/Passes.h @@ -13,6 +13,9 @@ #ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H #define FORTRAN_OPTIMIZER_OPENACC_PASSES_H +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" @@ -25,6 +28,7 @@ namespace acc { #define GEN_PASS_REGISTRATION #include "flang/Optimizer/OpenACC/Passes.h.inc" +std::unique_ptr<mlir::Pass> createACCInitializeFIRAnalysesPass(); std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass(); } // namespace acc diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td index 3c127b30aa9b8..d947aa470494a 100644 --- a/flang/include/flang/Optimizer/OpenACC/Passes.td +++ b/flang/include/flang/Optimizer/OpenACC/Passes.td @@ -11,6 +11,22 @@ include "mlir/Pass/PassBase.td" +def ACCInitializeFIRAnalyses + : Pass<"acc-initialize-fir-analyses", "mlir::ModuleOp"> { + let summary = "Initialize FIR analyses for OpenACC passes"; + let description = [{ + This pass initializes analyses that can be used by subsequent OpenACC passes + in the pipeline. It creates and caches the OpenACCSupport analysis with a + FIR-specific implementation that can handle FIR types and operations. + It also initializes FIR's AliasAnalysis for use in OpenACC passes. + This pass needs to be rerun if any analyses were invalidated by MLIR's framework. + }]; + // In addition to pre-registering the needed analyses, this pass also + // pre-registers the dialects that various OpenACC passes may generate. + let dependentDialects = ["fir::FIROpsDialect", "hlfir::hlfirDialect", + "mlir::acc::OpenACCDialect"]; +} + def ACCRecipeBufferization : Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> { let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses"; diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h new file mode 100644 index 0000000000000..7afe97aac57e8 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -0,0 +1,58 @@ +//===- FIROpenACCOpsInterfaces.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains external operation interfaces for FIR. +// +//===----------------------------------------------------------------------===// + +#ifndef FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ +#define FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ + +#include "mlir/Dialect/OpenACC/OpenACC.h" + +namespace fir { +class DeclareOp; +} // namespace fir + +namespace hlfir { +class DeclareOp; +class DesignateOp; +} // namespace hlfir + +namespace fir::acc { + +template <typename Op> +struct PartialEntityAccessModel + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<Op>, Op> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + + // Default implementation - returns false (partial view) + bool isCompleteView(mlir::Operation *op) const { return false; } +}; + +// Full specializations for declare operations +template <> +struct PartialEntityAccessModel<fir::DeclareOp> + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<fir::DeclareOp>, fir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +template <> +struct PartialEntityAccessModel<hlfir::DeclareOp> + : public mlir::acc::PartialEntityAccessOpInterface::ExternalModel< + PartialEntityAccessModel<hlfir::DeclareOp>, hlfir::DeclareOp> { + mlir::Value getBaseEntity(mlir::Operation *op) const; + bool isCompleteView(mlir::Operation *op) const; +}; + +} // namespace fir::acc + +#endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h index 4817ed933ba06..3167c554abbdd 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h @@ -60,6 +60,8 @@ struct OpenACCMappableModel getOffsetInBytes(mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; + bool hasUnknownDimensions(mlir::Type type) const; + llvm::SmallVector<mlir::Value> generateAccBounds(mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h new file mode 100644 index 0000000000000..5ca0925ea681f --- /dev/null +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h @@ -0,0 +1,57 @@ +//===- FIROpenACCUtils.h - FIR OpenACC Utilities ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares utility functions for FIR OpenACC support. 
+// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H +#define FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Value.h" +#include <string> + +namespace fir { +namespace acc { + +/// Attempts to extract the variable name from a value by walking through +/// FIR operations and looking for variable names. +/// \param v The value to extract the variable name from +/// \param preferDemangledName If true, prefers demangled/bindc names over +/// mangled/unique names. If false, prefers mangled names. +/// Returns empty string if no name is found. +std::string getVariableName(mlir::Value v, bool preferDemangledName = true); + +/// Get the recipe name for a given recipe kind, FIR type, and optional +/// variable. Uses FIR's type string representation with appropriate prefix. For +/// firstprivate and reduction recipes, handles bounds suffix when all bounds +/// are constant. For reduction recipes, embeds the operator name in the recipe. +/// \param kind The recipe kind (private, firstprivate, or reduction) +/// \param type The FIR type (must be a FIR type) +/// \param var Optional variable value +/// \param bounds Optional bounds for array sections (used for suffix +/// generation) +/// \param reductionOp Optional reduction operator (required for reduction +/// recipes) +/// \return The complete recipe name with all necessary suffixes +std::string getRecipeName(mlir::acc::RecipeKind kind, mlir::Type type, + mlir::Value var = nullptr, + llvm::ArrayRef<mlir::Value> bounds = {}, + mlir::acc::ReductionOperator reductionOp = + mlir::acc::ReductionOperator::AccNone); + +/// Check if all bounds are expressed with constant values. 
+/// \param bounds Array of DataBoundsOp values to check +/// \return true if all bounds have constant lowerbound/upperbound or extent +bool areAllBoundsConstant(llvm::ArrayRef<mlir::Value> bounds); + +} // namespace acc +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_OPENACC_SUPPORT_FIROPENACCUTILS_H diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..353260b2e5c02 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -218,6 +218,7 @@ class ParseTreeDumper { NODE(CompilerDirective, NoVector) NODE(CompilerDirective, NoUnroll) NODE(CompilerDirective, NoUnrollAndJam) + NODE(CompilerDirective, Prefetch) NODE(parser, ComplexLiteralConstant) NODE(parser, ComplexPart) NODE(parser, ComponentArraySpec) @@ -387,6 +388,7 @@ class ParseTreeDumper { NODE(parser, TeamValue) NODE(parser, ImageSelector) NODE(parser, ImageSelectorSpec) + NODE(ImageSelectorSpec, Notify) NODE(ImageSelectorSpec, Stat) NODE(ImageSelectorSpec, Team_Number) NODE(parser, ImplicitPart) @@ -512,6 +514,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -585,6 +588,8 @@ class ParseTreeDumper { NODE(parser, OmpExpectation) NODE_ENUM(OmpExpectation, Value) NODE(parser, OmpFailClause) + NODE(parser, OmpFallbackModifier) + NODE_ENUM(OmpFallbackModifier, Value) NODE(parser, OmpFromClause) NODE(OmpFromClause, Modifier) NODE(parser, OmpGrainsizeClause) @@ -739,7 +744,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +752,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include <type_traits> #include <utility> #include <variant> +#include <vector> namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template <typename T> constexpr auto addr_if(const std::optional<T> &x) { } namespace detail { -using D = llvm::omp::Directive; - -template <typename Construct> // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId<Construct> { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait<T>) { if constexpr (std::is_base_of_v<OmpBlockConstruct, T>) { return 
std::get<OmpBeginDirective>(x.t).DirName(); - } else if constexpr (std::is_same_v<T, OpenMPDeclarativeAllocate> || - std::is_same_v<T, OpenMPExecutableAllocate>) { - return MakeName(std::get<Verbatim>(x.t).source, ConstructId<T>::id); } else { return GetFromTuple( x.t, std::make_index_sequence<std::tuple_size_v<decltype(x.t)>>{}); @@ -139,6 +120,9 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template <typename T> @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector<const OmpAllocateDirective *> dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 375790af90b74..2f6b95b2fa2a8 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -1684,13 +1684,15 @@ using Cosubscript = ScalarIntExpr; WRAPPER_CLASS(TeamValue, Scalar<common::Indirection<Expr>>); // R926 image-selector-spec -> +// NOTIFY = notify-variable | // STAT = stat-variable | TEAM = team-value | // TEAM_NUMBER = scalar-int-expr struct ImageSelectorSpec { WRAPPER_CLASS(Stat, Scalar<Integer<common::Indirection<Variable>>>); WRAPPER_CLASS(Team_Number, ScalarIntExpr); + WRAPPER_CLASS(Notify, Scalar<common::Indirection<Variable>>); UNION_CLASS_BOILERPLATE(ImageSelectorSpec); - std::variant<Stat, TeamValue, Team_Number> u; + std::variant<Notify, Stat, TeamValue, Team_Number> u; }; // R924 image-selector -> @@ -3358,6 +3360,7 @@ struct StmtFunctionStmt { // !DIR$ NOVECTOR // !DIR$ NOUNROLL // !DIR$ NOUNROLL_AND_JAM +// !DIR$ PREFETCH designator[, designator]... 
// !DIR$ FORCEINLINE // !DIR$ INLINE // !DIR$ NOINLINE @@ -3386,6 +3389,10 @@ struct CompilerDirective { struct UnrollAndJam { WRAPPER_CLASS_BOILERPLATE(UnrollAndJam, std::optional<std::uint64_t>); }; + struct Prefetch { + WRAPPER_CLASS_BOILERPLATE( + Prefetch, std::list<common::Indirection<Designator>>); + }; EMPTY_CLASS(NoVector); EMPTY_CLASS(NoUnroll); EMPTY_CLASS(NoUnrollAndJam); @@ -3396,7 +3403,8 @@ struct CompilerDirective { CharBlock source; std::variant<std::list<IgnoreTKR>, LoopCount, std::list<AssumeAligned>, VectorAlways, std::list<NameValue>, Unroll, UnrollAndJam, Unrecognized, - NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, NoInline> + NoVector, NoUnroll, NoUnrollAndJam, ForceInline, Inline, NoInline, + Prefetch> u; }; @@ -3992,6 +4000,17 @@ struct OmpExpectation { WRAPPER_CLASS_BOILERPLATE(OmpExpectation, Value); }; +// Ref: [6.1:tbd] +// +// fallback-modifier -> +// FALLBACK(fallback-mode) // since 6.1 +// fallback-mode -> +// ABORT | DEFAULT_MEM | NULL // since 6.1 +struct OmpFallbackModifier { + ENUM_CLASS(Value, Abort, Default_Mem, Null); + WRAPPER_CLASS_BOILERPLATE(OmpFallbackModifier, Value); +}; + // REF: [5.1:217-220], [5.2:293-294] // // OmpInteropRuntimeIdentifier -> // since 5.2 @@ -4121,9 +4140,8 @@ struct OmpOrderModifier { // // prescriptiveness -> // STRICT // since 5.1 -// FALLBACK // since 6.1 struct OmpPrescriptiveness { - ENUM_CLASS(Value, Strict, Fallback) + ENUM_CLASS(Value, Strict) WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; @@ -4504,7 +4522,7 @@ struct OmpDynamicAllocatorsClause { struct OmpDynGroupprivateClause { TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause); - MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness); + MODIFIER_BOILERPLATE(OmpAccessGroup, OmpFallbackModifier); std::tuple<MODIFIERS(), ScalarIntExpr> t; }; @@ -5151,17 +5169,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple<Verbatim, OmpObjectList, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAllocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty.
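+// For illustration (the mirror-image of the executable case above): a purely
+// declarative
+//   !$OMP ALLOCATE(x)
+// in a specification part is represented by a single OmpAllocateDirective:
+// OmpAllocateDirective
+// |- ALLOCATE(x) // begin directive
+// `- <empty> // block: no nested directive, no allocate-stmt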
+struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes, + std::variant<OmpAllocateDirective, OpenMPDeclarativeAssumes, OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct, OmpDeclareVariantDirective, OpenMPGroupprivate, OpenMPThreadprivate, @@ -5174,19 +5217,6 @@ struct OpenMPCriticalConstruct : public OmpBlockConstruct { INHERITED_TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct, OmpBlockConstruct); }; -// 2.11.3 allocate -> ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList, - std::optional<std::list<OpenMPDeclarativeAllocate>>, - Statement<AllocateStmt>> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5372,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct, OpenMPSectionConstruct, OpenMPLoopConstruct, OmpBlockConstruct, - OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct, - OpenMPUtilityConstruct, OpenMPExecutableAllocate, - OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, OpenMPCriticalConstruct> + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/include/flang/Semantics/openmp-modifiers.h b/flang/include/flang/Semantics/openmp-modifiers.h index bfa3aa4939cb1..283bf2a4c895e 100644 --- a/flang/include/flang/Semantics/openmp-modifiers.h +++ b/flang/include/flang/Semantics/openmp-modifiers.h @@ -67,6 +67,7 @@ template <typename SpecificTy> const OmpModifierDescriptor &OmpGetDescriptor(); #define DECLARE_DESCRIPTOR(name) \ template <> const OmpModifierDescriptor &OmpGetDescriptor<name>() +DECLARE_DESCRIPTOR(parser::OmpAccessGroup); DECLARE_DESCRIPTOR(parser::OmpAlignment); DECLARE_DESCRIPTOR(parser::OmpAlignModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier); @@ -82,6 +83,7 @@ DECLARE_DESCRIPTOR(parser::OmpDependenceType); DECLARE_DESCRIPTOR(parser::OmpDeviceModifier); DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier); DECLARE_DESCRIPTOR(parser::OmpExpectation); +DECLARE_DESCRIPTOR(parser::OmpFallbackModifier); DECLARE_DESCRIPTOR(parser::OmpInteropPreference); DECLARE_DESCRIPTOR(parser::OmpInteropType); DECLARE_DESCRIPTOR(parser::OmpIterator); diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index 032944d8be370..14a4f0e93bda5 100644 --- a/flang/include/flang/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h @@ -72,6 +72,8 @@ const parser::OmpObject *GetArgumentObject(const parser::OmpArgument &argument); bool IsCommonBlock(const Symbol &sym); bool IsExtendedListItem(const Symbol &sym); bool IsVariableListItem(const Symbol &sym); +bool IsTypeParamInquiry(const Symbol &sym); +bool IsStructureComponent(const Symbol &sym); bool IsVarOrFunctionRef(const MaybeExpr &expr); bool IsMapEnteringType(parser::OmpMapType::Value 
type); diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 8a7b9867c0979..1c3477013b559 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -107,6 +107,7 @@ bool IsBindCProcedure(const Scope &); // Returns a pointer to the function's symbol when true, else null const Symbol *IsFunctionResultWithSameNameAsFunction(const Symbol &); bool IsOrContainsEventOrLockComponent(const Symbol &); +bool IsOrContainsNotifyComponent(const Symbol &); bool CanBeTypeBoundProc(const Symbol &); // Does a non-PARAMETER symbol have explicit initialization with =value or // =>target in its declaration (but not in a DATA statement)? (Being @@ -652,6 +653,8 @@ using PotentialAndPointerComponentIterator = // dereferenced. PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent( const DerivedTypeSpec &, bool ignoreCoarrays = false); +PotentialComponentIterator::const_iterator FindNotifyPotentialComponent( + const DerivedTypeSpec &, bool ignoreCoarrays = false); PotentialComponentIterator::const_iterator FindCoarrayPotentialComponent( const DerivedTypeSpec &); PotentialAndPointerComponentIterator::const_iterator diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 839717d0833f1..e07076e42ec88 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -379,8 +379,11 @@ bool IsInitialProcedureTarget(const semantics::Symbol &symbol) { common::visitors{ [&](const semantics::SubprogramDetails &subp) { return !subp.isDummy() && !subp.stmtFunction() && - symbol.owner().kind() != semantics::Scope::Kind::MainProgram && - symbol.owner().kind() != semantics::Scope::Kind::Subprogram; + ((symbol.owner().kind() != + semantics::Scope::Kind::MainProgram && + symbol.owner().kind() != + semantics::Scope::Kind::Subprogram) || + ultimate.attrs().test(semantics::Attr::EXTERNAL)); }, [](const semantics::SubprogramNameDetails &x) { return x.kind() != semantics::SubprogramKind::Internal; @@ -1475,13 +1478,12 @@ class CopyInOutExplicitInterface { const characteristics::DummyDataObject &dummyObj) : fc_{fc}, actual_{actual}, dummyObj_{dummyObj} {} - // Returns true, if actual and dummy have different contiguity requirements - bool HaveContiguityDifferences() const { - // Check actual contiguity, unless dummy doesn't care + // Returns true if dummy arg needs to be contiguous + bool DummyNeedsContiguity() const { + if (dummyObj_.ignoreTKR.test(common::IgnoreTKR::Contiguous)) { + return false; + } bool dummyTreatAsArray{dummyObj_.ignoreTKR.test(common::IgnoreTKR::Rank)}; - bool actualTreatAsContiguous{ - dummyObj_.ignoreTKR.test(common::IgnoreTKR::Contiguous) || - IsSimplyContiguous(actual_, fc_)}; bool dummyIsExplicitShape{dummyObj_.type.IsExplicitShape()}; bool dummyIsAssumedSize{dummyObj_.type.attrs().test( characteristics::TypeAndShape::Attr::AssumedSize)}; @@ -1498,32 +1500,17 @@ class CopyInOutExplicitInterface { (dummyTreatAsArray && !dummyIsPolymorphic) || dummyIsVoidStar || dummyObj_.attrs.test( characteristics::DummyDataObject::Attr::Contiguous)}; - return !actualTreatAsContiguous && dummyNeedsContiguity; + return dummyNeedsContiguity; } - // Returns true, if actual and dummy have polymorphic differences bool HavePolymorphicDifferences() const { - bool dummyIsAssumedRank{dummyObj_.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedRank)}; - bool actualIsAssumedRank{semantics::IsAssumedRank(actual_)}; - bool 
dummyIsAssumedShape{dummyObj_.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedShape)}; - bool actualIsAssumedShape{semantics::IsAssumedShape(actual_)}; - if ((actualIsAssumedRank && dummyIsAssumedRank) || - (actualIsAssumedShape && dummyIsAssumedShape)) { - // Assumed-rank and assumed-shape arrays are represented by descriptors, - // so don't need to do polymorphic check. - } else if (!dummyObj_.ignoreTKR.test(common::IgnoreTKR::Type)) { - // flang supports limited cases of passing polymorphic to non-polimorphic. - // These cases require temporary of non-polymorphic type. (For example, - // the actual argument could be polymorphic array of child type, - // while the dummy argument could be non-polymorphic array of parent - // type.) + if (dummyObj_.ignoreTKR.test(common::IgnoreTKR::Type)) { + return false; + } + if (auto actualType{ + characteristics::TypeAndShape::Characterize(actual_, fc_)}) { + bool actualIsPolymorphic{actualType->type().IsPolymorphic()}; bool dummyIsPolymorphic{dummyObj_.type.type().IsPolymorphic()}; - auto actualType{ - characteristics::TypeAndShape::Characterize(actual_, fc_)}; - bool actualIsPolymorphic{ - actualType && actualType->type().IsPolymorphic()}; if (actualIsPolymorphic && !dummyIsPolymorphic) { return true; } @@ -1572,28 +1559,32 @@ class CopyInOutExplicitInterface { // procedures with explicit interface, it's expected that "dummy" is not null. // For procedures with implicit interface dummy may be null. // +// Returns std::optional<bool> indicating whether the copy is known to be +// needed (true) or not needed (false); returns std::nullopt if the necessity +// of the copy is undetermined. +// // Note that these copy-in and copy-out checks are done from the caller's // perspective, meaning that for copy-in the caller need to do the copy // before calling the callee. Similarly, for copy-out the caller is expected // to do the copy after the callee returns. -bool MayNeedCopy(const ActualArgument *actual, +std::optional<bool> ActualArgNeedsCopy(const ActualArgument *actual, const characteristics::DummyArgument *dummy, FoldingContext &fc, bool forCopyOut) { if (!actual) { - return false; + return std::nullopt; } if (actual->isAlternateReturn()) { - return false; + return std::nullopt; } const auto *dummyObj{dummy ? std::get_if<characteristics::DummyDataObject>(&dummy->u) : nullptr}; - const bool forCopyIn = !forCopyOut; + const bool forCopyIn{!forCopyOut}; if (!evaluate::IsVariable(*actual)) { - // Actual argument expressions that aren’t variables are copy-in, but - // not copy-out. + // Expressions are copy-in, but not copy-out. 
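+ // For example, given CALL SUB(X+1), the actual argument is an expression:
+ // the caller evaluates X+1 into a temporary and passes that temporary
+ // (copy-in), but there is no variable to copy a result back into after
+ // the call (no copy-out).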
return forCopyIn; } + auto maybeContigActual{IsContiguous(*actual, fc)}; if (dummyObj) { // Explicit interface CopyInOutExplicitInterface check{fc, *actual, *dummyObj}; if (forCopyOut && check.HasIntentIn()) { @@ -1616,28 +1607,25 @@ bool MayNeedCopy(const ActualArgument *actual, if (!check.HaveArrayOrAssumedRankArgs()) { return false; } - if (check.HaveContiguityDifferences()) { - return true; - } - if (check.HavePolymorphicDifferences()) { - return true; + if (maybeContigActual.has_value()) { + // We know whether actual arg is contiguous or not + bool isContiguousActual{maybeContigActual.value()}; + bool actualArgNeedsCopy{ + (!isContiguousActual || check.HavePolymorphicDifferences()) && + check.DummyNeedsContiguity()}; + return actualArgNeedsCopy; + } else { + // We don't know whether actual arg is contiguous or not + return check.DummyNeedsContiguity(); } } else { // Implicit interface - if (ExtractCoarrayRef(*actual)) { - // Coindexed actual args may need copy-in and copy-out with implicit - // interface - return true; - } - if (!IsSimplyContiguous(*actual, fc)) { - // Copy-in: actual arguments that are variables are copy-in when - // non-contiguous. - // Copy-out: vector subscripts could refer to duplicate elements, can't - // copy out. - return !(forCopyOut && HasVectorSubscript(*actual)); + if (maybeContigActual.has_value()) { + // If known contiguous, don't copy in/out. + // If known non-contiguous, copy in/out. + return !*maybeContigActual; } } - // For everything else, no copy-in or copy-out - return false; + return std::nullopt; } } // namespace Fortran::evaluate diff --git a/flang/lib/Evaluate/common.cpp b/flang/lib/Evaluate/common.cpp index 46c75a5c2ee44..119ea3c5612a5 100644 --- a/flang/lib/Evaluate/common.cpp +++ b/flang/lib/Evaluate/common.cpp @@ -13,24 +13,29 @@ using namespace Fortran::parser::literals; namespace Fortran::evaluate { -void RealFlagWarnings( - FoldingContext &context, const RealFlags &flags, const char *operation) { +void FoldingContext::RealFlagWarnings( + const RealFlags &flags, const char *operation) { static constexpr auto warning{common::UsageWarning::FoldingException}; + if (!realFlagWarningContext_.empty()) { + // Override 'operation' with a string like + // "compilation-time evaluation of a call to '...'" + operation = realFlagWarningContext_.c_str(); + } if (flags.test(RealFlag::Overflow)) { - context.Warn(warning, "overflow on %s"_warn_en_US, operation); + Warn(warning, "overflow on %s"_warn_en_US, operation); } if (flags.test(RealFlag::DivideByZero)) { if (std::strcmp(operation, "division") == 0) { - context.Warn(warning, "division by zero"_warn_en_US); + Warn(warning, "division by zero"_warn_en_US); } else { - context.Warn(warning, "division by zero on %s"_warn_en_US, operation); + Warn(warning, "division by zero on %s"_warn_en_US, operation); } } if (flags.test(RealFlag::InvalidArgument)) { - context.Warn(warning, "invalid argument on %s"_warn_en_US, operation); + Warn(warning, "invalid argument on %s"_warn_en_US, operation); } if (flags.test(RealFlag::Underflow)) { - context.Warn(warning, "underflow on %s"_warn_en_US, operation); + Warn(warning, "underflow on %s"_warn_en_US, operation); } } diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 3fdf3a6f38848..52ea627d0bbe4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1862,7 +1862,7 @@ Expr<TO> FoldOperation( std::snprintf(buffer, sizeof buffer, "INTEGER(%d) to REAL(%d) conversion", 
Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } return ScalarConstantToExpr(std::move(converted.value)); } else if constexpr (FromCat == TypeCategory::Real) { @@ -1871,7 +1871,7 @@ Expr<TO> FoldOperation( if (!converted.flags.empty()) { std::snprintf(buffer, sizeof buffer, "REAL(%d) to REAL(%d) conversion", Operand::kind, TO::kind); - RealFlagWarnings(ctx, converted.flags, buffer); + ctx.RealFlagWarnings(converted.flags, buffer); } if (ctx.targetCharacteristics().areSubnormalsFlushedToZero()) { converted.value = converted.value.FlushSubnormalToZero(); @@ -2012,7 +2012,7 @@ Expr<T> FoldOperation(FoldingContext &context, Add<T> &&x) { } else { auto sum{folded->first.Add( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, sum.flags, "addition"); + context.RealFlagWarnings(sum.flags, "addition"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { sum.value = sum.value.FlushSubnormalToZero(); } @@ -2041,7 +2041,7 @@ Expr<T> FoldOperation(FoldingContext &context, Subtract<T> &&x) { } else { auto difference{folded->first.Subtract( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, difference.flags, "subtraction"); + context.RealFlagWarnings(difference.flags, "subtraction"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { difference.value = difference.value.FlushSubnormalToZero(); } @@ -2070,7 +2070,7 @@ Expr<T> FoldOperation(FoldingContext &context, Multiply<T> &&x) { } else { auto product{folded->first.Multiply( folded->second, context.targetCharacteristics().roundingMode())}; - RealFlagWarnings(context, product.flags, "multiplication"); + context.RealFlagWarnings(product.flags, "multiplication"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { product.value = product.value.FlushSubnormalToZero(); } @@ -2141,7 +2141,7 @@ Expr<T> FoldOperation(FoldingContext &context, Divide<T> &&x) { } } if (!isCanonicalNaNOrInf) { - RealFlagWarnings(context, quotient.flags, "division"); + context.RealFlagWarnings(quotient.flags, "division"); } if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { quotient.value = quotient.value.FlushSubnormalToZero(); @@ -2201,7 +2201,7 @@ Expr<T> FoldOperation(FoldingContext &context, RealToIntPower<T> &&x) { [&](auto &y) -> Expr<T> { if (auto folded{OperandsAreConstants(x.left(), y)}) { auto power{evaluate::IntPower(folded->first, folded->second)}; - RealFlagWarnings(context, power.flags, "power with INTEGER exponent"); + context.RealFlagWarnings(power.flags, "power with INTEGER exponent"); if (context.targetCharacteristics().areSubnormalsFlushedToZero()) { power.value = power.value.FlushSubnormalToZero(); } diff --git a/flang/lib/Evaluate/host.cpp b/flang/lib/Evaluate/host.cpp index 25409ac3418b8..bf0249647162a 100644 --- a/flang/lib/Evaluate/host.cpp +++ b/flang/lib/Evaluate/host.cpp @@ -140,8 +140,8 @@ void HostFloatingPointEnvironment::CheckAndRestoreFloatingPointEnvironment( } if (!flags_.empty()) { - RealFlagWarnings( - context, flags_, "evaluation of intrinsic function or operation"); + context.RealFlagWarnings( + flags_, "evaluation of intrinsic function or operation"); } errno = 0; if (fesetenv(&originalFenv_) != 0) { diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index 9820aa3d2ea3d..54726ac539d60 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ 
b/flang/lib/Evaluate/intrinsics-library.cpp @@ -1043,7 +1043,7 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name, if (const auto *hostFunction{ SearchHostRuntime(name, biggerResultType, biggerArgTypes)}) { auto hostFolderWithChecks{AddArgumentVerifierIfAny(name, *hostFunction)}; - return [hostFunction, resultType, hostFolderWithChecks]( + return [hostFunction, resultType, hostFolderWithChecks, name]( FoldingContext &context, std::vector<Expr<SomeType>> &&args) { auto nArgs{args.size()}; for (size_t i{0}; i < nArgs; ++i) { @@ -1051,6 +1051,8 @@ std::optional<HostRuntimeWrapper> GetHostRuntimeWrapper(const std::string &name, ConvertToType(hostFunction->argumentTypes[i], std::move(args[i])) .value()); } + auto restorer{context.SetRealFlagWarningContext( + "compilation-time evaluation of a call to '"s + name + "'"s)}; return Fold(context, ConvertToType( resultType, hostFolderWithChecks(context, std::move(args))) diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index bd06acc21e47f..117b2249a9179 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1210,6 +1210,20 @@ bool HasConstant(const Expr<SomeType> &expr) { return HasConstantHelper{}(expr); } +// HasStructureComponent() +struct HasStructureComponentHelper + : public AnyTraverse<HasStructureComponentHelper, bool, false> { + using Base = AnyTraverse<HasStructureComponentHelper, bool, false>; + HasStructureComponentHelper() : Base(*this) {} + using Base::operator(); + + bool operator()(const Component &) const { return true; } +}; + +bool HasStructureComponent(const Expr<SomeType> &expr) { + return HasStructureComponentHelper{}(expr); +} + parser::Message *AttachDeclaration( parser::Message &message, const Symbol &symbol) { const Symbol *unhosted{&symbol}; diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp index b9b34d4d5bc89..b257dad42fc58 100644 --- a/flang/lib/Evaluate/variable.cpp +++ b/flang/lib/Evaluate/variable.cpp @@ -89,6 +89,14 @@ std::optional<Expr<SomeType>> CoarrayRef::team() const { } } +std::optional<Expr<SomeType>> CoarrayRef::notify() const { + if (notify_) { + return notify_.value().value(); + } else { + return std::nullopt; + } +} + CoarrayRef &CoarrayRef::set_stat(Expr<SomeInteger> &&v) { CHECK(IsVariable(v)); stat_.emplace(std::move(v)); @@ -100,6 +108,11 @@ CoarrayRef &CoarrayRef::set_team(Expr<SomeType> &&v) { return *this; } +CoarrayRef &CoarrayRef::set_notify(Expr<SomeType> &&v) { + notify_.emplace(std::move(v)); + return *this; +} + const Symbol &CoarrayRef::GetFirstSymbol() const { return base().GetFirstSymbol(); } diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 2b3bc0e9c2269..bb0b4a39cec9b 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -76,6 +76,7 @@ add_flang_library(flangFrontend CLANG_LIBS clangBasic clangDriver + clangOptions ) target_precompile_headers(flangFrontend PRIVATE diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 548ca675db5ea..893121fe01f27 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,8 +26,8 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/OptionUtils.h" -#include "clang/Driver/Options.h" +#include "clang/Options/OptionUtils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" 
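// A minimal sketch of the mechanical rename that the include change above
// begins and that the remaining hunks in this file carry through: option IDs
// formerly reached via clang::driver::options::OPT_* now live in
// clang::options::OPT_*, with the call sites otherwise unchanged. Assuming an
// llvm::opt::ArgList named 'args' and a TargetOptions 'opts' are in scope:
//
//   // before the patch:
//   if (const llvm::opt::Arg *a =
//           args.getLastArg(clang::driver::options::OPT_triple))
//     opts.triple = a->getValue();
//   // after the patch:
//   if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple))
//     opts.triple = a->getValue();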
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args, for (auto *a : args) { const llvm::opt::Option &opt = a->getOption(); - if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) { + if (opt.matches(clang::options::OPT_fcolor_diagnostics)) { showColors = Colors_On; - } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) { + } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) { showColors = Colors_Off; - } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) { + } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) { llvm::StringRef value(a->getValue()); if (value == "always") showColors = Colors_On; @@ -107,15 +107,13 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { unsigned defaultOpt = 0; - if (llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_O_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_O0)) + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) { + if (a->getOption().matches(clang::options::OPT_O0)) return 0; - assert(a->getOption().matches(clang::driver::options::OPT_O)); + assert(a->getOption().matches(clang::options::OPT_O)); - return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt, - diags); + return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags); } return defaultOpt; @@ -133,7 +131,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { using DebugInfoKind = llvm::codegenoptions::DebugInfoKind; if (llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) { + args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) { std::optional<DebugInfoKind> val = llvm::StringSwitch<std::optional<DebugInfoKind>>(arg->getValue()) .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly) @@ -158,13 +156,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, diags.Report(debugWarning) << arg->getValue(); } opts.DwarfVersion = - getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, + getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ, /*Default=*/0, diags); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_file)) + args.getLastArg(clang::options::OPT_split_dwarf_file)) opts.SplitDwarfFile = a->getValue(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_output)) + args.getLastArg(clang::options::OPT_split_dwarf_output)) opts.SplitDwarfOutput = a->getValue(); } return true; @@ -174,7 +172,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ); + args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ); if (!arg) return; @@ -199,7 +197,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib); if (!arg) return true; @@ -237,7 +235,7 @@ 
parseOptimizationRemark(clang::DiagnosticsEngine &diags, CodeGenOptions::OptRemark result; for (llvm::opt::Arg *a : args) { - if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) { + if (a->getOption().matches(clang::options::OPT_R_Joined)) { llvm::StringRef value = a->getValue(); if (value == remarkOptName) { @@ -274,43 +272,45 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { opts.OptimizationLevel = getOptimizationLevel(args, diags); - if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager, - clang::driver::options::OPT_fno_debug_pass_manager, false)) + if (args.hasFlag(clang::options::OPT_fdebug_pass_manager, + clang::options::OPT_fno_debug_pass_manager, false)) opts.DebugPassManager = 1; - if (args.hasFlag(clang::driver::options::OPT_fstack_arrays, - clang::driver::options::OPT_fno_stack_arrays, false)) + if (args.hasFlag(clang::options::OPT_fstack_arrays, + clang::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; - if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) + if (args.getLastArg(clang::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) + if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) + if (args.getLastArg(clang::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_slp)) + if (args.getLastArg(clang::options::OPT_vectorize_slp)) opts.VectorizeSLP = 1; - if (args.hasFlag(clang::driver::options::OPT_floop_versioning, - clang::driver::options::OPT_fno_loop_versioning, false)) + if (args.hasFlag(clang::options::OPT_floop_versioning, + clang::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; - opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, - clang::driver::options::OPT_fno_unroll_loops, + opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops, + clang::options::OPT_fno_unroll_loops, (opts.OptimizationLevel > 1)); opts.AliasAnalysis = opts.OptimizationLevel > 0; // -mframe-pointer=none/non-leaf/reserved/all option. 
if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) { + args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) { std::optional<llvm::FramePointerKind> val = llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue()) .Case("none", llvm::FramePointerKind::None) .Case("non-leaf", llvm::FramePointerKind::NonLeaf) + .Case("non-leaf-no-reserve", + llvm::FramePointerKind::NonLeafNoReserve) .Case("reserved", llvm::FramePointerKind::Reserved) .Case("all", llvm::FramePointerKind::All) .Default(std::nullopt); @@ -322,7 +322,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setFramePointer(val.value()); } - for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ)) opts.LLVMPassPlugins.push_back(a->getValue()); opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args); @@ -331,15 +331,14 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::tools::parseMPreferVectorWidthOption(diags, args); // -fembed-offload-object option - for (auto *a : - args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ)) opts.OffloadObjects.push_back(a->getValue()); - if (args.hasArg(clang::driver::options::OPT_finstrument_functions)) + if (args.hasArg(clang::options::OPT_finstrument_functions)) opts.InstrumentFunctions = 1; - if (const llvm::opt::Arg *a = args.getLastArg( - clang::driver::options::OPT_mcode_object_version_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) { llvm::StringRef s = a->getValue(); if (s == "6") opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6; @@ -353,36 +352,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, // -f[no-]save-optimization-record[=<format>] if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_file)) + args.getLastArg(clang::options::OPT_opt_record_file)) opts.OptRecordFile = a->getValue(); // Optimization file format. Defaults to yaml. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_format)) + args.getLastArg(clang::options::OPT_opt_record_format)) opts.OptRecordFormat = a->getValue(); // Specifies, using a regex, which successful optimization passes (middle and // backend) to include in the final optimization record file generated. If // not provided -fsave-optimization-record will include all passes. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_passes)) + args.getLastArg(clang::options::OPT_opt_record_passes)) opts.OptRecordPasses = a->getValue(); // Create OptRemark that allows printing of all successful optimization // passes applied. opts.OptimizationRemark = - parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ, + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ, /*remarkOptName=*/"pass"); // Create OptRemark that allows all missed optimization passes to be printed. 
- opts.OptimizationRemarkMissed = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_missed_EQ, - /*remarkOptName=*/"pass-missed"); + opts.OptimizationRemarkMissed = + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ, + /*remarkOptName=*/"pass-missed"); // Create OptRemark that allows all optimization decisions made by LLVM // to be printed. opts.OptimizationRemarkAnalysis = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_analysis_EQ, + diags, args, clang::options::OPT_Rpass_analysis_EQ, /*remarkOptName=*/"pass-analysis"); if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) { @@ -400,23 +399,22 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly); } - if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) + if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); // -record-command-line option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_record_command_line)) { + args.getLastArg(clang::options::OPT_record_command_line)) { opts.RecordCommandLine = a->getValue(); } // -mlink-builtin-bitcode - for (auto *a : - args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) + for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode)) opts.BuiltinBCLibs.push_back(a->getValue()); // -mrelocation-model option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { + args.getLastArg(clang::options::OPT_mrelocation_model)) { llvm::StringRef modelName = a->getValue(); auto relocModel = llvm::StringSwitch<std::optional<llvm::Reloc::Model>>(modelName) @@ -435,31 +433,30 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // -pic-level and -pic-is-pie option. - if (int picLevel = getLastArgIntValue( - args, clang::driver::options::OPT_pic_level, 0, diags)) { + if (int picLevel = + getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) { if (picLevel > 2) diags.Report(clang::diag::err_drv_invalid_value) - << args.getLastArg(clang::driver::options::OPT_pic_level) - ->getAsString(args) + << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args) << picLevel; opts.PICLevel = picLevel; - if (args.hasArg(clang::driver::options::OPT_pic_is_pie)) + if (args.hasArg(clang::options::OPT_pic_is_pie)) opts.IsPIE = 1; } - if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { + if (args.hasArg(clang::options::OPT_fprofile_generate)) { opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); } - if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { + if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) { opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); opts.ProfileInstrumentUsePath = A->getValue(); } // -mcmodel option. 
if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { + args.getLastArg(clang::options::OPT_mcmodel_EQ)) { llvm::StringRef modelName = a->getValue(); std::optional<llvm::CodeModel::Model> codeModel = getCodeModel(modelName); @@ -470,8 +467,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, << a->getAsString(args) << modelName; } - if (const llvm::opt::Arg *arg = args.getLastArg( - clang::driver::options::OPT_mlarge_data_threshold_EQ)) { + if (const llvm::opt::Arg *arg = + args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) { uint64_t LDT; if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) { diags.Report(clang::diag::err_drv_invalid_value) @@ -481,15 +478,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // This option is compatible with -f[no-]underscoring in gfortran. - if (args.hasFlag(clang::driver::options::OPT_fno_underscoring, - clang::driver::options::OPT_funderscoring, false)) { + if (args.hasFlag(clang::options::OPT_fno_underscoring, + clang::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } parseDoConcurrentMapping(opts, args, diags); if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) { + args.getLastArg(clang::options::OPT_complex_range_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); if (argValue == "full") { opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full); @@ -510,46 +507,42 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, /// \param [in] opts The target options instance to update /// \param [in] args The list of input arguments (from the compiler invocation) static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_triple)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple)) opts.triple = a->getValue(); - opts.atomicIgnoreDenormalMode = args.hasFlag( - clang::driver::options::OPT_fatomic_ignore_denormal_mode, - clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); - opts.atomicFineGrainedMemory = args.hasFlag( - clang::driver::options::OPT_fatomic_fine_grained_memory, - clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicIgnoreDenormalMode = + args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode, + clang::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = + args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory, + clang::options::OPT_fno_atomic_fine_grained_memory, false); opts.atomicRemoteMemory = - args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, - clang::driver::options::OPT_fno_atomic_remote_memory, false); + args.hasFlag(clang::options::OPT_fatomic_remote_memory, + clang::options::OPT_fno_atomic_remote_memory, false); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_target_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu)) opts.cpu = a->getValue(); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_tune_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu)) opts.cpuToTuneFor = a->getValue(); for (const llvm::opt::Arg *currentArg : - args.filtered(clang::driver::options::OPT_target_feature)) + args.filtered(clang::options::OPT_target_feature)) 
opts.featuresAsWritten.emplace_back(currentArg->getValue()); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + if (args.hasArg(clang::options::OPT_fdisable_real_10)) opts.disabledRealKinds.push_back(10); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + if (args.hasArg(clang::options::OPT_fdisable_real_3)) opts.disabledRealKinds.push_back(3); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + if (args.hasArg(clang::options::OPT_fdisable_integer_2)) opts.disabledIntegerKinds.push_back(2); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + if (args.hasArg(clang::options::OPT_fdisable_integer_16)) opts.disabledIntegerKinds.push_back(16); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) { opts.abi = a->getValue(); llvm::StringRef V = a->getValue(); if (V == "vec-extabi") { @@ -559,9 +552,8 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { } } - opts.asmVerbose = - args.hasFlag(clang::driver::options::OPT_fverbose_asm, - clang::driver::options::OPT_fno_verbose_asm, false); + opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm, + clang::options::OPT_fno_verbose_asm, false); } // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { @@ -594,108 +586,114 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Treat multiple action options as an invocation error. Note that `clang // -cc1` does accept multiple action options, but will only consider the // rightmost one. - if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) { - const unsigned diagID = diags.getCustomDiagID( - clang::DiagnosticsEngine::Error, "Only one action option is allowed"); - diags.Report(diagID); + if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) { + llvm::SmallString<32> buf; + llvm::raw_svector_ostream os(buf); + for (const llvm::opt::Arg *arg : + args.filtered(clang::options::OPT_Action_Group)) { + if (buf.size()) + os << ", "; + os << "'" << arg->getSpelling() << "'"; + } + diags.Report(clang::diag::err_drv_too_many_actions) << buf; return false; } // Identify the action (i.e. 
opts.ProgramAction) if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_Action_Group)) { + args.getLastArg(clang::options::OPT_Action_Group)) { switch (a->getOption().getID()) { default: { llvm_unreachable("Invalid option in group!"); } - case clang::driver::options::OPT_test_io: + case clang::options::OPT_test_io: opts.programAction = InputOutputTest; break; - case clang::driver::options::OPT_E: + case clang::options::OPT_E: opts.programAction = PrintPreprocessedInput; break; - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; - case clang::driver::options::OPT_emit_fir: + case clang::options::OPT_emit_fir: opts.programAction = EmitFIR; break; - case clang::driver::options::OPT_emit_hlfir: + case clang::options::OPT_emit_hlfir: opts.programAction = EmitHLFIR; break; - case clang::driver::options::OPT_emit_llvm: + case clang::options::OPT_emit_llvm: opts.programAction = EmitLLVM; break; - case clang::driver::options::OPT_emit_llvm_bc: + case clang::options::OPT_emit_llvm_bc: opts.programAction = EmitLLVMBitcode; break; - case clang::driver::options::OPT_emit_obj: + case clang::options::OPT_emit_obj: opts.programAction = EmitObj; break; - case clang::driver::options::OPT_S: + case clang::options::OPT_S: opts.programAction = EmitAssembly; break; - case clang::driver::options::OPT_fdebug_unparse: + case clang::options::OPT_fdebug_unparse: opts.programAction = DebugUnparse; break; - case clang::driver::options::OPT_fdebug_unparse_no_sema: + case clang::options::OPT_fdebug_unparse_no_sema: opts.programAction = DebugUnparseNoSema; break; - case clang::driver::options::OPT_fdebug_unparse_with_symbols: + case clang::options::OPT_fdebug_unparse_with_symbols: opts.programAction = DebugUnparseWithSymbols; break; - case clang::driver::options::OPT_fdebug_unparse_with_modules: + case clang::options::OPT_fdebug_unparse_with_modules: opts.programAction = DebugUnparseWithModules; break; - case clang::driver::options::OPT_fdebug_dump_symbols: + case clang::options::OPT_fdebug_dump_symbols: opts.programAction = DebugDumpSymbols; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree: + case clang::options::OPT_fdebug_dump_parse_tree: opts.programAction = DebugDumpParseTree; break; - case clang::driver::options::OPT_fdebug_dump_pft: + case clang::options::OPT_fdebug_dump_pft: opts.programAction = DebugDumpPFT; break; - case clang::driver::options::OPT_fdebug_dump_all: + case clang::options::OPT_fdebug_dump_all: opts.programAction = DebugDumpAll; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema: + case clang::options::OPT_fdebug_dump_parse_tree_no_sema: opts.programAction = DebugDumpParseTreeNoSema; break; - case clang::driver::options::OPT_fdebug_dump_provenance: + case clang::options::OPT_fdebug_dump_provenance: opts.programAction = DebugDumpProvenance; break; - case clang::driver::options::OPT_fdebug_dump_parsing_log: + case clang::options::OPT_fdebug_dump_parsing_log: opts.programAction = DebugDumpParsingLog; break; - case clang::driver::options::OPT_fdebug_measure_parse_tree: + case clang::options::OPT_fdebug_measure_parse_tree: opts.programAction = DebugMeasureParseTree; break; - case clang::driver::options::OPT_fdebug_pre_fir_tree: + case clang::options::OPT_fdebug_pre_fir_tree: opts.programAction = DebugPreFIRTree; break; - case clang::driver::options::OPT_fget_symbols_sources: + case clang::options::OPT_fget_symbols_sources: opts.programAction = 
GetSymbolsSources; break; - case clang::driver::options::OPT_fget_definition: + case clang::options::OPT_fget_definition: opts.programAction = GetDefinition; break; - case clang::driver::options::OPT_init_only: + case clang::options::OPT_init_only: opts.programAction = InitOnly; break; // TODO: - // case clang::driver::options::OPT_emit_llvm: - // case clang::driver::options::OPT_emit_llvm_only: - // case clang::driver::options::OPT_emit_codegen_only: - // case clang::driver::options::OPT_emit_module: + // case clang::options::OPT_emit_llvm: + // case clang::options::OPT_emit_llvm_only: + // case clang::options::OPT_emit_codegen_only: + // case clang::options::OPT_emit_module: // (...) } // Parse the values provided with `-fget-definition` (there should be 3 // integers) if (llvm::opt::OptSpecifier(a->getOption().getID()) == - clang::driver::options::OPT_fget_definition) { + clang::options::OPT_fget_definition) { unsigned optVals[3] = {0, 0, 0}; for (unsigned i = 0; i < 3; i++) { @@ -715,27 +713,25 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Parsing -load <dsopath> option and storing shared object path - if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) { + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) { opts.plugins.push_back(a->getValue()); } // Parsing -plugin <name> option and storing plugin name and setting action - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_plugin)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) { opts.programAction = PluginAction; opts.actionName = a->getValue(); } - opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o); - opts.showHelp = args.hasArg(clang::driver::options::OPT_help); - opts.showVersion = args.hasArg(clang::driver::options::OPT_version); + opts.outputFile = args.getLastArgValue(clang::options::OPT_o); + opts.showHelp = args.hasArg(clang::options::OPT_help); + opts.showVersion = args.hasArg(clang::options::OPT_version); opts.printSupportedCPUs = - args.hasArg(clang::driver::options::OPT_print_supported_cpus); + args.hasArg(clang::options::OPT_print_supported_cpus); // Get the input kind (from the value passed via `-x`) InputKind dashX(Language::Unknown); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_x)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) { llvm::StringRef xValue = a->getValue(); // Principal languages. dashX = llvm::StringSwitch<InputKind>(xValue) @@ -762,7 +758,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Collect the input files and save them in our instance of FrontendOptions. std::vector<std::string> inputs = - args.getAllArgValues(clang::driver::options::OPT_INPUT); + args.getAllArgValues(clang::options::OPT_INPUT); opts.inputs.clear(); if (inputs.empty()) // '-' is the default input if none is given. @@ -782,18 +778,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set fortranForm based on options -ffree-form and -ffixed-form. - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_form, - clang::driver::options::OPT_ffree_form)) { - opts.fortranForm = - arg->getOption().matches(clang::driver::options::OPT_ffixed_form) - ? 
FortranForm::FixedForm - : FortranForm::FreeForm; + if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form, + clang::options::OPT_ffree_form)) { + opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form) + ? FortranForm::FixedForm + : FortranForm::FreeForm; } // Set fixedFormColumns based on -ffixed-line-length=<value> if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) { + args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); std::int64_t columns = -1; if (argValue == "none") { @@ -815,8 +809,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set conversion based on -fconvert=<value> - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) { + if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) { const char *argValue = arg->getValue(); if (auto convert = parseConvertArg(argValue)) opts.envDefaults.push_back({"FORT_CONVERT", *convert}); @@ -826,59 +819,55 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // -f{no-}implicit-none - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, - args.hasFlag(clang::driver::options::OPT_fimplicit_none, - clang::driver::options::OPT_fno_implicit_none, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, + args.hasFlag(clang::options::OPT_fimplicit_none, + clang::options::OPT_fno_implicit_none, + false)); // -f{no-}implicit-none-ext - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneExternal, - args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext, - clang::driver::options::OPT_fno_implicit_none_ext, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal, + args.hasFlag(clang::options::OPT_fimplicit_none_ext, + clang::options::OPT_fno_implicit_none_ext, + false)); // -f{no-}backslash opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes, - args.hasFlag(clang::driver::options::OPT_fbackslash, - clang::driver::options::OPT_fno_backslash, - false)); + args.hasFlag(clang::options::OPT_fbackslash, + clang::options::OPT_fno_backslash, false)); // -f{no-}logical-abbreviations opts.features.Enable( Fortran::common::LanguageFeature::LogicalAbbreviations, - args.hasFlag(clang::driver::options::OPT_flogical_abbreviations, - clang::driver::options::OPT_fno_logical_abbreviations, - false)); + args.hasFlag(clang::options::OPT_flogical_abbreviations, + clang::options::OPT_fno_logical_abbreviations, false)); // -f{no-}unsigned opts.features.Enable(Fortran::common::LanguageFeature::Unsigned, - args.hasFlag(clang::driver::options::OPT_funsigned, - clang::driver::options::OPT_fno_unsigned, - false)); + args.hasFlag(clang::options::OPT_funsigned, + clang::options::OPT_fno_unsigned, false)); // -f{no-}xor-operator - opts.features.Enable( - Fortran::common::LanguageFeature::XOROperator, - args.hasFlag(clang::driver::options::OPT_fxor_operator, - clang::driver::options::OPT_fno_xor_operator, false)); + opts.features.Enable(Fortran::common::LanguageFeature::XOROperator, + args.hasFlag(clang::options::OPT_fxor_operator, + clang::options::OPT_fno_xor_operator, + false)); // -fno-automatic - if (args.hasArg(clang::driver::options::OPT_fno_automatic)) { + if (args.hasArg(clang::options::OPT_fno_automatic)) { opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave); } // 
-f{no}-save-main-program - opts.features.Enable( - Fortran::common::LanguageFeature::SaveMainProgram, - args.hasFlag(clang::driver::options::OPT_fsave_main_program, - clang::driver::options::OPT_fno_save_main_program, false)); + opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram, + args.hasFlag(clang::options::OPT_fsave_main_program, + clang::options::OPT_fno_save_main_program, + false)); - if (args.hasArg( - clang::driver::options::OPT_falternative_parameter_statement)) { + if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) { opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter); } if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) { + args.getLastArg(clang::options::OPT_finput_charset_EQ)) { llvm::StringRef argValue = arg->getValue(); if (argValue == "utf-8") { opts.encoding = Fortran::parser::Encoding::UTF_8; @@ -923,9 +912,9 @@ static std::string getOpenMPHeadersDir(const char *argv) { static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, llvm::opt::ArgList &args) { // Add macros from the command line. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D, - clang::driver::options::OPT_U)) { - if (currentArg->getOption().matches(clang::driver::options::OPT_D)) { + for (const auto *currentArg : + args.filtered(clang::options::OPT_D, clang::options::OPT_U)) { + if (currentArg->getOption().matches(clang::options::OPT_D)) { opts.addMacroDef(currentArg->getValue()); } else { opts.addMacroUndef(currentArg->getValue()); @@ -933,34 +922,33 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, } // Add the ordered list of -I's. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I)) + for (const auto *currentArg : args.filtered(clang::options::OPT_I)) opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue()); // Prepend the ordered list of -intrinsic-modules-path // to the default location to search. for (const auto *currentArg : - args.filtered(clang::driver::options::OPT_fintrinsic_modules_path)) + args.filtered(clang::options::OPT_fintrinsic_modules_path)) opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue()); // -cpp/-nocpp - if (const auto *currentArg = args.getLastArg( - clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp)) - opts.macrosFlag = - (currentArg->getOption().matches(clang::driver::options::OPT_cpp)) - ? PPMacrosFlag::Include - : PPMacrosFlag::Exclude; + if (const auto *currentArg = + args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp)) + opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp)) + ? 
PPMacrosFlag::Include + : PPMacrosFlag::Exclude; // Enable -cpp based on -x unless explicitly disabled with -nocpp if (opts.macrosFlag != PPMacrosFlag::Exclude) - if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x)) + if (const auto *dashX = args.getLastArg(clang::options::OPT_x)) opts.macrosFlag = llvm::StringSwitch<PPMacrosFlag>(dashX->getValue()) .Case("f95-cpp-input", PPMacrosFlag::Include) .Default(opts.macrosFlag); - opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); + opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat); opts.preprocessIncludeLines = - args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); - opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); - opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); + args.hasArg(clang::options::OPT_fpreprocess_include_lines); + opts.noLineDirectives = args.hasArg(clang::options::OPT_P); + opts.showMacros = args.hasArg(clang::options::OPT_dM); } /// Parses all semantic related arguments and populates the variables @@ -971,7 +959,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -J/module-dir option std::vector<std::string> moduleDirList = - args.getAllArgValues(clang::driver::options::OPT_module_dir); + args.getAllArgValues(clang::options::OPT_module_dir); // User can only specify one -J/-module-dir directory, but may repeat // -J/-module-dir as long as the directory is the same each time. // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html @@ -990,25 +978,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.setModuleDir(moduleDirList[0]); // -fdebug-module-writer option - if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) { + if (args.hasArg(clang::options::OPT_fdebug_module_writer)) { res.setDebugModuleDir(true); } // -fhermetic-module-files option - if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) { + if (args.hasArg(clang::options::OPT_fhermetic_module_files)) { res.setHermeticModuleFileOutput(true); } // -module-suffix if (const auto *moduleSuffix = - args.getLastArg(clang::driver::options::OPT_module_suffix)) { + args.getLastArg(clang::options::OPT_module_suffix)) { res.setModuleFileSuffix(moduleSuffix->getValue()); } // -f{no-}analyzed-objects-for-unparse - res.setUseAnalyzedObjectsForUnparse(args.hasFlag( - clang::driver::options::OPT_fanalyzed_objects_for_unparse, - clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true)); + res.setUseAnalyzedObjectsForUnparse( + args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse, + clang::options::OPT_fno_analyzed_objects_for_unparse, true)); return diags.getNumErrors() == numErrorsBefore; } @@ -1025,7 +1013,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // chosen to match clang's behavior. // -pedantic - if (args.hasArg(clang::driver::options::OPT_pedantic)) { + if (args.hasArg(clang::options::OPT_pedantic)) { features.WarnOnAllNonstandard(); features.WarnOnAllUsage(); res.setEnableConformanceChecks(); @@ -1035,9 +1023,8 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -Werror option // TODO: Currently throws a Diagnostic for anything other than -W<error>, // this has to change when other -W<opt>'s are supported. 
- if (args.hasArg(clang::driver::options::OPT_W_Joined)) { - const auto &wArgs = - args.getAllArgValues(clang::driver::options::OPT_W_Joined); + if (args.hasArg(clang::options::OPT_W_Joined)) { + const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined); for (const auto &wArg : wArgs) { if (wArg == "error") { res.setWarnAsErr(true); @@ -1054,7 +1041,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -w - if (args.hasArg(clang::driver::options::OPT_w)) { + if (args.hasArg(clang::options::OPT_w)) { features.DisableAllWarnings(); res.setDisableWarnings(); } @@ -1074,7 +1061,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, unsigned numErrorsBefore = diags.getNumErrors(); // -fd-lines-as-code - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_code)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1087,7 +1074,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fd-lines-as-comments - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1100,18 +1087,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fdefault* family - if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_real_8)) { res.getDefaultKinds().set_defaultRealKind(8); res.getDefaultKinds().set_doublePrecisionKind(16); } - if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) { + if (args.hasArg(clang::options::OPT_fdefault_integer_8)) { res.getDefaultKinds().set_defaultIntegerKind(8); res.getDefaultKinds().set_subscriptIntegerKind(8); res.getDefaultKinds().set_sizeIntegerKind(8); res.getDefaultKinds().set_defaultLogicalKind(8); } - if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) { - if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_double_8)) { + if (!args.hasArg(clang::options::OPT_fdefault_real_8)) { // -fdefault-double-8 has to be used with -fdefault-real-8 // to be compatible with gfortran const unsigned diagID = diags.getCustomDiagID( @@ -1122,18 +1109,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html res.getDefaultKinds().set_doublePrecisionKind(8); } - if (args.hasArg(clang::driver::options::OPT_flarge_sizes)) + if (args.hasArg(clang::options::OPT_flarge_sizes)) res.getDefaultKinds().set_sizeIntegerKind(8); // -x cuda - auto language = args.getLastArgValue(clang::driver::options::OPT_x); + auto language = args.getLastArgValue(clang::options::OPT_x); if (language == "cuda") { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::CUDA); } // -fopenacc - if (args.hasArg(clang::driver::options::OPT_fopenacc)) { + if (args.hasArg(clang::options::OPT_fopenacc)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenACC); } @@ -1141,8 +1128,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -std=f2018 // TODO: Set proper options 
when more Fortran standards // are supported. - if (args.hasArg(clang::driver::options::OPT_std_EQ)) { - auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ); + if (args.hasArg(clang::options::OPT_std_EQ)) { + auto standard = args.getLastArgValue(clang::options::OPT_std_EQ); // We only allow f2018 as the given standard if (standard == "f2018") { res.setEnableConformanceChecks(); @@ -1155,7 +1142,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } // -fcoarray - if (args.hasArg(clang::driver::options::OPT_fcoarray)) { + if (args.hasArg(clang::options::OPT_fcoarray)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::Coarray); const unsigned diagID = @@ -1173,13 +1160,12 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, - clang::driver::options::OPT_fno_openmp); - if (!arg || - arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { - bool isSimdSpecified = args.hasFlag( - clang::driver::options::OPT_fopenmp_simd, - clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp, + clang::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) { + bool isSimdSpecified = + args.hasFlag(clang::options::OPT_fopenmp_simd, + clang::options::OPT_fno_openmp_simd, /*Default=*/false); if (!isSimdSpecified) return true; res.getLangOpts().OpenMPSimd = 1; @@ -1194,8 +1180,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) { llvm::ArrayRef<unsigned> ompVersions = llvm::omp::getOpenMPVersions(); unsigned oldVersions[] = {11, 20, 25, 30}; unsigned version = 0; @@ -1248,16 +1233,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } - if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { + if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } - if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) { + if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) { res.getLangOpts().OpenMPIsTargetDevice = 1; // Get OpenMP host file path if any and report if a nonexistent file is // found - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_fopenmp_host_ir_file_path)) { + if (auto *arg = + args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) { res.getLangOpts().OMPHostIRFile = arg->getValue(); if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile)) diags.Report(clang::diag::err_omp_host_ir_file_not_found) @@ -1265,37 +1250,34 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_teams_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_teams_oversubscription, + clang::options::OPT_fopenmp_assume_teams_oversubscription, + clang::options::OPT_fno_openmp_assume_teams_oversubscription, /*Default=*/false)) 
res.getLangOpts().OpenMPTeamSubscription = true; - if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state)) res.getLangOpts().OpenMPNoThreadState = 1; - if (args.hasArg( - clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism)) res.getLangOpts().OpenMPNoNestedParallelism = 1; if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_threads_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_threads_oversubscription, + clang::options::OPT_fopenmp_assume_threads_oversubscription, + clang::options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPThreadSubscription = true; - if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) || - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) { - res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue( - args, clang::driver::options::OPT_fopenmp_target_debug_EQ, - res.getLangOpts().OpenMPTargetDebug, diags); + if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) || + args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) { + res.getLangOpts().OpenMPTargetDebug = + getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ, + res.getLangOpts().OpenMPTargetDebug, diags); if (!res.getLangOpts().OpenMPTargetDebug && - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug)) + args.hasArg(clang::options::OPT_fopenmp_target_debug)) res.getLangOpts().OpenMPTargetDebug = 1; } - if (args.hasArg(clang::driver::options::OPT_no_offloadlib)) + if (args.hasArg(clang::options::OPT_no_offloadlib)) res.getLangOpts().NoGPULib = 1; } if (llvm::Triple(res.getTargetOpts().triple).isGPU()) { @@ -1311,8 +1293,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // Get the OpenMP target triples if any. 
- if (auto *arg = - args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) { enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit }; auto getArchPtrSize = [](const llvm::Triple &triple) { if (triple.isArch16Bit()) @@ -1355,7 +1336,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc, clang::DiagnosticsEngine &diags) { Fortran::common::LangOptions &opts = invoc.getLangOpts(); - if (args.getLastArg(clang::driver::options::OPT_fwrapv)) + if (args.getLastArg(clang::options::OPT_fwrapv)) opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined); return true; @@ -1374,7 +1355,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, Fortran::common::LangOptions &opts = invoc.getLangOpts(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_ffp_contract)) { + args.getLastArg(clang::options::OPT_ffp_contract)) { const llvm::StringRef val = a->getValue(); enum Fortran::common::LangOptions::FPModeKind fpContractMode; @@ -1391,31 +1372,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(fpContractMode); } - if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) { + if (args.getLastArg(clang::options::OPT_menable_no_infs)) { opts.NoHonorInfs = true; } - if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) { + if (args.getLastArg(clang::options::OPT_menable_no_nans)) { opts.NoHonorNaNs = true; } - if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) { + if (args.getLastArg(clang::options::OPT_fapprox_func)) { opts.ApproxFunc = true; } - if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) { + if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) { opts.NoSignedZeros = true; } - if (args.getLastArg(clang::driver::options::OPT_mreassociate)) { + if (args.getLastArg(clang::options::OPT_mreassociate)) { opts.AssociativeMath = true; } - if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) { + if (args.getLastArg(clang::options::OPT_freciprocal_math)) { opts.ReciprocalMath = true; } - if (args.getLastArg(clang::driver::options::OPT_ffast_math)) { + if (args.getLastArg(clang::options::OPT_ffast_math)) { opts.NoHonorInfs = true; opts.NoHonorNaNs = true; opts.AssociativeMath = true; @@ -1425,7 +1406,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast); } - if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod)) + if (args.hasArg(clang::options::OPT_fno_fast_real_mod)) opts.NoFastRealMod = true; return true; @@ -1440,10 +1421,8 @@ /// \param [out] diags DiagnosticsEngine to report errors with static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - const auto *vscaleMin = - args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ); - const auto *vscaleMax = - args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ); + const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ); + const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ); if (!vscaleMin && !vscaleMax) return true; @@ -1491,8 +1470,7 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // TODO: support --dependent-lib on other platforms when MLIR supports // !llvm.dependent.lib - if (args.hasArg(clang::driver::options::OPT_dependent_lib) && - 
!triple.isOSWindows()) { + if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) { const unsigned diagID = diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "--dependent-lib is only supported on Windows"); @@ -1500,12 +1478,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, return false; } - opts.DependentLibs = - args.getAllArgValues(clang::driver::options::OPT_dependent_lib); + opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib); // -flto=full/thin option. - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_flto_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) { llvm::StringRef s = a->getValue(); assert((s == "full" || s == "thin") && "Unknown LTO mode."); if (s == "full") @@ -1516,10 +1492,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // -ffat-lto-objects if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_ffat_lto_objects, - clang::driver::options::OPT_fno_fat_lto_objects)) { + args.getLastArg(clang::options::OPT_ffat_lto_objects, + clang::options::OPT_fno_fat_lto_objects)) { opts.PrepareForFatLTO = - arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects); + arg->getOption().matches(clang::options::OPT_ffat_lto_objects); if (opts.PrepareForFatLTO) { assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) && "Unknown LTO mode"); @@ -1560,8 +1536,8 @@ bool CompilerInvocation::createFromArgs( llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()); // Parse the arguments - const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable(); - llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option); + const llvm::opt::OptTable &opts = clang::getDriverOptTable(); + llvm::opt::Visibility visibilityMask(clang::options::FC1Option); unsigned missingArgIndex, missingArgCount; llvm::opt::InputArgList args = opts.ParseArgs( commandLineArgs, missingArgIndex, missingArgCount, visibilityMask); @@ -1574,7 +1550,7 @@ bool CompilerInvocation::createFromArgs( } // Issue errors on unknown arguments - for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) { + for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) { auto argString = a->getAsString(args); std::string nearest; if (opts.findNearest(argString, nearest, visibilityMask) > 1) @@ -1586,15 +1562,15 @@ bool CompilerInvocation::createFromArgs( } // -flang-experimental-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::driver::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || + args.hasArg(clang::options::OPT_emit_hlfir)) { invoc.loweringOpts.setLowerToHighLevelFIR(true); } // -flang-deprecated-no-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::driver::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && + !args.hasArg(clang::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { const unsigned diagID = diags.getCustomDiagID( clang::DiagnosticsEngine::Error, "Options '-flang-experimental-hlfir' and " @@ -1605,13 +1581,13 @@ bool CompilerInvocation::createFromArgs( } // -fno-ppc-native-vector-element-order - if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) { + if 
(args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, + if (args.hasFlag(clang::options::OPT_finit_global_zero, + clang::options::OPT_fno_init_global_zero, /*default=*/true)) invoc.loweringOpts.setInitGlobalZero(true); else @@ -1620,8 +1596,8 @@ bool CompilerInvocation::createFromArgs( // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. - for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_R_value_Group)) + for (auto *a : args.filtered(clang::options::OPT_R_Group)) { + if (a->getOption().matches(clang::options::OPT_R_value_Group)) // This is -Rfoo=, where foo is the name of the diagnostic // group. Add only the remark option name to the diagnostics. e.g. for // -Rpass= we will add the string "pass". @@ -1634,20 +1610,19 @@ bool CompilerInvocation::createFromArgs( } // -frealloc-lhs is the default. - if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs, - clang::driver::options::OPT_fno_realloc_lhs, true)) + if (!args.hasFlag(clang::options::OPT_frealloc_lhs, + clang::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); - invoc.loweringOpts.setRepackArrays( - args.hasFlag(clang::driver::options::OPT_frepack_arrays, - clang::driver::options::OPT_fno_repack_arrays, - /*default=*/false)); + invoc.loweringOpts.setRepackArrays(args.hasFlag( + clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays, + /*default=*/false)); invoc.loweringOpts.setStackRepackArrays( - args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, - clang::driver::options::OPT_fno_stack_repack_arrays, + args.hasFlag(clang::options::OPT_fstack_repack_arrays, + clang::options::OPT_fno_stack_repack_arrays, /*default=*/false)); - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + if (auto *arg = + args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ)) invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == llvm::StringRef{"whole"}); @@ -1667,10 +1642,8 @@ bool CompilerInvocation::createFromArgs( // `mlirArgs`. Instead, you can use // * `-mllvm <your-llvm-option>`, or // * `-mmlir <your-mlir-option>`. - invoc.frontendOpts.llvmArgs = - args.getAllArgValues(clang::driver::options::OPT_mllvm); - invoc.frontendOpts.mlirArgs = - args.getAllArgValues(clang::driver::options::OPT_mmlir); + invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm); + invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir); success &= parseLangOptionsArgs(invoc, args, diags); @@ -1694,7 +1667,7 @@ bool CompilerInvocation::createFromArgs( } // Process the timing-related options. 
- if (args.hasArg(clang::driver::options::OPT_ftime_report)) + if (args.hasArg(clang::options::OPT_ftime_report)) invoc.enableTimers = true; invoc.setArgv0(argv0); diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index faf56e9d955a1..b69436c36d438 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -18,5 +18,6 @@ add_flang_library(flangFrontendTool CLANG_LIBS clangBasic + clangOptions clangDriver ) diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 09ac129d3e689..7586be59ba01b 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -23,7 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/BuryPointer.h" @@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng, bool executeCompilerInvocation(CompilerInstance *flang) { // Honor -help. if (flang->getFrontendOpts().showHelp) { - clang::driver::getDriverOptTable().printHelp( + clang::getDriverOptTable().printHelp( llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(clang::driver::options::FC1Option)); + llvm::opt::Visibility(clang::options::FC1Option)); return true; } diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 6e729874eb5e6..20e85a940b182 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -15,7 +15,6 @@ #include "flang/Lower/Allocatable.h" #include "flang/Lower/CUDA.h" #include "flang/Lower/CallInterface.h" -#include "flang/Lower/Coarray.h" #include "flang/Lower/ConvertCall.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertExprToHLFIR.h" @@ -26,6 +25,7 @@ #include "flang/Lower/IO.h" #include "flang/Lower/IterationSpace.h" #include "flang/Lower/Mangler.h" +#include "flang/Lower/MultiImageFortran.h" #include "flang/Lower/OpenACC.h" #include "flang/Lower/OpenMP.h" #include "flang/Lower/PFTBuilder.h" @@ -1111,6 +1111,34 @@ class FirConverter : public Fortran::lower::AbstractConverter { return bridge.fctCtx(); } + /// Initializes values for STAT and ERRMSG + std::pair<mlir::Value, mlir::Value> + genStatAndErrmsg(mlir::Location loc, + const std::list<Fortran::parser::StatOrErrmsg> + &statOrErrList) override final { + Fortran::lower::StatementContext stmtCtx; + + mlir::Value errMsgExpr, statExpr; + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const Fortran::semantics::SomeExpr *expr = + Fortran::semantics::GetExpr(statVar); + statExpr = + fir::getBase(genExprAddr(*expr, stmtCtx, &loc)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const Fortran::semantics::SomeExpr *expr = + Fortran::semantics::GetExpr(errMsgVar); + errMsgExpr = + fir::getBase(genExprBox(loc, *expr, stmtCtx)); + }}, + statOrErr.u); + } + + return {statExpr, errMsgExpr}; + } + mlir::Value hostAssocTupleValue() override final { return hostAssocTuple; } /// Record a binding for the ssa-value of the tuple for this function. 
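Before the next hunk, note the bookkeeping contract it introduces: a per-function map from dummy-argument symbols to their 1-based source positions, consulted later (in the ConvertVariable.cpp hunks below) when creating hlfir.declare operations so debug info can carry correct argument numbers. A condensed sketch of just that contract follows; recordPosition and lookupPosition are illustrative stand-ins for the registerDummySymbol and getDummyArgPosition members added below, and 0 is reserved for "unknown or function result":

    // Bridge.cpp already has llvm::DenseMap and the semantics Symbol type
    // available; this is a standalone restatement of the lookup contract.
    llvm::DenseMap<const Fortran::semantics::Symbol *, unsigned> positions;

    // Record only known, 1-based positions; argNo == 0 means "unknown or
    // function result" and is deliberately not stored.
    void recordPosition(const Fortran::semantics::Symbol *sym, unsigned argNo) {
      if (argNo > 0)
        positions[sym] = argNo;
    }

    // Symbols never registered with a position (function results,
    // host-association tuples) fall back to 0.
    unsigned lookupPosition(const Fortran::semantics::Symbol *sym) {
      auto it = positions.find(sym);
      return it != positions.end() ? it->second : 0;
    }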
@@ -1129,6 +1157,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { return registeredDummySymbols.contains(sym); } + unsigned getDummyArgPosition( + const Fortran::semantics::Symbol &sym) const override final { + auto it = dummyArgPositions.find(&sym); + return (it != dummyArgPositions.end()) ? it->second : 0; + } + const Fortran::lower::pft::FunctionLikeUnit * getCurrentFunctionUnit() const override final { return currentFunctionUnit; @@ -1413,11 +1447,14 @@ class FirConverter : public Fortran::lower::AbstractConverter { /// definitive mapping. The specification expression have not been lowered /// yet. The final mapping will be done using this pre-mapping in /// Fortran::lower::mapSymbolAttributes. + /// \param argNo The 1-based source position of this argument (0 if + /// unknown/result) bool mapBlockArgToDummyOrResult(const Fortran::semantics::SymbolRef sym, - mlir::Value val, bool isResult) { + mlir::Value val, bool isResult, + unsigned argNo = 0) { localSymbols.addSymbol(sym, val); if (!isResult) - registerDummySymbol(sym); + registerDummySymbol(sym, argNo); return true; } @@ -3275,6 +3312,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::NoInline &) { attachInliningDirectiveToStmt(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::Prefetch &prefetch) { + TODO(getCurrentLocation(), "!$dir prefetch"); + }, [&](const auto &) {}}, dir.u); } @@ -3950,13 +3990,30 @@ class FirConverter : public Fortran::lower::AbstractConverter { } void genFIR(const Fortran::parser::ChangeTeamConstruct &construct) { - TODO(toLocation(), "coarray: ChangeTeamConstruct"); + Fortran::lower::StatementContext stmtCtx; + pushActiveConstruct(getEval(), stmtCtx); + + for (Fortran::lower::pft::Evaluation &e : + getEval().getNestedEvaluations()) { + if (e.getIf<Fortran::parser::ChangeTeamStmt>()) { + maybeStartBlock(e.block); + setCurrentPosition(e.position); + genFIR(e); + } else if (e.getIf<Fortran::parser::EndChangeTeamStmt>()) { + maybeStartBlock(e.block); + setCurrentPosition(e.position); + genFIR(e); + } else { + genFIR(e); + } + } + popActiveConstruct(); } void genFIR(const Fortran::parser::ChangeTeamStmt &stmt) { - TODO(toLocation(), "coarray: ChangeTeamStmt"); + genChangeTeamStmt(*this, getEval(), stmt); } void genFIR(const Fortran::parser::EndChangeTeamStmt &stmt) { - TODO(toLocation(), "coarray: EndChangeTeamStmt"); + genEndChangeTeamStmt(*this, getEval(), stmt); } void genFIR(const Fortran::parser::CriticalConstruct &criticalConstruct) { @@ -4876,6 +4933,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value shape = builder->genShape(loc, lbounds, extents); rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, shape, /*slice=*/mlir::Value{}); + } else if (fir::isClassStarType(lhsBoxType) && + !fir::ConvertOp::canBeConverted(rhsBoxType, lhsBoxType)) { + rhsBox = fir::ReboxOp::create(*builder, loc, lhsBoxType, rhsBox, + mlir::Value{}, mlir::Value{}); } return rhsBox; } @@ -5955,7 +6016,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { const Fortran::lower::CalleeInterface &callee) { assert(builder && "require a builder object at this point"); using PassBy = Fortran::lower::CalleeInterface::PassEntityBy; + + // Track the source-level argument position (1-based) + unsigned argPosition = 0; + auto mapPassedEntity = [&](const auto arg, bool isResult = false) { + // Count only actual source-level dummy arguments (not results or + // host assoc tuples) + if 
(!isResult && arg.entity.has_value()) + argPosition++; + if (arg.passBy == PassBy::AddressAndLength) { if (callee.characterize().IsBindC()) return; @@ -5966,11 +6036,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value casted = builder->createVolatileCast(loc, false, arg.firArgument); mlir::Value box = charHelp.createEmboxChar(casted, arg.firLength); - mapBlockArgToDummyOrResult(arg.entity->get(), box, isResult); + mapBlockArgToDummyOrResult(arg.entity->get(), box, isResult, + isResult ? 0 : argPosition); } else { if (arg.entity.has_value()) { mapBlockArgToDummyOrResult(arg.entity->get(), arg.firArgument, - isResult); + isResult, isResult ? 0 : argPosition); } else { assert(funit.parentHasTupleHostAssoc() && "expect tuple argument"); } @@ -6828,13 +6899,22 @@ class FirConverter : public Fortran::lower::AbstractConverter { } /// Record the given symbol as a dummy argument of this function. - void registerDummySymbol(Fortran::semantics::SymbolRef symRef) { + /// \param symRef The symbol representing the dummy argument + /// \param argNo The 1-based position of this argument in the source (0 = + /// unknown) + void registerDummySymbol(Fortran::semantics::SymbolRef symRef, + unsigned argNo = 0) { auto *sym = &*symRef; registeredDummySymbols.insert(sym); + if (argNo > 0) + dummyArgPositions[sym] = argNo; } /// Reset all registered dummy symbols. - void resetRegisteredDummySymbols() { registeredDummySymbols.clear(); } + void resetRegisteredDummySymbols() { + registeredDummySymbols.clear(); + dummyArgPositions.clear(); + } void setCurrentFunctionUnit(Fortran::lower::pft::FunctionLikeUnit *unit) { currentFunctionUnit = unit; @@ -6876,6 +6956,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallPtrSet<const Fortran::semantics::Symbol *, 16> registeredDummySymbols; + /// Map from dummy symbols to their 1-based argument positions. + /// Used to generate debug info with correct argument numbers. + llvm::DenseMap<const Fortran::semantics::Symbol *, unsigned> + dummyArgPositions; + /// A map of unique names for constant expressions. /// The names are used for representing the constant expressions /// with global constant initialized objects. diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index 3d0b4e4cd82eb..230a56ab66ec5 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -5,7 +5,6 @@ add_flang_library(FortranLower Allocatable.cpp Bridge.cpp CallInterface.cpp - Coarray.cpp ComponentPath.cpp ConvertArrayConstructor.cpp ConvertCall.cpp @@ -23,6 +22,7 @@ add_flang_library(FortranLower IterationSpace.cpp LoweringOptions.cpp Mangler.cpp + MultiImageFortran.cpp OpenACC.cpp OpenMP/Atomic.cpp OpenMP/ClauseProcessor.cpp diff --git a/flang/lib/Lower/Coarray.cpp b/flang/lib/Lower/Coarray.cpp deleted file mode 100644 index a84f65a5c49e8..0000000000000 --- a/flang/lib/Lower/Coarray.cpp +++ /dev/null @@ -1,66 +0,0 @@ -//===-- Coarray.cpp -------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// Implementation of the lowering of image related constructs and expressions. -/// Fortran images can form teams, communicate via coarrays, etc. 
-/// -//===----------------------------------------------------------------------===// - -#include "flang/Lower/Coarray.h" -#include "flang/Lower/AbstractConverter.h" -#include "flang/Lower/SymbolMap.h" -#include "flang/Optimizer/Builder/FIRBuilder.h" -#include "flang/Optimizer/Builder/Todo.h" -#include "flang/Parser/parse-tree.h" -#include "flang/Semantics/expression.h" - -//===----------------------------------------------------------------------===// -// TEAM statements and constructs -//===----------------------------------------------------------------------===// - -void Fortran::lower::genChangeTeamConstruct( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::ChangeTeamConstruct &) { - TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM construct"); -} - -void Fortran::lower::genChangeTeamStmt( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::ChangeTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM statement"); -} - -void Fortran::lower::genEndChangeTeamStmt( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, - const Fortran::parser::EndChangeTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: END CHANGE TEAM statement"); -} - -void Fortran::lower::genFormTeamStatement( - Fortran::lower::AbstractConverter &converter, - Fortran::lower::pft::Evaluation &, const Fortran::parser::FormTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: FORM TEAM statement"); -} - -//===----------------------------------------------------------------------===// -// COARRAY expressions -//===----------------------------------------------------------------------===// - -fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genAddr( - const Fortran::evaluate::CoarrayRef &expr) { - (void)symMap; - TODO(converter.getCurrentLocation(), "co-array address"); -} - -fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genValue( - const Fortran::evaluate::CoarrayRef &expr) { - TODO(converter.getCurrentLocation(), "co-array value"); -} diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 9bf994e70cf5d..f24a4d9745698 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -1296,10 +1296,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( Fortran::evaluate::FoldingContext &foldingContext{ callContext.converter.getFoldingContext()}; - bool suggestCopyIn = Fortran::evaluate::MayNeedCopy( - arg.entity, arg.characteristics, foldingContext, /*forCopyOut=*/false); - bool suggestCopyOut = Fortran::evaluate::MayNeedCopy( - arg.entity, arg.characteristics, foldingContext, /*forCopyOut=*/true); + bool suggestCopyIn = Fortran::evaluate::ActualArgNeedsCopy( + arg.entity, arg.characteristics, foldingContext, + /*forCopyOut=*/false) + .value_or(true); + bool suggestCopyOut = Fortran::evaluate::ActualArgNeedsCopy( + arg.entity, arg.characteristics, foldingContext, + /*forCopyOut=*/true) + .value_or(true); mustDoCopyIn = actual.isArray() && suggestCopyIn; mustDoCopyOut = actual.isArray() && suggestCopyOut; } diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index a46d219ba4b2c..b2910a0fc58e0 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -19,7 +19,6 @@ #include "flang/Lower/Bridge.h" #include "flang/Lower/BuiltinModules.h" #include "flang/Lower/CallInterface.h" -#include 
"flang/Lower/Coarray.h" #include "flang/Lower/ComponentPath.h" #include "flang/Lower/ConvertCall.h" #include "flang/Lower/ConvertConstant.h" @@ -28,6 +27,7 @@ #include "flang/Lower/ConvertVariable.h" #include "flang/Lower/CustomIntrinsicCall.h" #include "flang/Lower/Mangler.h" +#include "flang/Lower/MultiImageFortran.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/Support/Utils.h" #include "flang/Optimizer/Builder/Character.h" diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2517ab35d4ff0..53d4d7566acfa 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -1946,12 +1946,15 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, return; } mlir::Value dummyScope; - if (converter.isRegisteredDummySymbol(sym)) + unsigned argNo = 0; + if (converter.isRegisteredDummySymbol(sym)) { dummyScope = converter.dummyArgsScopeValue(); + argNo = converter.getDummyArgPosition(sym); + } auto [storage, storageOffset] = converter.getSymbolStorage(sym); auto newBase = hlfir::DeclareOp::create( builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage, - storageOffset, attributes, dataAttr); + storageOffset, attributes, dataAttr, argNo); symMap.addVariableDefinition(sym, newBase, force); return; } @@ -2004,15 +2007,17 @@ void Fortran::lower::genDeclareSymbol( sym.GetUltimate()); auto name = converter.mangleName(sym); mlir::Value dummyScope; + unsigned argNo = 0; fir::ExtendedValue base = exv; if (converter.isRegisteredDummySymbol(sym)) { base = genPackArray(converter, sym, exv); dummyScope = converter.dummyArgsScopeValue(); + argNo = converter.getDummyArgPosition(sym); } auto [storage, storageOffset] = converter.getSymbolStorage(sym); hlfir::EntityWithAttributes declare = hlfir::genDeclare(loc, builder, base, name, attributes, dummyScope, - storage, storageOffset, dataAttr); + storage, storageOffset, dataAttr, argNo); symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force); return; } diff --git a/flang/lib/Lower/MultiImageFortran.cpp b/flang/lib/Lower/MultiImageFortran.cpp new file mode 100644 index 0000000000000..745ca2494708c --- /dev/null +++ b/flang/lib/Lower/MultiImageFortran.cpp @@ -0,0 +1,278 @@ +//===-- MultiImageFortran.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of the lowering of image related constructs and expressions. +/// Fortran images can form teams, communicate via coarrays, etc. 
+///
+//===----------------------------------------------------------------------===//
+
+#include "flang/Lower/MultiImageFortran.h"
+#include "flang/Lower/AbstractConverter.h"
+#include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/MIF/MIFOps.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+
+//===----------------------------------------------------------------------===//
+// Synchronization statements
+//===----------------------------------------------------------------------===//
+
+void Fortran::lower::genSyncAllStatement(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::parser::SyncAllStmt &stmt) {
+  mlir::Location loc = converter.getCurrentLocation();
+  converter.checkCoarrayEnabled();
+
+  // Handle STAT and ERRMSG values
+  const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v;
+  auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList);
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mif::SyncAllOp::create(builder, loc, statAddr, errMsgAddr);
+}
+
+void Fortran::lower::genSyncImagesStatement(
+    Fortran::lower::AbstractConverter &converter,
+    const Fortran::parser::SyncImagesStmt &stmt) {
+  mlir::Location loc = converter.getCurrentLocation();
+  converter.checkCoarrayEnabled();
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  // Handle STAT and ERRMSG values
+  const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList =
+      std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t);
+  auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList);
+
+  // SYNC IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count
+  // == 0. Note further that SYNC IMAGES(*) is not semantically equivalent to
+  // SYNC ALL.
+  Fortran::lower::StatementContext stmtCtx;
+  mlir::Value imageSet;
+  const Fortran::parser::SyncImagesStmt::ImageSet &imgSet =
+      std::get<Fortran::parser::SyncImagesStmt::ImageSet>(stmt.t);
+  std::visit(Fortran::common::visitors{
+                 [&](const Fortran::parser::IntExpr &intExpr) {
+                   const SomeExpr *expr = Fortran::semantics::GetExpr(intExpr);
+                   imageSet =
+                       fir::getBase(converter.genExprBox(loc, *expr, stmtCtx));
+                 },
+                 [&](const Fortran::parser::Star &) {
+                   // No image set was specified.
+ imageSet = mlir::Value{}; + }}, + imgSet.u); + + mif::SyncImagesOp::create(builder, loc, imageSet, statAddr, errMsgAddr); +} + +void Fortran::lower::genSyncMemoryStatement( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::SyncMemoryStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle STAT and ERRMSG values + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; + auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mif::SyncMemoryOp::create(builder, loc, statAddr, errMsgAddr); +} + +void Fortran::lower::genSyncTeamStatement( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::SyncTeamStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + + // Handle TEAM + Fortran::lower::StatementContext stmtCtx; + const Fortran::parser::TeamValue &teamValue = + std::get<Fortran::parser::TeamValue>(stmt.t); + const SomeExpr *teamExpr = Fortran::semantics::GetExpr(teamValue); + mlir::Value team = + fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + // Handle STAT and ERRMSG values + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + auto [statAddr, errMsgAddr] = converter.genStatAndErrmsg(loc, statOrErrList); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mif::SyncTeamOp::create(builder, loc, team, statAddr, errMsgAddr); +} + +//===----------------------------------------------------------------------===// +// TEAM statements and constructs +//===----------------------------------------------------------------------===// + +void Fortran::lower::genChangeTeamConstruct( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::ChangeTeamConstruct &) { + TODO(converter.getCurrentLocation(), "coarray: CHANGE TEAM construct"); +} + +void Fortran::lower::genChangeTeamStmt( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::ChangeTeamStmt &stmt) { + mlir::Location loc = converter.getCurrentLocation(); + converter.checkCoarrayEnabled(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr, team; + // Handle STAT and ERRMSG values + Fortran::lower::StatementContext stmtCtx; + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + } + + // TODO: Manage the list of coarrays associated in + // `std::list<CoarrayAssociation>`. According to the PRIF specification, it is + // necessary to call `prif_alias_{create|destroy}` for each coarray defined in + // this list. Support will be added once lowering to this procedure is + // possible. 
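For orientation, the Fortran construct this function lowers has the following shape (a minimal, hedged example; the alias form in the trailing comment is exactly the part the TODO above defers):

    use, intrinsic :: iso_fortran_env, only : team_type
    type(team_type) :: t
    integer :: b[*]
    form team (1, t)
    change team (t)        ! or, with an alias list: change team (t, a[*] => b)
      ! statements here execute with t as the current team
    end team

The std::get<std::list<Fortran::parser::CoarrayAssociation>>(stmt.t) access just below pulls out that alias list, and the lowering raises a TODO when it is non-empty.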
+ const std::list<Fortran::parser::CoarrayAssociation> &coarrayAssocList = + std::get<std::list<Fortran::parser::CoarrayAssociation>>(stmt.t); + if (coarrayAssocList.size()) + TODO(loc, "Coarrays provided in the association list."); + + // Handle TEAM-VALUE + const auto *teamExpr = + Fortran::semantics::GetExpr(std::get<Fortran::parser::TeamValue>(stmt.t)); + team = fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + mif::ChangeTeamOp changeOp = mif::ChangeTeamOp::create( + builder, loc, team, statAddr, errMsgAddr, /*terminator*/ false); + builder.setInsertionPointToStart(changeOp.getBody()); +} + +void Fortran::lower::genEndChangeTeamStmt( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::EndChangeTeamStmt &stmt) { + converter.checkCoarrayEnabled(); + mlir::Location loc = converter.getCurrentLocation(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr; + // Handle STAT and ERRMSG values + Fortran::lower::StatementContext stmtCtx; + const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = + std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); + for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { + std::visit(Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + } + + mif::EndTeamOp endOp = + mif::EndTeamOp::create(builder, loc, statAddr, errMsgAddr); + builder.setInsertionPointAfter(endOp.getParentOp()); +} + +void Fortran::lower::genFormTeamStatement( + Fortran::lower::AbstractConverter &converter, + Fortran::lower::pft::Evaluation &, + const Fortran::parser::FormTeamStmt &stmt) { + converter.checkCoarrayEnabled(); + mlir::Location loc = converter.getCurrentLocation(); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Value errMsgAddr, statAddr, newIndex, teamNumber, team; + // Handle NEW_INDEX, STAT and ERRMSG + std::list<Fortran::parser::StatOrErrmsg> statOrErrList{}; + Fortran::lower::StatementContext stmtCtx; + const auto &formSpecList = + std::get<std::list<Fortran::parser::FormTeamStmt::FormTeamSpec>>(stmt.t); + for (const Fortran::parser::FormTeamStmt::FormTeamSpec &formSpec : + formSpecList) { + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::StatOrErrmsg &statOrErr) { + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::StatVariable &statVar) { + const auto *expr = Fortran::semantics::GetExpr(statVar); + statAddr = fir::getBase( + converter.genExprAddr(loc, *expr, stmtCtx)); + }, + [&](const Fortran::parser::MsgVariable &errMsgVar) { + const auto *expr = + Fortran::semantics::GetExpr(errMsgVar); + errMsgAddr = fir::getBase( + converter.genExprBox(loc, *expr, stmtCtx)); + }, + }, + statOrErr.u); + }, + [&](const Fortran::parser::ScalarIntExpr &intExpr) { + fir::ExtendedValue newIndexExpr = converter.genExprValue( + loc, Fortran::semantics::GetExpr(intExpr), stmtCtx); + newIndex = fir::getBase(newIndexExpr); + }, + }, + formSpec.u); + } + + // Handle TEAM-NUMBER + const auto *teamNumberExpr = Fortran::semantics::GetExpr( + std::get<Fortran::parser::ScalarIntExpr>(stmt.t)); + teamNumber = + 
fir::getBase(converter.genExprValue(loc, *teamNumberExpr, stmtCtx)); + + // Handle TEAM-VARIABLE + const auto *teamExpr = Fortran::semantics::GetExpr( + std::get<Fortran::parser::TeamVariable>(stmt.t)); + team = fir::getBase(converter.genExprBox(loc, *teamExpr, stmtCtx)); + + mif::FormTeamOp::create(builder, loc, teamNumber, team, newIndex, statAddr, + errMsgAddr); +} + +//===----------------------------------------------------------------------===// +// COARRAY expressions +//===----------------------------------------------------------------------===// + +fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genAddr( + const Fortran::evaluate::CoarrayRef &expr) { + (void)symMap; + TODO(converter.getCurrentLocation(), "co-array address"); +} + +fir::ExtendedValue Fortran::lower::CoarrayExprHelper::genValue( + const Fortran::evaluate::CoarrayRef &expr) { + TODO(converter.getCurrentLocation(), "co-array value"); +} diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index d7861ac6463c8..98a3aced3f528 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -28,6 +28,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" #include "flang/Parser/parse-tree-visitor.h" #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" @@ -1159,18 +1160,6 @@ bool isConstantBound(mlir::acc::DataBoundsOp &op) { return false; } -/// Return true iff all the bounds are expressed with constant values. -bool areAllBoundConstant(const llvm::SmallVector<mlir::Value> &bounds) { - for (auto bound : bounds) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - assert(dataBound && "Must be DataBoundOp operation"); - if (!isConstantBound(dataBound)) - return false; - } - return true; -} - static llvm::SmallVector<mlir::Value> genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc, mlir::acc::DataBoundsOp &dataBound) { @@ -1196,59 +1185,6 @@ genConstantBounds(fir::FirOpBuilder &builder, mlir::Location loc, return {lb, ub, step}; } -static mlir::Value genShapeFromBoundsOrArgs( - mlir::Location loc, fir::FirOpBuilder &builder, fir::SequenceType seqTy, - const llvm::SmallVector<mlir::Value> &bounds, mlir::ValueRange arguments) { - llvm::SmallVector<mlir::Value> args; - if (bounds.empty() && seqTy) { - if (seqTy.hasDynamicExtents()) { - assert(!arguments.empty() && "arguments must hold the entity"); - auto entity = hlfir::Entity{arguments[0]}; - return hlfir::genShape(loc, builder, entity); - } - return genShapeOp(builder, seqTy, loc).getResult(); - } else if (areAllBoundConstant(bounds)) { - for (auto bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - args.append(genConstantBounds(builder, loc, dataBound)); - } - } else { - assert(((arguments.size() - 2) / 3 == seqTy.getDimension()) && - "Expect 3 block arguments per dimension"); - for (auto arg : arguments.drop_front(2)) - args.push_back(arg); - } - - assert(args.size() % 3 == 0 && "Triplets must be a multiple of 3"); - llvm::SmallVector<mlir::Value> extents; - mlir::Type idxTy = builder.getIndexType(); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); - for (unsigned i = 0; i < args.size(); i += 3) { - mlir::Value s1 = - mlir::arith::SubIOp::create(builder, loc, args[i + 1], args[0]); 
-    mlir::Value s2 = mlir::arith::AddIOp::create(builder, loc, s1, one);
-    mlir::Value s3 =
-        mlir::arith::DivSIOp::create(builder, loc, s2, args[i + 2]);
-    mlir::Value cmp = mlir::arith::CmpIOp::create(
-        builder, loc, mlir::arith::CmpIPredicate::sgt, s3, zero);
-    mlir::Value ext =
-        mlir::arith::SelectOp::create(builder, loc, cmp, s3, zero);
-    extents.push_back(ext);
-  }
-  return fir::ShapeOp::create(builder, loc, extents);
-}
-
-static hlfir::DesignateOp::Subscripts
-getSubscriptsFromArgs(mlir::ValueRange args) {
-  hlfir::DesignateOp::Subscripts triplets;
-  for (unsigned i = 2; i < args.size(); i += 3)
-    triplets.emplace_back(
-        hlfir::DesignateOp::Triplet{args[i], args[i + 1], args[i + 2]});
-  return triplets;
-}
-
 static hlfir::Entity genDesignateWithTriplets(
     fir::FirOpBuilder &builder, mlir::Location loc, hlfir::Entity &entity,
     hlfir::DesignateOp::Subscripts &triplets, mlir::Value shape) {
@@ -1262,19 +1198,88 @@ static hlfir::Entity genDesignateWithTriplets(
   return hlfir::Entity{designate.getResult()};
 }
 
-mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
-    fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc,
-    mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) {
-  mlir::ModuleOp mod =
-      builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>();
-  if (auto recipe =
-          mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName))
-    return recipe;
+// Designate uses triplets based on object lower bounds while acc.bounds are
+// zero-based. This helper shifts the bounds to create the designate triplets.
+static hlfir::DesignateOp::Subscripts
+genTripletsFromAccBounds(fir::FirOpBuilder &builder, mlir::Location loc,
+                         const llvm::SmallVector<mlir::Value> &accBounds,
+                         hlfir::Entity entity) {
+  assert(entity.getRank() * 3 == static_cast<int>(accBounds.size()) &&
+         "must get lb,ub,step for each dimension");
+  hlfir::DesignateOp::Subscripts triplets;
+  for (unsigned i = 0; i < accBounds.size(); i += 3) {
+    mlir::Value lb = hlfir::genLBound(loc, builder, entity, i / 3);
+    lb = builder.createConvert(loc, accBounds[i].getType(), lb);
+    assert(accBounds[i].getType() == accBounds[i + 1].getType() &&
+           "mix of integer types in triplets");
+    mlir::Value sliceLB =
+        builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i], lb);
+    mlir::Value sliceUB =
+        builder.createOrFold<mlir::arith::AddIOp>(loc, accBounds[i + 1], lb);
+    triplets.emplace_back(
+        hlfir::DesignateOp::Triplet{sliceLB, sliceUB, accBounds[i + 2]});
+  }
+  return triplets;
+}
 
-  auto ip = builder.saveInsertionPoint();
-  auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>(
-      builder, mod, recipeName, loc, ty);
-  bool allConstantBound = areAllBoundConstant(bounds);
+static std::pair<hlfir::Entity, hlfir::Entity>
+genArraySectionsInRecipe(fir::FirOpBuilder &builder, mlir::Location loc,
+                         llvm::SmallVector<mlir::Value> &dataOperationBounds,
+                         mlir::ValueRange recipeArguments,
+                         bool allConstantBound, hlfir::Entity lhs,
+                         hlfir::Entity rhs) {
+  lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+  rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+  // Get the list of lb,ub,step values for the sections that can be used inside
+  // the recipe region.
+  llvm::SmallVector<mlir::Value> bounds;
+  if (allConstantBound) {
+    // For constant bounds, the bounds are not region arguments. Materialize
+    // constants by looking at the IR for the bounds on the data operation.
+ for (auto bound : dataOperationBounds) { + auto dataBound = + mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + bounds.append(genConstantBounds(builder, loc, dataBound)); + } + } else { + // If one bound is not constant, all of the bounds are region arguments. + for (auto arg : recipeArguments.drop_front(2)) + bounds.push_back(arg); + } + // Compute the fir.shape of the array section and the triplets to create + // hlfir.designate. + assert(lhs.getRank() * 3 == static_cast<int>(bounds.size()) && + "must get lb,ub,step for each dimension"); + llvm::SmallVector<mlir::Value> extents; + mlir::Type idxTy = builder.getIndexType(); + for (unsigned i = 0; i < bounds.size(); i += 3) + extents.push_back(builder.genExtentFromTriplet( + loc, bounds[i], bounds[i + 1], bounds[i + 2], idxTy)); + mlir::Value shape = fir::ShapeOp::create(builder, loc, extents); + hlfir::DesignateOp::Subscripts rhsTriplets = + genTripletsFromAccBounds(builder, loc, bounds, rhs); + hlfir::DesignateOp::Subscripts lhsTriplets; + // Share the bounds when both rhs/lhs are known to be 1-based to avoid noise + // in the IR for the most common cases. + if (!lhs.mayHaveNonDefaultLowerBounds() && + !rhs.mayHaveNonDefaultLowerBounds()) + lhsTriplets = rhsTriplets; + else + lhsTriplets = genTripletsFromAccBounds(builder, loc, bounds, lhs); + hlfir::Entity leftSection = + genDesignateWithTriplets(builder, loc, lhs, lhsTriplets, shape); + hlfir::Entity rightSection = + genDesignateWithTriplets(builder, loc, rhs, rhsTriplets, shape); + return {leftSection, rightSection}; +} + +// Generate the combiner or copy region block and block arguments and return the +// source and destination entities. +static std::pair<hlfir::Entity, hlfir::Entity> +genRecipeCombinerOrCopyRegion(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Type ty, mlir::Region ®ion, + llvm::SmallVector<mlir::Value> &bounds, + bool allConstantBound) { llvm::SmallVector<mlir::Type> argsTy{ty, ty}; llvm::SmallVector<mlir::Location> argsLoc{loc, loc}; if (!allConstantBound) { @@ -1289,100 +1294,57 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( argsLoc.push_back(dataBound.getStartIdx().getLoc()); } } - builder.createBlock(&recipe.getCopyRegion(), recipe.getCopyRegion().end(), - argsTy, argsLoc); + mlir::Block *block = + builder.createBlock(®ion, region.end(), argsTy, argsLoc); + builder.setInsertionPointToEnd(®ion.back()); + return {hlfir::Entity{block->getArgument(0)}, + hlfir::Entity{block->getArgument(1)}}; +} - builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); - ty = fir::unwrapRefType(ty); - if (fir::isa_trivial(ty)) { - mlir::Value initValue = fir::LoadOp::create( - builder, loc, recipe.getCopyRegion().front().getArgument(0)); - fir::StoreOp::create(builder, loc, initValue, - recipe.getCopyRegion().front().getArgument(1)); - } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(ty)) { - fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; - auto shape = genShapeFromBoundsOrArgs( - loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments()); - - auto leftDeclOp = hlfir::DeclareOp::create( - builder, loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{}, - shape); - auto rightDeclOp = hlfir::DeclareOp::create( - builder, loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{}, - shape); - - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCopyRegion().getArguments()); - auto leftEntity = hlfir::Entity{leftDeclOp.getBase()}; - auto left 
= - genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{rightDeclOp.getBase()}; - auto right = - genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - - hlfir::AssignOp::create(firBuilder, loc, left, right); - - } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) { - fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; - llvm::SmallVector<mlir::Value> tripletArgs; - mlir::Type innerTy = fir::extractSequenceType(boxTy); - fir::SequenceType seqTy = - mlir::dyn_cast_or_null<fir::SequenceType>(innerTy); - if (!seqTy) - TODO(loc, "Unsupported boxed type in OpenACC firstprivate"); - - auto shape = genShapeFromBoundsOrArgs( - loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments()); - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCopyRegion().getArguments()); - auto leftEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(0)}; - auto left = - genDesignateWithTriplets(firBuilder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(1)}; - auto right = - genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - hlfir::AssignOp::create(firBuilder, loc, left, right); - } else { - // Copy scalar derived type. - // The temporary_lhs flag allows indicating that user defined assignments - // should not be called while copying components, and that the LHS and RHS - // are known to not alias since the LHS is a created object. - hlfir::AssignOp::create( - builder, loc, recipe.getCopyRegion().getArgument(0), - recipe.getCopyRegion().getArgument(1), /*realloc=*/false, - /*keep_lhs_length_if_realloc=*/false, /*temporary_lhs=*/true); - } +mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( + fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc, + mlir::Type ty, llvm::SmallVector<mlir::Value> &bounds) { + mlir::ModuleOp mod = + builder.getBlock()->getParent()->getParentOfType<mlir::ModuleOp>(); + if (auto recipe = + mod.lookupSymbol<mlir::acc::FirstprivateRecipeOp>(recipeName)) + return recipe; - mlir::acc::TerminatorOp::create(builder, loc); - builder.restoreInsertionPoint(ip); - return recipe; -} + mlir::OpBuilder::InsertionGuard guard(builder); + auto recipe = genRecipeOp<mlir::acc::FirstprivateRecipeOp>( + builder, mod, recipeName, loc, ty); + bool allConstantBound = fir::acc::areAllBoundsConstant(bounds); + auto [source, destination] = genRecipeCombinerOrCopyRegion( + builder, loc, ty, recipe.getCopyRegion(), bounds, allConstantBound); + + fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; + + source = hlfir::derefPointersAndAllocatables(loc, builder, source); + destination = hlfir::derefPointersAndAllocatables(loc, builder, destination); -/// Get a string representation of the bounds. 
-std::string getBoundsString(llvm::SmallVector<mlir::Value> &bounds) {
-  std::stringstream boundStr;
   if (!bounds.empty())
-    boundStr << "_section_";
-  llvm::interleave(
-      bounds,
-      [&](mlir::Value bound) {
-        auto boundsOp =
-            mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp());
-        if (boundsOp.getLowerbound() &&
-            fir::getIntIfConstant(boundsOp.getLowerbound()) &&
-            boundsOp.getUpperbound() &&
-            fir::getIntIfConstant(boundsOp.getUpperbound())) {
-          boundStr << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound())
-                   << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound());
-        } else if (boundsOp.getExtent() &&
-                   fir::getIntIfConstant(boundsOp.getExtent())) {
-          boundStr << "ext" << *fir::getIntIfConstant(boundsOp.getExtent());
-        } else {
-          boundStr << "?";
-        }
-      },
-      [&] { boundStr << "x"; });
-  return boundStr.str();
+    std::tie(source, destination) = genArraySectionsInRecipe(
+        firBuilder, loc, bounds, recipe.getCopyRegion().getArguments(),
+        allConstantBound, source, destination);
+  // The source and the destination of the firstprivate copy cannot alias,
+  // the destination is already properly allocated, so a simple assignment
+  // can be generated right away to avoid ending up with runtime calls
+  // for arrays of numerical, logical, and character types.
+  //
+  // The temporary_lhs flag allows indicating that user defined assignments
+  // should not be called while copying components, and that the LHS and RHS
+  // are known to not alias since the LHS is a created object.
+  //
+  // TODO: detect cases where user defined assignment is needed and add a TODO.
+  // Using temporary_lhs allows more aggressive optimizations of simple derived
+  // types. Existing compilers supporting OpenACC do not call user defined
+  // assignments; some use case is needed to decide what to do.
+  source = hlfir::loadTrivialScalar(loc, builder, source);
+  hlfir::AssignOp::create(builder, loc, source, destination, /*realloc=*/false,
+                          /*keep_lhs_length_if_realloc=*/false,
+                          /*temporary_lhs=*/true);
+  mlir::acc::TerminatorOp::create(builder, loc);
+  return recipe;
 }
 
 /// Rebuild the array type from the acc.bounds operation with constant
@@ -1458,9 +1420,8 @@ static void genPrivatizationRecipes(
     RecipeOp recipe;
     mlir::Type retTy = getTypeFromBounds(bounds, info.addr.getType());
     if constexpr (std::is_same_v<RecipeOp, mlir::acc::PrivateRecipeOp>) {
-      std::string recipeName =
-          fir::getTypeAsString(retTy, converter.getKindMap(),
-                               Fortran::lower::privatizationRecipePrefix);
+      std::string recipeName = fir::acc::getRecipeName(
+          mlir::acc::RecipeKind::private_recipe, retTy, info.addr, bounds);
       recipe = Fortran::lower::createOrGetPrivateRecipe(builder, recipeName,
                                                         operandLocation, retTy);
       auto op = createDataEntryOp<mlir::acc::PrivateOp>(
@@ -1474,10 +1435,8 @@ static void genPrivatizationRecipes(
       symbolPairs->emplace_back(op.getAccVar(),
                                 Fortran::semantics::SymbolRef(symbol));
     } else {
-      std::string suffix =
-          areAllBoundConstant(bounds) ?
getBoundsString(bounds) : ""; - std::string recipeName = fir::getTypeAsString( - retTy, converter.getKindMap(), "firstprivatization" + suffix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::firstprivate_recipe, retTy, info.addr, bounds); recipe = Fortran::lower::createOrGetFirstprivateRecipe( builder, recipeName, operandLocation, retTy, bounds); auto op = createDataEntryOp<mlir::acc::FirstprivateOp>( @@ -1611,205 +1570,6 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder, TODO(loc, "reduction operator"); } -static hlfir::DesignateOp::Subscripts -getTripletsFromArgs(mlir::acc::ReductionRecipeOp recipe) { - hlfir::DesignateOp::Subscripts triplets; - for (unsigned i = 2; i < recipe.getCombinerRegion().getArguments().size(); - i += 3) - triplets.emplace_back(hlfir::DesignateOp::Triplet{ - recipe.getCombinerRegion().getArgument(i), - recipe.getCombinerRegion().getArgument(i + 1), - recipe.getCombinerRegion().getArgument(i + 2)}); - return triplets; -} - -static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::acc::ReductionOperator op, mlir::Type ty, - mlir::Value value1, mlir::Value value2, - mlir::acc::ReductionRecipeOp &recipe, - llvm::SmallVector<mlir::Value> &bounds, - bool allConstantBound) { - ty = fir::unwrapRefType(ty); - - if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) { - mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy()); - llvm::SmallVector<fir::DoLoopOp> loops; - llvm::SmallVector<mlir::Value> ivs; - if (seqTy.hasDynamicExtents()) { - auto shape = - genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds, - recipe.getCombinerRegion().getArguments()); - auto v1DeclareOp = hlfir::DeclareOp::create(builder, loc, value1, - llvm::StringRef{}, shape); - auto v2DeclareOp = hlfir::DeclareOp::create(builder, loc, value2, - llvm::StringRef{}, shape); - hlfir::DesignateOp::Subscripts triplets = getTripletsFromArgs(recipe); - - llvm::SmallVector<mlir::Value> lenParamsLeft; - auto leftEntity = hlfir::Entity{v1DeclareOp.getBase()}; - hlfir::genLengthParameters(loc, builder, leftEntity, lenParamsLeft); - auto leftDesignate = hlfir::DesignateOp::create( - builder, loc, v1DeclareOp.getBase().getType(), v1DeclareOp.getBase(), - /*component=*/"", - /*componentShape=*/mlir::Value{}, triplets, - /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, - shape, lenParamsLeft); - auto left = hlfir::Entity{leftDesignate.getResult()}; - - llvm::SmallVector<mlir::Value> lenParamsRight; - auto rightEntity = hlfir::Entity{v2DeclareOp.getBase()}; - hlfir::genLengthParameters(loc, builder, rightEntity, lenParamsLeft); - auto rightDesignate = hlfir::DesignateOp::create( - builder, loc, v2DeclareOp.getBase().getType(), v2DeclareOp.getBase(), - /*component=*/"", - /*componentShape=*/mlir::Value{}, triplets, - /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, - shape, lenParamsRight); - auto right = hlfir::Entity{rightDesignate.getResult()}; - - llvm::SmallVector<mlir::Value, 1> typeParams; - auto genKernel = [&builder, &loc, op, seqTy, &left, &right]( - mlir::Location l, fir::FirOpBuilder &b, - mlir::ValueRange oneBasedIndices) -> hlfir::Entity { - auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices); - auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices); - auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement); - auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement); - return hlfir::Entity{genScalarCombiner( - builder, loc, op, seqTy.getEleTy(), leftVal, 
rightVal)}; - }; - mlir::Value elemental = hlfir::genElementalOp( - loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, - /*isUnordered=*/true); - hlfir::AssignOp::create(builder, loc, elemental, v1DeclareOp.getBase()); - return; - } - if (bounds.empty()) { - llvm::SmallVector<mlir::Value> extents; - mlir::Type idxTy = builder.getIndexType(); - for (auto extent : llvm::reverse(seqTy.getShape())) { - mlir::Value lb = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, 0)); - mlir::Value ub = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, extent - 1)); - mlir::Value step = mlir::arith::ConstantOp::create( - builder, loc, idxTy, builder.getIntegerAttr(idxTy, 1)); - auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } else if (allConstantBound) { - // Use the constant bound directly in the combiner region so they do not - // need to be passed as block argument. - assert(!bounds.empty() && - "seq type with constant bounds cannot have empty bounds"); - for (auto bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - llvm::SmallVector<mlir::Value> values = - genConstantBounds(builder, loc, dataBound); - auto loop = - fir::DoLoopOp::create(builder, loc, values[0], values[1], values[2], - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } else { - // Lowerbound, upperbound and step are passed as block arguments. - unsigned nbRangeArgs = - recipe.getCombinerRegion().getArguments().size() - 2; - assert((nbRangeArgs / 3 == seqTy.getDimension()) && - "Expect 3 block arguments per dimension"); - for (int i = nbRangeArgs - 1; i >= 2; i -= 3) { - mlir::Value lb = recipe.getCombinerRegion().getArgument(i); - mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1); - mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2); - auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, - /*unordered=*/false); - builder.setInsertionPointToStart(loop.getBody()); - loops.push_back(loop); - ivs.push_back(loop.getInductionVar()); - } - } - llvm::SmallVector<mlir::Value> reversedIvs(ivs.rbegin(), ivs.rend()); - auto addr1 = - fir::CoordinateOp::create(builder, loc, refTy, value1, reversedIvs); - auto addr2 = - fir::CoordinateOp::create(builder, loc, refTy, value2, reversedIvs); - auto load1 = fir::LoadOp::create(builder, loc, addr1); - auto load2 = fir::LoadOp::create(builder, loc, addr2); - mlir::Value res = - genScalarCombiner(builder, loc, op, seqTy.getEleTy(), load1, load2); - fir::StoreOp::create(builder, loc, res, addr1); - builder.setInsertionPointAfter(loops[0]); - } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) { - mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); - if (fir::isa_trivial(innerTy)) { - mlir::Value boxAddr1 = value1, boxAddr2 = value2; - if (fir::isBoxAddress(boxAddr1.getType())) - boxAddr1 = fir::LoadOp::create(builder, loc, boxAddr1); - if (fir::isBoxAddress(boxAddr2.getType())) - boxAddr2 = fir::LoadOp::create(builder, loc, boxAddr2); - boxAddr1 = fir::BoxAddrOp::create(builder, loc, boxAddr1); - boxAddr2 = fir::BoxAddrOp::create(builder, loc, boxAddr2); - auto leftEntity = hlfir::Entity{boxAddr1}; - auto rightEntity = hlfir::Entity{boxAddr2}; - - auto 
leftVal = hlfir::loadTrivialScalar(loc, builder, leftEntity); - auto rightVal = hlfir::loadTrivialScalar(loc, builder, rightEntity); - mlir::Value res = - genScalarCombiner(builder, loc, op, innerTy, leftVal, rightVal); - hlfir::AssignOp::create(builder, loc, res, boxAddr1); - } else { - mlir::Type innerTy = fir::extractSequenceType(boxTy); - fir::SequenceType seqTy = - mlir::dyn_cast_or_null<fir::SequenceType>(innerTy); - if (!seqTy) - TODO(loc, "Unsupported boxed type in OpenACC reduction combiner"); - - auto shape = - genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds, - recipe.getCombinerRegion().getArguments()); - hlfir::DesignateOp::Subscripts triplets = - getSubscriptsFromArgs(recipe.getCombinerRegion().getArguments()); - auto leftEntity = hlfir::Entity{value1}; - if (fir::isBoxAddress(value1.getType())) - leftEntity = hlfir::Entity{ - fir::LoadOp::create(builder, loc, value1).getResult()}; - auto left = - genDesignateWithTriplets(builder, loc, leftEntity, triplets, shape); - auto rightEntity = hlfir::Entity{value2}; - if (fir::isBoxAddress(value2.getType())) - rightEntity = hlfir::Entity{ - fir::LoadOp::create(builder, loc, value2).getResult()}; - auto right = - genDesignateWithTriplets(builder, loc, rightEntity, triplets, shape); - - llvm::SmallVector<mlir::Value, 1> typeParams; - auto genKernel = [&builder, &loc, op, seqTy, &left, &right]( - mlir::Location l, fir::FirOpBuilder &b, - mlir::ValueRange oneBasedIndices) -> hlfir::Entity { - auto leftElement = hlfir::getElementAt(l, b, left, oneBasedIndices); - auto rightElement = hlfir::getElementAt(l, b, right, oneBasedIndices); - auto leftVal = hlfir::loadTrivialScalar(l, b, leftElement); - auto rightVal = hlfir::loadTrivialScalar(l, b, rightElement); - return hlfir::Entity{genScalarCombiner( - builder, loc, op, seqTy.getEleTy(), leftVal, rightVal)}; - }; - mlir::Value elemental = hlfir::genElementalOp( - loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, - /*isUnordered=*/true); - hlfir::AssignOp::create(builder, loc, elemental, value1); - } - } else { - mlir::Value res = genScalarCombiner(builder, loc, op, ty, value1, value2); - fir::StoreOp::create(builder, loc, res, value1); - } -} - mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe( fir::FirOpBuilder &builder, llvm::StringRef recipeName, mlir::Location loc, mlir::Type ty, mlir::acc::ReductionOperator op, @@ -1819,37 +1579,33 @@ mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe( if (auto recipe = mod.lookupSymbol<mlir::acc::ReductionRecipeOp>(recipeName)) return recipe; - auto ip = builder.saveInsertionPoint(); - + mlir::OpBuilder::InsertionGuard guard(builder); auto recipe = genRecipeOp<mlir::acc::ReductionRecipeOp>( builder, mod, recipeName, loc, ty, op); - - // The two first block arguments are the two values to be combined. - // The next arguments are the iteration ranges (lb, ub, step) to be used - // for the combiner if needed. 
- llvm::SmallVector<mlir::Type> argsTy{ty, ty}; - llvm::SmallVector<mlir::Location> argsLoc{loc, loc}; - bool allConstantBound = areAllBoundConstant(bounds); - if (!allConstantBound) { - for (mlir::Value bound : llvm::reverse(bounds)) { - auto dataBound = - mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); - argsTy.push_back(dataBound.getLowerbound().getType()); - argsLoc.push_back(dataBound.getLowerbound().getLoc()); - argsTy.push_back(dataBound.getUpperbound().getType()); - argsLoc.push_back(dataBound.getUpperbound().getLoc()); - argsTy.push_back(dataBound.getStartIdx().getType()); - argsLoc.push_back(dataBound.getStartIdx().getLoc()); - } - } - builder.createBlock(&recipe.getCombinerRegion(), - recipe.getCombinerRegion().end(), argsTy, argsLoc); - builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back()); - mlir::Value v1 = recipe.getCombinerRegion().front().getArgument(0); - mlir::Value v2 = recipe.getCombinerRegion().front().getArgument(1); - genCombiner(builder, loc, op, ty, v1, v2, recipe, bounds, allConstantBound); - mlir::acc::YieldOp::create(builder, loc, v1); - builder.restoreInsertionPoint(ip); + bool allConstantBound = fir::acc::areAllBoundsConstant(bounds); + + auto [dest, src] = genRecipeCombinerOrCopyRegion( + builder, loc, ty, recipe.getCombinerRegion(), bounds, allConstantBound); + // Generate loops that combine and assign the inputs into dest (or array + // section of the inputs when there are bounds). + hlfir::Entity srcSection = src; + hlfir::Entity destSection = dest; + if (!bounds.empty()) + std::tie(srcSection, destSection) = genArraySectionsInRecipe( + builder, loc, bounds, recipe.getCombinerRegion().getArguments(), + allConstantBound, srcSection, destSection); + + mlir::Type elementType = fir::getFortranElementType(ty); + auto genKernel = [&](mlir::Location l, fir::FirOpBuilder &b, + hlfir::Entity srcElementValue, + hlfir::Entity destElementValue) -> hlfir::Entity { + return hlfir::Entity{genScalarCombiner(builder, loc, op, elementType, + srcElementValue, destElementValue)}; + }; + hlfir::genNoAliasAssignment(loc, builder, srcSection, destSection, + /*emitWorkshareLoop=*/false, + /*temporaryLHS=*/false, genKernel); + mlir::acc::YieldOp::create(builder, loc, dest); return recipe; } @@ -1911,15 +1667,12 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList, mlir::acc::DataClause::acc_reduction, info.addr.getType(), async, asyncDeviceTypes, asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true); mlir::Type ty = op.getAccVar().getType(); - if (!areAllBoundConstant(bounds) || + if (!fir::acc::areAllBoundsConstant(bounds) || fir::isAssumedShape(info.addr.getType()) || fir::isAllocatableOrPointerArray(info.addr.getType())) ty = info.addr.getType(); - std::string suffix = - areAllBoundConstant(bounds) ? 
getBoundsString(bounds) : ""; - std::string recipeName = fir::getTypeAsString( - ty, converter.getKindMap(), - ("reduction_" + stringifyReductionOperator(mlirOp)).str() + suffix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::reduction_recipe, ty, info.addr, bounds, mlirOp); mlir::acc::ReductionRecipeOp recipe = Fortran::lower::createOrGetReductionRecipe( @@ -2164,9 +1917,8 @@ static void privatizeIv( } if (privateOp == nullptr) { - std::string recipeName = - fir::getTypeAsString(ivValue.getType(), converter.getKindMap(), - Fortran::lower::privatizationRecipePrefix); + std::string recipeName = fir::acc::getRecipeName( + mlir::acc::RecipeKind::private_recipe, ivValue.getType(), ivValue, {}); auto recipe = Fortran::lower::createOrGetPrivateRecipe( builder, recipeName, loc, ivValue.getType()); @@ -2251,6 +2003,49 @@ static void determineDefaultLoopParMode( } } +// Helper to visit Bounds of DO LOOP nest. +static void visitLoopControl( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::DoConstruct &outerDoConstruct, + uint64_t loopsToProcess, Fortran::lower::pft::Evaluation &eval, + std::function<void(const Fortran::parser::LoopControl::Bounds &, + mlir::Location)> + callback) { + Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); + for (uint64_t i = 0; i < loopsToProcess; ++i) { + const Fortran::parser::LoopControl *loopControl; + if (i == 0) { + loopControl = &*outerDoConstruct.GetLoopControl(); + mlir::Location loc = converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct)); + callback(std::get<Fortran::parser::LoopControl::Bounds>(loopControl->u), + loc); + } else { + // Safely locate the next inner DoConstruct within this eval. + const Fortran::parser::DoConstruct *innerDo = nullptr; + if (crtEval && crtEval->hasNestedEvaluations()) { + for (Fortran::lower::pft::Evaluation &child : + crtEval->getNestedEvaluations()) { + if (auto *stmt = child.getIf<Fortran::parser::DoConstruct>()) { + innerDo = stmt; + // Prepare to descend for the next iteration + crtEval = &child; + break; + } + } + } + if (!innerDo) + break; // No deeper loop; stop collecting collapsed bounds. + + loopControl = &*innerDo->GetLoopControl(); + mlir::Location loc = + converter.genLocation(Fortran::parser::FindSourceLocation(*innerDo)); + callback(std::get<Fortran::parser::LoopControl::Bounds>(loopControl->u), + loc); + } + } +} + // Extract loop bounds, steps, induction variables, and privatization info // for both DO CONCURRENT and regular do loops static void processDoLoopBounds( @@ -2272,7 +2067,6 @@ static void processDoLoopBounds( llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) { assert(loopsToProcess > 0 && "expect at least one loop"); locs.push_back(currentLocation); // Location of the directive - Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); if (isDoConcurrent) { @@ -2313,57 +2107,29 @@ static void processDoLoopBounds( inclusiveBounds.push_back(true); } } else { - for (uint64_t i = 0; i < loopsToProcess; ++i) { - const Fortran::parser::LoopControl *loopControl; - if (i == 0) { - loopControl = &*outerDoConstruct.GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - } else { - // Safely locate the next inner DoConstruct within this eval. 
-        const Fortran::parser::DoConstruct *innerDo = nullptr;
-        if (crtEval && crtEval->hasNestedEvaluations()) {
-          for (Fortran::lower::pft::Evaluation &child :
-               crtEval->getNestedEvaluations()) {
-            if (auto *stmt = child.getIf<Fortran::parser::DoConstruct>()) {
-              innerDo = stmt;
-              // Prepare to descend for the next iteration
-              crtEval = &child;
-              break;
-            }
-          }
-        }
-        if (!innerDo)
-          break; // No deeper loop; stop collecting collapsed bounds.
-
-        loopControl = &*innerDo->GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(*innerDo)));
-      }
-
-      const Fortran::parser::LoopControl::Bounds *bounds =
-          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
-      assert(bounds && "Expected bounds on the loop construct");
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
-      if (bounds->step)
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      Fortran::semantics::Symbol &ivSym =
-          bounds->name.thing.symbol->GetUltimate();
-      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes);
-
-      inclusiveBounds.push_back(true);
-
-      // crtEval already updated when descending; no blind increment here.
-    }
+    visitLoopControl(
+        converter, outerDoConstruct, loopsToProcess, eval,
+        [&](const Fortran::parser::LoopControl::Bounds &bounds,
+            mlir::Location loc) {
+          locs.push_back(loc);
+          lowerbounds.push_back(fir::getBase(converter.genExprValue(
+              *Fortran::semantics::GetExpr(bounds.lower), stmtCtx)));
+          upperbounds.push_back(fir::getBase(converter.genExprValue(
+              *Fortran::semantics::GetExpr(bounds.upper), stmtCtx)));
+          if (bounds.step)
+            steps.push_back(fir::getBase(converter.genExprValue(
+                *Fortran::semantics::GetExpr(bounds.step), stmtCtx)));
+          else // If `step` is not present, assume it is `1`.
+            steps.push_back(builder.createIntegerConstant(
+                currentLocation, upperbounds[upperbounds.size() - 1].getType(),
+                1));
+          Fortran::semantics::Symbol &ivSym =
+              bounds.name.thing.symbol->GetUltimate();
+          privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+                      privateOperands, ivPrivate, privatizationRecipes);
+
+          inclusiveBounds.push_back(true);
+        });
   }
 }
 
@@ -2499,6 +2265,34 @@ static void remapDataOperandSymbols(
   }
 }
 
+static void privatizeInductionVariables(
+    Fortran::lower::AbstractConverter &converter,
+    mlir::Location currentLocation,
+    const Fortran::parser::DoConstruct &outerDoConstruct,
+    Fortran::lower::pft::Evaluation &eval,
+    llvm::SmallVector<mlir::Value> &privateOperands,
+    llvm::SmallVector<std::pair<mlir::Value, Fortran::semantics::SymbolRef>>
+        &ivPrivate,
+    llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+    llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
+  // ivTypes and ivLocs are ignored since no acc.loop control arguments will
+  // be created.
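+  //
+  // Illustrative example (a hedged sketch of the kind of input handled here,
+  // not code taken from this patch): a nest such as
+  //
+  //     !$acc loop
+  //     do i = 1, n
+  //       if (a(i) < 0.0) return ! early return => unstructured lowering
+  //     end do
+  //
+  // takes this path: only the induction variable `i` is privatized, and no
+  // lb/ub/step control arguments are attached to the acc.loop operation.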
+  llvm::SmallVector<mlir::Type> ivTypes;
+  llvm::SmallVector<mlir::Location> ivLocs;
+  assert(!outerDoConstruct.IsDoConcurrent() &&
+         "do concurrent loops are not expected to contain early exits");
+  visitLoopControl(converter, outerDoConstruct, loopsToProcess, eval,
+                   [&](const Fortran::parser::LoopControl::Bounds &bounds,
+                       mlir::Location loc) {
+                     locs.push_back(loc);
+                     Fortran::semantics::Symbol &ivSym =
+                         bounds.name.thing.symbol->GetUltimate();
+                     privatizeIv(converter, ivSym, currentLocation, ivTypes,
+                                 ivLocs, privateOperands, ivPrivate,
+                                 privatizationRecipes);
+                   });
+}
+
 static mlir::acc::LoopOp buildACCLoopOp(
     Fortran::lower::AbstractConverter &converter,
     mlir::Location currentLocation,
@@ -2528,13 +2322,22 @@ static mlir::acc::LoopOp buildACCLoopOp(
   llvm::SmallVector<mlir::Location> locs;
   llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
 
-  // Look at the do/do concurrent loops to extract bounds information.
-  processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
-                      outerDoConstruct, eval, lowerbounds, upperbounds, steps,
-                      privateOperands, ivPrivate, privatizationRecipes, ivTypes,
-                      ivLocs, inclusiveBounds, locs, loopsToProcess);
-
-  // Prepare the operand segment size attribute and the operands value range.
+  // Look at the do/do concurrent loops to extract bounds information unless
+  // this loop is lowered in an unstructured fashion, in which case bounds are
+  // not represented on acc.loop and explicit control flow is used inside the
+  // body.
+  if (!eval.lowerAsUnstructured()) {
+    processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
+                        outerDoConstruct, eval, lowerbounds, upperbounds, steps,
+                        privateOperands, ivPrivate, privatizationRecipes,
+                        ivTypes, ivLocs, inclusiveBounds, locs, loopsToProcess);
+  } else {
+    // When the loop contains early exits, privatize induction variables, but
+    // do not create acc.loop bounds. The control flow of the loop will be
+    // generated explicitly in the acc.loop body, which is just a container.
+    privatizeInductionVariables(converter, currentLocation, outerDoConstruct,
+                                eval, privateOperands, ivPrivate,
+                                privatizationRecipes, locs, loopsToProcess);
+  }
   llvm::SmallVector<mlir::Value> operands;
   llvm::SmallVector<int32_t> operandSegments;
   addOperands(operands, operandSegments, lowerbounds);
@@ -2563,20 +2366,36 @@ static mlir::acc::LoopOp buildACCLoopOp(
   // Remap symbols from data clauses to use data operation results
   remapDataOperandSymbols(converter, builder, loopOp, dataOperandSymbolPairs);
 
-  for (auto [arg, iv] :
-       llvm::zip(loopOp.getLoopRegions().front()->front().getArguments(),
-                 ivPrivate)) {
-    // Store block argument to the related iv private variable.
-    mlir::Value privateValue =
-        converter.getSymbolAddress(std::get<Fortran::semantics::SymbolRef>(iv));
-    fir::StoreOp::create(builder, currentLocation, arg, privateValue);
+  if (!eval.lowerAsUnstructured()) {
+    for (auto [arg, iv] :
+         llvm::zip(loopOp.getLoopRegions().front()->front().getArguments(),
+                   ivPrivate)) {
+      // Store block argument to the related iv private variable.
+ mlir::Value privateValue = converter.getSymbolAddress( + std::get<Fortran::semantics::SymbolRef>(iv)); + fir::StoreOp::create(builder, currentLocation, arg, privateValue); + } + loopOp.setInclusiveUpperbound(inclusiveBounds); + } else { + loopOp.setUnstructuredAttr(builder.getUnitAttr()); } - loopOp.setInclusiveUpperbound(inclusiveBounds); - return loopOp; } +static bool hasEarlyReturn(Fortran::lower::pft::Evaluation &eval) { + bool hasReturnStmt = false; + for (auto &e : eval.getNestedEvaluations()) { + e.visit(Fortran::common::visitors{ + [&](const Fortran::parser::ReturnStmt &) { hasReturnStmt = true; }, + [&](const auto &s) {}, + }); + if (e.hasNestedEvaluations()) + hasReturnStmt = hasEarlyReturn(e); + } + return hasReturnStmt; +} + static mlir::acc::LoopOp createLoopOp( Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -2586,8 +2405,7 @@ static mlir::acc::LoopOp createLoopOp( Fortran::lower::pft::Evaluation &eval, const Fortran::parser::AccClauseList &accClauseList, std::optional<mlir::acc::CombinedConstructsType> combinedConstructs = - std::nullopt, - bool needEarlyReturnHandling = false) { + std::nullopt) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); llvm::SmallVector<mlir::Value> tileOperands, privateOperands, reductionOperands, cacheOperands, vectorOperands, workerNumOperands, @@ -2763,7 +2581,10 @@ static mlir::acc::LoopOp createLoopOp( llvm::SmallVector<mlir::Type> retTy; mlir::Value yieldValue; - if (needEarlyReturnHandling) { + if (eval.lowerAsUnstructured() && hasEarlyReturn(eval)) { + // When there is a return statement inside the loop, add a result to the + // acc.loop that will be used in a conditional branch after the loop to + // return. mlir::Type i1Ty = builder.getI1Type(); yieldValue = builder.createIntegerConstant(currentLocation, i1Ty, 0); retTy.push_back(i1Ty); @@ -2844,19 +2665,6 @@ static mlir::acc::LoopOp createLoopOp( return loopOp; } -static bool hasEarlyReturn(Fortran::lower::pft::Evaluation &eval) { - bool hasReturnStmt = false; - for (auto &e : eval.getNestedEvaluations()) { - e.visit(Fortran::common::visitors{ - [&](const Fortran::parser::ReturnStmt &) { hasReturnStmt = true; }, - [&](const auto &s) {}, - }); - if (e.hasNestedEvaluations()) - hasReturnStmt = hasEarlyReturn(e); - } - return hasReturnStmt; -} - static mlir::Value genACC(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semanticsContext, @@ -2870,17 +2678,6 @@ genACC(Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); - bool needEarlyExitHandling = false; - if (eval.lowerAsUnstructured()) { - needEarlyExitHandling = hasEarlyReturn(eval); - // If the loop is lowered in an unstructured fashion, lowering generates - // explicit control flow that duplicates the looping semantics of the - // loops. 
- if (!needEarlyExitHandling) - TODO(currentLocation, - "loop with early exit inside OpenACC loop construct"); - } - Fortran::lower::StatementContext stmtCtx; assert(loopDirective.v == llvm::acc::ACCD_loop && @@ -2893,8 +2690,8 @@ genACC(Fortran::lower::AbstractConverter &converter, std::get<std::optional<Fortran::parser::DoConstruct>>(loopConstruct.t); auto loopOp = createLoopOp(converter, currentLocation, semanticsContext, stmtCtx, *outerDoConstruct, eval, accClauseList, - /*combinedConstructs=*/{}, needEarlyExitHandling); - if (needEarlyExitHandling) + /*combinedConstructs=*/{}); + if (loopOp.getNumResults() == 1) return loopOp.getResult(0); return mlir::Value{}; @@ -3106,8 +2903,8 @@ static Op createComputeOp( genDataOperandOperationsWithModifier<mlir::acc::CreateOp, Fortran::parser::AccClause::Copyout>( copyoutClause, converter, semanticsContext, stmtCtx, - Fortran::parser::AccDataModifier::Modifier::ReadOnly, - dataClauseOperands, mlir::acc::DataClause::acc_copyout, + Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands, + mlir::acc::DataClause::acc_copyout, mlir::acc::DataClause::acc_copyout_zero, async, asyncDeviceTypes, asyncOnlyDeviceTypes, /*setDeclareAttr=*/false, &dataOperandSymbolPairs); @@ -3679,10 +3476,6 @@ genACC(Fortran::lower::AbstractConverter &converter, converter.genLocation(beginCombinedDirective.source); Fortran::lower::StatementContext stmtCtx; - if (eval.lowerAsUnstructured()) - TODO(currentLocation, - "loop with early exit inside OpenACC combined construct"); - if (combinedDirective.v == llvm::acc::ACCD_kernels_loop) { createComputeOp<mlir::acc::KernelsOp>( converter, currentLocation, eval, semanticsContext, stmtCtx, diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 1c163e6de7e5a..872f31fe45cca 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -406,6 +406,11 @@ bool ClauseProcessor::processMergeable( return markClauseOccurrence<omp::clause::Mergeable>(result.mergeable); } +bool ClauseProcessor::processNogroup( + mlir::omp::NogroupClauseOps &result) const { + return markClauseOccurrence<omp::clause::Nogroup>(result.nogroup); +} + bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const { return markClauseOccurrence<omp::clause::Nowait>(result.nowait); } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 6452e39b97551..d524b4ddc8ac4 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -89,6 +89,7 @@ class ClauseProcessor { bool processInclusive(mlir::Location currentLocation, mlir::omp::InclusiveClauseOps &result) const; bool processMergeable(mlir::omp::MergeableClauseOps &result) const; + bool processNogroup(mlir::omp::NogroupClauseOps &result) const; bool processNowait(mlir::omp::NowaitClauseOps &result) const; bool processNumTasks(lower::StatementContext &stmtCtx, mlir::omp::NumTasksClauseOps &result) const; diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 0f60b47991004..b1a3c3d3c5439 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -10,7 +10,6 @@ #include "flang/Common/idioms.h" #include "flang/Evaluate/expression.h" -#include "flang/Optimizer/Builder/Todo.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/openmp-modifiers.h" @@ -249,8 +248,10 @@ 
MAKE_EMPTY_CLASS(Groupprivate, Groupprivate); MAKE_INCOMPLETE_CLASS(AdjustArgs, AdjustArgs); MAKE_INCOMPLETE_CLASS(AppendArgs, AppendArgs); +MAKE_INCOMPLETE_CLASS(Collector, Collector); MAKE_INCOMPLETE_CLASS(GraphId, GraphId); MAKE_INCOMPLETE_CLASS(GraphReset, GraphReset); +MAKE_INCOMPLETE_CLASS(Inductor, Inductor); MAKE_INCOMPLETE_CLASS(Replayable, Replayable); MAKE_INCOMPLETE_CLASS(Transparent, Transparent); @@ -394,8 +395,6 @@ makePrescriptiveness(parser::OmpPrescriptiveness::Value v) { switch (v) { case parser::OmpPrescriptiveness::Value::Strict: return clause::Prescriptiveness::Strict; - case parser::OmpPrescriptiveness::Value::Fallback: - return clause::Prescriptiveness::Fallback; } llvm_unreachable("Unexpected prescriptiveness"); } @@ -797,21 +796,31 @@ DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp, semantics::SemanticsContext &semaCtx) { // imp.v -> OmpDyngroupprivateClause CLAUSET_ENUM_CONVERT( // - convert, parser::OmpAccessGroup::Value, DynGroupprivate::AccessGroup, + makeAccessGroup, parser::OmpAccessGroup::Value, + DynGroupprivate::AccessGroup, // clang-format off MS(Cgroup, Cgroup) // clang-format on ); + CLAUSET_ENUM_CONVERT( // + makeFallback, parser::OmpFallbackModifier::Value, + DynGroupprivate::Fallback, + // clang-format off + MS(Abort, Abort) + MS(Default_Mem, Default_Mem) + MS(Null, Null) + // clang-format on + ); + auto &mods = semantics::OmpGetModifiers(inp.v); auto *m0 = semantics::OmpGetUniqueModifier<parser::OmpAccessGroup>(mods); - auto *m1 = semantics::OmpGetUniqueModifier<parser::OmpPrescriptiveness>(mods); + auto *m1 = semantics::OmpGetUniqueModifier<parser::OmpFallbackModifier>(mods); auto &size = std::get<parser::ScalarIntExpr>(inp.v.t); - return DynGroupprivate{ - {/*AccessGroup=*/maybeApplyToV(convert, m0), - /*Prescriptiveness=*/maybeApplyToV(makePrescriptiveness, m1), - /*Size=*/makeExpr(size, semaCtx)}}; + return DynGroupprivate{{/*AccessGroup=*/maybeApplyToV(makeAccessGroup, m0), + /*Fallback=*/maybeApplyToV(makeFallback, m1), + /*Size=*/makeExpr(size, semaCtx)}}; } Enter make(const parser::OmpClause::Enter &inp, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 146a252b049ec..83c2eda0a2dc7 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -342,7 +342,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { if (!hasLastPrivate) return; - if (mlir::isa<mlir::omp::WsloopOp>(op) || mlir::isa<mlir::omp::SimdOp>(op)) { + if (mlir::isa<mlir::omp::WsloopOp>(op) || mlir::isa<mlir::omp::SimdOp>(op) || + mlir::isa<mlir::omp::TaskloopOp>(op)) { mlir::omp::LoopRelatedClauseOps result; llvm::SmallVector<const semantics::Symbol *> iv; collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval, @@ -408,7 +409,7 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { } else { TODO(converter.getCurrentLocation(), "lastprivate clause in constructs other than " - "simd/worksharing-loop"); + "simd/worksharing-loop/taskloop"); } } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..4048aeea37b92 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1008,9 +1008,7 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Firstprivate: - case DefMap::ImplicitBehavior::None: - TODO(loc, 
"Firstprivate and None are currently unsupported defaultmap " - "behaviour"); + TODO(loc, "Firstprivate is currently unsupported defaultmap behaviour"); break; case DefMap::ImplicitBehavior::From: return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from, @@ -1032,8 +1030,9 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Default: + case DefMap::ImplicitBehavior::None: llvm_unreachable( - "Implicit None Behaviour Should Have Been Handled Earlier"); + "Implicit None and Default behaviour should have been handled earlier"); break; } @@ -1763,21 +1762,25 @@ static void genTaskgroupClauses( cp.processTaskReduction(loc, clauseOps, taskReductionSyms); } -static void genTaskloopClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List<Clause> &clauses, mlir::Location loc, - mlir::omp::TaskloopOperands &clauseOps) { +static void genTaskloopClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List<Clause> &clauses, + mlir::Location loc, mlir::omp::TaskloopOperands &clauseOps, + llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms, + llvm::SmallVectorImpl<const semantics::Symbol *> &inReductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processFinal(stmtCtx, clauseOps); cp.processGrainsize(stmtCtx, clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_taskloop, clauseOps); + cp.processInReduction(loc, clauseOps, inReductionSyms); + cp.processMergeable(clauseOps); + cp.processNogroup(clauseOps); cp.processNumTasks(stmtCtx, clauseOps); - - cp.processTODO<clause::Allocate, clause::Collapse, clause::Default, - clause::Final, clause::If, clause::InReduction, - clause::Lastprivate, clause::Mergeable, clause::Nogroup, - clause::Priority, clause::Reduction, clause::Shared, - clause::Untied>(loc, llvm::omp::Directive::OMPD_taskloop); + cp.processPriority(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); + cp.processUntied(clauseOps); } static void genTaskwaitClauses(lower::AbstractConverter &converter, @@ -2979,8 +2982,11 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::TaskloopOperands taskloopClauseOps; + llvm::SmallVector<const semantics::Symbol *> reductionSyms; + llvm::SmallVector<const semantics::Symbol *> inReductionSyms; + genTaskloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - taskloopClauseOps); + taskloopClauseOps, reductionSyms, inReductionSyms); DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, enableDelayedPrivatization, symTable); @@ -2994,6 +3000,10 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( EntryBlockArgs taskloopArgs; taskloopArgs.priv.syms = dsp.getDelayedPrivSymbols(); taskloopArgs.priv.vars = taskloopClauseOps.privateVars; + taskloopArgs.reduction.syms = reductionSyms; + taskloopArgs.reduction.vars = taskloopClauseOps.reductionVars; + taskloopArgs.inReduction.syms = inReductionSyms; + taskloopArgs.inReduction.vars = taskloopClauseOps.inReductionVars; auto taskLoopOp = genWrapperOp<mlir::omp::TaskloopOp>( converter, loc, taskloopClauseOps, taskloopArgs); @@ -3503,12 +3513,12 @@ static void genOMP(lower::AbstractConverter &converter, 
lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3909,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index cb555249125f6..d5b8045d91992 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -48,31 +48,6 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) { builder.setInsertionPointToStart(newBlock); } -/// Initializes values for STAT and ERRMSG -static std::pair<mlir::Value, mlir::Value> getStatAndErrmsg( - Fortran::lower::AbstractConverter &converter, mlir::Location loc, - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList) { - Fortran::lower::StatementContext stmtCtx; - - mlir::Value errMsgExpr, statExpr; - for (const Fortran::parser::StatOrErrmsg &statOrErr : statOrErrList) { - std::visit(Fortran::common::visitors{ - [&](const Fortran::parser::StatVariable &statVar) { - statExpr = fir::getBase(converter.genExprAddr( - loc, Fortran::semantics::GetExpr(statVar), stmtCtx)); - }, - [&](const Fortran::parser::MsgVariable &errMsgVar) { - const Fortran::semantics::SomeExpr *expr = - Fortran::semantics::GetExpr(errMsgVar); - errMsgExpr = fir::getBase( - converter.genExprBox(loc, *expr, stmtCtx)); - }}, - statOrErr.u); - } - - return {statExpr, errMsgExpr}; -} - //===----------------------------------------------------------------------===// // Misc. 
Fortran statements that lower to runtime calls //===----------------------------------------------------------------------===// @@ -193,74 +168,6 @@ void Fortran::lower::genUnlockStatement( TODO(converter.getCurrentLocation(), "coarray: UNLOCK runtime"); } -void Fortran::lower::genSyncAllStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncAllStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mif::SyncAllOp::create(builder, loc, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncImagesStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncImagesStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = - std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t); - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - // SYNC_IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count - // == 0. Note further that SYNC IMAGES(*) is not semantically equivalent to - // SYNC ALL. - Fortran::lower::StatementContext stmtCtx; - mlir::Value imageSet; - const Fortran::parser::SyncImagesStmt::ImageSet &imgSet = - std::get<Fortran::parser::SyncImagesStmt::ImageSet>(stmt.t); - std::visit(Fortran::common::visitors{ - [&](const Fortran::parser::IntExpr &intExpr) { - const SomeExpr *expr = Fortran::semantics::GetExpr(intExpr); - imageSet = - fir::getBase(converter.genExprBox(loc, *expr, stmtCtx)); - }, - [&](const Fortran::parser::Star &) { - // Image set is not set. 
- imageSet = mlir::Value{}; - }}, - imgSet.u); - - mif::SyncImagesOp::create(builder, loc, imageSet, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncMemoryStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncMemoryStmt &stmt) { - mlir::Location loc = converter.getCurrentLocation(); - converter.checkCoarrayEnabled(); - - // Handle STAT and ERRMSG values - const std::list<Fortran::parser::StatOrErrmsg> &statOrErrList = stmt.v; - auto [statAddr, errMsgAddr] = getStatAndErrmsg(converter, loc, statOrErrList); - - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - mif::SyncMemoryOp::create(builder, loc, statAddr, errMsgAddr); -} - -void Fortran::lower::genSyncTeamStatement( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::SyncTeamStmt &) { - TODO(converter.getCurrentLocation(), "coarray: SYNC TEAM runtime"); -} - void Fortran::lower::genPauseStatement( Fortran::lower::AbstractConverter &converter, const Fortran::parser::PauseStmt &) { diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index 1b4d37e9798a9..4b95a3adf052a 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -82,7 +82,7 @@ class HashEvaluateExpr { x.cosubscript()) cosubs -= getHashValue(v); return getHashValue(x.base()) * 97u - cosubs + getHashValue(x.stat()) + - 257u + getHashValue(x.team()); + 257u + getHashValue(x.team()) + getHashValue(x.notify()); } static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) { if (x.IsSymbol()) @@ -341,7 +341,8 @@ class IsEqualEvaluateExpr { const Fortran::evaluate::CoarrayRef &y) { return isEqual(x.base(), y.base()) && isEqual(x.cosubscript(), y.cosubscript()) && - isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()); + isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()) && + isEqual(x.notify(), y.notify()); } static bool isEqual(const Fortran::evaluate::NamedEntity &x, const Fortran::evaluate::NamedEntity &y) { diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 73ddd1ff80126..ef9894232b409 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -27,6 +27,26 @@ using namespace mlir; #define DEBUG_TYPE "fir-alias-analysis" +// Inspect for value-scoped Allocate effects and determine whether +// 'candidate' is a new allocation. 
Returns SourceKind::Allocate if a +// MemAlloc effect is attached +static fir::AliasAnalysis::SourceKind +classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) { + if (!op) + return fir::AliasAnalysis::SourceKind::Unknown; + auto interface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op); + if (!interface) + return fir::AliasAnalysis::SourceKind::Unknown; + llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects; + interface.getEffects(effects); + for (mlir::MemoryEffects::EffectInstance &e : effects) { + if (mlir::isa<mlir::MemoryEffects::Allocate>(e.getEffect()) && + e.getValue() && e.getValue() == candidate) + return fir::AliasAnalysis::SourceKind::Allocate; + } + return fir::AliasAnalysis::SourceKind::Unknown; +} + //===----------------------------------------------------------------------===// // AliasAnalysis: alias //===----------------------------------------------------------------------===// @@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { ty = defOp->getResultTypes()[0]; + // Value-scoped allocation detection via effects. + if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { + type = SourceKind::Allocate; + break; + } llvm::TypeSwitch<Operation *>(defOp) .Case<hlfir::AsExprOp>([&](auto op) { v = op.getVar(); @@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case<fir::AllocaOp, fir::AllocMemOp>([&](auto op) { - // Unique memory allocation. - type = SourceKind::Allocate; - breakFromLoop = true; - }) .Case<fir::ConvertOp>([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); @@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, type = SourceKind::Global; } else { auto def = llvm::cast<mlir::Value>(boxSrc.origin.u); - // TODO: Add support to fir.allocmem - if (auto allocOp = def.template getDefiningOp<fir::AllocaOp>()) { - v = def; - defOp = v.getDefiningOp(); - type = SourceKind::Allocate; - } else if (isDummyArgument(def)) { - defOp = nullptr; - v = def; - } else { - type = SourceKind::Indirect; + bool classified = false; + if (auto defDefOp = def.getDefiningOp()) { + if (classifyAllocateFromEffects(defDefOp, def) == + SourceKind::Allocate) { + v = def; + defOp = defDefOp; + type = SourceKind::Allocate; + classified = true; + } + } + if (!classified) { + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } else { + type = SourceKind::Indirect; + } } } breakFromLoop = true; diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 1f95259a857da..37c9c2d703c76 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUDAIntrinsicCall.cpp CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp new file mode 100644 index 0000000000000..323d1ef78e65d --- /dev/null +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -0,0 +1,1556 @@ +//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
+// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
+// CUDA specific intrinsic handlers.
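+// The table is keyed by intrinsic name and must stay sorted:
+// findCUDAIntrinsicHandler (defined after the table) binary-searches it with
+// llvm::lower_bound, and the isSorted static_assert below the table enforces
+// the ordering at compile time. Lookup sketch (dispatch details elided):
+//
+//   if (const IntrinsicHandler *handler = findCUDAIntrinsicHandler("clock"))
+//     ... invoke the handler's generator ...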
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, 
+ {"tma_bulk_store_r8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::sorted when C++20 is default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
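+ // The atomicrmw returns the values seen before the update, so each lane of 'add' is extracted and written back into the temporary result array.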
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
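+ // This occurs for mixed 32-/64-bit operands after the float bitcasts above; convert the exchange value to the compare value's type.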
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + // Retry loop: keep issuing try_wait while the previous attempt returned 0. + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc,
ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCG, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ?
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0])); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res =
fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + 
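+ // Write the computed size into the derived type's 'size' field; the 'rank' field is filled the same way below.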
fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc,
resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = (threadIdx.x & 31) + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM +template <mlir::NVVM::MemScopeKind scope> +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::MembarOp::create(builder, loc, scope); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +//
TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + 
mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? 
builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index cf7588f275d22..461deb8e4cb55 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" @@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { } } } + +int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure) { + auto eleTy = fir::unwrapSequenceType(type); + if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) + return kindMap.getLogicalBitsize(t.getFKind()) / 8; + if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { + int elemSize = + mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; + return 2 * elemSize; + } + if (auto t{mlir::dyn_cast<fir::CharacterType>(eleTy)}) + return kindMap.getCharacterBitsize(t.getFKind()) / 8; + if (emitErrorOnFailure) + mlir::emitError(loc, "unsupported type"); + return 0; +} diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 5da27d1713825..6ef6074cf73ad 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -427,7 +427,8 @@ mlir::Value fir::FirOpBuilder::genTempDeclareOp( builder, loc, memref.getType(), memref, shape, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, - /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}); + /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); } mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) { diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 93dfc577665ce..b435ae15cff58 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -250,7 +250,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &exv, llvm::StringRef name, fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope, mlir::Value storage, std::uint64_t storageOffset, - cuf::DataAttributeAttr dataAttr) { + cuf::DataAttributeAttr dataAttr, unsigned dummyArgNo) { mlir::Value base = fir::getBase(exv); assert(fir::conformsWithPassByRef(base.getType()) && @@ -281,7 +281,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, [](const auto &) {}); auto declareOp = hlfir::DeclareOp::create( builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage, - storageOffset, flags, dataAttr); + storageOffset, flags, dataAttr, dummyArgNo); return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation()); } @@ -1392,6 +1392,66 @@ bool 
hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) { return false; } +static void combineAndStoreElement( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs, + hlfir::Entity rhs, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs); + if (combiner) { + hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs); + valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign); + } + hlfir::AssignOp::create(builder, loc, valueToAssign, lhs, + /*realloc=*/false, + /*keep_lhs_length_if_realloc=*/false, + /*temporary_lhs=*/temporaryLHS); +} + +void hlfir::genNoAliasArrayAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + mlir::OpBuilder::InsertionGuard guard(builder); + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs); + llvm::SmallVector<mlir::Value> lhsExtents = + hlfir::getIndexExtents(loc, builder, lhsShape); + mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs); + llvm::SmallVector<mlir::Value> rhsExtents = + hlfir::getIndexExtents(loc, builder, rhsShape); + llvm::SmallVector<mlir::Value> extents = + fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents); + hlfir::LoopNest loopNest = + hlfir::genLoopNest(loc, builder, extents, + /*isUnordered=*/true, emitWorkshareLoop); + builder.setInsertionPointToStart(loopNest.body); + auto rhsArrayElement = + hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); + rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); + auto lhsArrayElement = + hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); + combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement, + temporaryLHS, combiner); +} + +void hlfir::genNoAliasAssignment( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs, + hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS, + std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &, + hlfir::Entity, hlfir::Entity)> *combiner) { + if (lhs.isArray()) { + genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop, + temporaryLHS, combiner); + return; + } + rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); + lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); + combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner); +} + std::pair<hlfir::Entity, bool> hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity mold) { diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ca3e1cd46db7d..60dc02474faf6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include 
"mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. -static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. 
static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,83 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", 
&I::genAtanpi}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -410,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - 
/*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -511,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -562,6 +340,10 @@ static constexpr IntrinsicHandler handlers[]{ {"trim_name", asAddr, handleDynamicOptional}, {"errmsg", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"get_team", + &I::genGetTeam, + {{{"level", asValue, handleDynamicOptional}}}, + /*isElemental=*/false}, {"getcwd", &I::genGetCwd, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -569,10 +351,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -740,38 +518,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -997,20 +743,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1021,115 +753,17 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, + {"team_number", + &I::genTeamNumber, + {{{"team", asBox, handleDynamicOptional}}}, + /*isElemental=*/false}, {"this_image", &I::genThisImage, 
{{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", - &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2221,6 +1855,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. 
if (!resultType) return std::nullopt; @@ -3107,159 +2744,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3311,114 +2795,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. 
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = - mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4152,30 +3528,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> 
args) { - assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4508,17 +3860,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -4680,6 +4021,15 @@ IntrinsicLibrary::genFtell(std::optional<mlir::Type> resultType, } } +// GET_TEAM +mlir::Value IntrinsicLibrary::genGetTeam(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::GetTeamOp::create(builder, loc, fir::BoxType::get(resultType), + /*level*/ args[0]); +} + // GETCWD fir::ExtendedValue IntrinsicLibrary::genGetCwd(std::optional<mlir::Type> resultType, @@ -7029,67 +6379,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7707,14 +6996,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8689,90 +7970,14 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - 
mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; +// TEAM_NUMBER +fir::ExtendedValue +IntrinsicLibrary::genTeamNumber(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue> args) { + converter->checkCoarrayEnabled(); + assert(args.size() == 1); + return mif::TeamNumberOp::create(builder, loc, + /*team*/ fir::getBase(args[0])); } // THIS_IMAGE @@ -8790,99 +7995,6 @@ 
IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, 
fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9104,65 +8216,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), 
{mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9294,38 +8347,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9334,226 +8355,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - 
builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = 
mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R2 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_WAIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkWaitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto group = builder.getIntegerAttr(builder.getI32Type(), 0); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); -} - // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, @@ -9968,6 +8769,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) { if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) if 
(!ppcHandler->argLoweringRules.hasDefaultRules()) return &ppcHandler->argLoweringRules; + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + if (!cudaHandler->argLoweringRules.hasDefaultRules()) + return &cudaHandler->argLoweringRules; return nullptr; } diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp index 7e329e357d7b3..5db40aff91878 100644 --- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp +++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp @@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value variable) { hlfir::Entity entity{variable}; - mlir::Type storageElementType = - hlfir::getFortranElementType(retValueBox.getType()); - auto [box, maybeCleanUp] = - hlfir::convertToBox(loc, builder, entity, storageElementType); + mlir::Value box = + hlfir::genVariableBox(loc, builder, entity, entity.getBoxType()); fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box)); - if (maybeCleanUp) - (*maybeCleanUp)(); } void fir::factory::AnyVariableStack::resetFetchPosition( diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp index ac432c74f0147..81488d75d0ab6 100644 --- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp +++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp @@ -289,7 +289,6 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder, fir::factory::genDimInfoFromBox(builder, loc, box, &lbounds, &extents, /*strides=*/nullptr); // Get the type parameters from the box, if needed. - llvm::SmallVector<mlir::Value> assumedTypeParams; if (numTypeParams != 0) { if (auto charType = mlir::dyn_cast<fir::CharacterType>(boxType.unwrapInnerType())) diff --git a/flang/lib/Optimizer/CodeGen/PassDetail.h b/flang/lib/Optimizer/CodeGen/PassDetail.h index f7030131beff9..252da029dc0c8 100644 --- a/flang/lib/Optimizer/CodeGen/PassDetail.h +++ b/flang/lib/Optimizer/CodeGen/PassDetail.h @@ -18,7 +18,7 @@ namespace fir { -#define GEN_PASS_CLASSES +#define GEN_PASS_DECL #include "flang/Optimizer/CodeGen/CGPasses.h.inc" } // namespace fir diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp index 1b1d43c11c707..bafeb32660e6c 100644 --- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp @@ -302,11 +302,15 @@ class DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> { else return mlir::failure(); } + // Extract dummy_arg_no attribute if present + mlir::IntegerAttr dummyArgNoAttr; + if (auto attr = declareOp->getAttrOfType<mlir::IntegerAttr>("dummy_arg_no")) + dummyArgNoAttr = attr; // FIXME: Add FortranAttrs and CudaAttrs auto xDeclOp = fir::cg::XDeclareOp::create( rewriter, loc, declareOp.getType(), declareOp.getMemref(), shapeOpers, shiftOpers, declareOp.getTypeparams(), declareOp.getDummyScope(), - declareOp.getUniqName()); + declareOp.getUniqName(), dummyArgNoAttr); LLVM_DEBUG(llvm::dbgs() << "rewriting " << declareOp << " to " << xDeclOp << '\n'); rewriter.replaceOp(declareOp, xDeclOp.getOperation()->getResults()); diff --git a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp index c6cc2e855ff35..5f68f3dda54a7 100644 --- a/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp +++ b/flang/lib/Optimizer/Dialect/MIF/MIFOps.cpp @@ -15,9 +15,6 @@ #include "mlir/IR/PatternMatch.h" #include "llvm/ADT/SmallVector.h" 
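+// Note: the GET_OP_CLASSES expansion removed here is re-included at the end
+// of this file, after the static parseChangeTeamOpBody/printChangeTeamOpBody
+// helpers, so that the generated ChangeTeamOp parser and printer can see
+// their definitions.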
-#define GET_OP_CLASSES -#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" - //===----------------------------------------------------------------------===// // NumImagesOp //===----------------------------------------------------------------------===// @@ -151,3 +148,60 @@ llvm::LogicalResult mif::CoSumOp::verify() { return emitOpError("`A` shall be of numeric type."); return mlir::success(); } + +//===----------------------------------------------------------------------===// +// ChangeTeamOp +//===----------------------------------------------------------------------===// + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + build(builder, result, team, /*stat*/ mlir::Value{}, /*errmsg*/ mlir::Value{}, + ensureTerminator, attributes); +} + +void mif::ChangeTeamOp::build(mlir::OpBuilder &builder, + mlir::OperationState &result, mlir::Value team, + mlir::Value stat, mlir::Value errmsg, + bool ensureTerminator, + llvm::ArrayRef<mlir::NamedAttribute> attributes) { + std::int32_t argStat = 0, argErrmsg = 0; + result.addOperands(team); + if (stat) { + result.addOperands(stat); + argStat++; + } + if (errmsg) { + result.addOperands(errmsg); + argErrmsg++; + } + + mlir::Region *bodyRegion = result.addRegion(); + bodyRegion->push_back(new mlir::Block{}); + if (ensureTerminator) + ChangeTeamOp::ensureTerminator(*bodyRegion, builder, result.location); + + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr({1, argStat, argErrmsg})); + result.addAttributes(attributes); +} + +static mlir::ParseResult parseChangeTeamOpBody(mlir::OpAsmParser &parser, + mlir::Region &body) { + if (parser.parseRegion(body)) + return mlir::failure(); + + auto &builder = parser.getBuilder(); + mif::ChangeTeamOp::ensureTerminator(body, builder, builder.getUnknownLoc()); + return mlir::success(); +} + +static void printChangeTeamOpBody(mlir::OpAsmPrinter &p, mif::ChangeTeamOp op, + mlir::Region &body) { + p.printRegion(op.getRegion(), /*printEntryBlockArgs=*/true, + /*printBlockTerminators=*/true); +} + +#define GET_OP_CLASSES +#include "flang/Optimizer/Dialect/MIF/MIFOps.cpp.inc" diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 1332dc57fb086..8d6b888789c15 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -261,14 +261,12 @@ updateDeclaredInputTypeWithVolatility(mlir::Type inputType, mlir::Value memref, return std::make_pair(inputType, memref); } -void hlfir::DeclareOp::build(mlir::OpBuilder &builder, - mlir::OperationState &result, mlir::Value memref, - llvm::StringRef uniq_name, mlir::Value shape, - mlir::ValueRange typeparams, - mlir::Value dummy_scope, mlir::Value storage, - std::uint64_t storage_offset, - fir::FortranVariableFlagsAttr fortran_attrs, - cuf::DataAttributeAttr data_attr) { +void hlfir::DeclareOp::build( + mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value memref, + llvm::StringRef uniq_name, mlir::Value shape, mlir::ValueRange typeparams, + mlir::Value dummy_scope, mlir::Value storage, std::uint64_t storage_offset, + fir::FortranVariableFlagsAttr fortran_attrs, + cuf::DataAttributeAttr data_attr, unsigned dummy_arg_no) { auto nameAttr = builder.getStringAttr(uniq_name); mlir::Type inputType = memref.getType(); bool hasExplicitLbs = hasExplicitLowerBounds(shape); @@ -279,9 +277,12 @@ void 
hlfir::DeclareOp::build(mlir::OpBuilder &builder, } auto [hlfirVariableType, firVarType] = getDeclareOutputTypes(inputType, hasExplicitLbs); + mlir::IntegerAttr argNoAttr; + if (dummy_arg_no > 0) + argNoAttr = builder.getUI32IntegerAttr(dummy_arg_no); build(builder, result, {hlfirVariableType, firVarType}, memref, shape, typeparams, dummy_scope, storage, storage_offset, nameAttr, - fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}); + fortran_attrs, data_attr, /*skip_rebox=*/mlir::UnitAttr{}, argNoAttr); } llvm::LogicalResult hlfir::DeclareOp::verify() { diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp index 6a57bf2ae6fec..4c3f37bdead3f 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp @@ -308,7 +308,8 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> { declareOp.getTypeparams(), declareOp.getDummyScope(), /*storage=*/declareOp.getStorage(), /*storage_offset=*/declareOp.getStorageOffset(), - declareOp.getUniqName(), fortranAttrs, dataAttr); + declareOp.getUniqName(), fortranAttrs, dataAttr, + declareOp.getDummyArgNoAttr()); // Propagate other attributes from hlfir.declare to fir.declare. // OpenACC's acc.declare is one example. Right now, the propagation diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp index 86d39749df93d..1fc592c7fe522 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp @@ -107,26 +107,8 @@ class InlineHLFIRAssignConversion mlir::Location loc = assign->getLoc(); fir::FirOpBuilder builder(rewriter, assign.getOperation()); builder.setInsertionPoint(assign); - rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs); - lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs); - mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs); - llvm::SmallVector<mlir::Value> lhsExtents = - hlfir::getIndexExtents(loc, builder, lhsShape); - mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs); - llvm::SmallVector<mlir::Value> rhsExtents = - hlfir::getIndexExtents(loc, builder, rhsShape); - llvm::SmallVector<mlir::Value> extents = - fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents); - hlfir::LoopNest loopNest = - hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true, - flangomp::shouldUseWorkshareLowering(assign)); - builder.setInsertionPointToStart(loopNest.body); - auto rhsArrayElement = - hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices); - rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement); - auto lhsArrayElement = - hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices); - hlfir::AssignOp::create(builder, loc, rhsArrayElement, lhsArrayElement); + hlfir::genNoAliasArrayAssignment( + loc, builder, rhs, lhs, flangomp::shouldUseWorkshareLowering(assign)); rewriter.eraseOp(assign); return mlir::success(); } diff --git a/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt new file mode 100644 index 0000000000000..e05d1456e6dba --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Analysis/CMakeLists.txt @@ -0,0 +1,22 @@ +add_flang_library(FIROpenACCAnalysis + FIROpenACCSupportAnalysis.cpp + + DEPENDS + FIRAnalysis + FIRDialect + FIROpenACCSupport + HLFIRDialect + + LINK_LIBS + FIRAnalysis + FIRDialect + 
FIROpenACCSupport + HLFIRDialect + + MLIR_DEPS + MLIROpenACCDialect + + MLIR_LIBS + MLIROpenACCDialect +) + diff --git a/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp new file mode 100644 index 0000000000000..8cdbe1d5b170e --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp @@ -0,0 +1,40 @@ +//===- FIROpenACCSupportAnalysis.cpp - FIR OpenACCSupport Analysis -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the FIR-specific OpenACCSupport analysis. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" + +using namespace mlir; + +namespace fir { +namespace acc { + +std::string FIROpenACCSupportAnalysis::getVariableName(Value v) { + return fir::acc::getVariableName(v, /*preferDemangledName=*/true); +} + +std::string FIROpenACCSupportAnalysis::getRecipeName(mlir::acc::RecipeKind kind, + Type type, Value var) { + return fir::acc::getRecipeName(kind, type, var); +} + +mlir::InFlightDiagnostic +FIROpenACCSupportAnalysis::emitNYI(Location loc, const Twine &message) { + TODO(loc, message); + // Should be unreachable, but we return an actual diagnostic + // to satisfy the interface. + return mlir::emitError(loc, "not yet implemented: " + message.str()); +} + +} // namespace acc +} // namespace fir diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt index 790b9fdb1589a..16a40254dbfe9 100644 --- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt @@ -1,2 +1,3 @@ +add_subdirectory(Analysis) add_subdirectory(Support) add_subdirectory(Transforms) diff --git a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt index ef67ab1549537..9c6f0ee74f4cf 100644 --- a/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Support/CMakeLists.txt @@ -2,7 +2,9 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_flang_library(FIROpenACCSupport FIROpenACCAttributes.cpp + FIROpenACCOpsInterfaces.cpp FIROpenACCTypeInterfaces.cpp + FIROpenACCUtils.cpp RegisterOpenACCExtensions.cpp DEPENDS diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp new file mode 100644 index 0000000000000..c1734be5185f4 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -0,0 +1,62 @@ +//===-- FIROpenACCOpsInterfaces.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of external operation interfaces for FIR. 
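+// The PartialEntityAccessModel specializations below let OpenACC analyses
+// walk from a component or element access (fir.coordinate_of, fir.array_coor,
+// hlfir.designate) back to the accessed base entity, and treat declare ops
+// that carry a storage operand as partial views of that storage.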
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" + +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" + +namespace fir::acc { + +template <> +mlir::Value PartialEntityAccessModel<fir::ArrayCoorOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::ArrayCoorOp>(op).getMemref(); +} + +template <> +mlir::Value PartialEntityAccessModel<fir::CoordinateOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::CoordinateOp>(op).getRef(); +} + +template <> +mlir::Value PartialEntityAccessModel<hlfir::DesignateOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<hlfir::DesignateOp>(op).getMemref(); +} + +mlir::Value PartialEntityAccessModel<fir::DeclareOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<fir::DeclareOp>(op).getStorage(); +} + +bool PartialEntityAccessModel<fir::DeclareOp>::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +mlir::Value PartialEntityAccessModel<hlfir::DeclareOp>::getBaseEntity( + mlir::Operation *op) const { + return mlir::cast<hlfir::DeclareOp>(op).getStorage(); +} + +bool PartialEntityAccessModel<hlfir::DeclareOp>::isCompleteView( + mlir::Operation *op) const { + // Return false (partial view) only if storage is present + // Return true (complete view) if storage is absent + return !getBaseEntity(op); +} + +} // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp index ed9e41c743754..ae0f5fb8197fa 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.cpp @@ -193,6 +193,28 @@ OpenACCMappableModel<fir::PointerType>::getOffsetInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const; +template <typename Ty> +bool OpenACCMappableModel<Ty>::hasUnknownDimensions(mlir::Type type) const { + assert(fir::isa_ref_type(type) && "expected FIR reference type"); + return fir::hasDynamicSize(fir::unwrapRefType(type)); +} + +template bool OpenACCMappableModel<fir::ReferenceType>::hasUnknownDimensions( + mlir::Type type) const; + +template bool OpenACCMappableModel<fir::HeapType>::hasUnknownDimensions( + mlir::Type type) const; + +template bool OpenACCMappableModel<fir::PointerType>::hasUnknownDimensions( + mlir::Type type) const; + +template <> +bool OpenACCMappableModel<fir::BaseBoxType>::hasUnknownDimensions( + mlir::Type type) const { + // Descriptor-based entities have dimensions encoded. + return false; +} + static llvm::SmallVector<mlir::Value> generateSeqTyAccBounds(fir::SequenceType seqType, mlir::Value var, mlir::OpBuilder &builder) { diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp new file mode 100644 index 0000000000000..e5b8123305c62 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCUtils.cpp @@ -0,0 +1,269 @@ +//===- FIROpenACCUtils.cpp - FIR OpenACC Utilities ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utility functions for FIR OpenACC support. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/Support/InternalNames.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Matchers.h" +#include "mlir/Interfaces/ViewLikeInterface.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; + +namespace fir { +namespace acc { + +std::string getVariableName(Value v, bool preferDemangledName) { + std::string srcName; + std::string prefix; + llvm::SmallVector<std::string, 4> arrayIndices; + bool iterate = true; + mlir::Operation *defOp; + + // For integer constants, no need to further iterate - print their value + // immediately. + if (v.getDefiningOp()) { + IntegerAttr::ValueType val; + if (matchPattern(v.getDefiningOp(), m_ConstantInt(&val))) { + llvm::raw_string_ostream os(prefix); + val.print(os, /*isSigned=*/true); + return prefix; + } + } + + while (v && (defOp = v.getDefiningOp()) && iterate) { + iterate = + llvm::TypeSwitch<mlir::Operation *, bool>(defOp) + .Case<mlir::ViewLikeOpInterface>( + [&v](mlir::ViewLikeOpInterface op) { + v = op.getViewSource(); + return true; + }) + .Case<fir::ReboxOp>([&v](fir::ReboxOp op) { + v = op.getBox(); + return true; + }) + .Case<fir::EmboxOp>([&v](fir::EmboxOp op) { + v = op.getMemref(); + return true; + }) + .Case<fir::ConvertOp>([&v](fir::ConvertOp op) { + v = op.getValue(); + return true; + }) + .Case<fir::LoadOp>([&v](fir::LoadOp op) { + v = op.getMemref(); + return true; + }) + .Case<fir::BoxAddrOp>([&v](fir::BoxAddrOp op) { + // The box holds the name of the variable. + v = op.getVal(); + return true; + }) + .Case<fir::AddrOfOp>([&](fir::AddrOfOp op) { + // Only use address_of symbol if mangled name is preferred + if (!preferDemangledName) { + auto symRef = op.getSymbol(); + srcName = symRef.getLeafReference().getValue().str(); + } + return false; + }) + .Case<fir::ArrayCoorOp>([&](fir::ArrayCoorOp op) { + v = op.getMemref(); + for (auto coor : op.getIndices()) { + auto idxName = getVariableName(coor, preferDemangledName); + arrayIndices.push_back(idxName.empty() ? "?" : idxName); + } + return true; + }) + .Case<fir::CoordinateOp>([&](fir::CoordinateOp op) { + std::optional<llvm::ArrayRef<int32_t>> fieldIndices = + op.getFieldIndices(); + if (fieldIndices && fieldIndices->size() > 0 && + (*fieldIndices)[0] != fir::CoordinateOp::kDynamicIndex) { + int fieldId = (*fieldIndices)[0]; + mlir::Type baseType = + fir::getFortranElementType(op.getRef().getType()); + if (auto recType = llvm::dyn_cast<fir::RecordType>(baseType)) { + srcName = recType.getTypeList()[fieldId].first; + } + } + if (!srcName.empty()) { + // If the field name is known - attempt to continue building + // name by looking at its parents. 
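+            // For example, a fir.coordinate_of into component "x" of a
+            // variable named "a" composes to the Fortran designator "a%x".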
+ prefix = + getVariableName(op.getRef(), preferDemangledName) + "%"; + } + return false; + }) + .Case<hlfir::DesignateOp>([&](hlfir::DesignateOp op) { + if (op.getComponent()) { + srcName = op.getComponent().value().str(); + prefix = + getVariableName(op.getMemref(), preferDemangledName) + "%"; + return false; + } + for (auto coor : op.getIndices()) { + auto idxName = getVariableName(coor, preferDemangledName); + arrayIndices.push_back(idxName.empty() ? "?" : idxName); + } + v = op.getMemref(); + return true; + }) + .Case<fir::DeclareOp, hlfir::DeclareOp>([&](auto op) { + srcName = op.getUniqName().str(); + return false; + }) + .Case<fir::AllocaOp>([&](fir::AllocaOp op) { + if (preferDemangledName) { + // Prefer demangled name (bindc_name over uniq_name) + srcName = op.getBindcName() ? *op.getBindcName() + : op.getUniqName() ? *op.getUniqName() + : ""; + } else { + // Prefer mangled name (uniq_name over bindc_name) + srcName = op.getUniqName() ? *op.getUniqName() + : op.getBindcName() ? *op.getBindcName() + : ""; + } + return false; + }) + .Default([](mlir::Operation *) { return false; }); + } + + // Fallback to the default implementation. + if (srcName.empty()) + return acc::getVariableName(v); + + // Build array index suffix if present + std::string suffix; + if (!arrayIndices.empty()) { + llvm::raw_string_ostream os(suffix); + os << "("; + llvm::interleaveComma(arrayIndices, os); + os << ")"; + } + + // Names from FIR operations may be mangled. + // When the demangled name is requested - demangle it. + if (preferDemangledName) { + auto [kind, deconstructed] = fir::NameUniquer::deconstruct(srcName); + if (kind != fir::NameUniquer::NameKind::NOT_UNIQUED) + return prefix + deconstructed.name + suffix; + } + + return prefix + srcName + suffix; +} + +bool areAllBoundsConstant(llvm::ArrayRef<Value> bounds) { + for (auto bound : bounds) { + auto dataBound = + mlir::dyn_cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + if (!dataBound) + return false; + + // Check if this bound has constant values + bool hasConstant = false; + if (dataBound.getLowerbound() && dataBound.getUpperbound()) + hasConstant = + fir::getIntIfConstant(dataBound.getLowerbound()).has_value() && + fir::getIntIfConstant(dataBound.getUpperbound()).has_value(); + else if (dataBound.getExtent()) + hasConstant = fir::getIntIfConstant(dataBound.getExtent()).has_value(); + + if (!hasConstant) + return false; + } + return true; +} + +static std::string getBoundsString(llvm::ArrayRef<Value> bounds) { + if (bounds.empty()) + return ""; + + std::string boundStr; + llvm::raw_string_ostream os(boundStr); + os << "_section_"; + + llvm::interleave( + bounds, + [&](Value bound) { + auto boundsOp = + mlir::cast<mlir::acc::DataBoundsOp>(bound.getDefiningOp()); + if (boundsOp.getLowerbound() && + fir::getIntIfConstant(boundsOp.getLowerbound()) && + boundsOp.getUpperbound() && + fir::getIntIfConstant(boundsOp.getUpperbound())) { + os << "lb" << *fir::getIntIfConstant(boundsOp.getLowerbound()) + << ".ub" << *fir::getIntIfConstant(boundsOp.getUpperbound()); + } else if (boundsOp.getExtent() && + fir::getIntIfConstant(boundsOp.getExtent())) { + os << "ext" << *fir::getIntIfConstant(boundsOp.getExtent()); + } else { + os << "?"; + } + }, + [&] { os << "x"; }); + + return os.str(); +} + +std::string getRecipeName(mlir::acc::RecipeKind kind, Type type, Value var, + llvm::ArrayRef<Value> bounds, + mlir::acc::ReductionOperator reductionOp) { + assert(fir::isa_fir_type(type) && "getRecipeName expects a FIR type"); + + // Build the complete prefix 
with all components before calling + // getTypeAsString + std::string prefixStr; + llvm::raw_string_ostream prefixOS(prefixStr); + + switch (kind) { + case mlir::acc::RecipeKind::private_recipe: + prefixOS << "privatization"; + // Private recipes do not currently include bounds in the name + // TODO: They should include them - but lowering tests would need to + // be updated. + break; + case mlir::acc::RecipeKind::firstprivate_recipe: + prefixOS << "firstprivatization"; + // Add bounds to the prefix if applicable (only for firstprivate) + if (!bounds.empty() && areAllBoundsConstant(bounds)) + prefixOS << getBoundsString(bounds); + break; + case mlir::acc::RecipeKind::reduction_recipe: + prefixOS << "reduction"; + // Embed the reduction operator in the prefix + if (reductionOp != mlir::acc::ReductionOperator::AccNone) + prefixOS << "_" + << mlir::acc::stringifyReductionOperator(reductionOp).str(); + // Add bounds to the prefix if applicable (only for reduction) + if (!bounds.empty() && areAllBoundsConstant(bounds)) + prefixOS << getBoundsString(bounds); + break; + } + + auto kindMap = var && var.getDefiningOp() + ? fir::getKindMapping(var.getDefiningOp()) + : fir::KindMapping(type.getContext()); + return fir::getTypeAsString(type, kindMap, prefixOS.str()); +} + +} // namespace acc +} // namespace fir diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index 717bf344e40aa..d71c40dfac03c 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -11,8 +11,13 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h" + #include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h" #include "flang/Optimizer/OpenACC/Support/FIROpenACCTypeInterfaces.h" namespace fir::acc { @@ -37,7 +42,24 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { fir::LLVMPointerType::attachInterface< OpenACCPointerLikeModel<fir::LLVMPointerType>>(*ctx); + + fir::ArrayCoorOp::attachInterface< + PartialEntityAccessModel<fir::ArrayCoorOp>>(*ctx); + fir::CoordinateOp::attachInterface< + PartialEntityAccessModel<fir::CoordinateOp>>(*ctx); + fir::DeclareOp::attachInterface<PartialEntityAccessModel<fir::DeclareOp>>( + *ctx); }); + + // Register HLFIR operation interfaces + registry.addExtension( + +[](mlir::MLIRContext *ctx, hlfir::hlfirDialect *dialect) { + hlfir::DesignateOp::attachInterface< + PartialEntityAccessModel<hlfir::DesignateOp>>(*ctx); + hlfir::DeclareOp::attachInterface< + PartialEntityAccessModel<hlfir::DeclareOp>>(*ctx); + }); + registerAttrsExtensions(registry); } diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp new file mode 100644 index 0000000000000..679b29bb462b5 --- /dev/null +++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCInitializeFIRAnalyses.cpp @@ -0,0 +1,56 @@ +//===- ACCInitializeFIRAnalyses.cpp - Initialize FIR analyses ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass initializes analyses that can be reused by subsequent OpenACC +// passes in the pipeline. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Analysis/AliasAnalysis.h" +#include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h" +#include "flang/Optimizer/OpenACC/Passes.h" +#include "mlir/Analysis/AliasAnalysis.h" +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" + +namespace fir { +namespace acc { +#define GEN_PASS_DEF_ACCINITIALIZEFIRANALYSES +#include "flang/Optimizer/OpenACC/Passes.h.inc" +} // namespace acc +} // namespace fir + +#define DEBUG_TYPE "acc-initialize-fir-analyses" + +namespace { + +/// This pass initializes analyses for reuse by subsequent OpenACC passes in the +/// pipeline. It creates and caches analyses like OpenACCSupport so they can be +/// retrieved by later passes using getAnalysis() or getCachedAnalysis(). +class ACCInitializeFIRAnalysesPass + : public fir::acc::impl::ACCInitializeFIRAnalysesBase< + ACCInitializeFIRAnalysesPass> { +public: + void runOnOperation() override { + // Initialize OpenACCSupport with FIR-specific implementation. + auto &openACCSupport = getAnalysis<mlir::acc::OpenACCSupport>(); + openACCSupport.setImplementation(fir::acc::FIROpenACCSupportAnalysis()); + + // Initialize AliasAnalysis with FIR-specific implementation. + auto &aliasAnalysis = getAnalysis<mlir::AliasAnalysis>(); + aliasAnalysis.addAnalysisImplementation(fir::AliasAnalysis()); + + // Mark all analyses as preserved since this pass only initializes them + markAllAnalysesPreserved(); + } +}; + +} // namespace + +std::unique_ptr<mlir::Pass> fir::acc::createACCInitializeFIRAnalysesPass() { + return std::make_unique<ACCInitializeFIRAnalysesPass>(); +} diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt index ed177baf52bea..35aa87d6f1c80 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt @@ -1,11 +1,15 @@ add_flang_library(FIROpenACCTransforms + ACCInitializeFIRAnalyses.cpp ACCRecipeBufferization.cpp DEPENDS FIROpenACCPassesIncGen LINK_LIBS + FIRAnalysis FIRDialect + FIROpenACCAnalysis + HLFIRDialect MLIR_LIBS MLIRIR diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 1229018bd9b3e..9aad8cddc60a1 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -692,9 +692,6 @@ class DoConcurrentConversion if (!targetShapeCreationInfo.isShapedValue()) return {}; - llvm::SmallVector<mlir::Value> extentOperands; - llvm::SmallVector<mlir::Value> startIndexOperands; - if (targetShapeCreationInfo.isShapeShiftedValue()) { llvm::SmallVector<mlir::Value> shapeShiftOperands; diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index bd07d7fe01b85..8382a481ee875 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -477,58 +477,6 @@ class MapInfoFinalizationPass return false; } - mlir::omp::MapInfoOp genBoxcharMemberMap(mlir::omp::MapInfoOp op, - fir::FirOpBuilder &builder) 
{ - if (!op.getMembers().empty()) - return op; - mlir::Location loc = op.getVarPtr().getLoc(); - mlir::Value boxChar = op.getVarPtr(); - - if (mlir::isa<fir::ReferenceType>(op.getVarPtr().getType())) - boxChar = fir::LoadOp::create(builder, loc, op.getVarPtr()); - - fir::BoxCharType boxCharType = - mlir::dyn_cast<fir::BoxCharType>(boxChar.getType()); - mlir::Value boxAddr = fir::BoxOffsetOp::create( - builder, loc, op.getVarPtr(), fir::BoxFieldAttr::base_addr); - - mlir::ArrayAttr newMembersAttr; - llvm::SmallVector<llvm::SmallVector<int64_t>> memberIdx = {{0}}; - newMembersAttr = builder.create2DI64ArrayAttr(memberIdx); - - mlir::Value varPtr = op.getVarPtr(); - mlir::omp::MapInfoOp memberMapInfoOp = mlir::omp::MapInfoOp::create( - builder, op.getLoc(), varPtr.getType(), varPtr, - mlir::TypeAttr::get(boxCharType.getEleTy()), - builder.getAttr<mlir::omp::ClauseMapFlagsAttr>( - mlir::omp::ClauseMapFlags::to | - mlir::omp::ClauseMapFlags::implicit), - builder.getAttr<mlir::omp::VariableCaptureKindAttr>( - mlir::omp::VariableCaptureKind::ByRef), - /*varPtrPtr=*/boxAddr, - /*members=*/llvm::SmallVector<mlir::Value>{}, - /*member_index=*/mlir::ArrayAttr{}, - /*bounds=*/op.getBounds(), - /*mapperId=*/mlir::FlatSymbolRefAttr(), /*name=*/op.getNameAttr(), - builder.getBoolAttr(false)); - - mlir::omp::MapInfoOp newMapInfoOp = mlir::omp::MapInfoOp::create( - builder, op.getLoc(), op.getResult().getType(), varPtr, - mlir::TypeAttr::get( - llvm::cast<mlir::omp::PointerLikeType>(varPtr.getType()) - .getElementType()), - op.getMapTypeAttr(), op.getMapCaptureTypeAttr(), - /*varPtrPtr=*/mlir::Value{}, - /*members=*/llvm::SmallVector<mlir::Value>{memberMapInfoOp}, - /*member_index=*/newMembersAttr, - /*bounds=*/llvm::SmallVector<mlir::Value>{}, - /*mapperId=*/mlir::FlatSymbolRefAttr(), op.getNameAttr(), - /*partial_map=*/builder.getBoolAttr(false)); - op.replaceAllUsesWith(newMapInfoOp.getResult()); - op->erase(); - return newMapInfoOp; - } - // Expand mappings of type(C_PTR) to map their `__address` field explicitly // as a single pointer-sized member (USM-gated at callsite). This helps in // USM scenarios to ensure the pointer-sized mapping is used. @@ -956,6 +904,14 @@ class MapInfoFinalizationPass baseAddr.erase(); } + static bool hasADescriptor(mlir::Operation *varOp, mlir::Type varType) { + if (fir::isTypeWithDescriptor(varType) || + mlir::isa<fir::BoxCharType>(varType) || + mlir::isa_and_present<fir::BoxAddrOp>(varOp)) + return true; + return false; + } + // This pass executes on omp::MapInfoOp's containing descriptor based types // (allocatables, pointers, assumed shape etc.) and expanding them into // multiple omp::MapInfoOp's for each pointer member contained within the @@ -987,38 +943,6 @@ class MapInfoFinalizationPass localBoxAllocas.clear(); deferrableDesc.clear(); - // First, walk `omp.map.info` ops to see if any of them have varPtrs - // with an underlying type of fir.char<k, ?>, i.e a character - // with dynamic length. If so, check if they need bounds added. 
- func->walk([&](mlir::omp::MapInfoOp op) { - if (!op.getBounds().empty()) - return; - - mlir::Value varPtr = op.getVarPtr(); - mlir::Type underlyingVarType = fir::unwrapRefType(varPtr.getType()); - - if (!fir::characterWithDynamicLen(underlyingVarType)) - return; - - fir::factory::AddrAndBoundsInfo info = - fir::factory::getDataOperandBaseAddr( - builder, varPtr, /*isOptional=*/false, varPtr.getLoc()); - - fir::ExtendedValue extendedValue = - hlfir::translateToExtendedValue(varPtr.getLoc(), builder, - hlfir::Entity{info.addr}, - /*continguousHint=*/true) - .first; - builder.setInsertionPoint(op); - llvm::SmallVector<mlir::Value> boundsOps = - fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp, - mlir::omp::MapBoundsType>( - builder, info, extendedValue, - /*dataExvIsAssumedSize=*/false, varPtr.getLoc()); - - op.getBoundsMutable().append(boundsOps); - }); - // Next, walk `omp.map.info` ops to see if any record members should be // implicitly mapped. func->walk([&](mlir::omp::MapInfoOp op) { @@ -1209,36 +1133,6 @@ class MapInfoFinalizationPass return mlir::WalkResult::advance(); }); - func->walk([&](mlir::omp::MapInfoOp op) { - if (!op.getMembers().empty()) - return; - - if (!mlir::isa<fir::BoxCharType>(fir::unwrapRefType(op.getVarType()))) - return; - - // POSSIBLE_HACK_ALERT: If the boxchar has been implicitly mapped then - // it is likely that the underlying pointer to the data - // (!fir.ref<fir.char<k,?>>) has already been mapped. So, skip such - // boxchars. We are primarily interested in boxchars that were mapped - // by passes such as MapsForPrivatizedSymbols that map boxchars that - // are privatized. At present, such boxchar maps are not marked - // implicit. Should they be? I don't know. If they should be then - // we need to change this check for early return OR live with - // over-mapping. - bool hasImplicitMap = - (op.getMapType() & mlir::omp::ClauseMapFlags::implicit) == - mlir::omp::ClauseMapFlags::implicit; - if (hasImplicitMap) - return; - - assert(llvm::hasSingleElement(op->getUsers()) && - "OMPMapInfoFinalization currently only supports single users " - "of a MapInfoOp"); - - builder.setInsertionPoint(op); - genBoxcharMemberMap(op, builder); - }); - // Expand type(C_PTR) only when unified_shared_memory is required, // to ensure device-visible pointer size/behavior in USM scenarios // without changing default expectations elsewhere. 
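The hunk below funnels this decision through the new hasADescriptor helper introduced earlier in this file. As a minimal sketch of the type-side tests that helper combines (the fir::BoxAddrOp defining-op check is omitted, and typeNeedsDescriptorExpansion is an illustrative name, not part of the patch):

#include "flang/Optimizer/Dialect/FIRType.h"

// Box types cover allocatables, pointers, and assumed-shape entities;
// BoxCharType covers character descriptors that carry a length.
static bool typeNeedsDescriptorExpansion(mlir::Type varType) {
  return fir::isTypeWithDescriptor(varType) ||
         mlir::isa<fir::BoxCharType>(varType);
}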
@@ -1266,9 +1160,8 @@ class MapInfoFinalizationPass
           "OMPMapInfoFinalization currently only supports single users "
           "of a MapInfoOp");
 
-      if (fir::isTypeWithDescriptor(op.getVarType()) ||
-          mlir::isa_and_present<fir::BoxAddrOp>(
-              op.getVarPtr().getDefiningOp())) {
+      if (hasADescriptor(op.getVarPtr().getDefiningOp(),
+                         fir::unwrapRefType(op.getVarType()))) {
         builder.setInsertionPoint(op);
         mlir::Operation *targetUser = getFirstTargetUser(op);
         assert(targetUser && "expected user of map operation was not found");
diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
index 0972861b8450a..6404e1892ca5d 100644
--- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp
@@ -104,21 +104,31 @@ class MapsForPrivatizedSymbolsPass
       llvm::SmallVector<mlir::Value> boundsOps;
       if (needsBoundsOps(varPtr))
         genBoundsOps(builder, varPtr, boundsOps);
+      mlir::Type varType = varPtr.getType();
       mlir::omp::VariableCaptureKind captureKind =
           mlir::omp::VariableCaptureKind::ByRef;
-      if (fir::isa_trivial(fir::unwrapRefType(varPtr.getType())) ||
-          fir::isa_char(fir::unwrapRefType(varPtr.getType()))) {
-        if (canPassByValue(fir::unwrapRefType(varPtr.getType()))) {
+      if (fir::isa_trivial(fir::unwrapRefType(varType)) ||
+          fir::isa_char(fir::unwrapRefType(varType))) {
+        if (canPassByValue(fir::unwrapRefType(varType))) {
           captureKind = mlir::omp::VariableCaptureKind::ByCopy;
         }
       }
+      // Use `tofrom` when the mapped entity is not of a trivial type; in all
+      // likelihood it is then a descriptor.
+      mlir::omp::ClauseMapFlags mapFlag;
+      if (fir::isa_trivial(fir::unwrapRefType(varType)) ||
+          fir::isa_char(fir::unwrapRefType(varType)))
+        mapFlag = mlir::omp::ClauseMapFlags::to;
+      else
+        mapFlag = mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::from;
+
       return omp::MapInfoOp::create(
-          builder, loc, varPtr.getType(), varPtr,
-          TypeAttr::get(llvm::cast<omp::PointerLikeType>(varPtr.getType())
-                            .getElementType()),
-          builder.getAttr<omp::ClauseMapFlagsAttr>(omp::ClauseMapFlags::to),
+          builder, loc, varType, varPtr,
+          TypeAttr::get(
+              llvm::cast<omp::PointerLikeType>(varType).getElementType()),
+          builder.getAttr<omp::ClauseMapFlagsAttr>(mapFlag),
          builder.getAttr<omp::VariableCaptureKindAttr>(captureKind),
          /*varPtrPtr=*/Value{},
          /*members=*/SmallVector<Value>{},
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index e006d2e878fd8..00633b4104b6c 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -228,24 +228,11 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
     }
   }
 
-  // FIXME: There may be cases where an argument is processed a bit before
-  // DeclareOp is generated. In that case, DeclareOp may point to an
-  // intermediate op and not to BlockArgument.
-  // Moreover, with MLIR inlining we cannot use the BlockArgument
-  // position to identify the original number of the dummy argument.
-  // If we want to keep running AddDebugInfoPass late, the dummy argument
-  // position in the argument list has to be expressed in FIR (e.g. as a
-  // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has
-  // a dummy_scope operand).
+  // Get the dummy argument position from the explicit attribute.
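+  // Lowering records this position as a constant attribute on the
+  // [hl]fir.declare/fircg.ext_declare operation, so it stays correct after
+  // MLIR inlining, where the BlockArgument position used previously did not.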
unsigned argNo = 0; if (declOp.getDummyScope()) { - if (auto arg = llvm::dyn_cast<mlir::BlockArgument>(declOp.getMemref())) { - // Check if it is the BlockArgument of the function's entry block. - if (auto funcLikeOp = - declOp->getParentOfType<mlir::FunctionOpInterface>()) - if (arg.getOwner() == &funcLikeOp.front()) - argNo = arg.getArgNumber() + 1; - } + if (auto argNoOpt = declOp.getDummyArgNo()) + argNo = *argNoOpt; } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index 09126e047d382..a64494510d847 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -41,8 +41,7 @@ namespace { static bool isAssumedSize(mlir::ValueRange shape) { if (shape.size() != 1) return false; - std::optional<std::int64_t> val = fir::getIntIfConstant(shape[0]); - if (val && *val == -1) + if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(shape[0].getDefiningOp())) return true; return false; } diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8d00272b09f42..5b1b0a2f6feab 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -static int computeWidth(mlir::Location loc, mlir::Type type, - fir::KindMapping &kindMap) { - auto eleTy = fir::unwrapSequenceType(type); - if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) - return t.getWidth() / 8; - if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) - return t.getWidth() / 8; - if (eleTy.isInteger(1)) - return 1; - if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) - return kindMap.getLogicalBitsize(t.getFKind()) / 8; - if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { - int elemSize = - mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; - return 2 * elemSize; - } - if (auto t{mlir::dyn_cast_or_null<fir::CharacterType>(eleTy)}) - return kindMap.getCharacterBitsize(t.getFKind()) / 8; - mlir::emitError(loc, "unsupported type"); - return 0; -} - struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { using OpRewritePattern::OpRewritePattern; @@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { mlir::Value bytes; fir::KindMapping kindMap{fir::getKindMapping(mod)}; if (fir::isa_trivial(op.getInType())) { - int width = computeWidth(loc, op.getInType(), kindMap); + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); bytes = builder.createIntegerConstant(loc, builder.getIndexType(), width); } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>( @@ -330,7 +308,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); size = dl->getTypeSizeInBits(structTy) / 8; } else { - size = computeWidth(loc, seqTy.getEleTy(), kindMap); + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); } mlir::Value width = builder.createIntegerConstant(loc, builder.getIndexType(), size); @@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion typeConverter->convertType(fir::unwrapSequenceType(dstTy)); width = dl->getTypeSizeInBits(structTy) / 8; } else { - width = 
computeWidth(loc, dstTy, kindMap); + width = cuf::computeElementByteSize(loc, dstTy, kindMap); } mlir::Value widthValue = mlir::arith::ConstantOp::create( rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index e1e6125fc348b..8019c399f3779 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -718,6 +718,31 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, return convertRecordType(recTy, fileAttr, scope, declOp); } else if (auto tupleTy = mlir::dyn_cast_if_present<mlir::TupleType>(Ty)) { return convertTupleType(tupleTy, fileAttr, scope, declOp); + } else if (mlir::isa<mlir::FunctionType>(Ty)) { + // Handle function types - these represent procedure pointers after the + // BoxedProcedure pass has run and unwrapped the fir.boxproc type, as well + // as dummy procedures (which are represented as function types in FIR) + llvm::SmallVector<mlir::LLVM::DITypeAttr> types; + + auto funcTy = mlir::cast<mlir::FunctionType>(Ty); + // Add return type (or void if no return type) + if (funcTy.getNumResults() == 0) + types.push_back(mlir::LLVM::DINullTypeAttr::get(context)); + else + types.push_back( + convertType(funcTy.getResult(0), fileAttr, scope, declOp)); + + for (mlir::Type paramTy : funcTy.getInputs()) + types.push_back(convertType(paramTy, fileAttr, scope, declOp)); + + auto subroutineTy = mlir::LLVM::DISubroutineTypeAttr::get( + context, /*callingConvention=*/0, types); + + return mlir::LLVM::DIDerivedTypeAttr::get( + context, llvm::dwarf::DW_TAG_pointer_type, + mlir::StringAttr::get(context, ""), subroutineTy, + /*sizeInBits=*/ptrSize * 8, /*alignInBits=*/0, /*offset=*/0, + /*optional<address space>=*/std::nullopt, /*extra data=*/nullptr); } else if (auto refTy = mlir::dyn_cast_if_present<fir::ReferenceType>(Ty)) { auto elTy = refTy.getEleTy(); return convertPointerLikeType(elTy, fileAttr, scope, declOp, diff --git a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp index 206cb9be0574f..0d3d2f6c144ff 100644 --- a/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/MIFOpConversion.cpp @@ -67,6 +67,13 @@ genErrmsgPRIF(fir::FirOpBuilder &builder, mlir::Location loc, return {errMsg, errMsgAlloc}; } +static mlir::Value genStatPRIF(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value stat) { + if (!stat) + return fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + return stat; +} + /// Convert mif.init operation to runtime call of 'prif_init' struct MIFInitOpConversion : public mlir::OpRewritePattern<mif::InitOp> { using OpRewritePattern::OpRewritePattern; @@ -210,9 +217,7 @@ struct MIFSyncAllOpConversion : public mlir::OpRewritePattern<mif::SyncAllOp> { auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -261,9 +266,7 @@ struct MIFSyncImagesOpConversion } auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = 
op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, imageSet, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -293,9 +296,7 @@ struct MIFSyncMemoryOpConversion auto [errmsgArg, errmsgAllocArg] = genErrmsgPRIF(builder, loc, op.getErrmsg()); - mlir::Value stat = op.getStat(); - if (!stat) - stat = fir::AbsentOp::create(builder, loc, getPRIFStatType(builder)); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); @@ -303,6 +304,37 @@ struct MIFSyncMemoryOpConversion } }; +/// Convert mif.sync_team operation to runtime call of 'prif_sync_team' +struct MIFSyncTeamOpConversion + : public mlir::OpRewritePattern<mif::SyncTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::SyncTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("sync_team"), ftype); + + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + rewriter.replaceOpWithNewOp<fir::CallOp>(op, funcOp, args); + return mlir::success(); + } +}; + /// Generate call to collective subroutines except co_reduce /// A must be lowered as a box static fir::CallOp genCollectiveSubroutine(fir::FirOpBuilder &builder, @@ -432,6 +464,208 @@ struct MIFCoSumOpConversion : public mlir::OpRewritePattern<mif::CoSumOp> { } }; +/// Convert mif.form_team operation to runtime call of 'prif_form_team' +struct MIFFormTeamOpConversion + : public mlir::OpRewritePattern<mif::FormTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::FormTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ + {builder.getRefType(builder.getI64Type()), boxTy, + builder.getRefType(builder.getI32Type()), getPRIFStatType(builder), + errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("form_team"), ftype); + + mlir::Type i64Ty = builder.getI64Type(); + mlir::Value teamNumber = builder.createTemporary(loc, i64Ty); + mlir::Value t = + (op.getTeamNumber().getType() == i64Ty) + ? 
op.getTeamNumber() + : fir::ConvertOp::create(builder, loc, i64Ty, op.getTeamNumber()); + fir::StoreOp::create(builder, loc, t, teamNumber); + + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value newIndex; + if (op.getNewIndex()) { + newIndex = builder.createTemporary(loc, i32Ty); + mlir::Value ni = + (op.getNewIndex().getType() == i32Ty) + ? op.getNewIndex() + : fir::ConvertOp::create(builder, loc, i32Ty, op.getNewIndex()); + fir::StoreOp::create(builder, loc, ni, newIndex); + } else + newIndex = fir::AbsentOp::create(builder, loc, builder.getRefType(i32Ty)); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, teamNumber, op.getTeamVar(), newIndex, stat, + errmsgArg, errmsgAllocArg); + fir::CallOp callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.change_team operation to runtime call of 'prif_change_team' +struct MIFChangeTeamOpConversion + : public mlir::OpRewritePattern<mif::ChangeTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::ChangeTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + builder.setInsertionPoint(op); + + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {boxTy, getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("change_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, op.getTeam(), stat, errmsgArg, errmsgAllocArg); + fir::CallOp::create(builder, loc, funcOp, args); + + mlir::Operation *changeOp = op.getOperation(); + auto &bodyRegion = op.getRegion(); + mlir::Block &bodyBlock = bodyRegion.front(); + + rewriter.inlineBlockBefore(&bodyBlock, changeOp); + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +/// Convert mif.end_team operation to runtime call of 'prif_end_team' +struct MIFEndTeamOpConversion : public mlir::OpRewritePattern<mif::EndTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::EndTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type errmsgTy = getPRIFErrmsgType(builder); + mlir::FunctionType ftype = mlir::FunctionType::get( + builder.getContext(), + /*inputs*/ {getPRIFStatType(builder), errmsgTy, errmsgTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("end_team"), ftype); + + mlir::Value stat = genStatPRIF(builder, loc, op.getStat()); + auto [errmsgArg, errmsgAllocArg] = + genErrmsgPRIF(builder, loc, op.getErrmsg()); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, ftype, stat, errmsgArg, errmsgAllocArg); + fir::CallOp 
callOp = fir::CallOp::create(builder, loc, funcOp, args); + rewriter.replaceOp(op, callOp); + return mlir::success(); + } +}; + +/// Convert mif.get_team operation to runtime call of 'prif_get_team' +struct MIFGetTeamOpConversion : public mlir::OpRewritePattern<mif::GetTeamOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::GetTeamOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type lvlTy = builder.getRefType(builder.getI32Type()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {lvlTy, boxTy}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("get_team"), ftype); + + mlir::Value level = op.getLevel(); + if (!level) + level = fir::AbsentOp::create(builder, loc, lvlTy); + else { + mlir::Value cst = op.getLevel(); + mlir::Type i32Ty = builder.getI32Type(); + level = builder.createTemporary(loc, i32Ty); + if (cst.getType() != i32Ty) + cst = builder.createConvert(loc, i32Ty, cst); + fir::StoreOp::create(builder, loc, cst, level); + } + mlir::Type resultType = op.getResult().getType(); + mlir::Type baseTy = fir::unwrapRefType(resultType); + mlir::Value team = builder.createTemporary(loc, baseTy); + fir::EmboxOp box = fir::EmboxOp::create(builder, loc, resultType, team); + + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, level, box); + fir::CallOp::create(builder, loc, funcOp, args); + + rewriter.replaceOp(op, box); + return mlir::success(); + } +}; + +/// Convert mif.team_number operation to runtime call of 'prif_team_number' +struct MIFTeamNumberOpConversion + : public mlir::OpRewritePattern<mif::TeamNumberOp> { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(mif::TeamNumberOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->template getParentOfType<mlir::ModuleOp>(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + mlir::Type i64Ty = builder.getI64Type(); + mlir::Type boxTy = fir::BoxType::get(builder.getNoneType()); + mlir::FunctionType ftype = + mlir::FunctionType::get(builder.getContext(), + /*inputs*/ {boxTy, builder.getRefType(i64Ty)}, + /*results*/ {}); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, getPRIFProcName("team_number"), ftype); + + mlir::Value team = op.getTeam(); + if (!team) + team = fir::AbsentOp::create(builder, loc, boxTy); + + mlir::Value result = builder.createTemporary(loc, i64Ty); + llvm::SmallVector<mlir::Value> args = + fir::runtime::createArguments(builder, loc, ftype, team, result); + fir::CallOp::create(builder, loc, funcOp, args); + fir::LoadOp load = fir::LoadOp::create(builder, loc, result); + rewriter.replaceOp(op, load); + return mlir::success(); + } +}; + class MIFOpConversion : public fir::impl::MIFOpConversionBase<MIFOpConversion> { public: void runOnOperation() override { @@ -458,7 +692,10 @@ void mif::populateMIFOpConversionPatterns(mlir::RewritePatternSet &patterns) { patterns.insert<MIFInitOpConversion, MIFThisImageOpConversion, MIFNumImagesOpConversion, MIFSyncAllOpConversion, MIFSyncImagesOpConversion, MIFSyncMemoryOpConversion, - MIFCoBroadcastOpConversion, MIFCoMaxOpConversion, - MIFCoMinOpConversion, MIFCoSumOpConversion>( + 
MIFSyncTeamOpConversion, MIFCoBroadcastOpConversion, + MIFCoMaxOpConversion, MIFCoMinOpConversion, + MIFCoSumOpConversion, MIFFormTeamOpConversion, + MIFChangeTeamOpConversion, MIFEndTeamOpConversion, + MIFGetTeamOpConversion, MIFTeamNumberOpConversion>( patterns.getContext()); } diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 49a085ee3b336..49ae189d0b758 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -730,7 +730,6 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Value ifCompatElem = fir::ConvertOp::create(builder, loc, ifCompatType, maskElem); - llvm::SmallVector<mlir::Type> resultsTy = {elementType, elementType}; fir::IfOp ifOp = fir::IfOp::create(builder, loc, elementType, ifCompatElem, /*withElseRegion=*/true); diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index 59fe7d813d96a..cdc9b0add7a48 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1212,12 +1212,15 @@ TYPE_CONTEXT_PARSER("image selector"_en_US, // R926 image-selector-spec -> // STAT = stat-variable | TEAM = team-value | -// TEAM_NUMBER = scalar-int-expr +// TEAM_NUMBER = scalar-int-expr | +// NOTIFY = notify-variable TYPE_PARSER(construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Stat>( "STAT =" >> scalar(integer(indirect(variable))))) || construct<ImageSelectorSpec>(construct<TeamValue>("TEAM =" >> teamValue)) || construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Team_Number>( - "TEAM_NUMBER =" >> scalarIntExpr))) + "TEAM_NUMBER =" >> scalarIntExpr)) || + construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Notify>( + "NOTIFY =" >> scalar(indirect(variable))))) // R927 allocate-stmt -> // ALLOCATE ( [type-spec ::] allocation-list [, alloc-opt-list] ) @@ -1294,6 +1297,7 @@ TYPE_PARSER(construct<StatOrErrmsg>("STAT =" >> statVariable) || // !DIR$ LOOP COUNT (n1[, n2]...) // !DIR$ name[=value] [, name[=value]]... // !DIR$ UNROLL [n] +// !DIR$ PREFETCH designator[, designator]... 
// !DIR$ <anything else> constexpr auto ignore_tkr{ "IGNORE_TKR" >> optionalList(construct<CompilerDirective::IgnoreTKR>( @@ -1308,6 +1312,8 @@ constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct<CompilerDirective::VectorAlways>()}; constexpr auto unroll{ "UNROLL" >> construct<CompilerDirective::Unroll>(maybe(digitString64))}; +constexpr auto prefetch{"PREFETCH" >> + construct<CompilerDirective::Prefetch>(nonemptyList(indirect(designator)))}; constexpr auto unrollAndJam{"UNROLL_AND_JAM" >> construct<CompilerDirective::UnrollAndJam>(maybe(digitString64))}; constexpr auto novector{"NOVECTOR" >> construct<CompilerDirective::NoVector>()}; @@ -1326,6 +1332,7 @@ TYPE_PARSER(beginDirective >> "DIR$ "_tok >> construct<CompilerDirective>(vectorAlways) || construct<CompilerDirective>(unrollAndJam) || construct<CompilerDirective>(unroll) || + construct<CompilerDirective>(prefetch) || construct<CompilerDirective>(novector) || construct<CompilerDirective>(nounrollAndJam) || construct<CompilerDirective>(nounroll) || diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 4159d2e41b78c..e2da60ed19de8 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -791,6 +791,12 @@ TYPE_PARSER(construct<OmpDirectiveNameModifier>(OmpDirectiveNameParser{})) TYPE_PARSER(construct<OmpExpectation>( // "PRESENT" >> pure(OmpExpectation::Value::Present))) +TYPE_PARSER(construct<OmpFallbackModifier>("FALLBACK"_tok >> + parenthesized( // + "ABORT" >> pure(OmpFallbackModifier::Value::Abort) || + "DEFAULT_MEM" >> pure(OmpFallbackModifier::Value::Default_Mem) || + "NULL" >> pure(OmpFallbackModifier::Value::Null)))) + TYPE_PARSER(construct<OmpInteropRuntimeIdentifier>( construct<OmpInteropRuntimeIdentifier>(charLiteralConstant) || construct<OmpInteropRuntimeIdentifier>(scalarIntConstantExpr))) @@ -857,8 +863,7 @@ TYPE_PARSER(construct<OmpOrderingModifier>( "SIMD" >> pure(OmpOrderingModifier::Value::Simd))) TYPE_PARSER(construct<OmpPrescriptiveness>( - "STRICT" >> pure(OmpPrescriptiveness::Value::Strict) || - "FALLBACK" >> pure(OmpPrescriptiveness::Value::Fallback))) + "STRICT" >> pure(OmpPrescriptiveness::Value::Strict))) TYPE_PARSER(construct<OmpPresentModifier>( // "PRESENT" >> pure(OmpPresentModifier::Value::Present))) @@ -925,7 +930,7 @@ TYPE_PARSER( // sourced(construct<OmpDynGroupprivateClause::Modifier>( Parser<OmpAccessGroup>{})) || sourced(construct<OmpDynGroupprivateClause::Modifier>( - Parser<OmpPrescriptiveness>{}))) + Parser<OmpFallbackModifier>{}))) TYPE_PARSER( sourced(construct<OmpDeviceClause::Modifier>(Parser<OmpDeviceModifier>{}))) @@ -1778,6 +1783,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return 
construct<OmpAllocateDirective>(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct<OpenMPAllocatorsConstruct>( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,13 +2074,6 @@ TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPExecutableAllocate>(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser<OmpObjectList>{})), Parser<OmpClauseList>{}, - maybe(nonemptyList(Parser<OpenMPDeclarativeAllocate>{})) / endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct<OpenMPDeclareSimdConstruct>( predicated(Parser<OmpDirectiveName>{}, @@ -2076,12 +2099,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser<OmpDirectiveSpecification>{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPDeclarativeAllocate>(verbatim("ALLOCATE"_tok), - parenthesized(Parser<OmpObjectList>{}), Parser<OmpClauseList>{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct<OpenMPDeclarativeAssumes>( predicated(OmpDirectiveNameParser{}, @@ -2104,7 +2121,7 @@ TYPE_PARSER( construct<OpenMPDeclarativeConstruct>( Parser<OmpDeclareVariantDirective>{}) || construct<OpenMPDeclarativeConstruct>( - Parser<OpenMPDeclarativeAllocate>{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct<OpenMPDeclarativeConstruct>( Parser<OpenMPGroupprivate>{}) || construct<OpenMPDeclarativeConstruct>( @@ -2192,6 +2209,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}), + construct<OpenMPConstruct>( + sourced(OmpExecutableAllocateParser{})), construct<OpenMPConstruct>(Parser<OmpBlockConstruct>{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2199,9 +2218,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAssumeConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if<SpecificationConstruct>(&x.u)) { + if (auto *z{std::get_if<common::Indirection<OpenMPDeclarativeConstruct>>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if<ExecutableConstruct>(&x.u)}) { + if (auto *z{std::get_if<common::Indirection<OpenMPConstruct>>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const 
OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple<OmpClause::Copyin, @@ -86,4 +105,24 @@ const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) { return nullptr; } +static void SplitOmpAllocateHelper( + OmpAllocateInfo &n, const OmpAllocateDirective &x) { + n.dirs.push_back(&x); + const Block &body{std::get<Block>(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if<OmpAllocateDirective>(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index fd69404f313d3..8cccd84f9fa19 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' && !inCharLiteral_ && (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) { - return !IsCompilerDirectiveSentinel(at_ + 1); + return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1); } else { return false; } @@ -1642,6 +1642,17 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { // This is a Continuation line, not an initial directive line. return std::nullopt; } + ++column, ++p; + } + if (isOpenMPConditional) { + for (; column <= fixedFormColumnLimit_; ++column, ++p) { + if (IsSpaceOrTab(p)) { + } else if (*p == '!') { + return std::nullopt; // !$ ! is a comment, not a directive + } else { + break; + } + } } if (const char *ss{IsCompilerDirectiveSentinel( sentinel, static_cast<std::size_t>(sp - sentinel))}) { @@ -1657,8 +1668,17 @@ Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const { p && *p++ == '!') { if (auto maybePair{IsCompilerDirectiveSentinel(p)}) { auto offset{static_cast<std::size_t>(p - start - 1)}; - return {LineClassification{LineClassification::Kind::CompilerDirective, - offset, maybePair->first}}; + const char *sentinel{maybePair->first}; + if ((sentinel[0] == '$' && sentinel[1] == '\0') || sentinel[1] == '@') { + if (const char *comment{IsFreeFormComment(maybePair->second)}) { + if (*comment == '!') { + // Conditional line comment - treat as comment + return std::nullopt; + } + } + } + return {LineClassification{ + LineClassification::Kind::CompilerDirective, offset, sentinel}}; } } return std::nullopt; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9b38cfc40c5b2..53e74298f96ac 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -819,6 +819,7 @@ class UnparseVisitor { Word("TEAM="); } } + void Before(const ImageSelectorSpec::Notify &) { Word("NOTIFY="); } void Unparse(const AllocateStmt &x) { // R927 Word("ALLOCATE("); Walk(std::get<std::optional<TypeSpec>>(x.t), "::"); @@ -1854,6 +1855,10 @@ class UnparseVisitor { Word("!DIR$ UNROLL"); Walk(" ", unroll.v); }, + [&](const CompilerDirective::Prefetch &prefetch) { + Word("!DIR$ PREFETCH"); + Walk(" ", prefetch.v); + }, [&](const CompilerDirective::UnrollAndJam &unrollAndJam) { Word("!DIR$ UNROLL_AND_JAM"); Walk(" ", unrollAndJam.v); @@ -2281,6 +2286,11 @@ class UnparseVisitor { Walk(std::get<OmpObjectList>(x.t)); Walk(": ", 
std::get<std::optional<std::list<Modifier>>>(x.t)); } + void Unparse(const OmpFallbackModifier &x) { + Word("FALLBACK("); + Walk(x.v); + Put(")"); + } void Unparse(const OmpDynGroupprivateClause &x) { using Modifier = OmpDynGroupprivateClause::Modifier; Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": "); @@ -2482,32 +2492,8 @@ class UnparseVisitor { Unparse(static_cast<const OmpBlockConstruct &>(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get<Statement<AllocateStmt>>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Put(" ("); - Walk(std::get<OmpObjectList>(x.t)); - Put(")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast<const OmpBlockConstruct &>(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast<const OmpBlockConstruct &>(x)); @@ -2814,6 +2800,7 @@ class UnparseVisitor { OmpDeviceTypeClause, DeviceTypeDescription) // OMP device_type WALK_NESTED_ENUM(OmpReductionModifier, Value) // OMP reduction-modifier WALK_NESTED_ENUM(OmpExpectation, Value) // OMP motion-expectation + WALK_NESTED_ENUM(OmpFallbackModifier, Value) // OMP fallback-modifier WALK_NESTED_ENUM(OmpInteropType, Value) // OMP InteropType WALK_NESTED_ENUM(OmpOrderClause, Ordering) // OMP ordering WALK_NESTED_ENUM(OmpOrderModifier, Value) // OMP order-modifier diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. 
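  // (A BlockConstruct, for instance, has its own specification part followed
  // by an executable body; the mapping recorded here lets
  // CanonicalizeAllocateDirectives find the Block that follows a given
  // SpecificationPart.)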
bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf<parser::OpenMPExecutableAllocate>(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list<parser::OpenMPDeclarativeAllocate> subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf<parser::OpenMPDeclarativeAllocate>(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. + // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if<parser::ExecutableConstruct>(&epc.u)) { + if (auto *as = + std::get_if<parser::Statement<parser::ActionStmt>>(&ec->u)) { + return std::holds_alternative< + common::Indirection<parser::AllocateStmt>>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list<OpenMPDeclarativeConstruct>, or + // (2) in std::list<DeclarationConstruct>. + // The case (1) is only possible if the list (2) is empty. 
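+      // The rewrite below therefore looks for trailing allocate directives
+      // at the tail of whichever of the two lists is populated.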
+ + auto &omps = + std::get<std::list<parser::OpenMPDeclarativeConstruct>>(spec.t); + auto &decls = std::get<std::list<parser::DeclarationConstruct>>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get<parser::Block>(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection<parser::OpenMPConstruct> ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list<parser::DeclarationConstruct> &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection<parser::OpenMPDeclarativeConstruct>; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if<parser::SpecificationConstruct>(&dc->u)) { + if (auto *odc = std::get_if<OpenMPDeclarativeConstruct>(&sc->u)) { + if (auto *alo = + std::get_if<parser::OmpAllocateDirective>(&odc->value().u)) { + return alo; + } + } + } + return static_cast<parser::OmpAllocateDirective *>(nullptr); + }; + + std::list<parser::DeclarationConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. 
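+      // Each embedding step below nests the current construct one level
+      // deeper, so for
+      //   !$omp allocate(x)
+      //   !$omp allocate(y)
+      //   allocate(x, y)
+      // the first directive becomes the outermost OmpAllocateDirective and
+      // the allocate statement ends up in the innermost Block.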
+ parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list<parser::OpenMPDeclarativeConstruct> &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list<OpenMPDeclarativeConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative<parser::OmpAllocateDirective>(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get<parser::OmpAllocateDirective>(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index e019bbdfa27f6..a411e20557456 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -26,6 +26,10 @@ struct AllocateCheckerInfo { std::optional<evaluate::DynamicType> sourceExprType; std::optional<parser::CharBlock> sourceExprLoc; std::optional<parser::CharBlock> typeSpecLoc; + std::optional<parser::CharBlock> statSource; + std::optional<parser::CharBlock> msgSource; + const SomeExpr *statVar{nullptr}; + const SomeExpr *msgVar{nullptr}; int sourceExprRank{0}; // only valid if gotMold || gotSource bool gotStat{false}; bool gotMsg{false}; @@ -141,12 +145,15 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions( [&](const parser::StatOrErrmsg &statOrErr) { common::visit( common::visitors{ - [&](const parser::StatVariable &) { + [&](const parser::StatVariable &var) { if (info.gotStat) { // C943 context.Say( "STAT may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotStat = true; + info.statVar = GetExpr(context, var); + info.statSource = + parser::Unwrap<parser::Variable>(var)->GetSource(); }, [&](const parser::MsgVariable &var) { WarnOnDeferredLengthCharacterScalar(context, @@ -159,6 +166,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions( "ERRMSG may not be duplicated in a ALLOCATE statement"_err_en_US); } info.gotMsg = true; + info.msgVar = GetExpr(context, var); + info.msgSource = + parser::Unwrap<parser::Variable>(var)->GetSource(); }, }, statOrErr.u); @@ -460,6 +470,16 @@ static bool HaveCompatibleLengths( } } +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path) { + if (root && path) { + // For now we just use equality of expressions. If we implement a more + // sophisticated alias analysis we should use it here. 
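+    // e.g. (a sketch): given "integer, allocatable :: istat", the statement
+    //   allocate(istat, stat=istat)
+    // compares equal here, whereas aliasing through pointer association
+    // (say, STAT=p with p => istat) would not be detected.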
+ return *root == *path; + } else { + return false; + } +} + bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { if (!ultimate_) { CHECK(context.AnyFatalError()); @@ -690,6 +710,17 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { "Object in ALLOCATE must have DEVICE attribute when STREAM option is specified"_err_en_US); } } + + if (const SomeExpr *allocObj{GetExpr(context, allocateObject_)}) { + if (AreSameAllocation(allocObj, allocateInfo_.statVar)) { + context.Say(allocateInfo_.statSource.value_or(name_.source), + "STAT variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + if (AreSameAllocation(allocObj, allocateInfo_.msgVar)) { + context.Say(allocateInfo_.msgSource.value_or(name_.source), + "ERRMSG variable in ALLOCATE must not be the variable being allocated"_err_en_US); + } + } return RunCoarrayRelatedChecks(context); } diff --git a/flang/lib/Semantics/check-allocate.h b/flang/lib/Semantics/check-allocate.h index e3f7f07bca5b7..54f7380bc3fe8 100644 --- a/flang/lib/Semantics/check-allocate.h +++ b/flang/lib/Semantics/check-allocate.h @@ -24,5 +24,6 @@ class AllocateChecker : public virtual BaseChecker { private: SemanticsContext &context_; }; +bool AreSameAllocation(const SomeExpr *root, const SomeExpr *path); } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_CHECK_ALLOCATE_H_ diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 995deaa12dd3b..022b4289b4e7c 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -548,8 +548,13 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, actualLastSymbol = &ResolveAssociations(*actualLastSymbol); } int actualRank{actualType.Rank()}; - if (dummy.type.attrs().test( - characteristics::TypeAndShape::Attr::AssumedShape)) { + if (dummyIsValue && dummyRank == 0 && + dummy.ignoreTKR.test(common::IgnoreTKR::Rank) && actualRank > 0) { + messages.Say( + "Array actual argument may not be associated with IGNORE_TKR(R) scalar %s with VALUE attribute"_err_en_US, + dummyName); + } else if (dummy.type.attrs().test( + characteristics::TypeAndShape::Attr::AssumedShape)) { // 15.5.2.4(16) if (actualIsAssumedRank) { messages.Say( @@ -795,7 +800,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, bool dummyIsAssumedShape{dummy.type.attrs().test( characteristics::TypeAndShape::Attr::AssumedShape)}; bool copyOutNeeded{ - evaluate::MayNeedCopy(&arg, &dummyArg, foldingContext, true)}; + evaluate::ActualArgNeedsCopy(&arg, &dummyArg, foldingContext, + /*forCopyOut=*/true) + .value_or(false)}; if (copyOutNeeded && !dummyIsValue && (dummyIsAsynchronous || dummyIsVolatile)) { if (actualIsAsynchronous || actualIsVolatile) { @@ -832,8 +839,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, // a unread value in the actual argument. // Occurences of `volatileOrAsyncNeedsTempDiagnosticIssued = true` indicate a // more specific error message has already been issued. We might be able to - // clean this up by switching the coding style of MayNeedCopy to be more like - // WhyNotDefinable. + // clean this up by switching the coding style of ActualArgNeedsCopy to be + // more like WhyNotDefinable. 
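+  // e.g. (a sketch): if a non-contiguous actual such as a(1:9:2) must be
+  // copied to a temporary for a VOLATILE or ASYNCHRONOUS dummy, accesses
+  // through the dummy reach the temporary rather than 'a', defeating the
+  // intent of those attributes.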
if (copyOutNeeded && !volatileOrAsyncNeedsTempDiagnosticIssued) { if ((actualIsVolatile || actualIsAsynchronous) && (dummyIsVolatile || dummyIsAsynchronous)) { diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp index c1ebc5f4c0ec2..e6ce1b30a59f5 100644 --- a/flang/lib/Semantics/check-deallocate.cpp +++ b/flang/lib/Semantics/check-deallocate.cpp @@ -7,51 +7,87 @@ //===----------------------------------------------------------------------===// #include "check-deallocate.h" +#include "check-allocate.h" #include "definable.h" #include "flang/Evaluate/type.h" #include "flang/Parser/message.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/expression.h" #include "flang/Semantics/tools.h" +#include <optional> namespace Fortran::semantics { void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { + bool gotStat{false}, gotMsg{false}; + const SomeExpr *statVar{nullptr}, *msgVar{nullptr}; + std::optional<parser::CharBlock> statSource; + std::optional<parser::CharBlock> msgSource; + for (const parser::StatOrErrmsg &deallocOpt : + std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) { + common::visit( + common::visitors{ + [&](const parser::StatVariable &var) { + if (gotStat) { + context_.Say( + "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotStat = true; + statVar = GetExpr(context_, var); + statSource = parser::Unwrap<parser::Variable>(var)->GetSource(); + }, + [&](const parser::MsgVariable &var) { + WarnOnDeferredLengthCharacterScalar(context_, + GetExpr(context_, var), + parser::UnwrapRef<parser::Variable>(var).GetSource(), + "ERRMSG="); + if (gotMsg) { + context_.Say( + "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); + } + gotMsg = true; + msgVar = GetExpr(context_, var); + msgSource = parser::Unwrap<parser::Variable>(var)->GetSource(); + }, + }, + deallocOpt.u); + } for (const parser::AllocateObject &allocateObject : std::get<std::list<parser::AllocateObject>>(deallocateStmt.t)) { + parser::CharBlock source; common::visit( common::visitors{ [&](const parser::Name &name) { const Symbol *symbol{ name.symbol ? 
&name.symbol->GetUltimate() : nullptr}; - ; + source = name.source; if (context_.HasError(symbol)) { // already reported an error } else if (!IsVariableName(*symbol)) { - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must be a variable name"_err_en_US); } else if (!IsAllocatableOrObjectPointer(symbol)) { // C936 - context_.Say(name.source, + context_.Say(source, "Name in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - {DefinabilityFlag::PointerDefinition, - DefinabilityFlag::AcceptAllocatable, - DefinabilityFlag::PotentialDeallocation}, - *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + {DefinabilityFlag::PointerDefinition, + DefinabilityFlag::AcceptAllocatable, + DefinabilityFlag::PotentialDeallocation}, + *symbol)}) { // Catch problems with non-definability of the // pointer/allocatable context_ - .Say(name.source, + .Say(source, "Name in DEALLOCATE statement is not definable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); - } else if (auto whyNot{WhyNotDefinable(name.source, - context_.FindScope(name.source), - DefinabilityFlags{}, *symbol)}) { + } else if (auto whyNot{ + WhyNotDefinable(source, context_.FindScope(source), + DefinabilityFlags{}, *symbol)}) { // Catch problems with non-definability of the dynamic object context_ - .Say(name.source, + .Say(source, "Object in DEALLOCATE statement is not deallocatable"_err_en_US) .Attach(std::move( whyNot->set_severity(parser::Severity::Because))); @@ -62,13 +98,12 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { [&](const parser::StructureComponent &structureComponent) { // Only perform structureComponent checks if it was successfully // analyzed by expression analysis. - auto source{structureComponent.component.source}; + source = structureComponent.component.source; if (const auto *expr{GetExpr(context_, allocateObject)}) { - if (const Symbol * - symbol{structureComponent.component.symbol - ? &structureComponent.component.symbol - ->GetUltimate() - : nullptr}; + if (const Symbol *symbol{structureComponent.component.symbol + ? 
&structureComponent.component.symbol + ->GetUltimate() + : nullptr}; !IsAllocatableOrObjectPointer(symbol)) { // F'2023 C936 context_.Say(source, "Component in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US); @@ -99,32 +134,16 @@ void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) { }, }, allocateObject.u); - } - bool gotStat{false}, gotMsg{false}; - for (const parser::StatOrErrmsg &deallocOpt : - std::get<std::list<parser::StatOrErrmsg>>(deallocateStmt.t)) { - common::visit( - common::visitors{ - [&](const parser::StatVariable &) { - if (gotStat) { - context_.Say( - "STAT may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotStat = true; - }, - [&](const parser::MsgVariable &var) { - WarnOnDeferredLengthCharacterScalar(context_, - GetExpr(context_, var), - parser::UnwrapRef<parser::Variable>(var).GetSource(), - "ERRMSG="); - if (gotMsg) { - context_.Say( - "ERRMSG may not be duplicated in a DEALLOCATE statement"_err_en_US); - } - gotMsg = true; - }, - }, - deallocOpt.u); + if (const SomeExpr *allocObj{GetExpr(context_, allocateObject)}) { + if (AreSameAllocation(allocObj, statVar)) { + context_.Say(statSource.value_or(source), + "STAT variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + if (AreSameAllocation(allocObj, msgVar)) { + context_.Say(msgSource.value_or(source), + "ERRMSG variable in DEALLOCATE must not be the variable being deallocated"_err_en_US); + } + } } } diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index de407d3b1e125..9a6b3ff3cdc2c 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -855,6 +855,15 @@ void CheckHelper::CheckObjectEntity( messages_.Say( "Variable '%s' with EVENT_TYPE or LOCK_TYPE potential component '%s' must be a coarray"_err_en_US, symbol.name(), component.BuildResultDesignatorName()); + } else if (IsNotifyType(derived)) { // C1612 + messages_.Say( + "Variable '%s' with NOTIFY_TYPE must be a coarray"_err_en_US, + symbol.name()); + } else if (auto component{FindNotifyPotentialComponent( // C1611 + *derived, /*ignoreCoarrays=*/true)}) { + messages_.Say( + "Variable '%s' with NOTIFY_TYPE potential component '%s' must be a coarray"_err_en_US, + symbol.name(), component.BuildResultDesignatorName()); } } } @@ -873,6 +882,10 @@ void CheckHelper::CheckObjectEntity( messages_.Say( "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US); } + if (IsOrContainsNotifyComponent(symbol)) { // C1613 + messages_.Say( + "An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE"_err_en_US); + } if (IsAssumedSizeArray(symbol)) { // C834 if (type && type->IsPolymorphic()) { messages_.Say( diff --git a/flang/lib/Semantics/check-omp-atomic.cpp b/flang/lib/Semantics/check-omp-atomic.cpp index 2707921ca1dfa..ec03e6fe2d920 100644 --- a/flang/lib/Semantics/check-omp-atomic.cpp +++ b/flang/lib/Semantics/check-omp-atomic.cpp @@ -590,9 +590,11 @@ void OmpStructureChecker::CheckAtomicVariable( CheckAtomicType(syms.back(), source, atom.AsFortran(), checkTypeOnPointer); - if (IsAllocatable(syms.back()) && !IsArrayElement(atom)) { - context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US, - atom.AsFortran()); + if (!IsArrayElement(atom) && !ExtractComplexPart(atom)) { + if (IsAllocatable(syms.back())) { + context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US, + atom.AsFortran()); + } } } diff 
--git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index aaaf1ec5d4626..37b4404cc598f 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,29 +179,21 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct @@ -667,49 +659,6 @@ void OmpStructureChecker::HasInvalidTeamsNesting( } } -void OmpStructureChecker::CheckPredefinedAllocatorRestriction( - const parser::CharBlock &source, const parser::Name &name) { - if (const auto *symbol{name.symbol}) { - const auto *commonBlock{FindCommonBlockContaining(*symbol)}; - const auto &scope{context_.FindScope(symbol->name())}; - const Scope &containingScope{GetProgramUnitContaining(scope)}; - if (!isPredefinedAllocator && - (IsSaved(*symbol) || commonBlock || - containingScope.kind() == Scope::Kind::Module)) { - context_.Say(source, - "If list items within the %s directive have the " - "SAVE attribute, are a common block name, or are " - "declared in the scope of a module, then only " - "predefined memory allocator parameters can be used " - "in the allocator clause"_err_en_US, - ContextDirectiveAsFortran()); - } - } -} - -void OmpStructureChecker::CheckPredefinedAllocatorRestriction( - const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList) { - for (const auto &ompObject : ompObjectList.v) { - common::visit( - common::visitors{ - [&](const parser::Designator &designator) { - if (const auto *dataRef{ - std::get_if<parser::DataRef>(&designator.u)}) { - if (const auto *name{std::get_if<parser::Name>(&dataRef->u)}) { - CheckPredefinedAllocatorRestriction(source, *name); - } - } - }, - [&](const parser::Name &name) { - CheckPredefinedAllocatorRestriction(source, name); - }, - [&](const parser::OmpObject::Invalid &invalid) {}, - }, - ompObject.u); - } -} - void OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_hint); auto &dirCtx{GetContext()}; @@ -733,6 +682,13 @@ void 
OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { } } +void OmpStructureChecker::Enter(const parser::OmpClause::DynGroupprivate &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_dyn_groupprivate); + parser::CharBlock source{GetContext().clauseSource}; + + OmpVerifyModifiers(x.v, llvm::omp::OMPC_dyn_groupprivate, source, context_); +} + void OmpStructureChecker::Enter(const parser::OmpDirectiveSpecification &x) { // OmpDirectiveSpecification exists on its own only in METADIRECTIVE. // In other cases it's a part of other constructs that handle directive @@ -763,18 +719,10 @@ template <typename Checker> struct DirectiveSpellingVisitor { return std::get<parser::OmpBeginDirective>(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1710,12 +1658,39 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; - SymbolSourceMap symbols; - GetSymbolsInObjectList(objects, symbols); +static std::pair<const parser::AllocateStmt *, parser::CharBlock> +getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { + if (SourcedActionStmt as{GetActionStmt(epc)}) { + using IndirectionAllocateStmt = common::Indirection<parser::AllocateStmt>; + if (auto *indirect{std::get_if<IndirectionAllocateStmt>(&as.stmt->u)}) { + return {&indirect->value(), as.source}; + } + } + return {nullptr, ""}; +} + +// Collect symbols that correspond to non-component objects on the +// ALLOCATE statement. +static UnorderedSymbolSet GetNonComponentSymbols( + const parser::AllocateStmt &stmt) { + UnorderedSymbolSet symbols; + for (auto &alloc : std::get<std::list<parser::Allocation>>(stmt.t)) { + auto &object{std::get<parser::AllocateObject>(alloc.t)}; + if (auto *name{std::get_if<parser::Name>(&object.u)}) { + if (name->symbol) { + symbols.insert(name->symbol->GetUltimate()); + } + } + } + return symbols; +} + +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + + const Scope &thisScope{context_.FindScope(dirName.source)}; auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1741,73 +1716,200 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, return true; }}; - const auto *allocator{FindClause(llvm::omp::Clause::OMPC_allocator)}; + const auto *allocator{[&]() { + // Can't use FindClause in Enter (because clauses haven't been visited + // yet). 
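+    // Instead, scan the begin-directive's clause list directly for an
+    // ALLOCATOR clause.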
+ for (const parser::OmpClause &c : beginSpec.Clauses().v) { + if (c.Id() == llvm::omp::Clause::OMPC_allocator) { + return &c; + } + } + return static_cast<const parser::OmpClause *>(nullptr); + }()}; + if (InTargetRegion()) { bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } auto maybePredefined{maybeHasPredefinedAllocator(allocator)}; - for (auto &[symbol, source] : symbols) { - if (!inExecutableAllocate_) { - if (symbol->owner() != thisScope) { + unsigned version{context_.langOptions().OpenMPVersion}; + std::string condStr{version == 50 + ? "a named common block, has SAVE attribute or is declared in the " + "scope of a module" + : "a named common block or has SAVE attribute"}; + + auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { + if (!isExecutable) { + // For structure members, the scope is the derived type, which is + // never "this" scope. Ignore this check for members, they will be + // flagged anyway. + if (symbol.owner() != thisScope && !IsStructureComponent(symbol)) { context_.Say(source, "A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears"_err_en_US); } - if (IsPointer(*symbol) || IsAllocatable(*symbol)) { + if (IsPointer(symbol) || IsAllocatable(symbol)) { context_.Say(source, "A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute"_err_en_US); } } - if (symbol->GetUltimate().has<AssocEntityDetails>()) { + if (symbol.GetUltimate().has<AssocEntityDetails>()) { context_.Say(source, "A list item in a declarative ALLOCATE cannot be an associate name"_err_en_US); } - if (symbol->attrs().test(Attr::SAVE) || IsCommonBlock(*symbol)) { + bool inModule{ + version == 50 && symbol.owner().kind() == Scope::Kind::Module}; + if (symbol.attrs().test(Attr::SAVE) || IsCommonBlock(symbol) || inModule) { if (!allocator) { context_.Say(source, - "If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US); + "If a list item is %s, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US, + condStr); } else if (!maybePredefined) { context_.Say(source, - "If a list item is a named common block or has SAVE attribute, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US); + "If a list item is %s, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US, + condStr); } } - if (FindCommonBlockContaining(*symbol)) { + if (FindCommonBlockContaining(symbol)) { context_.Say(source, "A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block"_err_en_US); } + }}; + + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { + if (!IsTypeParamInquiry(*symbol)) { + checkSymbol(*symbol, arg.source); + } + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); + } } - CheckVarIsNotPartOfAnotherVar(source, objects); } -void OmpStructureChecker::Enter(const 
parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. + return; + } + + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere. + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } + + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } + } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { - if (!inExecutableAllocate_) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauseList{std::get<parser::OmpClauseList>(x.t)}; - const auto &objectList{std::get<parser::OmpObjectList>(x.t)}; +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; - isPredefinedAllocator = true; - CheckAllocateDirective(dir.source, objectList, clauseList); + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); } + + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body.
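+    // (That one statement is either the associated ALLOCATE statement or,
+    // when directives are chained, a nested OmpAllocateDirective produced
+    // by canonicalization; both forms are handled below.)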
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } + } +} + +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } void OmpStructureChecker::Enter(const parser::OmpClause::Allocator &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_allocator); - // Note: Predefined allocators are stored in ScalarExpr as numbers - // whereas custom allocators are stored as strings, so if the ScalarExpr - // actually has an int value, then it must be a predefined allocator - isPredefinedAllocator = GetIntValue(x.v).has_value(); RequiresPositiveParameter(llvm::omp::Clause::OMPC_allocator, x.v); } @@ -1823,16 +1925,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Allocate &x) { "The alignment value should be a constant positive integer"_err_en_US); } } - // The simple and complex modifiers have the same structure. They only - // differ in their syntax. - if (auto *alloc{OmpGetUniqueModifier<parser::OmpAllocatorComplexModifier>( - modifiers)}) { - isPredefinedAllocator = GetIntValue(alloc->v).has_value(); - } - if (auto *alloc{OmpGetUniqueModifier<parser::OmpAllocatorSimpleModifier>( - modifiers)}) { - isPredefinedAllocator = GetIntValue(alloc->v).has_value(); - } } } @@ -2115,168 +2207,88 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -// Goes through the names in an OmpObjectList and checks if each name appears -// in the given allocate statement -void OmpStructureChecker::CheckAllNamesInAllocateStmt( - const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate) { - for (const auto &obj : ompObjectList.v) { - if (const auto *d{std::get_if<parser::Designator>(&obj.u)}) { - if (const auto *ref{std::get_if<parser::DataRef>(&d->u)}) { - if (const auto *n{std::get_if<parser::Name>(&ref->u)}) { - CheckNameInAllocateStmt(source, *n, allocate); - } - } - } - } -} - -void OmpStructureChecker::CheckNameInAllocateStmt( - const parser::CharBlock &source, const parser::Name &name, - const parser::AllocateStmt &allocate) { - for (const auto &allocation : - std::get<std::list<parser::Allocation>>(allocate.t)) { - const auto &allocObj = std::get<parser::AllocateObject>(allocation.t); - if (const auto *n{std::get_if<parser::Name>(&allocObj.u)}) { - if (n->source == name.source) { - return; - } - } - } - unsigned version{context_.langOptions().OpenMPVersion}; - context_.Say(source, - "Object '%s' in %s directive not " - "found in corresponding ALLOCATE statement"_err_en_US, - name.ToString(), - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) - .str())); -} - -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { 
- inExecutableAllocate_ = true; - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - const auto &allocateStmt = - std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement; - if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - CheckAllNamesInAllocateStmt( - std::get<parser::Verbatim>(x.t).source, *list, allocateStmt); - } - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - CheckAllNamesInAllocateStmt(std::get<parser::Verbatim>(dalloc.t).source, - std::get<parser::OmpObjectList>(dalloc.t), allocateStmt); - } - } - - isPredefinedAllocator = true; -} +void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets( + dirName.source, llvm::omp::Directive::OMPD_allocators); -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - parser::OmpObjectList empty{std::list<parser::OmpObject>{}}; - auto &objects{[&]() -> const parser::OmpObjectList & { - if (auto &objects{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - return *objects; - } else { - return empty; + for (const auto &clause : beginSpec.Clauses().v) { + auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}; + if (!alloc) { + continue; } - }()}; - auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - CheckAllocateDirective( - std::get<parser::Verbatim>(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)}; - const auto &objects{std::get<parser::OmpObjectList>(dalloc.t)}; - CheckAllocateDirective(dir.source, objects, clauses); + using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; + using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; + + if (InTargetRegion()) { + auto &modifiers{OmpGetModifiers(alloc->v)}; + bool hasAllocator{ + OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || + OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; + bool hasDynAllocators{ + HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; + if (!hasAllocator && !hasDynAllocators) { + context_.Say(clause.source, + "An ALLOCATE clause in a TARGET region must specify an allocator or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); + } } } - dirContext_.pop_back(); - inExecutableAllocate_ = false; -} - -void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { - isPredefinedAllocator = true; - - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - auto &block{std::get<parser::Block>(x.t)}; - PushContextAndClauseSets( - dirSpec.DirName().source, llvm::omp::Directive::OMPD_allocators); - - if (block.empty()) { - context_.Say(dirSpec.source, - "The ALLOCATORS construct should contain a single ALLOCATE statement"_err_en_US); + auto 
&body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body. + assert(body.size() <= 1 && "Malformed body in allocators"); + if (body.empty()) { + context_.Say(dirName.source, + "The body of an ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); return; } - omp::SourcedActionStmt action{omp::GetActionStmt(block)}; - const auto *allocate{ - action ? parser::Unwrap<parser::AllocateStmt>(action.stmt) : nullptr}; - - if (allocate) { - for (const auto &clause : dirSpec.Clauses().v) { - if (auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { - CheckAllNamesInAllocateStmt( - x.source, std::get<parser::OmpObjectList>(alloc->v.t), *allocate); - - using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; - using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; - - auto &modifiers{OmpGetModifiers(alloc->v)}; - bool hasAllocator{ - OmpGetUniqueModifier<OmpAllocatorSimpleModifier>(modifiers) || - OmpGetUniqueModifier<OmpAllocatorComplexModifier>(modifiers)}; - - // TODO: As with allocate directive, exclude the case when a requires - // directive with the dynamic_allocators clause is present in - // the same compilation unit (OMP5.0 2.11.3). - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && - !hasAllocator) { - context_.Say(x.source, - "ALLOCATORS directives that appear in a TARGET region must specify an allocator"_err_en_US); - } + auto [allocStmt, allocSource]{getAllocateStmtAndSource(&body.front())}; + if (!allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(body.front())}) { + return *maybeSource; } - } - } else { - const parser::CharBlock &source = action ? action.source : x.source; + return dirName.source; + }()}; context_.Say(source, - "The body of the ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); + "The body of an ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); + return; } - for (const auto &clause : dirSpec.Clauses().v) { - if (const auto *allocClause{ - parser::Unwrap<parser::OmpClause::Allocate>(clause)}) { - CheckVarIsNotPartOfAnotherVar( - dirSpec.source, std::get<parser::OmpObjectList>(allocClause->v.t)); + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + for (const auto &clause : beginSpec.Clauses().v) { + auto *alloc{std::get_if<parser::OmpClause::Allocate>(&clause.u)}; + if (!alloc) { + continue; + } + for (auto &object : DEREF(GetOmpObjectList(clause)).v) { + CheckVarIsNotPartOfAnotherVar(dirName.source, object); + if (auto *symbol{GetObjectSymbol(object)}) { + if (IsStructureComponent(*symbol)) { + continue; + } + parser::CharBlock source{[&]() { + if (auto &&objectSource{GetObjectSource(object)}) { + return *objectSource; + } + return clause.source; + }()}; + if (!IsTypeParamInquiry(*symbol)) { + if (auto f{allocateSyms.find(*symbol)}; f == allocateSyms.end()) { + context_ + .Say(source, + "A list item in an ALLOCATORS construct must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } } } } void OmpStructureChecker::Leave(const parser::OpenMPAllocatorsConstruct &x) { - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - - for (const auto &clause : dirSpec.Clauses().v) { - if (const auto *allocClause{ - std::get_if<parser::OmpClause::Allocate>(&clause.u)}) { - CheckPredefinedAllocatorRestriction( - dirSpec.source, std::get<parser::OmpObjectList>(allocClause->v.t)); - } - } 
dirContext_.pop_back(); } @@ -3311,6 +3323,32 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) { } } + // Default access-group for DYN_GROUPPRIVATE is "cgroup". On a given + // construct there can be at most one DYN_GROUPPRIVATE with a given + // access-group. + const parser::OmpClause + *accGrpClause[parser::OmpAccessGroup::Value_enumSize] = {nullptr}; + for (auto [_, clause] : + FindClauses(llvm::omp::Clause::OMPC_dyn_groupprivate)) { + auto &wrapper{std::get<parser::OmpClause::DynGroupprivate>(clause->u)}; + auto &modifiers{OmpGetModifiers(wrapper.v)}; + auto accGrp{parser::OmpAccessGroup::Value::Cgroup}; + if (auto *ag{OmpGetUniqueModifier<parser::OmpAccessGroup>(modifiers)}) { + accGrp = ag->v; + } + auto &firstClause{accGrpClause[llvm::to_underlying(accGrp)]}; + if (firstClause) { + context_ + .Say(clause->source, + "The access-group modifier can only occur on a single clause in a construct"_err_en_US) + .Attach(firstClause->source, + "Previous clause with access-group modifier"_en_US); + break; + } else { + firstClause = clause; + } + } + CheckRequireAtLeastOneOf(); } @@ -3362,88 +3400,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. -CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) 
-CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -4750,10 +4706,12 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Copyin &x) { void OmpStructureChecker::CheckStructureComponent( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId) { auto CheckComponent{[&](const parser::Designator &designator) { - if (auto *dataRef{std::get_if<parser::DataRef>(&designator.u)}) { + if (const parser::DataRef *dataRef{ + std::get_if<parser::DataRef>(&designator.u)}) { if (!IsDataRefTypeParamInquiry(dataRef)) { - if (auto *comp{parser::Unwrap<parser::StructureComponent>(*dataRef)}) { - context_.Say(comp->component.source, + const auto expr{AnalyzeExpr(context_, designator)}; + if (expr.has_value() && evaluate::HasStructureComponent(expr.value())) { + context_.Say(designator.source, "A variable that is part of another variable cannot appear on the %s clause"_err_en_US, parser::ToUpperCaseLetters(getClauseName(clauseId).str())); } @@ -5516,4 +5474,105 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. 
+CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Collector, OMPC_collector) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Inductor, OMPC_inductor) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) +CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) 
+CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 7426559e77ff7..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -325,11 +328,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { const std::optional<parser::OmpClauseList> &maybeClauses); void CheckCancellationNest( const parser::CharBlock &source, llvm::omp::Directive type); - void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate); - void CheckNameInAllocateStmt(const parser::CharBlock &source, - const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x); void CheckReductionObjects( const parser::OmpObjectList &objects, 
llvm::omp::Clause clauseId);
@@ -353,11 +351,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
       const parser::OmpObjectList &ompObjectList);
   void CheckIfContiguous(const parser::OmpObject &object);
   const parser::Name *GetObjectName(const parser::OmpObject &object);
-  void CheckPredefinedAllocatorRestriction(const parser::CharBlock &source,
-      const parser::OmpObjectList &ompObjectList);
-  void CheckPredefinedAllocatorRestriction(
-      const parser::CharBlock &source, const parser::Name &name);
-  bool isPredefinedAllocator{false};
   void CheckAllowedRequiresClause(llvmOmpClause clause);
   bool deviceConstructFound_{false};
@@ -383,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
   };
   int directiveNest_[LastType + 1] = {0};
-  bool inExecutableAllocate_{false};
+  int allocateDirectiveLevel{0};
   parser::CharBlock visitedAtomicSource_;
   SymbolSourceMap deferredNonVariables_;
@@ -392,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
   std::vector<LoopConstruct> loopStack_;
   // Scopes for scoping units.
   std::vector<const Scope *> scopeStack_;
+
+  enum class PartKind : int {
+    // There are also other "parts", such as internal-subprogram-part, etc.,
+    // but we're keeping track of these two for now.
+    SpecificationPart,
+    ExecutionPart,
+  };
+  std::vector<PartKind> partStack_;
 };
 
 /// Find a duplicate entry in the range, and return an iterator to it.
diff --git a/flang/lib/Semantics/dump-expr.cpp b/flang/lib/Semantics/dump-expr.cpp
index 66cedab94bfb4..8d354cf65b61e 100644
--- a/flang/lib/Semantics/dump-expr.cpp
+++ b/flang/lib/Semantics/dump-expr.cpp
@@ -23,6 +23,7 @@ void DumpEvaluateExpr::Show(const evaluate::CoarrayRef &x) {
   Indent("coarray ref");
   Show(x.base());
   Show(x.cosubscript());
+  Show(x.notify());
   Show(x.stat());
   Show(x.team());
   Outdent();
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 32aa6b1e0aa1d..ac58dfc005f17 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -834,7 +834,7 @@ Constant<TYPE> ReadRealLiteral(
   auto valWithFlags{
       Scalar<TYPE>::Read(p, context.targetCharacteristics().roundingMode())};
   CHECK(p == source.end());
-  RealFlagWarnings(context, valWithFlags.flags, "conversion of REAL literal");
+  context.RealFlagWarnings(valWithFlags.flags, "conversion of REAL literal");
   auto value{valWithFlags.value};
   if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
     value = value.FlushSubnormalToZero();
@@ -1579,6 +1579,19 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
       std::get<std::list<parser::ImageSelectorSpec>>(x.imageSelector.t)) {
     common::visit(
         common::visitors{
+            [&](const parser::ImageSelectorSpec::Notify &x) {
+              Analyze(x.v);
+              if (const auto *expr{GetExpr(context_, x.v)}) {
+                if (coarrayRef.notify()) {
+                  Say("coindexed reference has multiple NOTIFY= specifiers"_err_en_US);
+                } else if (auto dyType{expr->GetType()};
+                    dyType && IsNotifyType(GetDerivedTypeSpec(*dyType))) {
+                  coarrayRef.set_notify(Expr<SomeType>{*expr});
+                } else {
+                  Say("NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV"_err_en_US);
+                }
+              }
+            },
             [&](const parser::ImageSelectorSpec::Stat &x) {
               Analyze(x.v);
               if (const auto *expr{GetExpr(context_, x.v)}) {
diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp
index 717fb0351ba5b..f191b4de2d579 100644
--- a/flang/lib/Semantics/openmp-modifiers.cpp
+++ b/flang/lib/Semantics/openmp-modifiers.cpp
@@ -74,6 +74,22 @@ unsigned
 OmpModifierDescriptor::since(llvm::omp::Clause id) const {
 // Note: The intent for these functions is to have them be automatically-
 // generated in the future.
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAccessGroup>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"access-group",
+      /*props=*/
+      {
+          {61, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {61, {Clause::OMPC_dyn_groupprivate}},
+      },
+  };
+  return desc;
+}
+
 template <>
 const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAlignment>() {
   static const OmpModifierDescriptor desc{
@@ -321,6 +337,22 @@ const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpExpectation>() {
   return desc;
 }
 
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpFallbackModifier>() {
+  static const OmpModifierDescriptor desc{
+      /*name=*/"fallback-modifier",
+      /*props=*/
+      {
+          {61, {OmpProperty::Unique}},
+      },
+      /*clauses=*/
+      {
+          {61, {Clause::OMPC_dyn_groupprivate}},
+      },
+  };
+  return desc;
+}
+
 template <>
 const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpInteropPreference>() {
   static const OmpModifierDescriptor desc{
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 6b304b62ef867..4a40d6eec17bb 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -186,6 +186,23 @@ bool IsExtendedListItem(const Symbol &sym) {
   return IsVariableListItem(sym) || sym.IsSubprogram();
 }
 
+bool IsTypeParamInquiry(const Symbol &sym) {
+  return common::visit( //
+      common::visitors{
+          [&](const MiscDetails &d) {
+            return d.kind() == MiscDetails::Kind::KindParamInquiry ||
+                d.kind() == MiscDetails::Kind::LenParamInquiry;
+          },
+          [&](const TypeParamDetails &) { return true; },
+          [&](auto &&) { return false; },
+      },
+      sym.details());
+}
+
+bool IsStructureComponent(const Symbol &sym) {
+  return sym.owner().kind() == Scope::Kind::DerivedType;
+}
+
 bool IsVarOrFunctionRef(const MaybeExpr &expr) {
   if (expr) {
     return evaluate::UnwrapProcedureRef(*expr) != nullptr ||
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 628068f9a9f68..224c69163b85e 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
     return true;
   }
 
+  bool Pre(const parser::SpecificationPart &) {
+    partStack_.push_back(PartKind::SpecificationPart);
+    return true;
+  }
+  void Post(const parser::SpecificationPart &) { partStack_.pop_back(); }
+
+  bool Pre(const parser::ExecutionPart &) {
+    partStack_.push_back(PartKind::ExecutionPart);
+    return true;
+  }
+  void Post(const parser::ExecutionPart &) { partStack_.pop_back(); }
+
   bool Pre(const parser::InternalSubprogram &) {
     // Clear the labels being tracked in the previous scope
     ClearLabels();
@@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPThreadprivate &);
   void Post(const parser::OpenMPThreadprivate &) { PopContext(); }
 
-  bool Pre(const parser::OpenMPDeclarativeAllocate &);
-  void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); }
+  bool Pre(const parser::OmpAllocateDirective &);
 
   bool Pre(const parser::OpenMPAssumeConstruct &);
   void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); }
@@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPDispatchConstruct &);
   void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); }
 
-  bool Pre(const parser::OpenMPExecutableAllocate &);
-  void Post(const parser::OpenMPExecutableAllocate &);
-
   bool Pre(const parser::OpenMPAllocatorsConstruct &);
   void Post(const parser::OpenMPAllocatorsConstruct &);
@@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
       targetLabels_;
   parser::CharBlock currentStatementSource_;
 
+  enum class PartKind : int {
+    // There are also other "parts", such as internal-subprogram-part, etc.,
+    // but we're keeping track of these two for now.
+    SpecificationPart,
+    ExecutionPart,
+  };
+  std::vector<PartKind> partStack_;
+
   void AddAllocateName(const parser::Name *&object) {
     allocateNames_.push_back(object);
   }
@@ -2558,10 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) {
   return true;
 }
 
-bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) {
+bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) {
   PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
-  const auto &list{std::get<parser::OmpObjectList>(x.t)};
-  ResolveOmpObjectList(list, Symbol::Flag::OmpDeclarativeAllocateDirective);
+  assert(!partStack_.empty() && "Misplaced directive");
+
+  auto ompFlag{partStack_.back() == PartKind::SpecificationPart
+          ? Symbol::Flag::OmpDeclarativeAllocateDirective
+          : Symbol::Flag::OmpExecutableAllocateDirective};
+
+  parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)};
+  for (const parser::OmpAllocateDirective *ad : info.dirs) {
+    for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) {
+      if (auto *object{omp::GetArgumentObject(arg)}) {
+        ResolveOmpObject(*object, ompFlag);
+      }
+    }
+  }
+
+  PopContext();
   return false;
 }
 
@@ -2580,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) {
   return true;
 }
 
-bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) {
-  PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
-  const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (list) {
-    ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective);
-  }
-  return true;
-}
-
 bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) {
   const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()};
   PushContext(x.source, dirSpec.DirId());
@@ -2660,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) {
   return false;
 }
 
-void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) {
-  PopContext();
-}
-
 void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) {
   PopContext();
 }
@@ -2948,6 +2965,67 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
   }
 }
 
+static bool IsOpenMPPointer(const Symbol &symbol) {
+  if (IsPointer(symbol) || IsBuiltinCPtr(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPAggregate(const Symbol &symbol) {
+  if (IsAllocatable(symbol) || IsOpenMPPointer(symbol))
+    return false;
+
+  const auto *type{symbol.GetType()};
+  if (!type) // no declared type: cannot be categorized
+    return false;
+  // OpenMP categorizes Fortran characters as aggregates.
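+  // Hedged illustration of the resulting categories (these examples are
+  // ours, not normative OpenMP text):
+  //   character(len=8) :: c      ! aggregate (character)
+  //   integer :: a(4)            ! aggregate (array)
+  //   type(t) :: d               ! aggregate (derived type)
+  // Pointers and allocatables were already excluded above.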
+  if (type->category() == Fortran::semantics::DeclTypeSpec::Category::Character)
+    return true;
+
+  if (const auto *det{symbol.GetUltimate()
+              .detailsIf<Fortran::semantics::ObjectEntityDetails>()})
+    if (det->IsArray())
+      return true;
+
+  if (type->AsDerived())
+    return true;
+
+  if (IsDeferredShape(symbol) || IsAssumedRank(symbol) ||
+      IsAssumedShape(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPScalar(const Symbol &symbol) {
+  if (IsOpenMPAggregate(symbol) || IsOpenMPPointer(symbol) ||
+      IsAllocatable(symbol))
+    return false;
+  const auto *type{symbol.GetType()};
+  if (!type) // no declared type: cannot be categorized
+    return false;
+  if ((!symbol.GetShape() || symbol.GetShape()->empty()) &&
+      (type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Numeric ||
+          type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Logical))
+    return true;
+  return false;
+}
+
+static bool DefaultMapCategoryMatchesSymbol(
+    parser::OmpVariableCategory::Value category, const Symbol &symbol) {
+  using VarCat = parser::OmpVariableCategory::Value;
+  switch (category) {
+  case VarCat::Scalar:
+    return IsOpenMPScalar(symbol);
+  case VarCat::Allocatable:
+    return IsAllocatable(symbol);
+  case VarCat::Aggregate:
+    return IsOpenMPAggregate(symbol);
+  case VarCat::Pointer:
+    return IsOpenMPPointer(symbol);
+  case VarCat::All:
+    return true;
+  }
+  return false;
+}
+
 // For OpenMP constructs, check all the data-refs within the constructs
 // and adjust the symbol for each Name if necessary
 void OmpAttributeVisitor::Post(const parser::Name &name) {
@@ -2983,6 +3061,36 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
     }
   }
 
+  // TODO: handle the case where DEFAULT and DEFAULTMAP are present on the
+  // same construct and conflict; DEFAULTMAP should supersede DEFAULT if
+  // they conflict.
+  if (!GetContext().defaultMap.empty()) {
+    // Checked before implicit data-sharing attributes, as this rule ignores
+    // them and expects explicit predetermined/specified attributes to be in
+    // place for the types specified.
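+    // Hedged sketch of the kind of code this rule diagnoses (our example,
+    // not from the OpenMP specification):
+    //   !$omp target defaultmap(none:scalar) map(tofrom: s)
+    //     s = s + 1   ! OK: 's' appears in an explicit MAP clause
+    //     t = t + 1   ! error: scalar 't' has no explicit attribute
+    //   !$omp end target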
+ if (Symbol * found{currScope().FindSymbol(name.source)}) { + // If the variable has declare target applied to it (enter or link) it + // is exempt from defaultmap(none) restrictions + if (!symbol->GetUltimate().test(Symbol::Flag::OmpDeclareTarget)) { + auto &dMap = GetContext().defaultMap; + for (auto defaults : dMap) { + if (defaults.second == + parser::OmpDefaultmapClause::ImplicitBehavior::None) { + if (DefaultMapCategoryMatchesSymbol(defaults.first, *found)) { + if (!IsObjectWithDSA(*symbol)) { + context_.Say(name.source, + "The DEFAULTMAP(NONE) clause requires that '%s' must be " + "listed in a " + "data-sharing attribute, data-mapping attribute, or is_device_ptr clause"_err_en_US, + symbol->name()); + } + } + } + } + } + } + } + if (Symbol * found{currScope().FindSymbol(name.source)}) { if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f88af5fac0bbd..09ec951a422ca 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } @@ -9435,13 +9435,18 @@ bool ResolveNamesVisitor::SetProcFlag( SayWithDecl(name, symbol, "Implicit declaration of function '%s' has a different result type than in previous declaration"_err_en_US); return false; - } else if (symbol.has<ProcEntityDetails>()) { - symbol.set(flag); // in case it hasn't been set yet - if (flag == Symbol::Flag::Function) { - ApplyImplicitRules(symbol); - } - if (symbol.attrs().test(Attr::INTRINSIC)) { - AcquireIntrinsicProcedureFlags(symbol); + } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) { + if (IsPointer(symbol) && !proc->type() && !proc->procInterface()) { + // PROCEDURE(), POINTER -- errors will be emitted later about a lack + // of known characteristics if used as a function + } else { + symbol.set(flag); // in case it hasn't been set yet + if (flag == Symbol::Flag::Function) { + ApplyImplicitRules(symbol); + } + if (symbol.attrs().test(Attr::INTRINSIC)) { + AcquireIntrinsicProcedureFlags(symbol); + } } } else if (symbol.GetType() && flag == Symbol::Flag::Subroutine) { SayWithDecl( @@ -10060,6 +10065,7 @@ void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { std::holds_alternative<parser::CompilerDirective::NoUnrollAndJam>(x.u) || std::holds_alternative<parser::CompilerDirective::ForceInline>(x.u) || std::holds_alternative<parser::CompilerDirective::Inline>(x.u) || + std::holds_alternative<parser::CompilerDirective::Prefetch>(x.u) || std::holds_alternative<parser::CompilerDirective::NoInline>(x.u)) { return; } diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 8eddd03faa962..cf1e5e7d44565 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -582,6 +582,18 @@ bool IsOrContainsEventOrLockComponent(const Symbol &original) { return false; } +bool IsOrContainsNotifyComponent(const Symbol &original) { + const Symbol &symbol{ResolveAssociations(original, 
/*stopAtTypeGuard=*/true)}; + if (evaluate::IsVariable(symbol)) { + if (const DeclTypeSpec *type{symbol.GetType()}) { + if (const DerivedTypeSpec *derived{type->AsDerived()}) { + return IsNotifyType(derived) || FindNotifyPotentialComponent(*derived); + } + } + } + return false; +} + // Check this symbol suitable as a type-bound procedure - C769 bool CanBeTypeBoundProc(const Symbol &symbol) { if (IsDummy(symbol) || IsProcedurePointer(symbol)) { @@ -1489,6 +1501,32 @@ PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent( return iter; } +PotentialComponentIterator::const_iterator FindNotifyPotentialComponent( + const DerivedTypeSpec &derived, bool ignoreCoarrays) { + PotentialComponentIterator potentials{derived}; + auto iter{potentials.begin()}; + for (auto end{potentials.end()}; iter != end; ++iter) { + const Symbol &component{*iter}; + if (const auto *object{component.detailsIf<ObjectEntityDetails>()}) { + if (const DeclTypeSpec *type{object->type()}) { + if (IsNotifyType(type->AsDerived())) { + if (!ignoreCoarrays) { + break; // found one + } + auto path{iter.GetComponentPath()}; + path.pop_back(); + if (std::find_if(path.begin(), path.end(), [](const Symbol &sym) { + return evaluate::IsCoarray(sym); + }) == path.end()) { + break; // found one not in a coarray + } + } + } + } + } + return iter; +} + UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent( const DerivedTypeSpec &derived) { UltimateComponentIterator ultimates{derived}; diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,45 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function + end interface + + interface atomicaddvector + attributes(device) pure function atomicaddvector_r2x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + real(2), dimension(2) :: z + end function + + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 + attributes(device) pure function atomicadd_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function end interface interface atomicsub diff --git a/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 b/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 index 695b38ed406a5..fd1d37d18ae15 100644 --- a/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 +++ 
b/flang/test/Analysis/AliasAnalysis/modref-call-globals.f90 @@ -75,7 +75,7 @@ subroutine internal subroutine test_common implicit none real :: test_var_x_common - common /comm/ test_var_x_common + common /comm/ test_var_x_common call test_effect_external() end subroutine ! CHECK-LABEL: Testing : "_QPtest_common" diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/convert.f90 b/flang/test/Driver/convert.f90 index 0ba31d2188cdf..0b4da0282f3a7 100755 --- a/flang/test/Driver/convert.f90 +++ b/flang/test/Driver/convert.f90 @@ -1,5 +1,5 @@ ! Ensure argument -fconvert=<value> accepts all relevant options and produces an -! error if an invalid value is specified. +! error if an invalid value is specified. !-------------------------- ! FLANG DRIVER (flang) diff --git a/flang/test/Driver/do_concurrent_to_omp_cli.f90 b/flang/test/Driver/do_concurrent_to_omp_cli.f90 index bdb603f35639d..e44db04fb2ce7 100644 --- a/flang/test/Driver/do_concurrent_to_omp_cli.f90 +++ b/flang/test/Driver/do_concurrent_to_omp_cli.f90 @@ -3,12 +3,12 @@ ! RUN: %flang --help | FileCheck %s --check-prefix=FLANG ! FLANG: -fdo-concurrent-to-openmp=<value> -! FLANG-NEXT: Try to map `do concurrent` loops to OpenMP [none|host|device] +! FLANG-NEXT: Try to map `do concurrent` loops to OpenMP [none|host|device] ! RUN: bbc --help | FileCheck %s --check-prefix=BBC ! BBC: -fdo-concurrent-to-openmp=<string> -! BBC-SAME: Try to map `do concurrent` loops to OpenMP [none|host|device] +! BBC-SAME: Try to map `do concurrent` loops to OpenMP [none|host|device] ! RUN: %flang -c -fdo-concurrent-to-openmp=host %s 2>&1 \ ! 
RUN: | FileCheck %s --check-prefix=OPT diff --git a/flang/test/Driver/emit-mlir.f90 b/flang/test/Driver/emit-mlir.f90 index de5a62d6bc7f3..f2a4b6cf7670b 100644 --- a/flang/test/Driver/emit-mlir.f90 +++ b/flang/test/Driver/emit-mlir.f90 @@ -21,7 +21,7 @@ ! CHECK-NEXT: func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 { ! CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 ! CHECK-NEXT: %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> -! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>) +! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>) ! CHECK-NEXT: fir.call @_QQmain() fastmath<contract> : () -> () ! CHECK-NEXT: fir.call @_FortranAProgramEndStatement() {{.*}} : () -> () ! CHECK-NEXT: return %c0_i32 : i32 diff --git a/flang/test/Driver/fatal-errors-parsing.f90 b/flang/test/Driver/fatal-errors-parsing.f90 index 185a6e08481d7..fd8e167a4807c 100644 --- a/flang/test/Driver/fatal-errors-parsing.f90 +++ b/flang/test/Driver/fatal-errors-parsing.f90 @@ -7,7 +7,7 @@ program p ! CHECK2: fatal-errors-parsing.f90:{{.*}} error: continue end - + subroutine s contains ! CHECK1-NOT: error: diff --git a/flang/test/Driver/fatal-errors-semantics.f90 b/flang/test/Driver/fatal-errors-semantics.f90 index 54740dd6deec0..3d3f64225288d 100644 --- a/flang/test/Driver/fatal-errors-semantics.f90 +++ b/flang/test/Driver/fatal-errors-semantics.f90 @@ -37,4 +37,3 @@ subroutine test call soa(null()) end end - \ No newline at end of file diff --git a/flang/test/Driver/flang-ld-aarch64.f90 b/flang/test/Driver/flang-ld-aarch64.f90 index 61cd46cea5cd1..4039859327a31 100644 --- a/flang/test/Driver/flang-ld-aarch64.f90 +++ b/flang/test/Driver/flang-ld-aarch64.f90 @@ -1,4 +1,4 @@ -! Check linker flags for AArch64 linux, since it needs both libgcc and +! Check linker flags for AArch64 linux, since it needs both libgcc and ! compiler-rt, with compiler-rt second when -rtlib=libgcc. ! RUN: %flang -### -rtlib=libgcc --target=aarch64-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s diff --git a/flang/test/Driver/flang-ld-powerpc.f90 b/flang/test/Driver/flang-ld-powerpc.f90 index 5328077ac21af..90586793a6667 100644 --- a/flang/test/Driver/flang-ld-powerpc.f90 +++ b/flang/test/Driver/flang-ld-powerpc.f90 @@ -4,7 +4,7 @@ !! -static-libflang_rt in the future. Need to add that option here. !! Because flang-rt currently only supports -!! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use +!! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use !! resource_dir_with_per_target_subdir as inputs. ! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by default @@ -26,7 +26,7 @@ ! AIX64-LD-PER-TARGET-DEFAULT-NOT: "-L/[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix" -! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by option +! Check powerpc64-ibm-aix 64-bit linking to static flang-rt by option ! RUN: %flang -static-libflangrt -Werror %s -### 2>&1 \ ! RUN: --target=powerpc64-ibm-aix \ ! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \ @@ -44,7 +44,7 @@ ! AIX64-LD-PER-TARGET-STATIC-SAME: "-lpthread" -! Check powerpc64-ibm-aix 64-bit linking to shared flang-rt by option +! Check powerpc64-ibm-aix 64-bit linking to shared flang-rt by option ! 
RUN: %flang -shared-libflangrt -Werror %s -### 2>&1 \ ! RUN: --target=powerpc64-ibm-aix \ ! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \ diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90 index 9fcbd6e12f98b..7e97c98d899f1 100644 --- a/flang/test/Driver/frame-pointer-forwarding.f90 +++ b/flang/test/Driver/frame-pointer-forwarding.f90 @@ -1,12 +1,12 @@ ! Test that flang forwards -fno-omit-frame-pointer and -fomit-frame-pointer Flang frontend ! RUN: %flang --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOVALUE -! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP ! CHECK-NONEFP: "-fc1"{{.*}}"-mframe-pointer=none" ! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONLEAFFP -! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ALLFP ! CHECK-ALLFP: "-fc1"{{.*}}"-mframe-pointer=all" diff --git a/flang/test/Driver/gcc-toolchain-install-dir.f90 b/flang/test/Driver/gcc-toolchain-install-dir.f90 index e195bdde6d2c9..05b73bcc6a2e9 100644 --- a/flang/test/Driver/gcc-toolchain-install-dir.f90 +++ b/flang/test/Driver/gcc-toolchain-install-dir.f90 @@ -5,10 +5,10 @@ ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-I386 ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-toolchain=%S/Inputs/basic_cross_linux_tree/usr | FileCheck %s --check-prefix=CHECK-I386 ! CHECK-I386: Selected GCC installation: [[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 -! CHECK-I386: "-fc1" "-triple" "i386-unknown-linux-gnu" +! CHECK-I386: "-fc1" "-triple" "i386-unknown-linux-gnu" ! CHECK-I386: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}as" ! CHECK-I386: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_i386" -! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0" +! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0" ! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/lib" ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=x86_64-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-X86-64 @@ -17,5 +17,5 @@ ! CHECK-X86-64: "-fc1" "-triple" "x86_64-unknown-linux-gnu" ! 
CHECK-X86-64: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}as" "--64" ! CHECK-X86-64: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_x86_64" -! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0" +! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0" ! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/lib" diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..3aacb84bfd227 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows, system-aix + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux \ No newline at end of file diff --git a/flang/test/Driver/large-data-threshold.f90 b/flang/test/Driver/large-data-threshold.f90 index 6a7eef79559d0..fa2d4aef911e3 100644 --- a/flang/test/Driver/large-data-threshold.f90 +++ b/flang/test/Driver/large-data-threshold.f90 @@ -5,8 +5,8 @@ ! RUN: %flang -### -c --target=x86_64 -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NO-MCMODEL ! RUN: %flang -### -c --target=x86_64 -mcmodel=small -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NO-MCMODEL ! RUN: not %flang -### -c --target=aarch64 -mcmodel=small -mlarge-data-threshold=32768 %s 2>&1 | FileCheck %s --check-prefix=NOT-SUPPORTED - - + + ! CHECK: "{{.*}}flang" "-fc1" ! CHECK-SAME: "-mlarge-data-threshold=32768" ! CHECK-59000: "{{.*}}flang" "-fc1" diff --git a/flang/test/Driver/lto-fatlto.f90 b/flang/test/Driver/lto-fatlto.f90 index c52d6e386ef0b..2ea251eafacbf 100644 --- a/flang/test/Driver/lto-fatlto.f90 +++ b/flang/test/Driver/lto-fatlto.f90 @@ -1,5 +1,5 @@ ! REQUIRES: x86-registered-target -! checks fatlto objects: that valid bitcode is included in the object file generated. +! checks fatlto objects: that valid bitcode is included in the object file generated. ! RUN: %flang -fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -emit-obj %s -o %t.o ! 
RUN: llvm-readelf -S %t.o | FileCheck %s --check-prefixes=ELF diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index eb5165e36c919..0138d9b152744 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -100,7 +100,7 @@ ! ALL-NEXT: CSE ! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -! ALL-NEXT: MIFOpConversion +! ALL-NEXT: MIFOpConversion ! ALL-NEXT: BoxedProcedurePass ! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] @@ -109,10 +109,10 @@ ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'gpu.module' Pipeline -! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] -! ALL-NEXT: 'func.func' Pipeline +! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] +! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt -! ALL-NEXT: 'gpu.func' Pipeline +! ALL-NEXT: 'gpu.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 3b6a9d7cda7ed..0d68191fedc1e 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -142,7 +142,7 @@ ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd ! O2-NEXT: 'func.func' Pipeline ! O2-NEXT: SetRuntimeCallAttributes -! ALL-NEXT: MIFOpConversion +! ALL-NEXT: MIFOpConversion ! ALL-NEXT: BoxedProcedurePass ! O2-NEXT: AddAliasTags @@ -152,10 +152,10 @@ ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'gpu.module' Pipeline -! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] -! ALL-NEXT: 'func.func' Pipeline +! ALL-NEXT: Pipeline Collection : ['func.func', 'gpu.func'] +! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt -! ALL-NEXT: 'gpu.func' Pipeline +! ALL-NEXT: 'gpu.func' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt diff --git a/flang/test/Driver/multiple-actions-error.f95 b/flang/test/Driver/multiple-actions-error.f95 index 5ec4e9166657f..3b2b7dc26d2c6 100644 --- a/flang/test/Driver/multiple-actions-error.f95 +++ b/flang/test/Driver/multiple-actions-error.f95 @@ -1,8 +1,30 @@ -! Verify that the frontend driver error-out if multiple actions are specified - -! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR -! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix=ERROR - -! ERROR: error: Only one action option is allowed - -end progream +! Verify that the frontend driver raises the expected error when multiple +! actions are specified. +! +! RUN: not %flang_fc1 -fsyntax-only -fsyntax-only %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-1 +! +! RUN: not %flang_fc1 -E -fsyntax-only %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-2 +! +! RUN: not %flang_fc1 -fsyntax-only -E -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-3 +! +! If one or more options are specified with -Xflang, they will appear last in +! the error message. +! +! RUN: not %flang -S -Xflang -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4 +! +! RUN: not %flang -Xflang -emit-llvm -S %s 2>&1 \ +! 
RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-4 +! +! RUN: not %flang -Xflang -emit-obj -S -Xflang -emit-llvm %s 2>&1 \ +! RUN: | FileCheck %s --check-prefixes=ERROR,ACTIONS-5 +! +! ERROR: error: only one action option is allowed. +! ACTIONS-1: Got '-fsyntax-only', '-fsyntax-only' +! ACTIONS-2: Got '-E', '-fsyntax-only' +! ACTIONS-3: Got '-fsyntax-only', '-E', '-emit-llvm' +! ACTIONS-4: Got '-S', '-emit-llvm' +! ACTIONS-5: Got '-S', '-emit-obj', '-emit-llvm' diff --git a/flang/test/Driver/multiple-fc1-input.f90 b/flang/test/Driver/multiple-fc1-input.f90 index 57f7c5e92b4c4..e142f358b6c16 100644 --- a/flang/test/Driver/multiple-fc1-input.f90 +++ b/flang/test/Driver/multiple-fc1-input.f90 @@ -5,5 +5,5 @@ ! RUN: %flang_fc1 -emit-fir %s %s -o - | FileCheck %s subroutine foo() end subroutine -! CHECK: func @_QPfoo() -! CHECK: func @_QPfoo() +! CHECK: func @_QPfoo() +! CHECK: func @_QPfoo() diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index 09248572b9ff5..8660bec7e1ea9 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -1,9 +1,9 @@ -! Test that flang OpenMP and OpenMP offload related -! commands forward or expand to the appropriate commands +! Test that flang OpenMP and OpenMP offload related +! commands forward or expand to the appropriate commands ! for flang -fc1 as expected. Assumes a gfx90a, aarch64, -! and sm_70 architecture, but doesn't require one to be -! installed or compiled for, just testing the appropriate -! generation of jobs are created with the correct +! and sm_70 architecture, but doesn't require one to be +! installed or compiled for, just testing the appropriate +! generation of jobs are created with the correct ! corresponding arguments. ! Test regular -fopenmp with no offload @@ -47,7 +47,7 @@ ! OFFLOAD-DEVICE-NEXT: "{{[^"]*}}flang" "-fc1" "-triple" "nvptx64-nvidia-cuda" ! OFFLOAD-DEVICE-NOT: "{{[^"]*}}flang" "-fc1" "-triple" "aarch64-unknown-linux-gnu" -! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp +! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp ! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s ! CHECK-OPENMP-IS-TARGET-DEVICE: "{{[^"]*}}flang" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" {{.*}}.f90" @@ -169,7 +169,7 @@ ! RUN: | FileCheck %s --check-prefixes=CHECK-OPENMP-VERSION ! CHECK-OPENMP-VERSION: "{{[^"]*}}flang" "-fc1" {{.*}} "-fopenmp" "-fopenmp-version=45" {{.*}}.f90" -! Test diagnostic error when host IR file is non-existent +! Test diagnostic error when host IR file is non-existent ! RUN: not %flang_fc1 %s -o %t 2>&1 -fopenmp -fopenmp-is-target-device \ ! RUN: -fopenmp-host-ir-file-path non-existant-file.bc \ ! 
RUN: | FileCheck %s --check-prefix=HOST-IR-MISSING diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..7e934c921e773 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -15,5 +15,7 @@ func.func @_QPfoo() { %1 = fir.alloca i32 + %0 = arith.constant 0 : i32 + fir.store %0 to %1 : !fir.ref<i32> return } diff --git a/flang/test/Driver/tune-cpu-fir.f90 b/flang/test/Driver/tune-cpu-fir.f90 index 43c13b426d5d9..843feebfa12ca 100644 --- a/flang/test/Driver/tune-cpu-fir.f90 +++ b/flang/test/Driver/tune-cpu-fir.f90 @@ -14,7 +14,7 @@ ! ARMTUNE-SAME: fir.tune_cpu = "neoverse-n1" ! ARMBOTH-SAME: fir.target_cpu = "aarch64" -! ARMBOTH-SAME: fir.tune_cpu = "neoverse-n1" +! ARMBOTH-SAME: fir.tune_cpu = "neoverse-n1" ! X86CPU-SAME: fir.target_cpu = "x86-64" ! X86CPU-NOT: fir.tune_cpu = "pentium4" diff --git a/flang/test/Driver/version-loops.f90 b/flang/test/Driver/version-loops.f90 index d206393a04f48..c4caf4688ab43 100644 --- a/flang/test/Driver/version-loops.f90 +++ b/flang/test/Driver/version-loops.f90 @@ -1,22 +1,22 @@ -! Test that flang forwards the -f{no-,}version-loops-for-stride +! Test that flang forwards the -f{no-,}version-loops-for-stride ! options correctly to flang -fc1 for different variants of optimisation ! and explicit flags. ! RUN: %flang -### %s -o %t 2>&1 -O3 \ ! RUN: | FileCheck %s - + ! RUN: %flang -### %s -o %t 2>&1 -O2 \ ! RUN: | FileCheck %s --check-prefix=CHECK-O2 ! RUN: %flang -### %s -o %t 2>&1 -O2 -fversion-loops-for-stride \ ! RUN: | FileCheck %s --check-prefix=CHECK-O2-with - + ! RUN: %flang -### %s -o %t 2>&1 -O4 \ ! RUN: | FileCheck %s --check-prefix=CHECK-O4 - + ! RUN: %flang -### %s -o %t 2>&1 -Ofast \ ! RUN: | FileCheck %s --check-prefix=CHECK-Ofast - + ! RUN: %flang -### %s -o %t 2>&1 -Ofast -fno-version-loops-for-stride \ ! RUN: | FileCheck %s --check-prefix=CHECK-Ofast-no @@ -29,12 +29,12 @@ ! CHECK-O2: "{{.*}}flang" "-fc1" ! CHECK-O2-NOT: "-fversion-loops-for-stride" -! CHECK-O2-SAME: "-O2" +! CHECK-O2-SAME: "-O2" ! CHECK-O2-with: "{{.*}}flang" "-fc1" ! CHECK-O2-with-SAME: "-fversion-loops-for-stride" -! CHECK-O2-with-SAME: "-O2" - +! CHECK-O2-with-SAME: "-O2" + ! CHECK-O4: "{{.*}}flang" "-fc1" ! CHECK-O4-SAME: "-fversion-loops-for-stride" ! 
CHECK-O4-SAME: "-O3" diff --git a/flang/test/Evaluate/folding12.f90 b/flang/test/Evaluate/folding12.f90 index 016e692f66264..1a0a8cb064c4c 100644 --- a/flang/test/Evaluate/folding12.f90 +++ b/flang/test/Evaluate/folding12.f90 @@ -5,7 +5,7 @@ module m1 integer :: parent_field end type parent_type type, extends(parent_type) :: child_type - integer :: child_field + integer :: child_field end type child_type type parent_array_type integer, dimension(2) :: parent_field @@ -21,7 +21,7 @@ module m1 type(child_type), parameter :: child_const2 = child_type(12, 13) type(child_type), parameter :: array_var(2) = & [child_type(14, 15), child_type(16, 17)] - logical, parameter :: test_array_child = array_var(2)%child_field == 17 + logical, parameter :: test_array_child = array_var(2)%child_field == 17 logical, parameter :: test_array_parent = array_var(2)%parent_field == 16 type array_type @@ -40,7 +40,7 @@ module m1 type(child_array_type), parameter, dimension(2) :: child_const5 = & [child_array_type([22, 23], 24), child_array_type([25, 26], 27)] integer, dimension(2), parameter :: int_const6 = child_const5(:)%parent_field(2) - logical, parameter :: test_child3 = int_const6(1) == 23 + logical, parameter :: test_child3 = int_const6(1) == 23 type(child_type), parameter :: child_const7 = child_type(28, 29) type(parent_type), parameter :: parent_const8 = child_const7%parent_type @@ -114,7 +114,7 @@ module m3 logical, parameter :: test_parent1 = child_const1%parent_field1 == 12 logical, parameter :: test_parent2 = child_const1%parent_field2 == 10.0 logical, parameter :: test_parent3 = child_const1%parent_field3 .eqv. .false. - logical, parameter :: test_parent4 = & + logical, parameter :: test_parent4 = & child_const1%parent_type%parent_field1 == 12 logical, parameter :: test_parent5 = & child_const1%parent_type%parent_field2 == 10.0 diff --git a/flang/test/Evaluate/folding33.f90 b/flang/test/Evaluate/folding33.f90 new file mode 100644 index 0000000000000..299cb7e1731a5 --- /dev/null +++ b/flang/test/Evaluate/folding33.f90 @@ -0,0 +1,4 @@ +!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +!CHECK: warning: overflow on compilation-time evaluation of a call to 'exp' [-Wfolding-exception] +print *, exp((11.265625_2,1._2)) +end diff --git a/flang/test/Examples/omp-in-reduction-clause.f90 b/flang/test/Examples/omp-in-reduction-clause.f90 index ced672220fe78..73ba197d5605a 100644 --- a/flang/test/Examples/omp-in-reduction-clause.f90 +++ b/flang/test/Examples/omp-in-reduction-clause.f90 @@ -15,7 +15,7 @@ subroutine omp_in_reduction_taskgroup() do i=1,10 z = z * 5 end do - !$omp end taskloop + !$omp end taskloop !$omp end taskgroup end subroutine omp_in_reduction_taskgroup diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index 62118bb2eed2e..f21d8f9c37637 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -28,10 +28,10 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref<!fir.llvm_ptr<i8>>, !fir.ref<i8>, !fir.ref<i8>, i64) -> () // CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8> -// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, 
%{{.*}}) +// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}}) // -// ----- +// ----- // Checking that constant global variables are not registered @@ -40,7 +40,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { fir.global @_QMiso_c_bindingECc_int constant : i32 - + fir.type_info @_QM__fortran_builtinsT__builtin_c_ptr noinit nodestroy nofinal : !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> gpu.module @cuda_device_mod { @@ -63,7 +63,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i // ----- -module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { +module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { fir.global @_QMmEa00 
{data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<!fir.array<?x?x?x?x?xf64>>> { %c0 = arith.constant 0 : index %0 = fir.zero_bits !fir.heap<!fir.array<?x?x?x?x?xf64>> diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 758c2e2244257..f399767a885fa 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -144,7 +144,7 @@ // Test that global used in device function are flagged with the correct // Checking that a constant fir.global that is used in device code is copied over to the device // CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11> -// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK-LABEL: gpu.module @cuda_device_mod // CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant // ----- @@ -312,10 +312,10 @@ // Test that global used in device function are flagged with the correct // ----- // Variables with initialization are promoted to non constant global. -// +// // attributes(global) subroutine kernel4() // integer :: a = 4 -// end subroutine +// end subroutine func.func @_QPkernel4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} { %0 = fir.address_of(@_QFkernel4Ea) : !fir.ref<i32> diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir index 9c057d024426a..37b36b2bd050e 100644 --- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir +++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir @@ -3,9 +3,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { gpu.module @cuda_device_mod { gpu.func @_QPdynshared() kernel { - %c-1 = arith.constant -1 : index - %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> - %7 = fir.shape %c-1 : (index) -> !fir.shape<1> + %0 = fir.assumed_size_extent : index + %6 = cuf.shared_memory !fir.array<?xf32>, %0 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> + %7 = fir.shape %0 : (index) -> !fir.shape<1> %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>> gpu.return } @@ -14,7 +14,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< // 
CHECK-LABEL: gpu.module @cuda_device_mod // CHECK: gpu.func @_QPdynshared() -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>> // CHECK: gpu.return // CHECK: } // CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8> @@ -127,16 +127,16 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< gpu.module @cuda_device_mod { gpu.func @_QMmtestsPtestany(%arg0: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} { %0 = fir.dummy_scope : !fir.dscope - %c-1 = arith.constant -1 : index - %1 = fir.shape %c-1 : (index) -> !fir.shape<1> + %a0 = fir.assumed_size_extent : index + %1 = fir.shape %a0 : (index) -> !fir.shape<1> %2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QMmtestsFtestanyEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) %3 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %4:2 = hlfir.declare %3 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) %5 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %6:2 = hlfir.declare %5 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) - %c-1_0 = arith.constant -1 : index - %7 = cuf.shared_memory !fir.array<?xf64>, %c-1_0 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> - %8 = fir.shape %c-1_0 : (index) -> !fir.shape<1> + %a2 = fir.assumed_size_extent : index + %7 = cuf.shared_memory !fir.array<?xf64>, %a2 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> + %8 = fir.shape %a2 : (index) -> !fir.shape<1> %9:2 = hlfir.declare %7(%8) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEdmasks"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) %10 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>> %11:2 = hlfir.declare %10 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) @@ -146,9 +146,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< %15:2 = hlfir.declare %14 {uniq_name = 
"_QMmtestsFtestanyEiam"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) %16 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmtestsFtestanyEj"} %17:2 = hlfir.declare %16 {uniq_name = "_QMmtestsFtestanyEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %c-1_1 = arith.constant -1 : index - %18 = cuf.shared_memory !fir.array<?xf32>, %c-1_1 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> - %19 = fir.shape %c-1_1 : (index) -> !fir.shape<1> + %a3 = fir.assumed_size_extent : index + %18 = cuf.shared_memory !fir.array<?xf32>, %a3 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> + %19 = fir.shape %a3 : (index) -> !fir.shape<1> %20:2 = hlfir.declare %18(%19) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEsmasks"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) gpu.return } @@ -156,7 +156,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense< } // CHECK-LABEL: gpu.func @_QMmtestsPtestany -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> -// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>> +// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>> // CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8> diff --git a/flang/test/Fir/MIF/change_team.mlir b/flang/test/Fir/MIF/change_team.mlir new file mode 100644 index 0000000000000..1dbfee574cc51 --- /dev/null +++ b/flang/test/Fir/MIF/change_team.mlir @@ -0,0 +1,51 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_CHANGE_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.alloca !fir.char<1,10> {bindc_name = "err", uniq_name = "_QFEerr"} + %2:2 = hlfir.declare %1 typeparams %c10 {uniq_name = "_QFEerr"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) + %3 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca i32 {bindc_name = "stat", uniq_name = "_QFEstat"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %7 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %8:2 = hlfir.declare %7 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, 
!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %9 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %9 to %8#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %10 = fir.embox %8#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.change_team %10 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) { + %13 = fir.load %4#0 : !fir.ref<i32> + %c1_i32 = arith.constant 1 : i32 + %14 = arith.addi %13, %c1_i32 : i32 + hlfir.assign %14 to %4#0 : i32, !fir.ref<i32> + mif.end_team : () -> () + } + %11 = fir.embox %2#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> + %12 = fir.embox %8#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.change_team %12 stat %6#0 errmsg %11 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>, !fir.box<!fir.char<1,10>>) { + mif.end_team : () -> () + } + return +} + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_3:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_change_team(%[[VAL_3]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAR_1:.*]]#0 : !fir.ref<i32> +// CHECK: %[[C1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_4]], %[[C1]] : i32 +// CHECK: hlfir.assign %[[VAL_5]] to %[[VAR_1]]#0 : i32, !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_7:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_end_team(%[[VAL_6]], %[[VAL_7]], %[[VAL_7]]) : (!fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.embox %[[ERRMSG:.*]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.char<1,10>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_change_team(%[[TEAM_2]], %[[STAT:.*]]#0, %[[VAL_10]], %[[VAL_9]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () +// CHECK: %[[VAL_11:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_12:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_end_team(%[[VAL_11]], %[[VAL_12]], %[[VAL_12]]) : (!fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/form_team.mlir 
b/flang/test/Fir/MIF/form_team.mlir new file mode 100644 index 0000000000000..f7f957afb7cc0 --- /dev/null +++ b/flang/test/Fir/MIF/form_team.mlir @@ -0,0 +1,56 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_FORM_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.alloca !fir.char<1,10> {bindc_name = "err", uniq_name = "_QFEerr"} + %2:2 = hlfir.declare %1 typeparams %c10 {uniq_name = "_QFEerr"} : (!fir.ref<!fir.char<1,10>>, index) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) + %3 = fir.alloca i32 {bindc_name = "stat", uniq_name = "_QFEstat"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %7 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %7 to %6#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %8 = fir.alloca i32 {bindc_name = "team_index", uniq_name = "_QFEteam_index"} + %9:2 = hlfir.declare %8 {uniq_name = "_QFEteam_index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %10 = fir.alloca i32 {bindc_name = "team_number", uniq_name = "_QFEteam_number"} + %11:2 = hlfir.declare %10 {uniq_name = "_QFEteam_number"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %12 = fir.load %11#0 : !fir.ref<i32> + %13 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %12 team_var %13 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> () + %14 = fir.load %9#0 : !fir.ref<i32> + %15 = fir.load %11#0 : !fir.ref<i32> + %16 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %15 team_var %16 new_index %14 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i32) -> () + %17 = fir.load %11#0 : !fir.ref<i32> + %18 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %17 team_var %18 stat %4#0 : 
(i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>) -> () + %19 = fir.embox %2#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> + %20 = fir.load %11#0 : !fir.ref<i32> + %21 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.form_team team_number %20 team_var %21 errmsg %19 : (i32, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.char<1,10>>) -> () + return +} +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_3:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_4:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_4]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_3]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_5:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_7:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_7]], %[[NEW_INDEX:.*]], %[[VAL_5]], %[[VAL_6]], %[[VAL_6]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_10]], %[[VAL_8]], %[[STAT:.*]]#0, %[[VAL_9]], %[[VAL_9]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_11:.*]] = fir.embox %[[ERRMSG:.*]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.box<!fir.char<1,10>> +// CHECK: %[[VAL_12:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_13:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[VAL_14:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_15:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_11]] : (!fir.box<!fir.char<1,10>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_form_team(%[[TEAM_NUMBER:.*]], %[[VAL_15]], %[[VAL_12]], %[[VAL_13]], %[[VAL_16]], %[[VAL_14]]) : (!fir.ref<i64>, !fir.box<none>, !fir.ref<i32>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/get_team.mlir b/flang/test/Fir/MIF/get_team.mlir new file mode 100644 index 0000000000000..10799fa2292b6 --- /dev/null +++ b/flang/test/Fir/MIF/get_team.mlir @@ -0,0 +1,68 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_GET_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QMiso_fortran_envECcurrent_team) : !fir.ref<i32> + %2:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECcurrent_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %3 = fir.address_of(@_QMiso_fortran_envECinitial_team) :
!fir.ref<i32> + %4:2 = hlfir.declare %3 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECinitial_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %7 = fir.address_of(@_QMiso_fortran_envECparent_team) : !fir.ref<i32> + %8:2 = hlfir.declare %7 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_fortran_envECparent_team"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %9 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "result_team", uniq_name = "_QFEresult_team"} + %10:2 = hlfir.declare %9 {uniq_name = "_QFEresult_team"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %11 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %11 to %10#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %12 = mif.get_team : () -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %13:2 = hlfir.declare %12 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false = arith.constant false + %14 = hlfir.as_expr %13#0 move %false : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %14 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %14 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-2_i32 = arith.constant -2 : i32 + %15 = mif.get_team level %c-2_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %16:2 = hlfir.declare %15 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, 
!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_0 = arith.constant false + %17 = hlfir.as_expr %16#0 move %false_0 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %17 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %17 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-1_i32 = arith.constant -1 : i32 + %18 = mif.get_team level %c-1_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %19:2 = hlfir.declare %18 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_1 = arith.constant false + %20 = hlfir.as_expr %19#0 move %false_1 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %20 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %20 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %c-3_i32 = arith.constant -3 : i32 + %21 = mif.get_team level %c-3_i32 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %22:2 = hlfir.declare %21 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_2 = arith.constant false + %23 = hlfir.as_expr %22#0 move %false_2 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %23 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %23 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %24 = fir.load %6#0 : 
!fir.ref<i32> + %25 = mif.get_team level %24 : (i32) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %26:2 = hlfir.declare %25 {uniq_name = ".tmp.intrinsic_result"} : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %false_3 = arith.constant false + %27 = hlfir.as_expr %26#0 move %false_3 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.assign %27 to %10#0 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + hlfir.destroy %27 : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + return +} + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[VAL_1]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[INIT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[CURRENT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[PARENT:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () + +// CHECK: %[[RESULT:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_get_team(%[[VAL_N:.*]], %[[RESULT]]) : (!fir.ref<i32>, !fir.box<none>) -> () diff --git a/flang/test/Fir/MIF/sync_team.mlir b/flang/test/Fir/MIF/sync_team.mlir new file mode 100644 index 0000000000000..d7db171546fb5 --- /dev/null +++ b/flang/test/Fir/MIF/sync_team.mlir @@ -0,0 +1,54 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_SYNC_TEAM"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QFEerror_message) : !fir.ref<!fir.char<1,128>> + %c128 = arith.constant 128 : index + %2:2 = hlfir.declare %1 typeparams %c128 {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) + %3 = fir.alloca i32 {bindc_name = "sync_status", uniq_name = "_QFEsync_status"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %5 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> 
(!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %7 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %7 to %6#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %8 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.sync_team %8 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> () + %9 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + mif.sync_team %9 stat %4#0 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>) -> () + %10 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %11 = fir.embox %2#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> + mif.sync_team %10 errmsg %11 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.box<!fir.char<1,128>>) -> () + %12 = fir.embox %6#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %13 = fir.embox %2#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> + mif.sync_team %12 stat %4#0 errmsg %13 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) -> () + return +} +fir.global internal @_QFEerror_message : !fir.char<1,128> { + %0 = fir.zero_bits !fir.char<1,128> + fir.has_value %0 : !fir.char<1,128> +} + +// CHECK: %[[ERRMSG:.*]]:2 = hlfir.declare %[[E:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) +// CHECK: %[[STAT:.*]]:2 = hlfir.declare %[[S:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + +// CHECK: %[[VAL_1:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_2:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[VAL_2]], %[[VAL_1]], %[[VAL_1]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_3:.*]] = fir.absent 
!fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[STAT]]#0, %[[VAL_3]], %[[VAL_3]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_4:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> +// CHECK: %[[VAL_5:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[VAL_6:.*]] = fir.absent !fir.ref<i32> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.char<1,128>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[VAL_6]], %[[VAL_7]], %[[VAL_5]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () + +// CHECK: %[[VAL_8:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> +// CHECK: %[[VAL_9:.*]] = fir.absent !fir.box<!fir.char<1,?>> +// CHECK: %[[TEAM_2:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.char<1,128>>) -> !fir.box<!fir.char<1,?>> +// CHECK: fir.call @_QMprifPprif_sync_team(%[[TEAM_2]], %[[STAT]]#0, %[[VAL_10]], %[[VAL_9]]) : (!fir.box<none>, !fir.ref<i32>, !fir.box<!fir.char<1,?>>, !fir.box<!fir.char<1,?>>) -> () diff --git a/flang/test/Fir/MIF/team_number.mlir b/flang/test/Fir/MIF/team_number.mlir new file mode 100644 index 0000000000000..4dc766d2a9ff4 --- /dev/null +++ b/flang/test/Fir/MIF/team_number.mlir @@ -0,0 +1,27 @@ +// RUN: fir-opt --mif-convert %s | FileCheck %s + +func.func @_QQmain() attributes {fir.bindc_name = "TEST_TEAM_NUMBER"} { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "t", uniq_name = "_QFEt"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFEt"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + %3 = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}> {bindc_name = "team", uniq_name = "_QFEteam"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFEteam"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) + %5 = fir.address_of(@_QQ_QM__fortran_builtinsT__builtin_team_type.DerivedInit) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + fir.copy %5 to %4#0 no_overlap : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %6 = fir.embox %4#0 : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> !fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>> + %7 = mif.team_number team %6 : (!fir.box<!fir.type<_QM__fortran_builtinsT__builtin_team_type{_QM__fortran_builtinsT__builtin_team_type.__id:i64}>>) -> i64 + %8 = fir.convert %7 : (i64) -> i32 + hlfir.assign %8 to %2#0 : i32, !fir.ref<i32> + %9 = mif.team_number 
: () -> i64 + %10 = fir.convert %9 : (i64) -> i32 + hlfir.assign %10 to %2#0 : i32, !fir.ref<i32> + return +} + +// CHECK: %[[VAL_1:.*]] = fir.convert %[[TEAM:.*]] : ({{.*}}) -> !fir.box<none> +// CHECK: fir.call @_QMprifPprif_team_number(%[[VAL_1]], %[[RESULT:.*]]) : (!fir.box<none>, !fir.ref<i64>) -> () +// CHECK: %[[VAL_2:.*]] = fir.load %[[RESULT]] : !fir.ref<i64> + +// CHECK: %[[VAL_3:.*]] = fir.absent !fir.box<none> +// CHECK: fir.call @_QMprifPprif_team_number(%[[VAL_3]], %[[RESULT:.*]]) : (!fir.box<none>, !fir.ref<i64>) -> () +// CHECK: %[[VAL_4:.*]] = fir.load %[[RESULT]] : !fir.ref<i64> diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir index 05df35a482907..00fe2574da62a 100644 --- a/flang/test/Fir/OpenACC/openacc-mappable.fir +++ b/flang/test/Fir/OpenACC/openacc-mappable.fir @@ -21,11 +21,13 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Mappable: !fir.box<!fir.array<10xf32>> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "arr", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<10xf32>> // CHECK: Type category: array // CHECK: Size: 40 + // CHECK: Has unknown dimensions: false // This second test exercises argument of explicit-shape arrays in following forms: // `real :: arr1(nn), arr2(2:nn), arr3(10)` @@ -62,6 +64,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr1", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT1:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB1:.*]] : index) upperbound(%[[UB1:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB1]] = arith.constant 0 : index @@ -70,6 +73,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr2", structured = false} // CHECK: Pointer-like and Mappable: !fir.ref<!fir.array<?xf32>> // CHECK: Type category: array + // CHECK: Has unknown dimensions: true // CHECK: Shape: %{{.*}} = fir.shape_shift %c2{{.*}}, %[[EXTENT2:.*]] : (index, index) -> !fir.shapeshift<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB2:.*]] : index) upperbound(%[[UB2:.*]] : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c2{{.*}} : index) // CHECK: Lower bound: %[[LB2]] = arith.constant 0 : index @@ -80,6 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, // CHECK: Type category: array // CHECK: Size: 40 // CHECK: Offset: 0 + // CHECK: Has unknown dimensions: false // CHECK: Shape: %{{.*}} = fir.shape %[[EXTENT3:.*]] : (index) -> !fir.shape<1> // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%[[LB3:.*]] : index) upperbound(%[[UB3:.*]] : index) extent(%c10{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Lower bound: %[[LB3]] = arith.constant 0 : index diff --git 
a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..613c8e274baad 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() { %1 = fir.alloca !fir.class<!fir.array<?xnone>> %2 = fir.alloca !fir.box<none> %3 = fir.alloca !fir.box<!fir.array<?xnone>> + // Add real uses so allocas are not trivially dead. + fir.call @__use_class_none(%0) : (!fir.ref<!fir.class<none>>) -> () + fir.call @__use_class_array(%1) : (!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () + fir.call @__use_box_none(%2) : (!fir.ref<!fir.box<none>>) -> () + fir.call @__use_box_array(%3) : (!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () return } +func.func private @__use_class_none(!fir.ref<!fir.class<none>>) -> () +func.func private @__use_class_array(!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () +func.func private @__use_box_none(!fir.ref<!fir.box<none>>) -> () +func.func private @__use_box_array(!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not // accept box types), so there is no equivalent of // alloca_unlimited_polymorphic_box for allocmem. diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90 index 2b1ae225986ca..741099706981f 100644 --- a/flang/test/Fir/dispatch.f90 +++ b/flang/test/Fir/dispatch.f90 @@ -195,7 +195,7 @@ program test_type_to_class ! CHECK-LABEL: func.func @_QMdispatch1Pdisplay_class( ! CHECK-SAME: %[[ARG:.*]]: [[CLASS:!fir.class<.*>>]] -! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>) +! CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMdispatch1Fdisplay_classEp"} : (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMdispatch1Tp1{a:i32,b:i32}>>) ! Check dynamic dispatch equal to `call p%display2()` with binding index = 2. ! CHECK: %[[BOXDESC:.*]] = fir.box_tdesc %[[ARG_DECL]]#0 : ([[CLASS]]) -> !fir.tdesc<none> @@ -296,7 +296,7 @@ program test_type_to_class ! CHECK-LABEL: _QMdispatch1Pno_pass_array_pointer ! CHECK-LABEL: _QMdispatch1Pcall_a1_proc -! Check the layout of the binding table. This is easier to do in FIR than in +! Check the layout of the binding table. This is easier to do in FIR than in ! LLVM IR. ! 
BT-LABEL: fir.type_info @_QMdispatch1Tty_kindK10K20 diff --git a/flang/test/Fir/non-trivial-procedure-binding-description.f90 b/flang/test/Fir/non-trivial-procedure-binding-description.f90 index 668928600157b..13fcfeed774cf 100644 --- a/flang/test/Fir/non-trivial-procedure-binding-description.f90 +++ b/flang/test/Fir/non-trivial-procedure-binding-description.f90 @@ -25,6 +25,6 @@ end module a program main use a - type(f) :: obj + type(f) :: obj print *, obj%foo(obj) end program diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 1645e1a407ad4..47fffb35a7d92 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) { omp.terminator } + func.call @__use_box_i32(%4) : (!fir.ref<!fir.box<i32>>) -> () return } +func.func private @__use_box_i32(!fir.ref<!fir.box<i32>>) -> () // basically we are testing that there isn't a crash // CHECK-LABEL: define void @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..04f48e745d033 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { // end program p func.func private @bar(!fir.ref<!fir.char<1,?>>) +func.func private @__use_t1(!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) - //%2 = fir.coordinate_of %0, f2 : (!fir.ref<!fir.type<_QTt1>>) -> !fir.ref<!fir.char<1,?>> - %2 = fir.zero_bits !fir.ref<!fir.char<1,?>> - fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> () + // Keep alloca live without creating an unsupported coordinate_of on dynamic-sized field. + func.call @__use_t1(%0) : (!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () return } diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90 index aaac98ba3c79d..fde7965e76e3a 100644 --- a/flang/test/HLFIR/assumed-type-actual-args.f90 +++ b/flang/test/HLFIR/assumed-type-actual-args.f90 @@ -105,7 +105,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<none> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>, !fir.dscope) -> (!fir.ref<none>, !fir.ref<none>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<none>, !fir.dscope) -> (!fir.ref<none>, !fir.ref<none>) ! CHECK: fir.call @_QPs1(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<none>) -> () ! CHECK: return ! CHECK: } @@ -115,7 +115,7 @@ subroutine s5b(x) ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>) ! CHECK: fir.call @_QPs2(%[[VAL_3]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () ! CHECK: return ! CHECK: } @@ -123,7 +123,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: fir.call @_QPs3(%[[VAL_1]]#0) fastmath<contract> : (!fir.box<!fir.array<?xnone>>) -> () ! CHECK: return ! CHECK: } @@ -131,7 +131,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> ! CHECK: fir.call @_QPs4(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () @@ -142,7 +142,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest3b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest3bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) @@ -161,7 +161,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest4bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) @@ -181,7 +181,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4c( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous, fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtest4cEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> @@ -197,7 +197,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest4d( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.contiguous}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest4dEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xnone>>) -> !fir.ref<!fir.array<?xnone>> ! CHECK: fir.call @_QPs4d(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> () ! CHECK: return @@ -206,7 +206,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest5( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> !fir.box<!fir.array<*:none>> ! CHECK: fir.call @_QPs5(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:none>>) -> () ! CHECK: return @@ -215,7 +215,7 @@ subroutine s5b(x) ! CHECK-LABEL: func.func @_QPtest5b( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xnone>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest5bEx"} : (!fir.box<!fir.array<?xnone>>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.box<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.box<!fir.array<?xnone>>, i1, !fir.box<!fir.array<?xnone>>) { ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xnone>>>>) -> (!fir.box<!fir.array<?xnone>>, i1) diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 index 0f904041b7101..89f83863b560c 100644 --- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 +++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 @@ -9,7 +9,7 @@ subroutine test_integer_value1(x) ! CHECK-LABEL: func.func @_QPtest_integer_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value1Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.box<!fir.array<?xi32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> ! CHECK: fir.call @_QPinternal_call1(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> () @@ -23,7 +23,7 @@ subroutine test_integer_value2(x) end ! CHECK-LABEL: func.func @_QPtest_integer_value2( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_integer_value2Ex"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.box<!fir.array<?x?xi32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>> ! CHECK: fir.call @_QPinternal_call2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> () @@ -31,13 +31,13 @@ subroutine test_integer_value2(x) ! CHECK: return ! CHECK: } -subroutine test_real_value1(x) +subroutine test_real_value1(x) real, value :: x(:) call internal_call3(x) end ! CHECK-LABEL: func.func @_QPtest_real_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: fir.call @_QPinternal_call3(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>) -> () @@ -45,13 +45,13 @@ subroutine test_real_value1(x) ! CHECK: return ! CHECK: } -subroutine test_real_value2(x) +subroutine test_real_value2(x) real, value :: x(:,:) call internal_call4(x) end ! CHECK-LABEL: func.func @_QPtest_real_value2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_real_value2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.box<!fir.array<?x?xf32>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! 
CHECK: fir.call @_QPinternal_call4(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xf32>>) -> () @@ -65,7 +65,7 @@ subroutine test_complex_value1(x) end ! CHECK-LABEL: func.func @_QPtest_complex_value1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xcomplex<f32>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value1Ex"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xcomplex<f32>>>>>) -> (!fir.box<!fir.array<?xcomplex<f32>>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?xcomplex<f32>>>) -> !fir.ref<!fir.array<?xcomplex<f32>>> ! CHECK: fir.call @_QPinternal_call5(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?xcomplex<f32>>>) -> () @@ -79,7 +79,7 @@ subroutine test_complex_value2(x) end ! CHECK-LABEL: func.func @_QPtest_complex_value2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xcomplex<f32>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.box<!fir.array<?x?xcomplex<f32>>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFtest_complex_value2Ex"} : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.box<!fir.array<?x?xcomplex<f32>>>) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xcomplex<f32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xcomplex<f32>>>>>) -> (!fir.box<!fir.array<?x?xcomplex<f32>>>, i1) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xcomplex<f32>>>) -> !fir.ref<!fir.array<?x?xcomplex<f32>>> ! CHECK: fir.call @_QPinternal_call6(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xcomplex<f32>>>) -> () @@ -95,7 +95,7 @@ subroutine test_optional1(x) end ! CHECK-LABEL: func.func @_QPtest_optional1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional1Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: fir.if %[[VAL_1:.*]] { ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) @@ -114,7 +114,7 @@ subroutine test_optional2(x) end ! CHECK-LABEL: func.func @_QPtest_optional2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional2Ex"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> i1 ! CHECK: fir.if %[[VAL_1:.*]] { ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_0]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> (!fir.box<!fir.array<?x?xf32>>, i1) @@ -133,7 +133,7 @@ subroutine test_optional3(x) end ! CHECK-LABEL: func.func @_QPtest_optional3( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, value>, uniq_name = "_QFtest_optional3Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_1]], ^bb1, ^bb2 ! CHECK: b1: // pred: ^bb0 diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90 index b80ff9858da34..da61e36b801bc 100644 --- a/flang/test/HLFIR/boxchar_emboxing.f90 +++ b/flang/test/HLFIR/boxchar_emboxing.f90 @@ -2,7 +2,7 @@ ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: fir.select_type %[[VAL_1]]#1 : !fir.class<none> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2] ! CHECK: ^bb1: ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.class<none>) -> !fir.ref<!fir.char<1,?>> @@ -44,7 +44,7 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<10xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<10xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10xnone>>, !fir.class<!fir.array<10xnone>>) ! CHECK: fir.select_type %[[VAL_1]]#1 : !fir.class<!fir.array<10xnone>> [#fir.type_is<!fir.char<1,?>>, ^bb1, unit, ^bb2] ! CHECK: ^bb1: ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#1 : (!fir.class<!fir.array<10xnone>>) -> !fir.box<!fir.array<10x!fir.char<1,?>>> diff --git a/flang/test/HLFIR/c_ptr_byvalue.f90 b/flang/test/HLFIR/c_ptr_byvalue.f90 index f39059a8cfa8d..651c37bb27f11 100644 --- a/flang/test/HLFIR/c_ptr_byvalue.f90 +++ b/flang/test/HLFIR/c_ptr_byvalue.f90 @@ -22,7 +22,7 @@ end subroutine get_expected_f ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "cptr"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) +! CHECK: %[[VAL_97:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ecptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) ! CHECK: %[[VAL_99:.*]] = fir.coordinate_of %[[VAL_97]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64> ! CHECK: %[[VAL_100:.*]] = fir.load %[[VAL_99]] : !fir.ref<i64> ! CHECK: %[[VAL_101:.*]] = fir.convert %[[VAL_100]] : (i64) -> !fir.ref<i64> diff --git a/flang/test/HLFIR/call_with_poly_dummy.f90 b/flang/test/HLFIR/call_with_poly_dummy.f90 index 93cd410428f7b..9b74bfbc293a0 100644 --- a/flang/test/HLFIR/call_with_poly_dummy.f90 +++ b/flang/test/HLFIR/call_with_poly_dummy.f90 @@ -23,7 +23,7 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<f32> ! CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 ! 
CHECK: %[[VAL_4:.*]] = arith.cmpf oeq, %[[VAL_2]], %[[VAL_3]] {{.*}} : f32 diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir index f3c4b38962a0c..f1da1da9f9a5c 100644 --- a/flang/test/HLFIR/inline-hlfir-copy-in.fir +++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir @@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> { // CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>> // CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) // CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> () -// CHECK: hlfir.copy_out %16, %15#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> () +// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> () // CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1 // CHECK: return // CHECK: } diff --git a/flang/test/HLFIR/optional_dummy.f90 b/flang/test/HLFIR/optional_dummy.f90 index ecb14f60fd7df..86ddeb9ba135a 100644 --- a/flang/test/HLFIR/optional_dummy.f90 +++ b/flang/test/HLFIR/optional_dummy.f90 @@ -5,7 +5,7 @@ ! CHECK-LABEL: func.func @_QPtest( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "ext_buf", fir.contiguous, fir.optional}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFtestEext_buf"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xi32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_2]], ^bb1, ^bb2 ! 
CHECK: ^bb1: diff --git a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir index 1d198765aff9e..855b62ca0ed39 100644 --- a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir +++ b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir @@ -91,10 +91,8 @@ func.func @test_need_to_save_rhs(%n: i64, %arg1: !fir.box<!fir.array<?x!ptr_wrap // CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_20]]) : (!fir.box<!fir.array<?x!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>>, i64) -> !fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>> // CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_21]]{"p"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>> // CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>> -// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.ptr<!fir.type<t{i:i64}>> -// CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_24]] : (!fir.ptr<!fir.type<t{i:i64}>>) -> !fir.box<!fir.type<t{i:i64}>> -// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<t{i:i64}>>) -> !fir.box<none> -// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_26]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.box<none> +// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_24]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () // CHECK: } // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (i64) -> index // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_0]] : (i64) -> index diff --git a/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 b/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 index d2d1939890882..ff7f70bac1513 100644 --- a/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 +++ b/flang/test/HLFIR/order_assignments/forall-proc-pointer-assignment-scheduling-character.f90 @@ -44,7 +44,7 @@ pure character(2) function f10() integer pure function decode(c) character(2), intent(in) :: c - decode = modulo(iachar(c(2:2))-49,10)+1 + decode = modulo(iachar(c(2:2))-49,10)+1 end function subroutine test_no_conflict(x) diff --git a/flang/test/Integration/debug-char-arg-issue-112886.f90 b/flang/test/Integration/debug-char-arg-issue-112886.f90 new file mode 100644 index 0000000000000..e2ebb3891ecd4 --- /dev/null +++ b/flang/test/Integration/debug-char-arg-issue-112886.f90 @@ -0,0 +1,46 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | \ +! RUN: FileCheck %s --check-prefix=LLVM + +! Test CHARACTER argument +subroutine char_arg(str1) + character(len=5) :: str1 + print *, str1 +end subroutine + +! Test CHARACTER argument with different length +subroutine char_arg_len10(str2) + character(len=10) :: str2 + print *, str2 +end subroutine + +! Test multiple CHARACTER arguments +subroutine multi_char_args(s1, s2, s3) + character(len=5) :: s1 + character(len=8) :: s2 + character(len=3) :: s3 + print *, s1, s2, s3 +end subroutine + +! 
Test mixed argument types (CHARACTER and INTEGER) +subroutine mixed_args(n, str, m) + integer :: n + character(len=7) :: str + integer :: m + print *, n, str, m +end subroutine + +program test + call char_arg('hello') + call char_arg_len10('hello test') + call multi_char_args('abc', 'test123', 'xyz') + call mixed_args(1, 'fortran', 2) +end program test + +! LLVM-DAG: !DILocalVariable(name: "str1", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "str2", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "s1", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "s2", arg: 2 +! LLVM-DAG: !DILocalVariable(name: "s3", arg: 3 +! LLVM-DAG: !DILocalVariable(name: "n", arg: 1 +! LLVM-DAG: !DILocalVariable(name: "str", arg: 2 +! LLVM-DAG: !DILocalVariable(name: "m", arg: 3 diff --git a/flang/test/Integration/debug-proc-ptr-e2e.f90 b/flang/test/Integration/debug-proc-ptr-e2e.f90 new file mode 100644 index 0000000000000..aa89160b7c8f9 --- /dev/null +++ b/flang/test/Integration/debug-proc-ptr-e2e.f90 @@ -0,0 +1,26 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +program test_proc_ptr + implicit none + procedure(fun1), pointer :: fun_ptr + + fun_ptr => fun1 + print *, fun_ptr(3) + +contains + integer function fun1(x) + integer :: x + fun1 = x + 1 + end function fun1 +end program test_proc_ptr + +! Check that fun_ptr is declared with correct type +! CHECK-DAG: ![[INT:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) +! CHECK-DAG: ![[PTR_INT:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[INT]], size: 64) + +! Check that fun_ptr variable is a pointer to a subroutine type +! The order is: DILocalVariable -> pointer type -> subroutine type -> {return, params} +! CHECK-DAG: ![[FUN_PTR_VAR:.*]] = !DILocalVariable(name: "fun_ptr", {{.*}}type: ![[PROC_PTR:[0-9]+]] +! CHECK-DAG: ![[PROC_PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[SUBR_TYPE:[0-9]+]], size: 64) +! CHECK-DAG: ![[SUBR_TYPE]] = !DISubroutineType(types: ![[SUBR_TYPES:[0-9]+]]) +! CHECK-DAG: ![[SUBR_TYPES]] = !{![[INT]], ![[PTR_INT]]} diff --git a/flang/test/Integration/unroll.f90 b/flang/test/Integration/unroll.f90 index f2c2ecb5cffac..63c71e1dc0078 100644 --- a/flang/test/Integration/unroll.f90 +++ b/flang/test/Integration/unroll.f90 @@ -3,7 +3,7 @@ ! CHECK-LABEL: unroll_dir subroutine unroll_dir integer :: a(10) - !dir$ unroll + !dir$ unroll ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} ! CHECK-NOT: !llvm.loop ! CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_FULL_ANNO:.*]] diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 index 05b3aaa04a1e0..e5e509cce15aa 100644 --- a/flang/test/Integration/unroll_and_jam.f90 +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -27,7 +27,7 @@ end subroutine unroll_and_jam_dir_0 ! CHECK-LABEL: unroll_and_jam_dir_1 subroutine unroll_and_jam_dir_1 integer :: a(10) - !dir$ unroll_and_jam 1 + !dir$ unroll_and_jam 1 ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} ! CHECK-NOT: !llvm.loop ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf new file mode 100644 index 0000000000000..6669b4afa291d --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -0,0 +1,35 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! 
Test CUDA Fortran atomicadd functions available in the cudadevice module + +attributes(global) subroutine test_atomicaddvector_r2() + real(2), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> + +attributes(global) subroutine test_atomicaddvector_r4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf index 9023bc72cc149..17bdce975c3d0 100644 --- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -54,28 +54,28 @@ subroutine dummy_arg_device(dd) end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "dd"}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine dummy_arg_managed(dm) real, allocatable, managed :: dm end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<managed>, fir.bindc_name = "dm"}) { -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) subroutine dummy_arg_pinned(dp) real, allocatable, pinned :: dp end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<pinned>, fir.bindc_name = "dp"}) { -! 
CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>) subroutine dummy_arg_unified(du) real, unified :: du end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_unified( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<unified>, fir.bindc_name = "du"}) -! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine cuda_alloc_free(n) integer :: n diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index b0b8d09c0c55b..015947430b07c 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -199,9 +199,9 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub7( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "c"}) { -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! 
CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 arg {{[0-9]+}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK: cuf.data_transfer %[[A]]#0 to %[[B]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> ! CHECK: cuf.data_transfer %[[A]]#0 to %[[C]]#0 {transfer_kind = #cuf.cuda_transfer<device_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> @@ -216,8 +216,8 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub8( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub8Eb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFsub8Eb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub8Ea"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) ! CHECK: cuf.data_transfer %[[A]]#1 to %[[B]]#0, %{{.*}} : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<10xi32>> ! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#1, %{{.*}} : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<?xi32>> @@ -242,7 +242,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub10( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"} -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %1 {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub10Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %1 {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub10Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: cuf.data_transfer %[[A]]#0 to %{{.*}}#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32> ! CHECK-NOT: cuf.data_transfer @@ -305,9 +305,9 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub15( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a_dev"}, %[[ARG1:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a_host"} -! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub15Ea_dev"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub15Ea_dev"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFsub15Ea_host"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: cuf.data_transfer %[[AHOST]]#1 to %[[ADEV]]#1, %[[SHAPE]] : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>> ! Check that cuf.data_transfer are not generated within OpenACC region diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index e5d3c437d7152..3a255afd59263 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,15 +14,14 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x call syncthreads() call syncwarp(1) - call threadfence() - call threadfence_block() - call threadfence_system() ret = syncthreads_and(1) res = syncthreads_and(tid > offset) ret = syncthreads_count(1) @@ -34,6 +33,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -102,10 +102,7 @@ end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: nvvm.barrier0 -! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () -! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> () -! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> () -! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> () +! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32> @@ -128,6 +125,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! 
CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 @@ -215,7 +213,7 @@ end ! CHECK-LABEL: func.func @_QPhost1() ! CHECK: cuf.kernel ! CHECK: nvvm.barrier0 -! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () +! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 ! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32 ! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32 @@ -431,16 +429,16 @@ end subroutine ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 +! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() @@ -490,7 +488,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -515,7 +513,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -526,7 +524,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! 
CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 @@ -671,7 +669,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -684,7 +682,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -697,7 +695,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -710,7 +708,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -724,7 +722,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -737,7 +735,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -750,5 +748,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 diff --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf new file mode 100644 index 0000000000000..6e2e23423c360 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf @@ -0,0 +1,14 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran intrinsics lowering for synchronization. + +attributes(global) subroutine sync() + call threadfence() + call threadfence_block() + call threadfence_system() +end subroutine + +! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: nvvm.memory.barrier <gpu> +! CHECK: nvvm.memory.barrier <cta> +! 
CHECK: nvvm.memory.barrier <sys> diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 index efe9e6dd190c0..2222259bda458 100644 --- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 +++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 @@ -50,7 +50,7 @@ end subroutine integer_assumed_shape_array ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFinteger_assumed_shape_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>> ! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> ! CHECK: fir.call @_QPinteger_assumed_shape_array_callee(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> () @@ -159,8 +159,8 @@ end subroutine char_assumed_shape_array ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : index -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>) -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_8]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa1"} : (!fir.box<!fir.array<?x!fir.char<1,2>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,2>>>, !fir.box<!fir.array<?x!fir.char<1,2>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_assumed_shape_arrayEa2"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) ! CHECK: %[[VAL_11:.*]] = fir.rebox %[[VAL_9]]#1 : (!fir.box<!fir.array<?x!fir.char<1,2>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>> ! 
CHECK: fir.call @_QPchar_assumed_shape_array_explicit_len_callee(%[[VAL_7]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>>>) -> () @@ -220,7 +220,7 @@ end subroutine char_explicit_shape_array ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>> ! CHECK: %[[VAL_14:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_15]]) typeparams %[[VAL_12]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) ! CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref<!fir.array<100x!fir.char<1,2>>>) -> !fir.ref<!fir.array<?x!fir.char<1,2>>> ! CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_17]]) : (!fir.ref<!fir.array<?x!fir.char<1,2>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,2>>>> @@ -317,7 +317,7 @@ end subroutine type_assumed_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtype_assumed_shape_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.box<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPtype_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -400,7 +400,7 @@ end subroutine class_scalar ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> ! 
CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_scalarEt"} : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>, !fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.type<_QMtarget_to_pointer_typesTt1>>) -> !fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.call @_QPclass_scalar_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMtarget_to_pointer_typesTt1>>>>) -> () @@ -439,7 +439,7 @@ end subroutine class_assumed_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPclass_assumed_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -478,7 +478,7 @@ end subroutine class_explicit_shape_array ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.dscope) -> (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>, !fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) ! CHECK: %[[VAL_5:.*]] = fir.rebox %[[VAL_4]]#1 : (!fir.class<!fir.array<100x!fir.type<_QMtarget_to_pointer_typesTt1>>>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>> ! CHECK: fir.call @_QPclass_explicit_shape_array_callee(%[[VAL_3]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMtarget_to_pointer_typesTt1>>>>>) -> () @@ -505,7 +505,7 @@ end subroutine uclass_scalar ! CHECK-LABEL: func.func @_QPuclass_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<none>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_scalarEt"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<none>) -> !fir.class<!fir.ptr<none>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<none>>> ! CHECK: fir.call @_QPuclass_scalar_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> () @@ -526,7 +526,7 @@ end subroutine uclass_assumed_shape_array ! CHECK-LABEL: func.func @_QPuclass_assumed_shape_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_assumed_shape_arrayEt"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! 
CHECK: fir.call @_QPuclass_assumed_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> () @@ -547,7 +547,7 @@ end subroutine uclass_explicit_shape_array ! CHECK-LABEL: func.func @_QPuclass_explicit_shape_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<100xnone>> {fir.bindc_name = "t", fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?xnone>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>, !fir.dscope) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFuclass_explicit_shape_arrayEt"} : (!fir.class<!fir.array<100xnone>>, !fir.dscope) -> (!fir.class<!fir.array<100xnone>>, !fir.class<!fir.array<100xnone>>) ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.class<!fir.array<100xnone>>) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! CHECK: fir.call @_QPuclass_explicit_shape_array_uclass_callee(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> () diff --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 index 08492e913c992..9d77516a3fd38 100644 --- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 +++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 @@ -5,7 +5,7 @@ subroutine allocation(x) character(*), allocatable :: x(:) ! CHECK-LABEL: func.func @_QPallocation( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, {{.*}}Ex +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, {{.*}}Ex deallocate(x) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>) -> !fir.heap<!fir.array<?x!fir.char<1,?>>> @@ -30,8 +30,8 @@ subroutine pointer_assignment(p, ziel) real, pointer :: p(:) real, target :: ziel(42:) ! CHECK-LABEL: func.func @_QPpointer_assignment( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_5:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel p => ziel ! CHECK: %[[VAL_7:.*]] = fir.shift %[[VAL_4:.*]] : (index) -> !fir.shift<1> ! 
CHECK: %[[VAL_8:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_7]]) : (!fir.box<!fir.array<?xf32>>, !fir.shift<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>> @@ -46,8 +46,8 @@ subroutine pointer_remapping(p, ziel) real, pointer :: p(:, :) real, target :: ziel(10, 20, 30) ! CHECK-LABEL: func.func @_QPpointer_remapping( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}Ep +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel p(2:7, 3:102) => ziel ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64 ! CHECK: %[[VAL_9:.*]] = arith.constant 7 : i64 @@ -105,7 +105,7 @@ subroutine ptr_comp_assign(x, ziel) x(9_8)%p => ziel ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_6:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, {{.*}}Eziel ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : index ! CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_8]]) : (!fir.ref<!fir.array<10x!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>>, index) -> !fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_9]]{"p"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFptr_comp_assignTt{p:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> diff --git a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 index d6cbea8d5c8bc..ed20d2ef23dfa 100644 --- a/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 +++ b/flang/test/Lower/HLFIR/allocatables-and-pointers.f90 @@ -15,7 +15,7 @@ subroutine takes_array(y) call takes_array(x) end subroutine ! CHECK-LABEL: func.func @_QPpassing_allocatable( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: fir.call @_QPtakes_allocatable(%[[VAL_1]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> () ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>> @@ -34,7 +34,7 @@ subroutine takes_pointer(y) end subroutine ! CHECK-LABEL: func.func @_QPpassing_pointer( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"} ! CHECK: fir.call @_QPtakes_pointer(%[[VAL_2]]#0) {{.*}} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: %[[VAL_3:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index @@ -53,7 +53,7 @@ subroutine takes_array(y) call takes_array(x) end subroutine ! CHECK-LABEL: func.func @_QPpassing_contiguous_pointer( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, pointer>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> @@ -66,7 +66,7 @@ subroutine character_allocatable_cst_len(x) end subroutine ! CHECK-LABEL: func.func @_QPcharacter_allocatable_cst_len( ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.char<1,10>>>) -> !fir.heap<!fir.char<1,10>> ! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index @@ -87,12 +87,12 @@ subroutine character_allocatable_dyn_len(x, l) call takes_char(x//"hello") end subroutine ! CHECK-LABEL: func.func @_QPcharacter_allocatable_dyn_len( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = {{.*}}El"} +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = {{.*}}El"} ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_3]], %[[VAL_4]] : i64 ! CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_3]], %[[VAL_4]] : i64 -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_6:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"} ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> ! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>> ! 
CHECK: %[[VAL_10:.*]] = fir.emboxchar %[[VAL_9]], %[[VAL_6]] : (!fir.heap<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
@@ -110,7 +110,7 @@ subroutine print_allocatable(x)
 print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_allocatable(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -120,7 +120,7 @@ subroutine print_pointer(x)
 print *, x
 end subroutine
 ! CHECK-LABEL: func.func @_QPprint_pointer(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAioOutputDescriptor(%{{.*}}, %[[VAL_8]])
@@ -130,7 +130,7 @@ subroutine elemental_expr(x)
 call takes_array_2(x+42)
 end subroutine
 ! CHECK-LABEL: func.func @_QPelemental_expr(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = {{.*}}Ex"}
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : i32
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
index 1dc033d0ba033..bad3c62422372 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental-nested.f90
@@ -9,14 +9,14 @@
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<2xf32>> {fir.bindc_name = "h1"}) {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEh1"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xf32>>, !fir.ref<!fir.array<2xf32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFtestEk"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFtestEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "l", uniq_name = "_QFtestEl"}
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFtestEl"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QFtestECn) : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFtestECn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEpi"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK: %[[VAL_12:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_13]] unordered : (!fir.shape<1>) -> !hlfir.expr<2xf32> {
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90 b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
index 10fb500f5ffb8..5c4f079c86e20 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-elemental.f90
@@ -7,7 +7,7 @@ subroutine test_as_simple_elemental(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_as_simple_elemental(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_as_simple_elementalEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64
@@ -42,9 +42,9 @@ subroutine test_as_strided_elemental(lb, ub, stride)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
 ! CHECK: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtest_as_strided_elementalEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
@@ -92,7 +92,7 @@ integer pure function foo(i)
 end subroutine
 ! CHECK-LABEL: func.func @_QPtest_as_elemental_with_pure_call(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_as_elemental_with_pure_callEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64
diff --git a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90 b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
index 6bbfffca47047..415a9df5bf838 100644
--- a/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-as-inlined-temp.f90
@@ -116,7 +116,7 @@ subroutine test_implied_do(n)
 ! CHECK-LABEL: func.func @_QPtest_implied_do(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca index
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i64
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
@@ -178,9 +178,9 @@ subroutine test_strided_implied_do(lb, ub, stride)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "ub"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i64> {fir.bindc_name = "stride"}) {
 ! CHECK: %[[VAL_3:.*]] = fir.alloca index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doElb"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEstride"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_strided_implied_doEub"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : i64
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i64>
@@ -241,8 +241,8 @@ subroutine test_nested_implied_do(n, m)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "m"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEm"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_nested_implied_doEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/array-ctor-index.f90 b/flang/test/Lower/HLFIR/array-ctor-index.f90
index d94f45bff336b..98b3aeece8f1b 100644
--- a/flang/test/Lower/HLFIR/array-ctor-index.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-index.f90
@@ -8,7 +8,7 @@ function test1(k)
 end function test1
 ! CHECK-LABEL: func.func @_QPtest1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i8> {fir.bindc_name = "k"}) -> !fir.array<4xi8> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ek"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi8> {bindc_name = "test1", uniq_name = "_QFtest1Etest1"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -58,7 +58,7 @@ function test2(k)
 end function test2
 ! CHECK-LABEL: func.func @_QPtest2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i16> {fir.bindc_name = "k"}) -> !fir.array<4xi16> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ek"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi16> {bindc_name = "test2", uniq_name = "_QFtest2Etest2"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -108,7 +108,7 @@ function test3(k)
 end function test3
 ! CHECK-LABEL: func.func @_QPtest3(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "k"}) -> !fir.array<4xi32> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ek"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi32> {bindc_name = "test3", uniq_name = "_QFtest3Etest3"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
@@ -158,7 +158,7 @@ function test4(k)
 end function test4
 ! CHECK-LABEL: func.func @_QPtest4(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "k"}) -> !fir.array<4xi64> {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ek"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<4xi64> {bindc_name = "test4", uniq_name = "_QFtest4Etest4"}
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-calls.f90 b/flang/test/Lower/HLFIR/assumed-rank-calls.f90
index 63b8d9fd81f33..74583104c1d7e 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-calls.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-calls.f90
@@ -15,7 +15,7 @@ subroutine takes_assumed_rank(x)
 ! CHECK-LABEL: func.func @_QPtest_alloc_to_nonalloc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_alloc_to_nonallocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_alloc_to_nonallocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.rebox_assumed_rank %[[VAL_3]] lbs ones : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.box<!fir.array<*:f32>>
 ! CHECK: fir.call @_QPtakes_assumed_rank(%[[VAL_4]]) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
@@ -34,7 +34,7 @@ subroutine bindc_func(x) bind(c)
 ! CHECK-LABEL: func.func @_QPtest_to_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_to_bindcEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_to_bindcEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox_assumed_rank %[[VAL_2]]#0 lbs zeroes : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>
 ! CHECK: fir.call @bindc_func(%[[VAL_3]]) proc_attrs<bind_c> fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: return
@@ -53,7 +53,7 @@ subroutine takes_target_as_pointer(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_target_to_pointerEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_target_to_pointerEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]] = fir.rebox_assumed_rank %[[VAL_3]]#0 lbs preserve : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.ptr<!fir.array<*:f32>>>
 ! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPtakes_target_as_pointer(%[[VAL_1]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -74,7 +74,7 @@ subroutine takes_assumed_rank_t(x)
 ! CHECK-LABEL: func.func @_QPtest_poly_to_nonepoly(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_poly_to_nonepolyEx"} : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_poly_to_nonepolyEx"} : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>, !fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox_assumed_rank %[[VAL_2]]#0 lbs ones : (!fir.class<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>) -> !fir.box<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>
 ! CHECK: fir.call @_QPtakes_assumed_rank_t(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:!fir.type<_QFtest_poly_to_nonepolyTt{i:i32}>>>) -> ()
 ! CHECK: return
@@ -94,7 +94,7 @@ subroutine takes_contiguous(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_copy_in_outEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_copy_in_outEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_3]]#0 to %[[VAL_1]] : (!fir.box<!fir.array<*:f32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.box<!fir.array<*:f32>>, i1)
 ! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_4]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: hlfir.copy_out %[[VAL_1]], %[[VAL_4]]#1 to %[[VAL_3]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, i1, !fir.box<!fir.array<*:f32>>) -> ()
@@ -112,7 +112,7 @@ subroutine takes_contiguous_intentin(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_copy_in_out_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_copy_in_out_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_3]]#0 to %[[VAL_1]] : (!fir.box<!fir.array<*:f32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> (!fir.box<!fir.array<*:f32>>, i1)
 ! CHECK: fir.call @_QPtakes_contiguous_intentin(%[[VAL_4]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK: hlfir.copy_out %[[VAL_1]], %[[VAL_4]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, i1) -> ()
diff --git a/flang/test/Lower/HLFIR/assumed-rank-entry.f90 b/flang/test/Lower/HLFIR/assumed-rank-entry.f90
index d2e470ab85a40..3135a63c03eb5 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-entry.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-entry.f90
@@ -16,7 +16,7 @@ subroutine some_proc(x)
 ! CHECK-LABEL: func.func @_QPtest_main_entry(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_main_entryEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_main_entryEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK-LABEL: func.func @_QPtest_alternate_entry() {
 ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<*:f32>>>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
index fb1385f87f1bc..bcb0031cad8db 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface-alloc-ptr.f90
@@ -23,7 +23,7 @@ subroutine scalar_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_alloc_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFscalar_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
@@ -34,7 +34,7 @@ subroutine r2_alloc_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPr2_alloc_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFr2_alloc_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPalloc_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) -> ()
@@ -45,7 +45,7 @@ subroutine scalar_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPscalar_pointer_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<f32>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFscalar_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.ptr<f32>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -56,7 +56,7 @@ subroutine r2_pointer_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPr2_pointer_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFr2_pointer_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: fir.call @_QPpointer_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) -> ()
@@ -68,7 +68,7 @@ subroutine r2_target_to_pointer_assumed_rank(x)
 ! CHECK-LABEL: func.func @_QPr2_target_to_pointer_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFr2_target_to_pointer_assumed_rankEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]]#1 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK: fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index ffb36fa4b7003..7837b4f4db866 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -23,7 +23,7 @@ subroutine int_scalar_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_scalar_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rankEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_3]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -35,7 +35,7 @@ subroutine int_scalar_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_scalar_to_assumed_rank_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<i32>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_3]]) proc_attrs<bind_c> fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -49,7 +49,7 @@ subroutine int_r1_to_assumed_rank(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r1_to_assumed_rankEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_5]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -66,7 +66,7 @@ subroutine int_r4_to_assumed_rank(x)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 4 : index
 ! CHECK: %[[VAL_4:.*]] = arith.constant 5 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shape<4>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>, !fir.dscope) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r4_to_assumed_rankEx"} : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>, !fir.dscope) -> (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.ref<!fir.array<2x3x4x5xi32>>)
 ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<2x3x4x5xi32>>, !fir.shape<4>) -> !fir.box<!fir.array<2x3x4x5xi32>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<2x3x4x5xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_8]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -78,7 +78,7 @@ subroutine int_assumed_shape_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_assumed_shape_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rankEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_2]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
@@ -89,7 +89,7 @@ subroutine int_assumed_shape_to_assumed_rank_bindc(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_assumed_shape_to_assumed_rank_bindc(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_assumed_shape_to_assumed_rank_bindcEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
 ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shift %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shift<2>
 ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_1]]#0(%[[VAL_3]]) : (!fir.box<!fir.array<?x?xi32>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?xi32>>
@@ -103,7 +103,7 @@ subroutine int_allocatable_to_assumed_rank(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_allocatable_to_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rankEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.box<!fir.array<?x?xi32>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
@@ -116,7 +116,7 @@ subroutine int_allocatable_to_assumed_rank_opt(x)
 end subroutine
 ! CHECK-LABEL: func.func @_QPint_allocatable_to_assumed_rank_opt(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFint_allocatable_to_assumed_rank_optEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !fir.heap<!fir.array<?x?xi32>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap<!fir.array<?x?xi32>>) -> i64
@@ -147,6 +147,6 @@ subroutine int_r2_assumed_size_to_assumed_rank(x)
 ! CHECK: %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
 ! CHECK: %[[VAL_6:.*]] = fir.assumed_size_extent : index
 ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
 ! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_9]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
index f54399e96feea..1e82b296559dd 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-2.f90
@@ -28,7 +28,7 @@ subroutine test_size_4(x)
 ! CHECK-LABEL: func.func @_QPtest_size_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_size_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASize(%[[VAL_5]]
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> i32
@@ -42,7 +42,7 @@ subroutine test_size_4(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_size_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_2Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<i32>) -> i64
@@ -70,8 +70,8 @@ subroutine test_size_4(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_size_3Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_size_3Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_size_3Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_size_3Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<i32>) -> i64
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64
@@ -96,7 +96,7 @@ subroutine test_size_4(x)
 ! CHECK-LABEL: func.func @_QPtest_size_4(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_size_4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_size_4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_8:.*]] = fir.call @_FortranASize(%[[VAL_6]]
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
index f19596ef5e1f0..fcca1733dc639 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90
@@ -98,7 +98,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_allocated(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64
@@ -114,7 +114,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_associated_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.ptr<!fir.array<*:f32>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:f32>>) -> i64
@@ -131,8 +131,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y", fir.target}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
@@ -148,8 +148,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
@@ -165,7 +165,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_len_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (index) -> i32
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
@@ -179,7 +179,7 @@ subroutine c_loc_2(x)
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>) -> index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_len_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_len_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (index) -> i32
 ! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 ! CHECK: fir.call @_QPtakes_integer(%[[VAL_6]]#0) fastmath<contract> : (!fir.ref<i32>) -> ()
@@ -190,7 +190,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_storage_size_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i32
 ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32
 ! CHECK: %[[VAL_5:.*]] = arith.muli %[[VAL_3]], %[[VAL_4]] : i32
@@ -203,7 +203,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_storage_size_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.ptr<!fir.array<*:none>>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:none>>) -> i64
@@ -225,7 +225,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_present_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i1
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -237,7 +237,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_present_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) -> i1
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
 ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1)
@@ -249,7 +249,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_is_contiguous_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> i1
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
@@ -262,7 +262,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPtest_is_contiguous_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
 ! CHECK: %[[VAL_5:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_4]]) fastmath<contract> : (!fir.box<none>) -> i1
@@ -277,8 +277,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -293,8 +293,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
@@ -311,8 +311,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none>
 ! CHECK: %[[VAL_7:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1
@@ -327,8 +327,8 @@ subroutine c_loc_2(x)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none>
@@ -344,7 +344,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPc_loc_1(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.target}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFc_loc_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFc_loc_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
 ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
@@ -363,7 +363,7 @@ subroutine c_loc_2(x)
 ! CHECK-LABEL: func.func @_QPc_loc_2(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
 ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
 ! CHECK: %[[VAL_6:.*]] = fir.coordinate_of %[[VAL_4]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90 b/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
index e46d21d915eb1..cd01ea5831e23 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-internal-proc.f90
@@ -17,7 +17,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_assumed_rankEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_assumed_rankEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.box<!fir.array<*:f32>>>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.box<!fir.array<*:f32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<*:f32>>>
@@ -55,7 +55,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank_optional(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, internal_assoc>, uniq_name = "_QFtest_assumed_rank_optionalEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, internal_assoc>, uniq_name = "_QFtest_assumed_rank_optionalEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.class<!fir.array<*:none>>>
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.class<!fir.array<*:none>>>>, i32) -> !fir.ref<!fir.class<!fir.array<*:none>>>
@@ -107,7 +107,7 @@ subroutine internal()
 ! CHECK-LABEL: func.func @_QPtest_assumed_rank_ptr(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-!
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_assumed_rank_ptrEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_assumed_rank_ptrEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>> diff --git a/flang/test/Lower/HLFIR/binary-ops.f90 b/flang/test/Lower/HLFIR/binary-ops.f90 index b7695a761a0b8..f4e1643dd78b6 100644 --- a/flang/test/Lower/HLFIR/binary-ops.f90 +++ b/flang/test/Lower/HLFIR/binary-ops.f90 @@ -281,8 +281,8 @@ subroutine cmp_char(l, x, y) l = x .eq. y end subroutine ! CHECK-LABEL: func.func @_QPcmp_char( -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_4:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcmp_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_6:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcmp_charEy"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_8:.*]] = hlfir.cmpchar eq %[[VAL_5]]#0 %[[VAL_7]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>) -> i1 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index e161884697161..3a9fb784fbacd 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -17,7 +17,7 @@ subroutine test(x) bind(c) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> ! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {{.*}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) ! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) -> !fir.ref<i32> ! CHECK: fir.call @_QPuse_it(%[[VAL_3]]) fastmath<contract> : (!fir.ref<i32>) -> () ! CHECK: return @@ -29,7 +29,7 @@ subroutine call_it(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_byvalPcall_it( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {{.*}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>, !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>> ! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs<bind_c> fastmath<contract> : (!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>) -> () ! CHECK: return diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 index 2cb9d7ce93b96..aa3c842b3fe3d 100644 --- a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 +++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 @@ -23,7 +23,7 @@ subroutine test_char_1(x) call takes_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_1( -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_9:.*]] = fir.shift %[[VAL_8]], %[[VAL_8]] : (index, index) -> !fir.shift<2> @@ -56,7 +56,7 @@ subroutine test_char_copy_in_copy_out(x) call takes_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_copy_in_copy_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,?>>>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index @@ -91,7 +91,7 @@ subroutine test_char_assumed_size(x) call takes_char_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_char_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.char<1,?>>>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1) ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]] = fir.shift %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shift<2> @@ -123,7 +123,7 @@ subroutine test_optional_char(x) call takes_optional_char(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMbindc_seq_assocPtest_optional_char( -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> i1 ! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_9:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) { @@ -186,7 +186,7 @@ subroutine test_poly_1(x) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<10x20xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -214,7 +214,7 @@ subroutine test_poly_copy_in_copy_out(x) call takes_poly(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_copy_in_copy_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -244,7 +244,7 @@ subroutine test_poly_assumed_size(x) call takes_poly_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_poly_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : i64 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64 @@ -271,7 +271,7 @@ subroutine test_optional_poly(x) call takes_optional_poly(x, 100) end subroutine ! CHECK-LABEL: func.func @_QMpoly_seq_assocPtest_optional_poly( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>, !fir.dscope) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>) ! 
CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.class<!fir.array<10x20xnone>>) -> i1 ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : i32 ! CHECK: %[[VAL_4:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<10x20xnone>>) { diff --git a/flang/test/Lower/HLFIR/calls-array-results.f90 b/flang/test/Lower/HLFIR/calls-array-results.f90 index 425969e0b1bf2..6bc8090de7a49 100644 --- a/flang/test/Lower/HLFIR/calls-array-results.f90 +++ b/flang/test/Lower/HLFIR/calls-array-results.f90 @@ -72,7 +72,7 @@ subroutine dispatch_test(x, a) ! CHECK-LABEL: func.func @_QParg_test( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarg_testEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarg_testEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index @@ -106,10 +106,10 @@ subroutine dispatch_test(x, a) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.type<_QMtype_defsTt>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEa"} : (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.class<!fir.type<_QMtype_defsTt>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFdispatch_testEa"} : (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QMtype_defsTt>>, !fir.class<!fir.type<_QMtype_defsTt>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFdispatch_testEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i64 ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64 diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90 index 102f31565f041..9bf150d805d99 100644 --- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 +++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90 @@ -12,7 +12,7 @@ subroutine takes_assumed(x) call takes_assumed(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_assumed_to_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_assumed_to_assumedEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed(%[[VAL_1]]#0) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> () subroutine test_ptr_to_assumed(p) @@ -25,7 +25,7 @@ subroutine takes_assumed(x) call takes_assumed(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>> ! CHECK: fir.call @_QPtakes_assumed(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xf32>>) -> () @@ -40,7 +40,7 @@ subroutine takes_contiguous_assumed(x) call takes_contiguous_assumed(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumedEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] to %[[TMP_BOX:.*]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xf32>> @@ -57,7 +57,7 @@ subroutine takes_contiguous_assumed_classstar(x) call takes_contiguous_assumed_classstar(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_contiguous_assumed_classstar( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_contiguous_assumed_classstarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_2]] to %[[TMP_BOX:.*]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.ptr<!fir.array<?xf32>>>, i1) ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_3]]#0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.class<!fir.array<?xnone>> @@ -74,7 +74,7 @@ subroutine takes_assumed_typestar(x) call takes_assumed_typestar(p) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_to_assumed_typestar( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ptr_to_assumed_typestarEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<!fir.array<?xnone>> ! CHECK: fir.call @_QPtakes_assumed_typestar(%[[VAL_3]]) {{.*}} : (!fir.box<!fir.array<?xnone>>) -> () @@ -94,7 +94,7 @@ subroutine takes_assumed_character(x) ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5:[a-z0-9]*]]) typeparams %[[VAL_2:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_char_to_boxEe"} : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.ref<!fir.array<20x!fir.char<1,10>>>) ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<20x!fir.char<1,10>>>, !fir.shape<1>) -> !fir.box<!fir.array<20x!fir.char<1,10>>> ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<20x!fir.char<1,10>>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>> ! CHECK: fir.call @_QPtakes_assumed_character(%[[VAL_8]]) {{.*}} : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> () @@ -109,7 +109,7 @@ subroutine takes_explicit_by_value(x) call takes_explicit_by_value(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_explicit_by_val( -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_by_valEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) ! CHECK: %[[VAL_4:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<10xf32>>) -> !hlfir.expr<10xf32> ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]](%[[VAL_2]]) {adapt.valuebyref} : (!hlfir.expr<10xf32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>, i1) ! CHECK: fir.call @_QPtakes_explicit_by_value(%[[VAL_5]]#0) {{.*}} : (!fir.ref<!fir.array<10xf32>>) -> () diff --git a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 index f41b9cd0a0bbe..2186bda79a46f 100644 --- a/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 +++ b/flang/test/Lower/HLFIR/calls-constant-expr-arg.f90 @@ -18,7 +18,7 @@ end subroutine sub ! CHECK-LABEL: func.func @_QPsub( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsubEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -26,7 +26,7 @@ end subroutine sub ! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : index ! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsubEi"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/HLFIR/calls-f77.f90 b/flang/test/Lower/HLFIR/calls-f77.f90 index 450f8811eb5e0..97d2307beeb06 100644 --- a/flang/test/Lower/HLFIR/calls-f77.f90 +++ b/flang/test/Lower/HLFIR/calls-f77.f90 @@ -19,7 +19,7 @@ subroutine call_int_arg_var(n) end subroutine ! CHECK-LABEL: func.func @_QPcall_int_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_int_arg_varEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPtake_i4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<i32>) -> () subroutine call_int_arg_expr() @@ -46,7 +46,7 @@ subroutine call_real_arg_var(x) end subroutine ! CHECK-LABEL: func.func @_QPcall_real_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_real_arg_varEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: fir.call @_QPtake_r4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<f32>) -> () subroutine call_logical_arg_var(x) @@ -55,7 +55,7 @@ subroutine call_logical_arg_var(x) end subroutine ! CHECK-LABEL: func.func @_QPcall_logical_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_logical_arg_varEx"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: fir.call @_QPtake_l4(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () subroutine call_logical_arg_expr() @@ -85,7 +85,7 @@ subroutine call_char_arg_var(x) ! CHECK-LABEL: func.func @_QPcall_char_arg_var( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_char_arg_varEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: fir.call @_QPtake_c(%[[VAL_2]]#0) fastmath<contract> : (!fir.boxchar<1>) -> () subroutine call_char_arg_var_expr(x) @@ -95,7 +95,7 @@ subroutine call_char_arg_var_expr(x) ! CHECK-LABEL: func.func @_QPcall_char_arg_var_expr( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_char_arg_var_exprEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_3:.*]] = arith.addi %[[VAL_1]]#1, %[[VAL_1]]#1 : index ! 
CHECK: %[[VAL_4:.*]] = hlfir.concat %[[VAL_2]]#0, %[[VAL_2]]#0 len %[[VAL_3]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>> ! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] typeparams %[[VAL_3]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1) @@ -111,7 +111,7 @@ subroutine call_arg_array_var(n) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcall_arg_array_varEn"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) ! CHECK: fir.call @_QPtake_arr(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<!fir.array<10x20xi32>>) -> () subroutine call_arg_array_2(n) @@ -120,7 +120,7 @@ subroutine call_arg_array_2(n) end subroutine ! CHECK-LABEL: func.func @_QPcall_arg_array_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, optional>, uniq_name = "_QFcall_arg_array_2En"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.ref<!fir.array<?x?xi32>> ! CHECK: fir.call @_QPtake_arr_2(%[[VAL_2]]) fastmath<contract> : (!fir.ref<!fir.array<?x?xi32>>) -> () diff --git a/flang/test/Lower/HLFIR/calls-optional.f90 b/flang/test/Lower/HLFIR/calls-optional.f90 index 76e1d1364047b..d8d3bfe4f4e5f 100644 --- a/flang/test/Lower/HLFIR/calls-optional.f90 +++ b/flang/test/Lower/HLFIR/calls-optional.f90 @@ -14,7 +14,7 @@ subroutine takes_optional_explicit(x) call takes_optional_explicit(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_copy_in_out( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_3:.*]]:3 = fir.if %[[VAL_2]] -> (!fir.ref<!fir.array<?xf32>>, i1, !fir.box<!fir.array<?xf32>>) { ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.box<!fir.array<?xf32>>, i1) @@ -39,7 +39,7 @@ subroutine takes_optional_explicit_value(x) call takes_optional_explicit_value(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_value_copy( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]](%[[VAL_2:[a-z0-9]*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_value_copyEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1 ! CHECK: %[[VAL_5:.*]]:3 = fir.if %[[VAL_4]] -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>, i1) { ! CHECK: %[[VAL_6:.*]] = hlfir.as_expr %[[VAL_3]]#0 : (!fir.ref<!fir.array<100xf32>>) -> !hlfir.expr<100xf32> @@ -65,8 +65,8 @@ elemental subroutine elem_takes_two_optional(x, y) call elem_takes_two_optional(x, y) end subroutine ! CHECK-LABEL: func.func @_QPelem_pointer_to_optional( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFelem_pointer_to_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFelem_pointer_to_optionalEy"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?xf32>>) -> i64 @@ -104,7 +104,7 @@ elemental subroutine elem_takes_one_optional(x) call elem_takes_one_optional(x) end subroutine ! CHECK-LABEL: func.func @_QPoptional_cannot_be_absent_optional( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_cannot_be_absent_optionalEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index @@ -124,8 +124,8 @@ elemental subroutine elem_optional_poly(x, y) call elem_optional_poly(x, y) end subroutine ! CHECK-LABEL: func.func @_QPoptional_elem_poly( -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFoptional_elem_polyEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFoptional_elem_polyEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_3]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) diff --git a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 index d15029557d3b1..bb5591e538334 100644 --- a/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 +++ b/flang/test/Lower/HLFIR/calls-percent-val-ref.f90 @@ -7,7 +7,7 @@ subroutine test_val_1(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_val_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_val_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: fir.call @_QPval1(%[[VAL_2]]) fastmath<contract> : (i32) -> () @@ -17,7 +17,7 @@ subroutine test_val_2(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_val_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<complex<f32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.ref<!fir.box<!fir.heap<complex<f32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_val_2Ex"} : (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<complex<f32>>>>, !fir.ref<!fir.box<!fir.heap<complex<f32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<complex<f32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.heap<complex<f32>>>) -> !fir.heap<complex<f32>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.heap<complex<f32>> @@ -32,7 +32,7 @@ subroutine test_ref_char(x) ! CHECK-LABEL: func.func @_QPtest_ref_char( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_ref_charEx"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_2]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: fir.call @_QPref_char(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<!fir.char<1,?>>) -> () @@ -42,7 +42,7 @@ subroutine test_ref_1(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_ref_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_ref_1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPref1(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<i32>) -> () subroutine test_ref_2(x) @@ -51,7 +51,7 @@ subroutine test_ref_2(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_ref_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<complex<f32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.ref<!fir.box<!fir.ptr<complex<f32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_ref_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<complex<f32>>>>, !fir.ref<!fir.box<!fir.ptr<complex<f32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<complex<f32>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<complex<f32>>>) -> !fir.ptr<complex<f32>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ptr<complex<f32>>) -> !fir.ref<complex<f32>> @@ -63,7 +63,7 @@ subroutine test_skip_copy_in_out(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_skip_copy_in_out( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_skip_copy_in_outEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<?xf32>>) -> i64 ! CHECK: fir.call @_QPval3(%[[VAL_3]]) fastmath<contract> : (i64) -> () diff --git a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 index d607e7422a31f..24d7ca9aee494 100644 --- a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 +++ b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 @@ -12,7 +12,7 @@ subroutine assumed_type_assumed_size(x) call assumed_type_assumed_size(x) end subroutine ! CHECK-LABEL: func.func @_QPpass_poly_to_assumed_type_assumed_size( -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFpass_poly_to_assumed_type_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 to %[[TMP_BOX:.*]] : (!fir.class<!fir.array<?x?xnone>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> (!fir.class<!fir.array<?x?xnone>>, i1) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?xnone>> diff --git a/flang/test/Lower/HLFIR/charconvert.f90 b/flang/test/Lower/HLFIR/charconvert.f90 index 45b0f356617a0..f4cd3b17fee41 100644 --- a/flang/test/Lower/HLFIR/charconvert.f90 +++ b/flang/test/Lower/HLFIR/charconvert.f90 @@ -13,7 +13,7 @@ end subroutine callee end subroutine charconvert1 ! CHECK-LABEL: func.func @_QPcharconvert1 -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert1Ec"} : (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<4,?>>>, !fir.box<!fir.array<?x!fir.char<4,?>>>) ! CHECK: ^bb0(%[[ARG2:.*]]: index): ! CHECK: %[[VAL_37:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.box<!fir.array<?x!fir.char<4,?>>>) -> index ! CHECK: %[[C4_4:.*]] = arith.constant 4 : index @@ -36,7 +36,7 @@ end subroutine charconvert2 ! CHECK: %[[C1:.*]] = arith.constant 1 : index ! 
CHECK: %[[VAL_1:.*]] = fir.alloca !fir.char<4> {bindc_name = "cx", uniq_name = "_QFcharconvert2Ecx"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[C1]] {uniq_name = "_QFcharconvert2Ecx"} : (!fir.ref<!fir.char<4>>, index) -> (!fir.ref<!fir.char<4>>, !fir.ref<!fir.char<4>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFcharconvert2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> i8 @@ -56,11 +56,11 @@ subroutine charconvert3(c, c4) end subroutine ! CHECK-LABEL: func.func @_QPcharconvert3 -! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> +! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {{.*}}, %[[ARG1:.*]]: !fir.boxchar<4> ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcharconvert3Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcharconvert3Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_0]]#1, %[[VAL_0]]#1 : index ! CHECK: %[[VAL_5:.*]] = hlfir.concat %[[VAL_1]]#0, %[[VAL_1]]#0 len %[[VAL_4]] : (!fir.boxchar<1>, !fir.boxchar<1>, index) -> !hlfir.expr<!fir.char<1,?>> ! CHECK: %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_5]] typeparams %[[VAL_4]] {adapt.valuebyref} : (!hlfir.expr<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>, i1) diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 index 0e219e30ec94b..7af4418d2db96 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind10.f90 @@ -11,9 +11,9 @@ ! CHECK-LABEL: @_QPdiv_test_extended ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f80>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f80>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f80>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEa"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEb"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_extendedEc"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEa"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEb"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_extendedEc"} : (!fir.ref<complex<f80>>, !fir.dscope) -> (!fir.ref<complex<f80>>, !fir.ref<complex<f80>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f80>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f80>> diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 index fe4a7256a4a16..e732221fa6d50 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir-kind16.f90 @@ -12,9 +12,9 @@ ! CHECK-LABEL: @_QPdiv_test_quad ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f128>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f128>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f128>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEa"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEb"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_quadEc"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEa"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEb"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_quadEc"} : (!fir.ref<complex<f128>>, !fir.dscope) -> (!fir.ref<complex<f128>>, !fir.ref<complex<f128>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f128>> ! 
CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f128>> diff --git a/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 b/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 index b488bfde4ee85..f9d7f8fba9142 100644 --- a/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 +++ b/flang/test/Lower/HLFIR/complex-div-to-hlfir.f90 @@ -11,9 +11,9 @@ ! CHECK-LABEL: @_QPdiv_test_half ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f16>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f16>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f16>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEa"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEb"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_halfEc"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEa"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEb"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_halfEc"} : (!fir.ref<complex<f16>>, !fir.dscope) -> (!fir.ref<complex<f16>>, !fir.ref<complex<f16>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f16>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f16>> ! CHECK: %[[VAL_9:.*]] = complex.div %[[VAL_7]], %[[VAL_8]] fastmath<contract> : complex<f16> @@ -28,9 +28,9 @@ end subroutine div_test_half ! CHECK-LABEL: @_QPdiv_test_bfloat ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<bf16>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<bf16>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<bf16>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEa"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEb"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_bfloatEc"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEa"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEb"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_bfloatEc"} : (!fir.ref<complex<bf16>>, !fir.dscope) -> (!fir.ref<complex<bf16>>, !fir.ref<complex<bf16>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<bf16>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<bf16>> ! CHECK: %[[VAL_9:.*]] = complex.div %[[VAL_7]], %[[VAL_8]] fastmath<contract> : complex<bf16> @@ -45,9 +45,9 @@ end subroutine div_test_bfloat ! CHECK-LABEL: @_QPdiv_test_single ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f32>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f32>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f32>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEa"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEb"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_singleEc"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEa"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEb"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_singleEc"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f32>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f32>> @@ -71,9 +71,9 @@ end subroutine div_test_single ! CHECK-LABEL: @_QPdiv_test_double ! CHECK-SAME: %[[REF_0:.*]]: !fir.ref<complex<f64>> {{.*}}, %[[REF_1:.*]]: !fir.ref<complex<f64>> {{.*}}, %[[REF_2:.*]]: !fir.ref<complex<f64>> {{.*}}) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEa"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEb"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFdiv_test_doubleEc"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[REF_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEa"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[REF_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEb"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[REF_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFdiv_test_doubleEc"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<complex<f64>> ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<complex<f64>> diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 index ef9c12102a561..b34dc8dfd32b0 100644 --- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 +++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 @@ -7,7 +7,7 @@ subroutine test_int_allocatable(a) end subroutine test_int_allocatable ! CHECK-LABEL: func.func @_QPtest_int_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "a"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8> @@ -27,7 +27,7 @@ subroutine test_int_pointer(p) end subroutine test_int_pointer ! CHECK-LABEL: func.func @_QPtest_int_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "p"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) ! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32 ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8> @@ -49,7 +49,7 @@ end subroutine test_char_allocatable ! CHECK-LABEL: func.func @_QPtest_char_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_1:.*]] = arith.constant 11 : index -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_allocatableEi"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,11>>>> @@ -86,7 +86,7 @@ end subroutine test_char_pointer ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_char_pointerEi"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 11 : index -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,11>>>> ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.char<1,11>>>) -> !fir.ptr<!fir.char<1,11>> ! CHECK: %[[VAL_3B:.*]] = arith.constant 11 : index @@ -120,7 +120,7 @@ end subroutine test_dyn_char_allocatable ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> ! CHECK: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_dyn_char_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_dyn_char_allocatableEi"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest_dyn_char_allocatableEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> @@ -157,7 +157,7 @@ end subroutine test_dyn_char_pointer ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_dyn_char_pointerEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_dyn_char_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index @@ -201,7 +201,7 @@ end subroutine test_derived_allocatable ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> ! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_allocatableEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>>>) -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_derived_allocatableEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_allocatableEr"} ! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_allocatableTt>> ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_allocatableTt>>> @@ -241,7 +241,7 @@ end subroutine test_derived_pointer ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> ! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>> ! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_derived_pointerEa2"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>>>) -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_derived_pointerEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> {bindc_name = "r", uniq_name = "_QFtest_derived_pointerEr"} ! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap<!fir.type<_QFtest_derived_pointerTt>> ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.heap<!fir.type<_QFtest_derived_pointerTt>>) -> !fir.class<!fir.heap<!fir.type<_QFtest_derived_pointerTt>>> diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 index d2931eabb75c2..8c7a8c483f454 100644 --- a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 +++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 @@ -52,7 +52,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_intrinsic( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> () ! CHECK: return ! CHECK: } @@ -61,12 +61,12 @@ subroutine test_assumed_length_alloc(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: fir.call @_QPtakes_char(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> () ! CHECK: return ! CHECK: } @@ -74,7 +74,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_character_assumed_len( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: fir.call @_QPtakes_char(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> () ! CHECK: return ! CHECK: } @@ -82,27 +82,27 @@ subroutine test_assumed_length_alloc(x) ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_with_attrs( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.optional, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> () ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_simple_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_simple_allocatableEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_simple_allocatableEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: return ! CHECK: } ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_simple_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMassumed_rank_testsFtest_simple_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMassumed_rank_testsFtest_simple_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) ! CHECK: return ! CHECK: } ! CHECK-LABEL: func.func @_QMassumed_rank_testsPtest_intentout( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable, intent_out>, uniq_name = "_QMassumed_rank_testsFtest_intentoutEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_out>, uniq_name = "_QMassumed_rank_testsFtest_intentoutEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64 @@ -122,7 +122,7 @@ subroutine test_assumed_length_alloc(x) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>) -> index -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_assumed_length_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMassumed_rank_testsFtest_assumed_length_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: return ! CHECK: } end module diff --git a/flang/test/Lower/HLFIR/convert-variable-block.f90 b/flang/test/Lower/HLFIR/convert-variable-block.f90 index dad6bc14fbdb5..ba988bcdcee17 100644 --- a/flang/test/Lower/HLFIR/convert-variable-block.f90 +++ b/flang/test/Lower/HLFIR/convert-variable-block.f90 @@ -12,7 +12,7 @@ subroutine test(n) end subroutine ! CHECK-LABEL: func.func @_QPtest( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtestEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtestEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: fir.call @_QPbefore_block() {{.*}}: () -> () ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index diff --git a/flang/test/Lower/HLFIR/convert-variable.f90 b/flang/test/Lower/HLFIR/convert-variable.f90 index 07b91d0f34a07..b9fda640182d4 100644 --- a/flang/test/Lower/HLFIR/convert-variable.f90 +++ b/flang/test/Lower/HLFIR/convert-variable.f90 @@ -6,7 +6,7 @@ subroutine scalar_numeric(x) end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_numericEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) subroutine scalar_character(c) character(*) :: c @@ -14,7 +14,7 @@ subroutine scalar_character(c) ! CHECK-LABEL: func.func @_QPscalar_character( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_2:.*]] = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) subroutine scalar_character_cst_len(c) character(10) :: c @@ -24,7 +24,7 @@ subroutine scalar_character_cst_len(c) ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>> ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index -! 
CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_3]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFscalar_character_cst_lenEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) subroutine array_numeric(x) integer :: x(10, 20) @@ -34,7 +34,7 @@ subroutine array_numeric(x) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[VAL_4:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_numericEx"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) subroutine array_numeric_lbounds(x) @@ -47,7 +47,7 @@ subroutine array_numeric_lbounds(x) ! CHECK: %[[VAL_3:.*]] = arith.constant -2 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 23 : index ! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : (index, index, index, index) -> !fir.shapeshift<2> -! CHECK: %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>) +! CHECK: %[[VAL_6:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_numeric_lboundsEx"} : (!fir.ref<!fir.array<12x23xi32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<12x23xi32>>, !fir.ref<!fir.array<12x23xi32>>) subroutine array_character(c) character(*) :: c(50) @@ -58,14 +58,14 @@ subroutine array_character(c) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<50x!fir.char<1,?>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 50 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>) +! CHECK: %[[VAL_5:.*]] = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_characterEc"} : (!fir.ref<!fir.array<50x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<50x!fir.char<1,?>>>, !fir.ref<!fir.array<50x!fir.char<1,?>>>) subroutine scalar_numeric_attributes(x) integer, optional, target, intent(in) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> -! 
CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional, target>, uniq_name = "_QFscalar_numeric_attributesEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) subroutine scalar_numeric_attributes_2(x) integer, parameter :: rk = merge(16, 8, selected_real_kind(33, 4931)==16) @@ -76,22 +76,22 @@ subroutine scalar_numeric_attributes_2(x) ! F64-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100xf64>> ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! F128: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) -! F64: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf64>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf64>>, !fir.ref<!fir.array<100xf64>>) +! F128: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf128>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf128>>, !fir.ref<!fir.array<100xf128>>) +! F64: %[[VAL_3:.*]] = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFscalar_numeric_attributes_2Ex"} : (!fir.ref<!fir.array<100xf64>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf64>>, !fir.ref<!fir.array<100xf64>>) subroutine scalar_numeric_attributes_3(x) real, intent(in) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFscalar_numeric_attributes_3Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) subroutine scalar_numeric_attributes_4(x) logical(8), intent(out) :: x end subroutine ! CHECK-LABEL: func.func @_QPscalar_numeric_attributes_4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<8>> -! CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) +! 
CHECK: %[[VAL_1:.*]] = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFscalar_numeric_attributes_4Ex"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) subroutine scalar_numeric_parameter() integer, parameter :: p = 42 diff --git a/flang/test/Lower/HLFIR/cray-pointers.f90 b/flang/test/Lower/HLFIR/cray-pointers.f90 index 6a5a3d110849a..082aa1ef8c3f2 100644 --- a/flang/test/Lower/HLFIR/cray-pointers.f90 +++ b/flang/test/Lower/HLFIR/cray-pointers.f90 @@ -62,8 +62,8 @@ end subroutine test3 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "cp"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,11>>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ecp"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_8:.*]] = arith.constant 11 : index ! CHECK: %[[VAL_24:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> @@ -88,7 +88,7 @@ end subroutine test4 ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest4Ecp"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest4Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> @@ -153,7 +153,7 @@ end subroutine test6 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x!fir.char<1,?>>>> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i64 {bindc_name = "cp", uniq_name = "_QFtest6Ecp"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFtest6Ecp"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> @@ -379,7 +379,7 @@ subroutine internal() ! 
CHECK-LABEL: func.func @_QPtest_craypointer_capture( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.char<1,?>>> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_craypointer_captureEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "cray_pointer", uniq_name = "_QFtest_craypointer_captureEcray_pointer"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<internal_assoc>, uniq_name = "_QFtest_craypointer_captureEcray_pointer"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> diff --git a/flang/test/Lower/HLFIR/cshift.f90 b/flang/test/Lower/HLFIR/cshift.f90 index c3743068da4d7..64fd376e33f43 100644 --- a/flang/test/Lower/HLFIR/cshift.f90 +++ b/flang/test/Lower/HLFIR/cshift.f90 @@ -206,9 +206,9 @@ subroutine cshift11(a, s, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFcshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFcshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_9:.*]] = hlfir.cshift %[[VAL_4]]#0 %[[VAL_7]] dim %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, i32, i32) -> !hlfir.expr<?xi32> diff --git a/flang/test/Lower/HLFIR/custom-intrinsic.f90 b/flang/test/Lower/HLFIR/custom-intrinsic.f90 index 5ec6e0a17e9ac..4999eebf376e7 100644 --- a/flang/test/Lower/HLFIR/custom-intrinsic.f90 +++ b/flang/test/Lower/HLFIR/custom-intrinsic.f90 @@ -8,8 +8,8 @@ function max_simple(a, b) ! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "a"} ! CHECK-SAME: %[[B_ARG:.*]]: !fir.ref<i32> {fir.bindc_name = "b"} ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK-NEXT: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK-NEXT: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFmax_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ARG]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFmax_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[RES_ALLOC:.*]] = fir.alloca i32 {bindc_name = "max_simple", uniq_name = "_QFmax_simpleEmax_simple"} ! CHECK-NEXT: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOC]] {uniq_name = "_QFmax_simpleEmax_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[A_LD:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref<i32> @@ -30,9 +30,9 @@ function max_dynamic_optional_scalar(a, b, c) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 { -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar", uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_dynamic_optional_scalarEmax_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> @@ -63,10 +63,10 @@ function max_dynamic_optional_scalar2(a, b, c, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}, ! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 { -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "max_dynamic_optional_scalar2", uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"}
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmax_dynamic_optional_scalar2Emax_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -105,10 +105,10 @@ function max_array(a, b)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "max_array", uniq_name = "_QFmax_arrayEmax_array"}
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -138,13 +138,13 @@ function max_dynamic_optional_array(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmax_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmax_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "max_dynamic_optional_array", uniq_name = "_QFmax_dynamic_optional_arrayEmax_dynamic_optional_array"}
 ! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -181,8 +181,8 @@ function min_simple(a, b)
 ! CHECK-LABEL: func.func @_QPmin_simple(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) -> i32 {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_simpleEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_simpleEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "min_simple", uniq_name = "_QFmin_simpleEmin_simple"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmin_simpleEmin_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
@@ -203,9 +203,9 @@ function min_dynamic_optional_scalar(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalarEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalarEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar", uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"}
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmin_dynamic_optional_scalarEmin_dynamic_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
@@ -236,10 +236,10 @@ function min_dynamic_optional_scalar2(a, b, c, d)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "c", fir.optional},
 ! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref<i32> {fir.bindc_name = "d", fir.optional}) -> i32 {
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_scalar2Eb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ec"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_scalar2Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.alloca i32 {bindc_name = "min_dynamic_optional_scalar2", uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"}
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmin_dynamic_optional_scalar2Emin_dynamic_optional_scalar2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
@@ -278,10 +278,10 @@ function min_array(a, b)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "b"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_2:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_arrayEa"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_arrayEb"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "min_array", uniq_name = "_QFmin_arrayEmin_array"}
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
@@ -311,13 +311,13 @@ function min_dynamic_optional_array(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "c", fir.optional}) -> !fir.array<10xi32> {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmin_dynamic_optional_arrayEb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFmin_dynamic_optional_arrayEc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "min_dynamic_optional_array", uniq_name = "_QFmin_dynamic_optional_arrayEmin_dynamic_optional_array"}
 ! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
@@ -356,7 +356,7 @@ function associated_simple(pointer)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "pointer"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_simple", uniq_name = "_QFassociated_simpleEassociated_simple"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFassociated_simpleEassociated_simple"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_simpleEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<i32>) -> i64
@@ -379,8 +379,8 @@ function associated_target(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "target", fir.target}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_target", uniq_name = "_QFassociated_targetEassociated_target"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_targetEassociated_target"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_targetEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFassociated_targetEtarget"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_5]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -403,8 +403,8 @@ function associated_pointer(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_pointer", uniq_name = "_QFassociated_pointerEassociated_pointer"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_pointerEassociated_pointer"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEpointer"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_pointerEtarget"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<i32>>) -> !fir.box<none>
@@ -427,8 +427,8 @@ function associated_array(pointer, target)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "target"}) -> !fir.logical<4> {
 ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "associated_array", uniq_name = "_QFassociated_arrayEassociated_array"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFassociated_arrayEassociated_array"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEpointer"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFassociated_arrayEtarget"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>) -> !fir.box<none>
@@ -448,11 +448,11 @@ function ishftc_simple(i, shift, size)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size"}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_simple", uniq_name = "_QFishftc_simpleEishftc_simple"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_simpleEishftc_simple"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_simpleEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
@@ -499,11 +499,11 @@ function ishftc_dynamically_optional_scalar(i, shift, size)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "shift"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> i32 {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "ishftc_dynamically_optional_scalar", uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFishftc_dynamically_optional_scalarEishftc_dynamically_optional_scalar"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_scalarEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_scalarEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_7]]#0 : (!fir.ref<i32>) -> i1
@@ -558,17 +558,17 @@ function ishftc_array(i, shift, size)
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<42xi32>> {fir.bindc_name = "size"}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_array", uniq_name = "_QFishftc_arrayEishftc_array"}
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_arrayEishftc_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_11]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEshift"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_13:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_14]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_arrayEsize"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_16:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
 ! CHECK: ^bb0(%[[VAL_17:.*]]: index):
 ! CHECK: %[[VAL_18:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_17]]) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
@@ -625,13 +625,13 @@ function ishftc_dynamically_optional_array(i, shift, size)
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "size", fir.optional}) -> !fir.array<42xi32> {
 ! CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEi"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
 ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<42xi32> {bindc_name = "ishftc_dynamically_optional_array", uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"}
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_8]]) {uniq_name = "_QFishftc_dynamically_optional_arrayEishftc_dynamically_optional_array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFishftc_dynamically_optional_arrayEshift"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFishftc_dynamically_optional_arrayEsize"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]]#0 : (!fir.ref<i32>) -> i1
 ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
@@ -699,9 +699,9 @@ subroutine allocatables_test(a, b, c)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "b"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>> {fir.bindc_name = "c"}) {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatables_testEc"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xi32>>>>)
 ! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QFallocatables_testECnx) : !fir.ref<i32>
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QFallocatables_testECnx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QFallocatables_testECny) : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/designators-component-ref.f90 b/flang/test/Lower/HLFIR/designators-component-ref.f90
index 935176becac75..e6bb9c3095a85 100644
--- a/flang/test/Lower/HLFIR/designators-component-ref.f90
+++ b/flang/test/Lower/HLFIR/designators-component-ref.f90
@@ -350,7 +350,7 @@ subroutine test_scalar_array_complex_chain(a)
   type(t_complex) :: a
   print *, a%array_comp%im
 ! CHECK-LABEL: func.func @_QPtest_scalar_array_complex_chain(
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_scalar_array_complex_chainEa"} : (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>, !fir.ref<!fir.type<_QMcomp_refTt_complex{array_comp:!fir.array<10x20xcomplex<f32>>}>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_8:.*]] = arith.constant 20 : index
 ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index
@@ -389,13 +389,13 @@ end subroutine test_poly_array_vector_subscript
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>> {fir.bindc_name = "p"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "v"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<3xi32>> {fir.bindc_name = "r"}) {
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_poly_array_vector_subscriptEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index
 ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEr"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index
 ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_poly_array_vector_subscriptEv"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcomp_refTt1{scalar_i:i32,scalar_x:f32}>>>>>
 ! CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<3xi64> {
 ! CHECK: ^bb0(%[[VAL_12:.*]]: index):
diff --git a/flang/test/Lower/HLFIR/designators.f90 b/flang/test/Lower/HLFIR/designators.f90
index cb1cab334c9ae..6e7ee6dfffeec 100644
--- a/flang/test/Lower/HLFIR/designators.f90
+++ b/flang/test/Lower/HLFIR/designators.f90
@@ -7,8 +7,8 @@ subroutine array_ref(x, n)
   print *, x(n)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_refEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_refEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_9]]) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
@@ -17,8 +17,8 @@ subroutine char_array_ref(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_refEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK: %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_10]]) typeparams %[[VAL_9]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -28,9 +28,9 @@ subroutine char_array_ref_cst_len(x, n)
   print *, x(10)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_ref_cst_len(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_ref_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_10]]) typeparams %[[VAL_3]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, index) -> !fir.ref<!fir.char<1,5>>
@@ -41,7 +41,7 @@ subroutine array_section(x)
 ! CHECK-LABEL: func.func @_QParray_section(
 ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}}(%[[VAL_2]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_sectionEx"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
@@ -55,8 +55,8 @@ subroutine array_section_2(x, n)
   print *, x(n::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QParray_section_2(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_section_2En"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFarray_section_2Ex"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i64>
 ! CHECK: %[[VAL_10:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_11:.*]]:3 = fir.box_dims %[[VAL_3]]#1, %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
@@ -76,8 +76,8 @@ subroutine char_array_section(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_sectionEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_sectionEx"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
 ! CHECK: %[[VAL_9:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index
@@ -97,9 +97,9 @@ subroutine char_array_section_cst_len(x, n)
   print *, x(::3)
 end subroutine
 ! CHECK-LABEL: func.func @_QPchar_array_section_cst_len(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} typeparams %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_array_section_cst_lenEx"} : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,5>>>, !fir.box<!fir.array<?x!fir.char<1,5>>>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_4]]#1, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,5>>>, index) -> (index, index, index)
@@ -120,7 +120,7 @@ subroutine complex_imag_ref(x)
   print *, x%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_imag_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_imag_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0 imag shape %[[VAL_3]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
@@ -129,7 +129,7 @@ subroutine complex_real_ref(x)
   print *, x%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_real_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_real_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_4:.*]]#1 : (index) -> !fir.shape<1>
 ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_2]]#0 real shape %[[VAL_3]] : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
@@ -139,8 +139,8 @@ subroutine complex_individual_ref(x, n)
   print *, x(n)%im
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_individual_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_individual_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64
 ! CHECK: %[[VAL_6:.*]] = hlfir.designate %{{[0-9]+}}#0 (%[[VAL_5]]) imag : (!fir.box<!fir.array<?xcomplex<f32>>>, i64) -> !fir.ref<f32>
@@ -151,9 +151,9 @@ subroutine complex_slice_ref(x, start, end)
   print *, x(start:end)%re
 end subroutine
 ! CHECK-LABEL: func.func @_QPcomplex_slice_ref(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEend"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEstart"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFcomplex_slice_refEx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64
 ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/dot_product.f90 b/flang/test/Lower/HLFIR/dot_product.f90
index 2d3ee97b7e408..f36c314caf5aa 100644
--- a/flang/test/Lower/HLFIR/dot_product.f90
+++ b/flang/test/Lower/HLFIR/dot_product.f90
@@ -72,10 +72,10 @@ subroutine dot_product4(lhs, rhs, res)
 ! CHECK-NEXT: }
 ! CHECK-LABEL: func.func @_QPdot_product5
-! CHECK: %[[LHS:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[LHS:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFdot_product5Elhs"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[C3:.*]] = arith.constant 3 : index
 ! CHECK: %[[RHS_SHAPE:.*]] = fir.shape %[[C3]] : (index) -> !fir.shape<1>
-! CHECK: %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK: %[[RHS:.*]]:2 = hlfir.declare %{{.*}}(%[[RHS_SHAPE]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFdot_product5Erhs"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
 ! CHECK: {{.*}} = hlfir.dot_product %[[LHS]]#0 %[[RHS]]#0 {fastmath = #arith.fastmath<contract>} : (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<3xi32>>) -> i32
 subroutine dot_product5(lhs, rhs, res)
   integer :: lhs(:), rhs(3)
diff --git a/flang/test/Lower/HLFIR/dummy-arg-number.f90 b/flang/test/Lower/HLFIR/dummy-arg-number.f90
new file mode 100644
index 0000000000000..938cdcc7619b0
--- /dev/null
+++ b/flang/test/Lower/HLFIR/dummy-arg-number.f90
@@ -0,0 +1,53 @@
+! Test that dummy argument positions are tracked in hlfir.declare
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPsingle_arg(
+subroutine single_arg(n)
+  integer :: n
+  ! CHECK: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFsingle_argEn"}
+  print *, n
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmultiple_args(
+subroutine multiple_args(a, b, c)
+  integer :: a, b, c
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFmultiple_argsEa"}
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 2 {uniq_name = "_QFmultiple_argsEb"}
+  ! CHECK-DAG: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 3 {uniq_name = "_QFmultiple_argsEc"}
+  print *, a, b, c
+end subroutine
+
+! CHECK-LABEL: func.func @_QPchar_arg(
+subroutine char_arg(str)
+  character(len=5) :: str
+  ! CHECK: hlfir.declare %{{.*}} typeparams %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFchar_argEstr"}
+  print *, str
+end subroutine
+
+! CHECK-LABEL: func.func @_QParray_arg(
+subroutine array_arg(arr)
+  integer :: arr(:)
+  ! CHECK: hlfir.declare %{{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFarray_argEarr"}
+  print *, arr(1)
+end subroutine
+
+! Test that local variables do NOT get arg numbers
+! CHECK-LABEL: func.func @_QPlocal_var()
+subroutine local_var()
+  integer :: x
+  ! CHECK: hlfir.declare %{{[0-9]+}} {uniq_name = "_QFlocal_varEx"}
+  x = 10
+  print *, x
+end subroutine
+
+! Test mixed arguments and locals
+! CHECK-LABEL: func.func @_QPmixed(
+subroutine mixed(n)
+  integer :: n
+  integer :: local_x
+  ! CHECK-DAG: hlfir.declare %{{[0-9]+}} {uniq_name = "_QFmixedElocal_x"}
+  ! CHECK-DAG: hlfir.declare {{.*}} dummy_scope {{.*}} arg 1 {uniq_name = "_QFmixedEn"}
+  local_x = n + 1
+  print *, local_x
+end subroutine
+
diff --git a/flang/test/Lower/HLFIR/dummy-scope.f90 b/flang/test/Lower/HLFIR/dummy-scope.f90
index 4b1a3324c0777..da22318bc4d29 100644
--- a/flang/test/Lower/HLFIR/dummy-scope.f90
+++ b/flang/test/Lower/HLFIR/dummy-scope.f90
@@ -7,7 +7,7 @@ end subroutine sub_arg
 ! CHECK-LABEL: func.func @_QPsub_arg(
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsub_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsub_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: return
 ! CHECK: }
@@ -29,7 +29,7 @@ end function func_arg
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "func_arg", uniq_name = "_QFfunc_argEfunc_arg"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFfunc_argEfunc_arg"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFfunc_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFfunc_argEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 10450f6876c14..3a923b3c70ec5 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -166,9 +166,9 @@ end subroutine char_return
 ! CHECK: fir.store %[[VAL_7]] to %[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFchar_returnEl"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>)
 ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEx"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] typeparams %[[VAL_11]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFchar_returnEy"} : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,3>>>, !fir.box<!fir.array<?x!fir.char<1,3>>>)
 ! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_12]]#0, %[[VAL_13]] : (!fir.box<!fir.array<?x!fir.char<1,3>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]]#1 : (index) -> !fir.shape<1>
@@ -210,8 +210,8 @@ end subroutine polymorphic_parenthesis
 ! CHECK-LABEL: func.func @_QPpolymorphic_parenthesis(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFpolymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFpolymorphic_parenthesisEy"} : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, !fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFpolymorphic_parenthesisTt>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -234,8 +234,8 @@ end subroutine unlimited_polymorphic_parenthesis
 ! CHECK-LABEL: func.func @_QPunlimited_polymorphic_parenthesis(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFunlimited_polymorphic_parenthesisEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFunlimited_polymorphic_parenthesisEy"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>)
 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?xnone>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90 b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
index 36762d47100c5..7453ecaafd373 100644
--- a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
+++ b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
@@ -14,10 +14,10 @@ end subroutine test_polymorphic_merge
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>> {fir.bindc_name = "y"},
 ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>> {fir.bindc_name = "r"},
 ! CHECK-SAME: %[[VAL_3:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "m"}) {
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphic_mergeEm"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_mergeEr"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEx"} : (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.dscope) -> (!fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>, !fir.class<!fir.type<_QFtest_polymorphic_mergeTt>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphic_mergeEy"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_7]]#0, %[[VAL_8]] : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphic_mergeTt>>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/HLFIR/elemental-result-length.f90 b/flang/test/Lower/HLFIR/elemental-result-length.f90
index 9418a40537683..4cce2ce5e496e 100644
--- a/flang/test/Lower/HLFIR/elemental-result-length.f90
+++ b/flang/test/Lower/HLFIR/elemental-result-length.f90
@@ -18,11 +18,11 @@ subroutine sub2(a,b,c)
 ! CHECK-LABEL: func.func @_QMm1Psub2(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
 ! CHECK: %[[UNBOX_ARG0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[A:.*]]:2 = hlfir.declare %[[UNBOX_ARG0]]#0 typeparams %[[UNBOX_ARG0]]#1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Ea"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[UNBOX_ARG0]]#0 typeparams %[[UNBOX_ARG0]]#1 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Ea"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK: %[[UNBOX_ARG1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[B:.*]]:2 = hlfir.declare %[[UNBOX_ARG1]]#0 typeparams %[[UNBOX_ARG1]]#1 dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Eb"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[B:.*]]:2 = hlfir.declare %[[UNBOX_ARG1]]#0 typeparams %[[UNBOX_ARG1]]#1 dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub2Eb"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 ! CHECK: %[[UNBOX_ARG2:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOX_ARG2]]#0 typeparams %[[UNBOX_ARG2]]#1 dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+!
CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOX_ARG2]]#0 typeparams %[[UNBOX_ARG2]]#1 dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[UNBOX_A:.*]]:2 = fir.unboxchar %[[A]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[DUMMYA:.*]]:2 = hlfir.declare %[[UNBOX_A]]#0 typeparams %[[UNBOX_A]]#1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Ffct1Ea"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[UNBOX_B:.*]]:2 = fir.unboxchar %[[B]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) @@ -45,9 +45,9 @@ subroutine sub4(a,b,c) ! CHECK-LABEL: func.func @_QMm1Psub4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"}) { -! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Ea"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Eb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub4Ec"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Ea"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm1Fsub4Eb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{.*}} {{.*}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMm1Fsub4Ec"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) ! CHECK: %[[LEN_A:.*]] = fir.box_elesize %[[A]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index ! CHECK: %[[LEN_B:.*]] = fir.box_elesize %[[B]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index ! CHECK: %[[LEN_A_I32:.*]] = fir.convert %[[LEN_A]] : (index) -> i64 diff --git a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 index 1080c9dfd914b..95e74cda2c9ba 100644 --- a/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 +++ b/flang/test/Lower/HLFIR/elemental-user-procedure-ref.f90 @@ -111,7 +111,7 @@ impure elemental subroutine impure_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! 
CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFimpure_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] { ! CHECK: fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] { @@ -136,7 +136,7 @@ elemental subroutine ordered_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFordered_elementalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: fir.do_loop %[[VAL_6:.*]] = %[[VAL_5]] to %[[VAL_2]] step %[[VAL_5]] { ! CHECK: fir.do_loop %[[VAL_7:.*]] = %[[VAL_5]] to %[[VAL_1]] step %[[VAL_5]] { @@ -161,7 +161,7 @@ impure elemental subroutine impure_elem(a) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFimpure_elemental_arg_evalEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<2>) -> !hlfir.expr<10x20xf32> { ! CHECK: ^bb0(%[[VAL_6:.*]]: index, %[[VAL_7:.*]]: index): ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]], %[[VAL_7]]) : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32> diff --git a/flang/test/Lower/HLFIR/eoshift.f90 b/flang/test/Lower/HLFIR/eoshift.f90 index 8d541779a2569..25442aeeb5a67 100644 --- a/flang/test/Lower/HLFIR/eoshift.f90 +++ b/flang/test/Lower/HLFIR/eoshift.f90 @@ -169,8 +169,8 @@ subroutine eoshift9(a, s) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32> {fir.bindc_name = "s"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift9Ea"} : (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift9Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift9Ea"} : (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>, !fir.box<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift9Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQro._QMeoshift_typesTt.0) : !fir.ref<!fir.type<_QMeoshift_typesTt>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMeoshift_typesTt.0"} : (!fir.ref<!fir.type<_QMeoshift_typesTt>>) -> (!fir.ref<!fir.type<_QMeoshift_typesTt>>, !fir.ref<!fir.type<_QMeoshift_typesTt>>) @@ -190,8 +190,8 @@ subroutine eoshift10(a, s) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32> {fir.bindc_name = "s"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFeoshift10Ea"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift10Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFeoshift10Ea"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMeoshift_typesTt>>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift10Es"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQro._QMeoshift_typesTt.1) : !fir.ref<!fir.type<_QMeoshift_typesTt>> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMeoshift_typesTt.1"} : (!fir.ref<!fir.type<_QMeoshift_typesTt>>) -> (!fir.ref<!fir.type<_QMeoshift_typesTt>>, !fir.ref<!fir.type<_QMeoshift_typesTt>>) @@ -212,9 +212,9 @@ subroutine eoshift11(a, s, d) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "d"}) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Ea"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Ed"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFeoshift11Es"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_9:.*]] = hlfir.eoshift %[[VAL_4]]#0 %[[VAL_7]] dim %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, i32, i32) -> !hlfir.expr<?xi32> @@ -235,10 +235,10 @@ end subroutine eoshift12 ! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "boundary", fir.optional}, ! CHECK-SAME: %[[ARG3:.*]]: !fir.ref<i32> {fir.bindc_name = "dim"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Earray"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFeoshift12Eboundary"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Edim"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFeoshift12Eshift"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Earray"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFeoshift12Eboundary"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Edim"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFeoshift12Eshift"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_2]]#0 : (!fir.ref<f32>) -> i1 ! 
CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_2]]#0 : (!fir.ref<f32>) -> !fir.box<f32> ! CHECK: %[[VAL_7:.*]] = fir.absent !fir.box<f32> diff --git a/flang/test/Lower/HLFIR/expr-addr.f90 b/flang/test/Lower/HLFIR/expr-addr.f90 index 1f676172880fe..fae54ece30ea0 100644 --- a/flang/test/Lower/HLFIR/expr-addr.f90 +++ b/flang/test/Lower/HLFIR/expr-addr.f90 @@ -6,7 +6,7 @@ subroutine foo(x) integer :: x read (*,*) x - ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[x:.]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[x_cast:.*]] = fir.convert %[[x]]#0 : (!fir.ref<i32>) -> !fir.ref<i64> ! CHECK: fir.call @_FortranAioInputInteger(%{{.*}}, %[[x_cast]], %{{.*}}) {{.*}}: (!fir.ref<i8>, !fir.ref<i64>, i32) -> i1 end subroutine diff --git a/flang/test/Lower/HLFIR/expr-box.f90 b/flang/test/Lower/HLFIR/expr-box.f90 index f0de381c74575..4631d18e481bd 100644 --- a/flang/test/Lower/HLFIR/expr-box.f90 +++ b/flang/test/Lower/HLFIR/expr-box.f90 @@ -9,7 +9,7 @@ subroutine foo(x) ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 21 : index ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_3]], %[[VAL_4]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfooEx"} : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: fir.embox %[[VAL_6]]#1(%[[VAL_5]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<10xi32>> end subroutine diff --git a/flang/test/Lower/HLFIR/expr-value.f90 b/flang/test/Lower/HLFIR/expr-value.f90 index c692ec72bf7ef..e8e7f756f391c 100644 --- a/flang/test/Lower/HLFIR/expr-value.f90 +++ b/flang/test/Lower/HLFIR/expr-value.f90 @@ -11,7 +11,7 @@ subroutine foo() ! CHECK-LABEL: func.func @_QPfoo_designator( ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<i32> subroutine foo_designator(n) - !CHECK: %[[n:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK: %[[n:.*]]:2 = hlfir.declare %[[arg0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfoo_designatorEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) print *, n ! CHECK: %[[nval:.*]] = fir.load %[[n]]#0 : !fir.ref<i32> ! CHECK: fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[nval]]) {{.*}}: (!fir.ref<i8>, i32) -> i1 diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 index aeb5c2a8f14e7..84cceee723000 100644 --- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 +++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 @@ -49,7 +49,7 @@ subroutine test_logical_assumed_shape_array(x) end subroutine test_logical_assumed_shape_array ! CHECK-LABEL: func.func @_QPtest_logical_assumed_shape_array( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_logical_assumed_shape_arrayEx"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_2:.*]] = fir.rebox %[[VAL_1]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.class<!fir.array<?xnone>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> ! CHECK: fir.call @_QPcallee(%[[VAL_3]]) fastmath<contract> : (!fir.class<none>) -> () @@ -63,7 +63,7 @@ subroutine test_real_2d_pointer(x) end subroutine test_real_2d_pointer ! CHECK-LABEL: func.func @_QPtest_real_2d_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_real_2d_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>) -> !fir.class<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none> @@ -78,7 +78,7 @@ subroutine test_up_assumed_shape_1d_array(x) end subroutine test_up_assumed_shape_1d_array ! CHECK-LABEL: func.func @_QPtest_up_assumed_shape_1d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_up_assumed_shape_1d_arrayEx"} : (!fir.class<!fir.array<?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?xnone>>, !fir.class<!fir.array<?xnone>>) ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> ! CHECK: fir.call @_QPcallee(%[[VAL_2]]) fastmath<contract> : (!fir.class<none>) -> () ! CHECK: return @@ -115,7 +115,7 @@ subroutine test_up_allocatable_2d_array(x) end subroutine test_up_allocatable_2d_array ! CHECK-LABEL: func.func @_QPtest_up_allocatable_2d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "x"}) { -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_up_allocatable_2d_arrayEx"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.class<!fir.array<?x?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.class<none> @@ -130,7 +130,7 @@ subroutine test_up_pointer_1d_array(x) end subroutine test_up_pointer_1d_array ! CHECK-LABEL: func.func @_QPtest_up_pointer_1d_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_up_pointer_1d_arrayEx"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> ! CHECK: %[[VAL_3:.*]] = fir.rebox %[[VAL_2]] : (!fir.class<!fir.ptr<!fir.array<?xnone>>>) -> !fir.class<!fir.array<?xnone>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<none> diff --git a/flang/test/Lower/HLFIR/implicit-type-conversion.f90 b/flang/test/Lower/HLFIR/implicit-type-conversion.f90 index dc2d111a8f7f5..f55784e9883e0 100644 --- a/flang/test/Lower/HLFIR/implicit-type-conversion.f90 +++ b/flang/test/Lower/HLFIR/implicit-type-conversion.f90 @@ -3,8 +3,8 @@ ! CHECK-LABEL: func.func @_QPtest1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.ref<i32> @@ -19,8 +19,8 @@ end subroutine test1 ! CHECK-LABEL: func.func @_QPtest2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -35,8 +35,8 @@ end subroutine test2 ! CHECK-LABEL: func.func @_QPtest3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest3Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32 @@ -54,8 +54,8 @@ end subroutine test3 ! CHECK-LABEL: func.func @_QPtest4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ex"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest4Ey"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_5]] : i32 @@ -73,8 +73,8 @@ end subroutine test4 ! CHECK-LABEL: func.func @_QPtest5( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest5Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest5Ey"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.logical<4>) -> i32 ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_2]]#0 : i32, !fir.box<!fir.array<?xi32>> @@ -89,8 +89,8 @@ end subroutine test5 ! CHECK-LABEL: func.func @_QPtest6( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ex"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ey"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> (index, index, index) ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> @@ -114,8 +114,8 @@ end subroutine test6 ! CHECK-LABEL: func.func @_QPtest7( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "x"}, ! 
CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest7Ex"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest7Ey"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/index.f90 b/flang/test/Lower/HLFIR/index.f90 index a36027f4cf06f..84cdd580fc676 100644 --- a/flang/test/Lower/HLFIR/index.f90 +++ b/flang/test/Lower/HLFIR/index.f90 @@ -13,7 +13,7 @@ end subroutine t ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFtEn"} ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtEs"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_6:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -31,11 +31,11 @@ end subroutine t1 ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt1Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QQclX74686973) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_7]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX74686973"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -55,11 +55,11 @@ end subroutine t2 ! CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt2Ec"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>) ! CHECK: %[[VAL_7:.*]] = arith.constant false ! CHECK: %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<2>, !fir.boxchar<2>, i1) -> i32 ! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> @@ -75,11 +75,11 @@ end subroutine t3 ! CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt3Ec"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_7:.*]] = arith.constant true ! CHECK: %[[VAL_8:.*]] = hlfir.index %[[VAL_2]]#0 in %[[VAL_6]]#0 back %[[VAL_7]] : (!fir.boxchar<4>, !fir.boxchar<4>, i1) -> i8 ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i8) -> i32 @@ -100,12 +100,12 @@ end subroutine t4 ! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt4Ec1"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) ! CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_9]]) typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_9]]) typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFt4Ec2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.array<3xi8> {bindc_name = "n", uniq_name = "_QFt4En"} ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> @@ -137,9 +137,9 @@ end program test ! CHECK-SAME: %[[ARG2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "c", fir.optional}) attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFFsubEb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFFsubEc"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFFsubEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFFsubEb"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFFsubEc"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> i1 ! CHECK: %[[VAL_11:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_11]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index) -> (index, index, index) diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 index 8cb733addc649..4093452b17aae 100644 --- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 +++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 @@ -10,7 +10,7 @@ subroutine test_intentout_component_deallocate(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_intentout_component_deallocate( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QFtest_intentout_component_deallocateEa"} ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box<!fir.type<_QFtest_intentout_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<none> ! CHECK: fir.call @_FortranADestroy(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> () @@ -23,7 +23,7 @@ subroutine test_intentout_optional_component_deallocate(a) end subroutine ! CHECK-LABEL: func.func @_QPtest_intentout_optional_component_deallocate( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QFtest_intentout_optional_component_deallocateEa"} ! CHECK: %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> i1 ! CHECK: fir.if %[[VAL_2]] { ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>>) -> !fir.box<!fir.type<_QFtest_intentout_optional_component_deallocateTt{x:!fir.box<!fir.heap<i32>>}>> diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90 index f0e168a9f05f2..07e4ebc53f0a9 100644 --- a/flang/test/Lower/HLFIR/internal-procedures.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures.f90 @@ -70,7 +70,7 @@ subroutine internal() end subroutine ! CHECK-LABEL: func.func @_QPtest_proc_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, internal_assoc>, uniq_name = "_QFtest_proc_pointerEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca tuple<!fir.ref<!fir.boxproc<() -> ()>>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_4:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_3]] : (!fir.ref<tuple<!fir.ref<!fir.boxproc<() -> ()>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.boxproc<() -> ()>>> diff --git a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 index 683017579f681..79cc58d940711 100644 --- a/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-dynamically-optional.f90 @@ -166,10 +166,10 @@ function test_elemental_optional_as_value(real, imaginary) ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<3xf32>> {fir.bindc_name = "imaginary", fir.optional}) -> !fir.array<3xcomplex<f32>> { ! CHECK: %[[VAL_2:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_3]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_elemental_optional_as_valueEimaginary"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_elemental_optional_as_valueEreal"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.array<3xcomplex<f32>> {bindc_name = "test_elemental_optional_as_value", uniq_name = "_QFtest_elemental_optional_as_valueEtest_elemental_optional_as_value"} ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 index 85931262b5892..61cc743ec9c59 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-default-init.f90 @@ -1,5 +1,5 @@ ! Test procedure pointer component default initialization when the size -! of the derived type is 32 bytes and larger. +! of the derived type is 32 bytes and larger. ! RUN: bbc -emit-hlfir -o - %s | FileCheck %s interface diff --git a/flang/test/Lower/HLFIR/procedure-pointer.f90 b/flang/test/Lower/HLFIR/procedure-pointer.f90 index 053c44f56ed31..75e81c165d25b 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer.f90 @@ -11,7 +11,7 @@ real function real_func(x) real :: x end function character(:) function char_func(x) - pointer :: char_func + pointer :: char_func integer :: x end function subroutine sub(x) @@ -148,7 +148,7 @@ subroutine sub5() use m procedure(real), pointer :: p3 - p3 => real_func + p3 => real_func ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> f32> {bindc_name = "p3", uniq_name = "_QFsub5Ep3"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> f32 ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> f32) -> !fir.boxproc<() -> f32> @@ -165,7 +165,7 @@ subroutine sub6() procedure(), pointer :: p4 real :: r - p4 => sub + p4 => sub ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<() -> ()> {bindc_name = "p4", uniq_name = "_QFsub6Ep4"} ! CHECK: %[[VAL_1:.*]] = fir.zero_bits () -> () ! CHECK: %[[VAL_2:.*]] = fir.emboxproc %[[VAL_1]] : (() -> ()) -> !fir.boxproc<() -> ()> @@ -186,10 +186,10 @@ subroutine sub6() subroutine sub7(p1, p2) use m procedure(real_func), pointer :: p1 -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! 
CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep1"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) procedure(char_func), pointer :: p2 -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub7Ep2"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) call foo1(p1) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]#0 : !fir.ref<!fir.boxproc<() -> ()>> @@ -197,7 +197,7 @@ subroutine sub7(p1, p2) call foo2(p2) ! CHECK: fir.call @_QPfoo2(%[[VAL_1]]#0) fastmath<contract> : (!fir.ref<!fir.boxproc<() -> ()>>) -> () -end +end subroutine sub8() use m @@ -265,7 +265,7 @@ subroutine sub10() function reffunc(arg) result(pp) integer :: arg procedure(real_func), pointer :: pp -! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsub10FreffuncEarg"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.boxproc<(!fir.ref<f32>) -> f32> {bindc_name = "pp", uniq_name = "_QFsub10FreffuncEpp"} ! CHECK: %[[VAL_2:.*]] = fir.zero_bits (!fir.ref<f32>) -> f32 ! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<(!fir.ref<f32>) -> f32> @@ -338,7 +338,7 @@ subroutine sub12() ! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>, !fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]]#0 : (!fir.ref<!fir.boxproc<(!fir.ref<i32>) -> !fir.box<!fir.ptr<!fir.char<1,?>>>>>) -> !fir.ref<!fir.boxproc<() -> ()>> ! CHECK: fir.call @_QPfoo2(%[[VAL_17]]) fastmath<contract> : (!fir.ref<!fir.boxproc<() -> ()>>) -> () -end +end subroutine test_opt_pointer() interface diff --git a/flang/test/Lower/HLFIR/reshape.f90 b/flang/test/Lower/HLFIR/reshape.f90 index 8bf3cfd08c6ac..83072d33d6052 100644 --- a/flang/test/Lower/HLFIR/reshape.f90 +++ b/flang/test/Lower/HLFIR/reshape.f90 @@ -49,7 +49,7 @@ end subroutine reshape_test_nopad ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsh"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare {{.*}}{uniq_name = "_QFreshape_test_nopadEsource"} : (!fir.box<!fir.array<?x?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xi32>>, !fir.box<!fir.array<?x?x?xi32>>) ! 
CHECK: %[[VAL_13:.*]] = hlfir.reshape %[[VAL_11]]#0 %[[VAL_10]]#0 order %[[VAL_7]]#0 : (!fir.box<!fir.array<?x?x?xi32>>, !fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>) -> !hlfir.expr<?x?xi32> - + subroutine test_reshape_optional1(pad, order, source, shape) real, pointer :: pad(:, :) integer, pointer :: order(:) diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90 index f1f968decd412..4e05926bcd24f 100644 --- a/flang/test/Lower/HLFIR/select-rank.f90 +++ b/flang/test/Lower/HLFIR/select-rank.f90 @@ -293,7 +293,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_single_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_single_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_single_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb3, ^bb1 @@ -312,7 +312,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_simple_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_simple_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 15 : i8 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 @@ -348,7 +348,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_rank_starEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -385,7 +385,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_renaming( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_renamingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_renamingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb3, ^bb1 @@ -405,7 +405,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_no_case( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_no_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_no_caseEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_3]], ^bb2, ^bb1 ! CHECK: ^bb1: @@ -418,7 +418,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star_attributes( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.asynchronous, fir.bindc_name = "x", fir.optional, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<asynchronous, optional, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<asynchronous, optional, target>, uniq_name = "_QFtest_rank_star_attributesEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 ! CHECK: cf.cond_br %[[VAL_4]], ^bb4, ^bb1 @@ -449,7 +449,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_rank_star_contiguous( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.contiguous, fir.target}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<contiguous, target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous, target>, uniq_name = "_QFtest_rank_star_contiguousEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! 
CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -497,12 +497,12 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_rank_star_contiguous_characterEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_rank_star_contiguous_characterEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QFtest_rank_star_contiguous_characterEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_11:.*]] = fir.is_assumed_size %[[VAL_8]]#0 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> i1 @@ -550,7 +550,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_simple_alloc( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_simple_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i8 @@ -583,7 +583,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_character_alloc( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_character_allocEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = fir.box_rank %[[VAL_2]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_4]] : i8 [#fir.point, %[[VAL_3]], ^bb2, unit, ^bb1] @@ -604,12 +604,12 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_explicit_character_ptrEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_explicit_character_ptrEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64> ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64 ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64 -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_explicit_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, i64, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_10:.*]] = fir.box_rank %[[VAL_8]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_10]] : i8 [#fir.point, %[[VAL_9]], ^bb2, unit, ^bb1] @@ -629,7 +629,7 @@ subroutine test_branching(x) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>) -> index -! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_assumed_character_ptrEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_6:.*]] = fir.box_rank %[[VAL_4]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:!fir.char<1,?>>>>>) -> i8 ! CHECK: fir.select_case %[[VAL_6]] : i8 [#fir.point, %[[VAL_5]], ^bb2, unit, ^bb1] @@ -647,7 +647,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_polymorphic( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphicEx"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.class<!fir.array<*:none>>) -> i1 @@ -677,8 +677,8 @@ subroutine test_branching(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x1"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x2"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_nested_select_rankEx1"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_nested_select_rankEx2"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i8 ! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_7:.*]] = fir.is_assumed_size %[[VAL_3]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 @@ -783,7 +783,7 @@ subroutine test_branching(x) ! CHECK-LABEL: func.func @_QPtest_branching( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_branchingEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i8 ! CHECK: %[[VAL_5:.*]] = fir.is_assumed_size %[[VAL_2]]#0 : (!fir.box<!fir.array<*:f32>>) -> i1 diff --git a/flang/test/Lower/HLFIR/statement-functions.f90 b/flang/test/Lower/HLFIR/statement-functions.f90 index 4f91c947690cc..86e3074b1453b 100644 --- a/flang/test/Lower/HLFIR/statement-functions.f90 +++ b/flang/test/Lower/HLFIR/statement-functions.f90 @@ -43,7 +43,7 @@ subroutine char_test2(c) call test(stmt_func(c)) end subroutine ! CHECK-LABEL: func.func @_QPchar_test2( -! CHECK: %[[C:.*]]:2 = hlfir.declare %{{.*}} typeparams %c10 dummy_scope %{{[0-9]+}} {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[C:.*]]:2 = hlfir.declare %{{.*}} typeparams %c10 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFchar_test2Ec"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) ! CHECK: %[[CAST:.*]] = fir.convert %[[C]]#0 : (!fir.ref<!fir.char<1,10>>) -> !fir.ref<!fir.char<1,5>> ! CHECK: %[[C_STMT_FUNC:.*]]:2 = hlfir.declare %[[CAST]] typeparams %c5{{.*}} {uniq_name = "_QFchar_test2Fstmt_funcEc_stmt_func"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) ! CHECK: hlfir.concat %[[C_STMT_FUNC]]#0, %{{.*}} len %{{.*}} : (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,7>>, index) -> !hlfir.expr<!fir.char<1,12>> diff --git a/flang/test/Lower/HLFIR/structure-constructor.f90 b/flang/test/Lower/HLFIR/structure-constructor.f90 index 5d5fe3379635e..094aeaa14f682 100644 --- a/flang/test/Lower/HLFIR/structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/structure-constructor.f90 @@ -43,7 +43,7 @@ end subroutine test1 ! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_6:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>, !fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) ! CHECK: %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>) -> !fir.box<!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>> ! 
CHECK: %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -71,7 +71,7 @@ end subroutine test2 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest2Eres"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>, !fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_7]]#0 : (!fir.ref<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>>) -> !fir.box<!fir.type<_QMtypesTt2{i:!fir.array<10xi32>}>> ! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -100,7 +100,7 @@ end subroutine test3 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest3Eres"} : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt3.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.box<!fir.type<_QMtypesTt3{r:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> ! CHECK: %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -133,7 +133,7 @@ end subroutine test4 ! 
CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt4.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>, !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: %[[VAL_10:.*]] = arith.constant 2 : index -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_10]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>>) ! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>, !fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) ! CHECK: %[[VAL_13:.*]] = fir.embox %[[VAL_12]]#0 : (!fir.ref<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>> ! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -172,7 +172,7 @@ end subroutine test5 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFtest5Eres"} : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt5.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_3]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> -! 
CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>, !fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]]#0 : (!fir.ref<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>>) -> !fir.box<!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>> ! CHECK: %[[VAL_13:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -214,12 +214,12 @@ end subroutine test6 ! CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_9:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest6Ec"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}> {bindc_name = "res", uniq_name = "_QFtest6Eres"} ! 
CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFtest6Eres"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt6.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> ! CHECK: fir.copy %[[ADDR]] to %[[VAL_12]]#0 no_overlap : !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> -! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) +! CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>>) ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>, !fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) ! 
CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]]#0 : (!fir.ref<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>>) -> !fir.box<!fir.type<_QMtypesTt6{t5:!fir.type<_QMtypesTt5{t5m:!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMtypesTt4{c:!fir.box<!fir.heap<!fir.array<?x!fir.char<1,2>>>>}>>>>}>,t6m:!fir.array<1x!fir.type<_QMtypesTt1{c:!fir.char<1,4>}>>}>> ! CHECK: %[[VAL_22:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>> @@ -299,7 +299,7 @@ end subroutine test7 ! CHECK-LABEL: func.func @_QPtest7( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest7En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> {bindc_name = "x", uniq_name = "_QFtest7Ex"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest7Ex"} : (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> (!fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QQ_QMtypesTt7.DerivedInit) : !fir.ref<!fir.type<_QMtypesTt7{c1:i32,c2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>> diff --git a/flang/test/Lower/HLFIR/transformational.f90 b/flang/test/Lower/HLFIR/transformational.f90 index 6be018402a3ee..7af563546c1d6 100644 --- a/flang/test/Lower/HLFIR/transformational.f90 +++ b/flang/test/Lower/HLFIR/transformational.f90 @@ -16,7 +16,7 @@ subroutine test_transformational_implemented_with_runtime_allocation(x) end subroutine ! CHECK-LABEL: func.func @_QPtest_transformational_implemented_with_runtime_allocation( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<10x10xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_transformational_implemented_with_runtime_allocationEx"} ! CHECK: %[[VAL_2:.*]] = hlfir.minloc %[[VAL_1]]#0 ! CHECK: %[[VAL_3:.*]] = hlfir.shape_of %[[VAL_2]] ! CHECK: %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]](%[[VAL_3]]) {adapt.valuebyref} diff --git a/flang/test/Lower/HLFIR/transpose.f90 b/flang/test/Lower/HLFIR/transpose.f90 index 6d8e337f1ac8b..823ecf4c1f836 100644 --- a/flang/test/Lower/HLFIR/transpose.f90 +++ b/flang/test/Lower/HLFIR/transpose.f90 @@ -8,8 +8,8 @@ subroutine transpose1(m, res) ! CHECK-LABEL: func.func @_QPtranspose1 ! CHECK: %[[M_ARG:.*]]: !fir.ref<!fir.array<1x2xi32>> ! CHECK: %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>> -! 
CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>) -! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) +! CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]](%[[M_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME:.*]]} : (!fir.ref<!fir.array<1x2xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<1x2xi32>>, !fir.ref<!fir.array<1x2xi32>>) +! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) ! CHECK: %[[EXPR:.*]] = hlfir.transpose %[[ARG]]#0 : (!fir.ref<!fir.array<1x2xi32>>) -> !hlfir.expr<2x1xi32> ! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[RES]]#0 ! CHECK-NEXT: hlfir.destroy %[[EXPR]] @@ -38,7 +38,7 @@ subroutine transpose3(m, res) ! CHECK: %[[M_ARG:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> ! CHECK: %[[RES_ARG:.*]]: !fir.ref<!fir.array<2x1xi32>> ! CHECK-DAG: %[[ARG:.*]]:2 = hlfir.declare %[[M_ARG]] -! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) +! CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[RES_ARG]](%[[RES_SHAPE:.*]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {[[NAME2:.*]]} : (!fir.ref<!fir.array<2x1xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x1xi32>>, !fir.ref<!fir.array<2x1xi32>>) ! CHECK: %[[ARG_LOADED:.*]] = fir.load %[[ARG]]#0 ! CHECK: %[[EXPR:.*]] = hlfir.transpose %[[ARG_LOADED]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>) -> !hlfir.expr<?x?xi32> ! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[RES]]#0 @@ -54,8 +54,8 @@ end subroutine test_polymorphic_result ! CHECK-LABEL: func.func @_QPtest_polymorphic_result( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "m"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> {fir.bindc_name = "res"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEm"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_polymorphic_resultEres"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> ! CHECK: %[[VAL_5:.*]] = hlfir.transpose %[[VAL_4]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !hlfir.expr<?x?xnone?> ! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 realloc : !hlfir.expr<?x?xnone?>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>> diff --git a/flang/test/Lower/HLFIR/trim.f90 b/flang/test/Lower/HLFIR/trim.f90 index d09e6c58cb012..9b630293fe70f 100644 --- a/flang/test/Lower/HLFIR/trim.f90 +++ b/flang/test/Lower/HLFIR/trim.f90 @@ -4,7 +4,7 @@ ! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFtrim_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtrim_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 8 : index ! CHECK-NEXT: %[[VAL_5:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "tc", uniq_name = "_QFtrim_testEtc"} ! CHECK-NEXT: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_4]] {uniq_name = "_QFtrim_testEtc"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>) diff --git a/flang/test/Lower/HLFIR/user-defined-assignment.f90 b/flang/test/Lower/HLFIR/user-defined-assignment.f90 index 5d58dca8183a7..2dffd92ca50f5 100644 --- a/flang/test/Lower/HLFIR/user-defined-assignment.f90 +++ b/flang/test/Lower/HLFIR/user-defined-assignment.f90 @@ -35,8 +35,8 @@ subroutine test_user_defined_elemental_array(i, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_elemental_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_arrayEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: hlfir.region_assign { ! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } to { @@ -53,8 +53,8 @@ subroutine test_user_defined_elemental_array_value(z, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_elemental_array_value( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xcomplex<f32>>> {fir.bindc_name = "z"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_elemental_array_valueEz"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: hlfir.region_assign { ! CHECK: hlfir.yield %[[VAL_2]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } to { @@ -72,8 +72,8 @@ subroutine test_user_defined_scalar(i, l) ! CHECK-LABEL: func.func @_QMuser_defPtest_user_defined_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "l"}) { -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEi"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_user_defined_scalarEl"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: hlfir.region_assign { ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: hlfir.yield %[[VAL_4]] : !fir.logical<4> @@ -91,7 +91,7 @@ subroutine test_non_elemental_array(x) end subroutine ! CHECK-LABEL: func.func @_QMuser_defPtest_non_elemental_array( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_non_elemental_arrayEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: hlfir.region_assign { ! CHECK: %[[VAL_2:.*]] = arith.constant 4.200000e+01 : f32 ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index @@ -126,9 +126,9 @@ subroutine test_where_user_def_assignment(i, l, l2) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l"}, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>> {fir.bindc_name = "l2"}) { -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_where_user_def_assignmentEl2"} : (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.logical<4>>>, !fir.box<!fir.array<?x!fir.logical<4>>>) ! CHECK: hlfir.where { ! CHECK: hlfir.yield %[[VAL_4]]#0 : !fir.box<!fir.array<?x!fir.logical<4>>> ! CHECK: } do { @@ -171,11 +171,11 @@ subroutine test_forall_user_def_assignment(i, l) ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEi"} : (!fir.ref<!fir.array<20x10xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xi32>>, !fir.ref<!fir.array<20x10xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignmentEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: hlfir.forall lb { @@ -218,11 +218,11 @@ subroutine test_forall_user_def_assignment_non_elemental_array(x, l) ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEl"} : (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10x!fir.logical<4>>>, !fir.ref<!fir.array<20x10x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]], %[[VAL_7]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_8]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_forall_user_def_assignment_non_elemental_arrayEx"} : (!fir.ref<!fir.array<20x10xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<20x10xf32>>, !fir.ref<!fir.array<20x10xf32>>) ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_11:.*]] = arith.constant 10 : i32 ! CHECK: hlfir.forall lb { @@ -269,8 +269,8 @@ subroutine test_pointer(p, x) ! CHECK-LABEL: func.func @_QMuser_defPtest_pointer( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) { -! 
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMuser_defFtest_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_pointerEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.region_assign {
! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?x?xf32>>
! CHECK: } to {
@@ -287,8 +287,8 @@ subroutine test_allocatable(a, x)
! CHECK-LABEL: func.func @_QMuser_defPtest_allocatable(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "a"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMuser_defFtest_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMuser_defFtest_allocatableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: hlfir.region_assign {
! CHECK: hlfir.yield %[[VAL_3]]#0 : !fir.box<!fir.array<?xf32>>
! CHECK: } to {
@@ -313,7 +313,7 @@ end subroutine test_char_get_length
! CHECK-LABEL: func.func @_QPtest_char_get_length(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.boxchar<1> {fir.bindc_name = "ch"}) {
! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_char_get_lengthEch"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_char_get_lengthEx"}
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFtest_char_get_lengthEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: hlfir.region_assign {
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index 6b693692b4d28..4ec57c7ba1c6d 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -68,7 +68,7 @@ subroutine foo3(x, y)
call bar2(x(1:8:2, 5, y))
end subroutine
! CHECK-LABEL: func.func @_QPfoo3(
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFfoo3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?x?xi32>>>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]](%[[VAL_4:[a-z0-9]*]]) {{.*}}Ey
@@ -196,8 +196,8 @@ subroutine do_something(x)
! CHECK-LABEL: func.func @_QPtest_passing_subscripted_poly(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?x?xnone>>
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi64>>
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 314 : index
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
index f5dd86f0808e5..d1db8a56dd8f5 100644
--- a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
@@ -9,7 +9,7 @@ subroutine test_proc_pointer_1(p, dummy_proc)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_1(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -28,8 +28,8 @@ subroutine test_proc_pointer_2(p, p_target)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_2(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -50,7 +50,7 @@ subroutine test_proc_pointer_3(p, dummy_proc)
! CHECK-LABEL: func.func @_QPtest_proc_pointer_3(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
@@ -69,7 +69,7 @@ subroutine test_proc_pointer_4(p)
end subroutine
! CHECK-LABEL: func.func @_QPtest_proc_pointer_4(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPsome_external) : () -> ()
! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : (() -> ()) -> !fir.boxproc<() -> ()>
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
@@ -95,7 +95,7 @@ character(10) function char_func()
! CHECK-LABEL: func.func @_QPtest_proc_pointer_5(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: tuple<!fir.boxproc<() -> ()>, i64> {fir.char_proc}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_3:.*]] = fir.extract_value %[[VAL_1]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : i64
diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90
index c1f1d7972d4b1..f54fda42cf51b 100644
--- a/flang/test/Lower/Intrinsics/c_f_pointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90
@@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower)
! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2>
! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
-! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"}
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr<!fir.array<?x?xf32>>
diff --git a/flang/test/Lower/Intrinsics/c_f_procpointer.f90 b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
index b5e9783b09b54..e4cfacbf9867e 100644
--- a/flang/test/Lower/Intrinsics/c_f_procpointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
@@ -10,8 +10,8 @@ subroutine test_c_funloc(fptr, cptr)
! CHECK-LABEL: func.func @_QPtest_c_funloc(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
@@ -31,8 +31,8 @@ character(10) function char_func()
! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
diff --git a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
index fbd196832ba65..e991958c2592e 100644
--- a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
+++ b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
@@ -8,7 +8,7 @@ subroutine test_c_funloc(p)
end subroutine
! CHECK-LABEL: func.func @_QPtest_c_funloc(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
@@ -27,7 +27,7 @@ character(10) function char_func()
end subroutine
! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>, !fir.dscope) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90 b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
index 80feb0862cc74..16c2452dcbdbf 100644
--- a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
+++ b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
@@ -10,8 +10,8 @@ function test_c_ptr_eq(ptr1, ptr2)
! CHECK-LABEL: func.func @_QPtest_c_ptr_eq(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_eq", uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"}
! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
@@ -35,8 +35,8 @@ function test_c_ptr_ne(ptr1, ptr2)
! CHECK-LABEL: func.func @_QPtest_c_ptr_ne(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
-! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
-! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_ne", uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"}
! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#0, __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
diff --git a/flang/test/Lower/Intrinsics/etime-function.f90 b/flang/test/Lower/Intrinsics/etime-function.f90
index f4594cee7525d..a2017f1f05571 100644
--- a/flang/test/Lower/Intrinsics/etime-function.f90
+++ b/flang/test/Lower/Intrinsics/etime-function.f90
@@ -11,9 +11,9 @@ subroutine etime_test(values, time)
! CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index
! CHECK-NEXT: %[[timeTmpAddr:.*]] = fir.alloca f32
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
- ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK-NEXT: %[[shape:.*]] = fir.shape %[[c2]] : (index) -> !fir.shape<1>
- ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
+ ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
! CHECK-NEXT: %[[valuesBox:.*]] = fir.embox %[[valuesDeclare]](%[[shape]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
! CHECK-NEXT: %[[timeTmpBox:.*]] = fir.embox %[[timeTmpAddr]] : (!fir.ref<f32>) -> !fir.box<f32>
! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/etime.f90 b/flang/test/Lower/Intrinsics/etime.f90
index fe5d16b64cd0c..a8fabb13ffdd3 100644
--- a/flang/test/Lower/Intrinsics/etime.f90
+++ b/flang/test/Lower/Intrinsics/etime.f90
@@ -10,9 +10,9 @@ subroutine etime_test(values, time)
! CHECK-NEXT: %[[c9:.*]] = arith.constant 9 : i32
! CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
- ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-NEXT: %[[timeDeclare:.*]] = fir.declare %[[timeArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEtime"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK-NEXT: %[[shape:.*]] = fir.shape %[[c2]] : (index) -> !fir.shape<1>
- ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
+ ! CHECK-NEXT: %[[valuesDeclare:.*]] = fir.declare %[[valuesArg]](%[[shape]]) dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFetime_testEvalues"} : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<2xf32>>
! CHECK-NEXT: %[[valuesBox:.*]] = fir.embox %[[valuesDeclare]](%[[shape]]) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
! CHECK-NEXT: %[[timeBox:.*]] = fir.embox %[[timeDeclare]] : (!fir.ref<f32>) -> !fir.box<f32>
! CHECK: %[[values:.*]] = fir.convert %[[valuesBox]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
index 00a3258c9a647..f8c667f3fa82d 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90
@@ -16,14 +16,14 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg)
! CHECK-NEXT: %true = arith.constant true
! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
! CHECK-NEXT: %[[commandBoxTemp:.*]] = fir.emboxchar %[[commandDeclare]], %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
-! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
+! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,?>>
! CHECK-NEXT: %[[cmdmsgBoxTemp:.*]] = fir.emboxchar %[[cmdmsgDeclare]], %[[cmdmsgUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
! CHECK-NEXT: %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]] : (!fir.ref<i32>) -> i1
! CHECK-NEXT: %[[cmdstatIsPresent:.*]] = fir.is_present %[[cmdstatDeclare]] : (!fir.ref<i32>) -> i1
diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90
index 77f1750c504bd..e70513068ab3e 100644
--- a/flang/test/Lower/Intrinsics/execute_command_line.f90
+++ b/flang/test/Lower/Intrinsics/execute_command_line.f90
@@ -16,15 +16,15 @@ subroutine all_args(command, isWait, exitVal, cmdVal, msg)
! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
-! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEexitval"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEiswait"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>>
! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cmdmsgCast:.*]] = fir.convert %[[cmdmsgUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEmsg"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
! CHECK-NEXT: %[[cmdstatBox:.*]] = fir.embox %[[cmdstatsDeclare]] : (!fir.ref<i32>) -> !fir.box<i32>
@@ -57,7 +57,7 @@ subroutine only_command_default_wait_true(command)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[cmdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,30>>
-! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
+! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref<!fir.char<1,30>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,30>>
! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref<!fir.char<1,30>>) -> !fir.box<!fir.char<1,30>>
! CHECK-NEXT: %[[absent:.*]] = fir.absent !fir.box<none>
! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box<!fir.char<1,30>>) -> !fir.box<none>
diff --git a/flang/test/Lower/Intrinsics/getcwd-function.f90 b/flang/test/Lower/Intrinsics/getcwd-function.f90
index 50b64729294fe..4442941905676 100644
--- a/flang/test/Lower/Intrinsics/getcwd-function.f90
+++ b/flang/test/Lower/Intrinsics/getcwd-function.f90
@@ -11,7 +11,7 @@ integer function test(cwd)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
! CHECK-NEXT: %[[test:.*]] = fir.alloca i32 {bindc_name = "test", uniq_name = "_QFtestEtest"}
! CHECK-NEXT: %[[testAddr:.*]] = fir.declare %[[test]] {uniq_name = "_QFtestEtest"} : (!fir.ref<i32>) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
diff --git a/flang/test/Lower/Intrinsics/getcwd-optional.f90 b/flang/test/Lower/Intrinsics/getcwd-optional.f90
index 3e2a221f0c3f9..ee1612f3ed8ee 100644
--- a/flang/test/Lower/Intrinsics/getcwd-optional.f90
+++ b/flang/test/Lower/Intrinsics/getcwd-optional.f90
@@ -15,8 +15,8 @@ subroutine test(cwd, status)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtestEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtestEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c11]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/Intrinsics/getcwd.f90 b/flang/test/Lower/Intrinsics/getcwd.f90
index fe207854aff0a..900075fe46e0e 100644
--- a/flang/test/Lower/Intrinsics/getcwd.f90
+++ b/flang/test/Lower/Intrinsics/getcwd.f90
@@ -10,7 +10,7 @@ subroutine cwd_only(cwd)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFcwd_onlyEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFcwd_onlyEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_7:.*]], %[[c7]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -30,8 +30,8 @@ subroutine all_arguments(cwd, status)
! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argumentsEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
- ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %0 {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+ ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>>
+ ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %0 {{.*}} {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>>
! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none>
! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c26]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/Intrinsics/ieee_logb.f90 b/flang/test/Lower/Intrinsics/ieee_logb.f90
index 4d32d95196199..fd4144e516118 100644
--- a/flang/test/Lower/Intrinsics/ieee_logb.f90
+++ b/flang/test/Lower/Intrinsics/ieee_logb.f90
@@ -9,7 +9,7 @@ subroutine out(x)
! CHECK: %[[V_61:[0-9]+]] = fir.declare %[[V_60]] {uniq_name = "_QFoutEl"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
! CHECK: %[[V_62:[0-9]+]] = fir.alloca f64 {bindc_name = "r", uniq_name = "_QFoutEr"}
! CHECK: %[[V_63:[0-9]+]] = fir.declare %[[V_62]] {uniq_name = "_QFoutEr"} : (!fir.ref<f64>) -> !fir.ref<f64>
- ! CHECK: %[[V_64:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} {uniq_name = "_QFoutEx"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_64:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFoutEx"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
real(k) :: x, r
logical :: L
diff --git a/flang/test/Lower/Intrinsics/nearest.f90 b/flang/test/Lower/Intrinsics/nearest.f90
index 6dbdc8b89070f..95e3ea52f69c8 100644
--- a/flang/test/Lower/Intrinsics/nearest.f90
+++ b/flang/test/Lower/Intrinsics/nearest.f90
@@ -4,8 +4,8 @@
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f16 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test1Eres"} : (!fir.ref<f16>) -> !fir.ref<f16>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f16>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f16>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f16) -> i1
@@ -63,8 +63,8 @@ subroutine nearest_test1(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca bf16 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test2Eres"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<bf16>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<bf16>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (bf16) -> i1
@@ -126,8 +126,8 @@ subroutine nearest_test2(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test3Eres"} : (!fir.ref<f32>) -> !fir.ref<f32>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f32>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f32) -> i1
@@ -185,8 +185,8 @@ subroutine nearest_test3(x, s)
! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[V_1:[0-9]+]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test4Eres"} : (!fir.ref<f64>) -> !fir.ref<f64>
- ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
- ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f64>
! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f64>
! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f64) -> i1
@@ -244,8 +244,8 @@ subroutine nearest_test4(x, s)
! CHECK-KIND10: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND10: %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
! CHECK-KIND10: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
- ! CHECK-KIND10: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
- ! CHECK-KIND10: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK-KIND10: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
! CHECK-KIND10: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
! CHECK-KIND10: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
! CHECK-KIND10: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
@@ -291,8 +291,8 @@ subroutine nearest_test5(x, s)
! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
@@ -351,8 +351,8 @@ subroutine nearest_test6(x, s)
! CHECK-KIND16: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
! CHECK-KIND16: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
! CHECK-KIND16: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
- ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
- ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK-KIND16: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK-KIND16: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] arg {{[0-9]+}} {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
! CHECK-KIND16: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
! CHECK-KIND16: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
! CHECK-KIND16: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
diff --git a/flang/test/Lower/Intrinsics/perror.f90 b/flang/test/Lower/Intrinsics/perror.f90
index acecf0b996949..e746e73a5f9bc 100644
--- a/flang/test/Lower/Intrinsics/perror.f90
+++ b/flang/test/Lower/Intrinsics/perror.f90
@@ -43,7 +43,7 @@ subroutine test_perror_unknown_length(str)
call perror(str)
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_perror_unknown_lengthEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_perror_unknown_lengthEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_2]]#1 typeparams %[[VAL_1]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,?>>
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
diff --git a/flang/test/Lower/Intrinsics/putenv-sub.f90 b/flang/test/Lower/Intrinsics/putenv-sub.f90
index 285dbc6fddb19..f7c347f31e678 100644
--- a/flang/test/Lower/Intrinsics/putenv-sub.f90
+++ b/flang/test/Lower/Intrinsics/putenv-sub.f90
@@ -6,7 +6,7 @@ subroutine str_only(str)
CHARACTER(len=*) :: str
!CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope
!CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFstr_onlyEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFstr_onlyEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64
!CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
@@ -30,8 +30,8 @@ subroutine all_arguments(str, status)
INTEGER :: status
!CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope
!CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
- !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstr"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64
!CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
diff --git a/flang/test/Lower/Intrinsics/rename.f90 b/flang/test/Lower/Intrinsics/rename.f90
index 41d120367b140..6fdb5144b7fc9 100644
--- a/flang/test/Lower/Intrinsics/rename.f90
+++ b/flang/test/Lower/Intrinsics/rename.f90
@@ -9,9 +9,9 @@ subroutine test_rename(src, dst)
call rename(src, dst)
!CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_renameEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_renameEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_renameEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
!CHECK-NEXT: %[[dstBox:.*]] = fir.embox %[[dstDecl]]#1 typeparams %[[dstUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
!CHECK-NEXT: %[[statusBox:.*]] = fir.absent !fir.box<none>
@@ -33,9 +33,9 @@ subroutine test_rename_status(src, dst)
call rename(src, dst, status)
!CHECK: %[[dstUnbox:.*]]:2 = fir.unboxchar %[[dummyDst]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[dstDecl:.*]]:2 = hlfir.declare %[[dstUnbox]]#0 typeparams %[[dstUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_rename_statusEdst"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[srcUnbox:.*]]:2 = fir.unboxchar %[[dummySrc]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {uniq_name = "_QFtest_rename_statusEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ !CHECK-NEXT: %[[srcDecl:.*]]:2 = hlfir.declare %3#0 typeparams %[[srcUnbox]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFtest_rename_statusEsrc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
!CHECK-NEXT: %[[statusAlloc:.*]] = fir.alloca i32 {bindc_name = "status", uniq_name = "_QFtest_rename_statusEstatus"}
!CHECK-NEXT: %[[statusDecl:.*]]:2 = hlfir.declare %[[statusAlloc]] {uniq_name = "_QFtest_rename_statusEstatus"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK-NEXT: %[[srcBox:.*]] = fir.embox %[[srcDecl]]#1 typeparams %[[srcUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
diff --git a/flang/test/Lower/Intrinsics/second.f90 b/flang/test/Lower/Intrinsics/second.f90
index 55af10b31382d..0da64634283ff 100644
--- a/flang/test/Lower/Intrinsics/second.f90
+++ b/flang/test/Lower/Intrinsics/second.f90
@@ -8,7 +8,7 @@ subroutine test_subroutine(time)
! CHECK-LABEL: func.func @_QPtest_subroutine(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "time"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_subroutineEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFtest_subroutineEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_3:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (f64) -> f32
! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]]#0 : !fir.ref<f32>
@@ -24,7 +24,7 @@ subroutine test_function(time)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "time"}) {
! CHECK: %[[VAL_1:.*]] = fir.alloca f32
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_functionEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_functionEtime"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_4:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (f64) -> f32
! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref<f32>
@@ -42,8 +42,8 @@ subroutine test_function_subexpr(t1, t2)
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "t2"}) {
! CHECK: %[[VAL_2:.*]] = fir.alloca f32
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QFtest_function_subexprEt2"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFtest_function_subexprEt1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFtest_function_subexprEt2"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_6:.*]] = fir.call @_FortranACpuTime() fastmath<contract> : () -> f64
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (f64) -> f32
! CHECK: fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref<f32>
diff --git a/flang/test/Lower/Intrinsics/selected_char_kind.f90 b/flang/test/Lower/Intrinsics/selected_char_kind.f90
index 4012591f22867..d2040a414cea1 100644
--- a/flang/test/Lower/Intrinsics/selected_char_kind.f90
+++ b/flang/test/Lower/Intrinsics/selected_char_kind.f90
@@ -9,7 +9,7 @@ subroutine selected_char_kind_test(c)
! CHECK-LABEL: func.func @_QPselected_char_kind_test(
! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"})
! CHECK: %[[UNBOXCHAR:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {{.*}} {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! 
CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_char_kind_testEres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_char_kind_testEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[CHAR_PTR:.*]] = fir.convert %[[C]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> diff --git a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 b/flang/test/Lower/Intrinsics/selected_logical_kind.f90 index 5d2d99553efc2..c818a1f60800f 100644 --- a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 +++ b/flang/test/Lower/Intrinsics/selected_logical_kind.f90 @@ -7,7 +7,7 @@ subroutine selected_logical_kind_test1(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i8> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i8 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test1Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test1Eres"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>) ! CHECK: %[[KIND:.*]] = arith.constant 1 : i32 @@ -21,7 +21,7 @@ subroutine selected_logical_kind_test2(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i16> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i16 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test2Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test2Eres"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>) ! CHECK: %[[KIND:.*]] = arith.constant 2 : i32 @@ -35,7 +35,7 @@ subroutine selected_logical_kind_test4(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {{.*}} {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test4Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test4Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[KIND:.*]] = arith.constant 4 : i32 @@ -49,7 +49,7 @@ subroutine selected_logical_kind_test8(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test8( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i64> {fir.bindc_name = "input"}) -! 
CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test8Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test8Eres"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[KIND:.*]] = arith.constant 8 : i32 @@ -63,7 +63,7 @@ subroutine selected_logical_kind_test16(input) ! CHECK-LABEL: func.func @_QPselected_logical_kind_test16( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i128> {fir.bindc_name = "input"}) -! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) +! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) ! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i128 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test16Eres"} ! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test16Eres"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>) ! CHECK: %[[KIND:.*]] = arith.constant 16 : i32 diff --git a/flang/test/Lower/Intrinsics/signal.f90 b/flang/test/Lower/Intrinsics/signal.f90 index 18a6470504cac..5c227ae80534e 100644 --- a/flang/test/Lower/Intrinsics/signal.f90 +++ b/flang/test/Lower/Intrinsics/signal.f90 @@ -23,7 +23,7 @@ subroutine setup_signals(optional_status) integer, optional, intent(out) :: optional_status ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out, optional>, uniq_name = "_QMmFsetup_signalsEoptional_status"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QMmFsetup_signalsEstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) call signal(SIGFPE, handler) diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90 index 7e749f096112e..a8bc8b3501c6a 100644 --- a/flang/test/Lower/Intrinsics/sizeof.f90 +++ b/flang/test/Lower/Intrinsics/sizeof.f90 @@ -6,7 +6,7 @@ integer(8) function test1(x) test1 = sizeof(x) end function ! CHECK-LABEL: func.func @_QPtest1( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>, !fir.dscope) -> (!fir.class<none>, !fir.class<none>) ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<none>) -> i64 !
CHECK: hlfir.assign %[[VAL_4]] to %{{.*}} : i64, !fir.ref<i64> @@ -15,7 +15,7 @@ integer(8) function test2(x) test2 = sizeof(x) end function ! CHECK-LABEL: func.func @_QPtest2( -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>, !fir.dscope) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>) ! CHECK: %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> i64 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none> ! CHECK: %[[VAL_9:.*]] = fir.call @_FortranASize(%[[VAL_7]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i64 diff --git a/flang/test/Lower/Intrinsics/system-optional.f90 b/flang/test/Lower/Intrinsics/system-optional.f90 index 8695d18a0641a..672c398b8ac79 100644 --- a/flang/test/Lower/Intrinsics/system-optional.f90 +++ b/flang/test/Lower/Intrinsics/system-optional.f90 @@ -11,8 +11,8 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> i1 ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> !fir.box<i32> diff --git a/flang/test/Lower/Intrinsics/system.f90 b/flang/test/Lower/Intrinsics/system.f90 index 5e9f8f20e0729..6ea98bca7de72 100644 --- a/flang/test/Lower/Intrinsics/system.f90 +++ b/flang/test/Lower/Intrinsics/system.f90 @@ -10,8 +10,8 @@ subroutine all_args(command, exitstat) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! 
CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) -! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[exitstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEexitstat"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]]#0 : (!fir.ref<i32>) -> !fir.box<i32> ! CHECK-NEXT: %[[true:.*]] = arith.constant true @@ -36,7 +36,7 @@ subroutine only_command(command) ! CHECK-NEXT: %[[cmdstatVal:.*]] = fir.alloca i16 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFonly_commandEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> ! CHECK-NEXT: %[[true:.*]] = arith.constant true ! CHECK-NEXT: %[[absentBox:.*]] = fir.absent !fir.box<none> @@ -63,7 +63,7 @@ subroutine as_function(command) ! CHECK-NEXT: %[[RETVAL:.*]] = fir.alloca i32 ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] {uniq_name = "_QFas_functionEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NEXT: %[[commandDeclare:.*]]:2 = hlfir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFas_functionEcommand"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK-NEXT: %[[EXITSTAT_ALLOC:.*]] = fir.alloca i32 ! CHECK-NEXT: %[[exitstatDeclare:.*]]:2 = hlfir.declare %[[EXITSTAT_ALLOC]] {uniq_name = "_QFas_functionEexitstat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]]#1 typeparams %[[commandUnbox]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>> diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index 9eae3a58884fa..f6fae1113b315 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -32,11 +32,9 @@ subroutine system_clock_test() ! CHECK-LABEL: @_QPss subroutine ss(count) - ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.box<!fir.heap<i64>> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"} ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.heap<i64> {uniq_name = "_QFssEcount_max.addr"} ! CHECK: %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap<i64> ! CHECK: fir.store %[[V_2]] to %[[V_1]] : !fir.ref<!fir.heap<i64>> - ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<i64>> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"} ! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.ptr<i64> {uniq_name = "_QFssEcount_rate.addr"} ! CHECK: %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr<i64> ! CHECK: fir.store %[[V_5]] to %[[V_4]] : !fir.ref<!fir.ptr<i64>> diff --git a/flang/test/Lower/Intrinsics/unlink-sub.f90 b/flang/test/Lower/Intrinsics/unlink-sub.f90 index 78d2b1096ae82..ac535005fd442 100644 --- a/flang/test/Lower/Intrinsics/unlink-sub.f90 +++ b/flang/test/Lower/Intrinsics/unlink-sub.f90 @@ -6,7 +6,7 @@ subroutine path_only(path) CHARACTER(len=*) :: path !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFpath_onlyEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFpath_onlyEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> @@ -30,8 +30,8 @@ subroutine all_arguments(path, status) INTEGER :: status !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) - !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) - !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEpath"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) + !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] arg {{[0-9]+}} {uniq_name = "_QFall_argumentsEstatus"} : 
(!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8> diff --git a/flang/test/Lower/MIF/change_team.f90 b/flang/test/Lower/MIF/change_team.f90 new file mode 100644 index 0000000000000..7fb31aa99812b --- /dev/null +++ b/flang/test/Lower/MIF/change_team.f90 @@ -0,0 +1,27 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_change_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: team + integer :: stat, i + character(len=10) :: err + + ! COARRAY: mif.change_team %[[TEAM:.*]] : ({{.*}}) { + change team (team) + i = i +1 + end team + ! COARRAY: mif.end_team + ! COARRAY: } + + ! COARRAY: mif.change_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 errmsg %[[ERR:.*]] : ({{.*}}, !fir.ref<i32>, !fir.box<!fir.char<1,10>>) { + change team (team, STAT=stat, ERRMSG=err) + end team + ! COARRAY: mif.end_team + ! COARRAY: } + +end program test_change_team + diff --git a/flang/test/Lower/MIF/co_broadcast.f90 b/flang/test/Lower/MIF/co_broadcast.f90 index 25e4330ade704..fadee5f6bcdf8 100644 --- a/flang/test/Lower/MIF/co_broadcast.f90 +++ b/flang/test/Lower/MIF/co_broadcast.f90 @@ -34,13 +34,13 @@ program test_co_broadcast ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box<!fir.array<2xi32>>, i32) call co_broadcast(array_i, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xcomplex<f32>>>, !fir.shape<1>) -> !fir.box<!fir.array<2xcomplex<f32>>> ! CHECK: mif.co_broadcast %[[V1]] source %[[C1_i32:.*]] : (!fir.box<!fir.array<2xcomplex<f32>>>, i32) call co_broadcast(array_c, source_image=1) - + ! CHECK: %[[C1_i32:.*]] = arith.constant 1 : i32 ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> diff --git a/flang/test/Lower/MIF/co_max.f90 b/flang/test/Lower/MIF/co_max.f90 index 19e65626b50f2..0a179c832dce8 100644 --- a/flang/test/Lower/MIF/co_max.f90 +++ b/flang/test/Lower/MIF/co_max.f90 @@ -40,12 +40,12 @@ program test_co_max ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_max %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_max(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>> ! 
CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] : (!fir.box<!fir.array<2x!fir.char<1>>>, i32) - call co_max(array_c, result_image=1) - + call co_max(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_max %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/co_min.f90 b/flang/test/Lower/MIF/co_min.f90 index a7adc6b540147..bedee0e61619c 100644 --- a/flang/test/Lower/MIF/co_min.f90 +++ b/flang/test/Lower/MIF/co_min.f90 @@ -40,12 +40,12 @@ program test_co_min ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_min %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_min(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_C:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2x!fir.char<1>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1>>> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] : (!fir.box<!fir.array<2x!fir.char<1>>>, i32) - call co_min(array_c, result_image=1) - + call co_min(array_c, result_image=1) + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_min %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/co_sum.f90 b/flang/test/Lower/MIF/co_sum.f90 index 0d8a25850ad5f..9710fd6d521ff 100644 --- a/flang/test/Lower/MIF/co_sum.f90 +++ b/flang/test/Lower/MIF/co_sum.f90 @@ -36,7 +36,7 @@ program test_co_sum ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_I:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>> ! CHECK: mif.co_sum %[[V1]] : (!fir.box<!fir.array<2xi32>>) call co_sum(array_i) - + ! CHECK: %[[SHAPE_2:.*]] = fir.shape %[[C2_2:.*]] : (index) -> !fir.shape<1> ! CHECK: %[[V1:.*]] = fir.embox %[[ARRAY_D:.*]]#0(%[[SHAPE_2]]) : (!fir.ref<!fir.array<2xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf64>> ! CHECK: mif.co_sum %[[V1]] result %[[C1_i32:.*]] stat %[[STATUS:.*]]#0 : (!fir.box<!fir.array<2xf64>>, i32, !fir.ref<i32>) diff --git a/flang/test/Lower/MIF/coarray-init.f90 b/flang/test/Lower/MIF/coarray-init.f90 index e3544736df284..e3526f6e09993 100644 --- a/flang/test/Lower/MIF/coarray-init.f90 +++ b/flang/test/Lower/MIF/coarray-init.f90 @@ -3,7 +3,7 @@ program test_init -end +end ! ALL-LABEL: func.func @main ! ALL: fir.call @_FortranAProgramStart diff --git a/flang/test/Lower/MIF/form_team.f90 b/flang/test/Lower/MIF/form_team.f90 new file mode 100644 index 0000000000000..4f44b23b3ceed --- /dev/null +++ b/flang/test/Lower/MIF/form_team.f90 @@ -0,0 +1,29 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_form_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. 
+ + type(team_type) :: team + integer :: team_number + integer :: team_index + integer :: stat + character(len=10) :: err + + form team (team_number, team) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] : (i32, {{.*}}) -> () + + form team (team_number, team, NEW_INDEX=team_index) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] new_index %[[NI:.*]] : (i32, {{.*}}, i32) -> () + + form team (team_number, team, STAT=stat) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] stat %[[STAT:.*]] : (i32, {{.*}}, !fir.ref<i32>) -> () + + form team (team_number, team, ERRMSG=err) + ! COARRAY: mif.form_team team_number %[[ARG1:.*]] team_var %[[ARG2:.*]] errmsg %[[ERR:.*]] : (i32, {{.*}}, !fir.box<!fir.char<1,10>>) -> () + +end program test_form_team + + diff --git a/flang/test/Lower/MIF/get_team.f90 b/flang/test/Lower/MIF/get_team.f90 new file mode 100644 index 0000000000000..f27b70efafc20 --- /dev/null +++ b/flang/test/Lower/MIF/get_team.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_get_team + use, intrinsic :: iso_fortran_env, only: team_type, initial_team, current_team, parent_team + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: result_team + integer :: n + + ! COARRAY: %[[RES:.*]] = mif.get_team : () -> {{.*}} + result_team = get_team() + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[INIT:.*]] : (i32) -> {{.*}} + result_team = get_team(initial_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[CURRENT:.*]] : (i32) -> {{.*}} + result_team = get_team(current_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[PARENT:.*]] : (i32) -> {{.*}} + result_team = get_team(parent_team) + + ! COARRAY: %[[RES:.*]] = mif.get_team level %[[VAL_N:.*]] : (i32) -> {{.*}} + result_team = get_team(n) + +end program test_get_team + diff --git a/flang/test/Lower/MIF/num_images.f90 b/flang/test/Lower/MIF/num_images.f90 index a673b6e8120f8..8f31ab4bc0090 100644 --- a/flang/test/Lower/MIF/num_images.f90 +++ b/flang/test/Lower/MIF/num_images.f90 @@ -3,7 +3,7 @@ program test use iso_fortran_env integer :: i - integer :: team_number + integer :: team_number type(team_type) :: team ! CHECK: mif.num_images : () -> i32 diff --git a/flang/test/Lower/MIF/sync_all.f90 b/flang/test/Lower/MIF/sync_all.f90 index 2b1997c8cc0b8..4d685df31abbb 100644 --- a/flang/test/Lower/MIF/sync_all.f90 +++ b/flang/test/Lower/MIF/sync_all.f90 @@ -4,7 +4,7 @@ program test_sync_all implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) integer sync_status @@ -15,11 +15,11 @@ program test_sync_all ! COARRAY: mif.sync_all stat %[[STAT]]#0 : (!fir.ref<i32>) sync all(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! 
COARRAY: mif.sync_all errmsg %[[VAL_1]] : (!fir.box<!fir.char<1,128>>) sync all( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! COARRAY: mif.sync_all stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync all(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/sync_images.f90 b/flang/test/Lower/MIF/sync_images.f90 index 7ee5936131750..1ef577ed4f158 100644 --- a/flang/test/Lower/MIF/sync_images.f90 +++ b/flang/test/Lower/MIF/sync_images.f90 @@ -4,7 +4,7 @@ program test_sync_images implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[ME:.*]]:2 = hlfir.declare %[[VAL_3:.*]] {uniq_name = "_QFEme"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -24,14 +24,14 @@ program test_sync_images ! COARRAY: %[[VAL_5:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_1:.*]]) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<1xi32>> ! COARRAY: mif.sync_images image_set %[[VAL_5]] stat %[[STAT]]#0 errmsg %[[VAL_4]] : (!fir.box<!fir.array<1xi32>>, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync images([1], stat=sync_status, errmsg=error_message) - + ! COARRAY: mif.sync_images : () sync images(*) - + ! COARRAY: %[[VAL_6:.*]] = fir.embox %[[ME]]#0 : (!fir.ref<i32>) -> !fir.box<i32> ! COARRAY: mif.sync_images image_set %[[VAL_6]] : (!fir.box<i32>) sync images(me) - + ! COARRAY: %[[VAL_7:.*]] = fir.embox %[[IMG_SET:.*]]#0(%[[SHAPE_3:.*]]) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<1xi32>> ! COARRAY: mif.sync_images image_set %[[VAL_7]] : (!fir.box<!fir.array<1xi32>>) sync images([1]) diff --git a/flang/test/Lower/MIF/sync_memory.f90 b/flang/test/Lower/MIF/sync_memory.f90 index e6e0fa1e7fdf3..a36fc2d1919a5 100644 --- a/flang/test/Lower/MIF/sync_memory.f90 +++ b/flang/test/Lower/MIF/sync_memory.f90 @@ -4,22 +4,22 @@ program test_sync_memory implicit none ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. - + ! COARRAY: %[[ERRMSG:.*]]:2 = hlfir.declare %[[VAL_1:.*]] typeparams %[[C_128:.*]] {uniq_name = "_QFEerror_message"} : (!fir.ref<!fir.char<1,128>>, index) -> (!fir.ref<!fir.char<1,128>>, !fir.ref<!fir.char<1,128>>) ! COARRAY: %[[STAT:.*]]:2 = hlfir.declare %[[VAL_2:.*]] {uniq_name = "_QFEsync_status"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) integer sync_status character(len=128) :: error_message - ! COARRAY: mif.sync_memory : () + ! COARRAY: mif.sync_memory : () sync memory ! COARRAY: mif.sync_memory stat %[[STAT]]#0 : (!fir.ref<i32>) sync memory(stat=sync_status) - + ! COARRAY: %[[VAL_1:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! COARRAY: mif.sync_memory errmsg %[[VAL_1]] : (!fir.box<!fir.char<1,128>>) sync memory( errmsg=error_message) - + ! COARRAY: %[[VAL_2:.*]] = fir.embox %[[ERRMSG]]#0 : (!fir.ref<!fir.char<1,128>>) -> !fir.box<!fir.char<1,128>> ! 
COARRAY: mif.sync_memory stat %[[STAT]]#0 errmsg %[[VAL_2]] : (!fir.ref<i32>, !fir.box<!fir.char<1,128>>) sync memory(stat=sync_status, errmsg=error_message) diff --git a/flang/test/Lower/MIF/sync_team.f90 b/flang/test/Lower/MIF/sync_team.f90 new file mode 100644 index 0000000000000..923bfbc327951 --- /dev/null +++ b/flang/test/Lower/MIF/sync_team.f90 @@ -0,0 +1,25 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN2: not %flang_fc1 -emit-hlfir %s -o - | FileCheck %s --check-prefixes=NOCOARRAY + +program test_sync_team + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + integer sync_status + character(len=128) :: error_message + type(team_type) :: team + + ! COARRAY: mif.sync_team %[[TEAM:.*]] : ({{.*}}) -> () + sync team(team) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 : ({{.*}}, !fir.ref<i32>) -> () + sync team(team, stat=sync_status) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] errmsg %[[ERR:.*]] : ({{.*}}, !fir.box<!fir.char<1,128>>) -> () + sync team(team, errmsg=error_message) + + ! COARRAY: mif.sync_team %[[TEAM:.*]] stat %[[STAT:.*]]#0 errmsg %[[ERR:.*]] : ({{.*}}, !fir.ref<i32>, !fir.box<!fir.char<1,128>>) -> () + sync team(team, stat=sync_status, errmsg=error_message) + +end program test_sync_team diff --git a/flang/test/Lower/MIF/team_number.f90 b/flang/test/Lower/MIF/team_number.f90 new file mode 100644 index 0000000000000..48a5f5b3d37d4 --- /dev/null +++ b/flang/test/Lower/MIF/team_number.f90 @@ -0,0 +1,19 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=COARRAY +! RUN: not %flang_fc1 -emit-hlfir %s 2>&1 | FileCheck %s --check-prefixes=NOCOARRAY + +program test_team_number + use, intrinsic :: iso_fortran_env, only: team_type + implicit none + ! NOCOARRAY: Not yet implemented: Multi-image features are experimental and are disabled by default, use '-fcoarray' to enable. + + type(team_type) :: team + integer :: t + + ! COARRAY: %[[RES:.*]] = mif.team_number team %[[TEAM:.*]] : ({{.*}}) -> i64 + t = team_number(team) + + ! COARRAY: %[[RES:.*]] = mif.team_number : () -> i64 + t = team_number() + +end program test_team_number + diff --git a/flang/test/Lower/MIF/this_image.f90 b/flang/test/Lower/MIF/this_image.f90 index ce729b349e6cf..c6674c309f3f4 100644 --- a/flang/test/Lower/MIF/this_image.f90 +++ b/flang/test/Lower/MIF/this_image.f90 @@ -5,7 +5,7 @@ program test integer :: i type(team_type) :: team - ! CHECK: mif.this_image : () -> i32 + ! CHECK: mif.this_image : () -> i32 i = this_image() ! CHECK: mif.this_image team %[[TEAM:.*]] : ({{.*}}) -> i32 diff --git a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 index d281fe4aced35..184c2a6fb0aeb 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 @@ -20,8 +20,8 @@ subroutine atomic_update_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_update_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! 
CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_ARG2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_update_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_update_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[ARRAY_REF:.*]] = hlfir.designate %[[DECL_ARG0]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_ARG2]]#0 : !fir.ref<f32> ! CHECK: acc.atomic.update %[[ARRAY_REF]] : !fir.ref<f32> { @@ -42,8 +42,8 @@ subroutine atomic_read_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_read_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: acc.atomic.read %[[DECL_X]]#0 = %[[DES]] : !fir.ref<f32>, !fir.ref<f32>, f32 @@ -58,8 +58,8 @@ subroutine atomic_write_array1(r, n, x) ! CHECK-LABEL: func.func @_QPatomic_write_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! 
CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD:.*]] = fir.load %[[DES]] : !fir.ref<f32> ! CHECK: acc.atomic.write %[[DECL_X]]#0 = %[[LOAD]] : !fir.ref<f32>, f32 @@ -77,9 +77,9 @@ subroutine atomic_capture_array1(r, n, x, y) ! CHECK-LABEL: func.func @_QPatomic_capture_array1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "r"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, %[[ARG2:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}, %[[ARG3:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) { -! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_capture_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[R_I:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32> ! CHECK: %[[LOAD:.*]] = fir.load %[[DECL_X]]#0 : !fir.ref<f32> ! CHECK: acc.atomic.capture { diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90 index f6996df6d2454..44ca2514f6eea 100644 --- a/flang/test/Lower/OpenACC/acc-bounds.f90 +++ b/flang/test/Lower/OpenACC/acc-bounds.f90 @@ -81,7 +81,7 @@ subroutine acc_undefined_extent(a) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_undefined_extent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_undefined_extentEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[UB:.*]] = arith.subi %[[DIMS0]]#1, %c1{{.*}} : index ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%[[UB]] : index) extent(%[[DIMS0]]#1 : index) stride(%[[DIMS0]]#2 : index) startIdx(%c1{{.*}} : index) {strideInBytes = true} @@ -97,7 +97,7 @@ subroutine acc_multi_strides(a) ! 
CHECK-LABEL: func.func @_QMopenacc_boundsPacc_multi_strides( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMopenacc_boundsFacc_multi_stridesEa"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.box<!fir.array<?x?x?xf32>>) ! CHECK: %[[BOX_DIMS0:.*]]:3 = fir.box_dims %[[DECL_ARG0]]#0, %c0{{.*}} : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index) ! CHECK: %[[BOUNDS0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[BOX_DIMS0]]#1 : index) stride(%[[BOX_DIMS0]]#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true} ! CHECK: %[[STRIDE1:.*]] = arith.muli %[[BOX_DIMS0]]#2, %[[BOX_DIMS0]]#1 : index @@ -117,7 +117,7 @@ subroutine acc_optional_data(a) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[ARG0_DECL]]#1 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> i1 ! CHECK: %[[RES:.*]]:5 = fir.if %[[IS_PRESENT]] -> (index, index, index, index, index) { ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> @@ -140,7 +140,7 @@ subroutine acc_optional_data2(a, n) ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data2( ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data2Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[NO_CREATE:.*]] = acc.nocreate varPtr(%[[DECL_A]]#1 : !fir.ref<!fir.array<?xf32>>) bounds(%{{[0-9]+}}) -> !fir.ref<!fir.array<?xf32>> {name = "a"} ! CHECK: acc.data dataOperands(%[[NO_CREATE]] : !fir.ref<!fir.array<?xf32>>) { @@ -153,7 +153,7 @@ subroutine acc_optional_data3(a, n) ! 
CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data3( ! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "a", fir.optional}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[A]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMopenacc_boundsFacc_optional_data3Ea"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[PRES:.*]] = fir.is_present %[[DECL_A]]#1 : (!fir.ref<!fir.array<?xf32>>) -> i1 ! CHECK: %[[STRIDE:.*]] = fir.if %[[PRES]] -> (index) { ! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[DECL_A]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) diff --git a/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 index 9d36f6a99e8a7..b657881a13ebc 100644 --- a/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 +++ b/flang/test/Lower/OpenACC/acc-data-operands-remapping.f90 @@ -187,11 +187,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_scalar( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: fir.call @_QPtakes_scalar(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<f32>) -> () ! CHECK: acc.yield ! CHECK: } @@ -203,7 +203,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "l"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEl"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalar_characterEl"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QMmFtest_scalar_characterEx"} ! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) @@ -211,7 +211,7 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_7:.*]] = arith.cmpi sgt, %[[VAL_5]], %[[VAL_6]] : i32 ! CHECK: %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_5]], %[[VAL_6]] : i32 -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_8]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_scalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_8]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_scalar_characterEc"} : (!fir.ref<!fir.char<1,?>>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<f32>) { ! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QMmFtest_scalar_characterEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) @@ -229,11 +229,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) { ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: fir.call @_QPtakes_explicit_cst_shape(%[[VAL_6]]#0) fastmath<contract> : (!fir.ref<!fir.array<100xf32>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -245,7 +245,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -253,12 +253,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {{.*}} {uniq_name = "_QMmFtest_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32 ! CHECK: %[[VAL_16:.*]]:3 = hlfir.associate %[[VAL_15]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -274,12 +274,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! 
CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} skip_rebox {uniq_name = "_QMmFtest_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_5]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -291,18 +291,18 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1 : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_8:.*]] = acc.copyin var(%[[VAL_7]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_8]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_6]]) dummy_scope %[[VAL_10]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_6]]) dummy_scope %[[VAL_10]] {{.*}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFtest_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64 ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> i32 ! CHECK: %[[VAL_14:.*]]:3 = hlfir.associate %[[VAL_13]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) @@ -318,12 +318,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_pointerEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_pointerEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_3:.*]] = acc.copyin varPtr(%[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFtest_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: fir.call @_QPtakes_pointer(%[[VAL_5]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: acc.yield ! CHECK: } @@ -335,7 +335,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_using_both_resultsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -343,12 +343,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin var(%[[VAL_9]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> ! CHECK: %[[VAL_12:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_8]]) dummy_scope %[[VAL_12]] {{.*}} {uniq_name = "_QMmFtest_using_both_resultsEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: fir.call @_QPtakes_assumed_shape(%[[VAL_13]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> () ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]] : (index) -> i64 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> i32 @@ -367,11 +367,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_1:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 20 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_2]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_3]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_3]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_5:.*]] = acc.copyin varPtr(%[[VAL_4]]#0 : !fir.ref<!fir.array<10x20xf32>>) -> !fir.ref<!fir.array<10x20xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_5]] : !fir.ref<!fir.array<10x20xf32>>) { ! CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) dummy_scope %[[VAL_6]] {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) dummy_scope %[[VAL_6]] {{.*}} {uniq_name = "_QMmFaddressing_cst_shapeEx"} : (!fir.ref<!fir.array<10x20xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<10x20xf32>>, !fir.ref<!fir.array<10x20xf32>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_8]], %[[VAL_9]]) : (!fir.ref<!fir.array<10x20xf32>>, index, index) -> !fir.ref<f32> @@ -387,8 +387,8 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}, ! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "m"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEm"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEm"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -402,12 +402,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index ! CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index ! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_8]], %[[VAL_14]] : (index, index) -> !fir.shape<2> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_15]]) dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_15]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_17:.*]] = acc.copyin var(%[[VAL_16]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_17]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_18:.*]] = fir.box_addr %[[VAL_17]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_19:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_15]]) dummy_scope %[[VAL_19]] {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]](%[[VAL_15]]) dummy_scope %[[VAL_19]] {{.*}} {uniq_name = "_QMmFaddressing_explicit_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_21:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_22:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_23:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_21]], %[[VAL_22]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -422,12 +422,12 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_3:.*]] = acc.copyin var(%[[VAL_2]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_3]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] skip_rebox {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] dummy_scope %[[VAL_4]] arg {{[0-9]+}} skip_rebox {uniq_name = "_QMmFaddressing_assumed_shapeEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_6]], %[[VAL_7]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -442,7 +442,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x", fir.contiguous}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[ARG0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) @@ -451,12 +451,12 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_7:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_9:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_4]]#1, %[[VAL_8]], %[[VAL_7]]#1 : (index, index, index, index) -> !fir.shapeshift<2> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_9]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_9]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_11:.*]] = acc.copyin var(%[[VAL_10]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_11]] : !fir.box<!fir.array<?x?xf32>>) { ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> ! CHECK: %[[VAL_13:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_9]]) dummy_scope %[[VAL_13]] {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_9]]) dummy_scope %[[VAL_13]] {{.*}} {fortran_attrs = #fir.var_attrs<contiguous>, uniq_name = "_QMmFaddressing_contiguous_assumed_shapeEx"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shapeshift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[VAL_15:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_16:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_14]]#0 (%[[VAL_15]], %[[VAL_16]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> @@ -470,11 +470,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPaddressing_pointer( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMmFaddressing_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> ! CHECK: %[[VAL_6:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_7:.*]] = arith.constant 3 : index @@ -489,11 +489,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_scalar( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<f32>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_scalarEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.ref<f32>) -> i1 ! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.ref<f32>) { ! CHECK: fir.result %[[VAL_4]]#0 : !fir.ref<f32> @@ -513,11 +513,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_2]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_4:.*]] = acc.copyin varPtr(%[[VAL_3]]#0 : !fir.ref<!fir.array<100xf32>>) -> !fir.ref<!fir.array<100xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_4]] : !fir.ref<!fir.array<100xf32>>) { ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_2]]) dummy_scope %[[VAL_5]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_cst_shapeEx"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) ! CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.ref<!fir.array<100xf32>>) -> i1 ! CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.ref<!fir.array<100xf32>>) { ! CHECK: fir.result %[[VAL_6]]#0 : !fir.ref<!fir.array<100xf32>> @@ -536,7 +536,7 @@ subroutine test_optional_pointer(x) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QMmFtest_optional_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMmFtest_optional_explicit_shapeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index @@ -544,11 +544,11 @@ subroutine test_optional_pointer(x) ! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! 
CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]](%[[VAL_8]]) dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_10:.*]] = acc.copyin varPtr(%[[VAL_9]]#1 : !fir.ref<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_10]] : !fir.ref<!fir.array<?xf32>>) { ! CHECK: %[[VAL_11:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_8]]) dummy_scope %[[VAL_11]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_8]]) dummy_scope %[[VAL_11]] {{.*}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_explicit_shapeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_12]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.ref<!fir.array<?xf32>>) { ! CHECK: fir.result %[[VAL_12]]#1 : !fir.ref<!fir.array<?xf32>> @@ -566,11 +566,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_assumed_shape( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin var(%[[VAL_1]]#0 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.box<!fir.array<?xf32>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] skip_rebox {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} skip_rebox {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMmFtest_optional_assumed_shapeEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_4]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1 ! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_5]] -> (!fir.box<!fir.array<?xf32>>) { ! CHECK: fir.result %[[VAL_4]]#0 : !fir.box<!fir.array<?xf32>> @@ -588,11 +588,11 @@ subroutine test_optional_pointer(x) ! CHECK-LABEL: func.func @_QMmPtest_optional_pointer( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "x", fir.optional}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[VAL_2:.*]] = acc.copyin varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {dataClause = #acc<data_clause acc_copy>, name = "x"} ! CHECK: acc.parallel dataOperands(%[[VAL_2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QMmFtest_optional_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: fir.call @_QPtakes_optional_pointer(%[[VAL_4]]#0) fastmath<contract> : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> () ! CHECK: acc.yield ! CHECK: } diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90 index 46c4365f23fd6..98f7ed39edb41 100644 --- a/flang/test/Lower/OpenACC/acc-declare.f90 +++ b/flang/test/Lower/OpenACC/acc-declare.f90 @@ -57,7 +57,7 @@ subroutine acc_declare_present(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_present( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_present>, uniq_name = "_QMacc_declareFacc_declare_presentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[PRESENT:.*]] = acc.present varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[PRESENT]] : !fir.ref<!fir.array<100xi32>>) ! 
CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -115,7 +115,7 @@ subroutine acc_declare_deviceptr(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_deviceptr( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_deviceptr>, uniq_name = "_QMacc_declareFacc_declare_deviceptrEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[DEVICEPTR:.*]] = acc.deviceptr varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: acc.declare_enter dataOperands(%[[DEVICEPTR]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -131,7 +131,7 @@ subroutine acc_declare_link(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_link( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_link>, uniq_name = "_QMacc_declareFacc_declare_linkEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[LINK:.*]] = acc.declare_link varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: acc.declare_enter dataOperands(%[[LINK]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -147,7 +147,7 @@ subroutine acc_declare_device_resident(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_device_resident( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_declare_device_resident>, uniq_name = "_QMacc_declareFacc_declare_device_residentEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! 
CHECK: %[[DEVICERES:.*]] = acc.declare_device_resident varPtr(%[[DECL]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {name = "a"} ! CHECK: %[[TOKEN:.*]] = acc.declare_enter dataOperands(%[[DEVICERES]] : !fir.ref<!fir.array<100xi32>>) ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%arg{{.*}} = %{{.*}}) -> (i32) @@ -274,8 +274,8 @@ subroutine acc_declare_multiple_directive(a, b) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_multiple_directive( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>> {fir.bindc_name = "b"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) -! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEa"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copyout>, uniq_name = "_QMacc_declareFacc_declare_multiple_directiveEb"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[DECL_A]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a"} ! CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[DECL_B]]#0 : !fir.ref<!fir.array<100xi32>>) -> !fir.ref<!fir.array<100xi32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} ! CHECK: acc.declare_enter dataOperands(%[[COPYIN]], %[[CREATE]] : !fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) @@ -296,7 +296,7 @@ subroutine acc_declare_array_section(a) ! CHECK-LABEL: func.func @_QMacc_declarePacc_declare_array_section( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {acc.declare = #acc.declare<dataClause = acc_copy>, uniq_name = "_QMacc_declareFacc_declare_array_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin var(%[[DECL_A]]#0 : !fir.box<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.box<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, name = "a(1:10)"} ! 
CHECK: acc.declare_enter dataOperands(%[[COPYIN]] : !fir.box<!fir.array<?xi32>>) diff --git a/flang/test/Lower/OpenACC/acc-enter-data.f90 b/flang/test/Lower/OpenACC/acc-enter-data.f90 index 2718c96a563fb..1e20bdc238573 100644 --- a/flang/test/Lower/OpenACC/acc-enter-data.f90 +++ b/flang/test/Lower/OpenACC/acc-enter-data.f90 @@ -668,6 +668,6 @@ subroutine test_class(a) ! CHECK-LABEL: func.func @_QPtest_class( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMmod1Tderived{m:f32}>> {fir.bindc_name = "a"}) { -! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %0 {uniq_name = "_QFtest_classEa"} : (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) +! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %0 {{.*}} {uniq_name = "_QFtest_classEa"} : (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMmod1Tderived{m:f32}>>, !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) ! CHECK: %[[COPYIN:.*]] = acc.copyin var(%[[DECL_ARG0]]#0 : !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) -> !fir.class<!fir.type<_QMmod1Tderived{m:f32}>> {name = "a", structured = false} ! CHECK: acc.enter_data dataOperands(%[[COPYIN]] : !fir.class<!fir.type<_QMmod1Tderived{m:f32}>>) diff --git a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 index 9ef4fe6913551..fcbb7a19593c8 100644 --- a/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 +++ b/flang/test/Lower/OpenACC/acc-firstprivate-derived-pointer-component.f90 @@ -34,7 +34,7 @@ subroutine test(a) ! CHECK-LABEL: func.func @_QMm_firstprivate_derived_ptr_compPtest( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> {fir.bindc_name = "a"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEn"} @@ -42,7 +42,7 @@ subroutine test(a) ! 
CHECK: %[[VAL_6:.*]] = acc.firstprivate varPtr(%[[VAL_1]]#0 : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>> {name = "a"} ! CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization_ref_rec__QMm_firstprivate_derived_ptr_compTpoint -> %[[VAL_6]] : !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) { ! CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_7]] {{.*}} {uniq_name = "_QMm_firstprivate_derived_ptr_compFtestEa"} : (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>, !fir.ref<!fir.type<_QMm_firstprivate_derived_ptr_compTpoint{x:!fir.box<!fir.ptr<!fir.array<?xf32>>>}>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32> ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 8d95f35b186ee..ef8dcd34807e0 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -360,7 +360,7 @@ subroutine acc_kernels_loop END DO ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: acc.kernels {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield @@ -368,7 +368,7 @@ subroutine acc_kernels_loop ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"} +! 
CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
!$acc kernels loop create(b) create(zero: a)
DO i = 1, n
diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90
index b90870db25095..65079e693c74b 100644
--- a/flang/test/Lower/OpenACC/acc-kernels.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels.f90
@@ -222,13 +222,13 @@ subroutine acc_kernels
!$acc end kernels
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"}
! CHECK: acc.kernels dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"}
!$acc kernels create(a, b) create(zero: c)
diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90
index af11b34d5f65f..0b35a86c41b2e 100644
--- a/flang/test/Lower/OpenACC/acc-loop-exit.f90
+++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90
@@ -14,7 +14,7 @@ subroutine sub1(x, a)
end
! CHECK-LABEL: func.func @_QPsub1
-! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[EXIT_COND:.*]] = acc.loop
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
index 8086080bd3797..648b8298f0965 100644
--- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
@@ -360,7 +360,7 @@ subroutine acc_parallel_loop
END DO
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.parallel {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
@@ -368,7 +368,7 @@ subroutine acc_parallel_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
!$acc parallel loop create(b) create(zero: a)
DO i = 1, n
diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90
index 1eae106ba61b2..fa98fb1255f1a 100644
--- a/flang/test/Lower/OpenACC/acc-parallel.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel.f90
@@ -252,13 +252,13 @@ subroutine acc_parallel
!$acc end parallel
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
-! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
+! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"}
! CHECK: acc.parallel dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"}
-! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"}
+! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"}
! CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"}
!$acc parallel create(a, b) create(zero: c)
diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 485825dfa8129..10d103c84f8de 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -21,10 +21,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?x2xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?x2xi32>>, !fir.shape<3>) -> !fir.box<!fir.array<?x?x2xi32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?x?x2xi32>>, !fir.box<!fir.array<?x?x2xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?x2xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?x?x2xi32>>):
@@ -38,20 +35,7 @@
! CHECK: ^bb0(%{{.*}}: !fir.box<!fir.array<?xi32>>):
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 4 : index
-! CHECK: %[[UB:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[EXT0:.*]] = arith.subi %[[UB]], %[[LB]] : index
-! CHECK: %[[EXT1:.*]] = arith.addi %[[EXT0]], %[[C1]] : index
-! CHECK: %[[EXT2:.*]] = arith.divsi %[[EXT1]], %[[STEP]] : index
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[EXT2]], %[[C0]] : index
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[SELECT]] : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[LEFT]] to %[[RIGHT]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign {{.*}} to {{.*}} temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -71,10 +55,7 @@
! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?xi32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: hlfir.assign %[[DES_V1]] to %[[DES_V2]] : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.assign %[[ARG0]] to %[[ARG1]] temporary_lhs : !fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>
! CHECK: acc.terminator
! CHECK: } destroy {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
@@ -183,12 +164,19 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<50xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<50xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<50xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE:.*]] : (!fir.ref<!fir.array<50xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[C50:.*]] = arith.constant 50 : index
+! CHECK: %[[C99:.*]] = arith.constant 99 : index
+! CHECK: %[[C1:.*]] = arith.constant 1 : index
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[D0:.*]] = arith.subi %[[C99]], %[[C50]] : index
+! CHECK: %[[D1:.*]] = arith.addi %[[D0]], %[[C1]] : index
+! CHECK: %[[D2:.*]] = arith.divsi %[[D1]], %[[C1]] : index
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[D2]], %[[C0]] : index
+! CHECK: %[[SEL:.*]] = arith.select %[[CMP]], %[[D2]], %[[C0]] : index
+! CHECK: %[[SH:.*]] = fir.shape %[[SEL]] : (index) -> !fir.shape<1>
+! CHECK: %[[SEC_SRC:.*]] = hlfir.designate %[[SRC]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: %[[SEC_DST:.*]] = hlfir.designate %[[DST]] (%c51{{.*}}:%c100{{.*}}:%c1{{.*}}) shape %[[SH]] : (!fir.ref<!fir.array<50xf32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<50xf32>>
+! CHECK: hlfir.assign %[[SEC_SRC]] to %[[SEC_DST]] temporary_lhs : !fir.ref<!fir.array<50xf32>>, !fir.ref<!fir.array<50xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -200,12 +188,7 @@
! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.array<100xf32>>, %[[DST:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_SRC:.*]]:2 = hlfir.declare %[[SRC]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DECL_DST:.*]]:2 = hlfir.declare %[[DST]](%[[SHAPE]]) {uniq_name = ""} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[DES_SRC:.*]] = hlfir.designate %[[DECL_SRC]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: %[[DES_DST:.*]] = hlfir.designate %[[DECL_DST]]#0 shape %[[SHAPE]] : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xf32>>
-! CHECK: hlfir.assign %[[DES_SRC]] to %[[DES_DST]] : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
+! CHECK: hlfir.assign %[[SRC]] to %[[DST]] temporary_lhs : !fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>
! CHECK: acc.terminator
! CHECK: }
@@ -217,7 +200,7 @@
! CHECK: } copy {
! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>):
! CHECK: %[[VALUE:.*]] = fir.load %[[SRC]] : !fir.ref<i32>
-! CHECK: fir.store %[[VALUE]] to %[[DST]] : !fir.ref<i32>
+! CHECK: fir.assign %[[VALUE]] to %[[DST]] temporary_lhs : i32, !fir.ref<i32>
! CHECK: acc.terminator
! CHECK: }
@@ -333,7 +316,7 @@ subroutine acc_private_assumed_shape(a, n)
! CHECK-LABEL: func.func @_QPacc_private_assumed_shape(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private var(%[[DECL_A]]#0 : !fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.box<!fir.array<?xi32>>{{.*}})
@@ -354,7 +337,7 @@ subroutine acc_private_allocatable_array(a, n)
! CHECK-LABEL: func.func @_QPacc_private_allocatable_array(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "a"}
-! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>{{.*}})
@@ -377,7 +360,7 @@ subroutine acc_private_allocatable_scalar(b, a, n)
! CHECK-LABEL: func.func @_QPacc_private_allocatable_scalar(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "b"}
-! CHECK: %[[DECLA_B:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_scalarEb"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[DECLA_B:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_private_allocatable_scalarEb"} : (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_B]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = "b"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_heap_i32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.heap<i32>>>{{.*}})
@@ -395,7 +378,7 @@ subroutine acc_private_pointer_array(a, n)
! CHECK-LABEL: func.func @_QPacc_private_pointer_array(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[DECLA_A]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_ref_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>{{.*}})
@@ -412,8 +395,8 @@ subroutine acc_private_dynamic_extent(a, n)
! CHECK-LABEL: func.func @_QPacc_private_dynamic_extent(
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?x2xi32>> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>, !fir.dscope) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
+! CHECK: %[[DECL_N:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref<!fir.array<?x?x2xi32>>, !fir.shape<3>, !fir.dscope) -> (!fir.box<!fir.array<?x?x2xi32>>, !fir.ref<!fir.array<?x?x2xi32>>)
! CHECK: acc.parallel {{.*}} {
! CHECK: %[[PRIV:.*]] = acc.private var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?x2xi32>>) -> !fir.box<!fir.array<?x?x2xi32>> {name = "a"}
! CHECK: acc.loop {{.*}} private({{.*}}@privatization_box_UxUx2xi32 -> %[[PRIV]] : !fir.box<!fir.array<?x?x2xi32>>{{.*}})
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 6cb8bdf6b511a..b26b1a9540270 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -2,757 +2,1212 @@
! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %c1 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[DIMS0]]#1, %[[DIMS1]]#1 : (index, index) -> !fir.shape<2>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[DIMS0]]#1, %[[DIMS1]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECL]]#0 : f32, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[DECL]]#0 : !fir.box<!fir.array<?x?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[V1:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[V2:.*]]: !fir.box<!fir.array<?x?xf32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[V1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[V2]] shape %[[SHAPE]] : (!fir.box<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> {
-! CHECK: ^bb0(%[[ARG0:.*]]: index, %[[ARG1:.*]]: index):
-! CHECK: %[[D1:.*]] = hlfir.designate %[[DES_V1]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[D2:.*]] = hlfir.designate %[[DES_V2]] (%[[ARG0]], %[[ARG1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[D1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[D2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[V1]] : !hlfir.expr<?x?xf32>, !fir.box<!fir.array<?x?xf32>>
-! CHECK: acc.yield %[[V1]] : !fir.box<!fir.array<?x?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[CST:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[STORAGE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: %[[BOXTEMP:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[BOXTEMP]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: hlfir.assign %[[CST]] to %[[DECLARE]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>):
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[BOX0]], %[[C0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[BOX0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[BOX0]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[BOX1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[BOX1]] shape %[[SHAPE]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[V1:.*]] = hlfir.designate %[[DES_V1]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[V2:.*]] = hlfir.designate %[[DES_V2]] (%[[IV]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[V2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %c0{{.*}} to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DES1:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[DES2:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %[[SHAPE]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%[[IV:.*]]: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[DES1]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[DES2]] (%[[IV]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : f32, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %{{.*}} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<f32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<f32>
-! CHECK: %[[CMPF:.*]] = arith.cmpf ogt, %[[LOAD_V1]], %[[LOAD_V2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[LOAD_V1]], %[[LOAD_V2]] : f32
-! CHECK: hlfir.yield_element %[[SELECT]] : f32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.box<!fir.array<?xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>):
-! CHECK: %[[INIT_VALUE:.*]] = arith.constant 0 : i32
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1 {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[TEMP]](%[[SHAPE]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
-! CHECK: hlfir.assign %[[INIT_VALUE]] to %[[DECLARE]]#0 : i32, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.box<!fir.array<?xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
-! CHECK: %[[LEFT:.*]] = hlfir.designate %[[ARG0]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[RIGHT:.*]] = hlfir.designate %[[ARG1]] shape %[[SHAPE]] : (!fir.box<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
-! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-! CHECK: ^bb0(%{{.*}}: index):
-! CHECK: %[[DES_V1:.*]] = hlfir.designate %[[LEFT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[DES_V2:.*]] = hlfir.designate %[[RIGHT]] (%{{.*}}) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD_V1:.*]] = fir.load %[[DES_V1]] : !fir.ref<i32>
-! CHECK: %[[LOAD_V2:.*]] = fir.load %[[DES_V2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD_V1]], %[[LOAD_V2]] : i32
-! CHECK: hlfir.yield_element %[[COMBINED]] : i32
-! CHECK: }
-! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[ARG0]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
-! CHECK: acc.yield %arg0 : !fir.box<!fir.array<?xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init {
-! CHECK: fir.do_loop %arg1 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg2 = %c0_0 to %c9 step %c1_1 {
-! CHECK: } combiner {
-! CHECK: fir.do_loop %arg2 = %c0 to %c19 step %c1 {
-! CHECK: fir.do_loop %arg3 = %c0_0 to %c9 step %c1_1 {
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.mulc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>):
-! CHECK: %[[REAL:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[IMAG:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
-! CHECK: %[[UNDEF1:.*]] = fir.insert_value %[[UNDEF]], %[[REAL]], [0 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[UNDEF2:.*]] = fir.insert_value %[[UNDEF1]], %[[IMAG]], [1 : index] : (complex<f32>, f32) -> complex<f32>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca complex<f32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
-! CHECK: fir.store %[[UNDEF2]] to %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<complex<f32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<complex<f32>>, %[[ARG1:.*]]: !fir.ref<complex<f32>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<complex<f32>>
-! CHECK: %[[COMBINED:.*]] = fir.addc %[[LOAD0]], %[[LOAD1]] {fastmath = #arith.fastmath<contract>} : complex<f32>
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<complex<f32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi ne, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant false
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.ori %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[CST:.*]] = arith.constant true
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
-! CHECK: %[[CONVERT:.*]] = fir.convert %[[CST]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CONVERT]] to %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.logical<4>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[CONV0:.*]] = fir.convert %[[LOAD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CONV1:.*]] = fir.convert %[[LOAD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[CMP:.*]] = arith.andi %[[CONV0]], %[[CONV1]] : i1
-! CHECK: %[[CMP_CONV:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[CMP_CONV]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.xori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant 0 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE:.*]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.ori %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[CST:.*]] = arith.constant -1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[CST]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.andi %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant -1.401300e-45 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf ogt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1:.*]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init {
-! CHECK: ^bb0(%arg0: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant -2147483648 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xf32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD1]], %[[LOAD2]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 3.40282347E+38 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[CMP:.*]] = arith.cmpf olt, %[[LOAD0]], %[[LOAD1]] {{.*}} : f32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : f32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV0]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 2147483647 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[CMP:.*]] = arith.cmpi slt, %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[SELECT]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.mulf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-! CHECK: %[[INIT:.*]] = arith.constant 1 : i32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.muli %[[LOAD0]], %[[LOAD1]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xf32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xf32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xf32>>):
-! CHECK: %[[LB:.*]] = arith.constant 0 : index
-! CHECK: %[[UB:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<f32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD1]], %[[LOAD2]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<f32>
-! CHECK: }
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xf32>>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-! CHECK: %[[INIT:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[ALLOCA:.*]] = fir.alloca f32
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<f32>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[COMBINED:.*]] = arith.addf %[[LOAD0]], %[[LOAD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: acc.yield %[[ARG0]] : !fir.ref<f32>
-! CHECK: }
-
-! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init {
-! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[INIT:.*]] = arith.constant 0 : i32
-! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.shape<3>
-! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10x2xi32>
-! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>)
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD]] = fir.coordinate_of %[[DECLARE]]#0, %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: fir.store %[[INIT]] to %[[COORD]] : !fir.ref<i32>
-! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10x2xi32>>
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>):
-! CHECK: %[[LB0:.*]] = arith.constant 0 : index
-! CHECK: %[[UB0:.*]] = arith.constant 1 : index
-! CHECK: %[[STEP0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] {
-! CHECK: %[[LB1:.*]] = arith.constant 0 : index
-! CHECK: %[[UB1:.*]] = arith.constant 9 : index
-! CHECK: %[[STEP1:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] {
-! CHECK: %[[LB2:.*]] = arith.constant 0 : index
-! CHECK: %[[UB2:.*]] = arith.constant 99 : index
-! CHECK: %[[STEP2:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[IV2:.*]] = %[[LB2]] to %[[UB2]] step %[[STEP2]] {
-! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV2]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32>
-! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32>
-! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32>
-! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32
-! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32>
-!
CHECK: } -! CHECK: } -! CHECK: } -! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10x2xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100x10xi32>>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2> -! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100x10xi32> -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100x10xi32>> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100x10xi32>>): -! CHECK: %[[LB0:.*]] = arith.constant 0 : index -! CHECK: %[[UB0:.*]] = arith.constant 9 : index -! CHECK: %[[STEP0:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] { -! CHECK: %[[LB1:.*]] = arith.constant 0 : index -! CHECK: %[[UB1:.*]] = arith.constant 99 : index -! CHECK: %[[STEP1:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { -! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> -! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV1]], %[[IV0]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> -! CHECK: %[[LOAD1]] = fir.load %[[COORD1]] : !fir.ref<i32> -! CHECK: %[[LOAD2]] = fir.load %[[COORD2]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32> -! CHECK: } -! CHECK: } -! CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100x10xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.array<100xi32>>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<100xi32> -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) -! HFLIR: acc.yield %[[DECLARE]]#0 : !fir.ref<!fir.array<100xi32>> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[ARG1:.*]]: !fir.ref<!fir.array<100xi32>>): -! CHECK: %[[LB:.*]] = arith.constant 0 : index -! CHECK: %[[UB:.*]] = arith.constant 99 : index -! CHECK: %[[STEP:.*]] = arith.constant 1 : index -! CHECK: fir.do_loop %[[IV:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { -! CHECK: %[[COORD1:.*]] = fir.coordinate_of %[[ARG0]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> -! CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG1]], %[[IV]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> -! CHECK: %[[LOAD1:.*]] = fir.load %[[COORD1]] : !fir.ref<i32> -! CHECK: %[[LOAD2:.*]] = fir.load %[[COORD2]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD1]], %[[LOAD2]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[COORD1]] : !fir.ref<i32> -! CHECK: } -! 
CHECK: acc.yield %[[ARG0]] : !fir.ref<!fir.array<100xi32>> -! CHECK: } - -! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { -! CHECK: ^bb0(%{{.*}}: !fir.ref<i32>): -! CHECK: %[[INIT:.*]] = arith.constant 0 : i32 -! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: fir.store %[[INIT]] to %[[DECLARE]]#0 : !fir.ref<i32> -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref<i32> -! CHECK: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>): -! CHECK: %[[LOAD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32> -! CHECK: %[[LOAD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32> -! CHECK: %[[COMBINED:.*]] = arith.addi %[[LOAD0]], %[[LOAD1]] : i32 -! CHECK: fir.store %[[COMBINED]] to %[[ARG0]] : !fir.ref<i32> -! CHECK: acc.yield %[[ARG0]] : !fir.ref<i32> -! CHECK: } +! CHECK-LABEL: acc.reduction.recipe @reduction_max_box_UxUxf32 : !fir.box<!fir.array<?x?xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?x?xf32>, %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.heap<!fir.array<?x?xf32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?x?xf32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?x?xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1, %[[BOX_DIMS_1]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_2]]#1, %[[BOX_DIMS_3]]#1 : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! 
CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_1]]#1 step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_4]] unordered { +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_4:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_5:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_6]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_4]]#0, %[[CONSTANT_7]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_3]], %[[SUBI_0]] : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_5]]#0, %[[CONSTANT_7]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]], %[[ADDI_1]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_8]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index +! CHECK: %[[BOX_DIMS_7:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_9]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_10:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[BOX_DIMS_6]]#0, %[[CONSTANT_10]] : index +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[VAL_3]], %[[SUBI_2]] : index +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[BOX_DIMS_7]]#0, %[[CONSTANT_10]] : index +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[VAL_2]], %[[SUBI_3]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_2]], %[[ADDI_3]]) : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?x?xf32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?x?xf32>>) -> !fir.heap<!fir.array<?x?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?x?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_ptr_Uxf32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! 
CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! 
CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_box_heap_Uxf32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[DECLARE_1]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: fir.store %[[DECLARE_0]]#0 to %[[CONVERT_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_1]]#0 : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: acc.yield %[[DECLARE_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! 
CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_1]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[LOAD_0]], %[[CONSTANT_3]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[LOAD_0]] (%[[ADDI_0]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[LOAD_1]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[LOAD_1]] (%[[ADDI_1]]) : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_3]], %[[LOAD_2]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_3]], %[[LOAD_2]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb1.ub3_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! 
CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 3 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[BD_LHS:.*]]:3 = fir.box_dims %[[VAL_0]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[LB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c1{{.*}} : index +! CHECK: %[[UB_LHS:.*]] = arith.addi %[[BD_LHS]]#0, %c3{{.*}} : index +! CHECK: %[[BD_RHS:.*]]:3 = fir.box_dims %[[VAL_1]], %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[LB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c1{{.*}} : index +! CHECK: %[[UB_RHS:.*]] = arith.addi %[[BD_RHS]]#0, %c3{{.*}} : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[LB_RHS]]:%[[UB_RHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[LB_LHS]]:%[[UB_LHS]]:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>> +! CHECK: acc.terminator +! CHECK: } + +! 
CHECK-LABEL: acc.reduction.recipe @reduction_max_box_Uxf32 : !fir.box<!fir.array<?xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xf32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : f32, !fir.box<!fir.array<?xf32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xf32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>>): +! 
CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xf32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_box_Uxi32 : !fir.box<!fir.array<?xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS_0]]#1 {bindc_name = ".tmp", uniq_name = ""} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCMEM_0]](%[[SHAPE_0]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.box<!fir.array<?xi32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_0:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[BOX_DIMS_0]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_1:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[BOX_DIMS_1]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[BOX_DIMS_0]]#1 step %[[CONSTANT_2]] unordered { +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_2:.*]]:3 = fir.box_dims %[[VAL_1]], %[[CONSTANT_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[BOX_DIMS_2]]#0, %[[CONSTANT_4]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[VAL_2]], %[[SUBI_0]] : index +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[ADDI_0]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 0 : index +! CHECK: %[[BOX_DIMS_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[CONSTANT_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[BOX_DIMS_3]]#0, %[[CONSTANT_6]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[VAL_2]], %[[SUBI_1]] : index +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[ADDI_1]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! 
CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.box<!fir.array<?xi32>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>, %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>): +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<!fir.array<?xi32>> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb0.ub9xlb0.ub19_ref_10x20xi32 : !fir.ref<!fir.array<10x20xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 20 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<10x20xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<10x20xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x20xi32>>, !fir.ref<!fir.array<10x20xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<10x20xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<10x20xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<10x20xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_6]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_6]] : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[CONSTANT_4]], %[[CONSTANT_3]] : index +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[SUBI_1]], %[[CONSTANT_5]] : index +! CHECK: %[[DIVSI_1:.*]] = arith.divsi %[[ADDI_1]], %[[CONSTANT_5]] : index +! CHECK: %[[CMPI_1:.*]] = arith.cmpi sgt, %[[DIVSI_1]], %[[CONSTANT_7]] : index +! 
CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_1]], %[[DIVSI_1]], %[[CONSTANT_7]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]], %[[SELECT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c1{{.*}}:%c10{{.*}}:%c1{{.*}}, %c1{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<10x20xi32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.ref<!fir.array<10x20xi32>> +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_8]] to %[[SELECT_1]] step %[[CONSTANT_8]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_8]] to %[[SELECT_0]] step %[[CONSTANT_8]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<10x20xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_2]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<10x20xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_section_lb10.ub19_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 19 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_2]] : index +! CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_2]] : index +! 
CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[DIVSI_0]], %[[CONSTANT_3]] : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[SELECT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%c11{{.*}}:%c20{{.*}}:%c1{{.*}}) shape %[[SHAPE_0]] : (!fir.ref<!fir.array<100xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[SELECT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_2:.*]] = hlfir.designate %[[DESIGNATE_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_2]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_3:.*]] = hlfir.designate %[[DESIGNATE_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_3]] : !fir.ref<i32> +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DESIGNATE_3]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32 +! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.ptr<i32>> +! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.ptr<i32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.ptr<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.ptr<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>> +! 
CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[BOX_ADDR_0]] : (!fir.ptr<i32>) -> !fir.heap<i32> +! CHECK: fir.freemem %[[CONVERT_0]] : !fir.heap<i32> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.box<!fir.heap<i32>> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) +! CHECK: %[[ALLOCMEM_0:.*]] = fir.allocmem i32 +! CHECK: %[[EMBOX_0:.*]] = fir.embox %[[ALLOCMEM_0]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> +! CHECK: fir.store %[[EMBOX_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: hlfir.assign %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : i32, !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_1:.*]] = fir.box_addr %[[LOAD_1]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: %[[LOAD_2:.*]] = fir.load %[[BOX_ADDR_0]] : !fir.heap<i32> +! CHECK: %[[LOAD_3:.*]] = fir.load %[[BOX_ADDR_1]] : !fir.heap<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_3]], %[[LOAD_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[BOX_ADDR_1]] : i32, !fir.heap<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>> + +! CHECK-LABEL: } destroy { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.heap<i32>>> +! CHECK: %[[BOX_ADDR_0:.*]] = fir.box_addr %[[LOAD_0]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> +! CHECK: fir.freemem %[[BOX_ADDR_0]] : !fir.heap<i32> +! CHECK: acc.terminator +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_z32 : !fir.ref<complex<f32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32> +! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> + +! 
CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: %[[MULC_0:.*]] = fir.mulc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32> +! CHECK: hlfir.assign %[[MULC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_z32 : !fir.ref<complex<f32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[UNDEFINED_0:.*]] = fir.undefined complex<f32> +! CHECK: %[[INSERT_VALUE_0:.*]] = fir.insert_value %[[UNDEFINED_0]], %[[CONSTANT_0]], [0 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[INSERT_VALUE_1:.*]] = fir.insert_value %[[INSERT_VALUE_0]], %[[CONSTANT_1]], [1 : index] : (complex<f32>, f32) -> complex<f32> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca complex<f32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: fir.store %[[INSERT_VALUE_1]] to %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<complex<f32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<complex<f32>>, %[[VAL_1:.*]]: !fir.ref<complex<f32>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<complex<f32>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: %[[ADDC_0:.*]] = fir.addc %[[LOAD_1]], %[[LOAD_0]] {fastmath = #arith.fastmath<contract>} : complex<f32> +! CHECK: hlfir.assign %[[ADDC_0]] to %[[VAL_0]] : complex<f32>, !fir.ref<complex<f32>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<complex<f32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_neqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <neqv> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant false +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi ne, %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! 
CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_eqv_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <eqv> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant true +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi eq, %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[CMPI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_lor_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <lor> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant false +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[ORI_0:.*]] = arith.ori %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ORI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_land_ref_l32 : !fir.ref<!fir.logical<4>> reduction_operator <land> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant true +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.logical<4> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[CONSTANT_0]] : (i1) -> !fir.logical<4> +! 
CHECK: fir.store %[[CONVERT_0]] to %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.logical<4>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>, %[[VAL_1:.*]]: !fir.ref<!fir.logical<4>>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: %[[CONVERT_0:.*]] = fir.convert %[[LOAD_1]] : (!fir.logical<4>) -> i1 +! CHECK: %[[CONVERT_1:.*]] = fir.convert %[[LOAD_0]] : (!fir.logical<4>) -> i1 +! CHECK: %[[ANDI_0:.*]] = arith.andi %[[CONVERT_0]], %[[CONVERT_1]] : i1 +! CHECK: %[[CONVERT_2:.*]] = fir.convert %[[ANDI_0]] : (i1) -> !fir.logical<4> +! CHECK: hlfir.assign %[[CONVERT_2]] to %[[VAL_0]] : !fir.logical<4>, !fir.ref<!fir.logical<4>> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.logical<4>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_xor_ref_i32 : !fir.ref<i32> reduction_operator <xor> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[XORI_0:.*]] = arith.xori %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[XORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_ior_ref_i32 : !fir.ref<i32> reduction_operator <ior> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ORI_0:.*]] = arith.ori %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ORI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_iand_ref_i32 : !fir.ref<i32> reduction_operator <iand> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! 
CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ANDI_0:.*]] = arith.andi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ANDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_f32 : !fir.ref<f32> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -1.401300e-45 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! 
CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf ogt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! 
CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_max_ref_i32 : !fir.ref<i32> reduction_operator <max> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant -2147483648 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi sgt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100x10xf32 : !fir.ref<!fir.array<100x10xf32>> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xf32>>, !fir.ref<!fir.array<100x10xf32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! 
CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xf32>>, index, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_f32 : !fir.ref<f32> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.40282347E+38 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[CMPF_0:.*]] = arith.cmpf olt, %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPF_0]], %[[LOAD_1]], %[[LOAD_0]] : f32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_min_ref_i32 : !fir.ref<i32> reduction_operator <min> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 2147483647 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[SELECT_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! 
CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[MULF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_f32 : !fir.ref<f32> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[MULF_0:.*]] = arith.mulf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[MULF_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[MULI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_mul_ref_i32 : !fir.ref<i32> reduction_operator <mul> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[MULI_0:.*]] = arith.muli %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[MULI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xf32 : !fir.ref<!fir.array<100xf32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xf32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xf32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xf32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xf32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! 
CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<f32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xf32>>, index) -> !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<f32> +! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[ADDF_0]] to %[[DESIGNATE_1]] : f32, !fir.ref<f32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xf32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_f32 : !fir.ref<f32> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca f32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<f32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<f32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>, %[[VAL_1:.*]]: !fir.ref<f32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<f32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<f32> +! CHECK: %[[ADDF_0:.*]] = arith.addf %[[LOAD_1]], %[[LOAD_0]] fastmath<contract> : f32 +! CHECK: hlfir.assign %[[ADDF_0]] to %[[VAL_0]] : f32, !fir.ref<f32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<f32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10x2xi32 : !fir.ref<!fir.array<100x10x2xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10x2xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10x2xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<100x10x2xi32>>, !fir.ref<!fir.array<100x10x2xi32>>) +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_5]] step %[[CONSTANT_6]] { +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_9:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_7]] to %[[CONSTANT_8]] step %[[CONSTANT_9]] { +! CHECK: %[[CONSTANT_10:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_11:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_12:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_10]] to %[[CONSTANT_11]] step %[[CONSTANT_12]] { +! 
CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10x2xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10x2xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10x2xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 10 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 2 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_3]], %[[CONSTANT_4]], %[[CONSTANT_5]] : (index, index, index) -> !fir.shape<3> +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_2]] step %[[CONSTANT_6]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_1]] step %[[CONSTANT_6]] unordered { +! CHECK: fir.do_loop %[[VAL_4:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_0]] step %[[CONSTANT_6]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_4]], %[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10x2xi32>>, index, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10x2xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100x10xi32 : !fir.ref<!fir.array<100x10xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]], %[[CONSTANT_2]] : (index, index) -> !fir.shape<2> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100x10xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<100x10xi32>>, !fir.ref<!fir.array<100x10xi32>>) +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 9 : index +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_3]] to %[[CONSTANT_4]] step %[[CONSTANT_5]] { +! CHECK: %[[CONSTANT_6:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_7:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_8:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_6]] to %[[CONSTANT_7]] step %[[CONSTANT_8]] { +! 
CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_2]], %[[VAL_1]] : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100x10xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100x10xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100x10xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]], %[[CONSTANT_1]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 100 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 10 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_2]], %[[CONSTANT_3]] : (index, index) -> !fir.shape<2> +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_1]] step %[[CONSTANT_4]] unordered { +! CHECK: fir.do_loop %[[VAL_3:.*]] = %[[CONSTANT_4]] to %[[CONSTANT_0]] step %[[CONSTANT_4]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_3]], %[[VAL_2]]) : (!fir.ref<!fir.array<100x10xi32>>, index, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100x10xi32>> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_100xi32 : !fir.ref<!fir.array<100xi32>> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca !fir.array<100xi32> +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]](%[[SHAPE_0]]) {uniq_name = "acc.reduction.init"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100xi32>>, !fir.ref<!fir.array<100xi32>>) +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : index +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 99 : index +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_1:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_3]] step %[[CONSTANT_4]] { +! CHECK: %[[COORDINATE_OF_0:.*]] = fir.coordinate_of %[[DECLARE_0]]#0, %[[VAL_1]] : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: fir.store %[[CONSTANT_0]] to %[[COORDINATE_OF_0]] : !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<!fir.array<100xi32>> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.array<100xi32>>, %[[VAL_1:.*]]: !fir.ref<!fir.array<100xi32>>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_0]] : (index) -> !fir.shape<1> +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index +! CHECK: %[[SHAPE_1:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1> +! 
CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_2:.*]] = %[[CONSTANT_2]] to %[[CONSTANT_0]] step %[[CONSTANT_2]] unordered { +! CHECK: %[[DESIGNATE_0:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DESIGNATE_0]] : !fir.ref<i32> +! CHECK: %[[DESIGNATE_1:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]) : (!fir.ref<!fir.array<100xi32>>, index) -> !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DESIGNATE_1]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[DESIGNATE_1]] : i32, !fir.ref<i32> +! CHECK: } +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<!fir.array<100xi32>> +! CHECK: } + +! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: } + +! CHECK-LABEL: acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>): +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: fir.store %[[CONSTANT_0]] to %[[DECLARE_0]]#0 : !fir.ref<i32> +! CHECK: acc.yield %[[DECLARE_0]]#0 : !fir.ref<i32> + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>, %[[VAL_1:.*]]: !fir.ref<i32>): +! CHECK: %[[LOAD_0:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32> +! CHECK: %[[LOAD_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32> +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_0]] to %[[VAL_0]] : i32, !fir.ref<i32> +! CHECK: acc.yield %[[VAL_0]] : !fir.ref<i32> +! CHECK: } subroutine acc_reduction_add_int(a, b) integer :: a(100) @@ -1244,7 +1699,7 @@ subroutine acc_reduction_add_dynamic_extent_add_with_section(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_dynamic_extent_add_with_section( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c1{{.*}} : index) upperbound(%c3{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true} ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL]]#0 : !fir.box<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<?xi32>> {name = "a(2:4)"} ! CHECK: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.box<!fir.array<?xi32>>) @@ -1257,7 +1712,7 @@ subroutine acc_reduction_add_allocatable(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_allocatable( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "a"} ! CHECK: acc.parallel reduction(@reduction_max_ref_box_heap_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) @@ -1269,7 +1724,7 @@ subroutine acc_reduction_add_pointer_array(a) ! CHECK-LABEL: func.func @_QPacc_reduction_add_pointer_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "a"}) -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {name = "a"} ! CHECK: acc.parallel reduction(@reduction_max_ref_box_ptr_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) @@ -1282,6 +1737,6 @@ subroutine acc_reduction_max_dynamic_extent_max(a, n) ! CHECK-LABEL: func.func @_QPacc_reduction_max_dynamic_extent_max( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "a"}, %{{.*}}: !fir.ref<i32> {fir.bindc_name = "n"}) -! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) +! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>) ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "a"} ! 
CHECK: acc.parallel reduction(@reduction_max_box_UxUxf32 -> %[[RED]] : !fir.box<!fir.array<?x?xf32>>) diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index cad0ee73f6cc5..15ae69ab86965 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -301,7 +301,7 @@ subroutine acc_serial_loop END DO ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: acc.serial {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield @@ -309,7 +309,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {name = "b"} +! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} !$acc serial loop create(b) create(zero: a) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index 1e4f32fd209ef..1eaa0e4994b05 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -201,13 +201,13 @@ subroutine acc_serial !$acc end serial ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"} -! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"} +! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! CHECK: %[[CREATE_C:.*]] = acc.create varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "c"} ! CHECK: acc.serial dataOperands(%[[CREATE_A]], %[[CREATE_B]], %[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>, !fir.ref<!fir.array<10x10xf32>>) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.copyout accPtr(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a"} -! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b"} +! CHECK: acc.copyout accPtr(%[[CREATE_B]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copyout_zero>, name = "b"} ! 
CHECK: acc.copyout accPtr(%[[CREATE_C]] : !fir.ref<!fir.array<10x10xf32>>) to varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "c"} !$acc serial create(a, b) create(zero: c) diff --git a/flang/test/Lower/OpenACC/acc-unstructured.f90 b/flang/test/Lower/OpenACC/acc-unstructured.f90 index c42c7dddc5ca1..829ed5486c196 100644 --- a/flang/test/Lower/OpenACC/acc-unstructured.f90 +++ b/flang/test/Lower/OpenACC/acc-unstructured.f90 @@ -1,5 +1,4 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! XFAIL: * subroutine test_unstructured1(a, b, c) integer :: i, j, k @@ -55,10 +54,11 @@ subroutine test_unstructured2(a, b, c) ! CHECK-LABEL: func.func @_QPtest_unstructured2 ! CHECK: acc.parallel -! CHECK: acc.loop +! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) { ! CHECK: fir.call @_FortranAStopStatementText ! CHECK: acc.yield ! CHECK: acc.yield +! CHECK: } attributes {independent = [#acc.device_type<none>], unstructured} ! CHECK: acc.yield end subroutine diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 index 081a6e317bfc9..30fefdb44a2bf 100644 --- a/flang/test/Lower/OpenACC/acc-use-device.f90 +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -36,9 +36,9 @@ subroutine test2(a, b, c) call allocate(d(N)) c => d ! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>) -! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) +! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {{.*}} {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {{.*}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) !$acc data copy(a,b,c,d) !$acc host_data use_device(a,b,c) diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 index 272f34fc0fd1a..cfe42367b051b 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 @@ -72,7 +72,7 @@ end subroutine target_allocatable ! CPU-SAME: {bindc_name = "alloc_var", {{.*}}} ! 
CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]] ! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]] -! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} +! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} ! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> ! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private( diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 index f3b939780c2b6..a6394ea196998 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-multiple-variables.f90 @@ -156,7 +156,7 @@ end subroutine target_allocatable ! CHECK-SAME: %[[REAL_ARR_DESC_MAP]] -> %[[MAPPED_ARG2:[^,]+]] ! CHECK-SAME: %[[CHAR_VAR_DESC_MAP]] -> %[[MAPPED_ARG3:.[^,]+]] ! CHECK-SAME: %[[MAPPED_MI0]] -> %[[MAPPED_ARG0:[^,]+]] -! CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<i32>, !fir.llvm_ptr<!fir.ref<i32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>> +! CHECK-SAME: !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<i32>, !fir.llvm_ptr<!fir.ref<i32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> ! CHECK-SAME: private( ! CHECK-SAME: @[[ALLOC_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[ALLOC_ARG:[^,]+]] [map_idx=0], ! CHECK-SAME: @[[REAL_PRIVATIZER_SYM]] %{{[^[:space:]]+}}#0 -> %[[REAL_ARG:[^,]+]], diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 index 6818c39f63a3c..1e0d9694258cc 100644 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 @@ -6,7 +6,7 @@ subroutine f00 ! NOTE: This is implemented for scalars as it is the default behaviour, so we utilise ! a different data type. 
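! For contrast with the TODO exercised below: per the NOTE above,
! defaultmap(firstprivate) on a plain scalar is the already-supported
! default behaviour, so a scalar variant would lower without hitting this
! TODO. A minimal sketch of that supported form (hypothetical subroutine
! name f01; illustrative only, not part of this test):
!
!   subroutine f01
!     implicit none
!     integer :: j   ! plain scalar, so lowering takes the default path
!     !$omp target defaultmap(firstprivate)
!     j = 10
!     !$omp end target
!   end subroutine f01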
integer, allocatable :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour + !CHECK: not yet implemented: Firstprivate is currently unsupported defaultmap behaviour !$omp target defaultmap(firstprivate) i = 10 !$omp end target diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 deleted file mode 100644 index 287eb4a9dfe8f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 +++ /dev/null @@ -1,11 +0,0 @@ -!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s - -subroutine f00 - implicit none - integer :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour - !$omp target defaultmap(none) - i = 10 - !$omp end target -end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 deleted file mode 100644 index 8acc399a92abe..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASKLOOP construct -subroutine omp_taskloop_inreduction() - integer x - x = 0 - !$omp taskloop in_reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 deleted file mode 100644 index 0c16bd227257f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! 
CHECK: not yet implemented: Unhandled clause REDUCTION in TASKLOOP construct -subroutine omp_taskloop_reduction() - integer x - x = 0 - !$omp taskloop reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 index 96d779c763d18..465a07cb2baf6 100644 --- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 +++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 @@ -64,7 +64,7 @@ module assumed_allocatable_array_routines !HOST-LABEL: func.func @_QMassumed_allocatable_array_routinesPassumed_shape_array( -!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) +!HOST: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG:.*]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout>, uniq_name = "_QMassumed_allocatable_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) !HOST: %[[LOAD_1:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> !HOST: %[[LOAD_2:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> !HOST: %[[CONSTANT_1:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index 8f98d671486ae..00ebab315e166 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -41,7 +41,7 @@ module assumed_array_routines !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array( !HOST-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) { !HOST: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> -!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEarr_read_write"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !HOST: %[[C0:.*]] = arith.constant 1 : index !HOST: %[[C1:.*]] = arith.constant 0 : index !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) @@ -68,7 +68,7 @@ end subroutine assumed_shape_array !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( !HOST-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"}) { !HOST: %[[ARG0_SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, 
uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) +!HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) !HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} !HOST: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#0, %c0{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index diff --git a/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 b/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 new file mode 100644 index 0000000000000..ee15b8805a69b --- /dev/null +++ b/flang/test/Lower/OpenMP/atomic-update-capture-complex-part.f90 @@ -0,0 +1,17 @@ +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s + +! Check that this compiles successfully. + +!CHECK: omp.atomic.capture +!CHECK: omp.atomic.read +!CHECK: omp.atomic.update +subroutine f00 + implicit none + real :: c + complex, allocatable :: x + !$omp atomic update capture + c = x%re + x%re = x%re + 1.0 + !$omp end atomic +end + diff --git a/flang/test/Lower/OpenMP/cancel.f90 b/flang/test/Lower/OpenMP/cancel.f90 index fd1f110e5804c..8870572703f0b 100644 --- a/flang/test/Lower/OpenMP/cancel.f90 +++ b/flang/test/Lower/OpenMP/cancel.f90 @@ -85,7 +85,7 @@ subroutine cancel_parallel_if(cond) ! CHECK-LABEL: func.func @_QPcancel_parallel_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_parallel_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_parallel_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.parallel { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.logical<4>) -> i1 @@ -106,7 +106,7 @@ subroutine cancel_do_if(cond) ! CHECK-LABEL: func.func @_QPcancel_do_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_do_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_do_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcancel_do_ifEi"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFcancel_do_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: omp.parallel { @@ -138,7 +138,7 @@ subroutine cancel_sections_if(cond) ! CHECK-LABEL: func.func @_QPcancel_sections_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_sections_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_sections_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.sections { ! CHECK: omp.section { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> @@ -162,7 +162,7 @@ subroutine cancel_taskgroup_if(cond) ! CHECK-LABEL: func.func @_QPcancel_taskgroup_if( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_taskgroup_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFcancel_taskgroup_ifEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: omp.taskgroup { ! CHECK: omp.task { ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>> diff --git a/flang/test/Lower/OpenMP/depend-complex.f90 b/flang/test/Lower/OpenMP/depend-complex.f90 index 488696b565077..84c4cb549116d 100644 --- a/flang/test/Lower/OpenMP/depend-complex.f90 +++ b/flang/test/Lower/OpenMP/depend-complex.f90 @@ -5,7 +5,7 @@ subroutine depend_complex(z) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<complex<f32>> {fir.bindc_name = "z"}) { complex :: z ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) !$omp task depend(in:z%re) ! CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0 real : (!fir.ref<complex<f32>>) -> !fir.ref<f32> ! CHECK: omp.task depend(taskdependin -> %[[VAL_2]] : !fir.ref<f32>) { diff --git a/flang/test/Lower/OpenMP/depend-substring.f90 b/flang/test/Lower/OpenMP/depend-substring.f90 index 5de11e06cc10b..eab6cd49ec036 100644 --- a/flang/test/Lower/OpenMP/depend-substring.f90 +++ b/flang/test/Lower/OpenMP/depend-substring.f90 @@ -8,7 +8,7 @@ subroutine substring_0(c) ! CHECK-LABEL: func.func @_QPsubstring_0( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -41,7 +41,7 @@ subroutine substring_1(c) ! CHECK-LABEL: func.func @_QPsubstring_1( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -74,7 +74,7 @@ subroutine substring_2(c) ! CHECK-LABEL: func.func @_QPsubstring_2( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> @@ -98,7 +98,7 @@ subroutine substring_4(c) ! CHECK-LABEL: func.func @_QPsubstring_4( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) { ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
! CHECK: omp.task depend(taskdependout -> %[[VAL_3]] : !fir.ptr<!fir.char<1,?>>) {
diff --git a/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90 b/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90
new file mode 100644
index 0000000000000..07b4f041c6188
--- /dev/null
+++ b/flang/test/Lower/OpenMP/dynamic-len-char-bounds-gen.f90
@@ -0,0 +1,19 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+subroutine TestCharLenBounds(clen)
+
+ character(len=*) :: clen
+
+ !$omp target map(clen)
+ !$omp end target
+end subroutine TestCharLenBounds
+
+!CHECK: %[[DUMMY:.*]] = fir.dummy_scope : !fir.dscope
+!CHECK: %[[UNBOX:.*]]:2 = fir.unboxchar %{{.*}} : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+!CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[UNBOX]]#0 typeparams %[[UNBOX]]#1 dummy_scope %[[DUMMY]] arg {{[0-9]+}} {uniq_name = "_QFtestcharlenboundsEclen"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+!CHECK: %[[LB_START_IDX:.*]] = arith.constant 0 : index
+!CHECK: %[[STRIDE:.*]] = arith.constant 1 : index
+!CHECK: %[[EXTENT:.*]]:2 = fir.unboxchar %[[DECLARE]]#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+!CHECK: %[[UB:.*]] = arith.subi %[[EXTENT]]#1, %[[STRIDE]] : index
+!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB_START_IDX]] : index) upper_bound(%[[UB]] : index) extent(%[[EXTENT]]#1 : index) stride(%[[STRIDE]] : index) start_idx(%[[LB_START_IDX]] : index) {stride_in_bytes = true}
+!CHECK: %{{.*}} = omp.map.info {{.*}} bounds(%[[BOUNDS]]) {{.*}}
diff --git a/flang/test/Lower/OpenMP/flush.f90 b/flang/test/Lower/OpenMP/flush.f90
index 03e9f3b2e437c..f9d90e38e6eeb 100644
--- a/flang/test/Lower/OpenMP/flush.f90
+++ b/flang/test/Lower/OpenMP/flush.f90
@@ -7,9 +7,9 @@ subroutine flush_standalone(a, b, c)
integer, intent(inout) :: a, b, c
-!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEa"} : (!fir.ref<i32>, 
!fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_standaloneEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.flush(%[[A]]#0, %[[B]]#0, %[[C]]#0 : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.flush !$omp flush(a,b,c) @@ -21,9 +21,9 @@ end subroutine flush_standalone !CHECK-SAME: %[[ARG_A:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG_B:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}, %[[ARG_C:.*]]: !fir.ref<i32> {fir.bindc_name = "c"}) subroutine flush_parallel(a, b, c) integer, intent(inout) :: a, b, c -!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG_A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG_B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG_C]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFflush_parallelEc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !$omp parallel !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index 3ae9018ae4d5d..e8f69b470e2ca 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -12,7 +12,6 @@ program main ! - PARALLEL SECTIONS ! - PARALLEL WORKSHARE ! - TARGET UPDATE - ! - TASKLOOP ! - TASKLOOP SIMD ! ---------------------------------------------------------------------------- @@ -1580,4 +1579,29 @@ program main !$omp teams if(teams: .true.) i = 1 !$omp end teams + + ! ---------------------------------------------------------------------------- + ! TASKLOOP + ! ---------------------------------------------------------------------------- + + ! CHECK: omp.taskloop + ! CHECK-NOT: if({{.*}}) + !$omp taskloop + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(.true.) + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(taskloop: .true.) 
+ do i = 1, 10 + end do + !$omp end taskloop end program main diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index 0d2db63edfe79..9d01460253899 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -5,6 +5,36 @@ ! Privatizers +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST3_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST3_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_X_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST1_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_Y_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32 ! CHECK-SAME: copy { @@ -310,4 +340,100 @@ subroutine implicit_dsa_test7 !$omp end task end subroutine -! TODO Test taskloop +! Test taskloop +! CHECK-LABEL: func.func @_QPimplicit_dsa_taskloop_test1 +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} +! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ALLOCA_Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} +! CHECK: %[[DECL_Z:.*]]:2 = hlfir.declare %[[ALLOCA_Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test1 + integer :: x, y, z + ! CHECK: omp.taskloop private( + ! CHECK-SAME: @[[TASKLOOP_TEST1_Y_PRIVATE]] %[[DECL_Y]]#0 -> %[[ARG0:.*]], @[[TASKLOOP_TEST1_X_FIRSTPRIVATE]] %[[DECL_X]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) { + ! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { + !$omp taskloop private(y) shared(z) + do i = 1, 100 + ! CHECK: %[[Y_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + x = y + z + ! 
CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop default(shared) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[DECL_Y]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[LOAD_Y]], %[[LOAD_Z]] : i32 + x = y + z + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine + +! Nested taskloop with implicit shared DSA variables. +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test2 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test2 + integer :: x + ! CHECK: omp.parallel { + !$omp parallel + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop + do i = 1, 100 + ! CHECK: hlfir.assign %{{.*}} to %[[X_DECL]]#0 : i32, !fir.ref<i32> + x = 2 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_X_PRIVATE]] %[[X_DECL]]#0 -> %[[ARG0]], @[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop private(x) + do i = 1, 10 + ! CHECK: %[[DECL_PRIV_X:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_PRIV_X]]#0 : !fir.ref<i32> + x = x + 1 + ! CHECK: hlfir.assign %{{.*}} to %[[DECL_PRIV_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end parallel + +end subroutine + +! Taskloop with implicit firstprivate DSA variables, enclosed in private context. + +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test3 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} +! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} +! CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + +subroutine implicit_dsa_taskloop_test3 + integer :: x, y, z + ! CHECK: omp.parallel private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + ! 
CHECK: %[[X_PRIV_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !$omp parallel firstprivate(x) + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_PRIV_VAL]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST3_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<i32> + x = y + z + ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + !$omp end parallel +end subroutine + diff --git a/flang/test/Lower/OpenMP/map-character.f90 b/flang/test/Lower/OpenMP/map-character.f90 index cefd3ac0e54f9..9a114238fa9ec 100644 --- a/flang/test/Lower/OpenMP/map-character.f90 +++ b/flang/test/Lower/OpenMP/map-character.f90 @@ -39,8 +39,11 @@ end subroutine TestOfCharacter !CHECK: %[[A1_UB:.*]] = arith.subi %[[UNBOXED_ARG1]]#1, %[[CONST_ONE]] : index !CHECK: %[[BOUNDS_A1_BOXCHAR:.*]] = omp.map.bounds lower_bound(%[[CONST_ZERO]] : index) upper_bound(%[[A1_UB]] : index) extent(%[[UNBOXED_ARG1]]#1 : index) !CHECK-SAME: stride(%[[CONST_ONE]] : index) start_idx(%[[CONST_ZERO]] : index) {stride_in_bytes = true} -!CHECK: %[[A1_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) -!CHECK-SAME: capture(ByRef) bounds(%[[BOUNDS_A1_BOXCHAR]]) -> !fir.ref<!fir.boxchar<1>> {name = ""} +!CHECK: %[[A1_BOX_ADDR:.*]] = fir.box_offset %[[A1_BOXCHAR_ALLOCA]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> +!CHECK: %[[A1_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) +!CHECK-SAME: capture(ByRef) var_ptr_ptr(%[[A1_BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[BOUNDS_A1_BOXCHAR]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""} +!CHECK: %[[A1_BOXCHAR_MAP_2:.*]] = omp.map.info var_ptr(%[[A1_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) +!CHECK-SAME: map_clauses(implicit, to) capture(ByRef) members(%[[A1_BOXCHAR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""} !CHECK: fir.store %[[ARG0]] to %[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>> !CHECK: %[[CONST_ZERO:.*]] = arith.constant 0 : index !CHECK: %[[CONST_ONE:.*]] = arith.constant 1 : index @@ -48,9 +51,12 @@ end subroutine TestOfCharacter !CHECK: %[[A0_UB:.*]] = arith.subi %[[UNBOXED_ARG0]]#1, %[[CONST_ONE]] : index !CHECK: %[[BOUNDS_A0_BOXCHAR:.*]] = omp.map.bounds lower_bound(%[[CONST_ZERO]] : index) upper_bound(%[[A0_UB]] : index) extent(%[[UNBOXED_ARG0]]#1 : index) !CHECK-SAME: stride(%[[CONST_ONE]] : index) start_idx(%[[CONST_ZERO]] : index) {stride_in_bytes = true} -!CHECK: %[[A0_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) -!CHECK-SAME: capture(ByRef) bounds(%[[BOUNDS_A0_BOXCHAR]]) -> !fir.ref<!fir.boxchar<1>> {name = ""} -!CHECK: omp.target map_entries(%[[A0_MAP]] -> %[[TGT_A0:.*]], %[[A1_MAP]] -> %[[TGT_A1:.*]], %[[A1_BOXCHAR_MAP]] -> %[[TGT_A1_BOXCHAR:.*]], %[[A0_BOXCHAR_MAP]] -> %[[TGT_A0_BOXCHAR:.*]] : 
!fir.ref<!fir.char<1,?>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) {
+!CHECK: %[[A0_BOX_ADDR:.*]] = fir.box_offset %[[A0_BOXCHAR_ALLOCA]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>
+!CHECK: %[[A0_BOXCHAR_MAP:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to)
+!CHECK-SAME: capture(ByRef) var_ptr_ptr(%[[A0_BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[BOUNDS_A0_BOXCHAR]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""}
+!CHECK: %[[A0_BOXCHAR_MAP_2:.*]] = omp.map.info var_ptr(%[[A0_BOXCHAR_ALLOCA]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to)
+!CHECK-SAME: capture(ByRef) members(%[[A0_BOXCHAR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""}
+!CHECK: omp.target map_entries(%[[A0_MAP]] -> %[[TGT_A0:.*]], %[[A1_MAP]] -> %[[TGT_A1:.*]], %[[A1_BOXCHAR_MAP_2]] -> %[[TGT_A1_BOXCHAR:.*]], %[[A0_BOXCHAR_MAP_2]] -> %[[TGT_A0_BOXCHAR:.*]], %[[A1_BOXCHAR_MAP]] -> %[[TGT_A1_BOXCHAR2:.*]], %[[A0_BOXCHAR_MAP]] -> %[[TGT_A0_BOXCHAR2:.*]] : !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) {
!CHECK: %[[TGT_A0_BC_LD:.*]] = fir.load %[[TGT_A0_BOXCHAR]] : !fir.ref<!fir.boxchar<1>>
!CHECK: %[[TGT_A1_BC_LD:.*]] = fir.load %[[TGT_A1_BOXCHAR]] : !fir.ref<!fir.boxchar<1>>
!CHECK: %[[UNBOXED_TGT_A1:.*]]:2 = fir.unboxchar %[[TGT_A1_BC_LD]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
diff --git a/flang/test/Lower/OpenMP/optional-argument-map-2.f90 b/flang/test/Lower/OpenMP/optional-argument-map-2.f90
index 791d509028dee..2eb18448e901f 100644
--- a/flang/test/Lower/OpenMP/optional-argument-map-2.f90
+++ b/flang/test/Lower/OpenMP/optional-argument-map-2.f90
@@ -28,7 +28,7 @@ end module mod
! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "a", fir.optional}) {
! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>>
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable, intent_inout, optional>, uniq_name = "_QMmodFroutine_boxEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, intent_inout, optional>, uniq_name = "_QMmodFroutine_boxEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[VAL_8:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> i1
! CHECK: %[[VAL_9:.*]]:5 = fir.if %[[VAL_8]] -> (index, index, index, index, index) {
! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
@@ -58,7 +58,7 @@ end module mod
! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxchar<1>
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_4:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "b", uniq_name = "_QMmodFroutine_boxcharEb"} ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_4]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -71,11 +71,10 @@ end module mod ! CHECK-FPRIV: %[[VAL_12:.*]]:2 = fir.unboxchar %[[VAL_8]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK-FPRIV: %[[VAL_13:.*]] = arith.subi %[[VAL_12]]#1, %[[VAL_11]] : index ! CHECK-FPRIV: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%[[VAL_10]] : index) upper_bound(%[[VAL_13]] : index) extent(%[[VAL_12]]#1 : index) stride(%[[VAL_11]] : index) start_idx(%[[VAL_10]] : index) {stride_in_bytes = true} -! CHECK-FPRIV: %[[VAL_15:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.boxchar<1>> ! CHECK-FPRIV: %[[VAL_16:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> -! CHECK-FPRIV: %[[VAL_17:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_16]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_14]]) -> !fir.ref<!fir.boxchar<1>> -! CHECK-FPRIV: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_17]] : [0] : !fir.ref<!fir.boxchar<1>>) -> !fir.ref<!fir.boxchar<1>> -! CHECK-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_19:.*]], %[[VAL_18]] -> %[[VAL_20:.*]], %[[VAL_17]] -> %[[VAL_21:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) private(@_QMmodFroutine_boxcharEa_firstprivate_boxchar_c8xU %[[VAL_3]]#0 -> %[[VAL_22:.*]] [map_idx=1] : !fir.boxchar<1>) { +! CHECK-FPRIV: %[[VAL_17:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_16]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_14]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""} +! CHECK-FPRIV: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_17]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> +! CHECK-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_19:.*]], %[[VAL_18]] -> %[[VAL_20:.*]], %[[VAL_17]] -> %[[VAL_21:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) private(@_QMmodFroutine_boxcharEa_firstprivate_boxchar_c8xU %[[VAL_3]]#0 -> %[[VAL_22:.*]] [map_idx=1] : !fir.boxchar<1>) { ! CHECK-FPRIV: %[[VAL_23:.*]] = arith.constant 4 : index ! 
CHECK-FPRIV: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] typeparams %[[VAL_23]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
! CHECK-FPRIV: %[[VAL_25:.*]]:2 = fir.unboxchar %[[VAL_22]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
@@ -103,14 +102,16 @@ end module mod
! CHECK-NO-FPRIV: %[[VAL_19:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK-NO-FPRIV: %[[VAL_20:.*]] = arith.subi %[[VAL_19]]#1, %[[VAL_18]] : index
! CHECK-NO-FPRIV: %[[VAL_21:.*]] = omp.map.bounds lower_bound(%[[VAL_17]] : index) upper_bound(%[[VAL_20]] : index) extent(%[[VAL_19]]#1 : index) stride(%[[VAL_18]] : index) start_idx(%[[VAL_17]] : index) {stride_in_bytes = true}
-! CHECK-NO-FPRIV: %[[VAL_22:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) bounds(%[[VAL_21]]) -> !fir.ref<!fir.boxchar<1>> {name = ""}
-! CHECK-NO-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_23:.*]], %[[VAL_16]] -> %[[VAL_24:.*]], %[[VAL_22]] -> %[[VAL_25:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>) {
-! CHECK-NO-FPRIV: %[[VAL_26:.*]] = fir.load %[[VAL_25]] : !fir.ref<!fir.boxchar<1>>
-! CHECK-NO-FPRIV: %[[VAL_27:.*]]:2 = fir.unboxchar %[[VAL_26]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK-NO-FPRIV: %[[VAL_28:.*]] = arith.constant 4 : index
-! CHECK-NO-FPRIV: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_23]] typeparams %[[VAL_28]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
-! CHECK-NO-FPRIV: %[[VAL_30:.*]]:2 = hlfir.declare %[[VAL_24]] typeparams %[[VAL_27]]#1 {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-! CHECK-NO-FPRIV: hlfir.assign %[[VAL_30]]#0 to %[[VAL_29]]#0 : !fir.boxchar<1>, !fir.ref<!fir.char<1,4>>
+! CHECK-NO-FPRIV: %[[VAL_22:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>
+! CHECK-NO-FPRIV: %[[VAL_23:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_22]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_21]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> {name = ""}
+! CHECK-NO-FPRIV: %[[VAL_24:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_23]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> {name = ""}
+! CHECK-NO-FPRIV: omp.target map_entries(%[[VAL_7]] -> %[[VAL_25:.*]], %[[VAL_16]] -> %[[VAL_26:.*]], %[[VAL_24]] -> %[[VAL_27:.*]], %[[VAL_23]] -> %[[VAL_28:.*]] : !fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,?>>, !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) {
+! CHECK-NO-FPRIV: %[[VAL_29:.*]] = fir.load %[[VAL_27]] : !fir.ref<!fir.boxchar<1>>
+! CHECK-NO-FPRIV: %[[VAL_30:.*]]:2 = fir.unboxchar %[[VAL_29]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK-NO-FPRIV: %[[VAL_31:.*]] = arith.constant 4 : index
+! 
CHECK-NO-FPRIV: %[[VAL_32:.*]]:2 = hlfir.declare %[[VAL_25]] typeparams %[[VAL_31]] {uniq_name = "_QMmodFroutine_boxcharEb"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK-NO-FPRIV: %[[VAL_33:.*]]:2 = hlfir.declare %[[VAL_26]] typeparams %[[VAL_30]]#1 {fortran_attrs = #fir.var_attrs<intent_in, optional>, uniq_name = "_QMmodFroutine_boxcharEa"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK-NO-FPRIV: hlfir.assign %[[VAL_33]]#0 to %[[VAL_32]]#0 : !fir.boxchar<1>, !fir.ref<!fir.char<1,4>> ! CHECK-NO-FPRIV: omp.terminator ! CHECK-NO-FPRIV: } ! CHECK-NO-FPRIV: return diff --git a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 index 416d1abca36c8..a049b43e85ed9 100644 --- a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 @@ -23,8 +23,8 @@ !CHECK: } !CHECK-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref<complex<f32>>{{.*}}, %[[ARG2:.*]]: !fir.ref<complex<f64>>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>, !fir.dscope) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) !CHECK: omp.parallel private(@[[ARG1_COMPLEX_PRIVATIZER]] %{{.*}}#0 -> %[[ARG1_PVT:.*]], @[[ARG2_COMPLEX_PRIVATIZER]] %{{.*}}#0 -> %[[ARG2_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref<complex<f32>>) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref<complex<f64>>) -> (!fir.ref<complex<f64>>, !fir.ref<complex<f64>>) @@ -43,12 +43,12 @@ subroutine firstprivate_complex(arg1, arg2) end subroutine !CHECK-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<i8>{{.*}}, %[[ARG3:.*]]: !fir.ref<i16>{{.*}}, %[[ARG4:.*]]: !fir.ref<i32>{{.*}}, %[[ARG5:.*]]: !fir.ref<i64>{{.*}}, %[[ARG6:.*]]: !fir.ref<i128>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} 
{uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) -!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG5_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG6_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>) @@ -76,11 +76,11 @@ subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6) end subroutine !CHECK-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.logical<1>>{{.*}}, %[[ARG3:.*]]: !fir.ref<!fir.logical<2>>{{.*}}, %[[ARG4:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG5:.*]]: !fir.ref<!fir.logical<8>>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>, !fir.dscope) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = 
"_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) -!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref<!fir.logical<2>>, !fir.dscope) -> (!fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<2>>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) +!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref<!fir.logical<8>>, !fir.dscope) -> (!fir.ref<!fir.logical<8>>, !fir.ref<!fir.logical<8>>) !CHECK: omp.parallel private(@[[ARG1_LOGICAL_PRIVATIZER]] {{.*}}#0 -> %[[ARG1_PVT:.*]], @[[ARG2_LOGICAL_PRIVATIZER]] {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG5_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref<!fir.logical<1>>) -> (!fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<1>>) @@ -106,10 +106,10 @@ subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5) !CHECK-LABEL: func @_QPfirstprivate_real( !CHECK-SAME: %[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>) -!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) -!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope 
%{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>, !fir.dscope) -> (!fir.ref<f16>, !fir.ref<f16>) +!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG2_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG3_PVT:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[ARG4_PVT:.*]] : {{.*}}) { !CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) !CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref<f16>) -> (!fir.ref<f16>, !fir.ref<f16>) @@ -132,7 +132,7 @@ subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6) !CHECK-KIND10-LABEL: func @_QPfirstprivate_real10( !CHECK-KIND10-SAME: %[[ARG1:.*]]: !fir.ref<f80>{{.*}}) { -!CHECK-KIND10: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>) +!CHECK-KIND10: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>, !fir.dscope) -> (!fir.ref<f80>, !fir.ref<f80>) !CHECK-KIND10: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) { !CHECK-KIND10: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real10Earg1"} : (!fir.ref<f80>) -> (!fir.ref<f80>, !fir.ref<f80>) !CHECK-KIND10: fir.call @_QPqux10(%[[ARG1_PVT_DECL]]#0) {{.*}} : (!fir.ref<f80>) -> () @@ -148,7 +148,7 @@ subroutine firstprivate_real10(arg1) !CHECK-KIND16-LABEL: func @_QPfirstprivate_real16( !CHECK-KIND16-SAME: %[[ARG1:.*]]: !fir.ref<f128>{{.*}}) { -!CHECK-KIND16: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>) +!CHECK-KIND16: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>, !fir.dscope) -> (!fir.ref<f128>, !fir.ref<f128>) !CHECK-KIND16: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[ARG1_PVT:.*]] : {{.*}}) { !CHECK-KIND16: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_real16Earg1"} : (!fir.ref<f128>) -> (!fir.ref<f128>, !fir.ref<f128>) !CHECK-KIND16: fir.call @_QPqux16(%[[ARG1_PVT_DECL]]#0) {{.*}} : (!fir.ref<f128>) -> () @@ -165,8 +165,8 @@ subroutine firstprivate_real16(arg1) !CHECK-LABEL: func.func @_QPmultiple_firstprivate( !CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, !CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) { -!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) 
-> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel private({{.*firstprivate.*}} {{.*}}#0 -> %[[A_PRIV_ADDR:.*]], {{.*firstprivate.*}} {{.*}}#0 -> %[[B_PRIV_ADDR:.*]] : {{.*}}) { !CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 index 5d37010f4095b..50fd8e11089ea 100644 --- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 +++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 @@ -7,7 +7,7 @@ !CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar !CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index !CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>> -!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) +!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref<!fir.char<1,5>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) !CHECK: omp.parallel { @@ -56,7 +56,7 @@ subroutine lastprivate_character(arg1) end subroutine !CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK-DAG: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -94,8 +94,8 @@ subroutine lastprivate_int(arg1) end subroutine !CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, 
!fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -136,8 +136,8 @@ subroutine mult_lastprivate_int(arg1, arg2) end subroutine !CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { !CHECK: omp.wsloop private(@{{.*}} %{{.*}}#0 -> %[[CLONE1:.*]], @{{.*}} %{{.*}}#0 -> %[[CLONE2:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !{{.*}}) { !CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} { @@ -178,8 +178,8 @@ subroutine mult_lastprivate_int2(arg1, arg2) end subroutine !CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { ! Firstprivate update !CHECK-NOT: omp.barrier @@ -220,7 +220,7 @@ subroutine firstpriv_lastpriv_int(arg1, arg2) end subroutine !CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) { -!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) !CHECK: omp.parallel { ! 
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
index 3bb40834afe4c..e2fbd8b7d1ac9 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -35,7 +35,7 @@
 ! CHECK-LABEL: @_QPmultiple_private_fix(
 ! CHECK-SAME: %[[GAMA:.*]]: !fir.ref<i32> {fir.bindc_name = "gama"}
-! CHECK-DAG: %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG: %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
 ! CHECK-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK-DAG: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
@@ -124,7 +124,7 @@ subroutine multiple_private_fix2()
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "aaa"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
 ! CHECK: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.parallel private(@[[BOX_HEAP_CHAR_PRIVATIZER]] %[[VAL_3]]#0 -> %{{.*}} : {{.*}}) {
 ! CHECK: omp.terminator
 ! CHECK: }
@@ -139,7 +139,7 @@ subroutine sub01(aaa)
 ! CHECK-LABEL: func.func @_QPsub02(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "bbb"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.parallel private(@{{.*}} %[[VAL_1]]#0 -> %[[PRIV_ARG:.*]] : {{.*}}) {
 ! CHECK: %{{.*}}:2 = hlfir.declare %[[PRIV_ARG]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
 ! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
index a08c0b28e3e41..7d364c00ce8be 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90
@@ -65,7 +65,7 @@ subroutine test_allocatable_string(n)
 end subroutine
 !CHECK: func.func @_QPtest_allocatable_string_array(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-!CHECK: %{{.*}} = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %{{.*}} = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_allocatable_string_arrayEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[C_BOX_REF:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>> {bindc_name = "c", uniq_name = "_QFtest_allocatable_string_arrayEc"}
 !CHECK: %[[C_BOX:.*]] = fir.embox %{{.*}}(%{{.*}}) typeparams %{{.*}} : (!fir.heap<!fir.array<?x!fir.char<1,?>>>, !fir.shape<1>, i32) -> !fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>
 !CHECK: fir.store %[[C_BOX]] to %[[C_BOX_REF]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 9af18378f0ae0..6ff7f96b2b9bf 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -54,7 +54,7 @@
 ! CHECK-LABEL: func.func @_QPs(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsEi"}
 ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
index 55eceaab06206..5ff2947c6ac95 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
@@ -5,7 +5,7 @@
 ! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_firstprivate(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -33,8 +33,8 @@ end subroutine omp_do_firstprivate
 ! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
 subroutine omp_do_firstprivate2(a, n)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
- ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
index faf8f717f6308..b3d52a04a3ae6 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
@@ -5,7 +5,7 @@
 ! CHECK: func @_QPomp_do_lastprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -50,8 +50,8 @@ end subroutine omp_do_lastprivate
 ! CHECK: func @_QPomp_do_lastprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
 subroutine omp_do_lastprivate2(a, n)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
- ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate2En"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 integer::n
 n = a+1
@@ -96,7 +96,7 @@ end subroutine omp_do_lastprivate2
 ! CHECK: func @_QPomp_do_lastprivate_collapse2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate_collapse2(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse2Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 !$omp parallel do lastprivate(a) collapse(2)
 ! CHECK: omp.parallel {
@@ -158,7 +158,7 @@ end subroutine omp_do_lastprivate_collapse2
 ! CHECK: func @_QPomp_do_lastprivate_collapse3(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
 subroutine omp_do_lastprivate_collapse3(a)
- ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_do_lastprivate_collapse3Ea"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer::a
 !$omp parallel do lastprivate(a) collapse(3)
 ! CHECK: omp.parallel {
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90
index 15a68e2c0e65b..5a9be40569f06 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90
@@ -27,8 +27,8 @@ subroutine simple_parallel_do
 ! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_parallel_clauses(cond, nt)
- ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 logical :: cond
 integer :: nt
 integer :: i
@@ -56,7 +56,7 @@ subroutine parallel_do_with_parallel_clauses(cond, nt)
 ! CHECK-LABEL: func @_QPparallel_do_with_clauses
 ! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_clauses(nt)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: nt
 integer :: i
 ! CHECK: %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref<i32>
@@ -86,8 +86,8 @@ subroutine parallel_do_with_clauses(nt)
 ! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
 ! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
 subroutine parallel_do_with_privatisation_clauses(cond,nt)
- ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
- ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 logical :: cond
 integer :: nt
 integer :: i
@@ -139,7 +139,7 @@ end subroutine parallel_private_do
 ! CHECK-LABEL: func.func @_QPparallel_private_do(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@{{.*}} %{{.*}}#0 -> %[[COND_ADDR:.*]], @{{.*firstprivate.*}} %{{.*}}#0 -> %[[NT_PRIV_ADDR:.*]] : {{.*}}) {
 ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_ADDR]] {uniq_name = "_QFparallel_private_doEcond"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
@@ -182,8 +182,8 @@ end subroutine omp_parallel_multiple_firstprivate_do
 ! CHECK-LABEL: func.func @_QPomp_parallel_multiple_firstprivate_do(
 ! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@{{.*firstprivate.*}} %{{.*}}#0 -> %[[A_PRIV_ADDR:.*]], @{{.*firstprivate.*}} %{{.*}}#0 -> %[[B_PRIV_ADDR:.*]] : {{.*}}) {
 ! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
@@ -229,7 +229,7 @@ end subroutine parallel_do_private
 ! CHECK-LABEL: func.func @_QPparallel_do_private(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
 ! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
@@ -270,8 +270,8 @@ end subroutine omp_parallel_do_multiple_firstprivate
 ! CHECK-LABEL: func.func @_QPomp_parallel_do_multiple_firstprivate(
 ! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
 ! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
index 8b94d51f986f5..bd91fa51a6988 100644
--- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
+++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -65,8 +65,8 @@ subroutine max_array_reduction(l, r)
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "r"}) {
 ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFmax_array_reductionEr"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFmax_array_reductionEl"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFmax_array_reductionEr"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: fir.store %[[VAL_3]]#0 to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
 ! CHECK: omp.parallel reduction(byref @max_byref_box_Uxi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
 ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmax_array_reductionEl"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> (!fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.array<?xi32>>>)
diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90
index 2f2808cebfc0c..1d286008a11f3 100644
--- a/flang/test/Lower/OpenMP/sections-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90
@@ -31,7 +31,7 @@ subroutine sectionsReduction(x)
 ! CHECK-LABEL: func.func @_QPsectionsreduction(
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsectionsreductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
 ! CHECK: fir.store %[[VAL_2]]#0 to %[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
diff --git a/flang/test/Lower/OpenMP/sections-reduction.f90 b/flang/test/Lower/OpenMP/sections-reduction.f90
index 27da965c2ca16..09fd1c2a8abe0 100644
--- a/flang/test/Lower/OpenMP/sections-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-reduction.f90
@@ -37,8 +37,8 @@ subroutine sectionsReduction(x,y)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsectionsreductionEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsectionsreductionEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsectionsreductionEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 ! CHECK: omp.parallel {
 ! CHECK: omp.sections reduction(@add_reduction_f32 %[[VAL_3]]#0 -> %[[VAL_5:.*]], @add_reduction_f32 %[[VAL_4]]#0 -> %[[VAL_6:.*]] : !fir.ref<f32>, !fir.ref<f32>) {
 ! CHECK: omp.section {
diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90
index b77c46ed054f2..59827713b6240 100644
--- a/flang/test/Lower/OpenMP/sections.f90
+++ b/flang/test/Lower/OpenMP/sections.f90
@@ -79,7 +79,7 @@ program sample
 end program sample
 !CHECK: func @_QPfirstprivate(%[[ARG:.*]]: !fir.ref<f32> {fir.bindc_name = "alpha"}) {
-!CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
 !CHECK: %[[PRIVATE_ALPHA_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_ALPHA]] {uniq_name = "_QFfirstprivateEalpha"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 !CHECK: %[[TEMP:.*]] = fir.load %[[ARG_DECL]]#0 : !fir.ref<f32>
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index 369b5eb072af9..99654d6f1f45e 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -26,7 +26,7 @@ subroutine simd
 !CHECK-LABEL: func @_QPsimd_with_if_clause
 subroutine simd_with_if_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD IF( n .GE. threshold )
 ! CHECK: %[[COND:.*]] = arith.cmpi sge
@@ -46,7 +46,7 @@ subroutine simd_with_if_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause
 subroutine simd_with_simdlen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SIMDLEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -65,7 +65,7 @@ subroutine simd_with_simdlen_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param
 subroutine simd_with_simdlen_clause_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: simdlen = 2;
 !$OMP SIMD SIMDLEN(simdlen)
@@ -85,7 +85,7 @@ subroutine simd_with_simdlen_clause_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param
 subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: simdlen = 2;
 !$OMP SIMD SIMDLEN(simdlen*2 + 2)
@@ -105,7 +105,7 @@ subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause
 subroutine simd_with_safelen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SAFELEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
@@ -124,7 +124,7 @@ subroutine simd_with_safelen_clause(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param
 subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 integer, parameter :: safelen = 2;
 !$OMP SIMD SAFELEN(safelen*2 + 2)
@@ -144,7 +144,7 @@ subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
 !CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause
 subroutine simd_with_simdlen_safelen_clause(n, threshold)
- ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 integer :: i, n, threshold
 !$OMP SIMD SIMDLEN(1) SAFELEN(2)
 ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
diff --git a/flang/test/Lower/OpenMP/single.f90 b/flang/test/Lower/OpenMP/single.f90
index 59b76e398612c..45a0318d2892a 100644
--- a/flang/test/Lower/OpenMP/single.f90
+++ b/flang/test/Lower/OpenMP/single.f90
@@ -11,7 +11,7 @@
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single(x)
 integer, intent(inout) :: x
- !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_singleEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel
 !$omp parallel
 !CHECK: omp.single
@@ -34,7 +34,7 @@ end subroutine omp_single
 !CHECK-SAME: (%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
 subroutine omp_single_nowait(x)
 integer, intent(inout) :: x
- !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFomp_single_nowaitEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: omp.parallel
 !$omp parallel
 !CHECK: omp.single nowait
@@ -76,8 +76,8 @@ end subroutine single_allocate
 ! CHECK-LABEL: func.func @_QPsingle_privatization(
 ! CHECK-SAME: %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatizationEy"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK: omp.single {
 ! CHECK: %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatizationEx"}
 ! CHECK: %[[X_PVT_DECL:.*]]:2 = hlfir.declare %[[X_PVT]] {uniq_name = "_QFsingle_privatizationEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
@@ -103,8 +103,8 @@ subroutine single_privatization(x, y)
 ! CHECK-LABEL: func.func @_QPsingle_privatization2(
 ! CHECK-SAME: %[[X:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
 ! CHECK-SAME: %[[Y:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsingle_privatization2Ey"} : (!fir.ref<f64>, !fir.dscope) -> (!fir.ref<f64>, !fir.ref<f64>)
 ! CHECK: omp.parallel {
 ! CHECK: omp.single {
 ! CHECK: %[[X_PVT:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatization2Ex"}
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 26bd62edf9d0c..ad1dd7044fc81 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -465,7 +465,7 @@ end subroutine omp_target_implicit_nested
 !CHECK: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
 subroutine omp_target_implicit_bounds(n)
 !CHECK: %[[VAL_COPY:.*]] = fir.alloca i32
- !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+ !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFomp_target_implicit_boundsEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
 !CHECK: fir.store %[[VAL_2]] to %[[VAL_COPY]] : !fir.ref<i32>
 !CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64
diff --git a/flang/test/Lower/OpenMP/task-depend-array-section.f90 b/flang/test/Lower/OpenMP/task-depend-array-section.f90
index 4033b8ed1abf3..ab2bfd7f1e81e 100644
--- a/flang/test/Lower/OpenMP/task-depend-array-section.f90
+++ b/flang/test/Lower/OpenMP/task-depend-array-section.f90
@@ -12,7 +12,7 @@ subroutine knownShape(array)
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
 ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFknownshapeEarray"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFknownshapeEarray"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_6:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index
@@ -36,7 +36,7 @@ subroutine assumedShape(array)
 ! CHECK-LABEL: func.func @_QPassumedshape(
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"}) {
 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFassumedshapeEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFassumedshapeEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_3:.*]] = arith.constant 2 : index
 ! CHECK: %[[VAL_4:.*]] = arith.constant 8 : index
 ! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index
@@ -61,8 +61,8 @@ subroutine vectorSubscriptArraySection(array, indices)
 ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"},
 ! CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "indices"}) {
 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFvectorsubscriptarraysectionEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFvectorsubscriptarraysectionEindices"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFvectorsubscriptarraysectionEarray"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFvectorsubscriptarraysectionEindices"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_4]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/OpenMP/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/taskloop-collapse.f90
new file mode 100644
index 0000000000000..48243640d07b9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskloop-collapse.f90
@@ -0,0 +1,34 @@
+! Test the collapse clause when being used with the taskloop construct
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=45 %s -o - 2>&1 | FileCheck %s
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[J_PRIVATE:.*]] : i32
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE:.*]] : i32
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = firstprivate} @[[SUM_FIRSTPRIVATE:.*]] : i32 copy
+
+! CHECK-LABEL: func.func @_QPtest()
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtestEi"}
+! CHECK: %[[DECLARE_I:.*]]:2 = hlfir.declare %1 {uniq_name = "_QFtestEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_J:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFtestEj"}
+! CHECK: %[[DECLARE_J:.*]]:2 = hlfir.declare %3 {uniq_name = "_QFtestEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_SUM:.*]] = fir.alloca i32 {bindc_name = "sum", uniq_name = "_QFtestEsum"}
+! CHECK: %[[DECLARE_SUM:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFtestEsum"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+subroutine test()
+ integer :: i, j, sum
+
+ !$omp taskloop collapse(2)
+ ! CHECK-LABEL: omp.taskloop
+ ! CHECK-SAME: private(@_QFtestEsum_firstprivate_i32 %[[DECLARE_SUM]]#0 -> %arg0, @_QFtestEi_private_i32 %[[DECLARE_I]]#0 -> %arg1, @_QFtestEj_private_i32 %[[DECLARE_J]]#0 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK-LABEL: omp.loop_nest
+ ! CHECK-SAME: (%arg3, %arg4) : i32 = (%c1_i32, %c1_i32_1) to (%c10_i32, %c5_i32) inclusive step (%c1_i32_0, %c1_i32_2) collapse(2)
+ do i = 1, 10
+ do j = 1, 5
+ sum = sum + i + j
+ end do
+ end do
+ !$omp end taskloop
+end subroutine
diff --git a/flang/test/Lower/OpenMP/taskloop.f90 b/flang/test/Lower/OpenMP/taskloop.f90
index 79b0c20e176c0..d23eef2d4ac2d 100644
--- a/flang/test/Lower/OpenMP/taskloop.f90
+++ b/flang/test/Lower/OpenMP/taskloop.f90
@@ -1,5 +1,36 @@
-! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! REQUIRES: openmp_runtime
+! RUN: bbc -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[LAST_PRIVATE_I:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[LAST_PRIVATE_X:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFOMP_TASKLOOP_NOGROUPEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[OMP_TASKLOOP_UNTIEDEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFTEST_PRIORITYEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[QFTEST_MERGEABLEEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_IF_TEST1:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_FINAL:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST_ALLOCATE:.*]] : i32
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME: {type = private} @[[X_PRIVATE_TEST_ALLOCATE:.*]] : i32
 
 ! CHECK-LABEL: omp.private
 ! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST2:.*]] : i32
@@ -70,3 +101,149 @@ subroutine omp_taskloop_private
 ! CHECK: }
 !$omp end taskloop
 end subroutine omp_taskloop_private
+
+!===============================================================================
+! `allocate` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtaskloop_allocate
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_allocateEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_allocateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtaskloop_allocateEx"}
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFtaskloop_allocateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_allocate()
+ use omp_lib
+ integer :: x
+ ! CHECK: omp.taskloop allocate(%{{.*}} : i64 -> %[[DECL_X]]#0 : !fir.ref<i32>)
+ ! CHECK-SAME: private(@[[X_PRIVATE_TEST_ALLOCATE]] %[[DECL_X]]#0 -> %[[ARG0:.*]], @[[I_PRIVATE_TEST_ALLOCATE]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+ !$omp taskloop allocate(omp_high_bw_mem_alloc: x) private(x)
+ do i = 1, 100
+ ! CHECK: arith.addi
+ x = x + 12
+ ! CHECK: omp.yield
+ end do
+ !$omp end taskloop
+end subroutine taskloop_allocate
+
+!===============================================================================
+! `final` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtaskloop_final
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_finalEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_finalEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_final()
+ ! CHECK: omp.taskloop final(%true) private(@[[I_PRIVATE_FINAL]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+ !$omp taskloop final(.true.)
+ do i = 1, 100
+ ! CHECK: fir.call @_QPfoo()
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `if` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_if
+! CHECK: %[[DECL_BAR:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_ifEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[DECL_BAR]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK: %[[VAL_BAR:.*]] = fir.convert %[[LOAD_VAL]] : (!fir.logical<4>) -> i1
+subroutine omp_taskloop_if(bar)
+ logical, intent(inout) :: bar
+ !CHECK: omp.taskloop if(%[[VAL_BAR]]) private(@[[I_PRIVATE_IF_TEST1]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) {
+ !$omp taskloop if(bar)
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine omp_taskloop_if
+
+!===============================================================================
+! `mergeable` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtest_mergeable
+subroutine test_mergeable
+ ! CHECK: omp.taskloop mergeable
+ !$omp taskloop mergeable
+ do i = 1, 10
+ end do
+ !$omp end taskloop
+end subroutine test_mergeable
+
+!===============================================================================
+! `priority` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPtest_priority
+! CHECK: %[[VAL1:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[VAL1]]#0 : !fir.ref<i32>
+subroutine test_priority(n)
+ integer, intent(inout) :: n
+ ! CHECK: omp.taskloop priority(%[[LOAD_VAL]] : i32)
+ !$omp taskloop priority(n)
+ do i = 1, 10
+ end do
+ !$omp end taskloop
+end subroutine test_priority
+
+!===============================================================================
+! `untied` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_untied
+subroutine omp_taskloop_untied()
+ ! CHECK: omp.taskloop untied
+ !$omp taskloop untied
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `nogroup` clause
+!===============================================================================
+
+subroutine omp_taskloop_nogroup()
+ ! CHECK: omp.taskloop nogroup
+ !$omp taskloop nogroup
+ do i = 1, 10
+ call foo()
+ end do
+ !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `lastprivate` clause
+!===============================================================================
+
+! CHECK-LABEL: func.func @_QPomp_taskloop_lastprivate
+! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_lastprivateEi"}
+! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_lastprivateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_lastprivateEx"}
+! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine omp_taskloop_lastprivate()
+ integer x
+ x = 0
+ ! CHECK: omp.taskloop private(@[[LAST_PRIVATE_X]] %[[DECL_X]]#0 -> %[[ARG0]], @[[LAST_PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1]] : !fir.ref<i32>, !fir.ref<i32>) {
+ !$omp taskloop lastprivate(x)
+ do i = 1, 100
+ ! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref<i32>
+ ! CHECK: %[[RES_ADD:.*]] = arith.addi %[[LOAD_ARG0]], %{{.*}} : i32
+ ! CHECK: hlfir.assign %[[RES_ADD]] to %[[DECL_ARG0]]#0 : i32, !fir.ref<i32>
+ x = x + 1
+ ! CHECK: %[[SELCT_RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : i1
+ ! CHECK: fir.if %[[SELCT_RESULT]] {
+ ! CHECK: %[[LOADED_SUM:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref<i32>
+ ! CHECK: hlfir.assign %[[LOADED_SUM]] to %[[DECL_X]]#0 : i32, !fir.ref<i32>
+ ! CHECK: }
+ ! CHECK: omp.yield
+ end do
+ !$omp end taskloop
+end subroutine omp_taskloop_lastprivate
diff --git a/flang/test/Lower/OpenMP/tile01.f90 b/flang/test/Lower/OpenMP/tile01.f90
index 7603eee4b18d8..fc5a485b305d5 100644
--- a/flang/test/Lower/OpenMP/tile01.f90
+++ b/flang/test/Lower/OpenMP/tile01.f90
@@ -20,11 +20,11 @@ end subroutine omp_tile01
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_tile01Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_tile01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_tile01Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_tile01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = arith.constant 4 : i32
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/tile02.f90 b/flang/test/Lower/OpenMP/tile02.f90
index 5df506d17ed05..266c310450587 100644
--- a/flang/test/Lower/OpenMP/tile02.f90
+++ b/flang/test/Lower/OpenMP/tile02.f90
@@ -22,13 +22,13 @@ end subroutine omp_tile02
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_tile02Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_tile02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_tile02Ej"}
 ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFomp_tile02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_tile02Eres"}
 ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFomp_tile02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_tile02Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_tile02Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = arith.constant 3 : i32
 ! CHECK: %[[VAL_11:.*]] = arith.constant 7 : i32
 ! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
index 34020eb727e55..3ec96a9f0dab2 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic01.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic01.f90
@@ -20,11 +20,11 @@ end subroutine omp_unroll_heuristic01
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic01Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic01Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic01Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic02.f90 b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
index fdb1366960b23..20b5c50455295 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic02.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic02.f90
@@ -27,14 +27,14 @@ end subroutine omp_unroll_heuristic_nested02
 !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic_nested02Ei"}
 !CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_unroll_heuristic_nested02Ej"}
 !CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_inc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_lb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic_nested02Eres"}
 !CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/unroll-heuristic03.f90 b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
index 308c149c260dc..c0aead2a5be5d 100644
--- a/flang/test/Lower/OpenMP/unroll-heuristic03.f90
+++ b/flang/test/Lower/OpenMP/unroll-heuristic03.f90
@@ -23,11 +23,11 @@ end subroutine omp_unroll_heuristic03
 ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
 ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic03Ei"}
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Einc"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Elb"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic03Eres"}
 ! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFomp_unroll_heuristic03Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_unroll_heuristic03Eub"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: omp.parallel private(@_QFomp_unroll_heuristic03Ei_private_i32 %[[VAL_2]]#0 -> %[[VAL_8:.*]] : !fir.ref<i32>) {
 ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFomp_unroll_heuristic03Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 209ee9a4e0cef..7184b3b102fd8 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -77,7 +77,7 @@ subroutine reduce(r)
 ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "r"}) attributes {{.*}} {
 ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFFreduceEi) : !fir.ref<i32>
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
 ! CHECK: omp.parallel {
 ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
 ! CHECK: fir.store %[[VAL_3]]#0 to %[[VAL_4]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
index 501dd04850107..634f07e121f90 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
@@ -28,7 +28,7 @@
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
 ! CHECK: omp.parallel {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
index 8243c73f6b8e5..b9b911cde7418 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand.f90
@@ -20,7 +20,7 @@
 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 index 84814b8eb97a7..795e41704172e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 @@ -24,7 +24,7 @@ !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 index c474f6d2979d4..5482887d85e22 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor.f90 @@ -13,7 +13,7 @@ !CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) !CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 index 550945d719214..83aa396b496f9 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 index 8de6eb7f9ac26..14a5997ead104 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior.f90 @@ -20,7 +20,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 index cf5c1b127c34d..fc73fb58d8fd4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -82,7 +82,7 @@ end subroutine simple_reduction ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! 
CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -127,7 +127,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 index eff97aa5d0db0..2cf45938ffb5a 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! 
CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -74,7 +74,7 @@ end subroutine simple_reduction ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -119,7 +119,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 index bbe4bf04b6827..d2f27badc3302 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -81,7 +81,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -126,7 +126,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! 
CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 index 304c7eefd58b0..7b9b6b847be7d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -73,7 +73,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -118,7 +118,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 index 8c869952547c6..94a24bdeabbff 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 @@ -34,7 +34,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -82,7 +82,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -129,7 +129,7 @@ subroutine simple_reduction_switch_order(y) ! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 index e0901bec6bc9c..3f7f97f29afb4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -74,7 +74,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -121,7 +121,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 index bb09e2563e378..4c1496b1f083b 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 @@ -33,7 +33,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -80,7 +80,7 @@ subroutine simple_reduction(y) ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -125,7 +125,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 index 5c9bcbf584d48..6d457f490b9bc 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or.f90 @@ -26,7 +26,7 @@ ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -73,7 +73,7 @@ subroutine simple_reduction(y) ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_5:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant true ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> ! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>> @@ -118,7 +118,7 @@ subroutine simple_reduction_switch_order(y) ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>) ! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"} ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>) ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 69219331ab3ab..73a8885fa3124 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -41,7 +41,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! 
CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -70,7 +70,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 83582d279fd3d..cebd779dee61f 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -31,7 +31,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -60,7 +60,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index f691d57e276df..1a6bc37168557 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -41,7 +41,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32> ! CHECK: omp.parallel { @@ -70,7 +70,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"} ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 ! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32> ! CHECK: omp.parallel { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index 3ee2ecc50e19a..b3a899d3b70ba 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -31,7 +31,7 @@ ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"} ! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
! CHECK: omp.parallel {
@@ -60,7 +60,7 @@
! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
! CHECK: omp.parallel {
diff --git a/flang/test/Lower/allocatable-assignment.f90 b/flang/test/Lower/allocatable-assignment.f90
index 71385aa7761b0..3c220232104a5 100644
--- a/flang/test/Lower/allocatable-assignment.f90
+++ b/flang/test/Lower/allocatable-assignment.f90
@@ -18,7 +18,7 @@ subroutine test_simple_scalar(x)
! CHECK-LABEL: func.func @_QMalloc_assignPtest_simple_scalar(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_simple_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_simple_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 4.200000e+01 : f32
! CHECK: hlfir.assign %[[VAL_3]] to %[[VAL_2]]#0 realloc : f32, !fir.ref<!fir.box<!fir.heap<f32>>>
@@ -47,7 +47,7 @@ subroutine test_deferred_char_scalar(x)
! CHECK-LABEL: func.func @_QMalloc_assignPtest_deferred_char_scalar(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_deferred_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_deferred_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_4:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -61,7 +61,7 @@ subroutine test_cst_char_scalar(x)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,10>>>>)
! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -76,12 +76,12 @@ subroutine test_dyn_char_scalar(x, n)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i32
! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i32
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_10:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -96,8 +96,8 @@ subroutine test_derived_scalar(x, s)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>> {fir.bindc_name = "s"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_derived_scalarEs"} : (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_derived_scalarEs"} : (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>)
! CHECK: hlfir.assign %[[VAL_3]]#0 to %[[VAL_4]]#0 realloc : !fir.ref<!fir.type<_QMalloc_assignTt{i:i32}>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignTt{i:i32}>>>>
! -----------------------------------------------------------------------------
@@ -113,11 +113,11 @@ subroutine test_from_cst_shape_array(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.array<2x3xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index
! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_4]], %[[VAL_5]] : (index, index) -> !fir.shape<2>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEy"} : (!fir.ref<!fir.array<2x3xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.array<2x3xf32>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_from_cst_shape_arrayEy"} : (!fir.ref<!fir.array<2x3xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.array<2x3xf32>>)
! CHECK: hlfir.assign %[[VAL_7]]#0 to %[[VAL_3]]#0 realloc : !fir.ref<!fir.array<2x3xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_from_dyn_shape_array(x, y)
@@ -129,8 +129,8 @@ subroutine test_from_dyn_shape_array(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_from_dyn_shape_arrayEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 realloc : !fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_with_lbounds(x, y)
@@ -142,13 +142,13 @@ subroutine test_with_lbounds(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_with_lboundsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_with_lboundsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : i64
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : i64
! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> index
! CHECK: %[[VAL_8:.*]] = fir.shift %[[VAL_5]], %[[VAL_7]] : (index, index) -> !fir.shift<2>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_with_lboundsEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_with_lboundsEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.box<!fir.array<?x?xf32>>)
! CHECK: hlfir.assign %[[VAL_9]]#0 to %[[VAL_3]]#0 realloc : !fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
subroutine test_runtime_shape(x)
@@ -164,7 +164,7 @@ function return_pointer()
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>> {bindc_name = ".result"}
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_runtime_shapeEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_runtime_shapeEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>)
! CHECK: %[[VAL_4:.*]] = fir.call @_QPreturn_pointer() fastmath<contract> : () -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
! CHECK: fir.save_result %[[VAL_4]] to %[[VAL_1]] : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
@@ -180,8 +180,8 @@ subroutine test_scalar_rhs(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_scalar_rhsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_scalar_rhsEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_scalar_rhsEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_scalar_rhsEy"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<f32>
! CHECK: hlfir.assign %[[VAL_5]] to %[[VAL_3]]#0 realloc : f32, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
@@ -205,7 +205,7 @@ subroutine test_cst_char_rhs_scalar(x)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
! CHECK: %[[VAL_4:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -221,12 +221,12 @@ subroutine test_dyn_char_rhs_scalar(x, n)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i32
! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i32
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_char_rhs_scalarEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX48656C6C6F20776F726C6421) : !fir.ref<!fir.char<1,12>>
! CHECK: %[[VAL_10:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX48656C6C6F20776F726C6421"} : (!fir.ref<!fir.char<1,12>>, index) -> (!fir.ref<!fir.char<1,12>>, !fir.ref<!fir.char<1,12>>)
@@ -253,9 +253,9 @@ subroutine test_cst_char(x, c)
! CHECK: %[[VAL_5:.*]] = arith.constant 12 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_cst_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.array<20x!fir.char<1,12>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_cst_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.array<20x!fir.char<1,12>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_cst_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>)
! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_10]]#0 realloc keep_lhs_len : !fir.ref<!fir.array<20x!fir.char<1,12>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,10>>>>>
subroutine test_dyn_char(x, n, c)
@@ -273,13 +273,13 @@ subroutine test_dyn_char(x, n, c)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<20x!fir.char<1,?>>>
! CHECK: %[[VAL_6:.*]] = arith.constant 20 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_dyn_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.array<20x!fir.char<1,?>>>)
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_dyn_charEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_charEc"} : (!fir.ref<!fir.array<20x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.array<20x!fir.char<1,?>>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_dyn_charEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_11]] : i32
! CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_10]], %[[VAL_11]] : i32
-! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_13]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_13]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_dyn_charEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, i32, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>)
! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_14]]#0 realloc keep_lhs_len : !fir.box<!fir.array<20x!fir.char<1,?>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.char<1,?>>>>>
subroutine test_derived_with_init(x, y)
@@ -297,8 +297,8 @@ subroutine test_derived_with_init(x, y)
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> {fir.bindc_name = "y"}) {
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_with_initEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMalloc_assignFtest_derived_with_initEy"} : (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_derived_with_initEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_derived_with_initEy"} : (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.dscope) -> (!fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>)
! CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_9]]#0 realloc : !fir.ref<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.box<!fir.heap<!fir.type<_QMalloc_assignFtest_derived_with_initTt{a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>>>
subroutine test_vector_subscript(x, y, v)
@@ -314,9 +314,9 @@ subroutine test_vector_subscript(x, y, v)
! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"},
! CHECK-SAME: %[[VAL_2:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "v"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_vector_subscriptEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_vector_subscriptEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] {uniq_name = "_QMalloc_assignFtest_vector_subscriptEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_vector_subscriptEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_vector_subscriptEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMalloc_assignFtest_vector_subscriptEy"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
! CHECK: %[[VAL_16:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
! CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_5]]#0 realloc : !hlfir.expr<?xi32>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -332,7 +332,7 @@ end function elt
! CHECK-LABEL: func.func @_QMalloc_assignPtest_both_sides_with_elemental_call(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_both_sides_with_elemental_callEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMalloc_assignFtest_both_sides_with_elemental_callEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index e6a8c5e025123..27cdf2839767d 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -520,8 +520,8 @@ subroutine test_allocatable_up_from_up_mold(a, b)
! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_up_mold(
! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<none>>> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref<!fir.class<!fir.ptr<none>>> {fir.bindc_name = "b"}) {
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
-! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEa"} : (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<none>>>, !fir.ref<!fir.class<!fir.heap<none>>>)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_allocatable_up_from_up_moldEb"} : (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<none>>>, !fir.ref<!fir.class<!fir.ptr<none>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<none>>>
! CHECK: %[[RANK:.*]] = arith.constant 0 : i32
! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
@@ -539,7 +539,7 @@ subroutine test_allocatable_up_from_mold_rank(a)
! CHECK-LABEL: func.func @_QMpolyPtest_allocatable_up_from_mold_rank(
! CHECK-SAME: %[[A:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>> {fir.bindc_name = "a"}) {
! CHECK: %[[VALUE_10:.*]] = fir.alloca i32
-! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMpolyFtest_allocatable_up_from_mold_rankEa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>>>)
! CHECK: %[[C10:.*]] = arith.constant 10 : i32
! CHECK: fir.store %[[C10]] to %[[VALUE_10]] : !fir.ref<i32>
! CHECK: %[[EMBOX_10:.*]] = fir.embox %[[VALUE_10]] : (!fir.ref<i32>) -> !fir.box<i32>
diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90
index e62f92fa0c1c7..60b7de3301c48 100644
--- a/flang/test/Lower/allocatables.f90
+++ b/flang/test/Lower/allocatables.f90
@@ -56,7 +56,7 @@ subroutine foodim1()
! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
deallocate(x)
- ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
+ ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.heap<!fir.array<?xf32>>>
! CHECK: fir.freemem %[[xAddr1]]
! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>>
! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
@@ -67,10 +67,6 @@ subroutine foodim2()
! Test lowering of local allocatable specification
real, allocatable :: x(:, :)
! CHECK-DAG: fir.alloca !fir.heap<!fir.array<?x?xf32>> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"}
- ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"}
end subroutine
! test lowering of character allocatables. Focus is placed on the length handling
diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90
index e2899d967c80d..85f5af0492c3b 100644
--- a/flang/test/Lower/array-character.f90
+++ b/flang/test/Lower/array-character.f90
@@ -15,12 +15,12 @@ subroutine issue(c1, c2)
! CHECK: %[[VAL_5:.*]] = arith.constant 4 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QFissueEc1"} : (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFissueEc1"} : (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<3x!fir.char<1,4>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>)
! CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<3x!fir.char<1,?>>>
! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_12]]) typeparams %[[VAL_9]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFissueEc2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_12]]) typeparams %[[VAL_9]]#1 dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFissueEc2"} : (!fir.ref<!fir.array<3x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,?>>>)
! CHECK: hlfir.assign %[[VAL_13]]#0 to %[[VAL_8]]#0 : !fir.box<!fir.array<3x!fir.char<1,?>>>, !fir.ref<!fir.array<3x!fir.char<1,4>>>
program p
diff --git a/flang/test/Lower/array-elemental-calls-char-byval.f90 b/flang/test/Lower/array-elemental-calls-char-byval.f90
index 04a437513432f..8fb3e7fc5396a 100644
--- a/flang/test/Lower/array-elemental-calls-char-byval.f90
+++ b/flang/test/Lower/array-elemental-calls-char-byval.f90
@@ -27,13 +27,13 @@ subroutine foo1(i, j, c)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_7]]) typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo1Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo1Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_15:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_16:.*]]: index):
! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_16]]) typeparams %[[VAL_4]]#1 : (!fir.box<!fir.array<10x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -60,13 +60,13 @@ subroutine foo2(i, j, c)
! CHECK-SAME: %[[VAL_2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_10]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.as_expr %[[VAL_5]]#0 : (!fir.boxchar<1>) -> !hlfir.expr<!fir.char<1,?>>
@@ -92,10 +92,10 @@ subroutine foo3(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi64> {
! CHECK: ^bb0(%[[VAL_11:.*]]: index):
! CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_11]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -143,10 +143,10 @@ subroutine foo4(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elem_byvalFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_10]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
@@ -186,10 +186,10 @@ subroutine foo5(i, j)
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elem_byvalFfoo5Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo5Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elem_byvalFfoo5Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elem_byvalFfoo5Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>>
! CHECK: %[[VAL_10:.*]] = arith.constant 5 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
diff --git a/flang/test/Lower/array-elemental-calls-char-dynamic.f90 b/flang/test/Lower/array-elemental-calls-char-dynamic.f90
index 9671669b08c9a..24b798904d82d 100644
--- a/flang/test/Lower/array-elemental-calls-char-dynamic.f90
+++ b/flang/test/Lower/array-elemental-calls-char-dynamic.f90
@@ -19,8 +19,8 @@ elemental function bug_145151_1(c_dummy)
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"},
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xi64>> {fir.bindc_name = "vector_subscript"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_vector_subscripted_argEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_vector_subscripted_argEvector_subscript"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_vector_subscripted_argEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_vector_subscripted_argEvector_subscript"} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_1]]#1 : (!fir.box<!fir.array<?x!fir.char<1,?>>>) -> index
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_4]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
@@ -80,8 +80,8 @@ elemental function bug_145151_2(x)
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"},
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_module_variableEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_module_variableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_module_variableEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_module_variableEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_3]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
@@ -129,9 +129,9 @@ elemental function f_opt(x, opt)
! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[ARG2:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "opt", fir.optional}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_presentEopt"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_presentEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_presentEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_presentEopt"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_presentEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_presentEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
! CHECK: %[[VAL_4:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.box<!fir.array<?xf32>>) -> i1
! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_3]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
@@ -197,9 +197,9 @@ elemental function f_poly(p1, p2)
! CHECK-SAME: %[[ARG1:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>> {fir.bindc_name = "p1"},
! CHECK-SAME: %[[ARG2:.*]]: !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>> {fir.bindc_name = "p2"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp1"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
-! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp2"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_polymorphicEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp1"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_polymorphicEp2"} : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, !fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_polymorphicEres"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFtest_polymorphicTt>>>, index) -> (index, index, index)
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
@@ -258,7 +258,7 @@ elemental function f_value(c_dummy)
! CHECK-LABEL: func.func @_QPtest_value(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.char<1,?>>> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFtest_valueEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {uniq_name = "_QFtest_valueEc"} : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.box<!fir.array<?x!fir.char<1,?>>>)
! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]]#0, %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, index) -> (index, index, index)
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/array-elemental-calls-char.f90 b/flang/test/Lower/array-elemental-calls-char.f90
index a75b335ba5767..f99bce4cfec81 100644
--- a/flang/test/Lower/array-elemental-calls-char.f90
+++ b/flang/test/Lower/array-elemental-calls-char.f90
@@ -31,10 +31,10 @@ subroutine foo1(i, c)
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]]) typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_6]]) typeparams %[[VAL_3]]#1 dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>)
! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_6]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_12:.*]]: index):
! CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_12]]) typeparams %[[VAL_3]]#1 : (!fir.box<!fir.array<10x!fir.char<1,?>>>, index, index) -> !fir.boxchar<1>
@@ -60,10 +60,10 @@ subroutine foo1b(i, c)
! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1bEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_7]]) typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1bEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo1bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo1bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_13]]) typeparams %[[VAL_5]] : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, index, index) -> !fir.ref<!fir.char<1,10>>
@@ -87,13 +87,13 @@ subroutine foo2(i, j, c)
! CHECK-SAME: %[[VAL_2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ec"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_10]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_12:.*]] = hlfir.elemental %[[VAL_10]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_13:.*]]: index):
! CHECK: %[[VAL_14:.*]] = hlfir.designate %[[VAL_11]]#0 (%[[VAL_13]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -118,13 +118,13 @@ subroutine foo2b(i, j, c)
! CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>>
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEc"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>)
! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_11:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_12]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo2bEj"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_12]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo2bEj"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_14:.*]] = hlfir.elemental %[[VAL_12]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
! CHECK: ^bb0(%[[VAL_15:.*]]: index):
! CHECK: %[[VAL_16:.*]] = fir.emboxchar %[[VAL_7]]#0, %[[VAL_6]] : (!fir.ref<!fir.char<1,10>>, index) -> !fir.boxchar<1>
@@ -148,10 +148,10 @@ subroutine foo3(i, j)
! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo3Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] {uniq_name = "_QMchar_elemFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_8]]) dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo3Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr<10xi64> {
! CHECK: ^bb0(%[[VAL_11:.*]]: index):
! CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_11]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
@@ -197,10 +197,10 @@ subroutine foo4(i, j)
! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo4Ei"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QMchar_elemFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo4Ej"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>>
! CHECK: %[[VAL_10:.*]] = arith.constant 5 : index
! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_10]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>)
@@ -239,7 +239,7 @@ subroutine foo6(c)
! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-!
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QMchar_elemFfoo6Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QMchar_elemFfoo6Ec"} : (!fir.ref<!fir.array<10x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<10x!fir.char<1,?>>>, !fir.ref<!fir.array<10x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.convert %c1_i64 : (i64) -> !fir.ref<!fir.char<1,?>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_2]]#1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMchar_elemFelem_return_charEc"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64 diff --git a/flang/test/Lower/box-address.f90 b/flang/test/Lower/box-address.f90 index 04f14188a7bec..d43fb55776972 100644 --- a/flang/test/Lower/box-address.f90 +++ b/flang/test/Lower/box-address.f90 @@ -18,7 +18,7 @@ end module m3 ! CHECK-LABEL: func.func @_QMm3Pchk( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>> {fir.bindc_name = "c1"}) { ! CHECK: %[[DUMMY_SCOPE:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE]] {uniq_name = "_QMm3FdummyEc1"} : (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>) +! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE]] {{.*}} {uniq_name = "_QMm3FdummyEc1"} : (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>, !fir.class<!fir.array<3x!fir.type<_QMm3Tx1{ix1:i32}>>>) subroutine s1 use m3 diff --git a/flang/test/Lower/call-by-value-attr.f90 b/flang/test/Lower/call-by-value-attr.f90 index 14776be0f6d77..9b9076722828e 100644 --- a/flang/test/Lower/call-by-value-attr.f90 +++ b/flang/test/Lower/call-by-value-attr.f90 @@ -45,7 +45,7 @@ end subroutine subri ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 ! CHECK: fir.store %[[VAL_0]] to %[[VAL_2]] : !fir.ref<i32> -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsubriEval"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsubriEval"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: fir.call @_QPtest_numeric_scalar_value(%[[VAL_3]]#0) fastmath<contract> : (!fir.ref<i32>) -> () ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 index 8644a4a3faf7f..b0f94a203c6a3 100644 --- a/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 +++ b/flang/test/Lower/call-character-array-to-polymorphic-pointer.f90 @@ -20,7 +20,7 @@ end subroutine char_explicit_shape_array ! 
CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<100x!fir.char<1,?>>> ! CHECK: %[[VAL_4:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_5]]) typeparams %[[VAL_2]]#1 dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFchar_explicit_shape_arrayEa2"} : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box<!fir.array<100x!fir.char<1,?>>>, !fir.ref<!fir.array<100x!fir.char<1,?>>>) ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_6]]#1(%[[VAL_7]]) typeparams %[[VAL_2]]#1 : (!fir.ref<!fir.array<100x!fir.char<1,?>>>, !fir.shape<1>, index) -> !fir.class<!fir.ptr<!fir.array<?xnone>>> ! CHECK: fir.store %[[VAL_8]] to %[[VAL_0]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>> diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index d5b959eca1ff6..6325229993a25 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -8,6 +8,7 @@ subroutine scalar_cst_len() character(10) :: c ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPscalar_dyn_len @@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len subroutine cst_array_cst_len() character(10) :: c(20) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len @@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len @@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK: func @_QPdyn_array_dyn_len @@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"} + print *, c(1) end subroutine ! 
CHECK-LABEL: func @_QPcst_array_cst_len_lb subroutine cst_array_cst_len_lb() character(10) :: c(11:30) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb @@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb @@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb @@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! Test that the length of assumed length parameter is correctly deduced in lowering. @@ -129,4 +139,5 @@ subroutine assumed_length_param(n) subroutine scalar_cst_neg_len() character(-1) :: c ! CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"} + print *, c end subroutine diff --git a/flang/test/Lower/character-substrings.f90 b/flang/test/Lower/character-substrings.f90 index 38343112c5c47..69e1dc8f918a4 100644 --- a/flang/test/Lower/character-substrings.f90 +++ b/flang/test/Lower/character-substrings.f90 @@ -10,8 +10,8 @@ end subroutine scalar_substring_embox ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i64> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "j"}) { ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFscalar_substring_emboxEi"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFscalar_substring_emboxEj"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFscalar_substring_emboxEi"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFscalar_substring_emboxEj"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>) ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX61626348656C6C6F20576F726C6421646667) : !fir.ref<!fir.char<1,18>> ! CHECK: %[[VAL_6:.*]] = arith.constant 18 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = ".stringlit"} : (!fir.ref<!fir.char<1,18>>, index) -> (!fir.ref<!fir.char<1,18>>, !fir.ref<!fir.char<1,18>>) @@ -53,7 +53,7 @@ end subroutine array_substring_embox ! CHECK: %[[VAL_4:.*]] = arith.constant 7 : index ! 
CHECK: %[[VAL_5:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_emboxEarr"} : (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.ref<!fir.array<4x!fir.char<1,7>>>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_emboxEarr"} : (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<4x!fir.char<1,7>>>, !fir.ref<!fir.array<4x!fir.char<1,7>>>) ! CHECK: %[[VAL_8:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_10:.*]] = arith.constant 4 : index @@ -79,11 +79,11 @@ end subroutine substring_assignment ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_5:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsubstring_assignmentEa"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]] typeparams %[[VAL_5]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsubstring_assignmentEa"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_9:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] {uniq_name = "_QFsubstring_assignmentEb"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFsubstring_assignmentEb"} : (!fir.ref<!fir.char<1,4>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) ! CHECK: %[[VAL_11:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_12:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_13:.*]] = arith.constant 2 : index @@ -109,7 +109,7 @@ end subroutine array_substring_assignment ! CHECK: %[[VAL_4:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_5:.*]] = arith.constant 6 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_assignmentEa"} : (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.ref<!fir.array<6x!fir.char<1,5>>>) +! 
CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_6]]) typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignmentEa"} : (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<6x!fir.char<1,5>>>, !fir.ref<!fir.array<6x!fir.char<1,5>>>) ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQclX424144) : !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_9:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_9]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX424144"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) @@ -138,7 +138,7 @@ end subroutine array_substring_assignment2 ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_8:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFarray_substring_assignment2Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment2Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment2Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_18:.*]] = fir.address_of(@_QQclX6E696365) : !fir.ref<!fir.char<1,4>> ! CHECK: %[[VAL_19:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_18]] typeparams %[[VAL_19]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX6E696365"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>) @@ -164,10 +164,10 @@ end subroutine array_substring_assignment3 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_9:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFarray_substring_assignment3Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_10]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment3Ea"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_12:.*]] = arith.constant 8 : index ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> -! 
CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFarray_substring_assignment3Eb"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_13]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFarray_substring_assignment3Eb"} : (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>, !fir.ref<!fir.array<8x!fir.type<_QFarray_substring_assignment3Tt{ch:!fir.char<1,7>}>>>) ! CHECK: %[[VAL_22:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_23:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_24:.*]] = arith.constant 4 : index diff --git a/flang/test/Lower/charconvert.f90 b/flang/test/Lower/charconvert.f90 index e3f7f66b8476b..bacf9a9626a7d 100644 --- a/flang/test/Lower/charconvert.f90 +++ b/flang/test/Lower/charconvert.f90 @@ -14,17 +14,17 @@ subroutine test_c4_to_c1(c4, c1) ! CHECK: func.func @_QPtest_c1_to_c4(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) { ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c1_to_c4Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.char<4,?>(%[[VAL_0]]#1 : index) ! CHECK: fir.char_convert %[[VAL_1]]#1 for %[[VAL_0]]#1 to %[[VAL_4:.*]] : !fir.ref<!fir.char<1,?>>, index, !fir.ref<!fir.char<4,?>> ! CHECK: func.func @_QPtest_c4_to_c1(%[[ARG0:.*]]: !fir.boxchar<4> {fir.bindc_name = "c4"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}) { ! CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) +! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec1"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) ! CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<4>) -> (!fir.ref<!fir.char<4,?>>, index) -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]]#0 typeparams %[[VAL_2]]#1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFtest_c4_to_c1Ec4"} : (!fir.ref<!fir.char<4,?>>, index, !fir.dscope) -> (!fir.boxchar<4>, !fir.ref<!fir.char<4,?>>) ! CHECK: %[[C4:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_4:.*]] = arith.muli %[[VAL_2]]#1, %[[C4]] : index ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_4]] : index) diff --git a/flang/test/Lower/components.f90 b/flang/test/Lower/components.f90 index f0caddbaaa914..82e1de1833822 100644 --- a/flang/test/Lower/components.f90 +++ b/flang/test/Lower/components.f90 @@ -29,8 +29,8 @@ end subroutine s1 ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QMcomponents_testEinstance) : !fir.ref<!fir.type<_QMcomponents_testTt3 ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QMcomponents_testEinstance"} : (!fir.ref<!fir.type<_QMcomponents_testTt3 -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponents_testFs1Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponents_testFs1Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponents_testFs1Ei"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponents_testFs1Ej"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_9:.*]] = arith.constant 2 : index @@ -130,7 +130,7 @@ subroutine lhs_char_section(a) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFlhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFlhs_char_sectionTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFlhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFlhs_char_sectionTt ! CHECK: %[[VAL_5:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>> ! CHECK: %[[VAL_6:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_6]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX68656C6C6F"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) @@ -154,13 +154,13 @@ subroutine rhs_char_section(a, c) ! 
CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFrhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFrhs_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt ! CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,10>>> ! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_10]]) typeparams %[[VAL_8]] dummy_scope %[[VAL_2]] {uniq_name = "_QFrhs_char_sectionEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_10]]) typeparams %[[VAL_8]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFrhs_char_sectionEc"} : (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>>) ! CHECK: %[[VAL_12:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_5]]#0{"c"} shape %[[VAL_4]] typeparams %[[VAL_12]] : (!fir.ref<!fir.array<10x!fir.type<_QFrhs_char_sectionTt ! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_11]]#0 : !fir.ref<!fir.array<10x!fir.char<1,10>>>, !fir.ref<!fir.array<10x!fir.char<1,10>>> @@ -181,10 +181,10 @@ subroutine elemental_char_section(a, i) ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFelemental_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFelemental_char_sectionEa"} : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt ! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFelemental_char_sectionEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFelemental_char_sectionEi"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_5]]#0{"c"} shape %[[VAL_4]] typeparams %[[VAL_9]] : (!fir.ref<!fir.array<10x!fir.type<_QFelemental_char_sectionTt ! 
CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX68656C6C6F) : !fir.ref<!fir.char<1,5>> diff --git a/flang/test/Lower/derived-assignments.f90 b/flang/test/Lower/derived-assignments.f90 index 5870ea11fc496..3e0950da8cfa7 100644 --- a/flang/test/Lower/derived-assignments.f90 +++ b/flang/test/Lower/derived-assignments.f90 @@ -57,8 +57,8 @@ subroutine t_to_t(a1,b1) end subroutine t_to_t ! CHECK-LABEL: func.func @_QMm2Pt_to_t( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare {{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QMm2Ft_to_tEa1"} : (!fir.ref<!fir.type<_QMm2Tt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm2Ft_to_tEb1"} : (!fir.ref<!fir.type<_QMm2Tt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare {{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_out>, uniq_name = "_QMm2Ft_to_tEa1"} : (!fir.ref<!fir.type<_QMm2Tt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMm2Ft_to_tEb1"} : (!fir.ref<!fir.type<_QMm2Tt ! CHECK: %[[VAL_5:.*]] = hlfir.designate %[[VAL_4]]#0{"b"} : (!fir.ref<!fir.type<_QMm2Tt ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32> ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_3]]#0{"a"} : (!fir.ref<!fir.type<_QMm2Tt @@ -99,8 +99,8 @@ subroutine test_array_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_array_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_array_compEt1"} : (!fir.ref<!fir.type<_QFtest_array_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_array_compEt2"} : (!fir.ref<!fir.type<_QFtest_array_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_array_compEt1"} : (!fir.ref<!fir.type<_QFtest_array_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_array_compEt2"} : (!fir.ref<!fir.type<_QFtest_array_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_array_compTt ! CHECK: return ! CHECK: } @@ -116,8 +116,8 @@ subroutine test_ptr_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_ptr_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_ptr_compEt1"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_ptr_compEt2"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_ptr_compEt1"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_ptr_compEt2"} : (!fir.ref<!fir.type<_QFtest_ptr_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_ptr_compTt ! CHECK: return ! CHECK: } @@ -134,8 +134,8 @@ subroutine test_box_assign(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_box_assign( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! 
CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt2"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_box_assignEt2"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt ! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFtest_box_assignTt @@ -156,8 +156,8 @@ subroutine test_alloc_comp(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QPtest_alloc_comp( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_alloc_compEt1"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_alloc_compEt2"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_alloc_compEt1"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QFtest_alloc_compEt2"} : (!fir.ref<!fir.type<_QFtest_alloc_compTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QFtest_alloc_compTt ! CHECK: return ! CHECK: } @@ -192,8 +192,8 @@ subroutine test(t1, t2) end subroutine ! CHECK-LABEL: func.func @_QMcomponent_with_user_def_assignPtest( ! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponent_with_user_def_assignFtestEt1"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] {uniq_name = "_QMcomponent_with_user_def_assignFtestEt2"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponent_with_user_def_assignFtestEt1"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %[[VAL_2]] arg {{[0-9]+}} {uniq_name = "_QMcomponent_with_user_def_assignFtestEt2"} : (!fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt ! CHECK: hlfir.assign %[[VAL_4]]#0 to %[[VAL_3]]#0 : !fir.ref<!fir.type<_QMcomponent_with_user_def_assignTt ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90 index 4d36a7632b070..7e36ec0cfe93f 100644 --- a/flang/test/Lower/derived-types.f90 +++ b/flang/test/Lower/derived-types.f90 @@ -35,6 +35,8 @@ subroutine local_derived() ! 
CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}> type(r) :: some_r type(c2) :: some_c2 + print *, some_c2%ch_array(1,1) + print *, some_r%x end subroutine ! CHECK-LABEL: func @_QMdPsaved_derived( diff --git a/flang/test/Lower/dispatch.f90 b/flang/test/Lower/dispatch.f90 index 02338065548d4..41ebad7a1f4e3 100644 --- a/flang/test/Lower/dispatch.f90 +++ b/flang/test/Lower/dispatch.f90 @@ -151,7 +151,7 @@ subroutine check_dispatch(p) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch( ! CHECK-SAME: %[[P:.*]]: !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> {fir.bindc_name = "p"}) { -! CHECK: %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) +! CHECK: %[[P_DECL:.*]]:2 = hlfir.declare %[[P]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatchEp"} : (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>, !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) ! CHECK: fir.dispatch "tbp_nopass"(%[[P_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>){{$}} ! CHECK: fir.dispatch "tbp_pass"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} ! CHECK: fir.dispatch "tbp_pass_arg0"(%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[P_DECL]]#0 : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -176,8 +176,8 @@ subroutine check_dispatch_deferred(a, x) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_deferred( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>> {fir.bindc_name = "a"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEa"} : (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.dscope) -> (!fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>, !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_deferredEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) ! CHECK: fir.dispatch "nopassd"(%[[ARG0_DECL]]#1 : !fir.class<!fir.type<_QMcall_dispatchTa1{a:f32,b:f32}>>) (%[[ARG1_DECL]]#0 : !fir.box<!fir.array<?xf32>>) subroutine check_dispatch_scalar_allocatable(p) @@ -187,7 +187,7 @@ subroutine check_dispatch_scalar_allocatable(p) ! 
CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_allocatable( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_allocatableEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.heap<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -199,7 +199,7 @@ subroutine check_dispatch_scalar_pointer(p) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_scalar_pointer( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> {fir.bindc_name = "p"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_scalar_pointerEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>) ! CHECK: %[[LOAD:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[LOAD]] : (!fir.class<!fir.ptr<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[REBOX]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -220,8 +220,8 @@ subroutine check_dispatch_static_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_static_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! 
CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEp"} : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_static_arrayEt"} : (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.ref<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<10x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -248,8 +248,8 @@ subroutine check_dispatch_dynamic_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! 
CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_arrayEt"} : (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.box<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> ! CHECK: fir.dispatch "tbp_pass"(%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) (%[[DESIGNATE]] : !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>) {pass_arg_pos = 0 : i32} @@ -276,8 +276,8 @@ subroutine check_dispatch_allocatable_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_allocatable_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEp"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! 
CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcall_dispatchFcheck_dispatch_allocatable_arrayEt"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> ! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[LOAD_ARG0]] (%{{.*}}) : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> @@ -306,8 +306,8 @@ subroutine check_dispatch_pointer_array(p, t) ! CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_pointer_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> {fir.bindc_name = "t"}) { -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEp"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMcall_dispatchFcheck_dispatch_pointer_arrayEt"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[ARG0_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>>> @@ -334,8 +334,8 @@ subroutine check_dispatch_dynamic_array_copy(p, o) ! 
CHECK-LABEL: func.func @_QMcall_dispatchPcheck_dispatch_dynamic_array_copy( ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[ARG1:.*]]: !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>> {fir.bindc_name = "o"}) { -! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) -! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEo"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) +! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMcall_dispatchFcheck_dispatch_dynamic_array_copyEp"} : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.dscope) -> (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, !fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>) ! CHECK: %{{.*}} = fir.do_loop {{.*}} { ! CHECK: %[[DESIGNATE0:.*]] = hlfir.designate %[[ARG0_DECL]]#0 (%{{.*}}) : (!fir.class<!fir.array<?x!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>>>, i64) -> !fir.class<!fir.type<_QMcall_dispatchTp1{a:i32,b:i32}>> diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90 index 3b03850b43bb2..9c7d874a1aac8 100644 --- a/flang/test/Lower/do_loop_unstructured.f90 +++ b/flang/test/Lower/do_loop_unstructured.f90 @@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured() subroutine unstructured_do_concurrent logical :: success do concurrent (i=1:10) local(success) + success = .false. error stop "fail" enddo end diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90 index f1e535a2bd3a1..387cd055e6f37 100644 --- a/flang/test/Lower/entry-statement.f90 +++ b/flang/test/Lower/entry-statement.f90 @@ -198,7 +198,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasc"} : (!fir.box<!fir.array<?x!fir.char<1>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.box<!fir.array<?x!fir.char<1>>>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasc"} : (!fir.box<!fir.array<?x!fir.char<1>>>, index, !fir.dscope) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.box<!fir.array<?x!fir.char<1>>>) ! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xi32>> ! CHECK: %[[VAL_7:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> @@ -252,7 +252,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<!fir.array<?x!fir.char<1>>>>) -> !fir.heap<!fir.array<?x!fir.char<1>>> ! CHECK: %[[VAL_13:.*]] = fir.shape_shift %[[VAL_10]]#0, %[[VAL_10]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_13]]) typeparams %[[VAL_11]] {uniq_name = "_QFashapecEasc"} : (!fir.heap<!fir.array<?x!fir.char<1>>>, !fir.shapeshift<1>, index) -> (!fir.box<!fir.array<?x!fir.char<1>>>, !fir.heap<!fir.array<?x!fir.char<1>>>) -! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasi"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: %[[VAL_16:.*]] = fir.zero_bits !fir.heap<!fir.array<?xcomplex<f32>>> ! CHECK: %[[VAL_17:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_17]] : (index) -> !fir.shape<1> @@ -303,7 +303,7 @@ subroutine ashapec(asc) ! CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>> ! CHECK: %[[VAL_23:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_22]](%[[VAL_23]]) {uniq_name = "_QFashapecEasi"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>) -! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFashapecEasx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) +! CHECK: %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFashapecEasx"} : (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.dscope) -> (!fir.box<!fir.array<?xcomplex<f32>>>, !fir.box<!fir.array<?xcomplex<f32>>>) ! CHECK: cf.br ^bb1 ! CHECK: ^bb1: ! CHECK: hlfir.assign %{{.*}} to %[[VAL_25]]#0 : complex<f32>, !fir.box<!fir.array<?xcomplex<f32>>> @@ -343,7 +343,7 @@ function f1(n1) result(res1) ! CHECK-SAME: %[[VAL_1:.*]]: index, ! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "n1"}) -> !fir.boxchar<1> { ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFf1En1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFf1En1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n2", uniq_name = "_QFf1En2"} ! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFf1En2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : index @@ -391,7 +391,7 @@ function f1(n1) result(res1) ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n1", uniq_name = "_QFf1En1"} ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFf1En1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] {uniq_name = "_QFf1En2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {uniq_name = "_QFf1En2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>) ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : index ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] {uniq_name = "_QFf1Eres1"} : (!fir.ref<!fir.char<1,5>>, index) -> (!fir.ref<!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) ! CHECK: %[[VAL_9:.*]] = arith.constant 5 : index @@ -493,7 +493,7 @@ subroutine assumed_size() ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]] = fir.assumed_size_extent : index ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>) ! CHECK: cf.br ^bb1 ! CHECK: ^bb1: ! CHECK: return diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90 similarity index 63% rename from flang/test/Lower/forall-polymorphic.f90 rename to flang/test/Lower/forall-pointer-assignment.f90 index 2b7a51f9b549a..d89fb3ed5cb57 100644 --- a/flang/test/Lower/forall-polymorphic.f90 +++ b/flang/test/Lower/forall-pointer-assignment.f90 @@ -1,6 +1,7 @@ -! Test lower of FORALL polymorphic pointer assignment +! Test lower of FORALL pointer assignment ! RUN: bbc -emit-fir %s -o - | FileCheck %s + !! Test when LHS is polymorphic and RHS is not polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic subroutine forallPolymorphic() @@ -46,6 +47,7 @@ subroutine forallPolymorphic() end subroutine forallPolymorphic + !! Test when LHS is not polymorphic but RHS is polymorphic ! CHECK-LABEL: c.func @_QPforallpolymorphic2( ! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) { @@ -68,7 +70,7 @@ subroutine forallPolymorphic2(Tar1) ! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>> {bindc_name = "t", uniq_name = "_QFforallpolymorphic2Et"} ! CHECK: %[[V_12:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> ! 
CHECK: %[[V_13:[0-9]+]] = fir.declare %[[V_11]](%[[V_12]]) {uniq_name = "_QFforallpolymorphic2Et"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>> -! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> +! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %0 {{.*}} {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> ! CHECK: %[[V_30:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index ! CHECK: %[[V_31:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index ! CHECK: fir.do_loop %arg1 = %[[V_30]] to %[[V_31]] step %c1 @@ -87,3 +89,86 @@ subroutine forallPolymorphic2(Tar1) end subroutine forallPolymorphic2 + +!! Test when LHS is unlimited polymorphic and RHS non-polymorphic intrinsic +!! type target. +! CHECK-LABEL: c.func @_QPforallpolymorphic3 +subroutine forallPolymorphic3() + TYPE :: DT + CLASS(*), POINTER :: Ptr => NULL() + END TYPE + + TYPE(DT) :: D1(10) + CHARACTER*1, TARGET :: TAR1(10) + INTEGER :: I + + FORALL (I=1:10) + D1(I)%Ptr => Tar1(I) + END FORALL + +! CHECK: %[[V_7:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> {bindc_name = "d1", uniq_name = "_QFforallpolymorphic3Ed1"} +! CHECK: %[[V_8:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1> +! CHECK: %[[V_9:[0-9]+]] = fir.declare %[[V_7]](%[[V_8]]) {uniq_name = "_QFforallpolymorphic3Ed1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>> +! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphic3Etar1"} +! CHECK: %[[V_17:[0-9]+]] = fir.declare %[[V_16]](%[[V_8]]) typeparams %c1 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphic3Etar1"} : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, index) -> !fir.ref<!fir.array<10x!fir.char<1>>> +! CHECK: %[[V_24:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: fir.do_loop %arg0 = %[[V_24]] to %[[V_25]] step %c1 +! CHECK: { +! CHECK: %[[V_26:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: %[[V_27:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! 
CHECK: %[[V_28:[0-9]+]] = fir.array_coor %[[V_9]](%[[V_8]]) %[[V_27]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>> +! CHECK: %[[V_29:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}> +! CHECK: %[[V_30:[0-9]+]] = fir.coordinate_of %[[V_28]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic3Tdt{ptr:!fir.class<!fir.ptr<none>>}>>) -> !fir.ref<!fir.class<!fir.ptr<none>>> +! CHECK: %[[V_31:[0-9]+]] = fir.convert %[[V_26]] : (i32) -> i64 +! CHECK: %[[V_32:[0-9]+]] = fir.array_coor %[[V_17]](%[[V_8]]) %[[V_31]] : (!fir.ref<!fir.array<10x!fir.char<1>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.char<1>> +! CHECK: %[[V_33:[0-9]+]] = fir.embox %[[V_32]] : (!fir.ref<!fir.char<1>>) -> !fir.box<!fir.ptr<!fir.char<1>>> +! CHECK: %[[V_34:[0-9]+]] = fir.rebox %[[V_33]] : (!fir.box<!fir.ptr<!fir.char<1>>>) -> !fir.class<!fir.ptr<none>> +! CHECK: fir.store %[[V_34]] to %[[V_30]] : !fir.ref<!fir.class<!fir.ptr<none>>> +! CHECK: } + +end subroutine forallPolymorphic3 + + +!! Test that the LHS of a pointer assignment gets the isPointer flag from the +!! RHS that is a reference to a function that returns a pointer. +! CHECK-LABEL: c.func @_QPforallpointerassignment1 + subroutine forallPointerAssignment1() + type base + real, pointer :: data => null() + end type + + interface + pure function makeData (i) + real, pointer :: makeData + integer*4, intent(in) :: i + end function + end interface + + type(base) :: co1(10) + + forall (i=1:10) + co1(i)%data => makeData (i) + end forall + +! CHECK: %{{[0-9]+}} = fir.alloca i64 +! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = ".result"} +! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index +! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index +! CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>> +! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8> +! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref<i8>, i32) -> !fir.llvm_ptr<i8> +! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1 +! CHECK: { +! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32 +! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref<i32> +! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>) -> !fir.box<!fir.ptr<f32>> +! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box<!fir.ptr<f32>>, !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref<!fir.box<!fir.ptr<f32>>> +! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box<!fir.ptr<f32>>) -> !fir.box<none> +! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> () +! CHECK: } + + end subroutine forallPointerAssignment1 diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90 index fd3efed736c39..6b8c5648af29e 100644 --- a/flang/test/Lower/forall/array-pointer.f90 +++ b/flang/test/Lower/forall/array-pointer.f90 @@ -318,7 +318,6 @@ end subroutine s2_3 !
CHECK-LABEL: func @_QPs2_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.type<_QMarray_of_pointer_testTt{ip:!fir.box<!fir.ptr<i32>>}>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"} ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "_QFs2_3Ey.addr"} ! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"} ! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"} diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90 index 96cd37ea3ed8a..8e54d282aea4b 100644 --- a/flang/test/Lower/forall/forall-allocatable.f90 +++ b/flang/test/Lower/forall/forall-allocatable.f90 @@ -13,20 +13,19 @@ end subroutine forall_with_allocatable ! CHECK-LABEL: func @_QPforall_with_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>{{.*}}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"} -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} -! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} -! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} -! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> -! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} +! CHECK: %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} +! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} +! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> +! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 15 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref<index> -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref<index> -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref<index> +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref<index> +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>) -> !fir.array<?xf32> ! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.array<?xf32> diff --git a/flang/test/Lower/forall/scalar-substring.f90 b/flang/test/Lower/forall/scalar-substring.f90 index f70221a9b31ee..8ed2406b58539 100644 --- a/flang/test/Lower/forall/scalar-substring.f90 +++ b/flang/test/Lower/forall/scalar-substring.f90 @@ -12,7 +12,7 @@ end subroutine s ! 
CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,10>> ! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] {uniq_name = "_QFsEch"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %[[VAL_4]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {uniq_name = "_QFsEch"} : (!fir.ref<!fir.char<1,10>>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,10>>) ! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_7:.*]] = arith.constant 4 : i32 ! CHECK: hlfir.forall lb { diff --git a/flang/test/Lower/force-temp.f90 b/flang/test/Lower/force-temp.f90 index d9ba543d46313..093e098d10ac7 100644 --- a/flang/test/Lower/force-temp.f90 +++ b/flang/test/Lower/force-temp.f90 @@ -27,6 +27,14 @@ subroutine pass_intent_out(buf) integer, intent(out) :: buf(5) end subroutine end interface + + ! Used by call_s6() and others below + type base + integer :: i = -1 + end type + type, extends (base) :: child + real :: r = -2.0 + end type contains subroutine s1(buf) !CHECK-LABEL: func.func @_QMtestPs1 @@ -79,4 +87,54 @@ subroutine s5() p => x(::2) ! pointer to non-contiguous array section call pass_intent_out(p) end subroutine + subroutine call_s6() + interface + subroutine s6(b) + import :: base + type(base), intent(inout) :: b(:) + end subroutine s6 + end interface + class(base), pointer :: pb(:) + type(child), target :: c(2) +!CHECK-LABEL: func.func @_QMtestPcall_s6 +!CHECK-NOT: hlfir.copy_in +!CHECK: fir.call @_QPs6 +!CHECK-NOT: hlfir.copy_out + pb => c + call s6(pb) + end subroutine call_s6 + subroutine call_s7() + interface + subroutine s7(b1, b2, n) + import :: base + integer :: n + type(base), intent(inout) :: b1(n) + type(base), intent(inout) :: b2(*) + end subroutine + end interface + integer, parameter :: n = 7 + class(base), allocatable :: c1(:), c2(:) +!CHECK-LABEL: func.func @_QMtestPcall_s7 +!CHECK: hlfir.copy_in +!CHECK: hlfir.copy_in +!CHECK: fir.call @_QPs7 +!CHECK: hlfir.copy_out +!CHECK: hlfir.copy_out + call s7(c1, c2, n) + end subroutine call_s7 + subroutine call_s8() + interface + subroutine s8(buf) + ! IGNORE_TKR(C) takes precedence over CONTIGUOUS + !DIR$ IGNORE_TKR(C) buf + real, contiguous :: buf(:) + end subroutine + end interface + real a(10) +!CHECK-LABEL: func.func @_QMtestPcall_s8 +!CHECK-NOT: hlfir.copy_in +!CHECK: fir.call @_QPs8 +!CHECK-NOT: hlfir.copy_out + call s8(a(1:5:2)) + end subroutine call_s8 end module diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 2fea84b03891a..5ee6562733dae 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -90,7 +90,6 @@ subroutine lis(n) ! CHECK-DAG: fir.alloca !fir.array<?x?x?xi32>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"} ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"} ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"} - ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"} integer, target :: a(n,n,n) ! operand via p integer :: r(n,n) ! result, unspecified locality integer :: s(n,n) !
shared locality diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index f586380e653a0..bc4eed54282df 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p) ! First test is here to have a reference with non polymorphic on both sides. ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> @@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly( ! CHECK-SAME: %arg0: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> @@ -1103,11 +1101,9 @@ subroutine class_with_entry(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry( ! CHECK-SAME: %[[A:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "a"}) { -! CHECK: %[[B:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"} ! CHECK-LABEL: func.func @_QMpolymorphic_testPd( ! CHECK-SAME: %[[B:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "b"}) { -! CHECK: %[[A:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"} subroutine class_array_with_entry(a) class(p1) :: a(:), b(:) diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index cfec06c35baa8..fe07649e669af 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n) character(n) :: argc character(*) :: c ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] : - ! CHECK: fir.load %[[arg1]] : !fir.ref<i32> ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref<i32> ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32 ! 
CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32 diff --git a/flang/test/Lower/structure-constructors-alloc-comp.f90 b/flang/test/Lower/structure-constructors-alloc-comp.f90 index 9df5be11d1784..c624433c1ba5f 100644 --- a/flang/test/Lower/structure-constructors-alloc-comp.f90 +++ b/flang/test/Lower/structure-constructors-alloc-comp.f90 @@ -24,7 +24,7 @@ subroutine test_alloc1(y) ! HLFIR-LABEL: func.func @_QMm_struct_ctorPtest_alloc1( ! HLFIR-SAME: %[[ARG_0:.*]]: !fir.ref<f32> {fir.bindc_name = "y"}) { ! HLFIR: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}> -! HLFIR: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! HLFIR: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc1Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) ! HLFIR: %[[VAL_14:.*]] = fir.embox %[[VAL_13]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> ! HLFIR: %[[VAL_15:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> @@ -49,8 +49,8 @@ subroutine test_alloc2(y, b) ! HLFIR: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}> ! HLFIR: %[[CONS_6:.*]] = arith.constant 5 : index ! HLFIR: %[[VAL_12:.*]] = fir.shape %[[CONS_6]] : (index) -> !fir.shape<1> -! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>) -! HLFIR: %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) +! HLFIR: %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_12]]) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Eb"} : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>) +! HLFIR: %[[VAL_14:.*]]:2 = hlfir.declare %[[ARG_0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QMm_struct_ctorFtest_alloc2Ey"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>) ! HLFIR: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "ctor.temp"} : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>, !fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) ! 
HLFIR: %[[VAL_16:.*]] = fir.embox %[[VAL_15]]#0 : (!fir.ref<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>>) -> !fir.box<!fir.type<_QMm_struct_ctorTt_alloc{x:f32,a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> ! HLFIR: %[[VAL_17:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref<!fir.char<1,{{.*}}>> diff --git a/flang/test/Lower/taskloop-inreduction.f90 b/flang/test/Lower/taskloop-inreduction.f90 new file mode 100644 index 0000000000000..e7d3f96115fbd --- /dev/null +++ b/flang/test/Lower/taskloop-inreduction.f90 @@ -0,0 +1,40 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPomp_taskloop_inreduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_inreductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_inreductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_inreductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> +subroutine omp_taskloop_inreduction() + integer x + x = 0 + ! CHECK: omp.taskloop in_reduction(@[[ADD_RED_I32]] + ! CHECK: %[[DECL_X]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) private(@[[PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG0]] + ! CHECK-SAME: {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !$omp taskloop in_reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/taskloop-reduction.f90 b/flang/test/Lower/taskloop-reduction.f90 new file mode 100644 index 0000000000000..e45c0181bcc8b --- /dev/null +++ b/flang/test/Lower/taskloop-reduction.f90 @@ -0,0 +1,39 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! 
CHECK-LABEL: func.func @_QPomp_taskloop_reduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_reductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_reductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> +subroutine omp_taskloop_reduction() + integer x + x = 0 + ! CHECK: omp.taskloop private(@[[PRIVATE_I]] + ! CHECK-SAME: %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) reduction(@[[ADD_RED_I32]] %[[DECL_X]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] + !$omp taskloop reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/unsigned-ops.f90 b/flang/test/Lower/unsigned-ops.f90 index 13e17721ceb9f..644f0c4ea11c9 100644 --- a/flang/test/Lower/unsigned-ops.f90 +++ b/flang/test/Lower/unsigned-ops.f90 @@ -10,8 +10,8 @@ unsigned function f01(u, v) !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope !CHECK: %[[VAL_1:.*]] = fir.alloca ui32 {bindc_name = "f01", uniq_name = "_QFf01Ef01"} !CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFf01Ef01"} : (!fir.ref<ui32>) -> !fir.ref<ui32> -!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> -!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf01Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> !CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]] : !fir.ref<ui32> !CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref<ui32> !CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (ui32) -> i32 @@ -35,8 +35,8 @@ unsigned function f02(u, v) !CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope !CHECK: %[[VAL_1:.*]] = fir.alloca ui32 {bindc_name = "f02", uniq_name = "_QFf02Ef02"} !CHECK: %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFf02Ef02"} : (!fir.ref<ui32>) -> !fir.ref<ui32> -!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> -!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_3:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_0]] arg 
{{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Eu"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> +!CHECK: %[[VAL_4:.*]] = fir.declare %[[ARG1]] dummy_scope %[[VAL_0]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFf02Ev"} : (!fir.ref<ui32>, !fir.dscope) -> !fir.ref<ui32> !CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]] : !fir.ref<ui32> !CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref<ui32> !CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (ui32) -> i64 diff --git a/flang/test/Lower/volatile-derived-type.f90 b/flang/test/Lower/volatile-derived-type.f90 index 963e4cf45a761..9691df275ad3e 100644 --- a/flang/test/Lower/volatile-derived-type.f90 +++ b/flang/test/Lower/volatile-derived-type.f90 @@ -43,6 +43,6 @@ subroutine test(v) ! CHECK-LABEL: func.func private @_QFPtest( ! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>> {fir.asynchronous, fir.bindc_name = "v"}) attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<asynchronous>, uniq_name = "_QFFtestEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<asynchronous>, uniq_name = "_QFFtestEv"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>) ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90 index 54f22af5ca26b..f3e291dd19182 100644 --- a/flang/test/Lower/volatile-string.f90 +++ b/flang/test/Lower/volatile-string.f90 @@ -76,7 +76,7 @@ subroutine assign_different_length(string) ! CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_5:.*]] = fir.volatile_cast %[[VAL_4]] : (!fir.ref<!fir.char<1,3>>) -> !fir.ref<!fir.char<1,3>, volatile> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_same_lengthEx"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] typeparams %[[VAL_1]] dummy_scope %[[VAL_2]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_same_lengthEx"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) ! CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX626172) : !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX626172"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>) ! CHECK: hlfir.assign %[[VAL_8]]#0 to %[[VAL_6]]#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>, volatile> @@ -91,7 +91,7 @@ subroutine assign_different_length(string) ! 
CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,3>> ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_5]] : (!fir.ref<!fir.char<1,3>>) -> !fir.ref<!fir.char<1,3>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_2]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_different_lengthEstring"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] typeparams %[[VAL_2]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<intent_inout, volatile>, uniq_name = "_QFFassign_different_lengthEstring"} : (!fir.ref<!fir.char<1,3>, volatile>, index, !fir.dscope) -> (!fir.ref<!fir.char<1,3>, volatile>, !fir.ref<!fir.char<1,3>, volatile>) ! CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQclX626F) : !fir.ref<!fir.char<1,2>> ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX626F"} : (!fir.ref<!fir.char<1,2>>, index) -> (!fir.ref<!fir.char<1,2>>, !fir.ref<!fir.char<1,2>>) ! CHECK: hlfir.assign %[[VAL_9]]#0 to %[[VAL_7]]#0 : !fir.ref<!fir.char<1,2>>, !fir.ref<!fir.char<1,3>, volatile> diff --git a/flang/test/Lower/volatile3.f90 b/flang/test/Lower/volatile3.f90 index a32f29d2bb9e7..9bee56e660531 100644 --- a/flang/test/Lower/volatile3.f90 +++ b/flang/test/Lower/volatile3.f90 @@ -178,7 +178,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_4]] {uniq_name = "_QFFsub_nonvolatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_4]] arg {{[0-9]+}} {uniq_name = "_QFFsub_nonvolatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_1]]) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.ref<i32> ! CHECK: return @@ -190,7 +190,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shapeEarr"} : (!fir.box<!fir.array<?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>, volatile>, !fir.box<!fir.array<?xi32>, volatile>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shapeEarr"} : (!fir.box<!fir.array<?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>, volatile>, !fir.box<!fir.array<?xi32>, volatile>) ! 
CHECK: %[[VAL_6:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_1]]) : (!fir.box<!fir.array<?xi32>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_6]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -202,7 +202,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<?x?xi32>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shape_2dEarr"} : (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.box<!fir.array<?x?xi32>, volatile>) +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_array_assumed_shape_2dEarr"} : (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>, volatile>, !fir.box<!fir.array<?x?xi32>, volatile>) ! CHECK: %[[VAL_6:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_1]], %[[VAL_1]]) : (!fir.box<!fir.array<?x?xi32>, volatile>, index, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_6]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -216,7 +216,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.ref<!fir.array<10xi32>>) -> !fir.ref<!fir.array<10xi32>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>, volatile>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>, volatile>, !fir.ref<!fir.array<10xi32>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) dummy_scope %[[VAL_4]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_volatile_arrayEarr"} : (!fir.ref<!fir.array<10xi32>, volatile>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>, volatile>, !fir.ref<!fir.array<10xi32>, volatile>) ! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_1]]) : (!fir.ref<!fir.array<10xi32>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_8]] : i32, !fir.ref<i32, volatile> ! CHECK: return @@ -228,7 +228,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_4:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile> -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] {fortran_attrs = #fir.var_attrs<pointer, volatile>, uniq_name = "_QFFsub_volatile_array_pointerEarr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>) +! 
CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] dummy_scope %[[VAL_3]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer, volatile>, uniq_name = "_QFFsub_volatile_array_pointerEarr"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile>) ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, volatile> ! CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_6]] (%[[VAL_1]]) : (!fir.box<!fir.ptr<!fir.array<?xi32>>, volatile>, index) -> !fir.ref<i32, volatile> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.ref<i32, volatile> @@ -243,7 +243,7 @@ subroutine sub_select_rank(arr) ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i8 ! CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_6:.*]] = fir.volatile_cast %[[VAL_0]] : (!fir.box<!fir.array<*:i32>>) -> !fir.box<!fir.array<*:i32>, volatile> -! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_select_rankEarr"} : (!fir.box<!fir.array<*:i32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<*:i32>, volatile>, !fir.box<!fir.array<*:i32>, volatile>) +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] dummy_scope %[[VAL_5]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFFsub_select_rankEarr"} : (!fir.box<!fir.array<*:i32>, volatile>, !fir.dscope) -> (!fir.box<!fir.array<*:i32>, volatile>, !fir.box<!fir.array<*:i32>, volatile>) ! CHECK: %[[VAL_8:.*]] = fir.volatile_cast %[[VAL_7]]#0 : (!fir.box<!fir.array<*:i32>, volatile>) -> !fir.box<!fir.array<*:i32>> ! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<*:i32>>) -> !fir.box<none> ! 
CHECK: %[[VAL_10:.*]] = fir.call @_FortranAIsAssumedSize(%[[VAL_9]]) : (!fir.box<none>) -> i1 diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP 
ALLOCATE (j) ALIGN(16_4) -!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block 
+!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> 
DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 94bc2adf35ea9..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ 
b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ -9,6 +9,7 @@ program allocate_unparse ! 2.11.3 declarative allocate +!$omp allocate !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) @@ -28,19 +29,24 @@ program allocate_unparse !$omp allocate(j) align(16) allocate ( darray(z, t) ) +!$omp allocate + allocate ( darray(a, b) ) end program allocate_unparse -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE{{[ ]*$}} +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) +!CHECK:!$OMP ALLOCATE{{[ ]*$}} +!CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 index 7d41efd348e50..599821dbe3377 100644 --- a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 +++ b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 @@ -26,21 +26,21 @@ subroutine f00(n) subroutine f01(n) implicit none integer :: n - !$omp target dyn_groupprivate(strict: n) + !$omp target dyn_groupprivate(fallback(abort): n) !$omp end target end !UNPARSE: SUBROUTINE f01 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(STRICT: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(ABORT): n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Strict +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Abort !PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' !PARSE-TREE: | Flags = None @@ -49,21 +49,21 @@ subroutine f01(n) subroutine f02(n) implicit none integer :: n - !$omp target dyn_groupprivate(fallback, cgroup: n) + !$omp target dyn_groupprivate(fallback(default_mem), cgroup: n) !$omp end target end !UNPARSE: SUBROUTINE f02 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK, CGROUP: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(DEFAULT_MEM), CGROUP: n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Fallback +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Default_Mem !PARSE-TREE: | | Modifier -> OmpAccessGroup -> Value = Cgroup !PARSE-TREE: | | 
Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90 new file mode 100644 index 0000000000000..2a10bbe666bb8 --- /dev/null +++ b/flang/test/Parser/OpenMP/nested-directive.f90 @@ -0,0 +1,7 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines + +subroutine func + implicit none +! CHECK: !$OMP NOTHING + !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment +end subroutine func diff --git a/flang/test/Parser/prefetch.f90 b/flang/test/Parser/prefetch.f90 new file mode 100644 index 0000000000000..1013a09c92117 --- /dev/null +++ b/flang/test/Parser/prefetch.f90 @@ -0,0 +1,80 @@ +!RUN: %flang_fc1 -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s -check-prefix=UNPARSE +!RUN: %flang_fc1 -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s -check-prefix=TREE + +subroutine test_prefetch_01(a, b) + integer, intent(in) :: a + integer, intent(inout) :: b(5) + integer :: i = 2 + integer :: res + +!TREE: | | DeclarationConstruct -> SpecificationConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'a' + +!UNPARSE: !DIR$ PREFETCH a + !dir$ prefetch a + b(1) = a + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'b' + +!UNPARSE: !DIR$ PREFETCH b + !dir$ prefetch b + res = sum(b) + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'a' +!TREE: | | Designator -> DataRef -> ArrayElement +!TREE: | | | DataRef -> Name = 'b' +!TREE: | | | SectionSubscript -> SubscriptTriplet +!TREE: | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '3' +!TREE: | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '5' + +!UNPARSE: !DIR$ PREFETCH a, b(3:5) + !dir$ prefetch a, b(3:5) + res = a + b(4) + +!TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> Name = 'res' +!TREE: | | Designator -> DataRef -> ArrayElement +!TREE: | | | DataRef -> Name = 'b' +!TREE: | | | SectionSubscript -> Integer -> Expr -> Add +!TREE: | | | | Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | Expr -> LiteralConstant -> IntLiteralConstant = '2' + +!UNPARSE: !DIR$ PREFETCH res, b(i+2) + !dir$ prefetch res, b(i+2) + res = res + b(i+2) +end subroutine + +subroutine test_prefetch_02(n, a) + integer, intent(in) :: n + integer, intent(in) :: a(n) + type :: t + real, allocatable :: x(:, :) + end type t + type(t) :: p + + do i = 1, n +!TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> ArrayElement +!TREE: | | | | | DataRef -> StructureComponent +!TREE: | | | | | | DataRef -> Name = 'p' +!TREE: | | | | | | Name = 'x' +!TREE: | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | | SectionSubscript -> SubscriptTriplet +!TREE: | | | | Designator -> DataRef -> Name = 'a' + +!UNPARSE: !DIR$ PREFETCH p%x(i,:), a + !dir$ prefetch p%x(i, :), a + do j = 1, n +!TREE: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> CompilerDirective -> Prefetch -> Designator -> DataRef -> ArrayElement +!TREE: | | | | | | | DataRef -> StructureComponent +!TREE: | | | | | | | | DataRef -> Name = 'p' +!TREE: | | | | | | | | Name = 'x' +!TREE: | | | | | | | 
SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' +!TREE: | | | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'j' +!TREE: | | | | | | Designator -> DataRef -> ArrayElement +!TREE: | | | | | | | DataRef -> Name = 'a' +!TREE: | | | | | | | SectionSubscript -> Integer -> Expr -> Designator -> DataRef -> Name = 'i' + +!UNPARSE: !DIR$ PREFETCH p%x(i,j), a(i) + !dir$ prefetch p%x(i, j), a(i) + p%x(i, j) = p%x(i, j) ** a(j) + end do + end do +end subroutine diff --git a/flang/test/Preprocessing/bug136845.F b/flang/test/Preprocessing/bug136845.F index ce52c2953bb57..311ee0a2d874c 100644 --- a/flang/test/Preprocessing/bug136845.F +++ b/flang/test/Preprocessing/bug136845.F @@ -18,7 +18,6 @@ *$1 continue end -!PREPRO:!$ & !PREPRO: continue !PREPRO: k=0 !PREPRO: k=0 diff --git a/flang/test/Preprocessing/cond-comment.f b/flang/test/Preprocessing/cond-comment.f new file mode 100644 index 0000000000000..a484fcbfa8fa7 --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: + end +c$ ! diff --git a/flang/test/Preprocessing/cond-comment.f90 b/flang/test/Preprocessing/cond-comment.f90 new file mode 100644 index 0000000000000..457614ae9372e --- /dev/null +++ b/flang/test/Preprocessing/cond-comment.f90 @@ -0,0 +1,5 @@ +!RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: END +!CHECK-NOT: error: +end +!$ ! diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- 
a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90 index f4f11e229a28b..3f59493713213 100644 --- a/flang/test/Semantics/OpenMP/allocate08.f90 +++ b/flang/test/Semantics/OpenMP/allocate08.f90 @@ -1,11 +1,11 @@ ! REQUIRES: openmp_runtime -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -! OpenMP Version 5.0 +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 +! OpenMP Version 5.1 ! 2.11.3 allocate Directive ! If list items within the ALLOCATE directive have the SAVE attribute, are a -! common block name, or are declared in the scope of a module, then only -! predefined memory allocator parameters can be used in the allocator clause +! common block name, then only predefined memory allocator parameters can be +! 
used in the allocator clause module AllocateModule INTEGER :: z diff --git a/flang/test/Semantics/OpenMP/allocate09.f90 b/flang/test/Semantics/OpenMP/allocate09.f90 index 0f93a340fe1e4..8b8d07ccd0be8 100644 --- a/flang/test/Semantics/OpenMP/allocate09.f90 +++ b/flang/test/Semantics/OpenMP/allocate09.f90 @@ -23,11 +23,11 @@ subroutine allocate() !$omp allocate allocate(e(5), f(6), g(7)) - !ERROR: Object 'i' in ALLOCATE directive not found in corresponding ALLOCATE statement + !ERROR: A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement !$omp allocate(h, i) allocator(omp_default_mem_alloc) allocate(h(8)) - !ERROR: Object 'j' in ALLOCATE directive not found in corresponding ALLOCATE statement + !ERROR: A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement !$omp allocate(j, k) allocator(omp_default_mem_alloc) !$omp allocate(l) allocator(omp_default_mem_alloc) allocate(k(9), l(10)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 new file mode 100644 index 0000000000000..0a9e85b8ae2fe --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -0,0 +1,11 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x, y + + continue + !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items + !$omp allocate + allocate(x, y) +end diff --git a/flang/test/Semantics/OpenMP/allocate11.f90 b/flang/test/Semantics/OpenMP/allocate11.f90 new file mode 100644 index 0000000000000..89beaa0450169 --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate11.f90 @@ -0,0 +1,27 @@ +! REQUIRES: openmp_runtime + +! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 +! OpenMP Version 5.0 +! 2.11.3 allocate Directive +! If list items within the ALLOCATE directive have the SAVE attribute, are a +! common block name, or are declared in the scope of a module, then only +! 
predefined memory allocator parameters can be used in the allocator clause + +module AllocateModule + INTEGER :: z + !ERROR: If a list item is a named common block, has SAVE attribute or is declared in the scope of a module, an ALLOCATOR clause must be present with a predefined allocator + !$omp allocate(z) +end module + +subroutine allocate(custom_allocator) +use omp_lib +use AllocateModule + integer, SAVE :: x + integer :: w + COMMON /CommonName/ y + + integer(kind=omp_allocator_handle_kind) :: custom_allocator + + !ERROR: If a list item is a named common block, has SAVE attribute or is declared in the scope of a module, an ALLOCATOR clause must be present with a predefined allocator + !$omp allocate(x) +end subroutine allocate diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end diff --git a/flang/test/Semantics/OpenMP/allocators01.f90 b/flang/test/Semantics/OpenMP/allocators01.f90 index ff92fa3b23463..a3342063e25f2 100644 --- a/flang/test/Semantics/OpenMP/allocators01.f90 +++ b/flang/test/Semantics/OpenMP/allocators01.f90 @@ -16,7 +16,7 @@ subroutine allocate() allocate(arr3(3), arr4(4, 4)) !$omp end allocators - !ERROR: Object 'arr1' in ALLOCATORS directive not found in corresponding ALLOCATE statement + !ERROR: A list item in an ALLOCATORS construct must be specified on the associated ALLOCATE statement !$omp allocators allocate(omp_default_mem_alloc: arr1, arr2) allocate(arr2(2, 2)) diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90 deleted file mode 100644 index 212e48fbd1b26..0000000000000 --- a/flang/test/Semantics/OpenMP/allocators04.f90 +++ /dev/null @@ -1,31 +0,0 @@ -! REQUIRES: openmp_runtime - -! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50 -! OpenMP Version 5.2 -! Inherited from 2.11.3 allocate Directive -! If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a -! module, then only predefined memory allocator parameters can be used in the allocator clause -! 
SAVE and common block names can't be declared as allocatable, only module scope variables are tested - -module AllocateModule - integer, allocatable :: a, b -end module - -subroutine allocate() - use omp_lib - use AllocateModule - - integer(kind=omp_allocator_handle_kind) :: custom_allocator - type(omp_alloctrait) :: trait(1) - - trait(1)%key = fallback - trait(1)%value = default_mem_fb - custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait) - - !$omp allocators allocate(omp_default_mem_alloc: a) - allocate(a) - - !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause - !$omp allocators allocate(custom_allocator: b) - allocate(b) -end subroutine diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90 index efacdfaec7647..f49182f128e74 100644 --- a/flang/test/Semantics/OpenMP/allocators05.f90 +++ b/flang/test/Semantics/OpenMP/allocators05.f90 @@ -17,7 +17,7 @@ subroutine allocate() !$omp target private(a, b) !$omp allocators allocate(omp_default_mem_alloc: a) allocate(a(LEN)) - !ERROR: ALLOCATORS directives that appear in a TARGET region must specify an allocator + !ERROR: An ALLOCATE clause in a TARGET region must specify an allocator or REQUIRES(DYNAMIC_ALLOCATORS) must be specified !$omp allocators allocate(b) allocate(b(LEN)) !$omp end target diff --git a/flang/test/Semantics/OpenMP/allocators07.f90 b/flang/test/Semantics/OpenMP/allocators07.f90 index a28f706965cb1..baaacee8b691e 100644 --- a/flang/test/Semantics/OpenMP/allocators07.f90 +++ b/flang/test/Semantics/OpenMP/allocators07.f90 @@ -5,7 +5,7 @@ subroutine f00 integer, allocatable :: a(:) !$omp allocators allocate(a) -!ERROR: The body of the ALLOCATORS construct should be an ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement continue end @@ -13,7 +13,7 @@ subroutine f01 implicit none integer, allocatable :: a(:) -!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement !$omp allocators allocate(a) !$omp end allocators end @@ -22,6 +22,6 @@ subroutine f02 implicit none integer, allocatable :: a(:) -!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement +!ERROR: The body of an ALLOCATORS construct should be an ALLOCATE statement !$omp allocators allocate(a) end diff --git a/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 new file mode 100644 index 0000000000000..08e8ebc995097 --- /dev/null +++ b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 @@ -0,0 +1,96 @@ +! 
RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51 + +subroutine defaultmap_all_none_no_errors + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none) map(to: index, alloca) map(tofrom: array, ptr) + do index = 1, 10 + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none_no_errors + +subroutine defaultmap_all_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + !$omp target defaultmap(none) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none + +subroutine defaultmap_scalar_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: scalar) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_scalar_none + +subroutine defaultmap_pointer_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: pointer) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_pointer_none + +subroutine defaultmap_allocatable_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: allocatable) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_allocatable_none + +subroutine defaultmap_aggregate_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: 
aggregate) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_aggregate_none diff --git a/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 new file mode 100644 index 0000000000000..f77a0b0d35f44 --- /dev/null +++ b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 @@ -0,0 +1,8 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=61 + +subroutine f00(x) + integer :: x + !ERROR: The access-group modifier can only occur on a single clause in a construct + !$omp target dyn_groupprivate(cgroup: x), dyn_groupprivate(10) + !$omp end target +end diff --git a/flang/test/Semantics/OpenMP/in-reduction.f90 b/flang/test/Semantics/OpenMP/in-reduction.f90 index 1b82134b7104b..3f1e735214061 100644 --- a/flang/test/Semantics/OpenMP/in-reduction.f90 +++ b/flang/test/Semantics/OpenMP/in-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(2)) !$omp end target @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(1:10)) !$omp end target diff --git a/flang/test/Semantics/OpenMP/reduction15.f90 b/flang/test/Semantics/OpenMP/reduction15.f90 index 1d4de6ff702bb..61fa417f1111c 100644 --- a/flang/test/Semantics/OpenMP/reduction15.f90 +++ b/flang/test/Semantics/OpenMP/reduction15.f90 @@ -13,6 +13,7 @@ module m subroutine f00 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(2)) do i = 1, 10 @@ -22,6 +23,7 @@ subroutine f00 subroutine f01 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(1:10)) do i = 1, 10 diff --git a/flang/test/Semantics/OpenMP/reduction17.f90 b/flang/test/Semantics/OpenMP/reduction17.f90 new file mode 100644 index 0000000000000..5b6e8e977f46c --- /dev/null +++ b/flang/test/Semantics/OpenMP/reduction17.f90 @@ -0,0 +1,18 @@ +! Test that structure component array elements are caught by semantic analysis and produce an error +!
RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=45 + +type test_type + integer :: array(2) +end type + +contains + subroutine test + type(test_type) :: x + + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause + !$omp do reduction(+: x%array(2)) + do i=1, 2 + end do + !$omp end do + end subroutine +end diff --git a/flang/test/Semantics/OpenMP/task-reduction.f90 b/flang/test/Semantics/OpenMP/task-reduction.f90 index 5a18ee48e7728..f76b07ae568f4 100644 --- a/flang/test/Semantics/OpenMP/task-reduction.f90 +++ b/flang/test/Semantics/OpenMP/task-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(2)) !$omp end taskgroup @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(1:10)) !$omp end taskgroup diff --git a/flang/test/Semantics/OpenMP/taskloop04.f90 b/flang/test/Semantics/OpenMP/taskloop04.f90 new file mode 100644 index 0000000000000..4ffcf84f708e9 --- /dev/null +++ b/flang/test/Semantics/OpenMP/taskloop04.f90 @@ -0,0 +1,15 @@ +! When lowering TASKLOOP, it is possible for the TileSizes clause to be lowered, even though it is not a supported clause. +! Make sure that any use of TileSizes with TASKLOOP is correctly rejected by semantic analysis. +! RUN: %python %S/../test_errors.py %s %flang -fopenmp + +subroutine test + integer :: i, sum + + !ERROR: TILE cannot follow TASKLOOP + !ERROR: SIZES clause is not allowed on the TASKLOOP directive + !$omp taskloop tile sizes(2) + do i=1,10 + sum = sum + i + end do + !$omp end taskloop +end subroutine diff --git a/flang/test/Semantics/allocate14.f90 b/flang/test/Semantics/allocate14.f90 new file mode 100644 index 0000000000000..a97cf5ad88b08 --- /dev/null +++ b/flang/test/Semantics/allocate14.f90 @@ -0,0 +1,56 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +!
Check for semantic errors in ALLOCATE statements + +program allocate14 + + integer, allocatable :: i1, i2 + character(200), allocatable :: msg1, msg2 + type t + integer, allocatable :: i + character(10), allocatable :: msg + end type t + type(t) :: tt(2) + type(t), allocatable :: ts(:) + + allocate(i1) + allocate(msg1) + + allocate(i2, stat=i1, errmsg=msg1) + allocate(msg2, stat=i1, errmsg=msg1) + deallocate(i2, stat=i1, errmsg=msg1) + deallocate(msg2, stat=i1, errmsg=msg1) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(msg2, stat=i2, errmsg=msg2) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(i2, stat=i2, errmsg=msg2) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(msg2, stat=i2, errmsg=msg2) + + allocate(tt(1)%i) + allocate(tt(1)%msg) + + allocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + allocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%i, stat=tt(1)%i, errmsg=tt(1)%msg) + deallocate(tt(2)%msg, stat=tt(1)%i, errmsg=tt(1)%msg) + + !ERROR: STAT variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: STAT variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%i, stat=tt(2)%i, errmsg=tt(2)%msg) + !ERROR: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(tt(2)%msg, stat=tt(2)%i, errmsg=tt(2)%msg) + + !TODO: STAT variable in ALLOCATE must not be the variable being allocated + !TODO: ERRMSG variable in ALLOCATE must not be the variable being allocated + allocate(ts(10), stat=ts(1)%i, errmsg=ts(1)%msg) + !TODO: STAT variable in DEALLOCATE must not be the variable being deallocated + !TODO: ERRMSG variable in DEALLOCATE must not be the variable being deallocated + deallocate(ts, stat=ts(1)%i, errmsg=ts(1)%msg) +end program + diff --git a/flang/test/Semantics/coarrays02.f90 b/flang/test/Semantics/coarrays02.f90 index b16e0ccb58797..e866dd89c07ab 100644 --- a/flang/test/Semantics/coarrays02.f90 +++ b/flang/test/Semantics/coarrays02.f90 @@ -16,6 +16,8 @@ program main type(event_type) event !ERROR: Variable 'lock' with EVENT_TYPE or LOCK_TYPE must be a coarray type(lock_type) lock + !ERROR: Variable 'notify' with NOTIFY_TYPE must be a coarray + type(notify_type) notify integer :: local[*] ! 
ok in main end @@ -120,3 +122,18 @@ subroutine s4 !ERROR: Subscripts must appear in a coindexed reference when its base is an array print *, ta(1)%a[1] end + +subroutine s5(a, notify, res) + use iso_fortran_env + type t + type(notify_type) :: a + end type + real, intent(in) :: a[*] + type(event_type), intent(in) :: notify[*] + !ERROR: An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE + type(notify_type), intent(out) :: res[*] + !ERROR: Variable 'bad' with NOTIFY_TYPE potential component '%a' must be a coarray + type(t) :: bad + !ERROR: NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV + print *, a[1, NOTIFY=notify] +end diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90 index 0fc56f66ad32d..a336a7a67669a 100644 --- a/flang/test/Semantics/notifywait03.f90 +++ b/flang/test/Semantics/notifywait03.f90 @@ -10,6 +10,7 @@ program test_notify_wait implicit none ! notify_type variables must be coarrays + !ERROR: Variable 'non_coarray' with NOTIFY_TYPE must be a coarray type(notify_type) :: non_coarray type(notify_type) :: notify_var[*], notify_array(2)[*] diff --git a/flang/test/Semantics/resolve09.f90 b/flang/test/Semantics/resolve09.f90 index 2fe21aebf66bd..3384b05bf8f27 100644 --- a/flang/test/Semantics/resolve09.f90 +++ b/flang/test/Semantics/resolve09.f90 @@ -140,11 +140,11 @@ subroutine s9 procedure(), nopass, pointer :: p1, p2 end type type(t) x + !ERROR: Function result characteristics are not known print *, x%p1() - call x%p2 - !ERROR: Cannot call function 'p1' like a subroutine - call x%p1 - !ERROR: Cannot call subroutine 'p2' like a function + call x%p2 ! ok + call x%p1 ! ok + !ERROR: Function result characteristics are not known print *, x%p2() end subroutine diff --git a/flang/test/Semantics/structconst12.f90 b/flang/test/Semantics/structconst12.f90 new file mode 100644 index 0000000000000..345016b236c8a --- /dev/null +++ b/flang/test/Semantics/structconst12.f90 @@ -0,0 +1,12 @@ +!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +!CHECK: TYPE(t) :: x = t(pp=f) +!CHECK-NOT: error: +interface + function f() + end +end interface +type t + procedure(f), nopass, pointer :: pp +end type +type(t) :: x = t(pp=f) +end diff --git a/flang/test/Semantics/val-tkr.f90 b/flang/test/Semantics/val-tkr.f90 new file mode 100644 index 0000000000000..bed41f3ed0569 --- /dev/null +++ b/flang/test/Semantics/val-tkr.f90 @@ -0,0 +1,22 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +implicit none +interface + subroutine s(b) + !dir$ ignore_tkr(tr) b + real, value :: b + end + subroutine s1(b) + !dir$ ignore_tkr(r) b + integer, value :: b + end +end interface +integer :: a(5), a1 +! forbid array to scalar with VALUE and ignore_tkr(r) +!ERROR: Array actual argument may not be associated with IGNORE_TKR(R) scalar dummy argument 'b=' with VALUE attribute +call s(a) +!ERROR: Array actual argument may not be associated with IGNORE_TKR(R) scalar dummy argument 'b=' with VALUE attribute +call s1(a) +! allow scalar to scalar with VALUE +call s(a1) +call s1(a(1)) +end diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 6f24b346e3fb9..b4eb15837d0a5 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -4,7 +4,7 @@ ! RUN: | FileCheck %s ! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s - + ! CHECK-LABEL: DO_CONCURRENT_BASIC program do_concurrent_basic ! 
CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>) diff --git a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 index 40f66c19718e8..95bfc236888d1 100644 --- a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 +++ b/flang/test/Transforms/DoConcurrent/map_shape_info.f90 @@ -28,7 +28,7 @@ end program do_concurrent_shape ! CHECK: omp.map.info ! CHECK: omp.map.info -! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info +! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_EXT]] : !fir.ref<index>, index) ! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref<index> {name = "_QFEa.extent.dim0"} @@ -77,9 +77,9 @@ end subroutine do_concurrent_shape_shift ! CHECK: omp.map.info ! CHECK: omp.map.info -! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info +! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info ! CHECK-SAME: var_ptr(%[[DIM0_STRT]] : !fir.ref<index>, index) -! CHECK-SAME: map_clauses(implicit) +! CHECK-SAME: map_clauses(implicit) ! CHECK-SAME: capture(ByCopy) -> !fir.ref<index> {name = "_QF{{.*}}Ea.start_idx.dim0"} ! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info diff --git a/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 index b467747293ace..07a3b5b62b5a5 100644 --- a/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 +++ b/flang/test/Transforms/DoConcurrent/use_loop_bounds_in_body.f90 @@ -14,7 +14,7 @@ subroutine foo(a, n) do concurrent (i=1:n) a(i) = n end do -end subroutine +end subroutine ! CHECK-LABEL: func.func @_QPfoo ! CHECK: omp.target diff --git a/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir new file mode 100644 index 0000000000000..d0fc5b7a2ee0b --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-copy-reduction.fir @@ -0,0 +1,134 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=true})" -split-input-file | FileCheck %s --check-prefix=COPY +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data{enable-implicit-reduction-copy=false})" -split-input-file | FileCheck %s --check-prefix=FIRSTPRIVATE + +// Test case: integer reduction in parallel loop +// This corresponds to Fortran code: +// integer :: r, i +// r = 0 +// !$acc parallel +// !$acc loop gang reduction(+:r) +// do i = 1, N +// r = r + 1 +// enddo +// !$acc end parallel + +acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +^bb0(%arg0: !fir.ref<i32>): + %c0_i32 = arith.constant 0 : i32 + %0 = fir.alloca i32 + %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %1 : !fir.ref<i32> + acc.yield %1 : !fir.ref<i32> +} combiner { +^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + %1 = fir.load %arg1 : !fir.ref<i32> + %2 = arith.addi %0, %1 : i32 + fir.store %2 to %arg0 : !fir.ref<i32> + acc.yield %arg0 : !fir.ref<i32> +} + +func.func @test_reduction_implicit_copy() { + %c1_i32 = arith.constant 1 : i32 + %cN = arith.constant 100 : i32 + %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"} + %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %r_decl = 
fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32> + %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32> + %c0_i32 = arith.constant 0 : i32 + fir.store %c0_i32 to %r_decl : !fir.ref<i32> + + acc.parallel { + %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"} + acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) { + fir.store %iv to %i_decl : !fir.ref<i32> + %cur_r = fir.load %red_var : !fir.ref<i32> + %new_r = arith.addi %cur_r, %c1_i32 : i32 + fir.store %new_r to %red_var : !fir.ref<i32> + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + acc.yield + } + return +} + +// When enable-implicit-reduction-copy=true: expect copyin/copyout for reduction variable +// COPY: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_reduction>, implicit = true, name = "r"} +// COPY: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<i32>) to varPtr({{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "r"} + +// When enable-implicit-reduction-copy=false: expect firstprivate for reduction variable +// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// FIRSTPRIVATE-NOT: acc.copyin +// FIRSTPRIVATE-NOT: acc.copyout + +// ----- + +// Test case: reduction variable used both in loop and outside (should be firstprivate) +// This corresponds to Fortran code: +// integer :: r = 0, i, out +// !$acc parallel num_gangs(1) +// !$acc loop reduction(+:r) copyout(out) +// do i = 1, N +// r = r + 1 +// enddo +// out = r +// !$acc end parallel + +acc.reduction.recipe @reduction_add_ref_i32 : !fir.ref<i32> reduction_operator <add> init { +^bb0(%arg0: !fir.ref<i32>): + %c0_i32 = arith.constant 0 : i32 + %0 = fir.alloca i32 + %1 = fir.declare %0 {uniq_name = "acc.reduction.init"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %1 : !fir.ref<i32> + acc.yield %1 : !fir.ref<i32> +} combiner { +^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + %1 = fir.load %arg1 : !fir.ref<i32> + %2 = arith.addi %0, %1 : i32 + fir.store %2 to %arg0 : !fir.ref<i32> + acc.yield %arg0 : !fir.ref<i32> +} + +func.func @test_reduction_with_usage_outside_loop() { + %c1_i32 = arith.constant 1 : i32 + %cN = arith.constant 100 : i32 + %c0_i32 = arith.constant 0 : i32 + + %r = fir.alloca i32 {bindc_name = "r", uniq_name = "_QFEr"} + %i = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} + %out = fir.alloca i32 {bindc_name = "out", uniq_name = "_QFEout"} + + %r_decl = fir.declare %r {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> !fir.ref<i32> + %i_decl = fir.declare %i {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> !fir.ref<i32> + %out_decl = fir.declare %out {uniq_name = "_QFEout"} : (!fir.ref<i32>) -> !fir.ref<i32> + fir.store %c0_i32 to %r_decl : !fir.ref<i32> + + %out_copyout = acc.create varPtr(%out_decl : !fir.ref<i32>) -> !fir.ref<i32> {dataClause = #acc<data_clause acc_copyout>, name = "out"} + acc.parallel dataOperands(%out_copyout : !fir.ref<i32>) { + %red_var = acc.reduction varPtr(%r_decl : !fir.ref<i32>) -> !fir.ref<i32> {name = "r"} + acc.loop reduction(@reduction_add_ref_i32 -> %red_var : !fir.ref<i32>) control(%iv : i32) = (%c1_i32 : i32) to (%cN : i32) step (%c1_i32 : i32) { + fir.store %iv to %i_decl : !fir.ref<i32> 
+ %cur_r = fir.load %red_var : !fir.ref<i32> + %new_r = arith.addi %cur_r, %c1_i32 : i32 + fir.store %new_r to %red_var : !fir.ref<i32> + acc.yield + } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]} + // out = r (usage of r outside the loop) + %final_r = fir.load %r_decl : !fir.ref<i32> + fir.store %final_r to %out_copyout : !fir.ref<i32> + acc.yield + } + acc.copyout accPtr(%out_copyout : !fir.ref<i32>) to varPtr(%out_decl : !fir.ref<i32>) {dataClause = #acc<data_clause acc_copyout>, name = "out"} + return +} + +// In this case, r should be firstprivate regardless of the flag setting because it is used outside the reduction context. +// COPY-LABEL: func.func @test_reduction_with_usage_outside_loop +// COPY: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// COPY-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r" + +// FIRSTPRIVATE-LABEL: func.func @test_reduction_with_usage_outside_loop +// FIRSTPRIVATE: acc.firstprivate varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "r"} +// FIRSTPRIVATE-NOT: acc.copyin varPtr({{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {{.*}} name = "r" + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 new file mode 100644 index 0000000000000..71e7d79b7260f --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data-derived-type-member.F90 @@ -0,0 +1,38 @@ +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s + +! This test exercises whether the ACCImplicitData pass inserts its new +! data operations in the appropriate position so that parents are copied in before +! their children.
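+! For example, for the copyin(d2%member0) clause below, the implicit copy +! of the parent 'd2' must be emitted before the explicit copyin of the +! member 'd2%member0'; the CHECK lines at the end of this file verify that +! ordering.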
+ +module types + type derivc8r4 + complex(8) :: member0 + real(4) :: member1 + end type derivc8r4 +end module +program test + use types + implicit none + type (derivc8r4) :: d2 + type (derivc8r4) :: d4 + integer(4) :: i0 + d2%member0 = 123 + !$acc serial copyin(d2%member0) copyout(d4%member0) + do i0 = 1, 1 + d4%member0 = d2%member0 + end do + !$acc end serial +end program + +!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"} +!CHECK: acc.copyin {{.*}} {name = "d2%member0"} +!CHECK: acc.copyin {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"} +!CHECK: acc.create {{.*}} {dataClause = #acc<data_clause acc_copyout>, name = "d4%member0"} +!CHECK: acc.delete {{.*}} {dataClause = #acc<data_clause acc_copyin>, name = "d2%member0"} +!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d2"} +!CHECK: acc.copyout {{.*}} {name = "d4%member0"} +!CHECK: acc.copyout {{.*}} {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "d4"} + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 new file mode 100644 index 0000000000000..228aba1b1164d --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data-fortran.F90 @@ -0,0 +1,79 @@ +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKHLFIR + +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-hlfir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKCSE + +!RUN: rm -rf %t && mkdir %t && cd %t && \ +!RUN: bbc %s -fopenacc -emit-fir -o - \ +!RUN: | fir-opt --pass-pipeline="builtin.module(cse,acc-initialize-fir-analyses,acc-implicit-data)" \ +!RUN: | FileCheck %s --check-prefix=CHECKCSE + +! This test uses bbc to generate both HLFIR and FIR. The intent is to +! exercise the acc implicit data pipeline and to ensure that the correct +! clauses are generated. It also runs CSE, which eliminates redundant +! interior pointer computations (and thus different live-ins are found).
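+! For example, without CSE each reference to aggrvar%field computes its own +! interior pointer, so the whole of 'aggrvar' is the live-in and is copied +! (see the CHECKHLFIR lines); with CSE the single hoisted interior pointer +! becomes the live-in, so only 'aggrvar%field' is copied (see the CHECKCSE +! lines).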
+ +program main + type aggr + real :: field + end type + type nested + type(aggr) :: outer + end type + type(aggr) :: aggrvar + type(nested) :: nestaggrvar + real :: scalarvar + real :: arrayvar(10) + complex :: scalarcomp + + aggrvar%field = 1 + scalarvar = aggrvar%field + nestaggrvar%outer%field = scalarvar + scalarcomp = scalarvar + arrayvar = real(scalarcomp) + arrayvar(2) = aggrvar%field + + !$acc kernels + arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2) + !$acc end kernels + + !$acc parallel + arrayvar = aggrvar%field + scalarvar + nestaggrvar%outer%field + real(scalarcomp) + arrayvar(2) + !$acc end parallel +end program + +!CHECKHLFIR-LABEL: @_QQmain +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +!CHECKHLFIR: acc.kernels +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKHLFIR-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>>) -> !fir.ref<!fir.type<_QFTnested{outer:!fir.type<_QFTaggr{field:f32}>}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar"} +!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"} +!CHECKHLFIR-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} +!CHECKHLFIR: acc.parallel + +!CHECKCSE-LABEL: @_QQmain +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarcomp"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = 
"nestaggrvar%outer%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"} +!CHECKCSE: acc.kernels +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "scalarcomp"} +!CHECKCSE-DAG: acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "nestaggrvar%outer%field"} +!CHECKCSE-DAG: acc.copyin varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar(2)"} +!CHECKCSE: acc.parallel + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data.fir b/flang/test/Transforms/OpenACC/acc-implicit-data.fir new file mode 100644 index 0000000000000..7f6a57cb4d8c6 --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-data.fir @@ -0,0 +1,358 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s + +// ----- + +func.func @test_fir_scalar_in_serial() { + %livein = fir.alloca i64 {bindc_name = "scalarvar"} + acc.serial { + %load = fir.load %livein : !fir.ref<i64> + acc.yield + } + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_kernels() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel_defaultnone() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue none>} + return +} + +// CHECK-NOT: acc.firstprivate + +// ----- + +func.func @test_fir_scalar_in_kernels_defaultnone() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue none>} + return +} + +// CHECK-NOT: acc.copyin + +// ----- + +func.func @test_fir_derivedtype_in_parallel() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + 
acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_derivedtype_in_kernels() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) to varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_array_in_parallel() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_array_in_kernels() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.terminator + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10xf32>>) to varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_derivedtype_in_parallel_defaultpresent() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> !fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_derivedtype_in_kernels_defaultpresent() { + %livein = fir.alloca !fir.type<_QFTaggr{field:f32}> {bindc_name = "aggrvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.type<_QFTaggr{field:f32}>> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) -> 
!fir.ref<!fir.type<_QFTaggr{field:f32}>> {implicit = true, name = "aggrvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.type<_QFTaggr{field:f32}>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "aggrvar"} + +// ----- + +func.func @test_fir_array_in_parallel_defaultpresent() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_array_in_kernels_defaultpresent() { + %livein = fir.alloca !fir.array<10xf32> {bindc_name = "arrayvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<!fir.array<10xf32>> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "arrayvar"} +// CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_present>, implicit = true, name = "arrayvar"} + +// ----- + +func.func @test_fir_scalar_in_parallel_defaultpresent() { + %livein = fir.alloca f32 {bindc_name = "scalarvar"} + acc.parallel { + %load = fir.load %livein : !fir.ref<f32> + acc.yield + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: acc.firstprivate varPtr({{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_scalar_in_kernels_defaultpresent() { + %livein = fir.alloca f64 {bindc_name = "scalarvar"} + acc.kernels { + %load = fir.load %livein : !fir.ref<f64> + acc.terminator + } attributes {defaultAttr = #acc<defaultvalue present>} + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<f64>) to varPtr({{.*}} : !fir.ref<f64>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "scalarvar"} + +// ----- + +func.func @test_fir_box_ref() { + %livein = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"} + acc.parallel { + %load = fir.load %livein : !fir.ref<!fir.box<!fir.array<?xi32>>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"} +// CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) to varPtr({{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"} + +// ----- + +func.func @test_fir_box_val() { + %desc = fir.alloca !fir.box<!fir.array<?xi32>> {bindc_name = "descriptor"} + %livein = fir.load %desc : !fir.ref<!fir.box<!fir.array<?xi32>>> + acc.parallel { + %addr = fir.box_addr %livein : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + acc.yield + } + return +} + +// CHECK: %[[COPYIN:.*]] = acc.copyin var({{.*}} : !fir.box<!fir.array<?xi32>>) -> 
!fir.box<!fir.array<?xi32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+// CHECK: acc.copyout accVar(%[[COPYIN]] : !fir.box<!fir.array<?xi32>>) to var({{.*}} : !fir.box<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "descriptor"}
+
+
+// -----
+
+// This test has an explicit data clause for the box, but the pointer held
+// inside the box is used in the region instead of the box itself. It tests
+// that implicit present is actually used.
+func.func @test_explicit_box_implicit_ptr() {
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %arr = fir.alloca !fir.array<10xf32> {bindc_name = "aa"}
+  %shape = fir.shape %c10 : (index) -> !fir.shape<1>
+  %arr_decl = fir.declare %arr(%shape) {uniq_name = "aa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xf32>>
+  %box = fir.embox %arr_decl(%shape) : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf32>>
+  %copyin = acc.copyin var(%box : !fir.box<!fir.array<10xf32>>) -> !fir.box<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+  acc.serial dataOperands(%copyin : !fir.box<!fir.array<10xf32>>) {
+    // Use the pointer, not the box
+    %elem = fir.array_coor %arr_decl(%shape) %c1 : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>, index) -> !fir.ref<f32>
+    acc.yield
+  }
+  acc.copyout accVar(%copyin : !fir.box<!fir.array<10xf32>>) to var(%box : !fir.box<!fir.array<10xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "aa"}
+  return
+}
+
+// CHECK: acc.present varPtr(%{{.*}} : !fir.ref<!fir.array<10xf32>>){{.*}}-> !fir.ref<!fir.array<10xf32>> {implicit = true, name = "aa"}
+
+// -----
+
+// This test uses an explicit-shape array with no data clause; it also has
+// an optimization where the pointer is used instead of the boxed entity.
+// It tests that the implicit data pass is able to recover the size despite
+// it not being encoded in the FIR type.
+// It was generated from the following Fortran source:
+// subroutine array(aa,nn)
+//   integer :: nn
+//   real :: aa(10:nn)
+//   !$acc kernels loop
+//   do ii = 10, nn
+//     aa(ii) = ii
+//   end do
+//   !$acc end kernels
+// end subroutine
+
+func.func @_QParray(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "aa"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nn"}) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10_i64 = arith.constant 10 : i64
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+  %4 = fir.convert %c10_i64 : (i64) -> index
+  %5 = fir.load %1 : !fir.ref<i32>
+  %6 = fir.convert %5 : (i32) -> i64
+  %7 = fir.convert %6 : (i64) -> index
+  %8 = arith.subi %7, %4 : index
+  %9 = arith.addi %8, %c1 : index
+  %10 = arith.cmpi sgt, %9, %c0 : index
+  %11 = arith.select %10, %9, %c0 : index
+  %12 = fir.shape_shift %4, %11 : (index, index) -> !fir.shapeshift<1>
+  %13 = fir.declare %arg0(%12) dummy_scope %0 {uniq_name = "_QFarrayEaa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, !fir.dscope) -> !fir.ref<!fir.array<?xf32>>
+  acc.kernels {
+    %elem = fir.array_coor %13(%12) %4 : (!fir.ref<!fir.array<?xf32>>, !fir.shapeshift<1>, index) -> !fir.ref<f32>
+    acc.terminator
+  }
+  return
+}
+
+// The following checks confirm that the acc.bounds operation is as expected.
+// Effectively, the extent needs to be max(0, nn - 10 + 1), the stride needs
+// to be 1, the adjusted lower bound is 0, and the actual language start
+// index is 10.
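+// As a worked example (taking nn = 15 purely for illustration): the extent
+// is max(0, 15 - 10 + 1) = 6, covering aa(10) through aa(15); the zero-based
+// upper bound would then be extent - 1 = 5, with stride 1 and startIdx 10.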
+// CHECK: %[[NN:.*]] = fir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFarrayEnn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> +// CHECK: %[[C10:.*]] = fir.convert %c10{{.*}} : (i64) -> index +// CHECK: %[[LOADEDNN:.*]] = fir.load %[[NN]] : !fir.ref<i32> +// CHECK: %[[CAST1:.*]] = fir.convert %[[LOADEDNN]] : (i32) -> i64 +// CHECK: %[[CAST2:.*]] = fir.convert %[[CAST1]] : (i64) -> index +// CHECK: %[[SUBI:.*]] = arith.subi %[[CAST2]], %[[C10]] : index +// CHECK: %[[ADDI:.*]] = arith.addi %[[SUBI]], %c1{{.*}} : index +// CHECK: %[[CMPI:.*]] = arith.cmpi sgt, %[[ADDI]], %c0{{.*}} : index +// CHECK: %[[SELECT:.*]] = arith.select %[[CMPI]], %[[ADDI]], %c0{{.*}} : index +// CHECK: %[[BOUNDS:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[SELECT]] : index) stride(%c1{{.*}} : index) startIdx(%[[C10]] : index) +// CHECK: acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xf32>> {dataClause = #acc<data_clause acc_copy>, implicit = true, name = "aa"} + +// ----- + +// Test to confirm that a copyin clause is not implicitly generated for deviceptr symbol. +func.func @test_deviceptr_no_implicit_copy() { + %c10 = arith.constant 10 : index + %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"} + %shape = fir.shape %c10 : (index) -> !fir.shape<1> + %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>> + %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"} + acc.parallel dataOperands(%devptr : !fir.box<!fir.array<10xf64>>) { + %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>> + acc.yield + } + return +} + +// CHECK-NOT: acc.copyin +// CHECK: acc.deviceptr + +// ----- + +// Test that acc.declare with deviceptr doesn't generate implicit copyin +func.func @test_acc_declare_deviceptr() { + %c10 = arith.constant 10 : index + %arr = fir.alloca !fir.array<10xf64> {bindc_name = "a"} + %shape = fir.shape %c10 : (index) -> !fir.shape<1> + %arr_box = fir.embox %arr(%shape) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>> + %devptr = acc.deviceptr var(%arr_box : !fir.box<!fir.array<10xf64>>) -> !fir.box<!fir.array<10xf64>> {name = "a"} + %token = acc.declare_enter dataOperands(%devptr : !fir.box<!fir.array<10xf64>>) + acc.parallel { + %elem = fir.box_addr %arr_box : (!fir.box<!fir.array<10xf64>>) -> !fir.ref<!fir.array<10xf64>> + acc.yield + } + acc.declare_exit token(%token) + return +} + +// CHECK-LABEL: func.func @test_acc_declare_deviceptr +// CHECK: acc.deviceptr +// CHECK-NOT: acc.copyin +// CHECK: acc.deviceptr + diff --git a/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir new file mode 100644 index 0000000000000..e4a7b8b18bc2a --- /dev/null +++ b/flang/test/Transforms/OpenACC/acc-implicit-firstprivate.fir @@ -0,0 +1,284 @@ +// RUN: fir-opt %s --pass-pipeline="builtin.module(acc-initialize-fir-analyses,acc-implicit-data)" -split-input-file | FileCheck %s + +// Test implicit firstprivate behavior for various scalar types in parallel and serial constructs. +// Scalars in parallel/serial constructs should be implicitly firstprivate according to OpenACC spec. 
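+//
+// As a hedged sketch (the variable name is illustrative, not taken from a
+// checked-in test), the Fortran these constructs correspond to looks like:
+//   real :: s
+//   !$acc parallel
+//   ... = s    ! s is implicitly firstprivate inside the region
+//   !$acc end parallel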
+ +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i32 : !fir.ref<i32> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i32>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i32 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i32> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i32>, %[[DST:.*]]: !fir.ref<i32>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i32> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i32> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i32_scalar_in_parallel +func.func @test_i32_scalar_in_parallel() { + %scalar = fir.alloca i32 {bindc_name = "i32_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i32_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i64 : !fir.ref<i64> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i64>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i64 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i64> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i64>, %[[DST:.*]]: !fir.ref<i64>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i64> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i64> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i64_scalar_in_parallel +func.func @test_i64_scalar_in_parallel() { + %scalar = fir.alloca i64 {bindc_name = "i64_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i64>) -> !fir.ref<i64> {implicit = true, name = "i64_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i64 -> %[[FIRSTPRIV]] : !fir.ref<i64>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f32 : !fir.ref<f32> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<f32>): +// CHECK: %[[ALLOC:.*]] = fir.alloca f32 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f32> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f32>, %[[DST:.*]]: !fir.ref<f32>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f32> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<f32> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_f32_scalar_in_parallel +func.func @test_f32_scalar_in_parallel() { + %scalar = fir.alloca f32 {bindc_name = "f32_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<f32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f32>) -> !fir.ref<f32> {implicit = true, name = "f32_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f32 -> %[[FIRSTPRIV]] : !fir.ref<f32>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_f64 : !fir.ref<f64> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<f64>): +// CHECK: %[[ALLOC:.*]] = fir.alloca f64 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<f64> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<f64>, %[[DST:.*]]: !fir.ref<f64>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<f64> +// CHECK: fir.store %[[LOADED]] to 
%[[DST]] : !fir.ref<f64> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_f64_scalar_in_parallel +func.func @test_f64_scalar_in_parallel() { + %scalar = fir.alloca f64 {bindc_name = "f64_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<f64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "f64_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_l32 : !fir.ref<!fir.logical<4>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.logical<4>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.logical<4> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<!fir.logical<4>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.logical<4>>, %[[DST:.*]]: !fir.ref<!fir.logical<4>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.logical<4>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<!fir.logical<4>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_logical_scalar_in_parallel +func.func @test_logical_scalar_in_parallel() { + %scalar = fir.alloca !fir.logical<4> {bindc_name = "logical_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<!fir.logical<4>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {implicit = true, name = "logical_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_l32 -> %[[FIRSTPRIV]] : !fir.ref<!fir.logical<4>>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z32 : !fir.ref<complex<f32>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f32>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f32> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f32>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f32>>, %[[DST:.*]]: !fir.ref<complex<f32>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f32>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f32>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_complex_scalar_in_parallel +func.func @test_complex_scalar_in_parallel() { + %scalar = fir.alloca complex<f32> {bindc_name = "complex_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<complex<f32>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {implicit = true, name = "complex_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z32 -> %[[FIRSTPRIV]] : !fir.ref<complex<f32>>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_z64 : !fir.ref<complex<f64>> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<complex<f64>>): +// CHECK: %[[ALLOC:.*]] = fir.alloca complex<f64> +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<complex<f64>> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<complex<f64>>, %[[DST:.*]]: !fir.ref<complex<f64>>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<complex<f64>> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<complex<f64>> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func 
@test_complex8_scalar_in_parallel +func.func @test_complex8_scalar_in_parallel() { + %scalar = fir.alloca complex<f64> {bindc_name = "complex8_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<complex<f64>> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<complex<f64>>) -> !fir.ref<complex<f64>> {implicit = true, name = "complex8_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_z64 -> %[[FIRSTPRIV]] : !fir.ref<complex<f64>>) + +// ----- + +// Test with serial construct + +// CHECK-LABEL: func.func @test_i32_scalar_in_serial +func.func @test_i32_scalar_in_serial() { + %scalar = fir.alloca i32 {bindc_name = "serial_i32_var"} + acc.serial { + %load = fir.load %scalar : !fir.ref<i32> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "serial_i32_var"} +// CHECK: acc.serial firstprivate(@firstprivatization_ref_i32 -> %[[FIRSTPRIV]] : !fir.ref<i32>) + +// ----- + +// Test with serial construct and f64 + +// CHECK-LABEL: func.func @test_f64_scalar_in_serial +func.func @test_f64_scalar_in_serial() { + %scalar = fir.alloca f64 {bindc_name = "serial_f64_var"} + acc.serial { + %load = fir.load %scalar : !fir.ref<f64> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<f64>) -> !fir.ref<f64> {implicit = true, name = "serial_f64_var"} +// CHECK: acc.serial firstprivate(@firstprivatization_ref_f64 -> %[[FIRSTPRIV]] : !fir.ref<f64>) + +// ----- + +// Test i8 and i16 scalar types + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i8 : !fir.ref<i8> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i8>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i8 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i8> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i8>, %[[DST:.*]]: !fir.ref<i8>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i8> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i8> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i8_scalar_in_parallel +func.func @test_i8_scalar_in_parallel() { + %scalar = fir.alloca i8 {bindc_name = "i8_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i8> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i8>) -> !fir.ref<i8> {implicit = true, name = "i8_var"} +// CHECK: acc.parallel firstprivate(@firstprivatization_ref_i8 -> %[[FIRSTPRIV]] : !fir.ref<i8>) + +// ----- + +// CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i16 : !fir.ref<i16> init { +// CHECK: ^bb0(%{{.*}}: !fir.ref<i16>): +// CHECK: %[[ALLOC:.*]] = fir.alloca i16 +// CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] +// CHECK: acc.yield %[[DECL]]#0 : !fir.ref<i16> +// CHECK: } copy { +// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<i16>, %[[DST:.*]]: !fir.ref<i16>): +// CHECK: %[[LOADED:.*]] = fir.load %[[SRC]] : !fir.ref<i16> +// CHECK: fir.store %[[LOADED]] to %[[DST]] : !fir.ref<i16> +// CHECK: acc.terminator +// CHECK: } + +// CHECK-LABEL: func.func @test_i16_scalar_in_parallel +func.func @test_i16_scalar_in_parallel() { + %scalar = fir.alloca i16 {bindc_name = "i16_var"} + acc.parallel { + %load = fir.load %scalar : !fir.ref<i16> + acc.yield + } + return +} + +// CHECK: %[[FIRSTPRIV:.*]] = acc.firstprivate varPtr(%{{.*}} : !fir.ref<i16>) -> !fir.ref<i16> {implicit = true, name = "i16_var"} +// CHECK: 
acc.parallel firstprivate(@firstprivatization_ref_i16 -> %[[FIRSTPRIV]] : !fir.ref<i16>) + diff --git a/flang/test/Transforms/debug-dummy-argument.fir b/flang/test/Transforms/debug-dummy-argument.fir index fb677e60abc1f..61862530f8396 100644 --- a/flang/test/Transforms/debug-dummy-argument.fir +++ b/flang/test/Transforms/debug-dummy-argument.fir @@ -23,7 +23,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i32 = dense<32> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, i8 = dense<8> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.mangling_mode" = "e", "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", fir.target_cpu = "x86-64", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang", llvm.target_triple = "x86_64-unknown-linux-gnu"} { func.func @test_(%arg0: !fir.ref<i32> {fir.bindc_name = "expected"} loc("debug-dummy-argument.f90":1:1), %arg1: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "x"} loc("debug-dummy-argument.f90":1:1)) attributes {fir.internal_name = "_QPtest"} { %0 = fir.undefined !fir.dscope loc(#loc1) - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFtestEexpected"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc3) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFtestEexpected"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc3) %2 = fir.is_present %arg1 : (!fir.box<!fir.array<?xi32>>) -> i1 loc(#loc4) cf.cond_br %2, ^bb5(%arg1 : !fir.box<!fir.array<?xi32>>), ^bb5(%arg1 : !fir.box<!fir.array<?xi32>>) loc(#loc4) ^bb5(%17: !fir.box<!fir.array<?xi32>> loc("debug-dummy-argument.f90":2:14)): // 2 preds: ^bb3, ^bb4 diff --git a/flang/test/Transforms/debug-local-var.fir b/flang/test/Transforms/debug-local-var.fir index d39017e6dd62a..863d86cb05948 100644 --- a/flang/test/Transforms/debug-local-var.fir +++ b/flang/test/Transforms/debug-local-var.fir @@ -22,9 +22,9 @@ module { } loc(#loc7) func.func private @_QFPfn1(%arg0: !fir.ref<i32> {fir.bindc_name = "a1"}, %arg1: !fir.ref<f64> {fir.bindc_name = "b1"}, %arg2: !fir.ref<!fir.logical<1>> {fir.bindc_name = "c1"}) -> i64 attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { %0 = fir.undefined !fir.dscope loc(#loc11) - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFFfn1Ea1"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc8) - %2 = fircg.ext_declare %arg1 dummy_scope %0 {uniq_name = "_QFFfn1Eb1"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64> loc(#loc9) - %3 = fircg.ext_declare %arg2 dummy_scope %0 {uniq_name = "_QFFfn1Ec1"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> !fir.ref<!fir.logical<1>> loc(#loc10) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFFfn1Ea1"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> loc(#loc8) + %2 = fircg.ext_declare %arg1 dummy_scope %0 arg 2 {uniq_name = "_QFFfn1Eb1"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64> loc(#loc9) + %3 = fircg.ext_declare %arg2 dummy_scope %0 arg 3 {uniq_name = "_QFFfn1Ec1"} : (!fir.ref<!fir.logical<1>>, !fir.dscope) -> !fir.ref<!fir.logical<1>> 
loc(#loc10) %4 = fir.alloca i64 {bindc_name = "res1", uniq_name = "_QFFfn1Eres1"} loc(#loc15) %5 = fircg.ext_declare %4 {uniq_name = "_QFFfn1Eres1"} : (!fir.ref<i64>) -> !fir.ref<i64> loc(#loc11) %6 = fir.load %1 : !fir.ref<i32> @@ -38,9 +38,9 @@ module { } loc(#loc12) func.func private @_QFPfn2(%arg0: !fir.ref<i64> {fir.bindc_name = "a2"}, %arg1: !fir.ref<f32> {fir.bindc_name = "b2"}, %arg2: !fir.ref<!fir.logical<4>> {fir.bindc_name = "c2"}) -> i32 attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage<internal>} { %0 = fir.undefined !fir.dscope - %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFFfn2Ea2"} : (!fir.ref<i64>, !fir.dscope) -> !fir.ref<i64> loc(#loc13) - %2 = fircg.ext_declare %arg1 dummy_scope %0 {uniq_name = "_QFFfn2Eb2"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32> loc(#loc14) - %3 = fircg.ext_declare %arg2 dummy_scope %0 {uniq_name = "_QFFfn2Ec2"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>> loc(#loc15) + %1 = fircg.ext_declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFFfn2Ea2"} : (!fir.ref<i64>, !fir.dscope) -> !fir.ref<i64> loc(#loc13) + %2 = fircg.ext_declare %arg1 dummy_scope %0 arg 2 {uniq_name = "_QFFfn2Eb2"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32> loc(#loc14) + %3 = fircg.ext_declare %arg2 dummy_scope %0 arg 3 {uniq_name = "_QFFfn2Ec2"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> !fir.ref<!fir.logical<4>> loc(#loc15) %4 = fir.alloca i32 {bindc_name = "res2", uniq_name = "_QFFfn2Eres2"} %5 = fircg.ext_declare %4 {uniq_name = "_QFFfn2Eres2"} : (!fir.ref<i32>) -> !fir.ref<i32> loc(#loc16) %6 = fir.load %1 : !fir.ref<i64> diff --git a/flang/test/Transforms/debug-proc-ptr.fir b/flang/test/Transforms/debug-proc-ptr.fir new file mode 100644 index 0000000000000..2963557786907 --- /dev/null +++ b/flang/test/Transforms/debug-proc-ptr.fir @@ -0,0 +1,41 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module { + func.func @_QQmain() attributes {fir.bindc_name = "test"} { + %0 = fir.alloca (!fir.ref<i32>) -> i32 {bindc_name = "fun_ptr", uniq_name = "_QFEfun_ptr"} + %1 = fircg.ext_declare %0 {uniq_name = "_QFEfun_ptr"} : (!fir.ref<(!fir.ref<i32>) -> i32>) -> !fir.ref<(!fir.ref<i32>) -> i32> loc(#loc1) + + // Procedure pointer with no return: procedure(sub1), pointer :: sub_ptr + %2 = fir.alloca () -> () {bindc_name = "sub_ptr", uniq_name = "_QFEsub_ptr"} + %3 = fircg.ext_declare %2 {uniq_name = "_QFEsub_ptr"} : (!fir.ref<() -> ()>) -> !fir.ref<() -> ()> loc(#loc2) + + // Procedure pointer with multiple args: procedure(func2), pointer :: func_ptr + %4 = fir.alloca (!fir.ref<i32>, !fir.ref<f64>) -> f32 {bindc_name = "func_ptr", uniq_name = "_QFEfunc_ptr"} + %5 = fircg.ext_declare %4 {uniq_name = "_QFEfunc_ptr"} : (!fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32>) -> !fir.ref<(!fir.ref<i32>, !fir.ref<f64>) -> f32> loc(#loc3) + + return + } loc(#loc) +} +#loc = loc("test.f90":1:1) +#loc1 = loc("test.f90":2:30) +#loc2 = loc("test.f90":3:30) +#loc3 = loc("test.f90":4:30) + +// CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed> +// CHECK-DAG: #[[REAL32:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float> +// CHECK-DAG: #[[REAL:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real(kind=8)", sizeInBits = 64, encoding = DW_ATE_float> + +// CHECK-DAG: #[[PTR_INT:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[INT]]{{.*}}> +// 
CHECK-DAG: #[[PTR_REAL:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[REAL]]{{.*}}> + +// CHECK-DAG: #[[SUB1:.*]] = #llvm.di_subroutine_type<types = #[[INT]], #[[PTR_INT]]> +// CHECK-DAG: #[[PTR_SUB1:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB1]]{{.*}}> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "fun_ptr"{{.*}}type = #[[PTR_SUB1]]{{.*}}> + +// CHECK-DAG: #di_subroutine_type{{.*}} = #llvm.di_subroutine_type<types = #di_null_type> +// CHECK-DAG: #di_local_variable{{.*}} = #llvm.di_local_variable<{{.*}}name = "sub_ptr"{{.*}}type = #di_derived_type{{.*}}> +// CHECK-DAG: #di_derived_type{{.*}} = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #di_subroutine_type{{.*}}{{.*}}> + +// CHECK-DAG: #[[SUB3:.*]] = #llvm.di_subroutine_type<types = #[[REAL32]], #[[PTR_INT]], #[[PTR_REAL]]> +// CHECK-DAG: #[[PTR_SUB3:.*]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type{{.*}}baseType = #[[SUB3]]{{.*}}> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "func_ptr"{{.*}}type = #[[PTR_SUB3]]{{.*}}> diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir index b30a2fc4e9a80..5b0fd9f23d63d 100644 --- a/flang/test/Transforms/omp-map-info-finalization.fir +++ b/flang/test/Transforms/omp-map-info-finalization.fir @@ -381,11 +381,10 @@ func.func @_QPrealtest(%arg0: !fir.boxchar<1>) { // CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[VAL_4]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) // CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_8]]#1, %[[VAL_7]] : index // CHECK: %[[VAL_10:.*]] = omp.map.bounds lower_bound(%[[VAL_6]] : index) upper_bound(%[[VAL_9]] : index) extent(%[[VAL_8]]#1 : index) stride(%[[VAL_7]] : index) start_idx(%[[VAL_6]] : index) {stride_in_bytes = true} -// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.boxchar<1>> // CHECK: %[[VAL_12:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.boxchar<1>>) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> -// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_12]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_10]]) -> !fir.ref<!fir.boxchar<1>> -// CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_13]] : [0] : !fir.ref<!fir.boxchar<1>>) -> !fir.ref<!fir.boxchar<1>> -// CHECK: omp.target map_entries(%[[VAL_14]] -> %[[VAL_15:.*]], %[[VAL_13]] -> %[[VAL_16:.*]] : !fir.ref<!fir.boxchar<1>>, !fir.ref<!fir.boxchar<1>>) private(@boxchar.privatizer %[[VAL_3]]#0 -> %[[VAL_17:.*]] [map_idx=0] : !fir.boxchar<1>) { +// CHECK: %[[VAL_13:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.char<1,?>) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_12]] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) bounds(%[[VAL_10]]) -> !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>> +// CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.boxchar<1>>, !fir.boxchar<1>) map_clauses(to) capture(ByRef) members(%[[VAL_13]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) -> !fir.ref<!fir.boxchar<1>> +// CHECK: omp.target map_entries(%[[VAL_14]] -> %[[VAL_15:.*]], %[[VAL_13]] -> %[[VAL_16:.*]] : !fir.ref<!fir.boxchar<1>>, !fir.llvm_ptr<!fir.ref<!fir.char<1,?>>>) private(@boxchar.privatizer %[[VAL_3]]#0 -> %[[VAL_17:.*]] [map_idx=0] : !fir.boxchar<1>) { // CHECK: %[[VAL_18:.*]]:2 = 
fir.unboxchar %[[VAL_17]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) // CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]]#0 typeparams %[[VAL_18]]#1 {uniq_name = "tgt_a0"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>) // CHECK: omp.terminator diff --git a/flang/test/Transforms/omp-maps-for-privatized-symbols.fir b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir index 10a76126ed054..6054c70a2700d 100644 --- a/flang/test/Transforms/omp-maps-for-privatized-symbols.fir +++ b/flang/test/Transforms/omp-maps-for-privatized-symbols.fir @@ -6,7 +6,12 @@ module attributes {omp.is_target_device = false} { // extract box address, see if it is null, etc omp.yield(%arg1: !fir.ref<!fir.box<!fir.heap<i32>>>) } - + omp.private {type = firstprivate} @_QFtarget_simpleEfp_int_firstprivate_i32 : i32 copy { + ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<i32>): + %0 = fir.load %arg0 : !fir.ref<i32> + hlfir.assign %0 to %arg1 : i32, !fir.ref<i32> + omp.yield(%arg1 : !fir.ref<i32>) + } func.func @_QPtarget_simple() { %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFtarget_simpleEa"} %1:2 = hlfir.declare %0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) @@ -15,34 +20,18 @@ module attributes {omp.is_target_device = false} { %4 = fir.embox %3 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> fir.store %4 to %2 : !fir.ref<!fir.box<!fir.heap<i32>>> %5:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) + %6 = fir.alloca i32 {bindc_name = "fp_int", uniq_name = "_QFtarget_simpleEfp_int"} + %7:2 = hlfir.declare %6 {uniq_name = "_QFtarget_simpleEfp_int"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) %c2_i32 = arith.constant 2 : i32 hlfir.assign %c2_i32 to %1#0 : i32, !fir.ref<i32> - %6 = omp.map.info var_ptr(%1#1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} - omp.target map_entries(%6 -> %arg0 : !fir.ref<i32>) private(@_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 %5#0 -> %arg1 : !fir.ref<!fir.box<!fir.heap<i32>>>) { - %11:2 = hlfir.declare %arg0 {uniq_name = "_QFtarget_simpleEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) - %12:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtarget_simpleEsimple_var"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>) - %c10_i32 = arith.constant 10 : i32 - %13 = fir.load %11#0 : !fir.ref<i32> - %14 = arith.addi %c10_i32, %13 : i32 - hlfir.assign %14 to %12#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>> + %8 = omp.map.info var_ptr(%1#1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} + omp.target map_entries(%8 -> %arg0 : !fir.ref<i32>) private(@_QFtarget_simpleEsimple_var_private_ref_box_heap_i32 %5#0 -> %arg1, @_QFtarget_simpleEfp_int_firstprivate_i32 %7#0 -> %arg2 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>) { omp.terminator } - %7 = fir.load %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - %8 = fir.box_addr %7 : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32> - %9 = fir.convert %8 : (!fir.heap<i32>) -> i64 - %c0_i64 = arith.constant 0 : i64 - %10 = arith.cmpi ne, %9, %c0_i64 : i64 - fir.if %10 { - %11 = fir.load %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - %12 = fir.box_addr %11 : (!fir.box<!fir.heap<i32>>) -> 
!fir.heap<i32> - fir.freemem %12 : !fir.heap<i32> - %13 = fir.zero_bits !fir.heap<i32> - %14 = fir.embox %13 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>> - fir.store %14 to %5#1 : !fir.ref<!fir.box<!fir.heap<i32>>> - } return } } // CHECK: %[[MAP0:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) -> !fir.ref<i32> {name = "a"} -// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>> -// CHECK: omp.target map_entries(%[[MAP0]] -> %arg0, %[[MAP1]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.box<!fir.heap<i32>>>) +// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<i32>>> +// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<i32>, i32) map_clauses(to) capture(ByCopy) -> !fir.ref<i32> +// CHECK: omp.target map_entries(%[[MAP0]] -> %arg0, %[[MAP1]] -> %arg1, %[[MAP2]] -> %arg2 : !fir.ref<i32>, !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>) diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 4a417ed981ab1..25fc73153003a 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -3,13 +3,17 @@ // Simplest transformation func.func @simple() { %0 = fir.allocmem !fir.array<42xi32> + %c0_s = arith.constant 0 : index + %c0_i32_s = arith.constant 0 : i32 + %ref_s = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_s to %elt_s : !fir.ref<i32> fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @simple() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @simple() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: return // Check fir.must_be_heap allocations are not moved func.func @must_be_heap() { @@ -17,7 +21,7 @@ func.func @must_be_heap() { fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @must_be_heap() { +// CHECK-LABEL: func.func @must_be_heap() // CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: return @@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { } return } -// CHECK: func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) { +// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) // CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : index // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"} // CHECK-NEXT: %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref<!fir.logical<4>> @@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) { } return } -// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-LABEL: func.func @dfa2(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> // CHECK-NEXT: scf.if %arg0 { // CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap<!fir.array<1xi8>> @@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) { } else { fir.freemem %a : !fir.heap<!fir.array<1xi8>> } + %c0_d3 = arith.constant 0 : index + %c0_i8_d3 = arith.constant 0 : i8 + %ref_d3 = fir.convert %a : 
(!fir.heap<!fir.array<1xi8>>) -> !fir.ref<!fir.array<1xi8>> + %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref<!fir.array<1xi8>>, index) -> !fir.ref<i8> + fir.store %c0_i8_d3 to %elt_d3 : !fir.ref<i8> return } -// CHECK: func.func @dfa3(%arg0: i1) { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> -// CHECK-NEXT: fir.if %arg0 { -// CHECK-NEXT: } else { -// CHECK-NEXT: } -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @dfa3(%arg0: i1) +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> +// CHECK: return func.func private @dfa3a_foo(!fir.ref<!fir.array<1xi8>>) -> () func.func private @dfa3a_bar(!fir.ref<!fir.array<1xi8>>) -> () @@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) { } return } -// CHECK: func.func @dfa3a(%arg0: i1) { +// CHECK-LABEL: func.func @dfa3a(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> // CHECK-NEXT: %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<1xi8>>) -> !fir.heap<!fir.array<1xi8>> // CHECK-NEXT: fir.if %arg0 { @@ -123,13 +128,18 @@ func.func @placement1() { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref1 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt1 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> return } -// CHECK: func.func @placement1() { +// CHECK-LABEL: func.func @placement1() // CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[ARG]] -// CHECK-NEXT: return +// CHECK: return // CHECK-NEXT: } // check that if there are no operands, then the alloca is placed early @@ -140,16 +150,21 @@ func.func @placement2() { %3 = arith.addi %1, %2 : index %4 = fir.allocmem !fir.array<42xi32> // ... + %c0_p2 = arith.constant 0 : index + %c0_i32_p2 = arith.constant 0 : i32 + %ref_p2 = fir.convert %4 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_p2 to %elt_p2 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @placement2() { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index -// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index -// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK-LABEL: func.func @placement2() +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK: return +// CHECK: } // check that stack allocations which must be placed in loops use stacksave func.func @placement3() { @@ -162,12 +177,17 @@ func.func @placement3() { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... 
+ %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref2 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt2 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> fir.result %3, %c1_i32 : index, i32 } return } -// CHECK: func.func @placement3() { +// CHECK-LABEL: func.func @placement3() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -176,7 +196,7 @@ func.func @placement3() { // CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: fir.result // CHECK-NEXT: } // CHECK-NEXT: return @@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) { // operand is now available %4 = fir.allocmem !fir.array<?xi32>, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref3 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> + %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32 to %elt3 : !fir.ref<i32> fir.freemem %4 : !fir.heap<!fir.array<?xi32>> cf.cond_br %arg0, ^bb1, ^bb2 ^bb2: return } -// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-LABEL: func.func @placement4(%arg0: i1) // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index @@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) { // CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[C3]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 // CHECK-NEXT: ^bb2: // CHECK-NEXT: return @@ -230,7 +255,7 @@ func.func @placement5() { } return } -// CHECK: func.func @placement5() { +// CHECK-LABEL: func.func @placement5() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) { fir.freemem %4 : !fir.heap<!fir.array<?xi32>> cf.br ^bb1 } -// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-LABEL: func.func @placement6(%arg0: i1) // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index @@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) { // Check multiple returns, where the memory is always freed func.func @returns(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret = arith.constant 0 : index + %c0_i32_ret = arith.constant 0 : i32 + %ref_ret = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_ret to %elt_ret : !fir.ref<i32> cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : 
!fir.heap<!fir.array<42xi32>> @@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) { fir.freemem %0 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @returns(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns( +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: return // CHECK-NEXT: ^bb2: @@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) { // Check multiple returns, where the memory is not freed on one branch func.func @returns2(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret2 = arith.constant 0 : index + %c0_i32_ret2 = arith.constant 0 : i32 + %ref_ret2 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref<i32> cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap<!fir.array<42xi32>> @@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) { ^bb2: return } -// CHECK: func.func @returns2(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns2( +// CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: return @@ -338,7 +373,7 @@ func.func @omp_placement1() { } return } -// CHECK: func.func @omp_placement1() { +// CHECK-LABEL: func.func @omp_placement1() // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>> // CHECK-NEXT: omp.sections { @@ -353,19 +388,21 @@ func.func @omp_placement1() { // function terminated by stop statement func.func @stop_terminator() { %0 = fir.allocmem !fir.array<42xi32> + %c0 = arith.constant 0 : index + %c0_i32_st = arith.constant 0 : i32 + %ref4 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>> + %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32> + fir.store %c0_i32_st to %elt4 : !fir.ref<i32> fir.freemem %0 : !fir.heap<!fir.array<42xi32>> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } -// CHECK: func.func @stop_terminator() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 -// CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () -// CHECK-NEXT: fir.unreachable -// CHECK-NEXT: } +// CHECK-LABEL: func.func @stop_terminator() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: fir.call @_FortranAStopStatement( +// CHECK: fir.unreachable // check that stack allocations that use fir.declare which must be placed in loops @@ -387,7 +424,7 @@ func.func @placement_loop_declare() { } return } -// CHECK: func.func @placement_loop_declare() { +// CHECK-LABEL: func.func @placement_loop_declare() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = 
arith.constant 2 : index @@ -415,7 +452,7 @@ func.func @lookthrough() { fir.freemem %4 : !fir.heap<!fir.array<42xi32>> return } -// CHECK: func.func @lookthrough() { +// CHECK-LABEL: func.func @lookthrough() // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem @@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() { ^bb3: // pred: ^bb1 return } -// CHECK: func.func @finding_freemem_in_block() { +// CHECK-LABEL: func.func @finding_freemem_in_block() // CHECK: fir.alloca !fir.array<?xi32> // CHECK-NOT: fir.freemem diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index 9a80e3b1a9aee..072aee5ba269f 100644 --- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -100,6 +100,10 @@ struct TestFIROpenACCInterfaces } } + llvm::errs() << "\t\tHas unknown dimensions: " + << (mappableTy.hasUnknownDimensions() ? "true" : "false") + << "\n"; + if (auto declareOp = dyn_cast_if_present<hlfir::DeclareOp>(var.getDefiningOp())) { llvm::errs() << "\t\tShape: " << declareOp.getShape() << "\n"; diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index b5d6727025121..4dfc0d40cd55d 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(flang clang_target_link_libraries(flang PRIVATE clangDriver + clangOptions clangBasic ) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index bd878b7a642f1..0840255a739f3 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef<const char *> argv) { // Any errors that would be diagnosed here will also be diagnosed later, // when the DiagnosticsEngine actually exists. 
unsigned missingArgIndex, missingArgCount; - llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs( + llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs( argv.slice(1), missingArgIndex, missingArgCount, - llvm::opt::Visibility(clang::driver::options::FlangOption)); + llvm::opt::Visibility(clang::options::FlangOption)); (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 57a04dccef7f7..7b363590cdea1 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -51,7 +51,8 @@ TEST_F(FortranVariableTest, SimpleScalar) { /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -78,7 +79,8 @@ TEST_F(FortranVariableTest, CharacterScalar) { /*shape=*/mlir::Value{}, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_FALSE(fortranVariable.isArray()); @@ -110,7 +112,8 @@ TEST_F(FortranVariableTest, SimpleArray) { shape, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); @@ -142,7 +145,8 @@ TEST_F(FortranVariableTest, CharacterArray) { shape, typeParams, /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, - /*data_attr=*/cuf::DataAttributeAttr{}); + /*data_attr=*/cuf::DataAttributeAttr{}, + /*dummy_arg_no=*/mlir::IntegerAttr{}); fir::FortranVariableOpInterface fortranVariable = declare; EXPECT_TRUE(fortranVariable.isArray()); diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index ae555a256ba66..4e6b4195a9c5e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL "Whether or not to place libc in a build directory findable by a just built clang") -set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers") - # Defining a global namespace to enclose all libc functions. set(default_namespace "__llvm_libc") if(LLVM_VERSION_MAJOR) @@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." 
OFF) +if(LIBC_TARGET_OS_IS_LINUX) + set(kernel_headers "/usr/include") +endif() +set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers") + set(LIBC_ENABLE_UNITTESTS ON) set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD}) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 60f522d7d8c65..c17cc106f96d7 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -19,6 +19,8 @@ set(LLVM_LINK_COMPONENTS # Add Unit Testing Support #============================================================================== +make_gtest_target() + function(add_libc_benchmark_unittest target_name) if(NOT LLVM_INCLUDE_TESTS) return() @@ -38,8 +40,8 @@ function(add_libc_benchmark_unittest target_name) ) target_link_libraries(${target_name} PRIVATE - llvm_gtest_main - llvm_gtest + default_gtest_main + default_gtest ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} ) llvm_update_compile_flags(${target_name}) diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake index d4103f8a5a23f..6c730f807de6d 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -94,17 +94,6 @@ if(NOT libc_compiler_target_info) endif() string(STRIP ${libc_compiler_target_info} libc_compiler_target_info) string(SUBSTRING ${libc_compiler_target_info} 8 -1 libc_compiler_triple) -get_arch_and_system_from_triple(${libc_compiler_triple} - compiler_arch compiler_sys) -if(NOT compiler_arch) - message(FATAL_ERROR - "libc build: Invalid or unknown libc compiler target triple: " - "${libc_compiler_triple}") -endif() - -set(LIBC_TARGET_ARCHITECTURE ${compiler_arch}) -set(LIBC_TARGET_OS ${compiler_sys}) -set(LIBC_CROSSBUILD FALSE) # One should not set LLVM_RUNTIMES_TARGET and LIBC_TARGET_TRIPLE if(LLVM_RUNTIMES_TARGET AND LIBC_TARGET_TRIPLE) @@ -128,12 +117,40 @@ endif() # architecture. if(explicit_target_triple) get_arch_and_system_from_triple(${explicit_target_triple} libc_arch libc_sys) - if(NOT libc_arch) + if(NOT libc_arch OR NOT libc_sys) message(FATAL_ERROR "libc build: Invalid or unknown triple: ${explicit_target_triple}") endif() set(LIBC_TARGET_ARCHITECTURE ${libc_arch}) set(LIBC_TARGET_OS ${libc_sys}) + # If the compiler target triple is not the same as the triple specified by + # LIBC_TARGET_TRIPLE or LLVM_RUNTIMES_TARGET, we will add a --target option + # if the compiler is clang. If the compiler is GCC we just error out as there + # is no equivalent of an option like --target. 
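+  # As an illustration (hypothetical triples, for exposition only): with a host clang +  # defaulting to x86_64-unknown-linux-gnu and -DLIBC_TARGET_TRIPLE set to +  # aarch64-unknown-linux-gnu, the branch below marks the build as a cross build and +  # appends --target=aarch64-unknown-linux-gnu to LIBC_COMPILE_OPTIONS_DEFAULT.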
+ if(NOT libc_compiler_triple STREQUAL explicit_target_triple) + set(LIBC_CROSSBUILD TRUE) + if(CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR + "GCC target triple (${libc_compiler_triple}) and the explicitly " + "specified target triple (${explicit_target_triple}) do not match.") + else() + list(APPEND + LIBC_COMPILE_OPTIONS_DEFAULT "--target=${explicit_target_triple}") + endif() + else() + set(LIBC_CROSSBUILD FALSE) + endif() +else() + get_arch_and_system_from_triple(${libc_compiler_triple} + compiler_arch compiler_sys) + if(NOT compiler_arch OR NOT compiler_sys) + message(FATAL_ERROR + "libc build: Unknown compiler default target triple: " + "${libc_compiler_triple}") + endif() + set(LIBC_TARGET_ARCHITECTURE ${compiler_arch}) + set(LIBC_TARGET_OS ${compiler_sys}) + set(LIBC_CROSSBUILD FALSE) endif() if((LIBC_TARGET_OS STREQUAL "unknown") OR (LIBC_TARGET_OS STREQUAL "none")) @@ -198,31 +215,11 @@ else() "Unsupported libc target operating system ${LIBC_TARGET_OS}") endif() - -# If the compiler target triple is not the same as the triple specified by -# LIBC_TARGET_TRIPLE or LLVM_RUNTIMES_TARGET, we will add a --target option -# if the compiler is clang. If the compiler is GCC we just error out as there -# is no equivalent of an option like --target. -if(explicit_target_triple AND - (NOT (libc_compiler_triple STREQUAL explicit_target_triple))) - set(LIBC_CROSSBUILD TRUE) - if(CMAKE_COMPILER_IS_GNUCXX) - message(FATAL_ERROR - "GCC target triple (${libc_compiler_triple}) and the explicity " - "specified target triple (${explicit_target_triple}) do not match.") - else() - list(APPEND - LIBC_COMPILE_OPTIONS_DEFAULT "--target=${explicit_target_triple}") - endif() -endif() - - # Windows does not support full mode build. if (LIBC_TARGET_OS_IS_WINDOWS AND LLVM_LIBC_FULL_BUILD) message(FATAL_ERROR "Windows does not support full mode build.") endif () - message(STATUS - "Building libc for ${LIBC_TARGET_ARCHITECTURE} on ${LIBC_TARGET_OS} with - LIBC_COMPILE_OPTIONS_DEFAULT: ${LIBC_COMPILE_OPTIONS_DEFAULT}") + "Building libc for ${LIBC_TARGET_ARCHITECTURE} on ${LIBC_TARGET_OS} with " + "LIBC_COMPILE_OPTIONS_DEFAULT: ${LIBC_COMPILE_OPTIONS_DEFAULT}") diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake index c09d4751d3907..d76f3b16b30ec 100644 --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -9,7 +9,7 @@ if(LIBC_TARGET_ARCHITECTURE_IS_X86_64) set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX AVX2 AVX512F AVX512BW FMA) set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) elseif(LIBC_TARGET_ARCHITECTURE_IS_AARCH64) - set(ALL_CPU_FEATURES "FullFP16") + set(ALL_CPU_FEATURES FullFP16 MOPS SVE SVE2) set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native) endif() diff --git a/libc/cmake/modules/cpu_features/check_MOPS.cpp b/libc/cmake/modules/cpu_features/check_MOPS.cpp new file mode 100644 index 0000000000000..314fe9b38bc81 --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_MOPS.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_MOPS +#error unsupported +#endif diff --git a/libc/cmake/modules/cpu_features/check_SVE.cpp b/libc/cmake/modules/cpu_features/check_SVE.cpp new file mode 100644 index 0000000000000..725f42f6eb883 --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_SVE.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_SVE +#error unsupported +#endif diff --git 
a/libc/cmake/modules/cpu_features/check_SVE2.cpp b/libc/cmake/modules/cpu_features/check_SVE2.cpp new file mode 100644 index 0000000000000..37f4b4fa038bb --- /dev/null +++ b/libc/cmake/modules/cpu_features/check_SVE2.cpp @@ -0,0 +1,5 @@ +#include "src/__support/macros/properties/cpu_features.h" + +#ifndef LIBC_TARGET_CPU_HAS_SVE2 +#error unsupported +#endif diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 935c95af0d4af..c69ab3d0bb37c 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -269,9 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r - # TODO: Re-enable these when tests aren't broken. - # libc.src.time.localtime - # libc.src.time.localtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l @@ -321,8 +320,10 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 82e257c1d2b0d..c566f8ad08c8e 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -269,6 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l @@ -321,8 +323,10 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index c10cc1162cc5a..a6aef96e91698 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -269,6 +269,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.difftime libc.src.time.gmtime libc.src.time.gmtime_r + libc.src.time.localtime + libc.src.time.localtime_r libc.src.time.mktime libc.src.time.strftime libc.src.time.strftime_l diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 714120a79e39a..42571862b24b2 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -325,9 +325,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve - # Disabled while SYS_faccessat2 is unavailable on the buildbot. 
- # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -1144,6 +1144,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..b62a46b7178d5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync @@ -1272,6 +1273,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..8a46a7a1baae3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -333,6 +334,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -344,6 +346,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link @@ -1311,6 +1314,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) check_symbol_exists(SYS_faccessat2 
"sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) diff --git a/libc/docs/headers/time.rst b/libc/docs/headers/time.rst index 55bc1a17ee285..f07e0d93a4ce6 100644 --- a/libc/docs/headers/time.rst +++ b/libc/docs/headers/time.rst @@ -67,11 +67,11 @@ Implementation Status +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_getres | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_gettime | |check| | |check| | | |check| | | | | | | | | | | +| clock_gettime | |check| | |check| | | |check| | | | | | | | | |check| | |check| | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_nanosleep | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_settime | | | | | | | | | | | | | | +| clock_settime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ctime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp index 7b7985a83c3e6..0b400cb156491 100644 --- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp +++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp @@ -24,7 +24,7 @@ asm(R"( _end: .fill 1024 __llvm_libc_heap_limit: -)"; +)"); using LIBC_NAMESPACE::FreeListHeap; using LIBC_NAMESPACE::inline_memset; diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt index efda80b59c951..0918e92552ea7 100644 --- a/libc/fuzzing/string/CMakeLists.txt +++ b/libc/fuzzing/string/CMakeLists.txt @@ -40,3 +40,11 @@ add_libc_fuzzer( DEPENDS libc.src.strings.bcmp ) + +add_libc_fuzzer( + strlen_fuzz + SRCS + strlen_fuzz.cpp + DEPENDS + libc.src.string.strlen +) diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp new file mode 100644 index 0000000000000..dd72c19b7fdc7 --- /dev/null +++ b/libc/fuzzing/string/strlen_fuzz.cpp @@ -0,0 +1,32 @@ +//===-- strlen_fuzz.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc strlen implementation. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/string/strlen.h" +#include <cstdint> +#include <cstring> + +// Always null-terminate the data so the strlen calls below stay in bounds. +extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size); +extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size, + size_t max_size, unsigned int seed) { + size = LLVMFuzzerMutate(data, size, max_size); + // Guard against an empty mutation before writing the terminator. + if (size == 0) + return 0; + data[size - 1] = '\0'; + return size; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + size_t ref = ::strlen(reinterpret_cast<const char *>(data)); + size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data)); + if (ref != impl) + __builtin_trap(); + return 0; +} diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 6697ce5b03851..e1b12e3010fe9 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -42,14 +42,37 @@ #define FP_LLOGBNAN LONG_MAX #endif -#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) -#define math_errhandling 0 -#elif defined(__NO_MATH_ERRNO__) -#define math_errhandling (MATH_ERREXCEPT) +// Math error handling. Target support is assumed to exist unless +// explicitly disabled. 
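+// Two illustrative outcomes of the conditions below (hypothetical +// configurations, derived from these macros): -ffast-math defines +// __FAST_MATH__, which zeroes both support macros, so math_errhandling +// expands to 0; a soft-float Arm build (no __ARM_FP) keeps errno reporting +// but not exception flags, so it expands to (MATH_ERRNO).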
+#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) || \ + defined(__NO_MATH_ERRNO__) +#define __LIBC_SUPPORTS_MATH_ERRNO 0 +#else +#define __LIBC_SUPPORTS_MATH_ERRNO 1 +#endif + +#if defined(__FAST_MATH__) || \ + ((defined(__arm__) || defined(_M_ARM) || defined(__thumb__) || \ + defined(__aarch64__) || defined(_M_ARM64)) && \ + !defined(__ARM_FP)) +#define __LIBC_SUPPORTS_MATH_ERREXCEPT 0 #else +#define __LIBC_SUPPORTS_MATH_ERREXCEPT 1 +#endif + +#if __LIBC_SUPPORTS_MATH_ERRNO && __LIBC_SUPPORTS_MATH_ERREXCEPT #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) +#elif __LIBC_SUPPORTS_MATH_ERRNO +#define math_errhandling (MATH_ERRNO) +#elif __LIBC_SUPPORTS_MATH_ERREXCEPT +#define math_errhandling (MATH_ERREXCEPT) +#else +#define math_errhandling 0 #endif +#undef __LIBC_SUPPORTS_MATH_ERRNO +#undef __LIBC_SUPPORTS_MATH_ERREXCEPT + // POSIX math constants // https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html #define M_E (__extension__ 0x1.5bf0a8b145769p1) diff --git a/libc/include/llvm-libc-macros/netinet-in-macros.h b/libc/include/llvm-libc-macros/netinet-in-macros.h index fb7564cee9e80..2011c34e288cd 100644 --- a/libc/include/llvm-libc-macros/netinet-in-macros.h +++ b/libc/include/llvm-libc-macros/netinet-in-macros.h @@ -9,6 +9,9 @@ #ifndef LLVM_LIBC_MACROS_NETINET_IN_MACROS_H #define LLVM_LIBC_MACROS_NETINET_IN_MACROS_H +#include "../llvm-libc-types/in_addr_t.h" +#include "__llvm-libc-common.h" + #define IPPROTO_IP 0 #define IPPROTO_ICMP 1 #define IPPROTO_TCP 6 @@ -24,4 +27,10 @@ #define IPV6_LEAVE_GROUP 21 #define IPV6_V6ONLY 26 +#define INADDR_ANY __LLVM_LIBC_CAST(static_cast, in_addr_t, 0x00000000) +#define INADDR_BROADCAST __LLVM_LIBC_CAST(static_cast, in_addr_t, 0xffffffff) + +#define INET_ADDRSTRLEN 16 +#define INET6_ADDRSTRLEN 46 + #endif // LLVM_LIBC_MACROS_NETINET_IN_MACROS_H diff --git a/libc/include/llvm-libc-types/__barrier_type.h b/libc/include/llvm-libc-types/__barrier_type.h index 59712619e917d..5752f832f04b9 100644 --- a/libc/include/llvm-libc-types/__barrier_type.h +++ b/libc/include/llvm-libc-types/__barrier_type.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES__BARRIER_TYPE_H #define LLVM_LIBC_TYPES__BARRIER_TYPE_H +#include <stdbool.h> + typedef struct __attribute__((aligned(8 /* alignof (Barrier) */))) { unsigned expected; unsigned waiting; diff --git a/libc/include/llvm-libc-types/pthread_barrierattr_t.h b/libc/include/llvm-libc-types/pthread_barrierattr_t.h index 064be5bfb6721..b62fdc0f72e12 100644 --- a/libc/include/llvm-libc-types/pthread_barrierattr_t.h +++ b/libc/include/llvm-libc-types/pthread_barrierattr_t.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H #define LLVM_LIBC_TYPES_PTHREAD_BARRIERATTR_T_H +#include <stdbool.h> + typedef struct { bool pshared; } pthread_barrierattr_t; diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 88e50d1288238..c2b8a1e4cfb8e 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -67,6 +67,13 @@ functions: arguments: - type: clockid_t - type: struct timespec * + - name: clock_settime + standard: + - POSIX + return_type: int + arguments: + - type: clockid_t + - type: const struct timespec * - name: difftime standard: - stdc diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..3f5e957768533 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: 
pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -111,6 +120,14 @@ functions: return_type: int arguments: - type: int + - name: fchown + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: uid_t + - type: gid_t - name: fork standards: - POSIX @@ -195,6 +212,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index c8b9e21b56b28..fb5b19b523b31 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -6,6 +6,10 @@ macros: types: - type_name: FILE - type_name: size_t + # TODO: Remove this once we have a function declaration using "struct tm" + # (wcsftime). We're declaring it here now, since libc++ expects + # forward-declaration of "struct tm" in the <wchar.h> header. + - type_name: struct_tm - type_name: wint_t - type_name: wchar_t - type_name: mbstate_t diff --git a/libc/shared/math.h b/libc/shared/math.h index bd6aee73c3933..282dd6243d6a7 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -51,6 +51,7 @@ #include "math/exp2f.h" #include "math/exp2f16.h" #include "math/exp2m1f.h" +#include "math/exp2m1f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp2m1f16.h b/libc/shared/math/exp2m1f16.h new file mode 100644 index 0000000000000..96a404708be18 --- /dev/null +++ b/libc/shared/math/exp2m1f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp2m1f16 function -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP2M1F16_H +#define LLVM_LIBC_SHARED_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/exp2m1f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp2m1f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP2M1F16_H diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 0ef09a9b8c9d0..96874702b1fdf 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -161,6 +161,7 @@ add_header_library( HDRS wctype_utils.h DEPENDS + libc.hdr.types.wchar_t libc.hdr.types.wint_t ) @@ -179,19 +180,7 @@ add_header_library( DEPENDS .ctype_utils .str_to_num_result - libc.hdr.errno_macros - libc.src.__support.CPP.limits - libc.src.__support.CPP.type_traits - libc.src.__support.common -) - -add_header_library( - wcs_to_integer - HDRS - wcs_to_integer.h - DEPENDS .wctype_utils - .str_to_num_result libc.hdr.errno_macros libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h index 7ada2235b4e73..dc5e62b32dce0 100644 --- a/libc/src/__support/CPP/type_traits/is_destructible.h +++ b/libc/src/__support/CPP/type_traits/is_destructible.h @@ -15,6 +15,7 @@ #include "src/__support/CPP/type_traits/remove_all_extents.h" #include "src/__support/CPP/type_traits/true_type.h" #include "src/__support/CPP/type_traits/type_identity.h" +#include "src/__support/CPP/utility/declval.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index bb76eee90efd2..08db4859c6417 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -66,7 +66,7 @@ ErrorOr<int> fcntl(int fd, int cmd, void *arg) { LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64); // On failure, return if (ret < 0) - return Error(-1); + return Error(-ret); // Check for overflow, i.e. the offsets are not the same when cast // to off_t from off64_t. if (static_cast<off_t>(flk64.l_len) != flk64.l_len || diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h index be0f25330af9e..d60562c02e81c 100644 --- a/libc/src/__support/ctype_utils.h +++ b/libc/src/__support/ctype_utils.h @@ -27,7 +27,7 @@ namespace internal { // as well as a way to support non-ASCII character encodings. // Similarly, do not change these functions to use case ranges. e.g. -// bool islower(int ch) { +// bool islower(char ch) { // switch(ch) { // case 'a'...'z': // return true; @@ -37,7 +37,7 @@ namespace internal { // EBCDIC. Technically we could use some smaller ranges, but that's even harder // to read. 
-LIBC_INLINE static constexpr bool islower(int ch) { +LIBC_INLINE static constexpr bool islower(char ch) { switch (ch) { case 'a': case 'b': @@ -71,7 +71,7 @@ LIBC_INLINE static constexpr bool islower(int ch) { } } -LIBC_INLINE static constexpr bool isupper(int ch) { +LIBC_INLINE static constexpr bool isupper(char ch) { switch (ch) { case 'A': case 'B': @@ -105,7 +105,7 @@ LIBC_INLINE static constexpr bool isupper(int ch) { } } -LIBC_INLINE static constexpr bool isdigit(int ch) { +LIBC_INLINE static constexpr bool isdigit(char ch) { switch (ch) { case '0': case '1': @@ -123,7 +123,7 @@ LIBC_INLINE static constexpr bool isdigit(int ch) { } } -LIBC_INLINE static constexpr int tolower(int ch) { +LIBC_INLINE static constexpr char tolower(char ch) { switch (ch) { case 'A': return 'a'; @@ -182,7 +182,7 @@ LIBC_INLINE static constexpr int tolower(int ch) { } } -LIBC_INLINE static constexpr int toupper(int ch) { +LIBC_INLINE static constexpr char toupper(char ch) { switch (ch) { case 'a': return 'A'; @@ -241,7 +241,7 @@ LIBC_INLINE static constexpr int toupper(int ch) { } } -LIBC_INLINE static constexpr bool isalpha(int ch) { +LIBC_INLINE static constexpr bool isalpha(char ch) { switch (ch) { case 'a': case 'b': @@ -301,7 +301,7 @@ LIBC_INLINE static constexpr bool isalpha(int ch) { } } -LIBC_INLINE static constexpr bool isalnum(int ch) { +LIBC_INLINE static constexpr bool isalnum(char ch) { switch (ch) { case 'a': case 'b': @@ -371,7 +371,7 @@ LIBC_INLINE static constexpr bool isalnum(int ch) { } } -LIBC_INLINE static constexpr int b36_char_to_int(int ch) { +LIBC_INLINE static constexpr int b36_char_to_int(char ch) { switch (ch) { case '0': return 0; @@ -476,7 +476,7 @@ LIBC_INLINE static constexpr int b36_char_to_int(int ch) { } } -LIBC_INLINE static constexpr int int_to_b36_char(int num) { +LIBC_INLINE static constexpr char int_to_b36_char(int num) { // Can't actually use LIBC_ASSERT here because it depends on integer_to_string // which depends on this. @@ -559,7 +559,7 @@ LIBC_INLINE static constexpr int int_to_b36_char(int num) { } } -LIBC_INLINE static constexpr bool isspace(int ch) { +LIBC_INLINE static constexpr bool isspace(char ch) { switch (ch) { case ' ': case '\t': @@ -574,10 +574,17 @@ LIBC_INLINE static constexpr bool isspace(int ch) { } // not yet encoding independent. -LIBC_INLINE static constexpr bool isgraph(int ch) { +LIBC_INLINE static constexpr bool isgraph(char ch) { return 0x20 < ch && ch < 0x7f; } +// An overload which provides a way to compare input with specific character +// values, when input can be of a regular or a wide character type. +LIBC_INLINE static constexpr bool is_char_or_wchar(char ch, char c_value, + [[maybe_unused]] wchar_t) { + return (ch == c_value); +} + } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. 
if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point. const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h index 29449bd739730..5e7369de00962 100644 --- a/libc/src/__support/integer_to_string.h +++ b/libc/src/__support/integer_to_string.h @@ -378,9 +378,8 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString { using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>; LIBC_INLINE static char digit_char(uint8_t digit) { - const int result = internal::int_to_b36_char(digit); - return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result) - : result); + const char result = internal::int_to_b36_char(digit); + return Fmt::IS_UPPERCASE ? internal::toupper(result) : result; } LIBC_INLINE static void diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h index fc6099ca6ccc5..1fe20d9b23a34 100644 --- a/libc/src/__support/macros/properties/cpu_features.h +++ b/libc/src/__support/macros/properties/cpu_features.h @@ -18,6 +18,18 @@ #define LIBC_TARGET_CPU_HAS_FULLFP16 #endif +#if defined(__ARM_FEATURE_SVE) +#define LIBC_TARGET_CPU_HAS_SVE +#endif + +#if defined(__ARM_FEATURE_SVE2) +#define LIBC_TARGET_CPU_HAS_SVE2 +#endif + +#if defined(__ARM_FEATURE_MOPS) +#define LIBC_TARGET_CPU_HAS_MOPS +#endif + #if defined(__SSE2__) #define LIBC_TARGET_CPU_HAS_SSE2 #define LIBC_TARGET_CPU_HAS_FPU_FLOAT diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 620900028d424..ddc0159b10ce4 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -769,6 +769,24 @@ add_header_library( libc.src.__support.macros.properties.cpu_features ) +add_header_library( + exp2m1f16 + HDRS + exp2m1f16.h + DEPENDS + .expxf16_utils + libc.src.__support.common + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features +) + add_header_library( exp10 HDRS diff --git a/libc/src/__support/math/exp2m1f16.h b/libc/src/__support/math/exp2m1f16.h new file mode 100644 index 0000000000000..0424af4aa953d --- /dev/null +++ b/libc/src/__support/math/exp2m1f16.h @@ -0,0 +1,180 @@ +//===-- Implementation header for exp2m1f16 ----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/expxf16_utils.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +LIBC_INLINE static constexpr float16 exp2m1f16(float16 x) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) + {0x0b3dU, 0x0904U, 1U, 0U, 1U}, + // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) + {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, + // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) + {0x118cU, 0x0fb1U, 1U, 0U, 0U}, + // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) + {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, + // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) + {0x9718U, 0x94eaU, 0U, 1U, 0U}, + // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) + {0x973fU, 0x9505U, 0U, 1U, 0U}, + }}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; +#else + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; +#endif + + constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI> + EXP2M1F16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) + {0x3396U, 0x31b7U, 1U, 0U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) + {0x34baU, 0x3345U, 1U, 0U, 0U}, +#endif + // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) + {0x36b6U, 0x3566U, 1U, 0U, 0U}, +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) + {0x37b7U, 0x3659U, 1U, 0U, 1U}, +#endif + // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) + {0xb201U, 0xafcdU, 0U, 1U, 1U}, + // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) + {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, + // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) + {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) + {0xba8dU, 0xb6edU, 0U, 1U, 1U}, +#endif + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + using namespace math::expxf16_internal; + using FPBits = fputil::FPBits<float16>; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { + // exp2m1(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 16. 
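+  // (0x4c00U is the IEEE-754 binary16 bit pattern of 16.0, so the comparison +  // below checks the raw bits of x directly against 16.)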
+ if (x_u >= 0x4c00 && x_bits.is_pos()) { + // exp2m1(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x < -11. + if (x_u > 0xc980U) { + // exp2m1(-inf) = -1 + if (x_bits.is_inf()) + return FPBits::one(Sign::NEG).get_val(); + + // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. + if (x_u < 0xca00U) + return fputil::round_result_slightly_down( + fputil::cast<float16>(-0x1.ffcp-1)); + + // When x <= -12, round(2^x - 1, HP, RN) = -1. + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return FPBits::one(Sign::NEG).get_val(); + default: + return fputil::cast<float16>(-0x1.ffcp-1); + } + } + + // When |x| <= 2^(-3). + if (x_abs <= 0x3000U) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); + LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + float xf = x; + // Degree-5 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); + // > x * P; + return fputil::cast<float16>( + xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, + 0x1.c6af88p-5f, 0x1.3b45d6p-7f, + 0x1.641e7cp-10f)); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // exp2(x) = exp2(hi + mid) * exp2(lo) + auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); + // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 + return fputil::cast<float16>( + fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index d332c929f2c31..2df1ea894e53a 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -25,36 +25,51 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" +#include "src/__support/wctype_utils.h" namespace LIBC_NAMESPACE_DECL { namespace internal { // Returns the idx to the first character in src that is not a whitespace // character (as determined by isspace()) +template <typename CharType> LIBC_INLINE size_t -first_non_whitespace(const char *__restrict src, +first_non_whitespace(const CharType *__restrict src, size_t src_len = cpp::numeric_limits<size_t>::max()) { size_t src_cur = 0; - while (src_cur < src_len && internal::isspace(src[src_cur])) { - ++src_cur; - } + for (; src_cur < src_len && internal::isspace(src[src_cur]); ++src_cur) + ; return src_cur; } +// Returns +1, -1, or 0 if 'src' starts with (respectively) +// plus sign, minus sign, or neither. +template <typename CharType> +LIBC_INLINE static int get_sign(const CharType *__restrict src) { + if (is_char_or_wchar(src[0], '+', L'+')) + return 1; + if (is_char_or_wchar(src[0], '-', L'-')) + return -1; + return 0; +} + // checks if the next 3 characters of the string pointer are the start of a // hexadecimal number. 
Does not advance the string pointer. -LIBC_INLINE bool -is_hex_start(const char *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { +template <typename CharType> +LIBC_INLINE static bool is_hex_start(const CharType *__restrict src, + size_t src_len) { if (src_len < 3) return false; - return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) && - b36_char_to_int(*(src + 2)) < 16; + return is_char_or_wchar(src[0], '0', L'0') && + is_char_or_wchar(tolower(src[1]), 'x', L'x') && isalnum(src[2]) && + b36_char_to_int(src[2]) < 16; } // Takes the address of the string pointer and parses the base from the start of // it. -LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { +template <typename CharType> +LIBC_INLINE static int infer_base(const CharType *__restrict src, + size_t src_len) { // A hexadecimal number is defined as "the prefix 0x or 0X followed by a // sequence of the decimal digits and the letters a (or A) through f (or F) // with values 10 through 15 respectively." (C standard 6.4.4.1) @@ -63,8 +78,9 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // An octal number is defined as "the prefix 0 optionally followed by a // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == '0') + if (src_len > 0 && is_char_or_wchar(src[0], '0', L'0')) { return 8; + } // A decimal number is defined as beginning "with a nonzero digit and // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) return 10; @@ -77,32 +93,27 @@ LIBC_INLINE int infer_base(const char *__restrict src, size_t src_len) { // ----------------------------------------------------------------------------- // Takes a pointer to a string and the base to convert to. This function is used // as the backend for all of the string to int functions. -template <class T> +template <typename T, typename CharType> LIBC_INLINE StrToNumResult<T> -strtointeger(const char *__restrict src, int base, +strtointeger(const CharType *__restrict src, int base, const size_t src_len = cpp::numeric_limits<size_t>::max()) { using ResultType = make_integral_or_big_int_unsigned_t<T>; - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - if (src_len == 0) return {0, 0, 0}; if (base < 0 || base == 1 || base > 36) return {0, 0, EINVAL}; - src_cur = first_non_whitespace(src, src_len); - - char result_sign = '+'; - if (src[src_cur] == '+' || src[src_cur] == '-') { - result_sign = src[src_cur]; - ++src_cur; + size_t src_cur = first_non_whitespace(src, src_len); + if (src_cur == src_len) { + return {0, 0, 0}; } + int sign = get_sign(src + src_cur); + bool is_positive = (sign >= 0); + src_cur += (sign != 0); + if (base == 0) base = infer_base(src + src_cur, src_len - src_cur); @@ -110,8 +121,6 @@ strtointeger(const char *__restrict src, int base, src_cur = src_cur + 2; constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>; - const bool is_positive = (result_sign == '+'); - ResultType constexpr NEGATIVE_MAX = !IS_UNSIGNED ? 
static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1 : cpp::numeric_limits<T>::max(); @@ -120,6 +129,9 @@ strtointeger(const char *__restrict src, int base, ResultType const abs_max_div_by_base = abs_max / static_cast<ResultType>(base); + bool is_number = false; + int error_val = 0; + ResultType result = 0; while (src_cur < src_len && isalnum(src[src_cur])) { int cur_digit = b36_char_to_int(src[src_cur]); if (cur_digit >= base) diff --git a/libc/src/__support/time/CMakeLists.txt b/libc/src/__support/time/CMakeLists.txt index 8247e792e8410..3851037e4161f 100644 --- a/libc/src/__support/time/CMakeLists.txt +++ b/libc/src/__support/time/CMakeLists.txt @@ -19,3 +19,12 @@ add_object_library( DEPENDS libc.src.__support.time.${LIBC_TARGET_OS}.clock_gettime ) + +if(TARGET libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime) + add_object_library( + clock_settime + ALIAS + DEPENDS + libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime + ) +endif() diff --git a/libc/src/__support/time/clock_settime.h b/libc/src/__support/time/clock_settime.h new file mode 100644 index 0000000000000..d8d305cadf4b9 --- /dev/null +++ b/libc/src/__support/time/clock_settime.h @@ -0,0 +1,22 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/error_or.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts); +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt index 6fec7eeba99ad..478529502b403 100644 --- a/libc/src/__support/time/linux/CMakeLists.txt +++ b/libc/src/__support/time/linux/CMakeLists.txt @@ -14,6 +14,21 @@ add_object_library( libc.src.__support.OSUtil.linux.vdso ) +add_object_library( + clock_settime + HDRS + ../clock_settime.h + SRCS + clock_settime.cpp + DEPENDS + libc.include.sys_syscall + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.OSUtil.osutil +) + add_header_library( clock_conversion HDRS diff --git a/libc/src/__support/time/linux/clock_settime.cpp b/libc/src/__support/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..dd42610adb031 --- /dev/null +++ b/libc/src/__support/time/linux/clock_settime.cpp @@ -0,0 +1,53 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/time/clock_settime.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> + +#if defined(SYS_clock_settime64) +#include <linux/time_types.h> +#endif + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts) { + int ret; +#if defined(SYS_clock_settime) + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime, + static_cast<long>(clockid), + reinterpret_cast<long>(ts)); +#elif defined(SYS_clock_settime64) + static_assert( + sizeof(time_t) == sizeof(int64_t), + "SYS_clock_settime64 requires struct timespec with 64-bit members."); + + __kernel_timespec ts64{}; + + // Populate the 64-bit kernel structure from the user-provided timespec + ts64.tv_sec = static_cast<decltype(ts64.tv_sec)>(ts->tv_sec); + ts64.tv_nsec = static_cast<decltype(ts64.tv_nsec)>(ts->tv_nsec); + + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime64, + static_cast<long>(clockid), + reinterpret_cast<long>(&ts64)); +#else +#error "SYS_clock_settime and SYS_clock_settime64 syscalls not available." +#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h deleted file mode 100644 index 4254bd860f77a..0000000000000 --- a/libc/src/__support/wcs_to_integer.h +++ /dev/null @@ -1,155 +0,0 @@ -//===-- Widechar string to integer conversion utils -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H -#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H - -#include "hdr/errno_macros.h" // For ERANGE -#include "src/__support/CPP/limits.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/CPP/type_traits/make_unsigned.h" -#include "src/__support/big_int.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/str_to_num_result.h" -#include "src/__support/uint128.h" -#include "src/__support/wctype_utils.h" - -namespace LIBC_NAMESPACE_DECL { -namespace internal { - -// Returns the idx of the first character in src that is not a whitespace -// character (as determined by iswspace()) -LIBC_INLINE size_t -first_non_whitespace(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { - size_t src_cur = 0; - while (src_cur < src_len && internal::iswspace(src[src_cur])) { - ++src_cur; - } - return src_cur; -} - -// checks if the next 3 characters of the string pointer are the start of a -// hexadecimal number. Does not advance the string pointer. 
-LIBC_INLINE bool -is_hex_start(const wchar_t *__restrict src, - size_t src_len = cpp::numeric_limits<size_t>::max()) { - if (src_len < 3) - return false; - return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) && - b36_wchar_to_int(*(src + 2)) < 16; -} - -// Takes the address of the string pointer and parses the base from the start of -// it. -LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) { - // A hexadecimal number is defined as "the prefix 0x or 0X followed by a - // sequence of the decimal digits and the letters a (or A) through f (or F) - // with values 10 through 15 respectively." (C standard 6.4.4.1) - if (is_hex_start(src, src_len)) - return 16; - // An octal number is defined as "the prefix 0 optionally followed by a - // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any - // number that starts with 0, including just 0, is an octal number. - if (src_len > 0 && src[0] == L'0') - return 8; - // A decimal number is defined as beginning "with a nonzero digit and - // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) - return 10; -} - -template <class T> -LIBC_INLINE StrToNumResult<T> -wcstointeger(const wchar_t *__restrict src, int base, - const size_t src_len = cpp::numeric_limits<size_t>::max()) { - using ResultType = make_integral_or_big_int_unsigned_t<T>; - - ResultType result = 0; - - bool is_number = false; - size_t src_cur = 0; - int error_val = 0; - - if (src_len == 0) - return {0, 0, 0}; - - if (base < 0 || base == 1 || base > 36) - return {0, 0, EINVAL}; - - src_cur = first_non_whitespace(src, src_len); - - wchar_t result_sign = L'+'; - if (src[src_cur] == L'+' || src[src_cur] == L'-') { - result_sign = src[src_cur]; - ++src_cur; - } - - if (base == 0) - base = infer_base(src + src_cur, src_len - src_cur); - - if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur)) - src_cur = src_cur + 2; - - constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>; - const bool is_positive = (result_sign == L'+'); - - ResultType constexpr NEGATIVE_MAX = - !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1 - : cpp::numeric_limits<T>::max(); - ResultType const abs_max = - (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX); - ResultType const abs_max_div_by_base = - abs_max / static_cast<ResultType>(base); - - while (src_cur < src_len && iswalnum(src[src_cur])) { - int cur_digit = b36_wchar_to_int(src[src_cur]); - if (cur_digit >= base) - break; - - is_number = true; - ++src_cur; - - // If the number has already hit the maximum value for the current type then - // the result cannot change, but we still need to advance src to the end of - // the number. - if (result == abs_max) { - error_val = ERANGE; - continue; - } - - if (result > abs_max_div_by_base) { - result = abs_max; - error_val = ERANGE; - } else { - result = result * static_cast<ResultType>(base); - } - if (result > abs_max - static_cast<ResultType>(cur_digit)) { - result = abs_max; - error_val = ERANGE; - } else { - result = result + static_cast<ResultType>(cur_digit); - } - } - - ptrdiff_t str_len = is_number ? static_cast<ptrdiff_t>(src_cur) : 0; - - if (error_val == ERANGE) { - if (is_positive || IS_UNSIGNED) - return {cpp::numeric_limits<T>::max(), str_len, error_val}; - else // T is signed and there is a negative overflow - return {cpp::numeric_limits<T>::min(), str_len, error_val}; - } - - return {static_cast<T>(is_positive ? 
result : -result), str_len, error_val}; -} - -} // namespace internal -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h index 2ae5ec93b2a63..7041470adc2f4 100644 --- a/libc/src/__support/wctype_utils.h +++ b/libc/src/__support/wctype_utils.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H #define LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H +#include "hdr/types/wchar_t.h" #include "hdr/types/wint_t.h" #include "src/__support/CPP/optional.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE @@ -30,7 +31,7 @@ namespace internal { // Similarly, do not change these fumarks to show your new solution is faster, // as well as a way to support non-Anctions to use case ranges. e.g. -// bool iswlower(wint_t ch) { +// bool islower(wchar_t ch) { // switch(ch) { // case L'a'...L'z': // return true; @@ -40,7 +41,7 @@ namespace internal { // EBCDIC. Technically we could use some smaller ranges, but that's even harder // to read. -LIBC_INLINE static constexpr bool iswlower(wint_t wch) { +LIBC_INLINE static constexpr bool islower(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -74,7 +75,7 @@ LIBC_INLINE static constexpr bool iswlower(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswupper(wint_t wch) { +LIBC_INLINE static constexpr bool isupper(wchar_t wch) { switch (wch) { case L'A': case L'B': @@ -108,7 +109,7 @@ LIBC_INLINE static constexpr bool iswupper(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswdigit(wint_t wch) { +LIBC_INLINE static constexpr bool isdigit(wchar_t wch) { switch (wch) { case L'0': case L'1': @@ -126,7 +127,7 @@ LIBC_INLINE static constexpr bool iswdigit(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t towlower(wint_t wch) { +LIBC_INLINE static constexpr wchar_t tolower(wchar_t wch) { switch (wch) { case L'A': return L'a'; @@ -185,7 +186,7 @@ LIBC_INLINE static constexpr wint_t towlower(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t towupper(wint_t wch) { +LIBC_INLINE static constexpr wchar_t toupper(wchar_t wch) { switch (wch) { case L'a': return L'A'; @@ -244,7 +245,7 @@ LIBC_INLINE static constexpr wint_t towupper(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswalpha(wint_t wch) { +LIBC_INLINE static constexpr bool isalpha(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -304,7 +305,7 @@ LIBC_INLINE static constexpr bool iswalpha(wint_t wch) { } } -LIBC_INLINE static constexpr bool iswalnum(wint_t wch) { +LIBC_INLINE static constexpr bool isalnum(wchar_t wch) { switch (wch) { case L'a': case L'b': @@ -374,7 +375,7 @@ LIBC_INLINE static constexpr bool iswalnum(wint_t wch) { } } -LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) { +LIBC_INLINE static constexpr int b36_char_to_int(wchar_t wch) { switch (wch) { case L'0': return 0; @@ -479,7 +480,7 @@ LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) { } } -LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) { +LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) { // Can't actually use LIBC_ASSERT here because it depends on integer_to_string // which depends on this. 
@@ -562,7 +563,7 @@ LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) { } } -LIBC_INLINE static constexpr bool iswspace(wint_t wch) { +LIBC_INLINE static constexpr bool isspace(wchar_t wch) { switch (wch) { case L' ': case L'\t': @@ -576,6 +577,13 @@ LIBC_INLINE static constexpr bool iswspace(wint_t wch) { } } +// An overload which provides a way to compare input with specific character +// values, when input can be of a regular or a wide character type. +LIBC_INLINE static constexpr bool +is_char_or_wchar(wchar_t ch, [[maybe_unused]] char, wchar_t wc_value) { + return (ch == wc_value); +} + // ------------------------------------------------------ // Rationale: Since these classification functions are // called in other functions, we will avoid the overhead diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt index 8830c1bccf9ea..68e982bd4529e 100644 --- a/libc/src/ctype/CMakeLists.txt +++ b/libc/src/ctype/CMakeLists.txt @@ -6,6 +6,7 @@ add_entrypoint_object( isalnum.h DEPENDS libc.include.ctype + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -16,6 +17,7 @@ add_entrypoint_object( HDRS isalpha.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -50,6 +52,7 @@ add_entrypoint_object( HDRS isdigit.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -60,6 +63,7 @@ add_entrypoint_object( HDRS isgraph.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -70,6 +74,7 @@ add_entrypoint_object( HDRS islower.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -88,6 +93,7 @@ add_entrypoint_object( HDRS ispunct.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -97,6 +103,9 @@ add_entrypoint_object( isspace.cpp HDRS isspace.h + DEPENDS + libc.src.__support.CPP.limits + libc.src.__support.ctype_utils ) add_entrypoint_object( @@ -106,6 +115,7 @@ add_entrypoint_object( HDRS isupper.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -116,6 +126,7 @@ add_entrypoint_object( HDRS isxdigit.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -126,6 +137,7 @@ add_entrypoint_object( HDRS tolower.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -144,6 +156,7 @@ add_entrypoint_object( HDRS toupper.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils ) @@ -160,6 +173,7 @@ add_entrypoint_object( isalnum_l.h DEPENDS libc.include.ctype + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -171,6 +185,7 @@ add_entrypoint_object( HDRS isalpha_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -202,6 +217,7 @@ add_entrypoint_object( HDRS isdigit_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -224,6 +240,7 @@ add_entrypoint_object( HDRS islower_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -257,6 +274,8 @@ add_entrypoint_object( isspace_l.h DEPENDS libc.hdr.types.locale_t + libc.src.__support.CPP.limits + libc.src.__support.ctype_utils ) add_entrypoint_object( @@ -266,6 +285,7 @@ add_entrypoint_object( HDRS isupper_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -277,6 +297,7 @@ add_entrypoint_object( HDRS isxdigit_l.h DEPENDS + libc.src.__support.CPP.limits 
libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -288,6 +309,7 @@ add_entrypoint_object( HDRS tolower_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) @@ -299,6 +321,7 @@ add_entrypoint_object( HDRS toupper_l.h DEPENDS + libc.src.__support.CPP.limits libc.src.__support.ctype_utils libc.hdr.types.locale_t ) diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp index 54a3e35748879..102b5e79e4a18 100644 --- a/libc/src/ctype/isalnum.cpp +++ b/libc/src/ctype/isalnum.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isalnum.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalnum, (int c)) { - return static_cast<int>(internal::isalnum(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalnum(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp index 671d9b75c4c33..173e1c174121e 100644 --- a/libc/src/ctype/isalnum_l.cpp +++ b/libc/src/ctype/isalnum_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isalnum_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) { - return static_cast<int>(internal::isalnum(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalnum(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp index 78b26f6a486ea..7c874bf373866 100644 --- a/libc/src/ctype/isalpha.cpp +++ b/libc/src/ctype/isalpha.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isalpha.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalpha, (int c)) { - return static_cast<int>(internal::isalpha(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalpha(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp index 0619d979bedf2..982bcc569faaf 100644 --- a/libc/src/ctype/isalpha_l.cpp +++ b/libc/src/ctype/isalpha_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isalpha_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) { - return static_cast<int>(internal::isalpha(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isalpha(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit.cpp 
b/libc/src/ctype/isdigit.cpp index 1f711943861f8..43553c794a2f3 100644 --- a/libc/src/ctype/isdigit.cpp +++ b/libc/src/ctype/isdigit.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isdigit.h" + +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -14,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isdigit, (int c)) { - return static_cast<int>(internal::isdigit(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isdigit(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp index ca981362bfe83..40b5618906dac 100644 --- a/libc/src/ctype/isdigit_l.cpp +++ b/libc/src/ctype/isdigit_l.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isdigit_l.h" + +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -14,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) { - return static_cast<int>(internal::isdigit(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isdigit(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp index 74bb2e75d138e..b9308ecb7367c 100644 --- a/libc/src/ctype/isgraph.cpp +++ b/libc/src/ctype/isgraph.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isgraph.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isgraph, (int c)) { - return static_cast<int>(internal::isgraph(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isgraph(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp index cbef6df148aed..dddcb9be4f80c 100644 --- a/libc/src/ctype/isgraph_l.cpp +++ b/libc/src/ctype/isgraph_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/isgraph_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) { - return static_cast<int>(internal::isgraph(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isgraph(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp index 831aad32d3a22..920bfc1cc1a59 100644 --- a/libc/src/ctype/islower.cpp +++ b/libc/src/ctype/islower.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/islower.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include 
"src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, islower, (int c)) { - return static_cast<int>(internal::islower(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::islower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp index b9be6acc81c99..da97026dc59a7 100644 --- a/libc/src/ctype/islower_l.cpp +++ b/libc/src/ctype/islower_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/islower_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) { - return static_cast<int>(internal::islower(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::islower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp index 0635294220b9c..4950036e9b81f 100644 --- a/libc/src/ctype/ispunct.cpp +++ b/libc/src/ctype/ispunct.cpp @@ -8,6 +8,7 @@ #include "src/ctype/ispunct.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, ispunct, (int c)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch)); } diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp index e825fbe2001b0..79cd47b6a214d 100644 --- a/libc/src/ctype/ispunct_l.cpp +++ b/libc/src/ctype/ispunct_l.cpp @@ -8,6 +8,7 @@ #include "src/ctype/ispunct_l.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" @@ -15,7 +16,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch)); } diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp index 005bf460fc103..998dbf28f51d0 100644 --- a/libc/src/ctype/isspace.cpp +++ b/libc/src/ctype/isspace.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isspace.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isspace, (int c)) { - return static_cast<int>(internal::isspace(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isspace(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp 
index 5c46dd6805126..e40765326b35e 100644 --- a/libc/src/ctype/isspace_l.cpp +++ b/libc/src/ctype/isspace_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isspace_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) { - return static_cast<int>(internal::isspace(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isspace(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp index 965fa336b28b4..c5c3dbd5d7d4a 100644 --- a/libc/src/ctype/isupper.cpp +++ b/libc/src/ctype/isupper.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isupper.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isupper, (int c)) { - return static_cast<int>(internal::isupper(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp index 358990261d603..44ed9dab90a16 100644 --- a/libc/src/ctype/isupper_l.cpp +++ b/libc/src/ctype/isupper_l.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isupper_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) { - return static_cast<int>(internal::isupper(static_cast<unsigned>(c))); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + return static_cast<int>(internal::isupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 81f645c6f49fc..1b2e71769b3f8 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -7,15 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/ctype/isxdigit.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(internal::isalnum(ch) && internal::b36_char_to_int(ch) < 16); } diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp index eddfd20a2da3b..e6150473b0043 100644 --- a/libc/src/ctype/isxdigit_l.cpp +++ b/libc/src/ctype/isxdigit_l.cpp @@ -7,15 +7,18 @@ 
//===----------------------------------------------------------------------===// #include "src/ctype/isxdigit_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { - const unsigned ch = static_cast<unsigned>(c); + if (c < 0 || c > cpp::numeric_limits<unsigned char>::max()) + return 0; + const char ch = static_cast<char>(c); return static_cast<int>(internal::isalnum(ch) && internal::b36_char_to_int(ch) < 16); } diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp index 3ecad7bc5d5d5..b45c5f2688a61 100644 --- a/libc/src/ctype/tolower.cpp +++ b/libc/src/ctype/tolower.cpp @@ -7,13 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/tolower.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); } +LLVM_LIBC_FUNCTION(int, tolower, (int c)) { + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::tolower(static_cast<char>(c))); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp index 7ccf31617e592..049e46aea13c0 100644 --- a/libc/src/ctype/tolower_l.cpp +++ b/libc/src/ctype/tolower_l.cpp @@ -7,15 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/tolower_l.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) { - return internal::tolower(c); + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::tolower(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index 1e1e8fc400711..0e387238ce3b6 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -7,13 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/toupper.h" -#include "src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); } +LLVM_LIBC_FUNCTION(int, toupper, (int c)) { + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::toupper(static_cast<char>(c))); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp index a435ca1ab5d41..d1dff262c9377 100644 --- a/libc/src/ctype/toupper_l.cpp +++ b/libc/src/ctype/toupper_l.cpp @@ -7,15 +7,20 @@ //===----------------------------------------------------------------------===// #include "src/ctype/toupper_l.h" -#include 
"src/__support/ctype_utils.h" +#include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { - return internal::toupper(c); + if (c < cpp::numeric_limits<char>::min() || + c > cpp::numeric_limits<char>::max()) { + return c; + } + return static_cast<int>(internal::toupper(static_cast<char>(c))); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp index 71412a8e68c53..e74cef299b59f 100644 --- a/libc/src/fcntl/linux/creat.cpp +++ b/libc/src/fcntl/linux/creat.cpp @@ -27,11 +27,11 @@ LLVM_LIBC_FUNCTION(int, creat, (const char *path, int mode_flags)) { SYS_openat, AT_FDCWD, path, O_CREAT | O_WRONLY | O_TRUNC, mode_flags); #endif - if (fd > 0) - return fd; - - libc_errno = -fd; - return -1; + if (fd < 0) { + libc_errno = -fd; + return -1; + } + return fd; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp index b47ad1fb3bb0f..b80abe532e51c 100644 --- a/libc/src/fcntl/linux/openat.cpp +++ b/libc/src/fcntl/linux/openat.cpp @@ -32,11 +32,11 @@ LLVM_LIBC_FUNCTION(int, openat, (int dfd, const char *path, int flags, ...)) { int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, dfd, path, flags, mode_flags); - if (fd > 0) - return fd; - - libc_errno = -fd; - return -1; + if (fd < 0) { + libc_errno = -fd; + return -1; + } + return fd; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index c5431b1b9d55e..f368845977964 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -6,8 +6,6 @@ add_entrypoint_object( fegetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -18,8 +16,6 @@ add_entrypoint_object( fesetround.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -30,8 +26,6 @@ add_entrypoint_object( feclearexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -42,8 +36,6 @@ add_entrypoint_object( feraiseexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +46,6 @@ add_entrypoint_object( fetestexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -67,8 +57,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -80,8 +68,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -93,8 +79,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -107,8 +91,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -119,8 +101,6 @@ add_entrypoint_object( fesetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -133,8 +113,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -147,8 +125,6 @@ add_entrypoint_object( libc.hdr.fenv_macros 
libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -161,8 +137,6 @@ add_entrypoint_object( libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -173,8 +147,6 @@ add_entrypoint_object( feenableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -185,8 +157,6 @@ add_entrypoint_object( fedisableexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -197,6 +167,4 @@ add_entrypoint_object( fegetexcept.h DEPENDS libc.src.__support.FPUtil.fenv_impl - COMPILE_OPTIONS - -O2 ) diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt index e2cd3b99c3037..d05d519b74b4f 100644 --- a/libc/src/math/amdgpu/CMakeLists.txt +++ b/libc/src/math/amdgpu/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ 
add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -274,8 +220,6 @@ add_entrypoint_object( frexp.cpp HDRS ../frexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -284,8 +228,6 @@ add_entrypoint_object( frexpf.cpp HDRS ../frexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -294,8 +236,6 @@ add_entrypoint_object( scalbn.cpp HDRS ../scalbn.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -304,8 +244,6 @@ add_entrypoint_object( scalbnf.cpp HDRS ../scalbnf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -314,8 +252,6 @@ add_entrypoint_object( ldexp.cpp HDRS ../ldexp.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -324,8 +260,6 @@ add_entrypoint_object( ldexpf.cpp HDRS ../ldexpf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -336,7 +270,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -347,7 +280,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -358,7 +290,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -369,5 +300,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 6068c36e558ef..e71300536616b 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1498,19 +1498,7 @@ add_entrypoint_object( HDRS ../exp2m1f16.h DEPENDS - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.common - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.cpu_features - libc.src.__support.math.expxf16_utils + libc.src.__support.math.exp2m1f16 ) add_entrypoint_object( @@ -2662,8 +2650,6 @@ add_entrypoint_object( ../fmaximum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2674,8 +2660,6 @@ add_entrypoint_object( ../fmaximum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2686,8 +2670,6 @@ add_entrypoint_object( ../fmaximum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2735,8 +2717,6 @@ add_entrypoint_object( ../fmaximum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2747,8 +2727,6 @@ add_entrypoint_object( ../fmaximum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2759,8 +2737,6 @@ add_entrypoint_object( ../fmaximum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - 
COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2954,8 +2930,6 @@ add_entrypoint_object( ../fminimum_mag.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2966,8 +2940,6 @@ add_entrypoint_object( ../fminimum_magf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -2978,8 +2950,6 @@ add_entrypoint_object( ../fminimum_magl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3027,8 +2997,6 @@ add_entrypoint_object( ../fminimum_mag_num.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3039,8 +3007,6 @@ add_entrypoint_object( ../fminimum_mag_numf.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -3051,8 +3017,6 @@ add_entrypoint_object( ../fminimum_mag_numl.h DEPENDS libc.src.__support.FPUtil.basic_operations - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -4306,7 +4270,7 @@ add_entrypoint_object( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -4546,8 +4510,6 @@ add_entrypoint_object( atan.cpp HDRS ../atan.h - COMPILE_OPTIONS - -O3 DEPENDS libc.src.__support.math.atan ) diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp index ce0cc60748f19..497a2887cea4c 100644 --- a/libc/src/math/generic/exp2m1f16.cpp +++ b/libc/src/math/generic/exp2m1f16.cpp @@ -7,163 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/math/exp2m1f16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" -#include "src/__support/math/expxf16_utils.h" +#include "src/__support/math/exp2m1f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) - {0x0b3dU, 0x0904U, 1U, 0U, 1U}, - // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) - {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, - // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) - {0x118cU, 0x0fb1U, 1U, 0U, 0U}, - // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) - {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, - // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) - {0x9718U, 0x94eaU, 0U, 1U, 0U}, - // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) - {0x973fU, 0x9505U, 0U, 1U, 0U}, -}}; - -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; -#else -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; -#endif - -static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI> - EXP2M1F16_EXCEPTS_HI = {{ - // (input, RZ output, RU offset, RD offset, RN offset) 
- // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) - {0x3396U, 0x31b7U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) - {0x34baU, 0x3345U, 1U, 0U, 0U}, -#endif - // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) - {0x36b6U, 0x3566U, 1U, 0U, 0U}, -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) - {0x37b7U, 0x3659U, 1U, 0U, 1U}, -#endif - // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) - {0xb201U, 0xafcdU, 0U, 1U, 1U}, - // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) - {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, - // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) - {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) - {0xba8dU, 0xb6edU, 0U, 1U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - LLVM_LIBC_FUNCTION(float16, exp2m1f16, (float16 x)) { - using namespace math::expxf16_internal; - using FPBits = fputil::FPBits<float16>; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. - if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { - // exp2m1(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 16. - if (x_u >= 0x4c00 && x_bits.is_pos()) { - // exp2m1(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x < -11. - if (x_u > 0xc980U) { - // exp2m1(-inf) = -1 - if (x_bits.is_inf()) - return FPBits::one(Sign::NEG).get_val(); - - // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. - if (x_u < 0xca00U) - return fputil::round_result_slightly_down( - fputil::cast<float16>(-0x1.ffcp-1)); - - // When x <= -12, round(2^x - 1, HP, RN) = -1. - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_DOWNWARD: - return FPBits::one(Sign::NEG).get_val(); - default: - return fputil::cast<float16>(-0x1.ffcp-1); - } - } - - // When |x| <= 2^(-3). 
- if (x_abs <= 0x3000U) { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); - LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - float xf = x; - // Degree-5 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); - // > x * P; - return fputil::cast<float16>( - xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, - 0x1.c6af88p-5f, 0x1.3b45d6p-7f, - 0x1.641e7cp-10f)); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // exp2(x) = exp2(hi + mid) * exp2(lo) - auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); - // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 - return fputil::cast<float16>( - fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); + return math::exp2m1f16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt index fcb2870b4bb1c..e27c316ff20ca 100644 --- a/libc/src/math/nvptx/CMakeLists.txt +++ b/libc/src/math/nvptx/CMakeLists.txt @@ -4,8 +4,6 @@ add_entrypoint_object( ceil.cpp HDRS ../ceil.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -14,8 +12,6 @@ add_entrypoint_object( ceilf.cpp HDRS ../ceilf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -24,8 +20,6 @@ add_entrypoint_object( copysign.cpp HDRS ../copysign.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -34,8 +28,6 @@ add_entrypoint_object( copysignf.cpp HDRS ../copysignf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -44,8 +36,6 @@ add_entrypoint_object( fabs.cpp HDRS ../fabs.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -54,8 +44,6 @@ add_entrypoint_object( fabsf.cpp HDRS ../fabsf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -64,8 +52,6 @@ add_entrypoint_object( floor.cpp HDRS ../floor.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -74,8 +60,6 @@ add_entrypoint_object( floorf.cpp HDRS ../floorf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -84,8 +68,6 @@ add_entrypoint_object( fma.cpp HDRS ../fma.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -94,8 +76,6 @@ add_entrypoint_object( fmaf.cpp HDRS ../fmaf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -104,8 +84,6 @@ add_entrypoint_object( fmax.cpp HDRS ../fmax.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -114,8 +92,6 @@ add_entrypoint_object( fmaxf.cpp HDRS ../fmaxf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -124,8 +100,6 @@ add_entrypoint_object( fmin.cpp HDRS ../fmin.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -134,8 +108,6 @@ add_entrypoint_object( fminf.cpp HDRS ../fminf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -144,8 +116,6 @@ add_entrypoint_object( fmod.cpp HDRS ../fmod.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -154,8 +124,6 @@ add_entrypoint_object( fmodf.cpp HDRS ../fmodf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -164,8 +132,6 @@ add_entrypoint_object( nearbyint.cpp HDRS ../nearbyint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -174,8 +140,6 @@ add_entrypoint_object( nearbyintf.cpp HDRS ../nearbyintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -184,8 +148,6 @@ add_entrypoint_object( remainder.cpp HDRS ../remainder.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -194,8 +156,6 @@ 
add_entrypoint_object( remainderf.cpp HDRS ../remainderf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -204,8 +164,6 @@ add_entrypoint_object( rint.cpp HDRS ../rint.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -214,8 +172,6 @@ add_entrypoint_object( rintf.cpp HDRS ../rintf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -224,8 +180,6 @@ add_entrypoint_object( round.cpp HDRS ../round.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -234,8 +188,6 @@ add_entrypoint_object( sqrt.cpp HDRS ../sqrt.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -244,8 +196,6 @@ add_entrypoint_object( sqrtf.cpp HDRS ../sqrtf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -254,8 +204,6 @@ add_entrypoint_object( trunc.cpp HDRS ../trunc.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -264,8 +212,6 @@ add_entrypoint_object( truncf.cpp HDRS ../truncf.h - COMPILE_OPTIONS - -O2 ) add_entrypoint_object( @@ -276,7 +222,6 @@ add_entrypoint_object( ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -287,7 +232,6 @@ add_entrypoint_object( ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -298,7 +242,6 @@ add_entrypoint_object( ../lgamma.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) add_entrypoint_object( @@ -309,5 +252,4 @@ add_entrypoint_object( ../lgamma_r.h COMPILE_OPTIONS ${bitcode_link_flags} - -O2 ) diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..0991dfca6a059 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,8 
+7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..5a9b19ff20471 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > 
static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..c172b368d15f3 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b2033901557a0 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..8d159d5c70870 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..a26f082ed9347 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 08d71515646ed..ae2160219f2bb 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index ee66145e60156..624129b2b36e7 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,6 +32,17 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${LIBC_TARGET_OS}) +else() + add_subdirectory(generic) +endif() + +set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) +if(NOT TARGET ${target_error_mapper}) + set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) +endif() + add_header_library( printf_config HDRS @@ -47,6 +58,7 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits + libc.hdr.errno_macros ) add_header_library( @@ -125,6 +137,7 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list + libc.src.__support.error_or ) add_header_library( @@ -136,10 +149,20 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list + libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + ${target_error_mapper} + libc.src.__support.macros.properties.architectures +) + if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
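Every entrypoint updated above (printf, fprintf, vprintf, vfprintf, asprintf, and the baremetal variants) now shares one epilogue: the printf core returns a character count wrapped in an ErrorOr-style type, and the public wrapper maps failures onto errno and rejects counts that no longer fit in an int. A condensed sketch of that epilogue, using standard C++ stand-ins for the libc types (the real wrappers use cpp::numeric_limits, libc_errno, and printf_core::internal_error_to_errno):

#include <cerrno>
#include <climits>
#include <cstddef>

// Stand-in for the ErrorOr<size_t> returned by printf_main and
// vfprintf_internal: a count on success, an internal error code otherwise.
struct ErrorOrCount {
  size_t count;
  int error; // 0 on success; otherwise an internal code such as 1006
  bool has_value() const { return error == 0; }
};

// Illustrative version of the epilogue repeated across the wrappers above.
inline int finalize_printf(ErrorOrCount ret, int (*map_to_errno)(int)) {
  if (!ret.has_value()) {
    errno = map_to_errno(ret.error);
    return -1;
  }
  if (ret.count > static_cast<size_t>(INT_MAX)) {
    errno = EOVERFLOW; // POSIX errno when the result exceeds INT_MAX
    return -1;
  }
  return static_cast<int>(ret.count);
}

Centralizing the overflow check in the wrappers lets the core keep counting in size_t, so only the final conversion back to the int return type can fail; a wrapper would then call something like finalize_printf(core_result, &internal_error_to_errno).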
@@ -152,8 +175,10 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file + libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) + diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index e27f77b6b594a..0d41f2244d8da 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the printf return values for when an error has occurred. They are -// all negative, and should be distinct. -constexpr int FILE_WRITE_ERROR = -1; -constexpr int FILE_STATUS_ERROR = -2; -constexpr int NULLPTR_WRITE_ERROR = -3; -constexpr int INT_CONVERSION_ERROR = -4; -constexpr int FIXED_POINT_CONVERSION_ERROR = -5; -constexpr int ALLOCATION_ERROR = -6; +// These are the error return values used by the printf engine when an +// error has occurred. They are all large, distinct negative values starting +// at -1001 so that they do not overlap with system errnos. +constexpr int FILE_WRITE_ERROR = -1001; +constexpr int FILE_STATUS_ERROR = -1002; +constexpr int NULLPTR_WRITE_ERROR = -1003; +constexpr int INT_CONVERSION_ERROR = -1004; +constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; +constexpr int ALLOCATION_ERROR = -1006; +constexpr int OVERFLOW_ERROR = -1007; + } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h new file mode 100644 index 0000000000000..23030930133a1 --- /dev/null +++ b/libc/src/stdio/printf_core/error_mapper.h @@ -0,0 +1,21 @@ +//===-- Error mapper for printf ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H + +#include "src/__support/macros/properties/architectures.h" + +// Maps internal errors to the available errnos on the platform. +#if defined(__linux__) +#include "linux/error_mapper.h" +#else +#include "generic/error_mapper.h" +#endif + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h index 9cdc13573d320..0f85d0a8d26b4 100644 --- a/libc/src/stdio/printf_core/float_dec_converter_limited.h +++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h @@ -363,8 +363,8 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) { // we made it from and doing the decimal conversion all over again.)
for (size_t i = output.ndigits; i-- > 0;) { if (output.digits[i] != '9') { - output.digits[i] = static_cast<char>(internal::int_to_b36_char( - internal::b36_char_to_int(output.digits[i]) + 1)); + output.digits[i] = internal::int_to_b36_char( + internal::b36_char_to_int(output.digits[i]) + 1); break; } else { output.digits[i] = '0'; diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h index 16592e7bac932..9b57f1d803e74 100644 --- a/libc/src/stdio/printf_core/float_hex_converter.h +++ b/libc/src/stdio/printf_core/float_hex_converter.h @@ -137,9 +137,9 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer, size_t first_non_zero = 1; for (; mant_cur > 0; --mant_cur, mantissa >>= 4) { char mant_mod_16 = static_cast<char>(mantissa % 16); - char new_digit = static_cast<char>(internal::int_to_b36_char(mant_mod_16)); + char new_digit = internal::int_to_b36_char(mant_mod_16); if (internal::isupper(to_conv.conv_name)) - new_digit = static_cast<char>(internal::toupper(new_digit)); + new_digit = internal::toupper(new_digit); mant_buffer[mant_cur - 1] = new_digit; if (new_digit != '0' && first_non_zero < mant_cur) first_non_zero = mant_cur; @@ -167,8 +167,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer, size_t exp_cur = EXP_LEN; for (; exponent > 0; --exp_cur, exponent /= 10) { - exp_buffer[exp_cur - 1] = - static_cast<char>(internal::int_to_b36_char(exponent % 10)); + exp_buffer[exp_cur - 1] = internal::int_to_b36_char(exponent % 10); } if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0. exp_buffer[EXP_LEN - 1] = '0'; diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h new file mode 100644 index 0000000000000..d8cdd2cc2dbaa --- /dev/null +++ b/libc/src/stdio/printf_core/generic/error_mapper.h @@ -0,0 +1,49 @@ +//===-- Generic implementation of error mapper ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H + +#include "hdr/errno_macros.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" + +namespace LIBC_NAMESPACE_DECL { +namespace printf_core { + +LIBC_INLINE static int internal_error_to_errno(int internal_error) { + // A system error occurred; return it as is. + if (internal_error < 1001 && internal_error > 0) { + return internal_error; + } + + // Map internal error to the available C standard errnos.
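+ // Sign convention, restating the pipeline above rather than adding new
+ // behavior: the constants in core_structs.h are negative (FILE_WRITE_ERROR
+ // is -1001), and printf_main hands them over positively via Error(-result),
+ // so this function receives a positive value. The switch below negates it
+ // again so the case labels can use the named constants directly; for
+ // example, internal_error_to_errno(1001) selects case FILE_WRITE_ERROR and
+ // yields EDOM.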
+ switch (-internal_error) { + case WRITE_OK: + return 0; + case FILE_WRITE_ERROR: + case FILE_STATUS_ERROR: + case NULLPTR_WRITE_ERROR: + case ALLOCATION_ERROR: + return EDOM; + case INT_CONVERSION_ERROR: + case FIXED_POINT_CONVERSION_ERROR: + case OVERFLOW_ERROR: + return ERANGE; + default: + LIBC_ASSERT( + false && + "Invalid internal printf error code passed to internal_error_to_errno"); + return EDOM; + } +} + +} // namespace printf_core +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h new file mode 100644 index 0000000000000..3c2fe663072d0 --- /dev/null +++ b/libc/src/stdio/printf_core/linux/error_mapper.h @@ -0,0 +1,54 @@ +//===-- Linux implementation of error mapper --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H + +#include "hdr/errno_macros.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" + +namespace LIBC_NAMESPACE_DECL { +namespace printf_core { + +LIBC_INLINE static int internal_error_to_errno(int internal_error) { + // A system error occurred; return it as is. + if (internal_error < 1001 && internal_error > 0) { + return internal_error; + } + + // Map internal error to POSIX errnos.
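+ // Unlike the generic mapper, which can only assume the errno values the C
+ // standard guarantees (EDOM and ERANGE), the Linux mapper can use richer
+ // POSIX codes below: EIO for I/O failures, EINVAL for bad arguments,
+ // ENOMEM for allocation failure, and EOVERFLOW when the result cannot be
+ // represented, matching what POSIX specifies for fprintf.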
+ switch (-internal_error) { + case WRITE_OK: + return 0; + case FILE_WRITE_ERROR: + return EIO; + case FILE_STATUS_ERROR: + return EIO; + case NULLPTR_WRITE_ERROR: + return EINVAL; + case INT_CONVERSION_ERROR: + return ERANGE; + case FIXED_POINT_CONVERSION_ERROR: + return EINVAL; + case ALLOCATION_ERROR: + return ENOMEM; + case OVERFLOW_ERROR: + return EOVERFLOW; + default: + LIBC_ASSERT( + false && + "Invalid internal printf error code passed to internal_error_to_errno"); + return EINVAL; + } +} + +} // namespace printf_core +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 57f29858d5298..1c7a7237c097d 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -int printf_main(Writer<write_mode> *writer, const char *__restrict str, - internal::ArgList &args) { +ErrorOr<size_t> printf_main(Writer<write_mode> *writer, + const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -33,9 +35,8 @@ int printf_main(Writer<write_mode> *writer, const char *__restrict str, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); - if (result < 0) - return result; + return Error(-result); } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 283d8df2810fb..41df17b67f35b 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,6 +10,7 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return printf_core::ALLOCATION_ERROR; + return ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -42,27 +43,28 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, + const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (ret_val < 0) { + if (!ret_val.has_value()) { *ret = nullptr; - return -1; + return ret_val; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char 
*>(malloc(ret_val + 1)); + *ret = static_cast<char *>(malloc(ret_val.value() + 1)); if (ret == nullptr) - return printf_core::ALLOCATION_ERROR; - inline_memcpy(*ret, wb.buff, ret_val); + return Error(ALLOCATION_ERROR); + inline_memcpy(*ret, wb.buff, ret_val.value()); } else { *ret = wb.buff; } - (*ret)[ret_val] = '\0'; + (*ret)[ret_val.value()] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 630de9d9d43dd..c47a03d741f98 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,6 +11,7 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" +#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, - FILE *f) { +LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, + size_t nmemb, FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -47,9 +48,14 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, - ::FILE *f) { - return ::fwrite_unlocked(ptr, size, nmemb, f); +LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, + size_t nmemb, ::FILE *f) { + // Need to use system errno in this case, as system write will set this errno + // which we need to propagate back into our code. fwrite only modifies errno + // if there was an error, and errno may have previously been nonzero. Only + // return errno if there was an error. + size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f); + return {members_written, members_written == nmemb ? 0 : errno}; } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -60,26 +66,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - if (written != new_str.size() || internal::ferror_unlocked(target_file)) + auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + // Propagate actual system error in FileIOResult. + if (write_result.has_error()) + return -write_result.error; + + // In case a short write occurred or the error was not set on FileIOResult + // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
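// Throughout this class, chars_written is now a size_t: the writer can count
// past INT_MAX without signed overflow, and whether the total still fits in
// the int that the C API requires is checked once at each public entry point
// (the numeric_limits<int>::max() comparison shown earlier).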
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..d95195f6f485f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..2a9b6ea7c5e50 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..bd77cd8864312 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" 
+#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..5d936360c0857 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..f9cf8118534f6 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/l64a.cpp b/libc/src/stdlib/l64a.cpp index d59e65e7dc4c2..d8fe8ef86bf7d 100644 --- a/libc/src/stdlib/l64a.cpp +++ b/libc/src/stdlib/l64a.cpp @@ -32,15 +32,13 @@ constexpr static char b64_int_to_char(uint32_t num) { if (num == 1) return '/'; if (num < 38) - return static_cast<char>( - internal::toupper(internal::int_to_b36_char(num - 2))); + return internal::toupper(internal::int_to_b36_char(num - 2)); // this tolower is technically unnecessary, but it provides safety if we // change the default behavior of int_to_b36_char. 
Also the compiler // completely elides it so there's no performance penalty, see: // https://godbolt.org/z/o5ennv7fc - return static_cast<char>( - internal::tolower(internal::int_to_b36_char(num - 2 - 26))); + return internal::tolower(internal::int_to_b36_char(num - 2 - 26)); } // This function takes a long and converts the low 32 bits of it into at most 6 diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..71e257f08645b 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..65f242b200f18 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..31668a0323c93 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,13 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h index 87f5ccdd56e23..eafaca9776a42 100644 --- a/libc/src/string/memory_utils/aarch64/inline_strlen.h +++ 
b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -8,14 +8,13 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H +#include "src/__support/macros/properties/cpu_features.h" + #if defined(__ARM_NEON) #include "src/__support/CPP/bit.h" // countr_zero - #include <arm_neon.h> #include <stddef.h> // size_t - namespace LIBC_NAMESPACE_DECL { - namespace neon { [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t string_length(const char *src) { @@ -45,9 +44,63 @@ string_length(const char *src) { } } } // namespace neon +} // namespace LIBC_NAMESPACE_DECL +#endif // __ARM_NEON -namespace string_length_impl = neon; +#ifdef LIBC_TARGET_CPU_HAS_SVE +#include "src/__support/macros/optimization.h" +#include <arm_sve.h> +namespace LIBC_NAMESPACE_DECL { +namespace sve { +[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) { + const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src); + // Initialize the first-fault register to all true + svsetffr(); + const svbool_t all_true = svptrue_b8(); // all true predicate + svbool_t cmp_zero; + size_t len = 0; + for (;;) { + // Read a vector's worth of bytes, stopping on first fault. + svuint8_t data = svldff1_u8(all_true, &ptr[len]); + svbool_t fault_mask = svrdffr_z(all_true); + bool has_no_fault = svptest_last(all_true, fault_mask); + if (LIBC_LIKELY(has_no_fault)) { + // The first-faulting load did not fault: the whole vector is valid. + // Avoid depending on the contents of FFR beyond the branch. + len += svcntb(); // speculative increment + cmp_zero = svcmpeq_n_u8(all_true, data, 0); + bool has_no_zero = !svptest_any(all_true, cmp_zero); + if (LIBC_LIKELY(has_no_zero)) + continue; + len -= svcntb(); // undo speculative increment + break; + } else { + // The first-faulting load faulted: only some of the vector is valid. + // Perform the comparison only on the valid bytes. + cmp_zero = svcmpeq_n_u8(fault_mask, data, 0); + bool has_zero = svptest_any(fault_mask, cmp_zero); + if (LIBC_LIKELY(has_zero)) + break; + svsetffr(); + len += svcntp_b8(all_true, fault_mask); + continue; + } + } + // Select the bytes before the first zero and count them.
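+ // svbrkb_z ("break before") produces a predicate whose active lanes are
+ // exactly those strictly before the first active lane of cmp_zero, and
+ // svcntp_b8 counts them, so len ends up as the number of bytes that
+ // precede the terminating NUL.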
+ svbool_t before_zero = svbrkb_z(all_true, cmp_zero); + len += svcntp_b8(all_true, before_zero); + return len; +} +} // namespace sve +} // namespace LIBC_NAMESPACE_DECL +#endif // LIBC_TARGET_CPU_HAS_SVE + +namespace LIBC_NAMESPACE_DECL { +#ifdef LIBC_TARGET_CPU_HAS_SVE +namespace string_length_impl = sve; +#elif defined(__ARM_NEON) +namespace string_length_impl = neon; +#endif } // namespace LIBC_NAMESPACE_DECL -#endif // __ARM_NEON #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H diff --git a/libc/src/string/strcasestr.cpp b/libc/src/string/strcasestr.cpp index de8e4bec7fe0b..575d6bed16d11 100644 --- a/libc/src/string/strcasestr.cpp +++ b/libc/src/string/strcasestr.cpp @@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(char *, strcasestr, (const char *haystack, const char *needle)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; LIBC_CRASH_ON_NULLPTR(haystack); diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 7feef56fb3676..cbce62ead0328 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -127,8 +127,8 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch, size_t cur = 0; // Step 1: read 1 byte at a time to align to block size - for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n; - ++char_ptr, ++cur) { + for (; cur < n && reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0; + ++cur, ++char_ptr) { if (*char_ptr == ch) return const_cast<unsigned char *>(char_ptr); } @@ -136,18 +136,18 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch, const Word ch_mask = repeat_byte<Word>(ch); // Step 2: read blocks - for (const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); - !has_zeroes<Word>((*block_ptr) ^ ch_mask) && cur < n; - ++block_ptr, cur += sizeof(Word)) { - char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); - } + const Word *block_ptr = reinterpret_cast<const Word *>(char_ptr); + for (; cur < n && !has_zeroes<Word>((*block_ptr) ^ ch_mask); + cur += sizeof(Word), ++block_ptr) + ; + char_ptr = reinterpret_cast<const unsigned char *>(block_ptr); // Step 3: find the match in the block - for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { + for (; cur < n && *char_ptr != ch; ++cur, ++char_ptr) { ; } - if (*char_ptr != ch || cur >= n) + if (cur >= n || *char_ptr != ch) return static_cast<void *>(nullptr); return const_cast<unsigned char *>(char_ptr); diff --git a/libc/src/strings/strcasecmp.cpp b/libc/src/strings/strcasecmp.cpp index 4bbe2909df1e2..4518647deabe4 100644 --- a/libc/src/strings/strcasecmp.cpp +++ b/libc/src/strings/strcasecmp.cpp @@ -17,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strcasecmp, (const char *left, const char *right)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strcmp(left, right, case_cmp); } diff --git a/libc/src/strings/strcasecmp_l.cpp b/libc/src/strings/strcasecmp_l.cpp index 95117cb27a564..d77f95637a396 100644 --- a/libc/src/strings/strcasecmp_l.cpp +++ b/libc/src/strings/strcasecmp_l.cpp @@ -18,8 +18,8 @@ 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strcasecmp_l, (const char *left, const char *right, locale_t)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strcmp(left, right, case_cmp); } diff --git a/libc/src/strings/strncasecmp.cpp b/libc/src/strings/strncasecmp.cpp index 9c2f0ab131269..a5926495a3e22 100644 --- a/libc/src/strings/strncasecmp.cpp +++ b/libc/src/strings/strncasecmp.cpp @@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strncasecmp, (const char *left, const char *right, size_t n)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strncmp(left, right, n, case_cmp); } diff --git a/libc/src/strings/strncasecmp_l.cpp b/libc/src/strings/strncasecmp_l.cpp index 91ac7e5e89107..a828f609fd9e8 100644 --- a/libc/src/strings/strncasecmp_l.cpp +++ b/libc/src/strings/strncasecmp_l.cpp @@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, strncasecmp_l, (const char *left, const char *right, size_t n, locale_t)) { auto case_cmp = [](char a, char b) { - return LIBC_NAMESPACE::internal::tolower(a) - - LIBC_NAMESPACE::internal::tolower(b); + return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) - + static_cast<int>(LIBC_NAMESPACE::internal::tolower(b)); }; return inline_strncmp(left, right, n, case_cmp); } diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index ec942e38d1af5..4d647c22c3239 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -245,3 +245,11 @@ add_entrypoint_object( DEPENDS .${LIBC_TARGET_OS}.clock_getres ) + +add_entrypoint_object( + clock_settime + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.clock_settime +) + diff --git a/libc/src/time/baremetal/CMakeLists.txt b/libc/src/time/baremetal/CMakeLists.txt index cbe9cf3db3e21..7a5bad3311cd5 100644 --- a/libc/src/time/baremetal/CMakeLists.txt +++ b/libc/src/time/baremetal/CMakeLists.txt @@ -24,11 +24,11 @@ add_entrypoint_object( localtime SRCS localtime.cpp + ../time_utils.cpp HDRS ../localtime.h - time_utils.h + ../time_utils.h DEPENDS - .time_utils libc.hdr.types.struct_tm libc.hdr.types.time_t ) @@ -37,11 +37,11 @@ add_entrypoint_object( localtime_r SRCS localtime_r.cpp + ../time_utils.cpp HDRS ../localtime.h - time_utils.h + ../time_utils.h DEPENDS - .time_utils libc.hdr.types.struct_tm libc.hdr.types.time_t ) diff --git a/libc/src/time/clock_settime.h b/libc/src/time/clock_settime.h new file mode 100644 index 0000000000000..9321dd1074101 --- /dev/null +++ b/libc/src/time/clock_settime.h @@ -0,0 +1,22 @@ +//===-- Implementation header for clock_settime function --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int clock_settime(clockid_t clockid, const timespec *tp); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt index a6ec7c7c06963..6ea04597063cb 100644 --- a/libc/src/time/linux/CMakeLists.txt +++ b/libc/src/time/linux/CMakeLists.txt @@ -54,6 +54,19 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + clock_settime + SRCS + clock_settime.cpp + HDRS + ../clock_settime.h + DEPENDS + libc.hdr.types.clockid_t + libc.hdr.types.struct_timespec + libc.src.__support.time.clock_settime + libc.src.errno.errno +) + add_entrypoint_object( gettimeofday SRCS diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index c38697cd0668e..c560bd10be83c 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(clock_t, clock, ()) { using namespace time_units; - struct timespec ts; + timespec ts; auto result = internal::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (!result.has_value()) { libc_errno = result.error(); diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index b3fcd2b22f9da..52ace2a743dd4 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -15,8 +15,7 @@ namespace LIBC_NAMESPACE_DECL { // TODO(michaelrj): Move this into time/linux with the other syscalls. -LLVM_LIBC_FUNCTION(int, clock_gettime, - (clockid_t clockid, struct timespec *ts)) { +LLVM_LIBC_FUNCTION(int, clock_gettime, (clockid_t clockid, timespec *ts)) { auto result = internal::clock_gettime(clockid, ts); // A negative return value indicates an error with the magnitude of the diff --git a/libc/src/time/linux/clock_settime.cpp b/libc/src/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..3c582cf0b4646 --- /dev/null +++ b/libc/src/time/linux/clock_settime.cpp @@ -0,0 +1,30 @@ +//===---------- Linux implementation of the POSIX clock_settime function --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/time/clock_settime.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/time/clock_settime.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, clock_settime, + (clockid_t clockid, const timespec *ts)) { + auto result = internal::clock_settime(clockid, ts); + + // A negative return value indicates an error with the magnitude of the + // value being the error code. 
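+ // For example, if the raw syscall returns -EINVAL, result.error() here is
+ // EINVAL; it is stored in libc_errno and -1 is returned, the failure value
+ // POSIX specifies for clock_settime.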
+ if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index e5df1585df988..a30b97de40492 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -18,8 +18,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, nanosleep, - (const struct timespec *req, struct timespec *rem)) { +LLVM_LIBC_FUNCTION(int, nanosleep, (const timespec *req, timespec *rem)) { #if SYS_nanosleep int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_nanosleep, req, rem); #elif defined(SYS_clock_nanosleep_time64) diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index a4d4372332732..031cb9f83b1c3 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, timespec_get, (struct timespec * ts, int base)) { +LLVM_LIBC_FUNCTION(int, timespec_get, (timespec * ts, int base)) { clockid_t clockid; switch (base) { case TIME_UTC: diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index 89b7d9bb7c1b9..ff8c05a0b07da 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -23,10 +23,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/CMakeLists.txt b/libc/src/time/strftime_core/CMakeLists.txt index 3ffd283ead7fe..a9aa573cc9a63 100644 --- a/libc/src/time/strftime_core/CMakeLists.txt +++ b/libc/src/time/strftime_core/CMakeLists.txt @@ -43,6 +43,7 @@ add_header_library( .core_structs .parser .converter + libc.src.__support.error_or libc.src.stdio.printf_core.writer libc.hdr.types.struct_tm ) diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..855a44107914c 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_STRFTIME_MAIN_H #include "hdr/types/struct_tm.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/writer.h" #include "src/time/strftime_core/converter.h" @@ -20,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace strftime_core { template <printf_core::WriteMode write_mode> -int strftime_main(printf_core::Writer<write_mode> *writer, - const char *__restrict str, const tm *timeptr) { +ErrorOr<size_t> strftime_main(printf_core::Writer<write_mode> *writer, + const char *__restrict str, const tm *timeptr) { Parser parser(str); int result = 0; for (strftime_core::FormatSection cur_section = parser.get_next_section(); @@ -33,7 +34,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, result = writer->write(cur_section.raw_string); if (result < 0) - return result; + return Error(-result); } return writer->get_chars_written(); diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp index 409f8683b7289..2ec90634ea347 100644 --- a/libc/src/time/strftime_l.cpp +++ b/libc/src/time/strftime_l.cpp @@ -26,10 +26,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..b7444a4722b0d 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -69,6 +76,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.fchdir ) +add_entrypoint_object( + fchown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.fchown +) + add_entrypoint_object( fork ALIAS @@ -160,6 +174,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/fchown.h b/libc/src/unistd/fchown.h new file mode 100644 index 0000000000000..9ea44426568cc --- /dev/null +++ b/libc/src/unistd/fchown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for fchown ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_FCHOWN_H +#define LLVM_LIBC_SRC_UNISTD_FCHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int fchown(int fildes, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_FCHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c45b6ef1c5d80 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -106,6 +120,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + fchown + SRCS + fchown.cpp + HDRS + ../fchown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( fork SRCS @@ -276,6 +304,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/fchown.cpp b/libc/src/unistd/linux/fchown.cpp new file mode 100644 index 0000000000000..9cf3d139050c1 --- /dev/null +++ b/libc/src/unistd/linux/fchown.cpp @@ -0,0 +1,31 @@ +//===-- Linux implementation of fchown ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/fchown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/common.h" + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, fchown, (int fildes, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fchown, fildes, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index adde382bf0950..ba27cd77f6bac 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -63,7 +63,7 @@ add_entrypoint_object( wcstol.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -74,7 +74,7 @@ add_entrypoint_object( wcstoll.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -85,7 +85,7 @@ add_entrypoint_object( wcstoul.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( @@ -96,7 +96,7 @@ add_entrypoint_object( wcstoull.h DEPENDS libc.src.errno.errno - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_entrypoint_object( diff --git a/libc/src/wchar/wcstol.cpp b/libc/src/wchar/wcstol.cpp index a05718f706dfd..a56b5f91272cd 100644 --- a/libc/src/wchar/wcstol.cpp +++ b/libc/src/wchar/wcstol.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(long, wcstol, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<long>(str, base); + auto result = internal::strtointeger<long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoll.cpp b/libc/src/wchar/wcstoll.cpp index de1299d681cdb..6229d24172b51 100644 --- a/libc/src/wchar/wcstoll.cpp +++ b/libc/src/wchar/wcstoll.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { 
LLVM_LIBC_FUNCTION(long long, wcstoll, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<long long>(str, base); + auto result = internal::strtointeger<long long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoul.cpp b/libc/src/wchar/wcstoul.cpp index 79b8c9b5c9fa3..c5639bee1d649 100644 --- a/libc/src/wchar/wcstoul.cpp +++ b/libc/src/wchar/wcstoul.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long, wcstoul, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<unsigned long>(str, base); + auto result = internal::strtointeger<unsigned long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wchar/wcstoull.cpp b/libc/src/wchar/wcstoull.cpp index 768e03c4bd189..2ab24e9b2b2a1 100644 --- a/libc/src/wchar/wcstoull.cpp +++ b/libc/src/wchar/wcstoull.cpp @@ -10,14 +10,14 @@ #include "src/__support/common.h" #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(unsigned long long, wcstoull, (const wchar_t *__restrict str, wchar_t **__restrict str_end, int base)) { - auto result = internal::wcstointeger<unsigned long long>(str, base); + auto result = internal::strtointeger<unsigned long long>(str, base); if (result.has_error()) libc_errno = result.error; diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp index 09f55d391dbff..01ceac8f68b23 100644 --- a/libc/src/wctype/iswalpha.cpp +++ b/libc/src/wctype/iswalpha.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { return internal::iswalpha(c); } +LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { + return internal::isalpha(static_cast<wchar_t>(c)); +} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/startup/baremetal/arm/start.cpp b/libc/startup/baremetal/arm/start.cpp index c089a14f5f782..db89828a0b45e 100644 --- a/libc/startup/baremetal/arm/start.cpp +++ b/libc/startup/baremetal/arm/start.cpp @@ -131,6 +131,41 @@ namespace LIBC_NAMESPACE_DECL { __arm_wsr("CPSR_c", 0x13); // SVC #endif +#if __ARM_ARCH_PROFILE == 'M' && \ + (defined(__ARM_FP) || defined(__ARM_FEATURE_MVE)) + // Enable FPU and MVE. They can't be enabled independently: the two are + // governed by the same bits in CPACR. + // Based on + // https://developer.arm.com/documentation/dui0646/c/Cortex-M7-Peripherals/Floating-Point-Unit/Enabling-the-FPU + // Set CPACR cp10 and cp11. + auto cpacr = reinterpret_cast<volatile uint32_t *const>(0xE000ED88); + *cpacr |= (0xF << 20); + __dsb(0xF); + __isb(0xF); +#if defined(__ARM_FEATURE_MVE) + // Initialize low-overhead-loop tail predication to its neutral state + uint32_t fpscr; + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(fpscr) : :); + fpscr |= (0x4 << 16); + __asm__ __volatile__("vmsr FPSCR, %0" : : "r"(fpscr) :); +#endif +#elif (__ARM_ARCH_PROFILE == 'A' || __ARM_ARCH_PROFILE == 'R') && \ + defined(__ARM_FP) + // Enable FPU. 
+ // Based on + // https://developer.arm.com/documentation/dui0472/m/Compiler-Coding-Practices/Enabling-NEON-and-FPU-for-bare-metal + // Set CPACR cp10 and cp11. + uint32_t cpacr = __arm_rsr("p15:0:c1:c0:2"); + cpacr |= (0xF << 20); + __arm_wsr("p15:0:c1:c0:2", cpacr); + __isb(0xF); + // Set FPEXC.EN + uint32_t fpexc; + __asm__ __volatile__("vmrs %0, FPEXC" : "=r"(fpexc) : :); + fpexc |= (0x1 << 30); + __asm__ __volatile__("vmsr FPEXC, %0" : : "r"(fpexc) :); +#endif + // Perform the equivalent of scatterloading LIBC_NAMESPACE::memcpy(__data_start, __data_source, reinterpret_cast<uintptr_t>(__data_size)); diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt index 235e9fe2f55ee..d0752ea178429 100644 --- a/libc/test/IntegrationTest/CMakeLists.txt +++ b/libc/test/IntegrationTest/CMakeLists.txt @@ -14,5 +14,6 @@ add_object_library( libc.hdr.stdint_proxy libc.src.__support.OSUtil.osutil libc.src.__support.CPP.atomic + libc.src.__support.macros.properties.architectures ${arch_specific_deps} ) diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h index 4a03f7aa6318b..9f5a3dfb3583c 100644 --- a/libc/test/IntegrationTest/test.h +++ b/libc/test/IntegrationTest/test.h @@ -11,6 +11,7 @@ #include "src/__support/OSUtil/exit.h" #include "src/__support/OSUtil/io.h" +#include "src/__support/macros/properties/architectures.h" #define __AS_STRING(val) #val #define __CHECK_TRUE(file, line, val, should_exit) \ @@ -68,9 +69,15 @@ //////////////////////////////////////////////////////////////////////////////// // Errno checks. +#ifdef LIBC_TARGET_ARCH_IS_GPU +#define ASSERT_ERRNO_EQ(VAL) +#define ASSERT_ERRNO_SUCCESS() +#define ASSERT_ERRNO_FAILURE() +#else #define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(errno)) #define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(errno)) #define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(errno)) +#endif // Integration tests are compiled with -ffreestanding which stops treating // the main function as a non-overloadable special function. Hence, we use a diff --git a/libc/test/UnitTest/BazelFilePath.cpp b/libc/test/UnitTest/BazelFilePath.cpp index ee5fcaaa63d91..7f9f42b46dca9 100644 --- a/libc/test/UnitTest/BazelFilePath.cpp +++ b/libc/test/UnitTest/BazelFilePath.cpp @@ -20,6 +20,10 @@ namespace testing { CString libc_make_test_file_path_func(const char *file_name) { // This is the path to the folder bazel wants the test outputs written to. const char *UNDECLARED_OUTPUTS_PATH = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); + // Do something sensible if not run under bazel, otherwise this may segfault + // when constructing the string. 
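// The guard below is the usual null-safe getenv pattern: getenv returns
// nullptr when the variable is unset, and the string constructor is not
// required to tolerate that. As a reusable sketch (getenv_or is a
// hypothetical name, not a helper in this tree):
auto getenv_or = [](const char *name, const char *fallback) {
  const char *val = ::getenv(name);
  return val != nullptr ? val : fallback; // never hand nullptr to a string
};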
+ if (UNDECLARED_OUTPUTS_PATH == nullptr) + UNDECLARED_OUTPUTS_PATH = ""; return cpp::string(UNDECLARED_OUTPUTS_PATH) + file_name; } diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 31d1e9dce8204..54e41ece5f4d9 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -83,7 +83,7 @@ add_unittest_framework_library( ) set(libc_death_test_srcs LibcDeathTestExecutors.cpp) -if(${LIBC_TARGET_OS} STREQUAL "linux") +if(${LIBC_TARGET_OS} STREQUAL "linux" OR ${LIBC_TARGET_OS} STREQUAL "darwin") list(APPEND libc_death_test_srcs ExecuteFunctionUnix.cpp) endif() @@ -204,5 +204,6 @@ add_header_library( ErrnoCheckingTest.h DEPENDS libc.src.__support.common + libc.src.__support.macros.properties.architectures libc.src.errno.errno ) diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h index 5b1bc9441d830..111d812c58612 100644 --- a/libc/test/UnitTest/ErrnoCheckingTest.h +++ b/libc/test/UnitTest/ErrnoCheckingTest.h @@ -11,11 +11,17 @@ #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/Test.h" // Define macro to validate the value stored in the errno and restore it // to zero. +#ifdef LIBC_TARGET_ARCH_IS_GPU +#define ASSERT_ERRNO_EQ(VAL) +#define ASSERT_ERRNO_SUCCESS() +#define ASSERT_ERRNO_FAILURE() +#else #define ASSERT_ERRNO_EQ(VAL) \ do { \ ASSERT_EQ(VAL, static_cast<int>(libc_errno)); \ @@ -27,6 +33,7 @@ ASSERT_NE(0, static_cast<int>(libc_errno)); \ libc_errno = 0; \ } while (0) +#endif namespace LIBC_NAMESPACE_DECL { namespace testing { diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); diff --git a/libc/test/UnitTest/MemoryMatcher.cpp b/libc/test/UnitTest/MemoryMatcher.cpp index 6e375768e9333..405f226798f7a 100644 --- a/libc/test/UnitTest/MemoryMatcher.cpp +++ b/libc/test/UnitTest/MemoryMatcher.cpp @@ -41,8 +41,8 @@ bool MemoryMatcher::match(MemoryView actualValue) { static void display(char C) { const auto print = [](unsigned char i) { - tlog << static_cast<char>(LIBC_NAMESPACE::internal::toupper( - LIBC_NAMESPACE::internal::int_to_b36_char(i))); + tlog << LIBC_NAMESPACE::internal::toupper( + LIBC_NAMESPACE::internal::int_to_b36_char(i)); }; print(static_cast<unsigned char>(C) / 16); print(static_cast<unsigned char>(C) & 15); diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index aede395350821..762b5b0417ef6 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -44,6 +44,7 @@ add_fp_unittest( libc.src.__support.math.exp2f libc.src.__support.math.exp2f16 libc.src.__support.math.exp2m1f + libc.src.__support.math.exp2m1f16 libc.src.__support.math.exp10 libc.src.__support.math.exp10f libc.src.__support.math.exp10f16 diff --git 
a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index a6825a10654c9..5b409781a5b07 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -29,6 +29,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) { EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp10m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp2f16(0.0f16)); + EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp2m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16)); ASSERT_FP_EQ(float16(8 << 5), LIBC_NAMESPACE::shared::ldexpf16(8.0f16, 5)); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index a02514106a307..138866b4cc869 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -151,7 +151,7 @@ add_libc_test( wcs_to_integer_test.cpp DEPENDS libc.src.__support.integer_literals - libc.src.__support.wcs_to_integer + libc.src.__support.str_to_integer ) add_libc_test( diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 1ec882b212b8a..e5ac1d6cbb7b3 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -49,12 +49,14 @@ TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::strtointeger<int>(" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. + char buf[5] = {' ', ' ', ' ', ' ', ' '}; + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::strtointeger<int>(" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp index 4554968be67ce..38af778ca2440 100644 --- a/libc/test/src/__support/wcs_to_integer_test.cpp +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/wcs_to_integer.h" +#include "src/__support/str_to_integer.h" #include <stddef.h> #include "test/UnitTest/Test.h" @@ -14,224 +14,226 @@ // This file is for testing the src_len argument and other internal interface // features. Primary testing is done through the public interface. 
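// Note on the renames below (an inference from this patch, not a verified API
// contract): wcs_to_integer.h has been folded into str_to_integer.h, so
// internal::strtointeger is evidently templated over the character type and
// now backs both the narrow and the wide parsers. A minimal sketch of the
// shared entry point:
TEST(LlvmLibcWcsToIntegerTest, NarrowAndWideAgree) {
  auto narrow = LIBC_NAMESPACE::internal::strtointeger<int>("42", 10, 2);
  auto wide = LIBC_NAMESPACE::internal::strtointeger<int>(L"42", 10, 2);
  EXPECT_FALSE(narrow.has_error());
  EXPECT_FALSE(wide.has_error());
  ASSERT_EQ(narrow.value, wide.value); // both parse the same 42
}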
-TEST(LlvmLibcStrToIntegerTest, SimpleLength) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, SimpleLength) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(2)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { +TEST(LlvmLibcWcsToIntegerTest, LeadingSpaces) { auto result = - LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 15); + LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 15); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" 12345", 10, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 5); + // Use a non-null-terminated buffer to test for possible OOB access. 
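// Why the missing NUL matters (hedged): a parser that honors src_len == 5
// only ever reads buf[0] .. buf[4]; an implementation that scans for a
// terminating NUL instead would read buf[5] and beyond, which a sanitizer
// build of this test would flag. The invariant, as a sketch:
//   reads allowed:  buf[0] .. buf[src_len - 1]
//   never allowed:  buf[src_len]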
+ wchar_t buf[5] = {L' ', L' ', L' ', L' ', L' '}; + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" 12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(buf, 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, LeadingSign) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 10); +TEST(LlvmLibcWcsToIntegerTest, LeadingSign) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 10); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, -12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 3); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, 12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 3); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 3); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); ASSERT_EQ(result.value, -12); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"+12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"+12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"-12345", 10, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"-12345", 10, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = 
LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 5); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 10); +TEST(LlvmLibcWcsToIntegerTest, Base16PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); ASSERT_EQ(result.value, 0x12345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 5); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 5); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); ASSERT_EQ(result.value, 0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 2); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 2); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"0x12345", 16, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"0x12345", 16, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixAutoSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 4); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, 
ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 0, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 0, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) { - auto result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 10); +TEST(LlvmLibcWcsToIntegerTest, Base8PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 6); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 6); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 012345); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 4); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 4); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); ASSERT_EQ(result.value, 0123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 1); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 1); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); ASSERT_EQ(result.value, 0); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L"012345", 8, 0); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L"012345", 8, 0); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); ASSERT_EQ(result.value, 0); } -TEST(LlvmLibcStrToIntegerTest, CombinedTests) { +TEST(LlvmLibcWcsToIntegerTest, CombinedTests) { auto result = - LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 10); + LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 10); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); ASSERT_EQ(result.value, -0x123); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 8); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 8); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(8)); ASSERT_EQ(result.value, -0x1); - result = LIBC_NAMESPACE::internal::wcstointeger<int>(L" -0x123", 0, 7); + result = LIBC_NAMESPACE::internal::strtointeger<int>(L" -0x123", 0, 7); EXPECT_FALSE(result.has_error()); EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); ASSERT_EQ(result.value, 0); diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp index f877171abb9a3..e4e5f5cefd954 100644 --- a/libc/test/src/ctype/islower_test.cpp +++ b/libc/test/src/ctype/islower_test.cpp @@ -40,7 +40,7 @@ TEST(LlvmLibcIsLower, SimpleTest) { } TEST(LlvmLibcIsLower, DefaultLocale) { - // Loops through all characters, verifying that numbers and letters + // Loops through all characters, verifying that only lowercase letters // return non-zero integer and everything else returns a zero. 
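// The range below deliberately includes negative inputs: none of them can
// appear in LOWER_ARRAY, so in_span() is false and the loop pins islower to
// return 0 for every ch < 0. One spelled-out instance of what the loop
// asserts (illustrative only):
//   EXPECT_EQ(LIBC_NAMESPACE::islower(-1), 0); // negative, not lowercase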
for (int ch = -255; ch < 255; ++ch) { if (in_span(ch, LOWER_ARRAY)) diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 84feb34e537a0..d008aea54b425 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = - LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_RDLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 50; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} - -TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_WRLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 0; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} +/* Tests that are common between OFD and traditional variants of fcntl locks. */ +template <int GETLK_CMD, int SETLK_CMD> +class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest { +public: + void GetLkRead() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked. 
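// Background for the assertion above (hedged summary of POSIX semantics):
// F_GETLK rewrites the passed flock in place. If nothing would block the
// probe, the kernel sets l_type to F_UNLCK; otherwise it fills in a
// description of the conflicting lock. That is why the check reads svflk,
// the kernel-modified copy: the pre-refactor version inspected flk, which
// fcntl never touched, so that assertion could not fail. The probe pattern
// in isolation:
struct flock probe = {};
probe.l_type = F_WRLCK; // ask: would an exclusive lock succeed?
ASSERT_GT(LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &probe), -1);
ASSERT_EQ(static_cast<int>(probe.l_type), F_UNLCK); // nothing conflicts here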
+ + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void GetLkWrite() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_WRLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 0; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void UseAfterClose() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = + "testdata/fcntl_use_after_close.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + flock flk = {}; + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk)); + ASSERT_ERRNO_EQ(EBADF); + } +}; + +#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD) \ + using NAME = LibcFcntlCommonLockTests<GETLK_CMD, SETLK_CMD>; \ + TEST_F(NAME, GetLkRead) { GetLkRead(); } \ + TEST_F(NAME, GetLkWrite) { GetLkWrite(); } \ + TEST_F(NAME, UseAfterClose) { UseAfterClose(); } \ + static_assert(true, "Require semicolon.") + +COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK); +COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK, + F_OFD_SETLK); TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..a39428fb8d16c 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,9 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher + libc.src.__support.macros.properties.architectures ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..b035b6d9bd45d 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,10 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "src/__support/macros/properties/architectures.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +35,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = 
libc_make_test_file_path(FILENAME); @@ -78,6 +84,26 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) && \ + !defined(LIBC_TARGET_ARCH_IS_GPU) +TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, 
StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + 
ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..95507e0885dbf 100644 --- a/libc/test/src/stdio/snprintf_test.cpp 
+++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); +} diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..0e003f5de5bee 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 0eb373c3fa061..42e8faa3fd69f 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -187,6 +187,7 @@ add_header_library( DEPENDS libc.src.__support.CPP.type_traits libc.src.__support.FPUtil.fp_bits + 
libc.src.__support.macros.properties.architectures ) add_libc_test( diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..3dacfca9e89f9 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,9 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/properties/architectures.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +18,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +484,19 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + // https://github.com/llvm/llvm-project/issues/166795 + void charsWrittenOverflow(FunctionT func) { +#ifndef LIBC_TARGET_ARCH_IS_RISCV32 + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. + int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); + + EXPECT_LT(result, 0); + ASSERT_ERRNO_FAILURE(); +#endif + } }; #define STRFROM_TEST(InputType, name, func) \ @@ -501,4 +517,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ + TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ + charsWrittenOverflow(func); \ + } diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c785..3a7da1fa85ac7 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -177,8 +177,8 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { char small_string[4] = {'\0', '\0', '\0', '\0'}; for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); if (first_digit < base) { ASSERT_EQ(func(small_string, nullptr, base), static_cast<ReturnT>(first_digit)); @@ -192,11 +192,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); if (first_digit < base && second_digit < base) { ASSERT_EQ( func(small_string, nullptr, base), @@ -216,14 +216,14 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; 
first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); for (int third_digit = 0; third_digit <= limit; ++third_digit) { - small_string[2] = static_cast<char>( - LIBC_NAMESPACE::internal::int_to_b36_char(third_digit)); + small_string[2] = + LIBC_NAMESPACE::internal::int_to_b36_char(third_digit); if (first_digit < base && second_digit < base && third_digit < base) { diff --git a/libc/test/src/string/memchr_test.cpp b/libc/test/src/string/memchr_test.cpp index ede841118fe03..a92c5fe80be98 100644 --- a/libc/test/src/string/memchr_test.cpp +++ b/libc/test/src/string/memchr_test.cpp @@ -21,6 +21,11 @@ const char *call_memchr(const void *src, int c, size_t size) { return reinterpret_cast<const char *>(LIBC_NAMESPACE::memchr(src, c, size)); } +TEST(LlvmLibcMemChrTest, WideReadMultiIteration) { + const char *src = "abcdefghijklmnopqrst$\n"; + ASSERT_STREQ(call_memchr(src, '$', 22), "$\n"); +} + TEST(LlvmLibcMemChrTest, FindsCharacterAfterNullTerminator) { // memchr should continue searching after a null terminator. const size_t size = 5; diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp index 4eb9d47e9209d..784dd7b194b3f 100644 --- a/libc/test/src/string/strlen_test.cpp +++ b/libc/test/src/string/strlen_test.cpp @@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) { size_t result = LIBC_NAMESPACE::strlen(any); ASSERT_EQ((size_t)12, result); } + +TEST(LlvmLibcStrLenTest, DataAfterNulString) { + constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)6, result); +} + +TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) { + constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)2, result); +} diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt index 03e5428292418..c8e113f06d50b 100644 --- a/libc/test/src/time/CMakeLists.txt +++ b/libc/test/src/time/CMakeLists.txt @@ -124,6 +124,21 @@ add_libc_test( libc.src.time.clock_getres ) +add_libc_test( + clock_settime_test + SUITE + libc_time_unittests + SRCS + clock_settime_test.cpp + DEPENDS + libc.src.time.clock_settime + libc.hdr.types.time_t + libc.hdr.types.struct_timespec + libc.hdr.time_macros + libc.hdr.errno_macros + libc.test.UnitTest.ErrnoCheckingTest +) + add_libc_unittest( difftime_test SUITE diff --git a/libc/test/src/time/clock_settime_test.cpp b/libc/test/src/time/clock_settime_test.cpp new file mode 100644 index 0000000000000..ccbad9ed2e847 --- /dev/null +++ b/libc/test/src/time/clock_settime_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for clock_settime ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/time_macros.h" +#include "hdr/types/struct_timespec.h" +#include "src/time/clock_settime.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcClockSetTime = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +#ifdef CLOCK_MONOTONIC +TEST_F(LlvmLibcClockSetTime, MonotonicIsNotSettable) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_MONOTONIC, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} +#endif // CLOCK_MONOTONIC + +TEST_F(LlvmLibcClockSetTime, InvalidClockId) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(static_cast<clockid_t>(-1), &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, InvalidTimespecNsec) { + timespec ts = {0, 1000000000L}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, NullPointerIsEFAULT) { + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, nullptr); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EFAULT); +} + +TEST_F(LlvmLibcClockSetTime, ClockIsSet) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + if (result == 0) { + ASSERT_ERRNO_SUCCESS(); + } else { + ASSERT_ERRNO_EQ(EPERM); + } +} diff --git a/libc/test/src/time/strftime_test.cpp b/libc/test/src/time/strftime_test.cpp index 38176f77804d5..5222152791905 100644 --- a/libc/test/src/time/strftime_test.cpp +++ b/libc/test/src/time/strftime_test.cpp @@ -2329,20 +2329,21 @@ TEST(LlvmLibcStrftimeTest, TimeFormatFullDateTime) { TEST(LlvmLibcStrftimeTest, BufferTooSmall) { struct tm time; - char buffer[1]; + char tiny_buffer[1]; time.tm_year = get_adjusted_year(2025); time.tm_mon = 10; time.tm_mday = 24; size_t written = - LIBC_NAMESPACE::strftime(buffer, sizeof(buffer), "%F", &time); + LIBC_NAMESPACE::strftime(tiny_buffer, sizeof(tiny_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); - char buffer2[10]; + char small_buffer[10]; // The string "2025-11-24" is 10 chars, // so strftime needs 10 + 1 bytes to write the string and the null terminator. 
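// Two things worth noting here (hedged): first, the rename is not cosmetic.
// The old code passed `buffer` (1 byte) with `sizeof(buffer2)` (10), an
// out-of-bounds write in the making; distinct names keep that mismatch from
// sneaking back in. Second, on semantics: when the formatted result plus its
// NUL terminator does not fit, strftime signals failure by returning 0. The
// fitting case, for contrast (a sketch; %F reads only the date fields set
// above):
char big_buffer[11]; // "2025-11-24" is 10 chars + 1 for the terminator
size_t written_ok =
    LIBC_NAMESPACE::strftime(big_buffer, sizeof(big_buffer), "%F", &time);
EXPECT_EQ(written_ok, size_t{10});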
- written = LIBC_NAMESPACE::strftime(buffer, sizeof(buffer2), "%F", &time); + written = + LIBC_NAMESPACE::strftime(small_buffer, sizeof(small_buffer), "%F", &time); EXPECT_EQ(written, size_t{0}); } diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..3012ea9a466f4 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -126,6 +146,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + fchown_test + SUITE + libc_unistd_unittests + SRCS + fchown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.fchown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( ftruncate_test SUITE @@ -437,6 +477,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. 
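// Design note (hedged): chown-ing the file to the caller's own uid/gid is
// the only case an unprivileged test can count on. Changing the owner to any
// other uid requires CAP_CHOWN, so for a regular user the failure path would
// look like this (sketch only, since CI may run the suite as root):
//   ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, 0, 0), Fails(EPERM));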
+ ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/libc/test/src/unistd/fchown_test.cpp b/libc/test/src/unistd/fchown_test.cpp new file mode 100644 index 0000000000000..7954410afb929 --- /dev/null +++ b/libc/test/src/unistd/fchown_test.cpp @@ -0,0 +1,50 @@ +//===-- Unittests for fchown ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/close.h" +#include "src/unistd/fchown.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcFchownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcFchownTest, FchownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "fchown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::fchown(write_fd, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcFchownTest, FchownInvalidFileDescriptor) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::fchown(-1, 1000, 1000), Fails(EBADF)); +} diff --git a/libc/test/src/unistd/getgid_test.cpp b/libc/test/src/unistd/getgid_test.cpp new file mode 100644 index 0000000000000..77dbad2f18e00 --- /dev/null +++ b/libc/test/src/unistd/getgid_test.cpp @@ -0,0 +1,15 @@ +//===-- Unittests for getgid ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcGetGidTest, SmokeTest) { + // getgid always succeeds. So, we just call it as a smoke test. 
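// Rationale (hedged): POSIX specifies that getgid() shall always be
// successful, with no return value reserved for errors, so a smoke test is
// all that can be asserted portably. The most one could add is a stability
// check:
gid_t gid_first = LIBC_NAMESPACE::getgid();
gid_t gid_second = LIBC_NAMESPACE::getgid();
ASSERT_EQ(gid_first, gid_second); // stable while the process keeps its gid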
+ LIBC_NAMESPACE::getgid(); +} diff --git a/libc/test/src/wchar/WcstolTest.h b/libc/test/src/wchar/WcstolTest.h index 4d5b752e62238..cadf9e0c42b90 100644 --- a/libc/test/src/wchar/WcstolTest.h +++ b/libc/test/src/wchar/WcstolTest.h @@ -178,8 +178,8 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { wchar_t small_string[4] = {L'\0', L'\0', L'\0', L'\0'}; for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); if (first_digit < base) { ASSERT_EQ(func(small_string, nullptr, base), static_cast<ReturnT>(first_digit)); @@ -193,11 +193,11 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit); if (first_digit < base && second_digit < base) { ASSERT_EQ( func(small_string, nullptr, base), @@ -217,14 +217,14 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit)); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit)); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit); for (int third_digit = 0; third_digit <= limit; ++third_digit) { - small_string[2] = static_cast<wchar_t>( - LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit)); + small_string[2] = + LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit); if (first_digit < base && second_digit < base && third_digit < base) { diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h similarity index 100% rename from libclc/clc/include/clc/math/clc_cbrt.inc rename to libclc/clc/include/clc/math/clc_cbrt.h diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc index 14a09b1f09f5c..75561430b33ad 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -21,47 +21,50 @@ #ifdef __CLC_HAS_ATOMIC -#ifndef __CLC_PTR_CASTTYPE -#define __CLC_PTR_CASTTYPE __CLC_GENTYPE +#ifndef __CLC_CASTTYPE +#define __CLC_CASTTYPE __CLC_GENTYPE #endif #ifndef __CLC_AS_RETTYPE #define __CLC_AS_RETTYPE(x) x #endif +#ifndef __CLC_AS_CASTTYPE +#define __CLC_AS_CASTTYPE(x) x +#endif + #ifdef __CLC_NO_VALUE_ARG #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \ - (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ + 
(ADDRSPACE __CLC_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ } #elif defined(__CLC_INC_DEC) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ - return __CLC_AS_RETTYPE( \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, \ - (__CLC_GENTYPE)1, MemoryOrder, MemoryScope)); \ + return __CLC_IMPL_FUNCTION(Ptr, (__CLC_GENTYPE)1, MemoryOrder, \ + MemoryScope); \ } #elif defined(__CLC_RETURN_VOID) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF void __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope) { \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ - MemoryOrder, MemoryScope); \ + __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_CASTTYPE *)Ptr, \ + __CLC_AS_CASTTYPE(Value), MemoryOrder, MemoryScope); \ } #else #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope) { \ - return __CLC_AS_RETTYPE( \ - __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ - MemoryOrder, MemoryScope)); \ + return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \ + (ADDRSPACE __CLC_CASTTYPE *)Ptr, __CLC_AS_CASTTYPE(Value), \ + MemoryOrder, MemoryScope)); \ } #endif diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl index ee80256d3dbb6..b2c26758103cd 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl @@ -14,10 +14,12 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE +#undef __CLC_CASTTYPE #undef __CLC_AS_RETTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#undef __CLC_AS_CASTTYPE +#define __CLC_CASTTYPE __CLC_BIT_INTN #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) +#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE #define __CLC_BODY <clc_atomic_def.inc> #include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl index f7fe2510569e4..af808553a7110 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl @@ -15,9 +15,9 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE +#undef __CLC_CASTTYPE #undef __CLC_AS_RETTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_CASTTYPE __CLC_BIT_INTN #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) #define __CLC_BODY <clc_atomic_def.inc> diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl index a93d21e8430ce..66ae2ba98556d 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl @@ -15,8 +15,10 @@ #define __CLC_BODY <clc_atomic_def.inc> #include <clc/integer/gentype.inc> -#undef __CLC_PTR_CASTTYPE -#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#undef __CLC_CASTTYPE +#undef __CLC_AS_CASTTYPE +#define __CLC_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE #define __CLC_BODY <clc_atomic_def.inc> #include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl index 105f6329d5bad..935b7b7eae78c 100644 --- 
a/libclc/clc/lib/generic/math/clc_cbrt.cl +++ b/libclc/clc/lib/generic/math/clc_cbrt.cl @@ -8,6 +8,7 @@ #include <clc/clc_convert.h> #include <clc/internal/clc.h> +#include <clc/math/clc_cbrt.h> #include <clc/math/clc_copysign.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_fma.h> diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl index 0d670150ed4c9..7de61436522b3 100644 --- a/libclc/opencl/lib/generic/math/cbrt.cl +++ b/libclc/opencl/lib/generic/math/cbrt.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include <clc/math/clc_cbrt.inc> +#include <clc/math/clc_cbrt.h> #include <clc/opencl/math/cbrt.h> #define __CLC_FUNCTION cbrt diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 8fba6db871f08..756bdf71f8b22 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -426,6 +426,10 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_algorithms`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_map`` ``202502L`` + ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_flat_set`` ``202502L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_forward_list`` ``202502L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_list`` ``202502L`` @@ -474,7 +478,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_is_virtual_base_of`` ``202406L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_is_within_lifetime`` *unimplemented* + ``__cpp_lib_is_within_lifetime`` ``202306L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_linalg`` *unimplemented* ---------------------------------------------------------- ----------------- @@ -482,6 +486,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_not_fn`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_optional`` ``202506L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_optional_range_support`` ``202406L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_out_ptr`` ``202311L`` diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 980390c4fe3d7..2c19dfc57a3f8 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -40,9 +40,12 @@ Implemented Papers - P2321R2: ``zip`` (`Github <https://llvm.org/PR105169>`__) (The paper is partially implemented. 
``zip_transform_view`` is implemented in this release) +- P2988R12: ``std::optional<T&>`` (`Github <https://llvm.org/PR148131>`__) - P3044R2: sub-``string_view`` from ``string`` (`Github <https://llvm.org/PR148140>`__) - P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__) - P3060R3: Add ``std::views::indices(n)`` (`Github <https://llvm.org/PR148175>`__) +- P2641R4: Checking if a ``union`` alternative is active (``std::is_within_lifetime``) + (`Github <https://llvm.org/PR105381>`__) - P2835R7: Expose ``std::atomic_ref``'s object address (`Github <https://llvm.org/PR118377>`__) - P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://llvm.org/PR105424>`__) - P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__) @@ -64,8 +67,8 @@ Improvements and New Features by up to 2.5x - The performance of ``erase(iterator, iterator)`` in the unordered containers has been improved by up to 1.9x - The performance of ``map::insert_or_assign`` has been improved by up to 2x -- ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them - in chunks into a buffer. +- ``ofstream::write`` and ``ifstream::read`` have been optimized to pass through large reads and writes to system calls + directly instead of copying them in chunks. - Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and reduced debug information. diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index 1a9d3b08f3ec3..6bb29823d1b9b 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -84,7 +84,7 @@ "`P0508R0 <https://wg21.link/P0508R0>`__","Wording for GB 58 - structured bindings for node_handles","2016-11 (Issaquah)","|Complete|","7","`#99944 <https://github.com/llvm/llvm-project/issues/99944>`__","" "`P0509R1 <https://wg21.link/P0509R1>`__","Updating ""Restrictions on exception handling""","2016-11 (Issaquah)","|Nothing To Do|","n/a","`#103676 <https://github.com/llvm/llvm-project/issues/103676>`__","" "`P0510R0 <https://wg21.link/P0510R0>`__","Disallowing references, incomplete types, arrays, and empty variants","2016-11 (Issaquah)","|Complete|","4","`#103677 <https://github.com/llvm/llvm-project/issues/103677>`__","" -"`P0513R0 <https://wg21.link/P0513R0>`__","Poisoning the Hash","2016-11 (Issaquah)","|Complete|","5","`#103678 <https://github.com/llvm/llvm-project/issues/103678>`__","" +"`P0513R0 <https://wg21.link/P0513R0>`__","Poisoning the Hash","2016-11 (Issaquah)","|Complete|","5","`#103678 <https://github.com/llvm/llvm-project/issues/103678>`__","Implemented as a DR against C++11 since LLVM 22. MSVC STL does the same." 
"`P0516R0 <https://wg21.link/P0516R0>`__","Clarify That shared_future's Copy Operations have Wide Contracts","2016-11 (Issaquah)","|Complete|","4","`#103679 <https://github.com/llvm/llvm-project/issues/103679>`__","" "`P0517R0 <https://wg21.link/P0517R0>`__","Make future_error Constructible","2016-11 (Issaquah)","|Complete|","4","`#103680 <https://github.com/llvm/llvm-project/issues/103680>`__","" "`P0521R0 <https://wg21.link/P0521R0>`__","Proposed Resolution for CA 14 (shared_ptr use_count/unique)","2016-11 (Issaquah)","|Complete|","18","`#103681 <https://github.com/llvm/llvm-project/issues/103681>`__","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index a5423acf0d419..0455643446f8e 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -18,7 +18,7 @@ "`P2874R2 <https://wg21.link/P2874R2>`__","P2874R2: Mandating Annex D Require No More","2023-06 (Varna)","|Complete|","12","`#105377 <https://github.com/llvm/llvm-project/issues/105377>`__","" "`P2757R3 <https://wg21.link/P2757R3>`__","Type-checking format args","2023-06 (Varna)","","","`#105378 <https://github.com/llvm/llvm-project/issues/105378>`__","" "`P2637R3 <https://wg21.link/P2637R3>`__","Member ``visit``","2023-06 (Varna)","|Complete|","19","`#105380 <https://github.com/llvm/llvm-project/issues/105380>`__","Change of ``__cpp_lib_variant`` is completed in LLVM 20. Change of ``__cpp_lib_format`` is blocked by `P2419R2 <https://wg21.link/P2419R2>`__." -"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","","","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__","" +"`P2641R4 <https://wg21.link/P2641R4>`__","Checking if a ``union`` alternative is active","2023-06 (Varna)","|Complete|","22","`#105381 <https://github.com/llvm/llvm-project/issues/105381>`__","" "`P1759R6 <https://wg21.link/P1759R6>`__","Native handles and file streams","2023-06 (Varna)","|Complete|","18","`#105382 <https://github.com/llvm/llvm-project/issues/105382>`__","" "`P2697R1 <https://wg21.link/P2697R1>`__","Interfacing ``bitset`` with ``string_view``","2023-06 (Varna)","|Complete|","18","`#105384 <https://github.com/llvm/llvm-project/issues/105384>`__","" "`P1383R2 <https://wg21.link/P1383R2>`__","More ``constexpr`` for ``<cmath>`` and ``<complex>``","2023-06 (Varna)","","","`#105385 <https://github.com/llvm/llvm-project/issues/105385>`__","" @@ -81,7 +81,7 @@ "`P3379R0 <https://wg21.link/P3379R0>`__","Constrain ``std::expected`` equality operators","2024-11 (Wrocław)","|Complete|","21","`#118135 <https://github.com/llvm/llvm-project/issues/118135>`__","" "`P2862R1 <https://wg21.link/P2862R1>`__","``text_encoding::name()`` should never return null values","2024-11 (Wrocław)","","","`#118371 <https://github.com/llvm/llvm-project/issues/118371>`__","" "`P2897R7 <https://wg21.link/P2897R7>`__","``aligned_accessor``: An ``mdspan`` accessor expressing pointer over-alignment","2024-11 (Wrocław)","|Complete|","21","`#118372 <https://github.com/llvm/llvm-project/issues/118372>`__","" -"`P3355R1 <https://wg21.link/P3355R1>`__","Fix ``submdspan`` for C++26","2024-11 (Wrocław)","","","`#118373 <https://github.com/llvm/llvm-project/issues/118373>`__","" +"`P3355R2 <https://wg21.link/P3355R2>`__","Fix ``submdspan`` for C++26","2024-11 (Wrocław)","","","`#118373 <https://github.com/llvm/llvm-project/issues/118373>`__","" "`P3222R0 <https://wg21.link/P3222R0>`__","Fix C++26 by adding transposed special cases for P2642 
layouts","2024-11 (Wrocław)","","","`#118374 <https://github.com/llvm/llvm-project/issues/118374>`__","" "`P3050R2 <https://wg21.link/P3050R2>`__","Fix C++26 by optimizing ``linalg::conjugated`` for noncomplex value types","2024-11 (Wrocław)","","","`#118375 <https://github.com/llvm/llvm-project/issues/118375>`__","" "`P3396R1 <https://wg21.link/P3396R1>`__","``std::execution`` wording fixes","2024-11 (Wrocław)","","","`#118376 <https://github.com/llvm/llvm-project/issues/118376>`__","" @@ -122,7 +122,7 @@ "`P3293R3 <https://wg21.link/P3293R3>`__","Splicing a base class subobject","2025-06 (Sofia)","","","`#148125 <https://github.com/llvm/llvm-project/issues/148125>`__","" "`P3491R3 <https://wg21.link/P3491R3>`__","``define_static_{string,object,array}``","2025-06 (Sofia)","","","`#148126 <https://github.com/llvm/llvm-project/issues/148126>`__","" "`P3096R12 <https://wg21.link/P3096R12>`__","Function Parameter Reflection in Reflection for C++26","2025-06 (Sofia)","","","`#148127 <https://github.com/llvm/llvm-project/issues/148127>`__","" -"`P2988R12 <https://wg21.link/P2988R12>`__","``std::optional<T&>``","2025-06 (Sofia)","","","`#148131 <https://github.com/llvm/llvm-project/issues/148131>`__","" +"`P2988R12 <https://wg21.link/P2988R12>`__","``std::optional<T&>``","2025-06 (Sofia)","|Complete|","22","`#148131 <https://github.com/llvm/llvm-project/issues/148131>`__","" "`P3348R4 <https://wg21.link/P3348R4>`__","C++26 should refer to C23 not C17","2025-06 (Sofia)","","","`#148133 <https://github.com/llvm/llvm-project/issues/148133>`__","" "`P3037R6 <https://wg21.link/P3037R6>`__","``constexpr`` ``std::shared_ptr`` and friends","2025-06 (Sofia)","","","`#148135 <https://github.com/llvm/llvm-project/issues/148135>`__","" "`P3284R4 <https://wg21.link/P3284R4>`__","``write_env`` and ``unstoppable`` Sender Adaptors","2025-06 (Sofia)","","","`#148136 <https://github.com/llvm/llvm-project/issues/148136>`__","" diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 495ccceb31cef..03dfb9d41aa1a 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -132,7 +132,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. 
============ =================== ========================== ===================== Compiler Versions Restrictions Support policy ============ =================== ========================== ===================== -Clang 19, 20, 21-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version +Clang 20, 21, 22-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version AppleClang 26.0 latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_ GCC 15 In C++11 or later only latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_ diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index de9819cf5346a..131ba99357d62 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -262,6 +262,7 @@ set(files __chrono/gps_clock.h __chrono/hh_mm_ss.h __chrono/high_resolution_clock.h + __chrono/is_clock.h __chrono/leap_second.h __chrono/literals.h __chrono/local_info.h @@ -327,6 +328,8 @@ set(files __configuration/abi.h __configuration/availability.h __configuration/compiler.h + __configuration/experimental.h + __configuration/hardening.h __configuration/language.h __configuration/platform.h __coroutine/coroutine_handle.h @@ -785,7 +788,6 @@ set(files __tuple/tuple_like.h __tuple/tuple_like_no_subrange.h __tuple/tuple_size.h - __tuple/tuple_types.h __type_traits/add_cv_quals.h __type_traits/add_pointer.h __type_traits/add_reference.h @@ -857,7 +859,6 @@ set(files __type_traits/is_reference.h __type_traits/is_reference_wrapper.h __type_traits/is_referenceable.h - __type_traits/is_replaceable.h __type_traits/is_same.h __type_traits/is_scalar.h __type_traits/is_signed.h @@ -878,6 +879,7 @@ set(files __type_traits/is_valid_expansion.h __type_traits/is_void.h __type_traits/is_volatile.h + __type_traits/is_within_lifetime.h __type_traits/lazy.h __type_traits/make_32_64_or_128_bit.h __type_traits/make_const_lvalue_ref.h @@ -1064,7 +1066,6 @@ set(files sstream stack stdatomic.h - stdbool.h stddef.h stdexcept stdio.h diff --git a/libcxx/include/__algorithm/fill.h b/libcxx/include/__algorithm/fill.h index 328ebb663376a..37732cc22afd2 100644 --- a/libcxx/include/__algorithm/fill.h +++ b/libcxx/include/__algorithm/fill.h @@ -15,6 +15,7 @@ #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/enable_if.h> +#include <__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -27,6 +28,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _ForwardIterator, class _Sentinel, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator __fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __value) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (is_same<_ForwardIterator, _Sentinel>::value && __is_segmented_iterator_v<_ForwardIterator>) { + using __local_iterator_t = typename __segmented_iterator_traits<_ForwardIterator>::__local_iterator; + std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { + std::__fill(__lfirst, __llast, __value); + }); + return __last; + } +#endif for (; __first != __last; ++__first) *__first = __value; return __first; @@ -42,18 +52,6 @@ __fill(_RandomAccessIterator 
__first, _RandomAccessIterator __last, const _Tp& _ return std::__fill_n(__first, __last - __first, __value); } -#ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, class _Tp, __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -_SegmentedIterator __fill(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__fill(__lfirst, __llast, __value); - }); - return __last; -} -#endif // !_LIBCPP_CXX03_LANG - template <class _ForwardIterator, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h index 2bfacf3178c4e..426fe228bdabb 100644 --- a/libcxx/include/__algorithm/fill_n.h +++ b/libcxx/include/__algorithm/fill_n.h @@ -16,10 +16,6 @@ #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__memory/pointer_traits.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/negation.h> #include <__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,39 +29,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset. -template <class _OutputIterator, - class _Size, - class _Tp -#ifndef _LIBCPP_CXX03_LANG - , - __enable_if_t<!_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>, - __has_random_access_local_iterator<_OutputIterator>>::value, - int> = 0 -#endif - > +template <class _OutputIterator, class _Size, class _Tp> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (__is_segmented_iterator_v<_OutputIterator>) { + using __local_iterator = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator; + if constexpr (__has_random_access_iterator_category<__local_iterator>::value) { + return std::__for_each_n_segment(__first, __n, [&](__local_iterator __lfirst, __local_iterator __llast) { + std::__fill_n(__lfirst, __llast - __lfirst, __value); + }); + } + } +#endif for (; __n > 0; ++__first, (void)--__n) *__first = __value; return __first; } -#ifndef _LIBCPP_CXX03_LANG -template < class _OutputIterator, - class _Size, - class _Tp, - __enable_if_t<_And<_BoolConstant<__is_segmented_iterator_v<_OutputIterator>>, - __has_random_access_local_iterator<_OutputIterator>>::value, - int> = 0> -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { - using __local_iterator_t = typename __segmented_iterator_traits<_OutputIterator>::__local_iterator; - return std::__for_each_n_segment(__first, __n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__fill_n(__lfirst, __llast - __lfirst, __value); - }); -} -#endif // !_LIBCPP_CXX03_LANG - template <bool _FillVal, class _Cp> _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __fill_n_bool(__bit_iterator<_Cp, false> __first, typename 
__size_difference_type_traits<_Cp>::size_type __n) { diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h index 6fb66d25a2462..cb26aa4d2656a 100644 --- a/libcxx/include/__algorithm/for_each.h +++ b/libcxx/include/__algorithm/for_each.h @@ -14,8 +14,8 @@ #include <__config> #include <__functional/identity.h> #include <__iterator/segmented_iterator.h> -#include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> +#include <__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -25,27 +25,21 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _InputIterator, class _Sent, class _Func, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator -__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) { +__for_each(_InputIterator __first, _Sent __last, _Func& __func, _Proj& __proj) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (is_same<_InputIterator, _Sent>::value && __is_segmented_iterator_v<_InputIterator>) { + using __local_iterator_t = typename __segmented_iterator_traits<_InputIterator>::__local_iterator; + std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { + std::__for_each(__lfirst, __llast, __func, __proj); + }); + return __last; + } +#endif for (; __first != __last; ++__first) - std::__invoke(__f, std::__invoke(__proj, *__first)); + std::__invoke(__func, std::__invoke(__proj, *__first)); return __first; } -#ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, - class _Func, - class _Proj, - __enable_if_t<__is_segmented_iterator_v<_SegmentedIterator>, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator -__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__for_each(__lfirst, __llast, __func, __proj); - }); - return __last; -} -#endif // !_LIBCPP_CXX03_LANG - template <class _InputIterator, class _Func> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func for_each(_InputIterator __first, _InputIterator __last, _Func __f) { diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h index 04650e15b6362..72c7adb093f95 100644 --- a/libcxx/include/__algorithm/for_each_n.h +++ b/libcxx/include/__algorithm/for_each_n.h @@ -16,10 +16,7 @@ #include <__functional/identity.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> -#include <__type_traits/negation.h> #include <__utility/convert_to_integral.h> #include <__utility/move.h> @@ -32,57 +29,33 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <class _InputIterator, - class _Size, - class _Func, - class _Proj, - __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value && - _Or<integral_constant<bool, !__is_segmented_iterator_v<_InputIterator> >, - _Not<__has_random_access_local_iterator<_InputIterator> > >::value, - int> = 0> +template <class _InputIterator, class _Size, class _Func, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) { typedef 
decltype(std::__convert_to_integral(__orig_n)) _IntegralSize; _IntegralSize __n = __orig_n; - while (__n > 0) { - std::__invoke(__f, std::__invoke(__proj, *__first)); - ++__first; - --__n; - } - return std::move(__first); -} - -template <class _RandIter, - class _Size, - class _Func, - class _Proj, - __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter -__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) { - typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n; - auto __last = __first + __n; - std::__for_each(__first, __last, __f, __proj); - return __last; -} #ifndef _LIBCPP_CXX03_LANG -template <class _SegmentedIterator, - class _Size, - class _Func, - class _Proj, - __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value && - __is_segmented_iterator_v<_SegmentedIterator> && - __has_random_access_iterator_category< - typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value, - int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator -__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) { - using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator; - return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { - std::__for_each(__lfirst, __llast, __f, __proj); - }); + if constexpr (__is_segmented_iterator_v<_InputIterator>) { + using __local_iterator = typename __segmented_iterator_traits<_InputIterator>::__local_iterator; + if constexpr (__has_random_access_iterator_category<__local_iterator>::value) { + return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator __lfirst, __local_iterator __llast) { + std::__for_each(__lfirst, __llast, __f, __proj); + }); + } else { + return std::__for_each(__first, __first + __n, __f, __proj); + } + } else +#endif + { + while (__n > 0) { + std::__invoke(__f, std::__invoke(__proj, *__first)); + ++__first; + --__n; + } + return std::move(__first); + } } -#endif // !_LIBCPP_CXX03_LANG #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__chrono/is_clock.h b/libcxx/include/__chrono/is_clock.h new file mode 100644 index 0000000000000..e63b8485d06e1 --- /dev/null +++ b/libcxx/include/__chrono/is_clock.h @@ -0,0 +1,72 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CHRONO_IS_CLOCK_H +#define _LIBCPP___CHRONO_IS_CLOCK_H + +#include <__config> + +#include <__chrono/duration.h> +#include <__chrono/time_point.h> +#include <__concepts/same_as.h> +#include <__type_traits/integral_constant.h> +#include <__type_traits/is_arithmetic.h> +#include <__type_traits/is_class.h> +#include <__type_traits/is_union.h> +#include <ratio> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if _LIBCPP_STD_VER >= 20 + +_LIBCPP_BEGIN_NAMESPACE_STD + +namespace chrono { + +// Helper to check that _Tp::time_point has the form time_point<_, typename _Tp::duration>. 
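+// For example, chrono::steady_clock::time_point is
+// time_point<steady_clock, steady_clock::duration>, which matches the partial
+// specialization below; a hypothetical clock whose time_point were based on a
+// different duration would select only the primary template and fail the
+// is_clock requirements.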
+template <class _TimePoint, class _ClockType> +inline constexpr bool __is_valid_clock_time_point_v = false; + +template <class _TimePointClock, class _ClockType> +inline constexpr bool + __is_valid_clock_time_point_v<time_point<_TimePointClock, typename _ClockType::duration>, _ClockType> = true; + +// Check if a clock satisfies the Cpp17Clock requirements as defined in [time.clock.req] +template <class _Tp> +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_clock_v = requires { + typename _Tp::rep; + requires is_arithmetic_v<typename _Tp::rep> || is_class_v<typename _Tp::rep> || is_union_v<typename _Tp::rep>; + + typename _Tp::period; + requires __is_ratio_v<typename _Tp::period>; + + typename _Tp::duration; + requires same_as<typename _Tp::duration, duration<typename _Tp::rep, typename _Tp::period>>; + + typename _Tp::time_point; + requires __is_valid_clock_time_point_v<typename _Tp::time_point, _Tp>; + + _Tp::is_steady; + requires same_as<decltype((_Tp::is_steady)), const bool&>; + + _Tp::now(); + requires same_as<decltype(_Tp::now()), typename _Tp::time_point>; +}; + +template <class _Tp> +struct _LIBCPP_NO_SPECIALIZATIONS is_clock : bool_constant<is_clock_v<_Tp>> {}; + +} // namespace chrono + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER +#endif // _LIBCPP___CHRONO_IS_CLOCK_H diff --git a/libcxx/include/__config b/libcxx/include/__config index 357f77b7d27d6..d079bf8b500b6 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -14,6 +14,8 @@ #include <__configuration/abi.h> #include <__configuration/availability.h> #include <__configuration/compiler.h> +#include <__configuration/experimental.h> +#include <__configuration/hardening.h> #include <__configuration/language.h> #include <__configuration/platform.h> @@ -38,195 +40,6 @@ # define _LIBCPP_FREESTANDING # endif -// NOLINTNEXTLINE(libcpp-cpp-version-check) -# if __cplusplus < 201103L -# define _LIBCPP_CXX03_LANG -# endif - -# if __has_feature(experimental_library) -# ifndef _LIBCPP_ENABLE_EXPERIMENTAL -# define _LIBCPP_ENABLE_EXPERIMENTAL -# endif -# endif - -// Incomplete features get their own specific disabling flags. This makes it -// easier to grep for target specific flags once the feature is complete. -# if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1 -# else -# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0 -# endif - -# define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY - -// HARDENING { - -// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated. -// Since hardening went through several changes (many of which impacted user-facing macros), -// we're keeping these checks around for a bit longer than usual. Failure to properly configure -// hardening results in checks being dropped silently, which is a pretty big deal. 
-# if defined(_LIBCPP_ENABLE_ASSERTIONS) -# error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_HARDENED_MODE) -# error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_SAFE_MODE) -# error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif -# if defined(_LIBCPP_ENABLE_DEBUG_MODE) -# error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" -# endif - -// The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values: -// -// - `_LIBCPP_HARDENING_MODE_NONE`; -// - `_LIBCPP_HARDENING_MODE_FAST`; -// - `_LIBCPP_HARDENING_MODE_EXTENSIVE`; -// - `_LIBCPP_HARDENING_MODE_DEBUG`. -// -// These values have the following effects: -// -// - `_LIBCPP_HARDENING_MODE_NONE` -- sets the hardening mode to "none" which disables all runtime hardening checks; -// -// - `_LIBCPP_HARDENING_MODE_FAST` -- sets that hardening mode to "fast". The fast mode enables security-critical checks -// that can be done with relatively little runtime overhead in constant time; -// -// - `_LIBCPP_HARDENING_MODE_EXTENSIVE` -- sets the hardening mode to "extensive". The extensive mode is a superset of -// the fast mode that additionally enables checks that are relatively cheap and prevent common types of logic errors -// but are not necessarily security-critical; -// -// - `_LIBCPP_HARDENING_MODE_DEBUG` -- sets the hardening mode to "debug". The debug mode is a superset of the extensive -// mode and enables all checks available in the library, including internal assertions. Checks that are part of the -// debug mode can be very expensive and thus the debug mode is intended to be used for testing, not in production. - -// Inside the library, assertions are categorized so they can be cherry-picked based on the chosen hardening mode. These -// macros are only for internal use -- users should only pick one of the high-level hardening modes described above. -// -// - `_LIBCPP_ASSERT_VALID_INPUT_RANGE` -- checks that ranges (whether expressed as an iterator pair, an iterator and -// a sentinel, an iterator and a count, or a `std::range`) given as input to library functions are valid: -// - the sentinel is reachable from the begin iterator; -// - TODO(hardening): both iterators refer to the same container. -// -// - `_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS` -- checks that any attempts to access a container element, whether through -// the container object or through an iterator, are valid and do not attempt to go out of bounds or otherwise access -// a non-existent element. For iterator checks to work, bounded iterators must be enabled in the ABI. Types like -// `optional` and `function` are considered one-element containers for the purposes of this check. -// -// - `_LIBCPP_ASSERT_NON_NULL` -- checks that the pointer being dereferenced is not null. On most modern platforms zero -// address does not refer to an actual location in memory, so a null pointer dereference would not compromize the -// memory security of a program (however, it is still undefined behavior that can result in strange errors due to -// compiler optimizations). -// -// - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the -// given ranges do not overlap. 
-// -// - `_LIBCPP_ASSERT_VALID_DEALLOCATION` -- checks that an attempt to deallocate memory is valid (e.g. the given object -// was allocated by the given allocator). Violating this category typically results in a memory leak. -// -// - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL` -- checks that a call to an external API doesn't fail in -// an unexpected manner. This includes triggering documented cases of undefined behavior in an external library (like -// attempting to unlock an unlocked mutex in pthreads). Any API external to the library falls under this category -// (from system calls to compiler intrinsics). We generally don't expect these failures to compromize memory safety or -// otherwise create an immediate security issue. -// -// - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure -// the containers have compatible allocators. -// -// - `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN` -- checks that the given argument is within the domain of valid arguments -// for the function. Violating this typically produces an incorrect result (e.g. the clamp algorithm returns the -// original value without clamping it due to incorrect functors) or puts an object into an invalid state (e.g. -// a string view where only a subset of elements is possible to access). This category is for assertions violating -// which doesn't cause any immediate issues in the library -- whatever the consequences are, they will happen in the -// user code. -// -// - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to -// be benign in our implementation. -// -// - `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` -- checks that the given argument satisfies the semantic requirements imposed -// by the Standard. Typically, there is no simple way to completely prove that a semantic requirement is satisfied; -// thus, this would often be a heuristic check and it might be quite expensive. -// -// - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on -// user input. -// -// - `_LIBCPP_ASSERT_UNCATEGORIZED` -- for assertions that haven't been properly classified yet. - -// clang-format off -# define _LIBCPP_HARDENING_MODE_NONE (1 << 1) -# define _LIBCPP_HARDENING_MODE_FAST (1 << 2) -# define _LIBCPP_HARDENING_MODE_EXTENSIVE (1 << 4) // Deliberately not ordered. -# define _LIBCPP_HARDENING_MODE_DEBUG (1 << 3) -// clang-format on - -# ifndef _LIBCPP_HARDENING_MODE - -# ifndef _LIBCPP_HARDENING_MODE_DEFAULT -# error _LIBCPP_HARDENING_MODE_DEFAULT is not defined. This definition should be set at configuration time in the \ -`__config_site` header, please make sure your installation of libc++ is not broken. 
-# endif - -# define _LIBCPP_HARDENING_MODE _LIBCPP_HARDENING_MODE_DEFAULT -# endif - -# if _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_NONE && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_FAST && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_EXTENSIVE && \ - _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_DEBUG -# error _LIBCPP_HARDENING_MODE must be set to one of the following values: \ -_LIBCPP_HARDENING_MODE_NONE, \ -_LIBCPP_HARDENING_MODE_FAST, \ -_LIBCPP_HARDENING_MODE_EXTENSIVE, \ -_LIBCPP_HARDENING_MODE_DEBUG -# endif - -// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts: -// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts -// `ignore` semantic which wouldn't evaluate the assertion at all); -// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution; -// - `quick-enforce` terminates the program as fast as possible (via trapping); -// - `enforce` logs an error and then terminates the program. -// -// Notes: -// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant -// to make adopting hardening easier but should not be used outside of this scenario; -// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts -// `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for -// hardened preconditions, however, be aware that using `ignore` does not produce a conforming "Hardened" -// implementation, unlike the other semantics above. -// clang-format off -# define _LIBCPP_ASSERTION_SEMANTIC_IGNORE (1 << 1) -# define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE (1 << 2) -# define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3) -# define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE (1 << 4) -// clang-format on - -// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics. -// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use -// `enforce` (i.e., log and abort). -# ifndef _LIBCPP_ASSERTION_SEMANTIC - -# if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG -# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE -# else -# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE -# endif - -# else -# if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY -# error "Assertion semantics are an experimental feature." -# endif -# if defined(_LIBCPP_CXX03_LANG) -# error "Assertion semantics are not available in the C++03 mode." -# endif - -# endif // _LIBCPP_ASSERTION_SEMANTIC - -// } HARDENING - # define _LIBCPP_TOSTRING2(x) #x # define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x) @@ -339,7 +152,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # ifndef _LIBCPP_CXX03_LANG -# define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) +# define _LIBCPP_ALIGNOF(...) alignof(__VA_ARGS__) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) # define _NOEXCEPT noexcept @@ -348,7 +161,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # else -# define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) +# define _LIBCPP_ALIGNOF(...) 
_Alignof(__VA_ARGS__) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) # define nullptr __nullptr diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index d0414ecfac2bb..5433df872fa39 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -118,14 +118,40 @@ # define _LIBCPP_INTRODUCED_IN_LLVM_21_ATTRIBUTE __attribute__((unavailable)) // LLVM 20 -// TODO: Fill this in -# define _LIBCPP_INTRODUCED_IN_LLVM_20 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE __attribute__((unavailable)) +// +// Note that versions for most Apple OSes were bumped forward and aligned in that release. +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 260000) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 100000) +# define _LIBCPP_INTRODUCED_IN_LLVM_20 0 +# else +# define _LIBCPP_INTRODUCED_IN_LLVM_20 1 +# endif +# define _LIBCPP_INTRODUCED_IN_LLVM_20_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 26.0))) \ + __attribute__((availability(ios, strict, introduced = 26.0))) \ + __attribute__((availability(tvos, strict, introduced = 26.0))) \ + __attribute__((availability(watchos, strict, introduced = 26.0))) \ + __attribute__((availability(bridgeos, strict, introduced = 10.0))) // LLVM 19 -// TODO: Fill this in -# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 -# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE __attribute__((unavailable)) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150400) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 180400) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 180400) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 110400) || \ + (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 90400) +# define _LIBCPP_INTRODUCED_IN_LLVM_19 0 +# else +# define _LIBCPP_INTRODUCED_IN_LLVM_19 1 +# endif +# define _LIBCPP_INTRODUCED_IN_LLVM_19_ATTRIBUTE \ + __attribute__((availability(macos, strict, introduced = 15.4))) \ + __attribute__((availability(ios, strict, introduced = 18.4))) \ + __attribute__((availability(tvos, strict, introduced = 18.4))) \ + __attribute__((availability(watchos, strict, introduced = 11.4))) \ + __attribute__((availability(bridgeos, strict, introduced = 9.4))) // LLVM 18 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 150000) || \ diff --git a/libcxx/include/__configuration/compiler.h b/libcxx/include/__configuration/compiler.h index 11c07ed0dc474..7cd81e03b05ba 100644 --- a/libcxx/include/__configuration/compiler.h +++ b/libcxx/include/__configuration/compiler.h @@ -33,16 +33,16 @@ // Warn if a compiler 
version is used that is not supported anymore // LLVM RELEASE Update the minimum compiler versions # if defined(_LIBCPP_CLANG_VER) -# if _LIBCPP_CLANG_VER < 1900 -# warning "Libc++ only supports Clang 19 and later" +# if _LIBCPP_CLANG_VER < 2001 +# warning "Libc++ only supports Clang 20 and later" # endif # elif defined(_LIBCPP_APPLE_CLANG_VER) -# if _LIBCPP_APPLE_CLANG_VER < 1600 -# warning "Libc++ only supports AppleClang 15 and later" +# if _LIBCPP_APPLE_CLANG_VER < 1700 +# warning "Libc++ only supports AppleClang 26 and later" # endif # elif defined(_LIBCPP_GCC_VER) -# if _LIBCPP_GCC_VER < 1400 -# warning "Libc++ only supports GCC 14 and later" +# if _LIBCPP_GCC_VER < 1500 +# warning "Libc++ only supports GCC 15 and later" # endif # endif diff --git a/libcxx/include/__configuration/experimental.h b/libcxx/include/__configuration/experimental.h new file mode 100644 index 0000000000000..d14df3e5175f3 --- /dev/null +++ b/libcxx/include/__configuration/experimental.h @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_EXPERIMENTAL_H +#define _LIBCPP___CONFIGURATION_EXPERIMENTAL_H + +#include <__config_site> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +#if __has_feature(experimental_library) +# ifndef _LIBCPP_ENABLE_EXPERIMENTAL +# define _LIBCPP_ENABLE_EXPERIMENTAL +# endif +#endif + +// Incomplete features get their own specific disabling flags. This makes it +// easier to grep for target specific flags once the feature is complete. +#if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY) +# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1 +#else +# define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0 +#endif + +#define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY +#define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY + +#endif // _LIBCPP___CONFIGURATION_EXPERIMENTAL_H diff --git a/libcxx/include/__configuration/hardening.h b/libcxx/include/__configuration/hardening.h new file mode 100644 index 0000000000000..bc2a8d078fa77 --- /dev/null +++ b/libcxx/include/__configuration/hardening.h @@ -0,0 +1,181 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___CONFIGURATION_HARDENING_H +#define _LIBCPP___CONFIGURATION_HARDENING_H + +#include <__config_site> +#include <__configuration/experimental.h> +#include <__configuration/language.h> + +#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER +# pragma GCC system_header +#endif + +// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated. 
+// Since hardening went through several changes (many of which impacted user-facing macros), +// we're keeping these checks around for a bit longer than usual. Failure to properly configure +// hardening results in checks being dropped silently, which is a pretty big deal. +#if defined(_LIBCPP_ENABLE_ASSERTIONS) +# error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_HARDENED_MODE) +# error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_SAFE_MODE) +# error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif +#if defined(_LIBCPP_ENABLE_DEBUG_MODE) +# error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)" +#endif + +// The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values: +// +// - `_LIBCPP_HARDENING_MODE_NONE`; +// - `_LIBCPP_HARDENING_MODE_FAST`; +// - `_LIBCPP_HARDENING_MODE_EXTENSIVE`; +// - `_LIBCPP_HARDENING_MODE_DEBUG`. +// +// These values have the following effects: +// +// - `_LIBCPP_HARDENING_MODE_NONE` -- sets the hardening mode to "none" which disables all runtime hardening checks; +// +// - `_LIBCPP_HARDENING_MODE_FAST` -- sets the hardening mode to "fast". The fast mode enables security-critical checks +// that can be done with relatively little runtime overhead in constant time; +// +// - `_LIBCPP_HARDENING_MODE_EXTENSIVE` -- sets the hardening mode to "extensive". The extensive mode is a superset of +// the fast mode that additionally enables checks that are relatively cheap and prevent common types of logic errors +// but are not necessarily security-critical; +// +// - `_LIBCPP_HARDENING_MODE_DEBUG` -- sets the hardening mode to "debug". The debug mode is a superset of the extensive +// mode and enables all checks available in the library, including internal assertions. Checks that are part of the +// debug mode can be very expensive and thus the debug mode is intended to be used for testing, not in production. + +// Inside the library, assertions are categorized so they can be cherry-picked based on the chosen hardening mode. These +// macros are only for internal use -- users should only pick one of the high-level hardening modes described above. +// +// - `_LIBCPP_ASSERT_VALID_INPUT_RANGE` -- checks that ranges (whether expressed as an iterator pair, an iterator and +// a sentinel, an iterator and a count, or a `std::range`) given as input to library functions are valid: +// - the sentinel is reachable from the begin iterator; +// - TODO(hardening): both iterators refer to the same container. +// +// - `_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS` -- checks that any attempts to access a container element, whether through +// the container object or through an iterator, are valid and do not attempt to go out of bounds or otherwise access +// a non-existent element. For iterator checks to work, bounded iterators must be enabled in the ABI. Types like +// `optional` and `function` are considered one-element containers for the purposes of this check. +// +// - `_LIBCPP_ASSERT_NON_NULL` -- checks that the pointer being dereferenced is not null.
On most modern platforms zero +// address does not refer to an actual location in memory, so a null pointer dereference would not compromise the +// memory security of a program (however, it is still undefined behavior that can result in strange errors due to +// compiler optimizations). +// +// - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the +// given ranges do not overlap. +// +// - `_LIBCPP_ASSERT_VALID_DEALLOCATION` -- checks that an attempt to deallocate memory is valid (e.g. the given object +// was allocated by the given allocator). Violating this category typically results in a memory leak. +// +// - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL` -- checks that a call to an external API doesn't fail in +// an unexpected manner. This includes triggering documented cases of undefined behavior in an external library (like +// attempting to unlock an unlocked mutex in pthreads). Any API external to the library falls under this category +// (from system calls to compiler intrinsics). We generally don't expect these failures to compromise memory safety or +// otherwise create an immediate security issue. +// +// - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure +// the containers have compatible allocators. +// +// - `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN` -- checks that the given argument is within the domain of valid arguments +// for the function. Violating this typically produces an incorrect result (e.g. the clamp algorithm returns the +// original value without clamping it due to incorrect functors) or puts an object into an invalid state (e.g. +// a string view where only a subset of elements is possible to access). This category is for assertions violating +// which doesn't cause any immediate issues in the library -- whatever the consequences are, they will happen in the +// user code. +// +// - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to +// be benign in our implementation. +// +// - `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` -- checks that the given argument satisfies the semantic requirements imposed +// by the Standard. Typically, there is no simple way to completely prove that a semantic requirement is satisfied; +// thus, this would often be a heuristic check and it might be quite expensive. +// +// - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on +// user input. +// +// - `_LIBCPP_ASSERT_UNCATEGORIZED` -- for assertions that haven't been properly classified yet. + +// clang-format off +# define _LIBCPP_HARDENING_MODE_NONE (1 << 1) +# define _LIBCPP_HARDENING_MODE_FAST (1 << 2) +# define _LIBCPP_HARDENING_MODE_EXTENSIVE (1 << 4) // Deliberately not ordered. +# define _LIBCPP_HARDENING_MODE_DEBUG (1 << 3) +// clang-format on + +#ifndef _LIBCPP_HARDENING_MODE + +# ifndef _LIBCPP_HARDENING_MODE_DEFAULT +# error _LIBCPP_HARDENING_MODE_DEFAULT is not defined. This definition should be set at configuration time in the \ +`__config_site` header, please make sure your installation of libc++ is not broken.
+# endif
+
+# define _LIBCPP_HARDENING_MODE _LIBCPP_HARDENING_MODE_DEFAULT
+#endif
+
+#if _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_NONE && _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_FAST && \
+    _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_EXTENSIVE && \
+    _LIBCPP_HARDENING_MODE != _LIBCPP_HARDENING_MODE_DEBUG
+# error _LIBCPP_HARDENING_MODE must be set to one of the following values: \
+_LIBCPP_HARDENING_MODE_NONE, \
+_LIBCPP_HARDENING_MODE_FAST, \
+_LIBCPP_HARDENING_MODE_EXTENSIVE, \
+_LIBCPP_HARDENING_MODE_DEBUG
+#endif
+
+// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts:
+// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts
+//   `ignore` semantic which wouldn't evaluate the assertion at all);
+// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution;
+// - `quick-enforce` terminates the program as fast as possible (via trapping);
+// - `enforce` logs an error and then terminates the program.
+//
+// Notes:
+// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant
+//   to make adopting hardening easier but should not be used outside of this scenario;
+// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts
+//   `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for
+//   hardened preconditions; however, be aware that using `ignore` does not produce a conforming "Hardened"
+//   implementation, unlike the other semantics above.
+// clang-format off
+# define _LIBCPP_ASSERTION_SEMANTIC_IGNORE (1 << 1)
+# define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE (1 << 2)
+# define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3)
+# define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE (1 << 4)
+// clang-format on
+
+// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics.
+// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use
+// `enforce` (i.e., log and abort).
+#ifndef _LIBCPP_ASSERTION_SEMANTIC
+
+# if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+# else
+# define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+# endif
+
+#else
+
+# if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+# error "Assertion semantics are an experimental feature."
+# endif
+# if defined(_LIBCPP_CXX03_LANG)
+# error "Assertion semantics are not available in C++03 mode."
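// [Editor's note -- illustration, not part of this patch.] Overriding the
// semantic independently of the chosen mode is gated on the experimental
// library (clang's -fexperimental-library switch) and on C++11 or later, per
// the checks above. A sketch:
//
//   $ clang++ -std=c++23 -stdlib=libc++ -fexperimental-library \
//       -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE \
//       -D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE main.cpp
//
// Under `observe`, a failed check logs an error and execution continues, which
// eases an initial hardening rollout; as the notes above stress, continuing
// past a failed check is still undefined behavior.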
+# endif + +#endif // _LIBCPP_ASSERTION_SEMANTIC + +#endif // _LIBCPP___CONFIGURATION_HARDENING_H diff --git a/libcxx/include/__configuration/language.h b/libcxx/include/__configuration/language.h index 9c224dfa76e40..26e87f87afd87 100644 --- a/libcxx/include/__configuration/language.h +++ b/libcxx/include/__configuration/language.h @@ -18,6 +18,9 @@ // NOLINTBEGIN(libcpp-cpp-version-check) #ifdef __cplusplus +# if __cplusplus < 201103L +# define _LIBCPP_CXX03_LANG +# endif # if __cplusplus <= 201103L # define _LIBCPP_STD_VER 11 # elif __cplusplus <= 201402L diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..92ff5c701e0d3 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,18 +11,24 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> -#include <cstdlib> +#include <__utility/move.h> +#include <__utility/swap.h> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -30,7 +36,7 @@ namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -57,6 +63,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -67,15 +75,21 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { public: // exception_ptr is basically a COW string so it is trivially relocatable. - // It is also replaceable because assignment has normal value semantics. 
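// [Editor's note -- illustration, not part of this patch.] This hunk also gives
// exception_ptr move operations and a swap (added further below). A usage
// sketch of the expected move behavior:
//
//   std::exception_ptr p = std::make_exception_ptr(std::runtime_error("boom"));
//   std::exception_ptr q = std::move(p); // q takes ownership, p becomes null
//   assert(q != nullptr && p == nullptr);
//
// Unlike a copy, the move performs no reference-count update; it just steals
// the pointer, which is why the moved-from object must be reset to null.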
using __trivially_relocatable _LIBCPP_NODEBUG = exception_ptr; - using __replaceable _LIBCPP_NODEBUG = exception_ptr; _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {} _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return *this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +102,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -153,7 +173,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS @@ -201,4 +221,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 90b14158d57a2..dc3266a27cdfd 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -73,7 +73,7 @@ template <class _Tp> __throw_with_nested<_Tp, _Up, is_class<_Up>::value && !is_base_of<nested_exception, _Up>::value && - !__libcpp_is_final<_Up>::value>::__do_throw(std::forward<_Tp>(__t)); + !__is_final_v<_Up> >::__do_throw(std::forward<_Tp>(__t)); #else ((void)__t); // FIXME: Make this abort diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index 8b3eeebd38ae7..be37e8ab66ac4 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -30,7 +30,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_constructible.h> @@ -472,8 +471,6 @@ class expected : private __expected_base<_Tp, _Err> { __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value && __libcpp_is_trivially_relocatable<_Err>::value, expected, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<_Tp> && __is_replaceable_v<_Err>, expected, void>; template <class _Up> using rebind = expected<_Up, error_type>; diff --git a/libcxx/include/__flat_set/flat_multiset.h 
b/libcxx/include/__flat_set/flat_multiset.h index 7be0b2d20c54d..0f6bae584ca90 100644 --- a/libcxx/include/__flat_set/flat_multiset.h +++ b/libcxx/include/__flat_set/flat_multiset.h @@ -95,16 +95,16 @@ class flat_multiset { public: // [flat.multiset.cons], constructors - _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> && - is_nothrow_default_constructible_v<_Compare>) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset() noexcept( + is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_Compare>) : __keys_(), __compare_() {} - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const flat_multiset&) = default; // The copy/move constructors are not specified in the spec, which means they should be defaulted. // However, the move constructor can potentially leave a moved-from object in an inconsistent // state if an exception is thrown. - _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other) noexcept( is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>) # if _LIBCPP_HAS_EXCEPTIONS try @@ -121,14 +121,16 @@ class flat_multiset { # endif // _LIBCPP_HAS_EXCEPTIONS } - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const key_compare& __comp) + : __keys_(), __compare_(__comp) {} - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); } - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare()) : __keys_(std::move(__keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -136,7 +138,7 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __keys_(), __compare_(__comp) { insert(__first, __last); @@ -144,48 +146,53 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __keys_(__first, __last), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(from_range_t __fr, _Range&& __rg) : flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {} template 
<_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) + : flat_multiset(__comp) { insert_range(std::forward<_Range>(__rg)); } - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare()) : flat_multiset(__il.begin(), __il.end(), __comp) {} - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp = key_compare()) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() { ranges::sort(__keys_, __compare_); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) { ranges::sort(__keys_, __compare_); @@ -193,14 +200,15 @@ class flat_multiset { template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, 
__compare_), "Key container is not sorted"); @@ -208,13 +216,14 @@ class flat_multiset { template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __other.__keys_)), __compare_(__other.__compare_) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) # if _LIBCPP_HAS_EXCEPTIONS try # endif // _LIBCPP_HAS_EXCEPTIONS @@ -230,14 +239,15 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() { insert(__first, __last); } template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) { insert(__first, __last); @@ -245,7 +255,7 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_() { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); @@ -253,53 +263,57 @@ class flat_multiset { template <class _InputIterator, class _Allocator> requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value) - _LIBCPP_HIDE_FROM_ABI - flat_multiset(sorted_equivalent_t, - _InputIterator __first, - _InputIterator __last, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( + sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_(__comp) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); } template <_ContainerCompatibleRange<value_type> _Range, class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) + 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange<value_type> _Range, class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) : __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) { insert_range(std::forward<_Range>(__rg)); } template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} template <class _Allocator> requires uses_allocator<container_type, _Allocator>::value - _LIBCPP_HIDE_FROM_ABI flat_multiset( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset( sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list<value_type> __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(initializer_list<value_type> __il) { clear(); insert(__il); return *this; @@ -308,9 +322,9 @@ class flat_multiset { // copy/move assignment are not specified in the spec (defaulted) // but move assignment can potentially leave a moved-from object in an inconsistent // state if an exception is thrown - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(const flat_multiset&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(const flat_multiset&) = default; - _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(flat_multiset&& __other) noexcept( is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) { auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); @@ -321,30 +335,52 @@ class flat_multiset { } // iterators - _LIBCPP_HIDE_FROM_ABI iterator
begin() noexcept { return iterator(std::as_const(__keys_).begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { return const_iterator(__keys_.begin()); } - _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept { + return iterator(std::as_const(__keys_).begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept { + return const_iterator(__keys_.begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept { + return iterator(std::as_const(__keys_).end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept { + return const_iterator(__keys_.end()); + } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(begin()); + } // capacity - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); } - _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept { + return __keys_.empty(); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { return __keys_.size(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return __keys_.max_size(); } // [flat.multiset.modifiers], modifiers template <class... 
_Args> requires is_constructible_v<value_type, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) { return __emplace(std::forward<_Args>(__args)...); } else { @@ -354,7 +390,7 @@ class flat_multiset { template <class... _Args> requires is_constructible_v<value_type, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... __args) { if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) { return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...); } else { @@ -362,21 +398,23 @@ class flat_multiset { } } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) { + return emplace(std::move(__x)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) { return emplace_hint(__hint, __x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) { return emplace_hint(__hint, std::move(__x)); } template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -385,7 +423,8 @@ class flat_multiset { template <class _InputIterator> requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -394,7 +433,7 @@ class flat_multiset { } template <_ContainerCompatibleRange<value_type> _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { if constexpr (ranges::sized_range<_Range>) { __reserve(ranges::size(__range)); } @@ -402,26 +441,29 @@ class flat_multiset { __append_sort_merge</*WasSorted = */ false>(std::forward<_Range>(__range)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list<value_type> __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list<value_type> __il) { + insert(__il.begin(), __il.end()); + } - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list<value_type> __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 
void + insert(sorted_equivalent_t, initializer_list<value_type> __il) { insert(sorted_equivalent, __il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI container_type extract() && { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 container_type extract() && { auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); auto __ret = std::move(__keys_); return __ret; } - _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void replace(container_type&& __keys) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted"); auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); __keys_ = std::move(__keys); __guard.__complete(); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_iter = __keys_.erase(__position.__base()); __on_failure.__complete(); @@ -431,7 +473,7 @@ class flat_multiset { // The following overload is the same as the iterator overload // iterator erase(const_iterator __position); - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); @@ -441,21 +483,21 @@ class flat_multiset { template <class _Kp> requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> && !is_convertible_v<_Kp &&, const_iterator>) - _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); return __res; } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_it = __keys_.erase(__first.__base(), __last.__base()); __on_failure.__complete(); return iterator(std::move(__key_it)); } - _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multiset& __y) noexcept { // warning: The spec has unconditional noexcept, which means that // if any of the following functions throw an exception, // std::terminate will be called @@ -464,126 +506,139 @@ class flat_multiset { ranges::swap(__keys_, __y.__keys_); } - _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __keys_.clear(); } // observers - _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } - _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return __compare_; } // map operations - _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& 
__x) { + return __find_impl(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const { + return __find_impl(*this, __x); + } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) { return __find_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const { return __find_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } - _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const { + return find(__x) != end(); + } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const { return find(__x) != end(); } - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) { const auto& __keys = __keys_; return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const { return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) { const auto& __keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, 
__compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) { const auto& __keys = __keys_; return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const { return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); } - _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const key_type& __x) { return __equal_range_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator> + equal_range(const key_type& __x) const { return __equal_range_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const _Kp& __x) { return __equal_range_impl(*this, __x); } template <class _Kp> requires __is_transparent_v<_Compare> - _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator> + equal_range(const _Kp& __x) const { return __equal_range_impl(*this, __x); } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator==(const flat_multiset& __x, const flat_multiset& __y) { return ranges::equal(__x, __y); } - friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto + operator<=>(const flat_multiset& __x, const flat_multiset& __y) { return std::lexicographical_compare_three_way( __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } - friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); } + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + swap(flat_multiset& __x, flat_multiset& __y) noexcept { + __x.swap(__y); + } private: template <bool _WasSorted, class... _Args> - _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __append_sort_merge(_Args&&... 
__args) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_type __old_size = size(); __flat_set_utils::__append(*this, std::forward<_Args>(__args)...); @@ -598,13 +653,13 @@ class flat_multiset { } template <class _Kp> - _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace(_Kp&& __key) { auto __it = upper_bound(__key); return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key)); } template <class _Kp> - _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { auto __prev_larger = __hint != cbegin() && __compare_(__key, *std::prev(__hint)); auto __next_smaller = __hint != cend() && __compare_(*__hint, __key); @@ -636,7 +691,7 @@ class flat_multiset { } template <class _Self, class _Kp> - _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) { auto __it = __self.lower_bound(__key); auto __last = __self.end(); if (__it == __last || __self.__compare_(__key, *__it)) { @@ -646,29 +701,30 @@ class flat_multiset { } template <class _Self, class _Kp> - _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { using __iter = _If<is_const_v<__libcpp_remove_reference_t<_Self>>, const_iterator, iterator>; auto [__key_first, __key_last] = std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_); return std::make_pair(__iter(__key_first), __iter(__key_last)); } - _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) { if constexpr (__container_traits<_KeyContainer>::__reservable) { __keys_.reserve(__size); } } template <class _Key2, class _Compare2, class _KeyContainer2, class _Predicate> - friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type + friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type _LIBCPP_CONSTEXPR_SINCE_CXX26 erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate); _KeyContainer __keys_; _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { - _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); } key_compare __comp_; @@ -757,7 +813,7 @@ struct uses_allocator<flat_multiset<_Key, _Compare, _KeyContainer>, _Allocator> : bool_constant<uses_allocator_v<_KeyContainer, _Allocator> > {}; template <class _Key, class _Compare, class _KeyContainer, class _Predicate> -_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type erase_if(flat_multiset<_Key, _Compare, 
_KeyContainer>& __flat_multiset, _Predicate __pred) { auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); }); auto __it = diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index d53b6cec707d8..63dd7fcacdcc9 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -151,45 +151,41 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, _CharT __value) } } -# if _LIBCPP_HAS_UNICODE template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, char>) _LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0])); - if (__bytes == 0) - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); - - for (size_t __i = 0; __i < __n; ++__i) - __out_it = __formatter::__copy( - std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it)); - return __out_it; -} - +# if _LIBCPP_HAS_UNICODE + if constexpr (same_as<_CharT, char>) { + std::size_t __bytes = std::countl_one(static_cast<unsigned char>(__value.__data[0])); + if (__bytes == 0) + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + + for (size_t __i = 0; __i < __n; ++__i) + __out_it = __formatter::__copy( + std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + __bytes, std::move(__out_it)); + return __out_it; # if _LIBCPP_HAS_WIDE_CHARACTERS -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - if (!__unicode::__is_high_surrogate(__value.__data[0])) - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); - - for (size_t __i = 0; __i < __n; ++__i) - __out_it = __formatter::__copy( - std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it)); - return __out_it; -} - -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> - requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { - return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); -} + } else if constexpr (same_as<_CharT, wchar_t>) { + if constexpr (sizeof(wchar_t) == 2) { + if (!__unicode::__is_high_surrogate(__value.__data[0])) + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + + for (size_t __i = 0; __i < __n; ++__i) + __out_it = __formatter::__copy( + std::addressof(__value.__data[0]), std::addressof(__value.__data[0]) + 2, std::move(__out_it)); + return __out_it; + } else if constexpr (sizeof(wchar_t) == 4) { + return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); + } else { + static_assert(false, "expected sizeof(wchar_t) to be 2 or 4"); + } # endif // _LIBCPP_HAS_WIDE_CHARACTERS -# else // _LIBCPP_HAS_UNICODE -template <__fmt_char_type _CharT, output_iterator<const _CharT&> _OutIt> -_LIBCPP_HIDE_FROM_ABI _OutIt __fill(_OutIt __out_it, size_t __n, __format_spec::__code_point<_CharT> __value) { + } else { + static_assert(false, "Unexpected CharT"); + } +# else // _LIBCPP_HAS_UNICODE return __formatter::__fill(std::move(__out_it), __n, __value.__data[0]); 
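// [Editor's note -- illustration, not part of this patch.] The branches merged
// above all implement the same contract: emit the fill character __n times,
// regardless of how many code units it occupies. For example, filling with
// U+1F31F encoded in UTF-8 as F0 9F 8C 9F gives countl_one(0xF0) == 4, so the
// four-byte sequence is copied __n times; an ASCII fill byte has no leading
// one-bits, countl_one returns 0, and the single-code-unit fast path is taken.
// On a 16-bit wchar_t the analogous multi-unit case is a surrogate pair.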
+# endif // _LIBCPP_HAS_UNICODE } -# endif // _LIBCPP_HAS_UNICODE /// Writes the input to the output with the required padding. /// diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h index 83bbf1b5e26c3..f74f25fa6e84b 100644 --- a/libcxx/include/__functional/hash.h +++ b/libcxx/include/__functional/hash.h @@ -433,13 +433,10 @@ struct __hash_impl<long double> : __scalar_hash<long double> { template <class _Tp> struct hash : public __hash_impl<_Tp> {}; -#if _LIBCPP_STD_VER >= 17 - template <> struct hash<nullptr_t> : public __unary_function<nullptr_t, size_t> { _LIBCPP_HIDE_FROM_ABI size_t operator()(nullptr_t) const _NOEXCEPT { return 662607004ull; } }; -#endif #ifndef _LIBCPP_CXX03_LANG template <class _Key, class _Hash> @@ -452,18 +449,12 @@ template <class _Key, class _Hash = hash<_Key> > using __has_enabled_hash _LIBCPP_NODEBUG = integral_constant<bool, __check_hash_requirements<_Key, _Hash>::value && is_default_constructible<_Hash>::value >; -# if _LIBCPP_STD_VER >= 17 template <class _Type, class> using __enable_hash_helper_imp _LIBCPP_NODEBUG = _Type; template <class _Type, class... _Keys> using __enable_hash_helper _LIBCPP_NODEBUG = __enable_hash_helper_imp<_Type, __enable_if_t<__all<__has_enabled_hash<_Keys>::value...>::value> >; -# else -template <class _Type, class...> -using __enable_hash_helper _LIBCPP_NODEBUG = _Type; -# endif - #endif // !_LIBCPP_CXX03_LANG _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__functional/identity.h b/libcxx/include/__functional/identity.h index 1b1c6cf73c378..02dde2b4f323d 100644 --- a/libcxx/include/__functional/identity.h +++ b/libcxx/include/__functional/identity.h @@ -44,7 +44,7 @@ struct __is_identity<reference_wrapper<const __identity> > : true_type {}; struct identity { template <class _Tp> - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_Tp&& __t) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator()(_LIBCPP_LIFETIMEBOUND _Tp&& __t) const noexcept { return std::forward<_Tp>(__t); } diff --git a/libcxx/include/__iterator/distance.h b/libcxx/include/__iterator/distance.h index 9be9db0f0c70e..1a9fbf27f776b 100644 --- a/libcxx/include/__iterator/distance.h +++ b/libcxx/include/__iterator/distance.h @@ -11,6 +11,7 @@ #define _LIBCPP___ITERATOR_DISTANCE_H #include <__algorithm/for_each_segment.h> +#include <__concepts/same_as.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/incrementable_traits.h> @@ -41,35 +42,29 @@ template <class _Iter> using __iter_distance_t _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type; #endif -template <class _InputIter, class _Sent> -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_InputIter> __distance(_InputIter __first, _Sent __last) { - __iter_distance_t<_InputIter> __r(0); - for (; __first != __last; ++__first) - ++__r; - return __r; -} - template <class _RandIter, __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_RandIter> __distance(_RandIter __first, _RandIter __last) { return __last - __first; } +template <class _InputIter, class _Sent> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_InputIter> +__distance(_InputIter __first, _Sent __last) { + __iter_distance_t<_InputIter> __r(0); #if _LIBCPP_STD_VER >= 20 -template <class _SegmentedIter, - 
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIter>::value && - __is_segmented_iterator_v<_SegmentedIter>, - int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 __iter_distance_t<_SegmentedIter> -__distance(_SegmentedIter __first, _SegmentedIter __last) { - __iter_distance_t<_SegmentedIter> __r(0); - std::__for_each_segment(__first, __last, [&__r](auto __lfirst, auto __llast) { - __r += std::__distance(__lfirst, __llast); - }); + if constexpr (same_as<_InputIter, _Sent> && __is_segmented_iterator_v<_InputIter>) { + std::__for_each_segment(__first, __last, [&__r](auto __lfirst, auto __llast) { + __r += std::__distance(__lfirst, __llast); + }); + } else +#endif + { + for (; __first != __last; ++__first) + ++__r; + } return __r; } -#endif // _LIBCPP_STD_VER >= 20 template <class _InputIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type diff --git a/libcxx/include/__iterator/segmented_iterator.h b/libcxx/include/__iterator/segmented_iterator.h index 5df9737137101..dc56a740130b5 100644 --- a/libcxx/include/__iterator/segmented_iterator.h +++ b/libcxx/include/__iterator/segmented_iterator.h @@ -75,11 +75,6 @@ inline const bool __has_specialization_v<_Tp, sizeof(_Tp) * 0> = true; template <class _Iterator> inline const bool __is_segmented_iterator_v = __has_specialization_v<__segmented_iterator_traits<_Iterator> >; -template <class _SegmentedIterator> -struct __has_random_access_local_iterator - : __has_random_access_iterator_category< - typename __segmented_iterator_traits< _SegmentedIterator >::__local_iterator > {}; - _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___SEGMENTED_ITERATOR_H diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index d18d9682da449..98745f600a6ec 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -117,8 +117,8 @@ class __wrap_iter { friend class span; template <class _Tp, size_t _Size> friend struct array; - template <class _Tp> - friend class optional; + template <class _Tp, class> + friend struct __optional_iterator; }; template <class _Iter1> diff --git a/libcxx/include/__locale b/libcxx/include/__locale index eb7b7786208e8..0948bd29b6f1b 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -57,9 +57,8 @@ _LIBCPP_HIDE_FROM_ABI const _Facet& use_facet(const locale&); class _LIBCPP_EXPORTED_FROM_ABI locale { public: // locale is essentially a shared_ptr that doesn't support weak_ptrs and never got a move constructor, - // so it is trivially relocatable. Like shared_ptr, it is also replaceable. + // so it is trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = locale; - using __replaceable _LIBCPP_NODEBUG = locale; // types: class _LIBCPP_EXPORTED_FROM_ABI facet; diff --git a/libcxx/include/__locale_dir/money.h b/libcxx/include/__locale_dir/money.h index c1296665505e1..12ba38467d805 100644 --- a/libcxx/include/__locale_dir/money.h +++ b/libcxx/include/__locale_dir/money.h @@ -433,7 +433,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( __err |= ios_base::failbit; return false; } - for (++__b; __fd > 0; --__fd, ++__b) { + for (++__b; __fd > 0; --__fd, (void)++__b) { if (__b == __e || !__ct.is(ctype_base::digit, *__b)) { __err |= ios_base::failbit; return false; @@ -451,7 +451,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( } } if (__trailing_sign) { - for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, ++__b) { + for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, (void)++__b) { if (__b == __e || *__b != (*__trailing_sign)[__i]) { __err |= ios_base::failbit; return false; diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h index 7ca8ffe348959..ff357cd2d97db 100644 --- a/libcxx/include/__locale_dir/num.h +++ b/libcxx/include/__locale_dir/num.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___LOCALE_DIR_NUM_H #define _LIBCPP___LOCALE_DIR_NUM_H +#include <__algorithm/copy.h> #include <__algorithm/find.h> #include <__algorithm/reverse.h> #include <__charconv/to_chars_integral.h> @@ -885,9 +886,7 @@ num_put<_CharT, _OutputIterator>::do_put(iter_type __s, ios_base& __iob, char_ty const numpunct<char_type>& __np = std::use_facet<numpunct<char_type> >(__iob.getloc()); typedef typename numpunct<char_type>::string_type string_type; string_type __nm = __v ? __np.truename() : __np.falsename(); - for (typename string_type::iterator __i = __nm.begin(); __i != __nm.end(); ++__i, ++__s) - *__s = *__i; - return __s; + return std::copy(__nm.begin(), __nm.end(), __s); } template <class _CharT, class _OutputIterator> diff --git a/libcxx/include/__locale_dir/pad_and_output.h b/libcxx/include/__locale_dir/pad_and_output.h index a1cb37d0786da..bdd4d2856dad6 100644 --- a/libcxx/include/__locale_dir/pad_and_output.h +++ b/libcxx/include/__locale_dir/pad_and_output.h @@ -13,6 +13,8 @@ #if _LIBCPP_HAS_LOCALIZATION +# include <__algorithm/copy.h> +# include <__algorithm/fill_n.h> # include <ios> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -30,12 +32,9 @@ _LIBCPP_HIDE_FROM_ABI _OutputIterator __pad_and_output( __ns -= __sz; else __ns = 0; - for (; __ob < __op; ++__ob, ++__s) - *__s = *__ob; - for (; __ns; --__ns, ++__s) - *__s = __fl; - for (; __ob < __oe; ++__ob, ++__s) - *__s = *__ob; + __s = std::copy(__ob, __op, __s); + __s = std::fill_n(__s, __ns, __fl); + __s = std::copy(__op, __oe, __s); __iob.width(0); return __s; } diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h index 0388d752ccc8b..f1f1c920453cf 100644 --- a/libcxx/include/__memory/compressed_pair.h +++ b/libcxx/include/__memory/compressed_pair.h @@ -67,7 +67,7 @@ inline const size_t __compressed_pair_alignment<_Tp&> = _LIBCPP_ALIGNOF(void*); template <class _ToPad> inline const bool __is_reference_or_unpadded_object = - (is_empty<_ToPad>::value && !__libcpp_is_final<_ToPad>::value) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; + (is_empty<_ToPad>::value && !__is_final_v<_ToPad>) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; template <class _Tp> inline const bool __is_reference_or_unpadded_object<_Tp&> = true; diff --git a/libcxx/include/__memory/construct_at.h 
b/libcxx/include/__memory/construct_at.h index 658269158d945..5378c03abab3a 100644 --- a/libcxx/include/__memory/construct_at.h +++ b/libcxx/include/__memory/construct_at.h @@ -14,7 +14,6 @@ #include <__config> #include <__memory/addressof.h> #include <__new/placement_new_delete.h> -#include <__type_traits/enable_if.h> #include <__type_traits/is_array.h> #include <__utility/declval.h> #include <__utility/forward.h> @@ -55,35 +54,25 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __l // The internal functions are available regardless of the language version (with the exception of the `__destroy_at` // taking an array). -template <class _Tp, __enable_if_t<!is_array<_Tp>::value, int> = 0> +template <class _Tp> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy_at(_Tp* __loc) { _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - __loc->~_Tp(); -} - #if _LIBCPP_STD_VER >= 20 -template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) { - _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); - for (auto&& __val : *__loc) - std::__destroy_at(std::addressof(__val)); -} + if constexpr (is_array_v<_Tp>) { + for (auto&& __val : *__loc) + std::__destroy_at(std::addressof(__val)); + } else #endif + { + __loc->~_Tp(); + } +} #if _LIBCPP_STD_VER >= 17 - -template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0> +template <class _Tp> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } - -# if _LIBCPP_STD_VER >= 20 -template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { - std::__destroy_at(__loc); -} -# endif - #endif // _LIBCPP_STD_VER >= 17 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index e90db587d2836..67b94114988b5 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -317,10 +317,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI shared_ptr { #endif // A shared_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It is also replaceable because assignment just rebinds the - // shared_ptr to manage a different object. + // any bookkeeping, so it's always trivially relocatable. using __trivially_relocatable _LIBCPP_NODEBUG = shared_ptr; - using __replaceable _LIBCPP_NODEBUG = shared_ptr; private: element_type* __ptr_; @@ -1186,9 +1184,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI weak_ptr { #endif // A weak_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It's also replaceable for the same reason. + // any bookkeeping, so it's always trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = weak_ptr; - using __replaceable _LIBCPP_NODEBUG = weak_ptr; private: element_type* __ptr_;
diff --git a/libcxx/include/__memory/temp_value.h b/libcxx/include/__memory/temp_value.h index 4a133b3fbcf6c..5285bcab9a30d 100644 --- a/libcxx/include/__memory/temp_value.h +++ b/libcxx/include/__memory/temp_value.h @@ -12,7 +12,6 @@ #include <__config> #include <__memory/addressof.h> #include <__memory/allocator_traits.h> -#include <__type_traits/aligned_storage.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -26,7 +25,7 @@ struct __temp_value { typedef allocator_traits<_Alloc> _Traits; #ifdef _LIBCPP_CXX03_LANG - typename aligned_storage<sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)>::type __v; + _ALIGNAS_TYPE(_Tp) char __v[sizeof(_Tp)]; #else union { _Tp __v;
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index eff24546cdc01..491d1c2e42417 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -39,7 +39,6 @@ #include <__type_traits/is_function.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> @@ -145,8 +144,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __is_replaceable_v<deleter_type>, unique_ptr, void>; private: _LIBCPP_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_); @@ -413,8 +410,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr<_Tp[], _Dp> { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __is_replaceable_v<deleter_type>, unique_ptr, void>; private: template <class _Up, class _OtherDeleter>
diff --git a/libcxx/include/__new/align_val_t.h b/libcxx/include/__new/align_val_t.h index 03ab7cb143a2b..d8ce5283345fb 100644 --- a/libcxx/include/__new/align_val_t.h +++ b/libcxx/include/__new/align_val_t.h @@ -16,6 +16,12 @@ # pragma GCC system_header #endif +// <vcruntime_new.h> defines its own std::align_val_t type, +// which we use in order to be ABI-compatible with other STLs on Windows. +#if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && defined(_LIBCPP_ABI_VCRUNTIME) +# include <vcruntime_new.h> +#endif + _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #if _LIBCPP_HAS_LIBRARY_ALIGNED_ALLOCATION && !defined(_LIBCPP_ABI_VCRUNTIME) # ifndef _LIBCPP_CXX03_LANG
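// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the raw-storage pattern
// that replaces aligned_storage in __temp_value above (and in <future> later
// in this patch). A suitably aligned char array provides the bytes; the
// object is created with placement new and destroyed manually. raw_slot is a
// hypothetical name; C++17 is assumed for std::launder.
#include <new>
#include <utility>

template <class T>
struct raw_slot {
  alignas(T) char buf[sizeof(T)]; // raw bytes only; no T lives here yet

  template <class... Args>
  T* construct(Args&&... args) {
    return ::new (static_cast<void*>(buf)) T(std::forward<Args>(args)...);
  }
  T* get() { return std::launder(reinterpret_cast<T*>(buf)); }
  void destroy() { get()->~T(); } // must run before reuse or end of storage
};
// ---------------------------------------------------------------------------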
diff --git a/libcxx/include/__new/exceptions.h b/libcxx/include/__new/exceptions.h index 86951818b7aa2..483e5e3811182 100644 --- a/libcxx/include/__new/exceptions.h +++ b/libcxx/include/__new/exceptions.h @@ -17,6 +17,12 @@ # pragma GCC system_header #endif +// <vcruntime_exception.h> defines its own std::bad_alloc type, +// which we use in order to be ABI-compatible with other STLs on Windows. +#if defined(_LIBCPP_ABI_VCRUNTIME) +# include <vcruntime_exception.h> +#endif + _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #if !defined(_LIBCPP_ABI_VCRUNTIME)
diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..4491bab2b1479 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> @@ -113,27 +121,27 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept { #if _LIBCPP_STD_VER >= 26 template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { return std::__add_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { return std::__sub_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { return std::__mul_sat(__x, __y); } template <__signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { return std::__div_sat(__x, __y); } template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp> -_LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { return std::__saturate_cast<_Rp>(__x); }
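// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the overflow-builtin
// fallback path above as one self-contained function. __builtin_add_overflow
// is the GCC/Clang builtin the patch itself uses. On signed overflow both
// operands necessarily have the same sign, so the sign of one operand picks
// the saturation limit; unsigned overflow always saturates to the maximum.
#include <limits>
#include <type_traits>

template <class T>
constexpr T add_sat_sketch(T x, T y) noexcept {
  static_assert(std::is_integral_v<T>);
  T sum = 0;
  if (!__builtin_add_overflow(x, y, &sum))
    return sum;
  if constexpr (std::is_signed_v<T>) {
    if (x < 0)
      return std::numeric_limits<T>::min(); // negative + negative overflowed downward
  }
  return std::numeric_limits<T>::max();
}

static_assert(add_sat_sketch<signed char>(100, 100) == 127);
static_assert(add_sat_sketch<unsigned char>(200, 100) == 255);
// ---------------------------------------------------------------------------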
diff --git a/libcxx/include/__random/piecewise_constant_distribution.h b/libcxx/include/__random/piecewise_constant_distribution.h index c5bfa8dc3a4be..3faf339325f74 100644 --- a/libcxx/include/__random/piecewise_constant_distribution.h +++ b/libcxx/include/__random/piecewise_constant_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/vector.h> @@ -190,8 +192,7 @@ piecewise_constant_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size() - 1); - for (size_t __i = 0; __i < __b_.size() - 1; ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size() - 1, std::back_inserter(__densities_)); __init(); } }
diff --git a/libcxx/include/__random/piecewise_linear_distribution.h b/libcxx/include/__random/piecewise_linear_distribution.h index a9906430c005c..8aa3f19ca9004 100644 --- a/libcxx/include/__random/piecewise_linear_distribution.h +++ b/libcxx/include/__random/piecewise_linear_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/comparison.h> @@ -194,8 +196,7 @@ piecewise_linear_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size()); - for (size_t __i = 0; __i < __b_.size(); ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size(), std::back_inserter(__densities_)); __init(); } }
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 15368a3bc8955..1e05e4df8ba0f 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -30,7 +30,6 @@ #include <__type_traits/integral_constant.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/is_trivially_relocatable.h> @@ -484,10 +483,6 @@ public: __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, __split_buffer, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - __split_buffer, - void>; __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete;
diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 694796922c914..ceae22bb48702 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -887,6 +887,18 @@ public: } _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __t); + + _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __other, const allocator_type& __alloc) + : __begin_node_(__end_node()), __node_alloc_(__alloc), __size_(0), __value_comp_(__other.value_comp()) { + if (__other.size() == 0) + return; + + *__root_ptr() = static_cast<__node_base_pointer>(__copy_construct_tree(__other.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __other.size(); + } + _LIBCPP_HIDE_FROM_ABI __tree& operator=(const __tree& __t); template <class _ForwardIterator> _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); @@ -995,27 +1007,6 @@ public: std::forward<_Args>(__args)...); } - template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> - 
_LIBCPP_HIDE_FROM_ABI void - __insert_unique_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) { - __emplace_hint_unique(__p, const_cast<key_type&&>(__value.first), std::move(__value.second)); - } - - template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_unique(__p, std::move(__value)); - } - - template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, value_type&& __value) { - __emplace_hint_multi(__p, const_cast<key_type&&>(__value.first), std::move(__value.second)); - } - - template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> - _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, _Tp&& __value) { - __emplace_hint_multi(__p, std::move(__value)); - } - template <class _InIter, class _Sent> _LIBCPP_HIDE_FROM_ABI void __insert_range_multi(_InIter __first, _Sent __last) { if (__first == __last) @@ -1388,19 +1379,19 @@ private: // copy the exact structure 1:1. Since this is for copy construction _only_ we know that we get a correct tree. If we // didn't get a correct tree, the invariants of __tree are broken and we have a much bigger problem than an improperly // balanced tree. + template <class _NodeConstructor> #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_construct_tree(__node_pointer __src) { + __node_pointer __construct_from_tree(__node_pointer __src, _NodeConstructor __construct) { if (!__src) return nullptr; - __node_holder __new_node = __construct_node(__src->__get_value()); + __node_holder __new_node = __construct(__src->__get_value()); unique_ptr<__node, __tree_deleter> __left( - __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)), __node_alloc_); - __node_pointer __right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + __construct_from_tree(static_cast<__node_pointer>(__src->__left_), __construct), __node_alloc_); + __node_pointer __right = __construct_from_tree(static_cast<__node_pointer>(__src->__right_), __construct); __node_pointer __new_node_ptr = __new_node.release(); @@ -1414,46 +1405,85 @@ private: return __new_node_ptr; } + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](const value_type& __val) { return __construct_node(__val); }); + } + + template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type_v<_ValueT>, int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { + return __construct_node(const_cast<key_type&&>(__val.first), std::move(__val.second)); + }); + } + + template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type_v<_ValueT>, int> = 0> + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_construct_tree(__node_pointer __src) { + return __construct_from_tree(__src, [this](value_type& __val) { return __construct_node(std::move(__val)); }); + } + + template <class _Assignment, class _ConstructionAlg> // This copy assignment will always produce a correct red-black-tree assuming the incoming tree is correct, since our // own tree is a red-black-tree and the 
incoming tree is a red-black-tree. The invariants of a red-black-tree are // temporarily not met until all of the incoming red-black tree is copied. #ifdef _LIBCPP_COMPILER_CLANG_BASED // FIXME: GCC complains about not being able to always_inline a recursive function _LIBCPP_HIDE_FROM_ABI #endif - __node_pointer - __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + __node_pointer __assign_from_tree( + __node_pointer __dest, __node_pointer __src, _Assignment __assign, _ConstructionAlg __construct_subtree) { if (!__src) { destroy(__dest); return nullptr; } - __assign_value(__dest->__get_value(), __src->__get_value()); + __assign(__dest->__get_value(), __src->__get_value()); __dest->__is_black_ = __src->__is_black_; // If we already have a left node in the destination tree, reuse it and copy-assign recursively if (__dest->__left_) { - __dest->__left_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__left_), static_cast<__node_pointer>(__src->__left_))); + __dest->__left_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__left_), + static_cast<__node_pointer>(__src->__left_), + __assign, + __construct_subtree)); // Otherwise, we must create new nodes; copy-construct from here on } else if (__src->__left_) { - auto __new_left = __copy_construct_tree(static_cast<__node_pointer>(__src->__left_)); + auto __new_left = __construct_subtree(static_cast<__node_pointer>(__src->__left_)); __dest->__left_ = static_cast<__node_base_pointer>(__new_left); __new_left->__parent_ = static_cast<__end_node_pointer>(__dest); } // Identical to the left case above, just for the right nodes if (__dest->__right_) { - __dest->__right_ = static_cast<__node_base_pointer>(__copy_assign_tree( - static_cast<__node_pointer>(__dest->__right_), static_cast<__node_pointer>(__src->__right_))); + __dest->__right_ = static_cast<__node_base_pointer>(__assign_from_tree( + static_cast<__node_pointer>(__dest->__right_), + static_cast<__node_pointer>(__src->__right_), + __assign, + __construct_subtree)); } else if (__src->__right_) { - auto __new_right = __copy_construct_tree(static_cast<__node_pointer>(__src->__right_)); + auto __new_right = __construct_subtree(static_cast<__node_pointer>(__src->__right_)); __dest->__right_ = static_cast<__node_base_pointer>(__new_right); __new_right->__parent_ = static_cast<__end_node_pointer>(__dest); } return __dest; } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __copy_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, const value_type& __rhs) { __assign_value(__lhs, __rhs); }, + [this](__node_pointer __nd) { return __copy_construct_tree(__nd); }); + } + + _LIBCPP_HIDE_FROM_ABI __node_pointer __move_assign_tree(__node_pointer __dest, __node_pointer __src) { + return __assign_from_tree( + __dest, + __src, + [](value_type& __lhs, value_type& __rhs) { __assign_value(__lhs, std::move(__rhs)); }, + [this](__node_pointer __nd) { return __move_construct_tree(__nd); }); + } }; // Precondition: __size_ != 0 @@ -1594,21 +1624,26 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( template <class _Tp, class _Compare, class _Allocator> __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& __a) - : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { + : __begin_node_(__end_node()), + __node_alloc_(__node_allocator(__a)), + __size_(0), + 
__value_comp_(std::move(__t.value_comp())) { + if (__t.size() == 0) + return; if (__a == __t.__alloc()) { - if (__t.__size_ == 0) - __begin_node_ = __end_node(); - else { - __begin_node_ = __t.__begin_node_; - __end_node()->__left_ = __t.__end_node()->__left_; - __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __size_ = __t.__size_; - __t.__begin_node_ = __t.__end_node(); - __t.__end_node()->__left_ = nullptr; - __t.__size_ = 0; - } + __begin_node_ = __t.__begin_node_; + __end_node()->__left_ = __t.__end_node()->__left_; + __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); + __t.__end_node()->__left_ = nullptr; + __t.__size_ = 0; } else { - __begin_node_ = __end_node(); + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + __root()->__parent_ = __end_node(); + __begin_node_ = static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } @@ -1633,22 +1668,21 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) template <class _Tp, class _Compare, class _Allocator> void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { - if (__node_alloc() == __t.__node_alloc()) + if (__node_alloc() == __t.__node_alloc()) { __move_assign(__t, true_type()); - else { - value_comp() = std::move(__t.value_comp()); - const_iterator __e = end(); + } else { + value_comp() = std::move(__t.value_comp()); if (__size_ != 0) { - _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.__size_ != 0) { - __assign_value(__cache.__get()->__get_value(), std::move(__t.remove(__t.begin())->__get_value())); - __node_insert_multi(__cache.__get()); - __cache.__advance(); - } - } - while (__t.__size_ != 0) { - __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__get_value())); + *__root_ptr() = static_cast<__node_base_pointer>(__move_assign_tree(__root(), __t.__root())); + } else { + *__root_ptr() = static_cast<__node_base_pointer>(__move_construct_tree(__t.__root())); + if (__root()) + __root()->__parent_ = __end_node(); } + __begin_node_ = + __end_node()->__left_ ? 
static_cast<__end_node_pointer>(std::__tree_min(__end_node()->__left_)) : __end_node(); + __size_ = __t.size(); + __t.clear(); // Ensure that __t is in a valid state after moving out the keys } } diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h index 60f2a667a1ba3..719edc0e342c0 100644 --- a/libcxx/include/__tuple/tuple_size.h +++ b/libcxx/include/__tuple/tuple_size.h @@ -12,7 +12,6 @@ #include <__config> #include <__cstddef/size_t.h> #include <__fwd/tuple.h> -#include <__tuple/tuple_types.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_const.h> diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h index 5c2208ae0c70a..33c0368d0c3c8 100644 --- a/libcxx/include/__type_traits/aligned_storage.h +++ b/libcxx/include/__type_traits/aligned_storage.h @@ -11,8 +11,6 @@ #include <__config> #include <__cstddef/size_t.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/type_list.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -21,10 +19,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _Tp> -struct __align_type { - static const size_t value = _LIBCPP_PREFERRED_ALIGNOF(_Tp); - typedef _Tp type; -}; +struct _ALIGNAS(_LIBCPP_PREFERRED_ALIGNOF(_Tp)) _AlignedAsT {}; + +template <class... _Args> +struct __max_align_impl : _AlignedAsT<_Args>... {}; struct __struct_double { long double __lx; @@ -33,41 +31,16 @@ struct __struct_double4 { double __lx[4]; }; -using __all_types _LIBCPP_NODEBUG = - __type_list<__align_type<unsigned char>, - __align_type<unsigned short>, - __align_type<unsigned int>, - __align_type<unsigned long>, - __align_type<unsigned long long>, - __align_type<double>, - __align_type<long double>, - __align_type<__struct_double>, - __align_type<__struct_double4>, - __align_type<int*> >; - -template <class _TL, size_t _Len> -struct __find_max_align; - -template <class _Head, size_t _Len> -struct __find_max_align<__type_list<_Head>, _Len> : public integral_constant<size_t, _Head::value> {}; - -template <size_t _Len, size_t _A1, size_t _A2> -struct __select_align { -private: - static const size_t __min = _A2 < _A1 ? _A2 : _A1; - static const size_t __max = _A1 < _A2 ? _A2 : _A1; - -public: - static const size_t value = _Len < __max ? __min : __max; -}; +inline const size_t __aligned_storage_max_align = + _LIBCPP_ALIGNOF(__max_align_impl<unsigned long long, double, long double, __struct_double, __struct_double4, int*>); -template <class _Head, class... _Tail, size_t _Len> -struct __find_max_align<__type_list<_Head, _Tail...>, _Len> - : public integral_constant< - size_t, - __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {}; +template <size_t _Len> +inline const size_t __aligned_storage_alignment = + _Len > __aligned_storage_max_align + ? 
__aligned_storage_max_align + : size_t(1) << ((sizeof(size_t) * __CHAR_BIT__) - __builtin_clzg(_Len) - 1); -template <size_t _Len, size_t _Align = __find_max_align<__all_types, _Len>::value> +template <size_t _Len, size_t _Align = __aligned_storage_alignment<_Len> > struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_NO_SPECIALIZATIONS aligned_storage { union _ALIGNAS(_Align) type { unsigned char __data[(_Len + _Align - 1) / _Align * _Align]; @@ -77,7 +50,7 @@ struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_NO_SPECIALIZATIONS aligned_storage { #if _LIBCPP_STD_VER >= 14 _LIBCPP_SUPPRESS_DEPRECATED_PUSH -template <size_t _Len, size_t _Align = __find_max_align<__all_types, _Len>::value> +template <size_t _Len, size_t _Align = __aligned_storage_alignment<_Len> > using aligned_storage_t _LIBCPP_DEPRECATED_IN_CXX23 = typename aligned_storage<_Len, _Align>::type; _LIBCPP_SUPPRESS_DEPRECATED_POP diff --git a/libcxx/include/__type_traits/is_final.h b/libcxx/include/__type_traits/is_final.h index e9ef1425c9760..ab1cace52c4f6 100644 --- a/libcxx/include/__type_traits/is_final.h +++ b/libcxx/include/__type_traits/is_final.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _Tp> -struct __libcpp_is_final : integral_constant<bool, __is_final(_Tp)> {}; +inline const bool __is_final_v = __is_final(_Tp); #if _LIBCPP_STD_VER >= 14 template <class _Tp> diff --git a/libcxx/include/__type_traits/is_floating_point.h b/libcxx/include/__type_traits/is_floating_point.h index b87363fe5b357..586fce6af60d6 100644 --- a/libcxx/include/__type_traits/is_floating_point.h +++ b/libcxx/include/__type_traits/is_floating_point.h @@ -20,18 +20,19 @@ _LIBCPP_BEGIN_NAMESPACE_STD // clang-format off -template <class _Tp> struct __libcpp_is_floating_point : false_type {}; -template <> struct __libcpp_is_floating_point<float> : true_type {}; -template <> struct __libcpp_is_floating_point<double> : true_type {}; -template <> struct __libcpp_is_floating_point<long double> : true_type {}; +template <class _Tp> inline const bool __is_floating_point_impl = false; +template <> inline const bool __is_floating_point_impl<float> = true; +template <> inline const bool __is_floating_point_impl<double> = true; +template <> inline const bool __is_floating_point_impl<long double> = true; // clang-format on template <class _Tp> -struct _LIBCPP_NO_SPECIALIZATIONS is_floating_point : __libcpp_is_floating_point<__remove_cv_t<_Tp> > {}; +struct _LIBCPP_NO_SPECIALIZATIONS is_floating_point + : integral_constant<bool, __is_floating_point_impl<__remove_cv_t<_Tp> > > {}; #if _LIBCPP_STD_VER >= 17 template <class _Tp> -_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_floating_point_v = is_floating_point<_Tp>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_floating_point_v = __is_floating_point_impl<__remove_cv_t<_Tp>>; #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_replaceable.h b/libcxx/include/__type_traits/is_replaceable.h deleted file mode 100644 index e1d17c099cd3a..0000000000000 --- a/libcxx/include/__type_traits/is_replaceable.h +++ /dev/null @@ -1,61 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
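// ---------------------------------------------------------------------------
// Aside (illustrative sketch, not part of the patch): the new default-
// alignment computation for aligned_storage, in portable C++20. The builtin
// expression above is equivalent to std::bit_floor (the largest power of two
// not exceeding _Len), capped at the strictest alignment in the probe list.
#include <bit>
#include <cstddef>

constexpr std::size_t default_alignment(std::size_t len, std::size_t max_align) {
  return len > max_align ? max_align : std::bit_floor(len); // len must be nonzero
}

static_assert(default_alignment(3, 16) == 2);    // rounded down to a power of two
static_assert(default_alignment(8, 16) == 8);    // already a power of two
static_assert(default_alignment(100, 16) == 16); // capped at the maximum
// ---------------------------------------------------------------------------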
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H -#define _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H - -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// A type is replaceable if, with `x` and `y` being different objects, `x = std::move(y)` is equivalent to: -// -// std::destroy_at(&x) -// std::construct_at(&x, std::move(y)) -// -// This allows turning a move-assignment into a sequence of destroy + move-construct, which -// is often more efficient. This is especially relevant when the move-construct is in fact -// part of a trivial relocation from somewhere else, in which case there is a huge win. -// -// Note that this requires language support in order to be really effective, but we -// currently emulate the base template with something very conservative. -template <class _Tp, class = void> -struct __is_replaceable : is_trivially_copyable<_Tp> {}; - -template <class _Tp> -struct __is_replaceable<_Tp, __enable_if_t<is_same<_Tp, typename _Tp::__replaceable>::value> > : true_type {}; - -template <class _Tp> -inline const bool __is_replaceable_v = __is_replaceable<_Tp>::value; - -// Determines whether an allocator member of a container is replaceable. -// -// First, we require the allocator type to be considered replaceable. If not, then something fishy might be -// happening. Assuming the allocator type is replaceable, we conclude replaceability of the allocator as a -// member of the container if the allocator always compares equal (in which case propagation doesn't matter), -// or if the allocator always propagates on assignment, which is required in order for move construction and -// assignment to be equivalent. -template <class _AllocatorTraits> -struct __container_allocator_is_replaceable - : integral_constant<bool, - __is_replaceable_v<typename _AllocatorTraits::allocator_type> && - (_AllocatorTraits::is_always_equal::value || - (_AllocatorTraits::propagate_on_container_move_assignment::value && - _AllocatorTraits::propagate_on_container_copy_assignment::value))> {}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H diff --git a/libcxx/include/__tuple/tuple_types.h b/libcxx/include/__type_traits/is_within_lifetime.h similarity index 58% rename from libcxx/include/__tuple/tuple_types.h rename to libcxx/include/__type_traits/is_within_lifetime.h index 7e1256cf8790e..242f2adaf357b 100644 --- a/libcxx/include/__tuple/tuple_types.h +++ b/libcxx/include/__type_traits/is_within_lifetime.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___TUPLE_TUPLE_TYPES_H -#define _LIBCPP___TUPLE_TUPLE_TYPES_H +#ifndef _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H +#define _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H #include <__config> @@ -17,9 +17,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template <class... 
_Tp> -struct __tuple_types {}; +#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime) +template <class _Tp> +_LIBCPP_HIDE_FROM_ABI consteval bool is_within_lifetime(const _Tp* __p) noexcept { + return __builtin_is_within_lifetime(__p); +} +#endif _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP___TUPLE_TUPLE_TYPES_H +#endif // _LIBCPP___TYPE_TRAITS_IS_WITHIN_LIFETIME_H diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 33694c52430f1..61485123114ba 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -31,8 +31,6 @@ #include <__type_traits/is_implicitly_default_constructible.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> -#include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/nat.h> @@ -102,7 +100,6 @@ struct pair __conditional_t<__libcpp_is_trivially_relocatable<_T1>::value && __libcpp_is_trivially_relocatable<_T2>::value, pair, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_T1> && __is_replaceable_v<_T2>, pair, void>; _LIBCPP_HIDE_FROM_ABI pair(pair const&) = default; _LIBCPP_HIDE_FROM_ABI pair(pair&&) = default; diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index 316d3a9d10eff..7051e044314ea 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -54,7 +54,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_pointer.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/type_identity.h> @@ -123,10 +122,6 @@ class vector { __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, vector, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - vector, - void>; static_assert(__check_valid_allocator<allocator_type>::value, ""); static_assert(is_same<typename allocator_type::value_type, value_type>::value, @@ -664,9 +659,6 @@ class 
vector { _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR // Bound the iterator according to the capacity, rather than the size. @@ -971,36 +963,6 @@ vector<_Tp, _Allocator>::__construct_at_end(_InputIterator __first, _Sentinel __ __tx.__pos_ = std::__uninitialized_allocator_copy(this->__alloc_, std::move(__first), std::move(__last), __tx.__pos_); } -// Default constructs __n objects starting at __end_ -// throws if construction throws -// Postcondition: size() == size() + __n -// Exception safety: strong. -template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n) { - if (static_cast<size_type>(this->__cap_ - this->__end_) >= __n) - this->__construct_at_end(__n); - else { - __split_buffer<value_type, allocator_type&> __v(__recommend(size() + __n), size(), this->__alloc_); - __v.__construct_at_end(__n); - __swap_out_circular_buffer(__v); - } -} - -// Default constructs __n objects starting at __end_ -// throws if construction throws -// Postcondition: size() == size() + __n -// Exception safety: strong. -template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n, const_reference __x) { - if (static_cast<size_type>(this->__cap_ - this->__end_) >= __n) - this->__construct_at_end(__n, __x); - else { - __split_buffer<value_type, allocator_type&> __v(__recommend(size() + __n), size(), this->__alloc_); - __v.__construct_at_end(__n, __x); - __swap_out_circular_buffer(__v); - } -} - template <class _Tp, class _Allocator> _LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>::vector(vector&& __x) #if _LIBCPP_STD_VER >= 17 @@ -1402,21 +1364,35 @@ vector<_Tp, _Allocator>::__insert_with_size( } template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz) { - size_type __cs = size(); - if (__cs < __sz) - this->__append(__sz - __cs); - else if (__cs > __sz) - this->__destruct_at_end(this->__begin_ + __sz); +_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __new_size) { + size_type __current_size = size(); + if (__current_size < __new_size) { + if (__new_size <= capacity()) { + __construct_at_end(__new_size - __current_size); + } else { + __split_buffer<value_type, allocator_type&> __v(__recommend(__new_size), __current_size, __alloc_); + __v.__construct_at_end(__new_size - __current_size); + __swap_out_circular_buffer(__v); + } + } else if (__current_size > __new_size) { + this->__destruct_at_end(this->__begin_ + __new_size); + } } template <class _Tp, class _Allocator> -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz, const_reference __x) { - size_type __cs = size(); - if (__cs < __sz) - this->__append(__sz - __cs, __x); - else if (__cs > __sz) - this->__destruct_at_end(this->__begin_ + __sz); +_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __new_size, const_reference __x) { + size_type __current_size = size(); + if (__current_size < __new_size) { + if (__new_size <= capacity()) + 
__construct_at_end(__new_size - __current_size, __x); + else { + __split_buffer<value_type, allocator_type&> __v(__recommend(__new_size), __current_size, __alloc_); + __v.__construct_at_end(__new_size - __current_size, __x); + __swap_out_circular_buffer(__v); + } + } else if (__current_size > __new_size) { + this->__destruct_at_end(this->__begin_ + __new_size); + } } template <class _Tp, class _Allocator> diff --git a/libcxx/include/any b/libcxx/include/any index 148fb16c802a5..b3e5b8748df4c 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -88,7 +88,6 @@ namespace std { # include <__new/allocate.h> # include <__type_traits/add_cv_quals.h> # include <__type_traits/add_pointer.h> -# include <__type_traits/aligned_storage.h> # include <__type_traits/conditional.h> # include <__type_traits/decay.h> # include <__type_traits/enable_if.h> @@ -147,14 +146,13 @@ template <class _ValueType> _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT; namespace __any_imp { -_LIBCPP_SUPPRESS_DEPRECATED_PUSH -using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>; -_LIBCPP_SUPPRESS_DEPRECATED_POP +inline constexpr size_t __small_buffer_size = 3 * sizeof(void*); +inline constexpr size_t __small_buffer_alignment = alignof(void*); template <class _Tp> using _IsSmallObject _LIBCPP_NODEBUG = integral_constant<bool, - sizeof(_Tp) <= sizeof(_Buffer) && alignof(_Buffer) % alignof(_Tp) == 0 && + sizeof(_Tp) <= __small_buffer_size && alignof(_Tp) <= __small_buffer_alignment && is_nothrow_move_constructible<_Tp>::value >; enum class _Action { _Destroy, _Copy, _Move, _Get, _TypeInfo }; @@ -284,7 +282,7 @@ private: union _Storage { _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {} void* __ptr; - __any_imp::_Buffer __buf; + alignas(__any_imp::__small_buffer_alignment) char __buf[__any_imp::__small_buffer_size]; }; _LIBCPP_HIDE_FROM_ABI void* diff --git a/libcxx/include/array b/libcxx/include/array index 9643fc1dd9dca..ff46838e2e8e2 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -134,7 +134,6 @@ template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexce # include <__type_traits/is_const.h> # include <__type_traits/is_constructible.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -176,7 +175,6 @@ template <class _Tp, size_t _Size> struct array { using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_Tp>, array, void>; // types: using __self _LIBCPP_NODEBUG = array; diff --git a/libcxx/include/chrono b/libcxx/include/chrono index 82e99a31bcc9f..aa4fc6218f962 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -218,6 +218,9 @@ template <class ToDuration, class Rep, class Period> template <class ToDuration, class Rep, class Period> constexpr ToDuration round(const duration<Rep, Period>& d); // C++17 +template <class T> struct is_clock; // C++20 +template <class T> inline constexpr bool is_clock_v = is_clock<T>::value; // C++20 + // duration I/O template<class charT, class traits, class Rep, class Period> // C++20 basic_ostream<charT, traits>& @@ -1057,6 +1060,7 @@ constexpr chrono::year operator ""y(unsigned lo # include <__chrono/day.h> # include <__chrono/exception.h> # include 
<__chrono/hh_mm_ss.h> +# include <__chrono/is_clock.h> # include <__chrono/literals.h> # include <__chrono/local_info.h> # include <__chrono/month.h> diff --git a/libcxx/include/deque b/libcxx/include/deque index 3e7ee8d8565b6..08bf8141eb782 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -193,7 +193,6 @@ template <class T, class Allocator, class Predicate> # include <__algorithm/move_backward.h> # include <__algorithm/remove.h> # include <__algorithm/remove_if.h> -# include <__algorithm/unwrap_iter.h> # include <__assert> # include <__config> # include <__debug_utils/sanitizers.h> @@ -220,17 +219,14 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__ranges/size.h> # include <__split_buffer> # include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> -# include <__type_traits/disjunction.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_convertible.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -534,10 +530,6 @@ public: __libcpp_is_trivially_relocatable<__map>::value && __libcpp_is_trivially_relocatable<allocator_type>::value, deque, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<__map> && __container_allocator_is_replaceable<__alloc_traits>::value, - deque, - void>; static_assert(is_nothrow_default_constructible<allocator_type>::value == is_nothrow_default_constructible<__pointer_allocator>::value, @@ -782,6 +774,10 @@ public: // 23.2.2.3 modifiers: _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __v); + + template <class... _Args> + _LIBCPP_HIDE_FROM_ABI iterator __emplace(const_iterator __p, _Args&&... __args); + # ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 template <class... _Args> @@ -794,8 +790,11 @@ public: template <class... _Args> _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); # endif + template <class... _Args> - _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args); + _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... 
__args) { + return __emplace(__p, std::forward<_Args>(__args)...); + } _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __v); @@ -812,13 +811,13 @@ public: } # endif - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v) { return __emplace(__p, std::move(__v)); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, initializer_list<value_type> __il) { return insert(__p, __il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v) { return __emplace(__p, __v); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, size_type __n, const value_type& __v); template <class _InputIter, __enable_if_t<__has_exactly_input_iterator_category<_InputIter>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InputIter __f, _InputIter __l); @@ -1664,56 +1663,11 @@ deque<_Tp, _Allocator>::emplace_front(_Args&&... __args) { return *begin(); # endif } - -template <class _Tp, class _Allocator> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, value_type&& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), std::move(__v)); - --__start_; - ++__size(); - } else { - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = std::move(std::next(__b), __b + __pos, __b); - *__b = std::move(__v); - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), std::move(__v)); - ++__size(); - } else { - iterator __e = end(); - iterator __em1 = std::prev(__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) - __e = std::move_backward(__e - __de, __em1, __e); - *--__e = std::move(__v); - } - } - return begin() + __pos; -} +# endif // _LIBCPP_CXX03_LANG template <class _Tp, class _Allocator> template <class... _Args> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_iterator __p, _Args&&... __args) { +typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::__emplace(const_iterator __p, _Args&&... 
__args) { size_type __pos = __p - begin(); size_type __to_end = size() - __pos; allocator_type& __a = __alloc(); @@ -1760,60 +1714,6 @@ typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_ return begin() + __pos; } -# endif // _LIBCPP_CXX03_LANG - -template <class _Tp, class _Allocator> -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, const value_type& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), __v); - --__start_; - ++__size(); - } else { - const_pointer __vt = pointer_traits<const_pointer>::pointer_to(__v); - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - if (__vt == pointer_traits<const_pointer>::pointer_to(*__b)) - __vt = pointer_traits<const_pointer>::pointer_to(*__bm1); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = __move_and_check(std::next(__b), __b + __pos, __b, __vt); - *__b = *__vt; - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), __v); - ++__size(); - } else { - const_pointer __vt = pointer_traits<const_pointer>::pointer_to(__v); - iterator __e = end(); - iterator __em1 = std::prev(__e); - if (__vt == pointer_traits<const_pointer>::pointer_to(*__em1)) - __vt = pointer_traits<const_pointer>::pointer_to(*__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) - __e = __move_backward_and_check(__e - __de, __em1, __e, __vt); - *--__e = *__vt; - } - } - return begin() + __pos; -} - template <class _Tp, class _Allocator> typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, size_type __n, const value_type& __v) { diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 88d863f494e86..272e52d68f46a 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -223,14 +223,12 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_const.h> # include <__type_traits/is_nothrow_assignable.h> # include 
<__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_pointer.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/remove_cv.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 1f88d134fe061..b07ca636094af 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -308,6 +308,23 @@ protected: return basic_streambuf<_CharT, _Traits>::xsputn(__str, __len); } + _LIBCPP_HIDE_FROM_ABI_VIRTUAL streamsize xsgetn(char_type* __str, streamsize __len) override { + if (__always_noconv_) { + const streamsize __n = std::min(this->egptr() - this->gptr(), __len); + if (__n != 0) { + traits_type::copy(__str, this->gptr(), __n); + this->__gbump_ptrdiff(__n); + } + // If the remainder is at least a whole buffer, bypass the get area and read directly from the file; the + // characters already copied out of the get area must be counted in the return value. + if (__len - __n >= this->egptr() - this->eback()) + return __n + std::fread(__str + __n, sizeof(char_type), __len - __n, __file_); + // Otherwise let the generic implementation refill the get area for the remainder only. + return __n + basic_streambuf<_CharT, _Traits>::xsgetn(__str + __n, __len - __n); + } + return basic_streambuf<_CharT, _Traits>::xsgetn(__str, __len); + } + private: char* __extbuf_; const char* __extbufnext_;
diff --git a/libcxx/include/future b/libcxx/include/future index 4b7c09841cbd3..0877d66602e6b 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -584,12 +584,9 @@ inline future_status __assoc_sub_state::wait_for(const chrono::duration<_Rep, _P template <class _Rp> class _LIBCPP_HIDDEN __assoc_state : public __assoc_sub_state { typedef __assoc_sub_state base; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - typedef typename aligned_storage<sizeof(_Rp), _LIBCPP_ALIGNOF(_Rp)>::type _Up; - _LIBCPP_SUPPRESS_DEPRECATED_POP protected: - _Up __value_; + _ALIGNAS_TYPE(_Rp) char __value_[sizeof(_Rp)]; _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __on_zero_shared() _NOEXCEPT override;
diff --git a/libcxx/include/list b/libcxx/include/list index 0ff85d2ebcb86..2898a45da0029 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -228,13 +228,11 @@ template <class T, class Allocator, class Predicate> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> # include <__ranges/from_range.h> -# include <__type_traits/conditional.h> # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_pointer.h> # include <__type_traits/is_same.h> # include <__type_traits/type_identity.h> # include <__utility/exception_guard.h>
diff --git a/libcxx/include/map b/libcxx/include/map index 3ff849afcde09..0dca11cabd12e 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -600,9 +600,7 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred); // C++20 # include <__ranges/from_range.h> # include <__tree> # include <__type_traits/container_traits.h> -# include <__type_traits/desugars_to.h> # include <__type_traits/is_allocator.h> -# include <__type_traits/is_convertible.h> # include <__type_traits/make_transparent.h> # include <__type_traits/remove_const.h> # include <__type_traits/type_identity.h> @@ -997,7 +995,7 @@ public: _LIBCPP_HIDE_FROM_ABI map(map&& __m) = default; - _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) = default; @@ -1025,10 +1023,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit map(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI map(const map& __m, const 
allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI map(const map& __m, const allocator_type& __alloc) : __tree_(__m.__tree_, __alloc) {} _LIBCPP_HIDE_FROM_ABI ~map() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); } @@ -1428,18 +1423,6 @@ map(initializer_list<pair<_Key, _Tp>>, _Allocator) # endif # ifndef _LIBCPP_CXX03_LANG -template <class _Key, class _Tp, class _Compare, class _Allocator> -map<_Key, _Tp, _Compare, _Allocator>::map(map&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) { - __tree_.__insert_unique_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } - } -} - template <class _Key, class _Tp, class _Compare, class _Allocator> _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) { return __tree_.__emplace_unique(std::piecewise_construct, std::forward_as_tuple(__k), std::forward_as_tuple()) @@ -1685,7 +1668,7 @@ public: _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default; - _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), __a) {} _LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default; @@ -1714,10 +1697,7 @@ public: _LIBCPP_HIDE_FROM_ABI explicit multimap(const allocator_type& __a) : __tree_(typename __base::allocator_type(__a)) {} - _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) - : __tree_(__m.__tree_.value_comp(), typename __base::allocator_type(__a)) { - insert(__m.begin(), __m.end()); - } + _LIBCPP_HIDE_FROM_ABI multimap(const multimap& __m, const allocator_type& __a) : __tree_(__m.__tree_, __a) {} _LIBCPP_HIDE_FROM_ABI ~multimap() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); @@ -1992,19 +1972,6 @@ multimap(initializer_list<pair<_Key, _Tp>>, _Allocator) -> multimap<remove_const_t<_Key>, _Tp, less<remove_const_t<_Key>>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG -template <class _Key, class _Tp, class _Compare, class _Allocator> -multimap<_Key, _Tp, _Compare, _Allocator>::multimap(multimap&& __m, const allocator_type& __a) - : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { - if (__a != __m.get_allocator()) { - const_iterator __e = cend(); - while (!__m.empty()) - __tree_.__insert_multi_from_orphaned_node( - __e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__get_value())); - } -} -# endif - template <class _Key, class _Tp, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) { diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 11ab61d959e22..7ca57f6455dd8 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -6,6 +6,8 @@ module std_config [system] { textual header "__configuration/abi.h" textual header "__configuration/availability.h" textual header "__configuration/compiler.h" + textual header "__configuration/experimental.h" + textual header "__configuration/hardening.h" textual header "__configuration/language.h" textual header 
"__configuration/platform.h" textual header "version" @@ -269,10 +271,6 @@ module std_core [system] { header "__type_traits/is_referenceable.h" export std_core.type_traits.integral_constant } - module is_replaceable { - header "__type_traits/is_replaceable.h" - export std_core.type_traits.integral_constant - } module is_same { header "__type_traits/is_same.h" export std_core.type_traits.integral_constant @@ -350,6 +348,7 @@ module std_core [system] { header "__type_traits/is_volatile.h" export std_core.type_traits.integral_constant } + module is_within_lifetime { header "__type_traits/is_within_lifetime.h" } module lazy { header "__type_traits/lazy.h" } module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" } module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" } @@ -972,6 +971,10 @@ module std [system] { header "__chrono/high_resolution_clock.h" export * } + module is_clock { + header "__chrono/is_clock.h" + export std_core.type_traits.integral_constant + } module leap_second { header "__chrono/leap_second.h" } @@ -2119,7 +2122,6 @@ module std [system] { module tuple_like_no_subrange { header "__tuple/tuple_like_no_subrange.h" } module tuple_like { header "__tuple/tuple_like.h" } module tuple_size { header "__tuple/tuple_size.h" } - module tuple_types { header "__tuple/tuple_types.h" } header "tuple" export * @@ -2435,10 +2437,6 @@ module std_stdatomic_h [system] { header "stdatomic.h" export * } -module std_stdbool_h [system] { - // <stdbool.h>'s __bool_true_false_are_defined macro requires textual inclusion. - textual header "stdbool.h" -} module std_stddef_h [system] { // <stddef.h> supports being included multiple times with different pre-defined macros textual header "stddef.h" diff --git a/libcxx/include/optional b/libcxx/include/optional index ef1bfd3ec44c0..ad672f6a9914f 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -210,6 +210,7 @@ namespace std { # include <__iterator/wrap_iter.h> # include <__memory/addressof.h> # include <__memory/construct_at.h> +# include <__ranges/enable_borrowed_range.h> # include <__ranges/enable_view.h> # include <__tuple/sfinae_helpers.h> # include <__type_traits/add_pointer.h> @@ -230,7 +231,6 @@ namespace std { # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_object.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_scalar.h> # include <__type_traits/is_swappable.h> @@ -240,6 +240,7 @@ namespace std { # include <__type_traits/is_trivially_relocatable.h> # include <__type_traits/is_unbounded_array.h> # include <__type_traits/negation.h> +# include <__type_traits/reference_constructs_from_temporary.h> # include <__type_traits/remove_const.h> # include <__type_traits/remove_cv.h> # include <__type_traits/remove_cvref.h> @@ -410,39 +411,30 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> { __construct(std::forward<_That>(__opt).__get()); } } + + template <class _Up> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign_from_val(_Up&& __val) { + this->__get() = std::forward<_Up>(__val); + } + + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __swap(__optional_storage_base& __rhs) { + using std::swap; + swap(this->__get(), __rhs.__get()); + } }; -// optional<T&> is currently required to be ill-formed. However, it may -// be allowed in the future. 
For this reason, it has already been implemented -// to ensure we can make the change in an ABI-compatible manner. template <class _Tp> struct __optional_storage_base<_Tp, true> { using value_type = _Tp; using __raw_type _LIBCPP_NODEBUG = remove_reference_t<_Tp>; __raw_type* __value_; - template <class _Up> - static _LIBCPP_HIDE_FROM_ABI constexpr bool __can_bind_reference() { - using _RawUp = __libcpp_remove_reference_t<_Up>; - using _UpPtr = _RawUp*; - using _RawTp = __libcpp_remove_reference_t<_Tp>; - using _TpPtr = _RawTp*; - using _CheckLValueArg = - integral_constant<bool, - (is_lvalue_reference<_Up>::value && is_convertible<_UpPtr, _TpPtr>::value) || - is_same<_RawUp, reference_wrapper<_RawTp>>::value || - is_same<_RawUp, reference_wrapper<__remove_const_t<_RawTp>>>::value >; - return (is_lvalue_reference<_Tp>::value && _CheckLValueArg::value) || - (is_rvalue_reference<_Tp>::value && !is_lvalue_reference<_Up>::value && - is_convertible<_UpPtr, _TpPtr>::value); - } - _LIBCPP_HIDE_FROM_ABI constexpr __optional_storage_base() noexcept : __value_(nullptr) {} template <class _UArg> _LIBCPP_HIDE_FROM_ABI constexpr explicit __optional_storage_base(in_place_t, _UArg&& __uarg) : __value_(std::addressof(__uarg)) { - static_assert(__can_bind_reference<_UArg>(), + static_assert(!__reference_constructs_from_temporary_v<_Tp, _UArg>, "Attempted to construct a reference element in optional from a " "possible temporary"); } @@ -458,7 +450,7 @@ struct __optional_storage_base<_Tp, true> { template <class _UArg> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __construct(_UArg&& __val) { _LIBCPP_ASSERT_INTERNAL(!has_value(), "__construct called for engaged __optional_storage"); - static_assert(__can_bind_reference<_UArg>(), + static_assert(!__reference_constructs_from_temporary_v<_Tp, _UArg>, "Attempted to construct a reference element in optional from a " "possible temporary"); __value_ = std::addressof(__val); @@ -482,6 +474,15 @@ struct __optional_storage_base<_Tp, true> { __construct(std::forward<_That>(__opt).__get()); } } + + template <class _Up> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __assign_from_val(_Up&& __val) noexcept { + __value_ = std::addressof(__val); + } + + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __swap(__optional_storage_base& __rhs) noexcept { + std::swap(__value_, __rhs.__value_); + } }; template <class _Tp, bool = is_trivially_copy_constructible<_Tp>::value> @@ -593,6 +594,10 @@ constexpr bool ranges::enable_view<optional<_Tp>> = true; template <class _Tp> constexpr range_format format_kind<optional<_Tp>> = range_format::disabled; + +template <class _Tp> +constexpr bool ranges::enable_borrowed_range<optional<_Tp&>> = true; + # endif # if _LIBCPP_STD_VER >= 20 @@ -607,19 +612,19 @@ struct __is_std_optional : false_type {}; template <class _Tp> struct __is_std_optional<optional<_Tp>> : true_type {}; -template <class _Tp> -class _LIBCPP_DECLSPEC_EMPTY_BASES optional - : private __optional_move_assign_base<_Tp>, - private __optional_sfinae_ctor_base_t<_Tp>, - private __optional_sfinae_assign_base_t<_Tp> { - using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>; +template <class _Tp, class = void> +struct __optional_iterator {}; - using __pointer _LIBCPP_NODEBUG = std::add_pointer_t<_Tp>; - using __const_pointer _LIBCPP_NODEBUG = std::add_pointer_t<const _Tp>; +template <class _Tp> +struct __optional_iterator< + _Tp, + enable_if_t<!(is_lvalue_reference_v<_Tp> && is_function_v<__libcpp_remove_reference_t<_Tp>>) && +
!(is_lvalue_reference_v<_Tp> && is_array_v<__libcpp_remove_reference_t<_Tp>>)> > { +private: + using __pointer _LIBCPP_NODEBUG = add_pointer_t<remove_reference_t<_Tp>>; + using __const_pointer _LIBCPP_NODEBUG = add_pointer_t<const remove_reference_t<_Tp>>; public: - using value_type = _Tp; - # if _LIBCPP_STD_VER >= 26 # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL using iterator = __bounded_iter<__wrap_iter<__pointer>>; @@ -628,20 +633,86 @@ public: using iterator = __wrap_iter<__pointer>; using const_iterator = __wrap_iter<__const_pointer>; # endif + + // [optional.iterators], iterator support + _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() noexcept { + auto& __derived_self = static_cast<optional<_Tp>&>(*this); + auto __ptr = [&__derived_self]() { + if constexpr (is_lvalue_reference_v<_Tp>) { + return __derived_self.has_value() ? std::addressof(__derived_self.__get()) : nullptr; + } + return std::addressof(__derived_self.__get()); + }(); + +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + __wrap_iter<__pointer>(__ptr), + __wrap_iter<__pointer>(__ptr), + __wrap_iter<__pointer>(__ptr) + (__derived_self.has_value() ? 1 : 0)); +# else + return iterator(__ptr); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator begin() const noexcept { + auto& __derived_self = static_cast<const optional<_Tp>&>(*this); + auto* __ptr = [&__derived_self]() { + if constexpr (is_lvalue_reference_v<_Tp>) { + return __derived_self.has_value() ? std::addressof(__derived_self.__get()) : nullptr; + } + return std::addressof(__derived_self.__get()); + }(); + +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + __wrap_iter<__const_pointer>(__ptr), + __wrap_iter<__const_pointer>(__ptr), + __wrap_iter<__const_pointer>(__ptr) + (__derived_self.has_value() ? 1 : 0)); +# else + return const_iterator(__ptr); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr iterator end() noexcept { + return begin() + (static_cast<optional<_Tp>&>(*this).has_value() ? 1 : 0); + } + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator end() const noexcept { + return begin() + (static_cast<const optional<_Tp>&>(*this).has_value() ? 1 : 0); + } # endif +}; + +template <class _Tp> +class _LIBCPP_DECLSPEC_EMPTY_BASES optional + : private __optional_move_assign_base<_Tp>, + private __optional_sfinae_ctor_base_t<_Tp>, + private __optional_sfinae_assign_base_t<_Tp>, + public __optional_iterator<_Tp> { + using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>; + +public: + using value_type = __libcpp_remove_reference_t<_Tp>; + using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>; - using __replaceable _LIBCPP_NODEBUG = conditional_t<__is_replaceable_v<_Tp>, optional, void>; private: - // Disable the reference extension using this static assert. 
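// Hedged sketch (illustrative, not part of this patch): the hunks above swap the
// hand-rolled __can_bind_reference() predicate for the C++23 trait
// reference_constructs_from_temporary, which the reference-binding checks in
// this file now delegate to. A self-contained illustration of what the trait
// reports, assuming a C++23 <type_traits>:
#include <type_traits>
static_assert(!std::reference_constructs_from_temporary_v<int&, int&>);       // binds directly to an lvalue: safe
static_assert(std::reference_constructs_from_temporary_v<const int&, int>);   // binds to a materialized prvalue: would dangle
static_assert(std::reference_constructs_from_temporary_v<const int&, long&>); // the long-to-int conversion creates a temporary
// optional<T&> uses this shape of check so that engaging the optional with a
// would-be temporary is a compile-time error rather than a dangling reference.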
- static_assert(!is_same_v<__remove_cvref_t<value_type>, in_place_t>, + static_assert(!is_same_v<__remove_cvref_t<_Tp>, in_place_t>, "instantiation of optional with in_place_t is ill-formed"); - static_assert(!is_same_v<__remove_cvref_t<value_type>, nullopt_t>, - "instantiation of optional with nullopt_t is ill-formed"); - static_assert(!is_reference_v<value_type>, "instantiation of optional with a reference type is ill-formed"); - static_assert(is_destructible_v<value_type>, "instantiation of optional with a non-destructible type is ill-formed"); - static_assert(!is_array_v<value_type>, "instantiation of optional with an array type is ill-formed"); + static_assert(!is_same_v<__remove_cvref_t<_Tp>, nullopt_t>, "instantiation of optional with nullopt_t is ill-formed"); +# if _LIBCPP_STD_VER >= 26 + static_assert(!is_rvalue_reference_v<_Tp>, "instantiation of optional with an rvalue reference type is ill-formed"); +# else + static_assert(!is_reference_v<_Tp>, "instantiation of optional with a reference type is ill-formed"); +# endif + static_assert(is_destructible_v<_Tp>, "instantiation of optional with a non-destructible type is ill-formed"); + static_assert(!is_array_v<_Tp>, "instantiation of optional with an array type is ill-formed"); + +# if _LIBCPP_STD_VER >= 26 + template <class _Up> + constexpr static bool __libcpp_opt_ref_ctor_deleted = + is_lvalue_reference_v<_Tp> && reference_constructs_from_temporary_v<_Tp, _Up>; +# endif // LWG2756: conditionally explicit conversion from _Up struct _CheckOptionalArgsConstructor { @@ -716,18 +787,15 @@ public: template <class _InPlaceT, class... _Args, - enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, is_constructible<value_type, _Args...>>::value, int> = 0> + enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, is_constructible<_Tp, _Args...>>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_InPlaceT, _Args&&... __args) : __base(in_place, std::forward<_Args>(__args)...) {} - template <class _Up, - class... _Args, - enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0> + template <class _Up, class... _Args, enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args) : __base(in_place, __il, std::forward<_Args>(__args)...) {} - template <class _Up = value_type, - enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> + template <class _Up = _Tp, enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {} template <class _Up = remove_cv_t<_Tp>, @@ -754,6 +822,38 @@ public: this->__construct_from(std::move(__v)); } + // deleted optional<T&> constructors +# if _LIBCPP_STD_VER >= 26 + template <class _Up, class... _Args, enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>, int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(in_place_t, initializer_list<_Up>, _Args&&...) 
= delete; + + template <class _Up = _Tp, enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(_Up&&) = delete; + + template <class _Up = remove_cv_t<_Tp>, + enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(_Up&&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(const optional<_Up>&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(const optional<_Up>&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_implicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + optional(optional<_Up>&&) = delete; + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_explicit<_Up>(), int> = 0> + requires __libcpp_opt_ref_ctor_deleted<_Up> + explicit optional(optional<_Up>&&) = delete; +# endif + # if _LIBCPP_STD_VER >= 23 template <class _Tag, class _Fp, @@ -772,15 +872,15 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr optional& operator=(optional&&) = default; // LWG2756 - template <class _Up = remove_cv_t<value_type>, + template <class _Up = remove_cv_t<_Tp>, enable_if_t<_And<_IsNotSame<__remove_cvref_t<_Up>, optional>, - _Or<_IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not<is_scalar<value_type>>>, - is_constructible<value_type, _Up>, - is_assignable<value_type&, _Up>>::value, + _Or<_IsNotSame<__remove_cvref_t<_Up>, _Tp>, _Not<is_scalar<_Tp>>>, + is_constructible<_Tp, _Up>, + is_assignable<_Tp&, _Up>>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(_Up&& __v) { if (this->has_value()) - this->__get() = std::forward<_Up>(__v); + this->__assign_from_val(std::forward<_Up>(__v)); else this->__construct(std::forward<_Up>(__v)); return *this; @@ -800,7 +900,7 @@ public: return *this; } - template <class... _Args, enable_if_t<is_constructible_v<value_type, _Args...>, int> = 0> + template <class... _Args, enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) { reset(); this->__construct(std::forward<_Args>(__args)...); @@ -809,7 +909,12 @@ public: template <class _Up, class... _Args, - enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0> + enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...> +# if _LIBCPP_STD_VER >= 26 + && !reference_constructs_from_temporary_v<_Tp&, _Up> +# endif + , + int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... 
__args) { reset(); this->__construct(__il, std::forward<_Args>(__args)...); @@ -817,11 +922,10 @@ public: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - swap(optional& __opt) noexcept(is_nothrow_move_constructible_v<value_type> && is_nothrow_swappable_v<value_type>) { + swap(optional& __opt) noexcept(is_nothrow_move_constructible_v<_Tp> && is_nothrow_swappable_v<_Tp>) { if (this->has_value() == __opt.has_value()) { - using std::swap; if (this->has_value()) - swap(this->__get(), __opt.__get()); + this->__swap(__opt); } else { if (this->has_value()) { __opt.__construct(std::move(this->__get())); @@ -833,60 +937,32 @@ public: } } -# if _LIBCPP_STD_VER >= 26 - // [optional.iterators], iterator support - _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() noexcept { -# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL - return std::__make_bounded_iter( - std::__wrap_iter<__pointer>(std::addressof(this->__get())), - std::__wrap_iter<__pointer>(std::addressof(this->__get())), - std::__wrap_iter<__pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); -# else - return iterator(std::addressof(this->__get())); -# endif - } - - _LIBCPP_HIDE_FROM_ABI constexpr const_iterator begin() const noexcept { -# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL - return std::__make_bounded_iter( - std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), - std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), - std::__wrap_iter<__const_pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); -# else - return const_iterator(std::addressof(this->__get())); -# endif - } - - _LIBCPP_HIDE_FROM_ABI constexpr iterator end() noexcept { return begin() + (this->has_value() ? 1 : 0); } - _LIBCPP_HIDE_FROM_ABI constexpr const_iterator end() const noexcept { return begin() + (this->has_value() ? 
1 : 0); } -# endif - - _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type const> operator->() const noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<_Tp const> operator->() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value"); return std::addressof(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<value_type> operator->() noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t<_Tp> operator->() noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value"); return std::addressof(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr const value_type& operator*() const& noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator*() const& noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type& operator*() & noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() & noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type&& operator*() && noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator*() && noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return std::move(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr const value_type&& operator*() const&& noexcept { + _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&& operator*() const&& noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator* called on a disengaged value"); return std::move(this->__get()); } @@ -896,48 +972,66 @@ public: using __base::__get; using __base::has_value; - _LIBCPP_HIDE_FROM_ABI constexpr value_type const& value() const& { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& value() const& { if (!this->has_value()) std::__throw_bad_optional_access(); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type& value() & { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp& value() & { if (!this->has_value()) std::__throw_bad_optional_access(); return this->__get(); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type&& value() && { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& value() && { if (!this->has_value()) std::__throw_bad_optional_access(); return std::move(this->__get()); } - _LIBCPP_HIDE_FROM_ABI constexpr value_type const&& value() const&& { + _LIBCPP_HIDE_FROM_ABI constexpr _Tp const&& value() const&& { if (!this->has_value()) std::__throw_bad_optional_access(); return std::move(this->__get()); } template <class _Up = remove_cv_t<_Tp>> - _LIBCPP_HIDE_FROM_ABI constexpr value_type value_or(_Up&& __v) const& { - static_assert(is_copy_constructible_v<value_type>, "optional<T>::value_or: T must be copy constructible"); - static_assert(is_convertible_v<_Up, value_type>, "optional<T>::value_or: U must be convertible to T"); - return this->has_value() ? 
this->__get() : static_cast<value_type>(std::forward<_Up>(__v)); +# if _LIBCPP_STD_VER >= 26 + requires(!(is_lvalue_reference_v<_Tp> && is_function_v<__libcpp_remove_reference_t<_Tp>>) && + !(is_lvalue_reference_v<_Tp> && is_array_v<__libcpp_remove_reference_t<_Tp>>)) +# endif + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) const& { + static_assert(is_copy_constructible_v<_Tp>, "optional<T>::value_or: T must be copy constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? this->__get() : static_cast<_Tp>(std::forward<_Up>(__v)); } template <class _Up = remove_cv_t<_Tp>> - _LIBCPP_HIDE_FROM_ABI constexpr value_type value_or(_Up&& __v) && { - static_assert(is_move_constructible_v<value_type>, "optional<T>::value_or: T must be move constructible"); - static_assert(is_convertible_v<_Up, value_type>, "optional<T>::value_or: U must be convertible to T"); - return this->has_value() ? std::move(this->__get()) : static_cast<value_type>(std::forward<_Up>(__v)); +# if _LIBCPP_STD_VER >= 26 + requires(!is_lvalue_reference_v<_Tp>) +# endif + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) && { + static_assert(is_move_constructible_v<_Tp>, "optional<T>::value_or: T must be move constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? std::move(this->__get()) : static_cast<_Tp>(std::forward<_Up>(__v)); + } + +# if _LIBCPP_STD_VER >= 26 + template <class _Up = remove_cv_t<_Tp>> + requires(is_lvalue_reference_v<_Tp> && + !(is_function_v<__libcpp_remove_reference_t<_Tp>> || is_array_v<__libcpp_remove_reference_t<_Tp>>)) + _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) && { + static_assert(is_move_constructible_v<_Tp>, "optional<T>::value_or: T must be move constructible"); + static_assert(is_convertible_v<_Up, _Tp>, "optional<T>::value_or: U must be convertible to T"); + return this->has_value() ? 
this->__get() : static_cast<_Tp>(std::forward<_Up>(__v)); } +# endif # if _LIBCPP_STD_VER >= 23 template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) & { - using _Up = invoke_result_t<_Func, value_type&>; + using _Up = invoke_result_t<_Func, _Tp&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(value()) must be a specialization of std::optional"); if (*this) @@ -947,7 +1041,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const& { - using _Up = invoke_result_t<_Func, const value_type&>; + using _Up = invoke_result_t<_Func, const _Tp&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(value()) must be a specialization of std::optional"); if (*this) @@ -957,7 +1051,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) && { - using _Up = invoke_result_t<_Func, value_type&&>; + using _Up = invoke_result_t<_Func, _Tp&&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(std::move(value())) must be a specialization of std::optional"); if (*this) @@ -967,7 +1061,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto and_then(_Func&& __f) const&& { - using _Up = invoke_result_t<_Func, const value_type&&>; + using _Up = invoke_result_t<_Func, const _Tp&&>; static_assert(__is_std_optional<remove_cvref_t<_Up>>::value, "Result of f(std::move(value())) must be a specialization of std::optional"); if (*this) @@ -977,7 +1071,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) & { - using _Up = remove_cv_t<invoke_result_t<_Func, value_type&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, _Tp&>>; static_assert(!is_array_v<_Up>, "Result of f(value()) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(value()) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(value()) should not be std::nullopt_t"); @@ -989,7 +1083,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) const& { - using _Up = remove_cv_t<invoke_result_t<_Func, const value_type&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, const _Tp&>>; static_assert(!is_array_v<_Up>, "Result of f(value()) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(value()) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(value()) should not be std::nullopt_t"); @@ -1001,7 +1095,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) && { - using _Up = remove_cv_t<invoke_result_t<_Func, value_type&&>>; + using _Up = remove_cv_t<invoke_result_t<_Func, _Tp&&>>; static_assert(!is_array_v<_Up>, "Result of f(std::move(value())) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(std::move(value())) should not be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(std::move(value())) should not be std::nullopt_t"); @@ -1013,7 +1107,7 @@ public: template <class _Func> _LIBCPP_HIDE_FROM_ABI constexpr auto transform(_Func&& __f) const&& { - using _Up = remove_cvref_t<invoke_result_t<_Func, const value_type&&>>; + using _Up = remove_cvref_t<invoke_result_t<_Func, const _Tp&&>>; static_assert(!is_array_v<_Up>, "Result of f(std::move(value())) should not be an Array"); static_assert(!is_same_v<_Up, in_place_t>, "Result of f(std::move(value())) should not 
be std::in_place_t"); static_assert(!is_same_v<_Up, nullopt_t>, "Result of f(std::move(value())) should not be std::nullopt_t"); @@ -1025,7 +1119,7 @@ public: template <invocable _Func> _LIBCPP_HIDE_FROM_ABI constexpr optional or_else(_Func&& __f) const& - requires is_copy_constructible_v<value_type> + requires is_copy_constructible_v<_Tp> { static_assert(is_same_v<remove_cvref_t<invoke_result_t<_Func>>, optional>, "Result of f() should be the same type as this optional"); @@ -1036,7 +1130,7 @@ public: template <invocable _Func> _LIBCPP_HIDE_FROM_ABI constexpr optional or_else(_Func&& __f) && - requires is_move_constructible_v<value_type> + requires is_move_constructible_v<_Tp> { static_assert(is_same_v<remove_cvref_t<invoke_result_t<_Func>>, optional>, "Result of f() should be the same type as this optional"); @@ -1338,7 +1432,15 @@ swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } -template <class _Tp> +struct __make_optional_barrier_tag { + explicit __make_optional_barrier_tag() = default; +}; + +template < +# if _LIBCPP_STD_VER >= 26 + __make_optional_barrier_tag = __make_optional_barrier_tag{}, +# endif + class _Tp> _LIBCPP_HIDE_FROM_ABI constexpr optional<decay_t<_Tp>> make_optional(_Tp&& __v) { return optional<decay_t<_Tp>>(std::forward<_Tp>(__v)); } diff --git a/libcxx/include/set b/libcxx/include/set index 59ed0155c1def..3d6f571a42a1a 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -524,7 +524,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred); // C++20 # include <__functional/operations.h> # include <__iterator/erase_if_container.h> # include <__iterator/iterator_traits.h> -# include <__iterator/ranges_iterator_traits.h> # include <__iterator/reverse_iterator.h> # include <__memory/allocator.h> # include <__memory/allocator_traits.h> @@ -538,7 +537,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred); // C++20 # include <__type_traits/container_traits.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_allocator.h> -# include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> @@ -673,12 +671,10 @@ public: _LIBCPP_HIDE_FROM_ABI explicit set(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __a) : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI set(const set& __s, const allocator_type& __alloc) : __tree_(__s.__tree_, __alloc) {} # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI set(set&& __s, const allocator_type& __alloc) : __tree_(std::move(__s.__tree_), __alloc) {} _LIBCPP_HIDE_FROM_ABI set(initializer_list<value_type> __il, const value_compare& __comp = value_compare()) : __tree_(__comp) { @@ -948,19 +944,6 @@ template <class _Key, class _Allocator, class = enable_if_t<__is_allocator_v<_Al set(initializer_list<_Key>, _Allocator) -> set<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template <class _Key, class _Compare, class _Allocator> -set<_Key, _Compare, _Allocator>::set(set&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // 
_LIBCPP_CXX03_LANG - template <class _Key, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) { @@ -1130,13 +1113,10 @@ public: # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) = default; - _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a) : __tree_(std::move(__s.__tree_), __a) {} # endif // _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI explicit multiset(const allocator_type& __a) : __tree_(__a) {} - _LIBCPP_HIDE_FROM_ABI multiset(const multiset& __s, const allocator_type& __a) - : __tree_(__s.__tree_.value_comp(), __a) { - insert(__s.begin(), __s.end()); - } + _LIBCPP_HIDE_FROM_ABI multiset(const multiset& __s, const allocator_type& __a) : __tree_(__s.__tree_, __a) {} # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI multiset(initializer_list<value_type> __il, const value_compare& __comp = value_compare()) @@ -1409,20 +1389,6 @@ template <class _Key, class _Allocator, class = enable_if_t<__is_allocator_v<_Al multiset(initializer_list<_Key>, _Allocator) -> multiset<_Key, less<_Key>, _Allocator>; # endif -# ifndef _LIBCPP_CXX03_LANG - -template <class _Key, class _Compare, class _Allocator> -multiset<_Key, _Compare, _Allocator>::multiset(multiset&& __s, const allocator_type& __a) - : __tree_(std::move(__s.__tree_), __a) { - if (__a != __s.get_allocator()) { - const_iterator __e = cend(); - while (!__s.empty()) - insert(__e, std::move(__s.__tree_.remove(__s.begin())->__get_value())); - } -} - -# endif // _LIBCPP_CXX03_LANG - template <class _Key, class _Compare, class _Allocator> inline _LIBCPP_HIDE_FROM_ABI bool operator==(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) { diff --git a/libcxx/include/stdbool.h b/libcxx/include/stdbool.h deleted file mode 100644 index 768d08247256a..0000000000000 --- a/libcxx/include/stdbool.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_STDBOOL_H -#define _LIBCPP_STDBOOL_H - -/* - stdbool.h synopsis - -Macros: - - __bool_true_false_are_defined - -*/ - -#if defined(__cplusplus) && __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) -# include <__cxx03/__config> -#else -# include <__config> -#endif - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if __has_include_next(<stdbool.h>) -# include_next <stdbool.h> -#endif - -#ifdef __cplusplus -# undef bool -# undef true -# undef false -# undef __bool_true_false_are_defined -# define __bool_true_false_are_defined 1 -#endif - -#endif // _LIBCPP_STDBOOL_H diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..09fc6228c4fdb 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -632,7 +632,6 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__type_traits/is_generic_transparent_comparator.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_standard_layout.h> # include <__type_traits/is_trivially_constructible.h> @@ -644,6 +643,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__utility/forward.h> # include <__utility/is_pointer_in_range.h> # include <__utility/move.h> +# include <__utility/no_destroy.h> # include <__utility/scope_guard.h> # include <__utility/swap.h> # include <climits> @@ -756,9 +756,6 @@ public: // external memory. In such cases, the destructor is responsible for unpoisoning // the memory to avoid triggering false positives. // Therefore it's crucial to ensure the destructor is called. - // - // However, it is replaceable since implementing move-assignment as a destroy + move-construct - // will maintain the right ASAN state. using __trivially_relocatable = void; # else using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t< @@ -766,10 +763,6 @@ public: basic_string, void>; # endif - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<pointer> && __container_allocator_is_replaceable<__alloc_traits>::value, - basic_string, - void>; # if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __asan_volatile_wrapper(pointer const& __ptr) const { @@ -914,6 +907,11 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -1206,7 +1204,10 @@ public: } # endif // _LIBCPP_CXX03_LANG - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); } + // TODO(boomanaiden154): Once we mark this in destructors as dead on return, + // we can use a normal call to __reset_internal_buffer and remove the extra + // __rep constructor. 
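// Hedged reading of the TODO above: "mark this in destructors as dead on return"
// refers to the optimizer learning that *this is unused once the destructor
// returns. Until that is modeled, resetting to a fresh empty short rep here
// would be a dead store into dying storage, so the destructor installs
// __rep(__uninitialized_tag()) instead, leaving the bytes indeterminate and
// giving the compiler no store it must prove removable.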
+ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { return __self_view(typename __self_view::__assume_valid(), data(), size()); @@ -1244,45 +1245,55 @@ public: # endif _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { return __make_iterator(__get_pointer()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { return __make_const_iterator(__get_pointer()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { return __make_iterator(__get_pointer() + size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { return __make_const_iterator(__get_pointer() + size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() 
const _NOEXCEPT { return __is_long() ? __get_long_size() : __get_short_size(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type length() const _NOEXCEPT { return size(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type length() const _NOEXCEPT { + return size(); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT { if (size_type __m = __alloc_traits::max_size(__alloc_); __m <= std::numeric_limits<size_type>::max() / 2) { size_type __res = __m - __alignment; @@ -1300,7 +1311,7 @@ public: } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { return (__is_long() ? __get_long_cap() : static_cast<size_type>(__min_cap)) - 1; } @@ -1335,7 +1346,8 @@ public: return size() == 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __pos) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference + operator[](size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1343,7 +1355,8 @@ public: return *(data() + __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __pos) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference + operator[](size_type __pos) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1351,8 +1364,8 @@ public: return *(__get_pointer() + __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n); + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const; + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n); _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const basic_string& __str) { return append(__str); @@ -1464,22 +1477,22 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(value_type __c); _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back(); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *__get_pointer(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(__get_pointer() + size() - 1); } - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(data() + size() - 1); } @@ -1752,16 +1765,16 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type copy(value_type* __s, size_type __n, size_type __pos = 0) const; # if _LIBCPP_STD_VER <= 20 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string - substr(size_type __pos = 0, size_type __n = npos) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string substr(size_type __pos = 0, size_type __n = npos) const { return basic_string(*this, __pos, __n); } # else - _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) const& { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) const& { return basic_string(*this, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) && { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr basic_string substr(size_type __pos = 0, size_type __n = npos) && { return basic_string(std::move(*this), __pos, __n); } # endif @@ -1781,231 +1794,238 @@ public: // [string.ops] // ------------ - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* c_str() const _NOEXCEPT { return data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* data() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* c_str() const _NOEXCEPT { + return data(); + } + + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* data() const _NOEXCEPT { return std::__to_address(__get_pointer()); } # if _LIBCPP_STD_VER >= 17 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 value_type* data() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 value_type* data() _NOEXCEPT { return std::__to_address(__get_pointer()); } # endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { return __alloc_; } // find - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { 
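// Note (hedged): _LIBCPP_DIAGNOSE_NULLPTR_IF above builds on the diagnose_if
// attribute, so a visibly null __s paired with a nonzero __n is flagged at
// compile time; _LIBCPP_ASSERT_NON_NULL below is the runtime backstop in
// hardened builds. The same compile-time/runtime pairing repeats across the
// find/rfind/compare family in this file.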
_LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find(): received nullptr"); return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find(): received nullptr"); return std::__str_find<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // rfind - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // find_first_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(value_type __c, size_type __pos = 0) const _NOEXCEPT { return find(__c, __pos); } // find_last_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(value_type __c, size_type __pos = npos) const _NOEXCEPT { return rfind(__c, __pos); } // 
find_first_not_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(value_type __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // find_last_not_of - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __str.data(), __pos, __str.size()); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>(data(), size(), __s, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>( data(), size(), __s, __pos, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(value_type __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>(data(), size(), __c, __pos); } // compare - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const basic_string& __str) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + compare(const basic_string& __str) const _NOEXCEPT { return compare(__self_view(__str)); } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { __self_view __sv = __t; size_t __lhs_sz = size(); size_t __rhs_sz = __sv.size(); @@ -2020,18 +2040,18 @@ public: } template <class _Tp, __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp>, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; return compare(__pos1, __n1, __sv.data(), __sv.size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const basic_string& __str) const { return compare(__pos1, __n1, __str.data(), __str.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos) const { return compare(__pos1, __n1, __self_view(__str), __pos2, __n2); } @@ -2040,53 +2060,56 @@ public: __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> - inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos) const { __self_view __sv = __t; return __self_view(*this).substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int + compare(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::compare(): received nullptr"); return compare(0, npos, __s, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, 
"string::compare(): received nullptr"); return compare(__pos1, __n1, __s, traits_type::length(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero"); // starts_with # if _LIBCPP_STD_VER >= 20 - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).starts_with(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { return !empty() && _Traits::eq(front(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return starts_with(__self_view(__s)); } // ends_with - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).ends_with(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { return !empty() && _Traits::eq(back(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return ends_with(__self_view(__s)); } # endif @@ -2094,15 +2117,16 @@ public: // contains # if _LIBCPP_STD_VER >= 23 - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(__self_view __sv) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(__self_view __sv) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__sv); } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { + [[__nodiscard__]] constexpr _LIBCPP_HIDE_FROM_ABI bool + contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__s); } # endif @@ -2259,18 +2283,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. 
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2462,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2728,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2764,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. __buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3412,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3451,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) { return; diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..caa473012a7c4 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -235,7 +235,6 @@ template <class... Types> # include <__tuple/tuple_element.h> # include <__tuple/tuple_like.h> # include <__tuple/tuple_size.h> -# include <__tuple/tuple_types.h> # include <__type_traits/common_reference.h> # include <__type_traits/common_type.h> # include <__type_traits/conditional.h> @@ -253,7 +252,6 @@ template <class... Types> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -265,6 +263,7 @@ template <class... 
Types> # include <__type_traits/remove_cv.h> # include <__type_traits/remove_cvref.h> # include <__type_traits/remove_reference.h> +# include <__type_traits/type_list.h> # include <__type_traits/unwrap_ref.h> # include <__utility/declval.h> # include <__utility/forward.h> @@ -347,7 +346,7 @@ using __tuple_common_comparison_category _LIBCPP_NODEBUG = // __tuple_leaf -template <size_t _Ip, class _Hp, bool = is_empty<_Hp>::value && !__libcpp_is_final<_Hp>::value > +template <size_t _Ip, class _Hp, bool = is_empty<_Hp>::value && !__is_final_v<_Hp> > class __tuple_leaf; template <size_t _Ip, class _Hp, bool _Ep> @@ -571,7 +570,7 @@ __memberwise_copy_assign(_Dest& __dest, _Source const& __source, __index_sequenc template <class _Dest, class _Source, class... _Up, size_t... _Np> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __tuple_types<_Up...>, __index_sequence<_Np...>) { +__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __type_list<_Up...>, __index_sequence<_Np...>) { std::__swallow(((std::get<_Np>(__dest) = std::forward<_Up>(std::get<_Np>(__source))), void(), 0)...); } @@ -596,7 +595,6 @@ class _LIBCPP_NO_SPECIALIZATIONS tuple { public: using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<_And<__is_replaceable<_Tp>...>::value, tuple, void>; // [tuple.cnstr] @@ -876,7 +874,7 @@ public: requires(_And<is_assignable<const _Tp&, _Tp>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -885,7 +883,7 @@ public: operator=(_If<_And<is_move_assignable<_Tp>...>::value, tuple, __nat>&& __tuple) noexcept( _And<is_nothrow_move_assignable<_Tp>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -905,7 +903,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...>&& __tuple) noexcept(_And<is_nothrow_assignable<_Tp&, _Up>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Up...>(), __make_index_sequence<sizeof...(_Tp)>()); + *this, std::move(__tuple), __type_list<_Up...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -922,7 +920,7 @@ public: enable_if_t< _And<_BoolConstant<sizeof...(_Tp) == sizeof...(_UTypes)>, is_assignable<const _Tp&, _UTypes>...>::value>* = nullptr> _LIBCPP_HIDE_FROM_ABI constexpr const tuple& operator=(tuple<_UTypes...>&& __u) const { - std::__memberwise_forward_assign(*this, __u, __tuple_types<_UTypes...>(), __make_index_sequence<sizeof...(_Tp)>()); + std::__memberwise_forward_assign(*this, __u, __type_list<_UTypes...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -1000,7 +998,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np>&& __array) noexcept(_And<is_nothrow_assignable<_Tp&, _Up>...>::value) { std::__memberwise_forward_assign( - *this, std::move(__array), __tuple_types<_If<true, _Up, _Tp>...>(), __make_index_sequence<sizeof...(_Tp)>()); 
+ *this, std::move(__array), __type_list<_If<true, _Up, _Tp>...>(), __make_index_sequence<sizeof...(_Tp)>()); return *this; } @@ -1443,7 +1441,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index a6e0c1867566b..dab0c0640c389 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -454,6 +454,10 @@ namespace std template<class B> inline constexpr bool negation_v = negation<B>::value; // since C++17 + // [meta.const.eval], constant evaluation context + constexpr bool is_constant_evaluated() noexcept; // C++20 + template<class T> + consteval bool is_within_lifetime(const T*) noexcept; // C++26 } */ @@ -559,6 +563,10 @@ namespace std # include <__type_traits/reference_converts_from_temporary.h> # endif +# if _LIBCPP_STD_VER >= 26 +# include <__type_traits/is_within_lifetime.h> +# endif + # include <version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return 
__lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct __non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. 
- return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. + return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. 
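// A worked illustration of the tagging scheme in the comment above, assuming
// a 64-bit __type_name_t (stand-alone sketch; names and constants are
// illustrative): a set top bit means "non-unique name, compare the masked
// string", a clear top bit means "the pointer value itself is the identity".
//
//   #include <cstdint>
//   constexpr std::uint64_t non_unique_bit = 1ull << 63;
//   constexpr bool is_unique(std::uint64_t v) { return (v & non_unique_bit) == 0; }
//   inline const char* to_string(std::uint64_t v) {
//     return reinterpret_cast<const char*>(v & ~non_unique_bit);
//   }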
+ typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set index 4d0e2ac21e125..9873f1ec70664 100644 --- a/libcxx/include/unordered_set +++ b/libcxx/include/unordered_set @@ -544,8 +544,6 @@ template <class Value, class Hash, class Pred, class Alloc> # include <__iterator/distance.h> # include <__iterator/erase_if_container.h> # include <__iterator/iterator_traits.h> -# include <__iterator/ranges_iterator_traits.h> -# include <__memory/addressof.h> # include <__memory/allocator.h> # include <__memory/allocator_traits.h> # include <__memory_resource/polymorphic_allocator.h> @@ -558,7 +556,6 @@ template <class Value, class Hash, class Pred, class Alloc> # include <__type_traits/invoke.h> # include <__type_traits/is_allocator.h> # include <__type_traits/is_integral.h> -# include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> diff --git a/libcxx/include/variant b/libcxx/include/variant index 8e958581a6b07..df587ccf23843 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -247,7 +247,6 @@ namespace std { # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_assignable.h> @@ -1172,7 +1171,6 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES _LIBCPP_NO_SPECIALIZATIONS variant public: using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<_And<__libcpp_is_trivially_relocatable<_Types>...>::value, variant, void>; - using __replaceable _LIBCPP_NODEBUG = conditional_t<_And<__is_replaceable<_Types>...>::value, variant, void>; template <bool _Dummy = true, enable_if_t<__dependent_type<is_default_constructible<__first_type>, _Dummy>::value, int> = 0> diff --git a/libcxx/include/version b/libcxx/include/version index 0fef1bb87cf60..05532ea731ff3 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -71,6 +71,8 @@ __cpp_lib_constexpr_charconv 202207L <charconv> __cpp_lib_constexpr_cmath 202202L <cmath> <cstdlib> __cpp_lib_constexpr_complex 201711L <complex> __cpp_lib_constexpr_dynamic_alloc 201907L <memory> +__cpp_lib_constexpr_flat_map 202502L <flat_map> +__cpp_lib_constexpr_flat_set 202502L <flat_set> __cpp_lib_constexpr_forward_list 202502L <forward_list> __cpp_lib_constexpr_functional 
201907L <functional> __cpp_lib_constexpr_iterator 201811L <iterator> @@ -185,7 +187,8 @@ __cpp_lib_nonmember_container_access 201411L <array> <deque> __cpp_lib_not_fn 202306L <functional> 201603L // C++17 __cpp_lib_null_iterators 201304L <iterator> -__cpp_lib_optional 202110L <optional> +__cpp_lib_optional 202506L <optional> + 202110L // C++23 202106L // C++20 201606L // C++17 __cpp_lib_optional_range_support 202406L <optional> @@ -552,6 +555,8 @@ __cpp_lib_void_t 201411L <type_traits> # define __cpp_lib_bitset 202306L # undef __cpp_lib_constexpr_algorithms # define __cpp_lib_constexpr_algorithms 202306L +# define __cpp_lib_constexpr_flat_map 202502L +# define __cpp_lib_constexpr_flat_set 202502L # define __cpp_lib_constexpr_forward_list 202502L # define __cpp_lib_constexpr_list 202502L # if !defined(_LIBCPP_ABI_VCRUNTIME) @@ -582,12 +587,16 @@ __cpp_lib_void_t 201411L <type_traits> # if __has_builtin(__builtin_is_virtual_base_of) # define __cpp_lib_is_virtual_base_of 202406L # endif -// # define __cpp_lib_is_within_lifetime 202306L +# if __has_builtin(__builtin_is_within_lifetime) +# define __cpp_lib_is_within_lifetime 202306L +# endif // # define __cpp_lib_linalg 202311L # undef __cpp_lib_mdspan # define __cpp_lib_mdspan 202406L # undef __cpp_lib_not_fn # define __cpp_lib_not_fn 202306L +# undef __cpp_lib_optional +# define __cpp_lib_optional 202506L # define __cpp_lib_optional_range_support 202406L # undef __cpp_lib_out_ptr # define __cpp_lib_out_ptr 202311L diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc index 66eccd8d290ad..db405d482bf9e 100644 --- a/libcxx/modules/std/chrono.inc +++ b/libcxx/modules/std/chrono.inc @@ -25,8 +25,8 @@ export namespace std { using std::chrono::duration_values; - // using std::chrono::is_clock; - // using std::chrono::is_clock_v; + using std::chrono::is_clock; + using std::chrono::is_clock_v; // [time.duration.nonmember], duration arithmetic using std::chrono::operator+; diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/modules/std/optional.inc b/libcxx/modules/std/optional.inc index 9ee51117277ce..88de0bb4db12b 100644 --- a/libcxx/modules/std/optional.inc +++ b/libcxx/modules/std/optional.inc @@ -13,8 +13,9 @@ export namespace std { #if _LIBCPP_STD_VER >= 26 // [optional.iterators], iterator support namespace ranges { + using std::ranges::enable_borrowed_range; using std::ranges::enable_view; - } + } // namespace ranges #endif // [optional.nullopt], no-value state indicator using std::nullopt; diff --git a/libcxx/modules/std/type_traits.inc b/libcxx/modules/std/type_traits.inc index 6823c86ed153b..4e49ed8f255c7 100644 --- a/libcxx/modules/std/type_traits.inc +++ b/libcxx/modules/std/type_traits.inc @@ -330,6 +330,9 @@ export namespace std { // [meta.const.eval], constant evaluation context using std::is_constant_evaluated; +#if _LIBCPP_STD_VER >= 26 && __has_builtin(__builtin_is_within_lifetime) + using std::is_within_lifetime; +#endif // [depr.meta.types] using std::aligned_storage; diff --git a/libcxx/src/exception.cpp b/libcxx/src/exception.cpp index ac6324cd9fe35..9932141006591 100644 --- a/libcxx/src/exception.cpp +++ 
b/libcxx/src/exception.cpp @@ -9,20 +9,12 @@ #define _LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS -#include <exception> -#include <new> -#include <typeinfo> - -#if defined(LIBCXXRT) || defined(LIBCXX_BUILDING_LIBCXXABI) -# include <cxxabi.h> -using namespace __cxxabiv1; -# define HAVE_DEPENDENT_EH_ABI 1 -#endif +#include <__config> #if defined(_LIBCPP_ABI_MICROSOFT) # include "support/runtime/exception_msvc.ipp" # include "support/runtime/exception_pointer_msvc.ipp" -#elif defined(_LIBCPPABI_VERSION) +#elif defined(LIBCXX_BUILDING_LIBCXXABI) # include "support/runtime/exception_libcxxabi.ipp" # include "support/runtime/exception_pointer_cxxabi.ipp" #elif defined(LIBCXXRT) diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp index 3f2baa6dcc60b..82cf2afd052e2 100644 --- a/libcxx/src/print.cpp +++ b/libcxx/src/print.cpp @@ -22,6 +22,14 @@ # include <windows.h> #elif __has_include(<unistd.h>) # include <unistd.h> +# if defined(_NEWLIB_VERSION) +# if defined(_POSIX_C_SOURCE) && __has_include(<stdio.h>) +# include <stdio.h> +# define HAS_FILENO_AND_ISATTY +# endif +# else +# define HAS_FILENO_AND_ISATTY +# endif #endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -56,7 +64,7 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst } # endif // _LIBCPP_HAS_WIDE_CHARACTERS -#elif __has_include(<unistd.h>) // !_LIBCPP_WIN32API +#elif defined(HAS_FILENO_AND_ISATTY) // !_LIBCPP_WIN32API _LIBCPP_EXPORTED_FROM_ABI bool __is_posix_terminal(FILE* __stream) { return isatty(fileno(__stream)); } #endif diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp index ba283aee22901..dca904e902da1 100644 --- a/libcxx/src/support/runtime/exception_fallback.ipp +++ b/libcxx/src/support/runtime/exception_fallback.ipp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include <exception> +#include "include/atomic_support.h" namespace std { diff --git a/libcxx/src/support/runtime/exception_glibcxx.ipp b/libcxx/src/support/runtime/exception_glibcxx.ipp index aa67cab6bc239..5eb8d87f6d4e1 100644 --- a/libcxx/src/support/runtime/exception_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_glibcxx.ipp @@ -11,6 +11,9 @@ # error header can only be used when targeting libstdc++ or libsupc++ #endif +#include <exception> +#include <new> + namespace std { bad_alloc::bad_alloc() noexcept {} diff --git a/libcxx/src/support/runtime/exception_libcxxabi.ipp b/libcxx/src/support/runtime/exception_libcxxabi.ipp index df6bd6574bde2..c42bb237d9db8 100644 --- a/libcxx/src/support/runtime/exception_libcxxabi.ipp +++ b/libcxx/src/support/runtime/exception_libcxxabi.ipp @@ -7,6 +7,10 @@ // //===----------------------------------------------------------------------===// +#include <exception> + +#include <cxxabi.h> + #ifndef _LIBCPPABI_VERSION # error this header can only be used with libc++abi #endif @@ -17,9 +21,9 @@ bool uncaught_exception() noexcept { return uncaught_exceptions() > 0; } int uncaught_exceptions() noexcept { #if _LIBCPPABI_VERSION > 1001 - return __cxa_uncaught_exceptions(); + return abi::__cxa_uncaught_exceptions(); #else - return __cxa_uncaught_exception() ? 1 : 0; + return abi::__cxa_uncaught_exception() ? 
1 : 0; #endif } diff --git a/libcxx/src/support/runtime/exception_libcxxrt.ipp b/libcxx/src/support/runtime/exception_libcxxrt.ipp index f17fecc71e34b..6afdc006563c9 100644 --- a/libcxx/src/support/runtime/exception_libcxxrt.ipp +++ b/libcxx/src/support/runtime/exception_libcxxrt.ipp @@ -11,6 +11,8 @@ # error this header may only be used when targeting libcxxrt #endif +#include <exception> + namespace std { bad_exception::~bad_exception() noexcept {} diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp index 2ae004bb02e5d..7114d90892cc1 100644 --- a/libcxx/src/support/runtime/exception_msvc.ipp +++ b/libcxx/src/support/runtime/exception_msvc.ipp @@ -12,6 +12,8 @@ #endif #include <__verbose_abort> +#include <exception> +#include <new> extern "C" { typedef void(__cdecl* terminate_handler)(); diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index 8f5c2060bb06c..75cb7c9d82ccd 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -7,22 +7,21 @@ // //===----------------------------------------------------------------------===// -#ifndef HAVE_DEPENDENT_EH_ABI -# error this header may only be used with libc++abi or libcxxrt -#endif +#include <cxxabi.h> +#include <exception> namespace std { -exception_ptr::~exception_ptr() noexcept { __cxa_decrement_exception_refcount(__ptr_); } +exception_ptr::~exception_ptr() noexcept { abi::__cxa_decrement_exception_refcount(__ptr_); } exception_ptr::exception_ptr(const exception_ptr& other) noexcept : __ptr_(other.__ptr_) { - __cxa_increment_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(__ptr_); } exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { if (__ptr_ != other.__ptr_) { - __cxa_increment_exception_refcount(other.__ptr_); - __cxa_decrement_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(other.__ptr_); + abi::__cxa_decrement_exception_refcount(__ptr_); __ptr_ = other.__ptr_; } return *this; @@ -31,7 +30,7 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept { exception_ptr ptr; ptr.__ptr_ = __e; - __cxa_increment_exception_refcount(ptr.__ptr_); + abi::__cxa_increment_exception_refcount(ptr.__ptr_); return ptr; } @@ -51,12 +50,12 @@ exception_ptr current_exception() noexcept { // this whole function would be just: // return exception_ptr(__cxa_current_primary_exception()); exception_ptr ptr; - ptr.__ptr_ = __cxa_current_primary_exception(); + ptr.__ptr_ = abi::__cxa_current_primary_exception(); return ptr; } void rethrow_exception(exception_ptr p) { - __cxa_rethrow_primary_exception(p.__ptr_); + abi::__cxa_rethrow_primary_exception(p.__ptr_); // if p.__ptr_ is NULL, above returns so we terminate terminate(); } diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 174b44ce0e6f7..4b08db6f1ae6f 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -16,6 +16,8 @@ // stable ABI), and its rethrow_exception(std::__exception_ptr::exception_ptr) // function. 
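// Context for the abi::-qualified calls in the cxxabi-based hunks above: the
// exception_ptr refcounting maps directly onto the libc++abi primitives. A
// minimal sketch, assuming a libc++abi <cxxabi.h> that declares the
// __cxa_*_exception_refcount extensions (simplified; not the libc++ source):
//
//   #include <cxxabi.h>
//   struct ExcPtr {
//     void* p = nullptr;                      // native exception object, may be null
//     ExcPtr(const ExcPtr& o) noexcept : p(o.p) { abi::__cxa_increment_exception_refcount(p); }
//     ~ExcPtr() { abi::__cxa_decrement_exception_refcount(p); }
//   };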
+#include <exception> + namespace std { namespace __exception_ptr { diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp index 2be5136176e32..4141e0312349b 100644 --- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp +++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include <exception> #include <stdio.h> #include <stdlib.h> diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index 05a71ce34e5ac..5e55f0f6dede3 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include <exception> namespace std { diff --git a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h index 22a6d0d753b0c..5dd55f244d885 100644 --- a/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h +++ b/libcxx/test/benchmarks/containers/associative/associative_container_benchmarks.h @@ -11,6 +11,7 @@ #include <algorithm> #include <iterator> +#include <memory_resource> #include <random> #include <string> #include <ranges> @@ -33,6 +34,9 @@ struct adapt_operations { // using InsertionResult = ...; // static Container::iterator get_iterator(InsertionResult const&); + + // template <class Allocator> + // using rebind_alloc = ...; }; template <class Container> @@ -103,6 +107,61 @@ void associative_container_benchmarks(std::string container) { } }); + bench("ctor(const&, alloc)", [=](auto& st) { + const std::size_t size = st.range(0); + std::vector<Value> in = make_value_types(generate_unique_keys(size)); + Container src(in.begin(), in.end()); + ScratchSpace c[BatchSize]; + + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i) Container(src, std::allocator<typename Container::value_type>()); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + reinterpret_cast<Container*>(c + i)->~Container(); + } + st.ResumeTiming(); + } + }); + + bench("ctor(&&, different allocs)", [=](auto& st) { + using PMRContainer = adapt_operations<Container>::template rebind_alloc< + std::pmr::polymorphic_allocator<typename Container::value_type>>; + + const std::size_t size = st.range(0); + std::vector<Value> in = make_value_types(generate_unique_keys(size)); + std::pmr::monotonic_buffer_resource rs(size * 64 * BatchSize); // 64 bytes should be enough per node + std::vector<PMRContainer> srcs; + srcs.reserve(BatchSize); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + alignas(PMRContainer) char c[BatchSize * sizeof(PMRContainer)]; + + std::pmr::monotonic_buffer_resource rs2(size * 64 * BatchSize); // 64 bytes should be enough per node + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + new (c + i * sizeof(PMRContainer)) PMRContainer(std::move(srcs[i]), &rs2); + benchmark::DoNotOptimize(c + i); + benchmark::ClobberMemory(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + 
reinterpret_cast<PMRContainer*>(c + i * sizeof(PMRContainer))->~PMRContainer(); + } + rs2.release(); + srcs.clear(); + for (size_t i = 0; i != BatchSize; ++i) + srcs.emplace_back(&rs).insert(in.begin(), in.end()); + + st.ResumeTiming(); + } + }); + bench("ctor(iterator, iterator) (unsorted sequence)", [=](auto& st) { const std::size_t size = st.range(0); std::mt19937 randomness; diff --git a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp index f3b86554802ca..407afb14e1e13 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/flat_map.bench.cpp @@ -24,6 +24,14 @@ struct support::adapt_operations<std::flat_map<K, V>> { using InsertionResult = std::pair<typename std::flat_map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = + std::flat_map<K, + V, + std::less<K>, + std::vector<K, typename std::allocator_traits<Allocator>::template rebind_alloc<K>>, + std::vector<V, typename std::allocator_traits<Allocator>::template rebind_alloc<V>>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp index 80eaa549042c6..4f70d26116b0b 100644 --- a/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/flat_multimap.bench.cpp @@ -23,6 +23,14 @@ struct support::adapt_operations<std::flat_multimap<K, V>> { using InsertionResult = typename std::flat_multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = + std::flat_multimap<K, + V, + std::less<K>, + std::vector<K, typename std::allocator_traits<Allocator>::template rebind_alloc<K>>, + std::vector<V, typename std::allocator_traits<Allocator>::template rebind_alloc<V>>>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/map.bench.cpp b/libcxx/test/benchmarks/containers/associative/map.bench.cpp index 142229ae64cad..cc9ffd857caf2 100644 --- a/libcxx/test/benchmarks/containers/associative/map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/map.bench.cpp @@ -38,6 +38,9 @@ struct support::adapt_operations<std::map<K, V>> { using InsertionResult = std::pair<typename std::map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::map<K, V, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp index 15a0b573081bb..8e3abf0b7cf8b 100644 --- a/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/multimap.bench.cpp @@ -24,6 +24,9 @@ struct support::adapt_operations<std::multimap<K, V>> { using InsertionResult = typename std::multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::multimap<K, V, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git 
a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp index c205e0a4f793f..7bafd0ab52dce 100644 --- a/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations<std::multiset<K>> { using InsertionResult = typename std::multiset<K>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::multiset<K, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/set.bench.cpp b/libcxx/test/benchmarks/containers/associative/set.bench.cpp index 50ee142b6e8b3..e5a6cc58913d2 100644 --- a/libcxx/test/benchmarks/containers/associative/set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/set.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations<std::set<K>> { using InsertionResult = std::pair<typename std::set<K>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::set<K, std::less<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp index d670c531910ea..ddfc90c306010 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp @@ -37,6 +37,9 @@ struct support::adapt_operations<std::unordered_map<K, V>> { using InsertionResult = std::pair<typename std::unordered_map<K, V>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::unordered_map<K, V, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp index 8738ca4bf9f0c..5d92bd8b2deaf 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multimap.bench.cpp @@ -23,6 +23,9 @@ struct support::adapt_operations<std::unordered_multimap<K, V>> { using InsertionResult = typename std::unordered_multimap<K, V>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::unordered_multimap<K, V, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp index 4888b01bfeba0..09412fc4aeae7 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_multiset.bench.cpp @@ -22,6 +22,9 @@ struct support::adapt_operations<std::unordered_multiset<K>> { using InsertionResult = typename std::unordered_multiset<K>::iterator; static auto get_iterator(InsertionResult const& result) { return result; } + + template <class Allocator> + using rebind_alloc = std::unordered_multiset<K, std::hash<K>, std::equal_to<K>, 
Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp index 89443a597e85a..1b6663321b43c 100644 --- a/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp +++ b/libcxx/test/benchmarks/containers/associative/unordered_set.bench.cpp @@ -24,6 +24,9 @@ struct support::adapt_operations<std::unordered_set<K>> { using InsertionResult = std::pair<typename std::unordered_set<K>::iterator, bool>; static auto get_iterator(InsertionResult const& result) { return result.first; } + + template <class Allocator> + using rebind_alloc = std::unordered_set<K, std::hash<K>, std::equal_to<K>, Allocator>; }; int main(int argc, char** argv) { diff --git a/libcxx/test/benchmarks/streams/ofstream.bench.cpp b/libcxx/test/benchmarks/streams/fstream.bench.cpp similarity index 54% rename from libcxx/test/benchmarks/streams/ofstream.bench.cpp rename to libcxx/test/benchmarks/streams/fstream.bench.cpp index 60606a9d67e2f..3ca1801ca8d03 100644 --- a/libcxx/test/benchmarks/streams/ofstream.bench.cpp +++ b/libcxx/test/benchmarks/streams/fstream.bench.cpp @@ -11,7 +11,7 @@ #include <benchmark/benchmark.h> -static void bm_write(benchmark::State& state) { +static void bm_ofstream_write(benchmark::State& state) { std::vector<char> buffer; buffer.resize(16384); @@ -20,6 +20,24 @@ static void bm_write(benchmark::State& state) { for (auto _ : state) stream.write(buffer.data(), buffer.size()); } -BENCHMARK(bm_write); +BENCHMARK(bm_ofstream_write); + +static void bm_ifstream_read(benchmark::State& state) { + std::vector<char> buffer; + buffer.resize(16384); + + std::ofstream gen_testfile("testfile"); + gen_testfile.write(buffer.data(), buffer.size()); + + std::ifstream stream("testfile"); + assert(stream); + + for (auto _ : state) { + stream.read(buffer.data(), buffer.size()); + benchmark::DoNotOptimize(buffer); + stream.seekg(0); + } +} +BENCHMARK(bm_ifstream_read); BENCHMARK_MAIN(); diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp index 248f282209fd7..acd20ce525a0d 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp @@ -21,7 +21,7 @@ #include "../flat_helpers.h" #include "test_macros.h" -bool test() { +constexpr bool test() { using M = std::flat_multiset<TrackCopyMove>; { M m; @@ -43,6 +43,9 @@ bool test() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp index 57a581c6c5cb9..c2fcd86fcf913 100644 --- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp @@ -20,27 +20,36 @@ #include <cassert> #include <flat_set> #include <ranges> -#include <sstream> #include <vector> #include "../flat_helpers.h" +#include "test_iterators.h" #include "test_macros.h" -void test() { +constexpr bool test() { NotQuiteSequenceContainer<int> v; std::flat_multiset s(v); - std::istringstream ints("0 1 1 0"); - auto r = 
std::ranges::subrange(std::istream_iterator<int>(ints), std::istream_iterator<int>()) | - std::views::transform([](int i) { return i * i; }); + + int ar[] = {0, 1, 1, 0}; + using Iter = cpp20_input_iterator<const int*>; + using Sent = sentinel_wrapper<Iter>; + using R = std::ranges::subrange<Iter, Sent>; + auto r = R(Iter(ar), Sent(Iter(ar + 4))); + static_assert( ![](auto& t) { return requires { t.insert_range(t.end(), r); }; }(v), "This test is to test the case where the underlying container does not provide insert_range"); s.insert_range(r); assert(std::ranges::equal(s, std::vector<int>{0, 0, 1, 1})); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp index d421eaf5cb8f1..f020516a2495a 100644 --- a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp @@ -11,13 +11,122 @@ // check that <string> functions are marked [[nodiscard]] #include <string> +#include <string_view> #include "test_macros.h" +std::string prval(); + void test() { - std::string string; - string.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::string str; + const std::string cstr; + std::string_view sv; + + str[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.c_str(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.substr(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + prval().substr(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cstr.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rend(); // expected-warning {{ignoring return 
value of function declared with 'nodiscard' attribute}} + cstr.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.capacity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.length(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.rfind(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.rfind("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + // clang-format off + str.find_first_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_last_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_of("", 0, 0); // expected-warning 
{{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_first_not_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_first_not_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.find_last_not_of(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.find_last_not_of("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.compare(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, sv, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, str, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, ""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.compare(0, 0, "", 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + // clang-format on + +#if TEST_STD_VER >= 20 + str.starts_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.starts_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.starts_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + str.ends_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.ends_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.ends_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + +#if TEST_STD_VER >= 23 + str.contains(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.contains(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.contains(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + #if TEST_STD_VER >= 26 - 
string.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + str.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} #endif } diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp index 283adbc057d1e..30e7b66d42325 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp +++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp @@ -19,4 +19,4 @@ std::basic_filebuf<char, std::char_traits<wchar_t> > f; // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} -// expected-error@*:* 10 {{only virtual member functions can be marked 'override'}} +// expected-error@*:* 11 {{only virtual member functions can be marked 'override'}} diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp index ba6f3c31d3e34..daafb36f9151a 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp +++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp @@ -21,7 +21,7 @@ std::basic_fstream<char, std::char_traits<wchar_t> > f; // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} // expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}} -// expected-error@*:* 12 {{only virtual member functions can be marked 'override'}} +// expected-error@*:* 13 {{only virtual member functions can be marked 'override'}} // FIXME: As of commit r324062 Clang incorrectly generates a diagnostic about mismatching // exception specifications for types which are already invalid for one reason or another. diff --git a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp index 16d66e3be14ee..e5d48a35f4fd7 100644 --- a/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp +++ b/libcxx/test/libcxx/input.output/iostreams.base/ios.base/ios.base.cons/dtor.uninitialized.pass.cpp @@ -10,8 +10,7 @@ // The fix for issue 57964 requires an updated dylib due to explicit // instantiations. That means Apple backdeployment targets remain broken. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <ios> diff --git a/libcxx/test/libcxx/numerics/nodiscard.verify.cpp b/libcxx/test/libcxx/numerics/nodiscard.verify.cpp new file mode 100644 index 0000000000000..10da62feca7c0 --- /dev/null +++ b/libcxx/test/libcxx/numerics/nodiscard.verify.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+// <numeric>
+
+// Check that the functions in <bit> and <numeric> are marked [[nodiscard]]
+
+#include <bit>
+#include <numeric>
+
+#include "test_macros.h"
+
+void test() {
+  // [bit.rotate]
+  std::rotl(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::rotr(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  // clang-format off
+#if TEST_STD_VER >= 26
+  // [numeric.sat]
+  std::add_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::sub_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::mul_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::div_sat(94, 82);               // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::saturate_cast<signed int>(49); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+#endif // TEST_STD_VER >= 26
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
index f428c49fd05f4..c9df9f061905e 100644
--- a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp
@@ -13,6 +13,8 @@
 // Clang 19 and AppleClang don't have diagnose_if with diagnostic flags
 // UNSUPPORTED: clang-19, apple-clang-17
 
+// ADDITIONAL_COMPILE_FLAGS: -Wno-unused-result
+
 #include <string>
 
 #include "test_macros.h"
diff --git a/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp b/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp
new file mode 100644
index 0000000000000..f4f438d348a85
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.traits/is.clock.verify.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+//
+// template<class T> struct is_clock;
+// template<class T> constexpr bool is_clock_v = is_clock<T>::value;
+
+// [time.traits.is.clock]/3:
+// The behavior of a program that adds specializations for is_clock is undefined.
+
+// [namespace.std]/3:
+// The behavior of a C++ program is undefined if it declares an explicit or partial specialization of any standard
+// library variable template, except where explicitly permitted by the specification of that variable template.
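+
+// The checks below can only fire with compilers that provide the
+// -Winvalid-specializations diagnostic (probed via __has_warning below); on
+// other compilers the invalid specializations are not compiled at all, so no
+// diagnostics are expected there.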
+ +#include <chrono> +#include <ratio> + +#if !__has_warning("-Winvalid-specializations") +// expected-no-diagnostics +#else + +template <> +struct std::chrono::is_clock<int> : std::false_type {}; // expected-error@*:* {{'is_clock' cannot be specialized}} + +template <> +constexpr bool std::chrono::is_clock_v<float> = false; // expected-error@*:* {{'is_clock_v' cannot be specialized}} + +#endif diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version diff --git a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp deleted file mode 100644 index c04e9443c8e67..0000000000000 --- a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp +++ /dev/null @@ -1,353 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <__type_traits/is_replaceable.h> -#include <array> -#include <deque> -#include <exception> -#include <expected> -#include <memory> -#include <optional> -#include <string> -#include <tuple> -#include <type_traits> -#include <variant> -#include <vector> - -#include "constexpr_char_traits.h" -#include "test_allocator.h" -#include "test_macros.h" - -#ifndef TEST_HAS_NO_LOCALIZATION -# include <locale> -#endif - -template <class T> -struct NonPropagatingStatefulMoveAssignAlloc : std::allocator<T> { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::false_type; - template <class U> - struct rebind { - using other = NonPropagatingStatefulMoveAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatefulCopyAssignAlloc : std::allocator<T> { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::false_type; - template <class U> - struct rebind { - using other = NonPropagatingStatefulCopyAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatelessMoveAssignAlloc : std::allocator<T> { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::true_type; - template <class U> - struct rebind { - using other = NonPropagatingStatelessMoveAssignAlloc<U>; - }; -}; - -template <class T> -struct NonPropagatingStatelessCopyAssignAlloc : std::allocator<T> { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::true_type; - template <class U> - struct rebind { - using other = NonPropagatingStatelessCopyAssignAlloc<U>; - }; -}; - -template <class T> -struct NonReplaceableStatelessAlloc : std::allocator<T> { - // Ensure that we don't consider an allocator that is a member of a container to be - // replaceable if it's not replaceable, even if it always compares equal and always propagates. 
- using propagate_on_container_move_assignment = std::true_type; - using propagate_on_container_copy_assignment = std::true_type; - using is_always_equal = std::true_type; - NonReplaceableStatelessAlloc() = default; - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc const&) {} - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc&&) = default; - template <class U> - struct rebind { - using other = NonReplaceableStatelessAlloc<U>; - }; -}; -static_assert(!std::__is_replaceable<NonReplaceableStatelessAlloc<int> >::value, ""); - -static_assert(!std::__is_replaceable<test_allocator<char> >::value, ""); // we use that property below - -struct Empty {}; -static_assert(std::__is_replaceable<char>::value, ""); -static_assert(std::__is_replaceable<int>::value, ""); -static_assert(std::__is_replaceable<double>::value, ""); -static_assert(std::__is_replaceable<Empty>::value, ""); - -struct TriviallyCopyable { - char c; - int i; - Empty s; -}; -static_assert(std::__is_replaceable<TriviallyCopyable>::value, ""); - -struct NotTriviallyCopyable { - NotTriviallyCopyable(const NotTriviallyCopyable&); - ~NotTriviallyCopyable(); -}; -static_assert(!std::__is_replaceable<NotTriviallyCopyable>::value, ""); - -struct MoveOnlyTriviallyCopyable { - MoveOnlyTriviallyCopyable(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable& operator=(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable(MoveOnlyTriviallyCopyable&&) = default; - MoveOnlyTriviallyCopyable& operator=(MoveOnlyTriviallyCopyable&&) = default; -}; -static_assert(std::__is_replaceable<MoveOnlyTriviallyCopyable>::value, ""); - -struct CustomCopyAssignment { - CustomCopyAssignment(const CustomCopyAssignment&) = default; - CustomCopyAssignment(CustomCopyAssignment&&) = default; - CustomCopyAssignment& operator=(const CustomCopyAssignment&); - CustomCopyAssignment& operator=(CustomCopyAssignment&&) = default; -}; -static_assert(!std::__is_replaceable<CustomCopyAssignment>::value, ""); - -struct CustomMoveAssignment { - CustomMoveAssignment(const CustomMoveAssignment&) = default; - CustomMoveAssignment(CustomMoveAssignment&&) = default; - CustomMoveAssignment& operator=(const CustomMoveAssignment&) = default; - CustomMoveAssignment& operator=(CustomMoveAssignment&&); -}; -static_assert(!std::__is_replaceable<CustomMoveAssignment>::value, ""); - -// library-internal types -// ---------------------- - -// __split_buffer -static_assert( - std::__is_replaceable<std::__split_buffer<int, std::allocator<int>, std::__split_buffer_pointer_layout> >::value, - ""); -static_assert(std::__is_replaceable<std::__split_buffer<NotTriviallyCopyable, - std::allocator<NotTriviallyCopyable>, - std::__split_buffer_pointer_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulCopyAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulMoveAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessCopyAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessMoveAssignAlloc<int>, std::__split_buffer_pointer_layout > >:: - value, - ""); - -static_assert( - std::__is_replaceable<std::__split_buffer<int, std::allocator<int>, 
std::__split_buffer_size_layout> >::value, ""); -static_assert(std::__is_replaceable<std::__split_buffer<NotTriviallyCopyable, - std::allocator<NotTriviallyCopyable>, - std::__split_buffer_size_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulCopyAssignAlloc<int>, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatefulMoveAssignAlloc<int>, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessCopyAssignAlloc<int>, std::__split_buffer_size_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer<int, NonPropagatingStatelessMoveAssignAlloc<int>, std::__split_buffer_size_layout > >:: - value, - ""); - -// standard library types -// ---------------------- - -// array -static_assert(std::__is_replaceable<std::array<int, 0> >::value, ""); -static_assert(std::__is_replaceable<std::array<NotTriviallyCopyable, 0> >::value, ""); -static_assert(std::__is_replaceable<std::array<std::unique_ptr<int>, 0> >::value, ""); - -static_assert(std::__is_replaceable<std::array<int, 1> >::value, ""); -static_assert(!std::__is_replaceable<std::array<NotTriviallyCopyable, 1> >::value, ""); -static_assert(std::__is_replaceable<std::array<std::unique_ptr<int>, 1> >::value, ""); - -// basic_string -struct MyChar { - char c; -}; -template <class T> -struct NotReplaceableCharTraits : constexpr_char_traits<T> { - NotReplaceableCharTraits(const NotReplaceableCharTraits&); - NotReplaceableCharTraits& operator=(const NotReplaceableCharTraits&); - ~NotReplaceableCharTraits(); -}; - -static_assert(std::__is_replaceable<std::basic_string<char, std::char_traits<char>, std::allocator<char> > >::value, - ""); -static_assert( - std::__is_replaceable<std::basic_string<char, NotReplaceableCharTraits<char>, std::allocator<char> > >::value, ""); -static_assert( - std::__is_replaceable<std::basic_string<MyChar, constexpr_char_traits<MyChar>, std::allocator<MyChar> > >::value, - ""); -static_assert(!std::__is_replaceable<std::basic_string<char, std::char_traits<char>, test_allocator<char> > >::value, - ""); -static_assert(!std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonReplaceableStatelessAlloc<char> > >::value, - ""); -static_assert(std::__is_replaceable< - std::basic_string<MyChar, NotReplaceableCharTraits<MyChar>, std::allocator<MyChar> > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatefulCopyAssignAlloc<char> > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatefulMoveAssignAlloc<char> > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatelessCopyAssignAlloc<char> > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string<char, std::char_traits<char>, NonPropagatingStatelessMoveAssignAlloc<char> > >::value, - ""); - -// deque -static_assert(std::__is_replaceable<std::deque<int> >::value, ""); -static_assert(std::__is_replaceable<std::deque<NotTriviallyCopyable> >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, test_allocator<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, NonReplaceableStatelessAlloc<int> > >::value, ""); 
-static_assert(!std::__is_replaceable<std::deque<int, NonPropagatingStatefulCopyAssignAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::deque<int, NonPropagatingStatefulMoveAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::deque<int, NonPropagatingStatelessCopyAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::deque<int, NonPropagatingStatelessMoveAssignAlloc<int> > >::value, ""); - -// exception_ptr -#ifndef _LIBCPP_ABI_MICROSOFT -static_assert(std::__is_replaceable<std::exception_ptr>::value, ""); -#endif - -// expected -#if TEST_STD_VER >= 23 -static_assert(std::__is_replaceable<std::expected<int, int> >::value); -static_assert(!std::__is_replaceable<std::expected<CustomCopyAssignment, int>>::value); -static_assert(!std::__is_replaceable<std::expected<int, CustomCopyAssignment>>::value); -static_assert(!std::__is_replaceable<std::expected<CustomCopyAssignment, CustomCopyAssignment>>::value); -#endif - -// locale -#ifndef TEST_HAS_NO_LOCALIZATION -static_assert(std::__is_replaceable<std::locale>::value, ""); -#endif - -// optional -#if TEST_STD_VER >= 17 -static_assert(std::__is_replaceable<std::optional<int>>::value, ""); -static_assert(!std::__is_replaceable<std::optional<CustomCopyAssignment>>::value, ""); -#endif - -// pair -static_assert(std::__is_replaceable<std::pair<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::pair<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); - -// shared_ptr -static_assert(std::__is_replaceable<std::shared_ptr<int> >::value, ""); - -// tuple -#if TEST_STD_VER >= 11 -static_assert(std::__is_replaceable<std::tuple<> >::value, ""); - -static_assert(std::__is_replaceable<std::tuple<int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment> >::value, ""); - -static_assert(std::__is_replaceable<std::tuple<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::tuple<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); -#endif // TEST_STD_VER >= 11 - -// unique_ptr -struct NonReplaceableDeleter { - NonReplaceableDeleter(const NonReplaceableDeleter&); - NonReplaceableDeleter& operator=(const NonReplaceableDeleter&); - ~NonReplaceableDeleter(); - - template <class T> - void operator()(T*); -}; - -struct NonReplaceablePointer { - struct pointer { - pointer(const pointer&); - pointer& operator=(const pointer&); - ~pointer(); - }; - - template <class T> - void operator()(T*); -}; - -static_assert(std::__is_replaceable<std::unique_ptr<int> >::value, ""); -static_assert(std::__is_replaceable<std::unique_ptr<CustomCopyAssignment> >::value, ""); -static_assert(std::__is_replaceable<std::unique_ptr<int[]> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int, NonReplaceableDeleter> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int[], NonReplaceableDeleter> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int, NonReplaceablePointer> >::value, ""); -static_assert(!std::__is_replaceable<std::unique_ptr<int[], NonReplaceablePointer> >::value, ""); - -// variant -#if TEST_STD_VER >= 17 
-static_assert(std::__is_replaceable<std::variant<int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment> >::value, ""); - -static_assert(std::__is_replaceable<std::variant<int, int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment, int> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<int, CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::variant<CustomCopyAssignment, CustomCopyAssignment> >::value, ""); -#endif // TEST_STD_VER >= 17 - -// vector -static_assert(std::__is_replaceable<std::vector<int> >::value, ""); -static_assert(std::__is_replaceable<std::vector<CustomCopyAssignment> >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, test_allocator<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonReplaceableStatelessAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonPropagatingStatefulCopyAssignAlloc<int> > >::value, ""); -static_assert(!std::__is_replaceable<std::vector<int, NonPropagatingStatefulMoveAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::vector<int, NonPropagatingStatelessCopyAssignAlloc<int> > >::value, ""); -static_assert(std::__is_replaceable<std::vector<int, NonPropagatingStatelessMoveAssignAlloc<int> > >::value, ""); - -// weak_ptr -static_assert(std::__is_replaceable<std::weak_ptr<CustomCopyAssignment> >::value, ""); - -// TODO: Mark all the replaceable STL types as such diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp similarity index 58% rename from libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp rename to libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp index 885534a85c3cb..5c66bc11fca4c 100644 --- a/libcxx/test/std/numerics/bit/bitops.rot/nodiscard.verify.cpp +++ b/libcxx/test/libcxx/utilities/function.objects/lifetimebound.verify.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-pessimizing-move -Wno-unused-variable -// Check that std::rotl and std::rotr are marked 
[[nodiscard]] +#include <functional> -#include <bit> +#include "test_macros.h" + +// clang-format off void func() { - std::rotl(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} - std::rotr(0u, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + auto&& v1 = std::identity()(1); // expected-warning {{temporary bound to local reference 'v1' will be destroyed at the end of the full-expression}} } diff --git a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp index 0b48bc92f02af..9cb5b2ffbae97 100644 --- a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp +++ b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp @@ -30,7 +30,7 @@ struct value_init_tag {}; -template <class T, int _Idx, bool CanBeEmptyBase = std::is_empty<T>::value && !std::__libcpp_is_final<T>::value> +template <class T, int _Idx, bool CanBeEmptyBase = std::is_empty<T>::value && !std::__is_final_v<T>> struct compressed_pair_elem { explicit compressed_pair_elem(value_init_tag) : value_() {} diff --git a/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp new file mode 100644 index 0000000000000..ff3ecfbbc120c --- /dev/null +++ b/libcxx/test/libcxx/utilities/meta/is_within_lifetime.verify.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: gcc-15, apple-clang-17 + +// <type_traits> + +// LWG4138 <https://cplusplus.github.io/LWG/issue4138> +// std::is_within_lifetime shouldn't work when a function type is +// explicitly specified, even if it isn't evaluated + +#include <type_traits> + +template <class T> +consteval bool checked_is_within_lifetime(T* p) { + return p ? 
std::is_within_lifetime<T>(p) : false;
+}
+static_assert(!checked_is_within_lifetime<int>(nullptr));
+static_assert(!checked_is_within_lifetime<void()>(nullptr));
+// expected-error@*:* {{function pointer argument to '__builtin_is_within_lifetime' is not allowed}}
diff --git a/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
index 3cdd7553e2e5d..b604579e43557 100644
--- a/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp
@@ -23,8 +23,7 @@ concept has_iterator_aliases = requires {
 static_assert(has_iterator_aliases<std::optional<int>>);
 static_assert(has_iterator_aliases<std::optional<const int>>);
-
-// TODO: Uncomment these once P2988R12 is implemented, as they would be testing optional<T&>
-
-// static_assert(!has_iterator_aliases<std::optional<int (&)[]>>);
-// static_assert(!has_iterator_aliases<std::optional<void (&)(int, char)>>);
+static_assert(has_iterator_aliases<std::optional<int&>>);
+static_assert(has_iterator_aliases<std::optional<const int&>>);
+static_assert(!has_iterator_aliases<std::optional<int (&)[1]>>);
+static_assert(!has_iterator_aliases<std::optional<int (&)()>>);
diff --git a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp
new file mode 100644
index 0000000000000..25df0dd6c1936
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/value_or.compile.pass.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// <optional> + +// template <class U> T optional<T>::value_or(U&&); + +#include <concepts> +#include <optional> + +template <typename Opt, typename T> +concept has_value_or = requires(Opt opt, T&& t) { + { opt.value_or(t) } -> std::same_as<T>; +}; + +static_assert(has_value_or<std::optional<int>, int>); +static_assert(has_value_or<std::optional<int&>, int&>); +static_assert(has_value_or<std::optional<const int&>, const int&>); +static_assert(!has_value_or<std::optional<int (&)[1]>&&, int (&)[1]>); +static_assert(!has_value_or<std::optional<int (&)()>&&, int (&)()>); diff --git a/libcxx/test/selftest/dsl/dsl.sh.py b/libcxx/test/selftest/dsl/dsl.sh.py index 93f351f58eb4b..b8ee2ca3d6bb9 100644 --- a/libcxx/test/selftest/dsl/dsl.sh.py +++ b/libcxx/test/selftest/dsl/dsl.sh.py @@ -61,7 +61,7 @@ def setUp(self): self.litConfig = lit.LitConfig.LitConfig( progname="lit", path=[], - quiet=False, + diagnostic_level="note", useValgrind=False, valgrindLeakCheck=False, valgrindArgs=[], diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp index 52f77438df2ce..88a76d3c1c8b8 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<int>, KeyContainer>; M m; @@ -38,15 +38,23 @@ void test_one() { assert(m.empty()); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp index 4e3d1414b28af..fb9c38f592262 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp @@ -24,7 +24,7 @@ #include "test_allocator.h" #include "test_macros.h" -void test() { +constexpr bool test() { { using A1 = limited_allocator<int, 10>; using C = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>; @@ -59,10 +59,15 @@ void test() { assert(c.max_size() <= max_dist); assert(c.max_size() <= alloc_max_size(std::allocator<char>())); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp index 4aff08b8127b6..156bb27fae992 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000 // <flat_set> @@ -23,7 +25,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using M = std::flat_multiset<int, std::less<int>, KeyContainer>; using S = typename M::size_type; { @@ -46,7 +48,7 @@ void test_one() { } { M m; - S s = 500000; + S s = 5000; for (std::size_t i = 0u; i < s; ++i) { m.emplace(i); m.emplace(i); @@ -57,15 +59,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp index 4fffcb304d20a..2426fbc0fc063 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp @@ -14,6 +14,7 @@ // explicit flat_multiset(const Allocator& a); #include <cassert> +#include <deque> #include <flat_set> #include <functional> #include <vector> @@ -22,7 +23,8 @@ #include "test_allocator.h" #include "../../../test_compare.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -30,8 +32,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const A1&>); @@ -39,25 +41,38 @@ void test() 
{ static_assert(!std::is_constructible_v<M1, const A2&>); static_assert(!std::is_constructible_v<M2, const A1&>); } - { - // explicit - using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>; - - static_assert(std::is_constructible_v<M, test_allocator<int>>); - static_assert(!std::is_convertible_v<test_allocator<int>, M>); - } { using A = test_allocator<short>; - using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>; + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>; M m(A(0, 5)); assert(m.empty()); assert(m.begin() == m.end()); assert(std::move(m).extract().get_allocator().get_id() == 5); } + { + // explicit + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>; + + static_assert(std::is_constructible_v<M, test_allocator<int>>); + static_assert(!std::is_convertible_v<test_allocator<int>, M>); + } +} + +constexpr bool test() { + test<std::vector>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp index ae81ab044932d..a895117517ef4 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp @@ -26,7 +26,7 @@ #include "test_allocator.h" template <class KeyContainer> -void test() { +constexpr void test() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -53,16 +53,24 @@ void test() { } } -void test() { +constexpr bool test() { test<std::vector<int>>(); test<std::vector<double>>(); - test<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque<int>>(); test<MinSequenceContainer<int>>(); test<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp index 6b68589e6814f..43ebea740f66c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp @@ -20,11 +20,35 @@ #include <type_traits> #include <vector> +#include "MinSequenceContainer.h" +#include "min_allocator.h" #include "test_macros.h" #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <class KeyContainer> +constexpr void test_compare() { + using Key = typename KeyContainer::value_type; + { + // The one-argument ctor is explicit. 
+ using C = test_less<Key>; + static_assert(std::is_constructible_v<std::flat_multiset<Key, C>, C>); + static_assert(!std::is_convertible_v<C, std::flat_multiset<Key, C>>); + + static_assert(std::is_constructible_v<std::flat_multiset<Key>, std::less<Key>>); + static_assert(!std::is_convertible_v<std::less<Key>, std::flat_multiset<Key>>); + } + { + using C = test_less<Key>; + auto m = std::flat_multiset<Key, C>(C(3)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(3)); + } +} + +template <template <class...> class KeyContainer> +constexpr void test_compare_alloc() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -32,8 +56,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const C&, const A1&>); @@ -41,26 +65,10 @@ void test() { static_assert(!std::is_constructible_v<M1, const C&, const A2&>); static_assert(!std::is_constructible_v<M2, const C&, const A1&>); } - { - using C = test_less<int>; - auto m = std::flat_multiset<int, C>(C(3)); - assert(m.empty()); - assert(m.begin() == m.end()); - assert(m.key_comp() == C(3)); - } - { - // The one-argument ctor is explicit. - using C = test_less<int>; - static_assert(std::is_constructible_v<std::flat_multiset<int, C>, C>); - static_assert(!std::is_convertible_v<C, std::flat_multiset<int, C>>); - - static_assert(std::is_constructible_v<std::flat_multiset<int>, std::less<int>>); - static_assert(!std::is_convertible_v<std::less<int>, std::flat_multiset<int>>); - } { using C = test_less<int>; using A1 = test_allocator<int>; - auto m = std::flat_multiset<int, C, std::vector<int, A1>>(C(4), A1(5)); + auto m = std::flat_multiset<int, C, KeyContainer<int, A1>>(C(4), A1(5)); assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp() == C(4)); @@ -68,9 +76,9 @@ void test() { } { // explicit(false) - using C = test_less<int>; - using A1 = test_allocator<int>; - std::flat_multiset<int, C, std::deque<int, A1>> m = {C(4), A1(5)}; + using C = test_less<int>; + using A1 = test_allocator<int>; + std::flat_multiset<int, C, KeyContainer<int, A1>> m = {C(4), A1(5)}; assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp() == C(4)); @@ -78,8 +86,29 @@ void test() { } } +constexpr bool test() { + test_compare<std::vector<int>>(); + test_compare<MinSequenceContainer<int>>(); + test_compare<std::vector<int, min_allocator<int>>>(); + + test_compare_alloc<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_compare<std::deque<int>>(); + test_compare_alloc<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp index 78eac420a8f22..1a476009e45d3 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp @@ -35,7 +35,8 @@ void conversion_test(T); template <class T, class... Args> concept ImplicitlyConstructible = requires(Args&&... args) { conversion_test<T>({std::forward<Args>(args)...}); }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true @@ -43,8 +44,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const V1&, const A1&>); @@ -59,15 +60,15 @@ void test() { } { // flat_multiset(container_type) - using M = std::flat_multiset<int>; - std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks); - int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>; + KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks); + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; assert(std::ranges::equal(m, expected)); // explicit(false) - static_assert(std::is_constructible_v<M, const std::vector<int>&>); - static_assert(!ImplicitlyConstructible<M, const std::vector<int>&>); + static_assert(std::is_constructible_v<M, const KeyContainer<int>&>); + static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&>); m = M(std::move(ks)); assert(ks.empty()); // it was moved-from @@ -77,7 +78,7 @@ void test() { // flat_multiset(container_type) // move-only int expected[] = {3, 3, 2, 1}; - using Ks = std::deque<MoveOnly, min_allocator<MoveOnly>>; + using Ks = KeyContainer<MoveOnly, min_allocator<MoveOnly>>; using M = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, Ks>; Ks ks; ks.push_back(1); @@ -92,8 +93,8 @@ void test() { // flat_multiset(container_type) // container's allocators are used using A = test_allocator<int>; - using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>; - auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>; + auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); auto m = M(std::move(ks)); assert(ks.empty()); // it was moved-from assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); @@ -102,22 +103,22 @@ void test() { } { // flat_multiset(container_type, key_compare) - using C = test_less<int>; - using M = std::flat_multiset<int, C>; - std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks, C(4)); + using C = test_less<int>; + using M = std::flat_multiset<int, C, KeyContainer<int>>; + KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4)); assert(std::ranges::equal(m, std::vector<int>{1, 1, 1, 2, 2, 2, 3, 3, 3})); assert(m.key_comp() == C(4)); // explicit - static_assert(std::is_constructible_v<M, const std::vector<int>&, const C&>); - static_assert(!ImplicitlyConstructible<M, const std::vector<int>&, const C&>); + static_assert(std::is_constructible_v<M, const KeyContainer<int>&, const C&>); + static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&, const C&>); } { // flat_multiset(container_type , const Allocator&) using A = test_allocator<int>; - using 
M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>; - auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>; + auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); auto m = M(ks, A(4)); // replaces the allocators assert(!ks.empty()); // it was an lvalue above assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); @@ -125,7 +126,7 @@ void test() { assert(keys.get_allocator() == A(4)); // explicit(false) - static_assert(ImplicitlyConstructible<M, const std::deque<int, A>&, const A&>); + static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>); M m2 = {ks, A(4)}; // implicit ctor assert(!ks.empty()); // it was an lvalue above assert(m2 == m); @@ -134,19 +135,19 @@ void test() { } { // flat_multiset(container_type , const Allocator&) - using C = test_less<int>; - using A = test_allocator<int>; - using M = std::flat_multiset<int, C, std::vector<int, A>>; - std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; - auto m = M(ks, C(4), A(5)); - assert(std::ranges::equal(m, std::vector<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3})); + using C = test_less<int>; + using A = test_allocator<int>; + using M = std::flat_multiset<int, C, KeyContainer<int, A>>; + KeyContainer<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4), A(5)); + assert(std::ranges::equal(m, KeyContainer<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3})); assert(m.key_comp() == C(4)); auto m_copy = m; auto keys = std::move(m_copy).extract(); assert(keys.get_allocator() == A(5)); // explicit(false) - static_assert(ImplicitlyConstructible<M, const std::vector<int, A>&, const A&>); + static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>); M m2 = {ks, C(4), A(5)}; assert(m2 == m); assert(m2.key_comp() == C(4)); @@ -155,8 +156,22 @@ void test() { } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp index b4f7220e1bac7..55f3defc5ddff 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp @@ -14,6 +14,7 @@ #include <algorithm> #include <cassert> +#include <deque> #include <flat_set> #include <vector> @@ -21,10 +22,11 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6)); const int expected[] = {1, 1, 3, 3, 5}; using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); @@ -43,7 +45,7 @@ void test() { } { using C = test_less<int>; - using Ks = std::vector<int, other_allocator<int>>; + using Ks = KeyContainer<int, other_allocator<int>>; auto ks = Ks({1, 3, 5, 3, 1}, other_allocator<int>(6)); const int expected[] = {1, 1, 3, 3, 5}; using M = std::flat_multiset<int, C, Ks>; @@ -63,8 +65,22 @@ void test() { } } +constexpr bool test() 
{ + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp index ec8ad824ea14b..ec9f14ecab6bd 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp @@ -23,7 +23,8 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // The constructors in this subclause shall not participate in overload // resolution unless uses_allocator_v<container_type, Alloc> is true. @@ -31,8 +32,8 @@ void test() { using C = test_less<int>; using A1 = test_allocator<int>; using A2 = other_allocator<int>; - using V1 = std::vector<int, A1>; - using V2 = std::vector<int, A2>; + using V1 = KeyContainer<int, A1>; + using V2 = KeyContainer<int, A2>; using M1 = std::flat_multiset<int, C, V1>; using M2 = std::flat_multiset<int, C, V2>; static_assert(std::is_constructible_v<M1, const M1&, const A1&>); @@ -42,7 +43,7 @@ void test() { } { using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); auto m = M(mo, test_allocator<int>(3)); @@ -59,8 +60,23 @@ void test() { assert(keys2.get_allocator() == test_allocator<int>(6)); } } + +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp index 2b6176ac915a7..2e63a004ffa88 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp @@ -13,6 +13,7 @@ // flat_multiset& operator=(const flat_multiset& m); #include <algorithm> +#include <deque> #include <flat_set> #include <functional> #include <vector> @@ -22,11 +23,12 @@ #include "../../../test_compare.h" #include "test_allocator.h" -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { // test_allocator is not propagated using C = test_less<int>; - std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); + KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6)); using M = std::flat_multiset<int, C, decltype(ks)>; auto mo = M(ks, C(5)); auto m = M({{3, 4, 5, 4}}, C(3), test_allocator<int>(2)); @@ -46,7 +48,7 @@ void test() { { // other_allocator is propagated using C = test_less<int>; - using Ks = std::vector<int, other_allocator<int>>; + using Ks = KeyContainer<int, other_allocator<int>>; auto ks = Ks({1, 3, 
5, 3}, other_allocator<int>(6)); const int expected[] = {1, 3, 3, 5}; using M = std::flat_multiset<int, C, Ks>; @@ -65,7 +67,7 @@ void test() { auto keys2 = std::move(mo).extract(); assert(keys2.get_allocator() == other_allocator<int>(6)); } - { + if (!TEST_IS_CONSTANT_EVALUATED) { // comparator is copied and invariant is preserved using M = std::flat_multiset<int, std::function<bool(int, int)>>; M mo = M({1, 2}, std::less<int>()); @@ -103,8 +105,22 @@ void test() { } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test<std::deque>(); + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp index 16f90322cd31a..3a7ff86c6c040 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp @@ -25,28 +25,29 @@ #include "test_macros.h" struct DefaultCtableComp { - explicit DefaultCtableComp() { default_constructed_ = true; } - bool operator()(int, int) const { return false; } + constexpr explicit DefaultCtableComp() { default_constructed_ = true; } + constexpr bool operator()(int, int) const { return false; } bool default_constructed_ = false; }; struct ThrowingCtorComp { - ThrowingCtorComp() noexcept(false) {} - bool operator()(const auto&, const auto&) const { return false; } + constexpr ThrowingCtorComp() noexcept(false) {} + constexpr bool operator()(const auto&, const auto&) const { return false; } }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { - std::flat_multiset<int> m; + std::flat_multiset<int, std::less<int>, KeyContainer<int>> m; assert(m.empty()); } { // explicit(false) - std::flat_multiset<int> m = {}; + std::flat_multiset<int, std::less<int>, KeyContainer<int>> m = {}; assert(m.empty()); } { - std::flat_multiset<int, DefaultCtableComp, std::deque<int, min_allocator<int>>> m; + std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, min_allocator<int>>> m; assert(m.empty()); assert(m.begin() == m.end()); assert(m.key_comp().default_constructed_); @@ -54,7 +55,7 @@ void test() { { using A1 = explicit_allocator<int>; { - std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m; + std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A1>> m; assert(m.empty()); assert(m.key_comp().default_constructed_); } @@ -67,30 +68,46 @@ void test() { } #if defined(_LIBCPP_VERSION) { - using C = std::flat_multiset<MoveOnly>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>>; static_assert(std::is_nothrow_default_constructible_v<C>); C c; } { - using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, test_allocator<MoveOnly>>>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, test_allocator<MoveOnly>>>; static_assert(std::is_nothrow_default_constructible_v<C>); C c; } #endif // _LIBCPP_VERSION { - using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, other_allocator<MoveOnly>>>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, other_allocator<MoveOnly>>>; 
static_assert(!std::is_nothrow_default_constructible_v<C>); C c; } { - using C = std::flat_multiset<MoveOnly, ThrowingCtorComp>; + using C = std::flat_multiset<MoveOnly, ThrowingCtorComp, KeyContainer<MoveOnly>>; static_assert(!std::is_nothrow_default_constructible_v<C>); C c; } } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp index f852f2f85572c..f7243fa7e7fb3 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp @@ -23,39 +23,56 @@ #include "test_allocator.h" struct ThrowingDtorComp { - bool operator()(const auto&, const auto&) const; - ~ThrowingDtorComp() noexcept(false) {} + constexpr bool operator()(const auto&, const auto&) const; + constexpr ~ThrowingDtorComp() noexcept(false) {} }; -void test() { +template <template <class...> class KeyContainer> +constexpr void test() { { - using C = std::flat_multiset<MoveOnly, MoveOnly>; + using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly>>; static_assert(std::is_nothrow_destructible_v<C>); C c; } { - using V = std::vector<MoveOnly, test_allocator<MoveOnly>>; + using V = KeyContainer<MoveOnly, test_allocator<MoveOnly>>; using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, V>; static_assert(std::is_nothrow_destructible_v<C>); C c; } { - using V = std::deque<MoveOnly, other_allocator<MoveOnly>>; + using V = KeyContainer<MoveOnly, other_allocator<MoveOnly>>; using C = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, V>; static_assert(std::is_nothrow_destructible_v<C>); C c; } #if defined(_LIBCPP_VERSION) { - using C = std::flat_multiset<MoveOnly, ThrowingDtorComp>; + using C = std::flat_multiset<MoveOnly, ThrowingDtorComp, KeyContainer<MoveOnly>>; static_assert(!std::is_nothrow_destructible_v<C>); C c; } #endif // _LIBCPP_VERSION } +constexpr bool test() { + test<std::vector>(); + +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test<std::deque>(); + } + + return true; +} + int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp index 10638d75bbd14..36f5def21c14c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp @@ -32,12 +32,13 @@ #include "../../../test_compare.h" struct DefaultCtableComp { - explicit DefaultCtableComp() { default_constructed_ = true; } - bool operator()(int, int) const { return false; } + constexpr explicit DefaultCtableComp() { default_constructed_ = true; } + constexpr bool operator()(int, int) const { return false; } bool default_constructed_ = false; }; -void test() { +template <template <class...> 
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -45,8 +46,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -60,10 +61,9 @@ void test() {
     static_assert(!std::is_constructible_v<M1, IL, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, IL, const C&, const A1&>);
   }
-
   { // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::initializer_list<int>, C>);
@@ -78,11 +78,10 @@ void test() {
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, C, std::allocator<int>>);
     static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, std::allocator<int>>);
   }
-
   int expected[] = {1, 2, 2, 3, 3, 5};
   { // flat_multiset(initializer_list<value_type>);
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     std::initializer_list<int> il = {5, 2, 2, 3, 1, 3};
     M m(il);
     assert(std::ranges::equal(m, expected));
@@ -90,13 +89,13 @@ void test() {
   { // flat_multiset(initializer_list<value_type>);
     // explicit(false)
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     M m = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected));
   }
   { // flat_multiset(initializer_list<value_type>);
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     M m = {5, 2, 2, 3, 1, 3};
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
@@ -105,15 +104,14 @@ void test() {
   { // flat_multiset(initializer_list<value_type>);
     // different comparator
-    using M = std::flat_multiset<int, DefaultCtableComp, std::vector<int, A>>;
+    using M = std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A>>;
     M m = {1, 2, 3};
     assert(m.size() == 3);
-    LIBCPP_ASSERT(*m.begin() == 1);
     assert(m.key_comp().default_constructed_);
   }
   { // flat_multiset(initializer_list<value_type>, const Allocator&);
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     A a;
     M m({5, 2, 2, 3, 1, 3}, a);
     assert(std::ranges::equal(m, expected | std::views::reverse));
@@ -122,7 +120,7 @@ void test() {
   { // flat_multiset(initializer_list<value_type>, const key_compare&);
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m = M({5, 2, 2, 3, 1, 3}, C(10));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(10));
     // explicit(false)
     M m2 = {{5, 2, 2, 3, 1, 3}, C(10)};
     assert(m2 == m);
     assert(m2.key_comp() == C(10));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(initializer_list<value_type>, const key_compare&);
     // Sorting uses the comparator that was passed in
-    using M = std::flat_multiset<int, std::function<bool(int, int)>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int, min_allocator<int>>>;
     auto m = M({5, 2, 2, 1, 3, 3}, std::greater<int>());
     assert(std::ranges::equal(m, expected | std::views::reverse));
     assert(m.key_comp()(2, 1) == true);
@@ -143,15 +141,31 @@ void test() {
   { // flat_multiset(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
     using A = explicit_allocator<int>;
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     A a;
     M m({5, 2, 2, 3, 1, 3}, {}, a);
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque>();
+  }
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
index da9aef3dc36cd..0f757db3eb9ac 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
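
The hunks above all repeat one mechanical transformation: a plain void test() becomes a constexpr function template over the backing container. A minimal standalone sketch of that pattern, with illustrative names that are not part of the patch:

#include <deque>
#include <flat_set>
#include <vector>

// One test body, instantiated for several backing sequence containers.
// KeyContainer<int> becomes std::vector<int>, std::deque<int>, ...
template <template <class...> class KeyContainer>
constexpr void test_sketch() {
  using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
  M m{3, 1, 2, 2}; // duplicates are kept; stored sorted as {1, 2, 2, 3}
}

void drive() {
  test_sketch<std::vector>();
  test_sketch<std::deque>();
}
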
@@ -38,8 +39,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -60,7 +61,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator)
     // cpp17_input_iterator
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected));
@@ -71,21 +72,21 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator)
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
     assert(std::ranges::equal(m, expected | std::views::reverse));
   }
   { // flat_multiset(InputIterator , InputIterator)
     // Test when the operands are of array type (also contiguous iterator type)
-    using M = std::flat_multiset<int, std::greater<int>, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     auto m = M(ar, ar);
     assert(m.empty());
   }
   { // flat_multiset(InputIterator , InputIterator, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     auto m = M(ar, ar + 9, C(3));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -98,7 +99,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     auto m = M(ar, ar + 9, A1(5));
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -107,7 +108,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m = {ar, ar + 9, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     assert(std::move(m).extract().get_allocator() == A1(5));
@@ -116,7 +117,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     auto m = M(ar, ar + 9, C(3), A1(5));
     assert(std::ranges::equal(m, expected));
     assert(m.key_comp() == C(3));
@@ -126,7 +127,7 @@ void test() {
   { // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     M m = {ar, ar + 9, {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
     LIBCPP_ASSERT(std::ranges::equal(m, expected));
@@ -134,8 +135,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
index 825ad75cc8f4c..7fb0c0e9c3fd0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
@@ -25,11 +25,12 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo = M({1, 2, 1, 3}, C(5), A(7));
     M m  = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -43,7 +44,7 @@ void test() {
   {
     using C = test_less<int>;
     using A = min_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     M mo = M({1, 2, 1, 3}, C(5), A());
     M m  = std::move(mo);
     assert((m == M{1, 1, 2, 3}));
@@ -54,9 +55,9 @@ void test() {
     assert(mo.key_comp() == C(5));
     assert(std::move(mo).extract().get_allocator() == A());
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // A moved-from flat_multiset maintains its class invariant in the presence of moved-from comparators.
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     M mo = M({1, 2, 1, 3}, std::less<int>());
     M m  = std::move(mo);
     assert(m.size() == 4);
@@ -81,6 +82,16 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 template <class T>
 struct ThrowingMoveAllocator {
   using value_type = T;
@@ -179,6 +190,9 @@ void test_move_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_noexcept();
 #if !defined(TEST_HAS_NO_EXCEPTIONS)
   test_move_exception();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
index ee8258e5ac846..1f095edb73370 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
@@ -24,7 +24,8 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -32,8 +33,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
@@ -45,7 +46,7 @@ void test() {
     int expected[] = {1, 1, 2, 2, 3};
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
     auto mo = M(expected, expected + 5, C(5), A(7));
     auto m  = M(std::move(mo), A(3));
@@ -72,8 +73,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
index 96e046e38668f..62e21811e4962 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
@@ -187,25 +187,12 @@ void test_move_assign_no_except() {
   }
 }
 
-void test() {
-  {
-    using C = test_less<int>;
-    using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
-    M mo = M({1, 1, 2, 3}, C(5), A1(7));
-    M m = M({}, C(3), A1(7));
-    std::same_as<M&> decltype(auto) r = m = std::move(mo);
-    assert(&r == &m);
-    assert((m == M{1, 1, 2, 3}));
-    assert(m.key_comp() == C(5));
-    auto ks = std::move(m).extract();
-    assert(ks.get_allocator() == A1(7));
-    assert(mo.empty());
-  }
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A1 = other_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     M mo = M({4, 4, 5}, C(5), A1(7));
     M m = M({1, 1, 2, 3, 4}, C(3), A1(7));
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -218,7 +205,7 @@ void test() {
   }
   {
     using A = min_allocator<int>;
-    using M = std::flat_multiset<int, std::greater<int>, std::vector<int, A>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
     M mo = M({5, 3, 4, 3}, A());
     M m = M({4, 1, 3, 2, 1}, A());
     std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -228,10 +215,37 @@ void test() {
     auto ks = std::move(m).extract();
     assert(ks.get_allocator() == A());
     assert(mo.empty());
   }
+  {
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
+    M mo = M({1, 1, 2, 3}, C(5), A1(7));
+    M m  = M({}, C(3), A1(7));
+    std::same_as<M&> decltype(auto) r = m = std::move(mo);
+    assert(&r == &m);
+    assert((m == M{1, 1, 2, 3}));
+    assert(m.key_comp() == C(5));
+    auto ks = std::move(m).extract();
+    assert(ks.get_allocator() == A1(7));
+    assert(mo.empty());
+  }
+}
+
+constexpr bool test() {
+  test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_move_assign_clears();
   test_move_assign_no_except();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
index 76485b47ec5ea..36501a566fbd6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
@@ -56,7 +56,8 @@ static_assert(
     !std::
         is_constructible_v<Set, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>, std::allocator<int>>);
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -64,8 +65,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
@@ -84,7 +85,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&)
     // input_range && !common
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R = std::ranges::subrange<Iter, Sent>;
@@ -98,17 +99,17 @@ void test() {
   { // flat_multiset(from_range_t, R&&)
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     using Iter = cpp20_input_iterator<const int*>;
     using Sent = sentinel_wrapper<Iter>;
     using R = std::ranges::subrange<Iter, Sent>;
     auto m = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
-    assert(std::ranges::equal(m, std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+    assert(std::ranges::equal(m, KeyContainer<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
   }
   { // flat_multiset(from_range_t, R&&)
     // contiguous range
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9));
     assert(std::ranges::equal(m, expected));
@@ -116,7 +117,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&)
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), C(3));
     assert(std::ranges::equal(m, expected));
@@ -130,7 +131,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -140,7 +141,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     M m = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -150,7 +151,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     auto m = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
     assert(std::ranges::equal(m, expected));
@@ -161,7 +162,7 @@ void test() {
   { // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
     // explicit(false)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     using R = std::ranges::subrange<const int*>;
     M m = {std::from_range, R(ar, ar + 9), {}, A1(5)}; // implicit ctor
     assert(std::ranges::equal(m, expected));
@@ -169,8 +170,21 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
index 76759be7da8e3..60fd70abc83b5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
@@ -30,7 +30,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
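
Each file gains the same two-part driver, worth reading once in isolation (a sketch assuming the TEST_IS_CONSTANT_EVALUATED and TEST_STD_VER helpers from the suite's test_macros.h):

// Sketch of the recurring driver. std::deque is only usable during
// constant evaluation when the library advertises constexpr support,
// so that instantiation is skipped otherwise.
constexpr bool test() {
  test<std::vector>();

#ifndef __cpp_lib_constexpr_deque
  if (!TEST_IS_CONSTANT_EVALUATED)
#endif
    test<std::deque>();

  return true; // lets the same call feed a static_assert
}

int main(int, char**) {
  test(); // run-time coverage
#if TEST_STD_VER >= 26
  static_assert(test()); // compile-time coverage of constexpr flat_multiset
#endif
  return 0;
}
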
@@ -38,8 +39,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const A1&>);
@@ -52,11 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V2&, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, container_type)
-    using M = std::flat_multiset<int>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
-    auto ks2 = ks;
+    using M              = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
+    auto ks2             = ks;
 
     auto m = M(std::sorted_equivalent, ks);
     assert((m == M{1, 2, 2, 4, 10}));
@@ -71,7 +73,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type)
     // non-default container, comparator and allocator type
-    using Ks = std::deque<int, min_allocator<int>>;
+    using Ks = KeyContainer<int, min_allocator<int>>;
     using M  = std::flat_multiset<int, std::greater<int>, Ks>;
     Ks ks = {10, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent, ks);
@@ -84,8 +86,8 @@ void test() {
     // flat_multiset(sorted_equivalent_t, container_type)
     // allocator copied into the containers
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m = M(std::sorted_equivalent, std::move(ks));
     assert(ks.empty()); // it was moved-from
     assert((m == M{1, 2, 2, 4, 10}));
@@ -93,9 +95,9 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type , key_compare)
-    using C = test_less<int>;
-    using M = std::flat_multiset<int, C>;
-    std::vector<int> ks = {1, 2, 2, 4, 10};
+    using C              = test_less<int>;
+    using M              = std::flat_multiset<int, C, KeyContainer<int>>;
+    KeyContainer<int> ks = {1, 2, 2, 4, 10};
 
     auto m = M(std::sorted_equivalent, ks, C(4));
     assert((m == M{1, 2, 2, 4, 10}));
@@ -108,11 +110,11 @@ void test() {
   }
   {
     // flat_multiset(sorted_equivalent_t, container_type , key_compare, const Allocator&)
-    using C = test_less<int>;
-    using A = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 2, 2, 4, 10};
-    auto m = M(std::sorted_equivalent, ks, C(4), A(5));
+    using C                 = test_less<int>;
+    using A                 = test_allocator<int>;
+    using M                 = std::flat_multiset<int, C, KeyContainer<int, A>>;
+    KeyContainer<int, A> ks = {1, 2, 2, 4, 10};
+    auto m                  = M(std::sorted_equivalent, ks, C(4), A(5));
     assert((m == M{1, 2, 2, 4, 10}));
     assert(m.key_comp() == C(4));
     assert(M(m).extract().get_allocator() == A(5));
@@ -126,8 +128,8 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, container_type , const Allocator&)
     using A = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
-    auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
     auto m = M(std::sorted_equivalent, ks, A(6)); // replaces the allocators
     assert(!ks.empty()); // it was an lvalue above
     assert((m == M{1, 2, 2, 4, 10}));
@@ -140,8 +142,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
index 955662dd233ef..ff10c97c7bd3f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
@@ -31,12 +31,13 @@
 #include "../../../test_compare.h"
 
 template <class T>
-std::initializer_list<T> il = {1, 2, 4, 4, 5};
+constexpr std::initializer_list<T> il = {1, 2, 4, 4, 5};
 
-void test() {
-  const auto il1 = il<int>;
-  const auto il2 = il<short>;
+constexpr auto il1 = il<int>;
+constexpr auto il2 = il<short>;
 
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -44,8 +45,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using IL = std::initializer_list<int>;
@@ -62,7 +63,7 @@ void test() {
   }
   { // initializer_list<value_type> needs to match exactly
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     using C = typename M::key_compare;
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>>);
     static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>, C>);
@@ -88,7 +89,7 @@ void test() {
 
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>);
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
     auto m = M(std::sorted_equivalent, il1);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -97,9 +98,9 @@ void test() {
     M m2 = {std::sorted_equivalent, il1};
     assert(m2 == m);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     auto m = M(std::sorted_equivalent, il1, std::less<int>());
     assert(m == M({1, 2, 4, 4, 5}, std::less<>()));
     assert(m.key_comp()(1, 2) == true);
@@ -111,7 +112,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     std::initializer_list<int> il4{5, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent, il4, std::greater<int>());
     assert((m == M{5, 4, 4, 2, 1}));
@@ -119,7 +120,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const Allocator&)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     auto m = M(std::sorted_equivalent, il2, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
@@ -134,7 +135,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, C, std::vector<short, A1>>;
+    using M  = std::flat_multiset<short, C, KeyContainer<short, A1>>;
     auto m = M(std::sorted_equivalent, il2, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
     assert(m.key_comp() == C(3));
@@ -144,15 +145,29 @@ void test() {
     // flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     M m = {std::sorted_equivalent, il2, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
     assert(std::move(m).extract().get_allocator() == A1(5));
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
index 9ebe45d71d667..a3c998114ad5b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
@@ -28,7 +28,8 @@
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<container_type, Alloc> is true.
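
The std::sorted_equivalent tag exercised throughout these constructors is a promise, not a request: the caller vouches that the input is already sorted (equivalent keys allowed), and the constructor skips the sort. A minimal usage sketch:

#include <cassert>
#include <flat_set>
#include <vector>

void sorted_equivalent_sketch() {
  std::vector<int> ks = {1, 2, 2, 4, 10}; // already sorted, duplicates allowed
  // The tag states a precondition, not a check: passing unsorted data
  // here would be undefined behavior.
  std::flat_multiset<int> m(std::sorted_equivalent, ks);
  assert(m.size() == 5);
  assert(*m.begin() == 1);
}
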
@@ -36,8 +37,8 @@ void test() {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
+    using V1 = KeyContainer<int, A1>;
+    using V2 = KeyContainer<int, A2>;
     using M1 = std::flat_multiset<int, C, V1>;
     using M2 = std::flat_multiset<int, C, V2>;
     using Iter1 = typename M1::iterator;
@@ -52,10 +53,12 @@ void test() {
     static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A1&>);
   }
+
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // cpp17_input_iterator
-    using M = std::flat_multiset<int>;
+    using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+
     int ar[] = {1, 2, 2, 4, 5};
     auto m = M(std::sorted_equivalent, cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 5));
     auto expected = M{1, 2, 2, 4, 5};
@@ -69,16 +72,16 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
     // contiguous iterator
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5);
     auto expected = M{1, 2, 4, 4, 5};
     assert(m == expected);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
    // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // cpp_17_input_iterator
-    using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+    using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -97,7 +100,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // greater
-    using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
     int ar[] = {5, 4, 4, 2, 1};
     auto m = M(std::sorted_equivalent,
                cpp17_input_iterator<const int*>(ar),
@@ -109,7 +112,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
     // contiguous iterator
     using C = test_less<int>;
-    using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+    using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
     int ar[1] = {42};
     auto m = M(std::sorted_equivalent, ar, ar, C(5));
     assert(m.empty());
@@ -118,7 +121,7 @@ void test() {
   {
     // flat_multiset(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+    using M  = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5, A1(5));
     auto expected = M{1, 2, 4, 4, 5};
@@ -134,7 +137,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
-    using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+    using M  = std::flat_multiset<int, C, KeyContainer<int, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     auto m = M(std::sorted_equivalent, ar, ar + 5, C(3), A1(5));
     assert((m == M{1, 2, 4, 4, 5}));
@@ -145,7 +148,7 @@ void test() {
     // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
     // explicit(false)
     using A1 = test_allocator<short>;
-    using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+    using M  = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
     int ar[] = {1, 2, 4, 4, 5};
     M m = {std::sorted_equivalent, ar, ar + 5, {}, A1(5)}; // implicit ctor
     assert((m == M{1, 2, 4, 4, 5}));
@@ -153,8 +156,22 @@ void test() {
   }
 }
 
+constexpr bool test() {
+  test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test<std::deque>();
+
+  return true;
+}
+
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
index 21f3c918dec0d..337ad04c9cd48 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
@@ -32,7 +32,7 @@ static_assert(HasStdErase<std::vector<int>>);
 static_assert(!HasStdErase<std::flat_multiset<int>>);
 
 template <class M>
-M make(std::initializer_list<int> vals) {
+constexpr M make(std::initializer_list<int> vals) {
   M ret;
   for (int v : vals)
     ret.emplace(v);
@@ -40,8 +40,8 @@ M make(std::initializer_list<int> vals) {
 }
 
 template <class M, class Pred>
-void test0(
-    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+constexpr void
+test0(std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
   M s = make<M>(vals);
   ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
@@ -50,11 +50,11 @@ void test0(
 
 struct NotBool {
   bool b;
-  explicit operator bool() const { return b; }
+  explicit constexpr operator bool() const { return b; }
 };
 
 template <class S>
-void test_one() {
+constexpr void test_one() {
   // Test all the plausible signatures for this predicate.
   auto is1 = [](typename S::const_reference v) { return v == 1; };
   auto is2 = [](typename S::value_type v) { return v == 2; };
@@ -96,18 +96,28 @@ void test_one() {
   test0<S>({1, 1, 2, 2, 3}, nonBoolIs1, {2, 2, 3}, 2);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::flat_multiset<int>>();
   test_one<std::flat_multiset<int, std::less<int>, std::vector<int, min_allocator<int>>>>();
   test_one<std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
-  test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
+    test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  }
   test_one<std::flat_multiset<long>>();
   test_one<std::flat_multiset<double>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
index 809f03df47977..878b2b2094f71 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
@@ -30,7 +30,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -68,9 +68,12 @@ void test_one() {
   assert(i == m.begin());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -89,10 +92,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
index cbf69d6e04904..ff4ad3f8f0279 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I = M::iterator;
@@ -141,15 +141,23 @@ void test_one() {
   assert(cri2 <=> cri1 == std::strong_ordering::greater);
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
index e25d786d9b3b4..678109b88f9fb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
@@ -25,46 +25,59 @@
 
 #include <iterator>
 
+#include "MinSequenceContainer.h"
 #include "test_macros.h"
+#include "min_allocator.h"
 
-void test() {
-  {
-    using M = std::flat_multiset<int, std::less<int>, std::deque<int>>;
-    M m = {1, 1, 2, 2, 3, 4};
-    int expected[] = {1, 1, 2, 2, 3, 4};
-    const M& cm = m;
-    ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator);
-    static_assert(noexcept(m.rbegin()));
-    static_assert(noexcept(cm.rbegin()));
-    static_assert(noexcept(m.crbegin()));
-    static_assert(noexcept(m.rend()));
-    static_assert(noexcept(cm.rend()));
-    static_assert(noexcept(m.crend()));
-    assert(m.size() == 6);
-    assert(std::distance(m.rbegin(), m.rend()) == 6);
-    assert(std::distance(cm.rbegin(), cm.rend()) == 6);
-    assert(std::distance(m.crbegin(), m.crend()) == 6);
-    assert(std::distance(cm.crbegin(), cm.crend()) == 6);
-    M::reverse_iterator i; // default-construct
-    ASSERT_SAME_TYPE(decltype(*i), const int&);
-    i = m.rbegin(); // move-assignment
-    M::const_reverse_iterator k = i; // converting constructor
-    assert(i == k); // comparison
-    for (int j = 5; j >= 0; --j, ++i) { // pre-increment
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rend());
-    for (int j = 0; j <= 5; ++j) {
-      --i; // pre-decrement
-      assert(*i == expected[j]);
-    }
-    assert(i == m.rbegin());
+template <class KeyContainer>
+constexpr void test_one() {
+  using Key = typename KeyContainer::value_type;
+  using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+  M m = {1, 1, 2, 2, 3, 4};
+  int expected[] = {1, 1, 2, 2, 3, 4};
+  const M& cm = m;
+  ASSERT_SAME_TYPE(decltype(m.rbegin()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.rend()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crend()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rend()), typename M::const_reverse_iterator);
+  static_assert(noexcept(m.rbegin()));
+  static_assert(noexcept(cm.rbegin()));
+  static_assert(noexcept(m.crbegin()));
+  static_assert(noexcept(m.rend()));
+  static_assert(noexcept(cm.rend()));
+  static_assert(noexcept(m.crend()));
+  assert(m.size() == 6);
+  assert(std::distance(m.rbegin(), m.rend()) == 6);
+  assert(std::distance(cm.rbegin(), cm.rend()) == 6);
+  assert(std::distance(m.crbegin(), m.crend()) == 6);
+  assert(std::distance(cm.crbegin(), cm.crend()) == 6);
+  typename M::reverse_iterator i; // default-construct
+  ASSERT_SAME_TYPE(decltype(*i), const int&);
+  i = m.rbegin(); // move-assignment
+  typename M::const_reverse_iterator k = i; // converting constructor
+  assert(i == k); // comparison
+  for (int j = 5; j >= 0; --j, ++i) { // pre-increment
+    assert(*i == expected[j]);
+  }
+  assert(i == m.rend());
+  for (int j = 0; j <= 5; ++j) {
+    --i; // pre-decrement
+    assert(*i == expected[j]);
   }
+  assert(i == m.rbegin());
+}
+
+constexpr bool test() {
+  test_one<std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+
   { // N3644 testing
     using C = std::flat_multiset<int>;
@@ -80,10 +93,15 @@ void test() {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
index 4d01ece7ed6a6..088a8838ad8ae 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptClear<std::flat_multiset<int, std::less<int>, ThrowOnMoveC
 #endif
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   {
@@ -58,17 +58,25 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
index 3ef13964c905e..6772e17378b70 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
@@ -28,7 +28,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = typename M::iterator;
@@ -91,7 +91,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = typename M::iterator;
 
@@ -111,16 +111,24 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_emplaceable<std::vector<Emplaceable>>();
-  test_emplaceable<std::deque<Emplaceable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_emplaceable<std::deque<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -130,6 +138,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
index 41a2e9c4ce115..ec99a9fcc1d9b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
@@ -27,11 +27,11 @@
 #include "../helpers.h"
 
 struct CompareTensDigit {
-  bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
+  constexpr bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using R   = M::iterator;
@@ -179,7 +179,6 @@ void test_one() {
     assert(r == m.begin() + 2);
     assert(m.size() == 7);
     assert(*r == 23);
-    assert(*std::next(r) == 20);
   }
   {
     // hint incorrect and after the last duplicate
@@ -196,7 +195,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
   using R = M::iterator;
 
@@ -216,9 +215,12 @@ void test_emplaceable() {
   assert(*r == Emplaceable(1, 3.5));
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
@@ -226,6 +228,8 @@ void test() {
   test_emplaceable<std::vector<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -235,6 +239,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
index 8418efa67bb23..f2cb151d8661b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
@@ -27,7 +27,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
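
All of these guards lean on TEST_IS_CONSTANT_EVALUATED, which is essentially a portable spelling of std::is_constant_evaluated(); a rough sketch of the idea (the real definition lives in the suite's test_macros.h and may differ in detail):

#include <type_traits>

// Rough equivalent of the guard: true only while the surrounding
// function is being constant-evaluated, e.g. inside static_assert(test()).
#if __cplusplus >= 202002L
#  define TEST_IS_CONSTANT_EVALUATED std::is_constant_evaluated()
#else
#  define TEST_IS_CONSTANT_EVALUATED false
#endif

constexpr bool in_constexpr_context() { return TEST_IS_CONSTANT_EVALUATED; }

static_assert(in_constexpr_context()); // during constant evaluation: true
// At run time, in_constexpr_context() returns false.
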
@@ -94,11 +94,16 @@ void test_one() {
   assert(i8 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -108,6 +113,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
index 2d54fef17b6c0..76078920af1bf 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
   using I   = M::iterator;
@@ -78,11 +78,16 @@ void test_one() {
   assert(i5 == m.end());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -92,6 +97,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
index 8175afa5b626e..7ddd3d8657066 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class Compare = std::less<>>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, Compare, KeyContainer>;
 
   auto make = [](std::initializer_list<int> il) {
@@ -74,12 +74,17 @@ void test_one() {
   assert(m.empty());
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
   test_one<std::vector<int>, std::greater<>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
+
+  return true;
 }
 
 void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
index a8765495d91d4..0613744ec5e39 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
@@ -38,10 +38,10 @@ static_assert(!CanErase<const NonTransparentSet>);
 
 template <class Key, class It>
 struct HeterogeneousKey {
-  explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
-  operator It() && { return it_; }
-  auto operator<=>(Key key) const { return key_ <=> key; }
-  friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
+  constexpr explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
+  constexpr operator It() && { return it_; }
+  constexpr auto operator<=>(Key key) const { return key_ <=> key; }
+  constexpr friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
     assert(false);
     return false;
   }
@@ -50,7 +50,7 @@ struct HeterogeneousKey {
 };
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using Key = typename KeyContainer::value_type;
   using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
 
@@ -70,7 +70,7 @@ void test_one() {
 }
 
 template <class KeyContainer>
-void test_transparent_comparator() {
+constexpr void test_transparent_comparator() {
   using M = std::flat_multiset<std::string, TransparentComparator, KeyContainer>;
   {
     M m = {"alpha", "beta", "beta", "epsilon", "epsilon", "epsilon", "eta", "eta", "gamma"};
@@ -95,14 +95,20 @@ void test_transparent_comparator() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_one<std::deque<int>>();
   test_one<MinSequenceContainer<int>>();
   test_one<std::vector<int, min_allocator<int>>>();
 
   test_transparent_comparator<std::vector<std::string>>();
-  test_transparent_comparator<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+    test_transparent_comparator<std::deque<std::string>>();
   test_transparent_comparator<MinSequenceContainer<std::string>>();
   test_transparent_comparator<std::vector<std::string, min_allocator<std::string>>>();
 
@@ -146,6 +152,8 @@ void test() {
     assert(n == 2);
     assert((m == M{"alpha", "epsilon", "eta", "gamma"}));
   }
+
+  return true;
 }
 
 void test_exception() {
@@ -159,6 +167,9 @@ void test_exception() {
 
 int main(int, char**) {
   test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
   test_exception();
 
   return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
index 8a66431396916..bb41cedf85497 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanExtract<std::flat_multiset<int> const&>);
 static_assert(!CanExtract<std::flat_multiset<int> const&&>);
 
 template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
   using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
   {
     M m = M({1, 1, 3});
@@ -55,9 +55,12 @@ void test_one() {
   }
 }
 
-void test() {
+constexpr bool test() {
   test_one<std::vector<int>>();
-  test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); @@ -70,6 +73,8 @@ void test() { check_invariant(m); LIBCPP_ASSERT(m.empty()); } + + return true; } void test_exception() { @@ -96,6 +101,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp index eeb1bdd26ca16..5128a40ada694 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using R = typename M::iterator; @@ -61,11 +61,16 @@ void test_one() { assert(*r == 1); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -79,6 +84,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp index 9c56d3bfb750b..f0b1eaf377c5d 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -65,11 +65,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -84,6 +89,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp index 61f00f5138118..55a77d576dacc 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp @@ -23,7 +23,7 @@ #include 
"min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using R = typename M::iterator; @@ -61,11 +61,16 @@ void test_one() { assert(*r == 1); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -80,6 +85,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp index 93815686787c4..9b10bf3fbb1a4 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp @@ -37,7 +37,7 @@ static_assert(!CanInsert<Set, int, int>); static_assert(!CanInsert<Set, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using M = std::flat_multiset<int, std::less<int>, KeyContainer>; int ar1[] = { @@ -75,9 +75,12 @@ void test_one() { assert(m == expected2); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); { @@ -86,6 +89,8 @@ void test() { m.insert(v.begin(), v.end()); assert(std::ranges::equal(m, std::vector<int>{1, 2, 3, 4})); } + + return true; } void test_exception() { @@ -95,6 +100,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp index 9976c04c9973a..8bbc6c80e4ef7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp @@ -22,7 +22,7 @@ #include "test_macros.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; using V = Key; @@ -59,15 +59,22 @@ void test_one() { assert(*r == V(1)); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); test_one<std::vector<MoveOnly>>(); - test_one<std::deque<int>>(); - test_one<std::deque<MoveOnly>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_one<std::deque<int>>(); + test_one<std::deque<MoveOnly>>(); + } test_one<MinSequenceContainer<int>>(); test_one<MinSequenceContainer<MoveOnly>>(); test_one<std::vector<int, 
min_allocator<int>>>(); test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>(); + + return true; } void test_exception() { @@ -82,6 +89,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp index 566be3921bf77..a9d8f7e330fed 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp @@ -39,7 +39,7 @@ static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<int, int>*>>) static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<short, short>*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { @@ -72,9 +72,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); { @@ -85,6 +88,8 @@ void test() { MoveOnly expected[] = {1, 1, 3, 4, 5}; assert(std::ranges::equal(m, expected)); } + + return true; } void test_exception() { @@ -94,6 +99,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp index 9328c42fb0cda..67f3036a8dae7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp @@ -25,7 +25,7 @@ #include "../helpers.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; using R = typename M::iterator; @@ -63,15 +63,22 @@ void test_one() { assert(*r == V(1)); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); test_one<std::vector<MoveOnly>>(); - test_one<std::deque<int>>(); - test_one<std::deque<MoveOnly>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + { + test_one<std::deque<int>>(); + test_one<std::deque<MoveOnly>>(); + } test_one<MinSequenceContainer<int>>(); test_one<MinSequenceContainer<MoveOnly>>(); test_one<std::vector<int, min_allocator<int>>>(); test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>(); + + return true; } void test_exception() { @@ -86,6 +93,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp index 
11af199c3d1ee..81b7e4e196b30 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -42,11 +42,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -61,6 +66,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp index 07b62d04e0ebc..bfb230718fb6f 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanInsert<Set, std::sorted_equivalent_t, int, int>); static_assert(!CanInsert<Set, std::sorted_equivalent_t, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -60,11 +60,16 @@ void test_one() { assert(m == expected2); } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -76,6 +81,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp index 5fe61389d72a1..3c74cf6ebe995 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp @@ -31,7 +31,7 @@ static_assert(CanReplace<Set, std::vector<int>>); static_assert(!CanReplace<Set, const std::vector<int>&>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -53,11 +53,16 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if 
(!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } void test_exception() { @@ -82,6 +87,9 @@ void test_exception() { int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif test_exception(); return 0; diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp index 2e3ed02c3c00e..241f2cf9e0a73 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp @@ -38,7 +38,7 @@ static_assert(NoExceptAdlSwap<std::flat_multiset<int, std::less<int>, ThrowOnMov #endif template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; @@ -84,15 +84,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp index 1d0d9152d1c1f..7ad96ed340955 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp @@ -37,7 +37,7 @@ static_assert(NoExceptMemberSwap<std::flat_multiset<int, std::less<int>, ThrowOn #endif template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>; { @@ -82,15 +82,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp index 4ca64516e242f..74c92f3a3f843 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp @@ -21,7 +21,7 @@ #include "test_macros.h" -void test() { +constexpr bool test() { { using M = std::flat_multiset<int>; using Comp = std::less<int>; // the default @@ -36,7 +36,7 @@ void test() { assert(vc(1, 2)); assert(!vc(2, 1)); } - { + if 
(!TEST_IS_CONSTANT_EVALUATED) { using Comp = std::function<bool(int, int)>; using M = std::flat_multiset<int, Comp>; Comp comp = std::greater<int>(); @@ -67,10 +67,15 @@ void test() { assert(vc(1, 2)); assert(!vc(2, 1)); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp index 00fda6c2edd88..a178dfd3d0cb5 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp index abee2b1bb12f9..3222762122f88 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp @@ -35,7 +35,7 @@ static_assert(!CanContains<NonTransparentSet>); static_assert(!CanContains<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -60,9 +60,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -82,10 +85,15 @@ void test() { assert(m.contains("beta")); assert(!m.contains("charlie")); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp index 1752dab0e0e3a..8b034dfa1423c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp @@ -23,7 +23,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void 
test_one() { using Key = typename KeyContainer::value_type; using S = typename KeyContainer::size_type; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp index a9160aebb7517..a1a0d6b1f0310 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp @@ -35,7 +35,7 @@ static_assert(!CanCount<NonTransparentSet>); static_assert(!CanCount<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; { @@ -59,9 +59,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -81,10 +84,15 @@ void test() { auto n = m.count("beta"); assert(n == 2); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp index 54ae27e9ba19c..b105d1914113a 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -74,15 +74,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp index ae16ec1127f31..65bff7a095dc6 100644 --- 
a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanEqualRange<NonTransparentSet>); static_assert(!CanEqualRange<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -90,9 +90,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -113,10 +116,15 @@ void test() { assert(first == m.begin() + 1); assert(last == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp index 49386a6f77fae..bc9a439eecbb9 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp @@ -25,7 +25,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, std::less<>, KeyContainer>; { @@ -50,15 +50,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp index 9d0b75c7b52bc..4c9c403464634 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanFind<NonTransparentSet>); static_assert(!CanFind<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -77,9 +77,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -101,10 
+104,15 @@ void test() { auto it2 = m.find("charlie"); assert(it2 == m.end()); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp index ba41b822fda74..07f053316ad32 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -66,15 +66,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp index c03fb27a7c27e..e674c85ab30e6 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanLowerBound<NonTransparentSet>); static_assert(!CanLowerBound<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -83,9 +83,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -107,10 +110,15 @@ void test() { auto it2 = m.lower_bound("charlie"); assert(it2 == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp index 7828f0500c8b9..d4d19926571d7 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp @@ -24,7 +24,7 @@ #include "min_allocator.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { using M = 
std::flat_multiset<Key, std::less<>, KeyContainer>; @@ -67,15 +67,23 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp index de517fd7e520a..75140a780cceb 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp @@ -36,7 +36,7 @@ static_assert(!CanUpperBound<NonTransparentSet>); static_assert(!CanUpperBound<const NonTransparentSet>); template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>; @@ -83,9 +83,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<std::string>>(); - test_one<std::deque<std::string>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<std::string>>(); test_one<MinSequenceContainer<std::string>>(); test_one<std::vector<std::string, min_allocator<std::string>>>(); @@ -105,10 +108,15 @@ void test() { auto it = m.upper_bound("beta"); assert(it == m.begin() + 3); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h index e7ed8a091d3be..82f917756e92c 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h @@ -20,7 +20,7 @@ #include "test_macros.h" template <class... 
Args> -void check_invariant(const std::flat_multiset<Args...>& m) { +constexpr void check_invariant(const std::flat_multiset<Args...>& m) { assert(std::is_sorted(m.begin(), m.end(), m.key_comp())); } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp index 94f0f2b34abcc..606cdfc3ba7d2 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp @@ -31,7 +31,7 @@ #include "test_container_comparisons.h" template <class KeyContainer> -void test_one() { +constexpr void test_one() { using Key = typename KeyContainer::value_type; { @@ -64,9 +64,12 @@ void test_one() { } } -void test() { +constexpr bool test() { test_one<std::vector<int>>(); - test_one<std::deque<int>>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test_one<std::deque<int>>(); test_one<MinSequenceContainer<int>>(); test_one<std::vector<int, min_allocator<int>>>(); @@ -81,7 +84,7 @@ void test() { { // Comparisons use value_type's native operators, not the comparator struct StrongComp { - bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } + constexpr bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } }; using C = std::flat_multiset<double, StrongComp>; C s1 = {1}; @@ -96,10 +99,15 @@ void test() { assert(s1 != s2); assert((s1 <=> s2) == std::partial_ordering::unordered); } + + return true; } int main(int, char**) { test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp index 1f8a044d0b602..59d93ac7ea411 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIterator> deque(InputIterator f, InputIterator l); #include "asan_testing.h" +#include <algorithm> #include <deque> #include <cassert> #include <cstddef> @@ -28,13 +29,11 @@ void test(InputIterator f, InputIterator l) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef std::allocator<T> Allocator; typedef std::deque<T, Allocator> C; - typedef typename C::const_iterator const_iterator; C d(f, l); assert(d.size() == static_cast<std::size_t>(std::distance(f, l))); assert(static_cast<std::size_t>(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } template <class Allocator, class InputIterator> diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp index 61318c3d0f2d3..ef876bb272fc7 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // deque(InputIterator f, InputIterator l, const allocator_type& a); #include "asan_testing.h" +#include <algorithm> #include <deque> #include <cassert> #include <cstddef> @@ -28,14 +29,12 @@ template <class 
InputIterator, class Allocator> void test(InputIterator f, InputIterator l, const Allocator& a) { typedef typename std::iterator_traits<InputIterator>::value_type T; typedef std::deque<T, Allocator> C; - typedef typename C::const_iterator const_iterator; C d(f, l, a); assert(d.get_allocator() == a); assert(d.size() == static_cast<std::size_t>(std::distance(f, l))); assert(static_cast<std::size_t>(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } void basic_test() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp index e9fb2e6ecfbac..b862583c495e1 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIter> vector(InputIter first, InputIter last); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -24,8 +25,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { C c(first, last); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp index 71a176a0a64ba..3fe462eef80ed 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // template <class InputIter> vector(InputIter first, InputIter last, // const allocator_type& a); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -25,8 +26,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const typename C:: C c(first, last, a); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp index 1a6364a8018bc..f2ac013987eb8 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp @@ -10,6 +10,7 @@ // template <class InputIter> vector(InputIter first, InputIter last); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, 
++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } #if TEST_STD_VER >= 11 diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp index d1eff51011c4f..56a3778ddf965 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp @@ -11,6 +11,7 @@ // template <class InputIter> vector(InputIter first, InputIter last, // const allocator_type& a); +#include <algorithm> #include <vector> #include <cassert> #include <cstddef> @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const A& a) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast<std::size_t>(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } #if TEST_STD_VER >= 11 diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp index 00aa97a45cc24..72af0a2db1180 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp @@ -12,8 +12,7 @@ // This test requires the fix to https://llvm.org/PR60509 in the dylib, // which landed in 5afb937d8a30445642ccaf33866ee4cdd0713222. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <fstream> #include <cstddef> diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp index b04d2c07ebb1c..79d20ce68d11b 100644 --- a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/sync.pass.cpp @@ -13,8 +13,7 @@ // The fix for bug 51497 and bug 51499 requires an updated dylib due to // explicit instantiations. That means Apple backdeployment targets remain broken. 
-// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <istream> #include <cassert> diff --git a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp index 665a1a89223bc..a238b753d1f15 100644 --- a/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/reverse.iterators/reverse.iter.elem/arrow.pass.cpp @@ -17,10 +17,10 @@ // LWG 198 was superseded by LWG 2360 // http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360 - +#include <cassert> #include <iterator> #include <list> -#include <cassert> +#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp new file mode 100644 index 0000000000000..122e229fd6e47 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move constructor of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: The move constructor + std::exception_ptr p3{std::move(p2)}; + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp new file mode 100644 index 0000000000000..82b4713bed538 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// <exception> + +// typedef unspecified exception_ptr; + +// Test swapping of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p21 = std::make_exception_ptr(42); + std::exception_ptr p42 = std::make_exception_ptr(21); + std::swap(p42, p21); + + try { + std::rethrow_exception(p21); + } catch (int e) { + assert(e == 21); + } + try { + std::rethrow_exception(p42); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp index 9c06eee27e0c8..26c8e1bc7d66f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -20,30 +20,50 @@ #if TEST_STD_VER < 14 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 14 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 17 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 20 +# ifdef __cpp_lib_constexpr_flat_map +# 
error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_map # error "__cpp_lib_flat_map should not be defined before c++23" # endif #elif TEST_STD_VER == 23 +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + # ifndef __cpp_lib_flat_map # error "__cpp_lib_flat_map should be defined in c++23" # endif @@ -53,6 +73,13 @@ #elif TEST_STD_VER > 23 +# ifndef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_map != 202502L +# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_flat_map # error "__cpp_lib_flat_map should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp index 5985bdc2d7d4f..b29da9fdbe649 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp @@ -20,30 +20,50 @@ #if TEST_STD_VER < 14 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 14 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 17 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 20 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_flat_set # error "__cpp_lib_flat_set should not be defined before c++23" # endif #elif TEST_STD_VER == 23 +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifndef __cpp_lib_flat_set # error "__cpp_lib_flat_set should be defined in c++23" # endif @@ -53,6 +73,13 @@ #elif TEST_STD_VER > 23 +# ifndef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_set != 202502L +# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_flat_set # error "__cpp_lib_flat_set should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp index aca6290f5a4bf..c4e652979a4e6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp @@ -142,8 +142,8 @@ # ifndef __cpp_lib_optional # error "__cpp_lib_optional should be defined in c++26" # endif -# if 
__cpp_lib_optional != 202110L -# error "__cpp_lib_optional should have the value 202110L in c++26" +# if __cpp_lib_optional != 202506L +# error "__cpp_lib_optional should have the value 202506L in c++26" # endif # ifndef __cpp_lib_optional_range_support diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp index 0074f3bf4cc57..cb5c008f16bb3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp @@ -918,7 +918,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) +# if __has_builtin(__builtin_is_within_lifetime) # ifndef __cpp_lib_is_within_lifetime # error "__cpp_lib_is_within_lifetime should be defined in c++26" # endif @@ -927,7 +927,7 @@ # endif # else # ifdef __cpp_lib_is_within_lifetime -# error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!" +# error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!" # endif # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 05af1fb0cf14b..996ec29dce697 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -204,6 +204,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -1116,6 +1124,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -2130,6 +2146,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -3384,6 +3408,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error 
"__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -4860,6 +4892,14 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23" # endif +# ifdef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should not be defined before c++26" +# endif + +# ifdef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should not be defined before c++26" # endif @@ -6549,6 +6589,20 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26" # endif +# ifndef __cpp_lib_constexpr_flat_map +# error "__cpp_lib_constexpr_flat_map should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_map != 202502L +# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26" +# endif + +# ifndef __cpp_lib_constexpr_flat_set +# error "__cpp_lib_constexpr_flat_set should be defined in c++26" +# endif +# if __cpp_lib_constexpr_flat_set != 202502L +# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_constexpr_forward_list # error "__cpp_lib_constexpr_forward_list should be defined in c++26" # endif @@ -7256,7 +7310,7 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) +# if __has_builtin(__builtin_is_within_lifetime) # ifndef __cpp_lib_is_within_lifetime # error "__cpp_lib_is_within_lifetime should be defined in c++26" # endif @@ -7265,7 +7319,7 @@ # endif # else # ifdef __cpp_lib_is_within_lifetime -# error "__cpp_lib_is_within_lifetime should not be defined because it is unimplemented in libc++!" +# error "__cpp_lib_is_within_lifetime should not be defined when the requirement '__has_builtin(__builtin_is_within_lifetime)' is not met!" # endif # endif @@ -7455,8 +7509,8 @@ # ifndef __cpp_lib_optional # error "__cpp_lib_optional should be defined in c++26" # endif -# if __cpp_lib_optional != 202110L -# error "__cpp_lib_optional should have the value 202110L in c++26" +# if __cpp_lib_optional != 202506L +# error "__cpp_lib_optional should have the value 202506L in c++26" # endif # ifndef __cpp_lib_optional_range_support diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp index 9997b07134563..9861662bb59c7 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. 
// XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index c9ed59f3cb9aa..002fc4b1ec7ef 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -8,6 +8,7 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.fr_FR.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp index 0b7a38e5104cd..8fe74cdaca5e4 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp @@ -16,6 +16,8 @@ // Ensure that money_get::do_get works correctly when the input doesn't fit into the stack buffer // (100 characters currently). +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <cassert> #include <cstddef> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp index 371cf0e90c8d3..7ce267d0617b0 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp @@ -14,6 +14,7 @@ // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} // XFAIL: glibc-old-ru_RU-decimal-point +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp index c86df7e6b53bf..d83167d1ee458 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp
b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp index 478df7964f6d2..0531260487b9f 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp index 4b767fae871fa..0f2c81a805282 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index f9d7998b07ff4..733eea94fd9bd 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.fr_FR.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp index be1e397488468..24cc4fdb47f75 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.ru_RU.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp index 25046a8417083..d970b55eb704b 100644 --- 
a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp index 1c8710a008f27..9770912da9dcf 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp @@ -16,6 +16,8 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.en_US.UTF-8 #include <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp index d62a27a0f6ae9..22997ebbbc82d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, bool v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp index dea2be771e0c6..a4ef158954f59 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp index b131a41ceac34..45ede5a395c63 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git 
a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp index 7f034d487e57e..c3565c5bab11d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp index 8db40b9e0dcbc..9e84fa8a53afe 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp index d044898a1f828..e2868cfb37140 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include <locale> #include <ios> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp index 2f4dd42e1a20c..4f60835880422 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp index fed5b4a610fd4..57607e6d6a521 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, void* v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <cassert> #include <ios> #include <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp index 714c8dd8ccd9f..11216a3d111e3 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp index 70ae4b3ae9de0..5dd555eda1e56 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <locale> #include <ios> #include <cassert> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp index 31682fea43bc4..a388c0b15a840 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp index 57eedc8633be3..596d81cbc8c91 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. 
This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp index 8324ee317014d..8a9fd41501626 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp @@ -8,8 +8,7 @@ // The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of // FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. -// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 // <locale> diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. -// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp 
b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp index ea6e807ca47b5..400cfd78d94a3 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_constant_distribution<> D; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp index baf6108b7e2e8..8b3e21fc0932e 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_constant_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp index 24f7d4e18c36a..8ed56ecdd31e9 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp +++ 
b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_linear_distribution<> D; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp index 04ded2a1c9706..272d0b4c87459 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include <random> #include <cassert> #include <vector> +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_linear_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator<const double*> InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector<double> iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp index 00f9e2b846783..ecdc39701641d 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/deallocate_size.pass.cpp @@ -12,12 +12,14 @@ #include <string> #include <cassert> +#include <cstddef> #include <cstdint> #include <type_traits> #include "test_macros.h" -static int allocated_; +static std::uint64_t allocated_; +static std::uint64_t deallocated_; template <class T, class Sz> struct test_alloc { @@ -41,12 +43,12 @@ struct test_alloc { pointer allocate(size_type n, const void* = nullptr) { allocated_ += n; - return std::allocator<value_type>().allocate(n); + return std::allocator<value_type>().allocate(static_cast<std::size_t>(n)); } void deallocate(pointer p, size_type s) { - allocated_ -= s; - std::allocator<value_type>().deallocate(p, s); + deallocated_ += s; + std::allocator<value_type>().deallocate(p, static_cast<std::size_t>(s)); } template <class U> @@ -64,14 +66,13 @@ struct test_alloc { template <class Sz> void test() { - for (int i = 1; i < 1000; ++i) { - using Str = std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> >; + for (unsigned int i = 1; i < 1000; ++i) { { - Str s(i, 't'); - assert(allocated_ == 0 || allocated_ >= i); + std::basic_string<char, std::char_traits<char>, test_alloc<char, Sz> > s(i, 't'); + (void)s; } + assert(allocated_ == deallocated_); } - assert(allocated_ == 0); } int main(int, char**) { diff --git a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp index 
5eb3240699a81..8e5919539d94e 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp @@ -8,11 +8,13 @@ // UNSUPPORTED: no-exceptions -// After changing the alignment of the allocated pointer from 16 to 8, the exception -// thrown is no longer `bad_alloc` but instead length_error on systems using new -// headers but a dylib that doesn't contain 04ce0ba. +// This test fails when using a built library that does not contain +// 15860446a8c3, which changed the return value of max_size(). Without +// that change, the built library believes the max size to be one greater +// than it really is, and we fail to throw `length_error` from `string::resize()`, +// which is explicitly instantiated in the built library. // -// XFAIL: using-built-library-before-llvm-19 +// XFAIL: using-built-library-before-llvm-21 // <string> diff --git a/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp b/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp new file mode 100644 index 0000000000000..4af29d20943ea --- /dev/null +++ b/libcxx/test/std/time/time.traits/is.clock.compile.pass.cpp @@ -0,0 +1,230 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++20 + +// <chrono> +// +// template<class T> struct is_clock; +// template<class T> constexpr bool is_clock_v = is_clock<T>::value; + +#include <chrono> +#include <ratio> + +#include "test_macros.h" + +struct EmptyStruct {}; + +// Test structs missing required members +struct MissingRep { + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingRep>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingPeriod { + using rep = long; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingPeriod>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingDuration { + using rep = long; + using time_point = long; + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct MissingTimePoint { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + static constexpr bool is_steady = false; + static std::chrono::time_point<MissingTimePoint> now(); +}; + +struct MissingIsSteady { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingIsSteady>; + static time_point now(); +}; + +struct MissingNow { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<MissingNow>; + static constexpr bool is_steady = false; +}; + +// Valid clock types +struct ValidSteadyClock { + using rep = long long; + using period = std::nano; + using duration = std::chrono::nanoseconds; + using time_point = std::chrono::time_point<ValidSteadyClock>; + static constexpr bool is_steady = true; + static time_point now(); +}; + +struct ValidSystemClock { + using rep = long long; + using period = std::micro; + using duration = 
std::chrono::microseconds; + using time_point = std::chrono::time_point<ValidSystemClock>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with invalid is_steady type +struct WrongIsSteadyType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongIsSteadyType>; + static bool is_steady; // Not const bool + static time_point now(); +}; + +struct WrongIsSteadyNonBool { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongIsSteadyNonBool>; + static constexpr int is_steady = 1; // Not bool + static time_point now(); +}; + +// Test clocks with invalid now() return type +struct WrongNowReturnType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongNowReturnType>; + static constexpr bool is_steady = false; + static int now(); // Wrong return type +}; + +// Test clocks with invalid period type +struct WrongPeriodType { + using rep = long; + using period = int; // Not a ratio + using duration = std::chrono::seconds; + using time_point = std::chrono::time_point<WrongPeriodType>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with wrong duration type +struct WrongDurationType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::milliseconds; // Should be duration<long, ratio<1>> + using time_point = std::chrono::time_point<WrongDurationType>; + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test clocks with wrong time_point type +struct WrongTimePointType { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::duration<long, std::ratio<1>>; + using time_point = int; // Not a time_point + static constexpr bool is_steady = false; + static time_point now(); +}; + +struct WrongTimePointClock { + using rep = long; + using period = std::ratio<1>; + using duration = std::chrono::duration<long, std::ratio<1>>; + using time_point = std::chrono::time_point<ValidSystemClock>; // Wrong clock type + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Valid clock with time_point that has matching duration instead of matching clock +struct ValidClockWithDurationMatch { + using rep = int; + using period = std::milli; + using duration = std::chrono::duration<int, std::milli>; + using time_point = std::chrono::time_point<ValidSystemClock, duration>; // Valid: matches duration + static constexpr bool is_steady = false; + static time_point now(); +}; + +// Test both is_clock and is_clock_v +static_assert(std::chrono::is_clock<std::chrono::system_clock>::value); +static_assert(std::chrono::is_clock_v<std::chrono::system_clock>); + +// Test standard clock types +static_assert(std::chrono::is_clock_v<std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<std::chrono::high_resolution_clock>); + +// Test non-clock types +static_assert(!std::chrono::is_clock_v<EmptyStruct>); +static_assert(!std::chrono::is_clock_v<int>); +static_assert(!std::chrono::is_clock_v<void>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock::time_point>); +static_assert(!std::chrono::is_clock_v<std::chrono::seconds>); +static_assert(!std::chrono::is_clock_v<std::chrono::milliseconds>); + +// Test structs missing required members 
+static_assert(!std::chrono::is_clock_v<MissingRep>); +static_assert(!std::chrono::is_clock_v<MissingPeriod>); +static_assert(!std::chrono::is_clock_v<MissingDuration>); +static_assert(!std::chrono::is_clock_v<MissingTimePoint>); +static_assert(!std::chrono::is_clock_v<MissingIsSteady>); +static_assert(!std::chrono::is_clock_v<MissingNow>); + +// Test valid custom clocks +static_assert(std::chrono::is_clock_v<ValidSteadyClock>); +static_assert(std::chrono::is_clock_v<ValidSystemClock>); +static_assert(std::chrono::is_clock_v<ValidClockWithDurationMatch>); + +// cv-qualified and reference types +static_assert(std::chrono::is_clock_v<const std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<volatile std::chrono::system_clock>); +static_assert(std::chrono::is_clock_v<const volatile std::chrono::system_clock>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock&&>); +static_assert(!std::chrono::is_clock_v<const std::chrono::system_clock&>); + +// array and pointer types +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[]>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock[10]>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock*>); +static_assert(!std::chrono::is_clock_v<std::chrono::system_clock* const>); + +// The Standard defines a minimum set of checks and allows implementations to perform stricter checks. The following +// static asserts are implementation-specific, and a conforming standard library implementation doesn't have to produce +// the same outcome. + +// Test clocks with invalid is_steady type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyType>); // is_steady not const bool +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongIsSteadyNonBool>); // is_steady not bool type + +// Test clocks with invalid now() return type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongNowReturnType>); // now() doesn't return time_point + +// Test clocks with invalid period type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongPeriodType>); // period is not a ratio + +// Test clocks with wrong duration type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongDurationType>); // duration doesn't match duration<rep, period> + +// Test clocks with wrong time_point type +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointType>); // time_point is not a time_point +LIBCPP_STATIC_ASSERT(!std::chrono::is_clock_v<WrongTimePointClock>); // time_point has wrong clock and wrong duration diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp index 448c5ba143c10..ce331e59ffdb5 100644 --- a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp @@ -17,6 +17,8 @@ // size_t operator()(T val) const; // }; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // Not very portable #include <cassert> @@ -44,18 +46,14 @@ test() assert(h(&i) != h(&j)); } -// can't hash nullptr_t until C++17 -void test_nullptr() -{ -#if TEST_STD_VER > 14 - typedef std::nullptr_t T; - typedef std::hash<T> H; +void test_nullptr() { + typedef std::nullptr_t T; + typedef std::hash<T> H; #if TEST_STD_VER <= 17 - static_assert((std::is_same<typename H::argument_type, T>::value), "" ); - static_assert((std::is_same<typename H::result_type, std::size_t>::value), "" ); -#endif -
ASSERT_NOEXCEPT(H()(T())); + static_assert((std::is_same<typename H::argument_type, T>::value), ""); + static_assert((std::is_same<typename H::result_type, std::size_t>::value), ""); #endif + ASSERT_NOEXCEPT(H()(T())); } int main(int, char**) diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp index 32fc949354c69..e7540498b8de7 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.hash/hash_unique_ptr.pass.cpp @@ -90,12 +90,10 @@ int main(int, char**) test_enabled_with_deleter<A, PointerDeleter<A, 1>>(); test_enabled_with_deleter<A[], PointerDeleter<A[], 1>>(); -#if TEST_STD_VER > 14 test_disabled_with_deleter<int, PointerDeleter<int, 0>>(); test_disabled_with_deleter<int[], PointerDeleter<int[], 0>>(); test_disabled_with_deleter<A, PointerDeleter<A, 0>>(); test_disabled_with_deleter<A[], PointerDeleter<A[], 0>>(); -#endif } #endif diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp new file mode 100644 index 0000000000000..40c2273f1f862 --- /dev/null +++ b/libcxx/test/std/utilities/meta/meta.const.eval/is_within_lifetime.compile.pass.cpp @@ -0,0 +1,148 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: gcc-15, apple-clang-17 + +// <type_traits> + +// template <class T> +// consteval bool is_within_lifetime(const T*) noexcept; // C++26 + +#include <cassert> +#include <type_traits> +#include <utility> + +#include "test_macros.h" + +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<int*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const int*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<void*>())), bool); +ASSERT_SAME_TYPE(decltype(std::is_within_lifetime(std::declval<const void*>())), bool); + +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<int*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const int*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<void*>())); +ASSERT_NOEXCEPT(std::is_within_lifetime(std::declval<const void*>())); + +template <class T> +concept is_within_lifetime_exists = requires(T t) { std::is_within_lifetime(t); }; + +struct S {}; + +static_assert(is_within_lifetime_exists<int*>); +static_assert(is_within_lifetime_exists<const int*>); +static_assert(is_within_lifetime_exists<void*>); +static_assert(is_within_lifetime_exists<const void*>); +static_assert(!is_within_lifetime_exists<int>); // Not a pointer +static_assert(!is_within_lifetime_exists<decltype(nullptr)>); // Not a pointer +static_assert(!is_within_lifetime_exists<void() const>); // Not a pointer +static_assert(!is_within_lifetime_exists<int S::*>); // Doesn't accept pointer-to-data-member +static_assert(!is_within_lifetime_exists<void (S::*)()>); // Doesn't accept pointer-to-member-function +static_assert(!is_within_lifetime_exists<void (*)()>); // Doesn't match `const T*` + +consteval bool f() { + // Test that it works with global variables whose lifetime is in a + // different constant expression + { + static constexpr int i = 0; + static_assert(std::is_within_lifetime(&i)); + // (Even when cast to a different type) + static_assert(std::is_within_lifetime(const_cast<int*>(&i))); + static_assert(std::is_within_lifetime(static_cast<const void*>(&i))); + static_assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i)))); + static_assert(std::is_within_lifetime<const int>(&i)); + static_assert(std::is_within_lifetime<int>(const_cast<int*>(&i))); + static_assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i))); + static_assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i)))); + } + + { + static constexpr union { + int member1; + int member2; + } u{.member2 = 1}; + static_assert(!std::is_within_lifetime(&u.member1) && std::is_within_lifetime(&u.member2)); + } + + // Test that it works for variables inside the same constant expression + { + int i = 0; + assert(std::is_within_lifetime(&i)); + // (Even when cast to a different type) + assert(std::is_within_lifetime(const_cast<int*>(&i))); + assert(std::is_within_lifetime(static_cast<const void*>(&i))); + assert(std::is_within_lifetime(static_cast<void*>(const_cast<int*>(&i)))); + assert(std::is_within_lifetime<const int>(&i)); + assert(std::is_within_lifetime<int>(const_cast<int*>(&i))); + assert(std::is_within_lifetime<const void>(static_cast<const void*>(&i))); + assert(std::is_within_lifetime<void>(static_cast<void*>(const_cast<int*>(&i)))); + } + // Anonymous union + { + union { + int
member1; + int member2; + }; + assert(!std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2)); + member1 = 1; + assert(std::is_within_lifetime(&member1) && !std::is_within_lifetime(&member2)); + member2 = 1; + assert(!std::is_within_lifetime(&member1) && std::is_within_lifetime(&member2)); + } + // Variant members + { + struct X { + union { + int member1; + int member2; + }; + } x; + assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member1 = 1; + assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member2 = 1; + assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2)); + } + // Unions + { + union X { + int member1; + int member2; + } x; + assert(!std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member1 = 1; + assert(std::is_within_lifetime(&x.member1) && !std::is_within_lifetime(&x.member2)); + x.member2 = 1; + assert(!std::is_within_lifetime(&x.member1) && std::is_within_lifetime(&x.member2)); + } + { + S s; // uninitialised + assert(std::is_within_lifetime(&s)); + } + + return true; +} +static_assert(f()); + +// Check that it is a consteval (and consteval-propagating) function +// (i.e., taking the address of below will fail because it will be an immediate function) +template <typename T> +constexpr void does_escalate(T p) { + std::is_within_lifetime(p); +} +template <typename T, void (*)(T) = &does_escalate<T>> +constexpr bool check_escalated(int) { + return false; +} +template <typename T> +constexpr bool check_escalated(long) { + return true; +} +static_assert(check_escalated<int*>(0), ""); +static_assert(check_escalated<void*>(0), ""); diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. 
-// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp index df95a8df3793f..81234525923a1 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp @@ -21,7 +21,8 @@ template <typename T> constexpr bool test() { - std::optional<T> opt{T{}}; + std::remove_reference_t<T> t = std::remove_reference_t<T>{}; + std::optional<T> opt{t}; { // begin() is marked noexcept static_assert(noexcept(opt.begin())); @@ -53,6 +54,10 @@ constexpr bool tests() { assert(test<char>()); assert(test<const int>()); assert(test<const char>()); + assert(test<int&>()); + assert(test<char&>()); + assert(test<const int&>()); + assert(test<const char&>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp new file mode 100644 index 0000000000000..a79d1d51a5b11 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.iterator/borrowed_range.compile.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// <optional> + +// template <class T> class optional<T&>::iterator; +// template <class T> class optional<T&>::const_iterator; +// template <class T> +// constexpr bool ranges::enable_borrowed_range<optional<T&>> = true; + +#include <cassert> +#include <optional> +#include <ranges> + +template <typename T> +void borrowed_range() { + static_assert(std::ranges::enable_borrowed_range<std::optional<T&>>); + static_assert(std::ranges::range<std::optional<T&>> == std::ranges::borrowed_range<std::optional<T&>>); +} + +void test_borrowed_range() { + borrowed_range<int>(); + borrowed_range<const int>(); + borrowed_range<int[]>(); + borrowed_range<int[10]>(); + borrowed_range<int()>(); +} diff --git a/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp index 966c3e7441880..c62c9fc7746d6 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp @@ -17,6 +17,7 @@ #include <iterator> #include <optional> #include <ranges> +#include <type_traits> #include <utility> template <typename T> @@ -41,7 +42,8 @@ constexpr bool test() { assert(it2 == std::as_const(disengaged).end()); } - std::optional<T> engaged{T{}}; + std::remove_reference_t<T> t = std::remove_reference_t<T>{}; + std::optional<T> engaged{t}; { // end() != begin() if the optional is engaged auto it = engaged.end(); @@ -62,6 +64,10 @@ constexpr bool tests() { assert(test<char>()); assert(test<const int>()); assert(test<const char>()); + assert(test<int&>()); + assert(test<char&>()); + assert(test<const int&>()); + assert(test<const char&>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp index 1203290a0290a..671fac35e732a 100644 --- a/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp @@ -14,15 +14,23 @@ // template <class T> class optional::const_iterator; #include <cassert> -#include <iterator> #include <optional> #include <ranges> #include <type_traits> #include <utility> -template <typename T, T __val> +template <typename T> +constexpr bool test_range_concept() { + return std::ranges::range<std::optional<T>>; +} + +template <typename T, std::remove_reference_t<T> __val> constexpr bool test() { - std::optional<T> opt{__val}; + std::remove_reference_t<T> v{__val}; + std::optional<T> opt{v}; + { + assert(test_range_concept<T>()); + } { // Dereferencing an iterator of an engaged optional will return the same value that the optional holds. 
auto it = opt.begin(); @@ -41,13 +49,14 @@ constexpr bool test() { assert(std::random_access_iterator<decltype(it2)>); } - { // const_iterator::value_type == std::remove_cv_t<T>, const_iterator::reference == const T&, iterator::value_type = std::remove_cv_t<T>, iterator::reference == T& + { // const_iterator::value_type == std::remove_cvref_t<T>, const_iterator::reference == const T&, iterator::value_type = std::remove_cvref_t<T>, iterator::reference == T& + // std::remove_cv_t cannot be used for optional<T&> auto it = opt.begin(); auto it2 = std::as_const(opt).begin(); - assert((std::is_same_v<typename decltype(it)::value_type, std::remove_cv_t<T>>)); - assert((std::is_same_v<typename decltype(it)::reference, T&>)); - assert((std::is_same_v<typename decltype(it2)::value_type, std::remove_cv_t<T>>)); - assert((std::is_same_v<typename decltype(it2)::reference, const T&>)); + assert((std::is_same_v<typename decltype(it)::value_type, std::remove_cvref_t<T>>)); + assert((std::is_same_v<typename decltype(it)::reference, std::remove_reference_t<T>&>)); + assert((std::is_same_v<typename decltype(it2)::value_type, std::remove_cvref_t<T>>)); + assert((std::is_same_v<typename decltype(it2)::reference, const std::remove_reference_t<T>&>)); } { // std::ranges::size for an engaged optional<T> == 1, disengaged optional<T> == 0 @@ -68,13 +77,13 @@ constexpr bool test() { // An optional with a value that is reset will have begin() == end(), then when it is reassigned a value, // begin() != end(), and *begin() will contain the new value. { - std::optional<T> val{__val}; + std::optional<T> val{v}; assert(val.begin() != val.end()); val.reset(); assert(val.begin() == val.end()); - val.emplace(__val); + val.emplace(v); assert(val.begin() != val.end()); - assert(*(val.begin()) == __val); + assert(*(val.begin()) == v); } return true; @@ -86,6 +95,15 @@ constexpr bool tests() { assert((test<bool, true>())); assert((test<const int, 2>())); assert((test<const char, 'b'>())); + assert((test<int&, 1>())); + assert((test<char&, 'a'>())); + assert((test<bool&, true>())); + assert((test<const int&, 2>())); + assert((test<const char&, 'b'>())); + + assert(!test_range_concept<int (&)()>()); + assert(!test_range_concept<int (&)[]>()); + assert(!test_range_concept<int (&)[42]>()); return true; } diff --git a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp index 97305d976e066..133eed4a606bb 100644 --- a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp @@ -16,6 +16,7 @@ // template<class F> constexpr auto and_then(F&&) const&&; #include <cassert> +#include <concepts> #include <optional> #include "test_macros.h" @@ -257,8 +258,94 @@ constexpr bool test() { return true; } +#if TEST_STD_VER >= 26 +constexpr bool test_ref() { + // Test & overload + { + // Without & qualifier on F's operator() + { + int j = 42; + std::optional<int&> i{j}; + std::same_as<std::optional<int>> decltype(auto) r = i.and_then(LVal{}); + + assert(r == 1); + assert(i.and_then(NOLVal{}) == std::nullopt); + } + + // With & qualifier on F's operator() + { + int j = 42; + std::optional<int&> i{j}; + RefQual l{}; + NORefQual nl{}; + std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l); + + assert(r == 1); + assert(i.and_then(nl) == std::nullopt); + } + } + + // Test const& overload + { + // Without & qualifier on F's operator() + { + int j = 42; +
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
index 97305d976e066..133eed4a606bb 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/and_then.pass.cpp
@@ -16,6 +16,7 @@
 // template<class F> constexpr auto and_then(F&&) const&&;
 
 #include <cassert>
+#include <concepts>
 #include <optional>
 
 #include "test_macros.h"
@@ -257,8 +258,94 @@ constexpr bool test() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+constexpr bool test_ref() {
+  // Test & overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(LVal{});
+
+      assert(r == 1);
+      assert(i.and_then(NOLVal{}) == std::nullopt);
+    }
+
+    // With & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      RefQual l{};
+      NORefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l);
+
+      assert(r == 1);
+      assert(i.and_then(nl) == std::nullopt);
+    }
+  }
+
+  // Test const& overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<const int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(CLVal{});
+
+      assert(r == 1);
+      assert(i.and_then(NOCLVal{}) == std::nullopt);
+    }
+
+    // With const& qualifier on F's operator()
+    {
+      int j = 42;
+      const std::optional<int&> i{j};
+      const CRefQual l{};
+      const NOCRefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(l);
+
+      assert(r == 1);
+      assert(i.and_then(nl) == std::nullopt);
+    }
+  }
+
+  // Test && overload
+  {
+    // With && qualifier on F's operator()
+    {
+      int j = 42;
+      std::optional<int&> i{j};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(RVRefQual{});
+
+      assert(r == 1);
+      assert(i.and_then(NORVRefQual{}) == std::nullopt);
+    }
+  }
+
+  // Test const&& overload
+  {
+    // With const&& qualifier on F's operator()
+    {
+      int j = 42;
+      const std::optional<int&> i{j};
+      const RVCRefQual l{};
+      const NORVCRefQual nl{};
+      std::same_as<std::optional<int>> decltype(auto) r = i.and_then(std::move(l));
+
+      assert(r == 1);
+      assert(i.and_then(std::move(nl)) == std::nullopt);
+    }
+  }
+  return true;
+}
+#endif
+
 int main(int, char**) {
   test();
   static_assert(test());
+#if TEST_STD_VER >= 26
+  test_ref();
+  static_assert(test_ref());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
index ccc94ab9be2cb..de0a67c1579ee 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/or_else.pass.cpp
@@ -62,6 +62,32 @@ constexpr bool test() {
       return std::optional<MoveOnly>{};
     });
   }
+#if TEST_STD_VER >= 26
+  {
+    int i = 2;
+    std::optional<int&> opt;
+    assert(opt.or_else([&] { return std::optional<int&>{i}; }) == i);
+    int j = 3;
+    opt = j;
+    opt.or_else([] {
+      assert(false);
+      return std::optional<int&>{};
+    });
+    assert(opt == j);
+  }
+  {
+    int i = 2;
+    std::optional<int&> opt;
+    assert(std::move(opt).or_else([&] { return std::optional<int&>{i}; }) == i);
+    int j = 3;
+    opt = j;
+    std::move(opt).or_else([] {
+      assert(false);
+      return std::optional<int&>{};
+    });
+    assert(opt == j);
+  }
+#endif
 
   return true;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp b/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
index 0a151517b101c..ad2713f2ac5b8 100644
--- a/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.monadic/transform.pass.cpp
@@ -17,62 +17,64 @@
 
 #include "test_macros.h"
 #include <cassert>
+#include <concepts>
 #include <optional>
 #include <type_traits>
+#include <utility>
 
 struct LVal {
   constexpr int operator()(int&) { return 1; }
-  int operator()(const int&) = delete;
-  int operator()(int&&) = delete;
+  int operator()(const int&)  = delete;
+  int operator()(int&&)       = delete;
   int operator()(const int&&) = delete;
 };
 
 struct CLVal {
   int operator()(int&) = delete;
   constexpr int operator()(const int&) { return 1; }
-  int operator()(int&&) = delete;
+  int operator()(int&&)       = delete;
   int operator()(const int&&) = delete;
 };
 
 struct RVal {
-  int operator()(int&) = delete;
+  int operator()(int&)       = delete;
   int operator()(const int&) = delete;
   constexpr int operator()(int&&) { return 1; }
   int operator()(const int&&) = delete;
 };
 
 struct CRVal {
-  int operator()(int&) = delete;
+  int operator()(int&)       = delete;
   int operator()(const int&) = delete;
-  int operator()(int&&) = delete;
+  int operator()(int&&)      = delete;
   constexpr int operator()(const int&&) { return 1; }
 };
 
 struct RefQual {
   constexpr int operator()(int) & { return 1; }
-  int operator()(int) const& = delete;
-  int operator()(int) && = delete;
+  int operator()(int) const&  = delete;
+  int operator()(int) &&      = delete;
   int operator()(int) const&& = delete;
 };
 
 struct CRefQual {
   int operator()(int) & = delete;
   constexpr int operator()(int) const& { return 1; }
-  int operator()(int) && = delete;
+  int operator()(int) &&      = delete;
   int operator()(int) const&& = delete;
 };
 
 struct RVRefQual {
-  int operator()(int) & = delete;
+  int operator()(int) &      = delete;
  int operator()(int) const& = delete;
   constexpr int operator()(int) && { return 1; }
   int operator()(int) const&& = delete;
 };
 
 struct RVCRefQual {
-  int operator()(int) & = delete;
+  int operator()(int) &      = delete;
   int operator()(int) const& = delete;
-  int operator()(int) && = delete;
+  int operator()(int) &&     = delete;
   constexpr int operator()(int) const&& { return 1; }
 };
 
@@ -83,7 +85,7 @@ struct NoCopy {
 };
 
 struct NoMove {
-  NoMove() = default;
+  NoMove()         = default;
   NoMove(NoMove&&) = delete;
   NoMove operator()(const NoCopy&&) { return NoMove{}; }
 };
@@ -200,8 +202,111 @@ constexpr bool test() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+constexpr bool test_ref() {
+  {
+    std::optional<int&> opt1;
+    std::same_as<std::optional<int>> decltype(auto) opt1r = opt1.transform([](int i) { return i + 2; });
+    assert(!opt1);
+    assert(!opt1r);
+  }
+
+  {
+    int i = 42;
+    std::optional<int&> opt{i};
+    std::same_as<std::optional<int>> decltype(auto) o2 = opt.transform([](int j) { return j + 2; });
+
+    assert(*o2 == 44);
+  }
+
+  // Test & overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(LVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      RefQual l{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(l);
+
+      assert(*o3 == 1);
+    }
+  }
+
+  // const& overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<const int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::as_const(opt).transform(CLVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With const& qualifier on F's operator()
+    {
+      int i = 42;
+      const std::optional<int&> opt{i};
+      const CRefQual l{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(l);
+
+      assert(*o3 == 1);
+    }
+  }
+
+  // Test && overload
+  {
+    // Without & qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::move(opt).transform(RVal{});
+
+      assert(*o3 == 1);
+    }
+
+    // With && qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      std::same_as<std::optional<int>> decltype(auto) o3 = std::move(opt).transform(RVRefQual{});
+      assert(*o3 == 1);
+    }
+  }
+
+  // const&& overload
+  {
+    // With const&& qualifier on F's operator()
+    {
+      int i = 42;
+      std::optional<int&> opt{i};
+      const RVCRefQual rvc{};
+      std::same_as<std::optional<int>> decltype(auto) o3 = opt.transform(std::move(rvc));
+      assert(*o3 == 1);
+    }
+  }
+
+  {
+    std::optional<int&> o6 = std::nullopt;
+    auto o6r = o6.transform([](int) { return 42; });
+    assert(!o6r);
+  }
+  return true;
+}
+#endif
+
 int main(int, char**) {
   test();
   static_assert(test());
+#if TEST_STD_VER >= 26
+  test_ref();
+  static_assert(test_ref());
+#endif
 
   return 0;
 }
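Taken together, the monadic tests above establish that for reference optionals the callable observes the referred-to object while the result is an ordinary value optional. A minimal sketch of that behavior, illustrative only and again assuming a P2988R12 implementation:

#include <cassert>
#include <optional>

int main() {
  int x = 41;
  std::optional<int&> opt{x};

  // transform() wraps the callable's result in a plain optional<int>.
  std::optional<int> plus_one = opt.transform([](int v) { return v + 1; });
  assert(plus_one == 42);

  // and_then()'s callable must itself return an optional.
  std::optional<int> halved = opt.and_then([](int v) -> std::optional<int> {
    if (v % 2 == 0)
      return v / 2;
    return std::nullopt;
  });
  assert(!halved); // 41 is odd

  // or_else() invokes its callable only when the optional is disengaged.
  int fallback = 7;
  std::optional<int&> empty;
  assert(empty.or_else([&] { return std::optional<int&>{fallback}; }) == 7);
  return 0;
}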
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
index eaca111b72dca..ddb9ffc4bf80c 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/assign_value.pass.cpp
@@ -250,6 +250,57 @@ constexpr T pr38638(T v)
     return *o + 2;
 }
 
+#if TEST_STD_VER >= 26
+
+template <typename T, std::remove_reference_t<T> _Val>
+constexpr void test_with_ref() {
+  T t{_Val};
+  { // to empty
+    optional<T&> opt;
+    opt = t;
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // to existing
+    optional<T&> opt{t};
+    opt = t;
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // test assignment from a braced-init-list
+    optional<T&> opt;
+    opt = {t};
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+  }
+  { // test assignment from an empty braced-init-list (disengages the optional)
+    optional<T&> opt{t};
+    opt = {};
+    assert(static_cast<bool>(opt) == false);
+  }
+  // test two objects, make sure that the optional only changes what it holds a reference to
+  {
+    T t2{_Val};
+    optional<T&> opt{t};
+    opt = t2;
+
+    assert(std::addressof(*opt) != std::addressof(t));
+    assert(std::addressof(*opt) == std::addressof(t2));
+  }
+  // test that reassigning the reference for an optional<T&> doesn't affect the object it's holding a reference to
+  {
+    int i = -1;
+    int j = 2;
+    optional<int&> opt{i};
+    opt = j;
+
+    assert(i == -1);
+    assert(std::addressof(*opt) != std::addressof(i));
+    assert(std::addressof(*opt) == std::addressof(j));
+    assert(*opt == 2);
+  }
+}
+#endif
 
 int main(int, char**)
 {
@@ -281,5 +332,8 @@ int main(int, char**)
 
     static_assert(pr38638(3) == 5, "");
 
-    return 0;
+#if TEST_STD_VER >= 26
+    test_with_ref<int, 3>();
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
index 245d8ff3d2146..629e315add4d9 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace.pass.cpp
@@ -221,6 +221,24 @@ TEST_CONSTEXPR_CXX20 bool test_empty_emplace() {
   return true;
 }
 
+#if TEST_STD_VER >= 26
+template <class T, std::remove_reference_t<T> _Val>
+constexpr bool test_ref() {
+  using Opt = std::optional<T&>;
+  T t{_Val};
+  {
+    Opt opt;
+    auto& v = opt.emplace(t);
+    static_assert(std::is_same_v<T&, decltype(v)>);
+    assert(static_cast<bool>(opt) == true);
+    assert(*opt == t);
+    assert(&v == &*opt);
+    assert(&t == &*opt);
+  }
+  return true;
+}
+#endif
+
 int main(int, char**)
 {
     {
@@ -291,6 +309,11 @@ int main(int, char**)
         }
     }
 #endif
-
-    return 0;
+#if TEST_STD_VER >= 26
+    static_assert(test_ref<int, 1>());
+    static_assert(test_ref<double, 15.0>());
+    assert((test_ref<int, 1>()));
+    assert((test_ref<double, 15.0>()));
+#endif
+    return 0;
 }
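The assignment and emplace tests above both hinge on rebinding semantics: assigning to an engaged optional<T&> retargets the reference and never assigns through it. A short sketch (illustrative only, assuming P2988R12):

#include <cassert>
#include <memory>
#include <optional>

int main() {
  int a = 1;
  int b = 2;
  std::optional<int&> opt{a};

  // Assignment rebinds the optional to b; a is left untouched.
  opt = b;
  assert(a == 1);
  assert(std::addressof(*opt) == std::addressof(b));

  // emplace() rebinds as well and returns a reference to the new target.
  int& r = opt.emplace(a);
  assert(std::addressof(r) == std::addressof(a));
  return 0;
}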
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
index 775d2bde7d13d..c5281783d4350 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ctor.verify.cpp
@@ -23,18 +23,26 @@ struct NonDestructible { ~NonDestructible() = delete; };
 
 int main(int, char**) {
-  {
-    std::optional<char &> o1;          // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}}
-    std::optional<NonDestructible> o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}}
-    std::optional<char[20]> o3;        // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an array type is ill-formed}}
-  }
-
-  {
+  {
+#if TEST_STD_VER >= 26
+    std::optional<int&&>
+        opt2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an rvalue reference type is ill-formed}}
+#else
+    std::optional<char&>
+        o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}}
+#endif
+    std::optional<NonDestructible>
+        o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}}
+    std::optional<char[20]>
+        o3; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an array type is ill-formed}}
+  }
+
+  {
     std::optional<               std::in_place_t> o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<const          std::in_place_t> o2; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<      volatile std::in_place_t> o3; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
     std::optional<const volatile std::in_place_t> o4; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with in_place_t is ill-formed}}
-  }
+  }
 
   {
     std::optional<               std::nullopt_t> o1; // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with nullopt_t is ill-formed}}
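The constructor tests that follow check that copying or moving an optional<T&> is shallow: the binding is duplicated, not the referent. Sketched (illustrative only, assuming P2988R12):

#include <cassert>
#include <memory>
#include <optional>
#include <utility>

int main() {
  int x = 5;
  std::optional<int&> a{x};

  // Copy and move both duplicate the binding; all three alias the same int.
  std::optional<int&> b = a;
  std::optional<int&> c = std::move(a);
  assert(std::addressof(*b) == std::addressof(x));
  assert(std::addressof(*c) == std::addressof(x));
  assert(a.has_value()); // the moved-from optional stays engaged
  return 0;
}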
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
index f856c1d41d05a..f59fc3b82ad7f 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/move.pass.cpp
@@ -78,71 +78,71 @@ void test_ref(InitArgs&&... args)
   assert(&(*lhs) == &(*rhs));
 }
 
-void test_reference_extension()
-{
-#if defined(_LIBCPP_VERSION) && 0 // FIXME these extensions are currently disabled.
-  using T = TestTypes::TestType;
-  T::reset();
-  {
-    T t;
-    T::reset_constructors();
-    test_ref<T&>();
-    test_ref<T&>(t);
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::destroyed == 1);
-  assert(T::alive == 0);
-  {
-    T t;
-    const T& ct = t;
-    T::reset_constructors();
-    test_ref<T const&>();
-    test_ref<T const&>(t);
-    test_ref<T const&>(ct);
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    T t;
-    T::reset_constructors();
-    test_ref<T&&>();
-    test_ref<T&&>(std::move(t));
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    T t;
-    const T& ct = t;
-    T::reset_constructors();
-    test_ref<T const&&>();
-    test_ref<T const&&>(std::move(t));
-    test_ref<T const&&>(std::move(ct));
-    assert(T::alive == 1);
-    assert(T::constructed == 0);
-    assert(T::assigned == 0);
-    assert(T::destroyed == 0);
-  }
-  assert(T::alive == 0);
-  assert(T::destroyed == 1);
-  {
-    static_assert(!std::is_copy_constructible<std::optional<T&&>>::value, "");
-    static_assert(!std::is_copy_constructible<std::optional<T const&&>>::value, "");
-  }
+void test_reference_extension() {
+#if TEST_STD_VER >= 26
+  using T = TestTypes::TestType;
+  T::reset();
+  {
+    T t;
+    T::reset_constructors();
+    test_ref<T&>();
+    test_ref<T&>(t);
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::destroyed == 1);
+  assert(T::alive == 0);
+  {
+    T t;
+    const T& ct = t;
+    T::reset_constructors();
+    test_ref<T const&>();
+    test_ref<T const&>(t);
+    test_ref<T const&>(ct);
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+# if 0 // FIXME: optional<T&&> is not allowed.
+  {
+    T t;
+    T::reset_constructors();
+    test_ref<T&&>();
+    test_ref<T&&>(std::move(t));
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+  {
+    T t;
+    const T& ct = t;
+    T::reset_constructors();
+    test_ref<T const&&>();
+    test_ref<T const&&>(std::move(t));
+    test_ref<T const&&>(std::move(ct));
+    assert(T::alive == 1);
+    assert(T::constructed == 0);
+    assert(T::assigned == 0);
+    assert(T::destroyed == 0);
+  }
+  assert(T::alive == 0);
+  assert(T::destroyed == 1);
+  {
+    static_assert(!std::is_copy_constructible_v<std::optional<T&&>>);
+    static_assert(!std::is_copy_constructible_v<std::optional<T const&&>>);
+  }
+# endif
 #endif
 }
 
-
 int main(int, char**)
 {
   test<int>();
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp
new file mode 100644
index 0000000000000..01b241ffbe79b
--- /dev/null
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_constructs_from_temporary.verify.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++26
+
+// <optional>
+
+#include <optional>
+#include <utility>
+
+struct X {
+  int i;
+
+  X(int j) : i(j) {}
+};
+
+int main(int, char**) {
+  const std::optional<int> _co(1);
+  std::optional<int> _o(1);
+
+  // expected-error-re@*:* 8 {{call to deleted constructor of 'std::optional<{{.*}}>'}}
+  std::optional<const int&> o1{1};                     // optional(U&&)
+  std::optional<const int&> o2{std::optional<int>(1)}; // optional(optional<U>&&)
+  std::optional<const int&> o3{_co};                   // optional(const optional<U>&)
+  std::optional<const int&> o4{_o};                    // optional(optional<U>&)
+  std::optional<const X&> o5{1};                       // optional(U&&)
+  std::optional<const X&> o6{std::optional<int>(1)};   // optional(optional<U>&&)
+  std::optional<const X&> o7{_co};                     // optional(const optional<U>&)
+  std::optional<const X&> o8{_o};                      // optional(optional<U>&)
+}
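The verify test above expects every constructor that would bind a const T& to a temporary to be deleted. In user code the distinction looks like this (illustrative only, assuming P2988R12):

#include <optional>

int main() {
  int x = 1;
  std::optional<const int&> bound{x}; // OK: binds to an lvalue

  // Ill-formed: would bind the reference to a temporary int, so the
  // constructor is deleted (detected via reference_constructs_from_temporary).
  // std::optional<const int&> dangling{1};
  return 0;
}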
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp
new file mode 100644
index 0000000000000..57552743af138
--- /dev/null
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/ref_t.pass.cpp
@@ -0,0 +1,76 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++26
+
+// <optional>
+
+#include <cassert>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+template <typename RefType, std::remove_reference_t<RefType> _Val>
+constexpr bool test() {
+  std::remove_reference_t<RefType> item{_Val};
+  std::optional<RefType> opt{item};
+
+  {
+    assert(*opt == item);
+    assert(&(*opt) == &item);
+  }
+  {
+    assert(*std::as_const(opt) == item);
+    assert(&(*std::as_const(opt)) == &item);
+  }
+
+  return true;
+}
+
+template <typename T>
+constexpr T foo(T val) {
+  return val;
+}
+
+template <typename T, T _Val>
+constexpr bool fn_ref_test() {
+  std::optional<T (&)(T)> opt{foo<T>};
+  assert(opt.has_value());
+  assert((*opt)(_Val) == _Val);
+
+  return true;
+}
+
+template <typename T, T _Val>
+constexpr bool array_ref_test() {
+  T arr[5]{};
+  std::optional<T (&)[5]> opt{arr};
+
+  assert(opt.has_value());
+  (*opt)[0] = _Val;
+  assert((*opt)[0] == _Val);
+  assert(arr[0] == _Val);
+
+  return true;
+}
+
+constexpr bool tests() {
+  assert((test<int&, 1>()));
+  assert((test<double&, 1.0>()));
+  assert((fn_ref_test<int, 1>()));
+  assert((array_ref_test<int, 1>()));
+  assert((fn_ref_test<double, 1.0>()));
+  assert((array_ref_test<double, 1.0>()));
+  return true;
+}
+
+int main(int, char**) {
+  static_assert(tests());
+  tests();
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
index c0044276ea9ad..1202879036f56 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.dtor/dtor.pass.cpp
@@ -11,9 +11,9 @@
 
 // ~optional();
 
+#include <cassert>
 #include <optional>
 #include <type_traits>
-#include <cassert>
 
 #include "test_macros.h"
 
@@ -64,6 +64,24 @@ int main(int, char**)
     }
     assert(X::dtor_called == true);
   }
+#if TEST_STD_VER >= 26
+  {
+    typedef X& T;
+    static_assert(std::is_trivially_destructible_v<T>);
+    static_assert(std::is_trivially_destructible_v<optional<T>>);
+  }
+  X::dtor_called = false;
+  X x;
+  {
+    optional<X&> opt{x};
+    assert(X::dtor_called == false);
+  }
+  assert(X::dtor_called == false);
 
-  return 0;
+  {
+    static_assert(std::is_trivially_destructible_v<X (&)()>);
+    static_assert(std::is_trivially_destructible_v<optional<X (&)()>>);
+  }
+#endif
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
index 7029b37cbecd7..e23e481f6a05d 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.mod/reset.pass.cpp
@@ -69,5 +69,16 @@ int main(int, char**)
     X::dtor_called = false;
   }
 
-  return 0;
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    optional<X&> opt(x);
+    X::dtor_called = false;
+    opt.reset();
+    assert(X::dtor_called == false);
+    assert(static_cast<bool>(opt) == false);
+  }
+#endif
+
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
index 49b4d21a28066..6c1bf8aa15a8d 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference.pass.cpp
@@ -50,7 +50,20 @@ int main(int, char**)
     optional<X> opt(X{});
     assert((*opt).test() == 4);
   }
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    optional<X&> opt(x);
+    ASSERT_SAME_TYPE(decltype(*opt), X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    X x{};
+    optional<X&> opt(x);
+    assert((*opt).test() == 4);
+  }
+#endif
   static_assert(test() == 7, "");
 
-  return 0;
+  return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
index ff86d9534faf6..c15d4e4af74cc 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/dereference_const.pass.cpp
@@ -43,6 +43,25 @@ int main(int, char**)
     constexpr optional<X> opt(X{});
     static_assert((*opt).test() == 3, "");
   }
+#if TEST_STD_VER >= 26
+  {
+    X x{};
+    const optional<X&> opt{x};
+    ASSERT_SAME_TYPE(decltype(*opt), X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    X x{};
+    const optional<const X&> opt{x};
+    ASSERT_SAME_TYPE(decltype(*opt), const X&);
+    ASSERT_NOEXCEPT(*opt);
+  }
+  {
+    static constexpr X x{};
+    constexpr optional<const X&> opt(x);
+    static_assert((*opt).test() == 3);
+  }
+#endif
   {
     constexpr optional<Y> opt(Y{});
     assert((*opt).test() == 2);
diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp
index 6998e023022c5..9873a767cfbe6 100644
--- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp
+++
b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/has_value.pass.cpp @@ -33,6 +33,13 @@ int main(int, char**) constexpr optional<int> opt(0); static_assert(opt.has_value(), ""); } +#if TEST_STD_VER >= 26 + { + static constexpr int i = 0; + constexpr optional<const int&> opt{i}; + static_assert(opt.has_value()); + } +#endif - return 0; + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp index 2b5fba546ef42..96d22743ac7fe 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow.pass.cpp @@ -19,9 +19,9 @@ using std::optional; -struct X -{ - int test() noexcept {return 3;} +struct X { + int test() noexcept { return 3; } + int test() const noexcept { return 3; } }; struct Y @@ -47,6 +47,30 @@ int main(int, char**) optional<X> opt(X{}); assert(opt->test() == 3); } +#if TEST_STD_VER >= 26 + { + X x{}; + std::optional<X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + std::optional<const X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), const X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + optional<X&> opt{x}; + assert(opt->test() == 3); + } + { + X x{}; + optional<const X&> opt{x}; + assert(opt->test() == 3); + } +#endif { static_assert(test() == 3, ""); } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp index d8ce932bd7810..e9694fd6d9640 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/op_arrow_const.pass.cpp @@ -54,6 +54,25 @@ int main(int, char**) constexpr optional<Z> opt(Z{}); static_assert(opt->test() == 1, ""); } +#if TEST_STD_VER >= 26 + { + X x{}; + const std::optional<X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + X x{}; + const std::optional<const X&> opt(x); + ASSERT_SAME_TYPE(decltype(opt.operator->()), const X*); + ASSERT_NOEXCEPT(opt.operator->()); + } + { + static constexpr Z z{}; + constexpr optional<const Z&> opt(z); + static_assert(opt->test() == 1); + } +#endif return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp index 781784c6806a4..22b74f5512d53 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value.pass.cpp @@ -56,6 +56,14 @@ int main(int, char**) opt.emplace(); assert(opt.value().test() == 4); } +#if TEST_STD_VER >= 26 + { + X x; + optional<X&> opt{x}; + ASSERT_NOT_NOEXCEPT(opt.value()); + ASSERT_SAME_TYPE(decltype(opt.value()), X&); + } +#endif #ifndef TEST_HAS_NO_EXCEPTIONS { optional<X> opt; diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp index 
8c063ae1a799c..66890ff9c9b91 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp @@ -80,6 +80,14 @@ constexpr int test() assert((std::move(opt).value_or({2, 3}) == Z{2, 3})); assert(!opt); } +#if TEST_STD_VER >= 26 + { + int y = 2; + optional<int&> opt; + assert(std::move(opt).value_or(y) == 2); + assert(!opt); + } +#endif return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp index ec42890a3b995..6bd308b405605 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp @@ -79,6 +79,12 @@ int main(int, char**) const optional<X> opt; assert(opt.value_or({Y(3)}) == 4); } - - return 0; +#if TEST_STD_VER >= 26 + { + X y{3}; + const optional<X&> opt; + assert(opt.value_or(y) == 3); + } +#endif + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp index e3a2fdb8b0020..a82ca615e0c8c 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.swap/swap.pass.cpp @@ -13,9 +13,10 @@ // noexcept(is_nothrow_move_constructible<T>::value && // is_nothrow_swappable<T>::value) +#include <cassert> +#include <memory> #include <optional> #include <type_traits> -#include <cassert> #include "test_macros.h" #include "archetypes.h" @@ -127,6 +128,74 @@ TEST_CONSTEXPR_CXX20 bool check_swap() return true; } +#if TEST_STD_VER >= 26 +template <typename T> +constexpr bool check_swap_ref() { + { + optional<T&> opt1; + optional<T&> opt2; + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == false); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == false); + } + + { + T one{1}; + optional<T&> opt1(one); + optional<T&> opt2; + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == true); + assert(std::addressof(*opt1) == std::addressof(one)); + assert(static_cast<bool>(opt2) == false); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == true); + assert(std::addressof(*opt2) == std::addressof(one)); + } + + { + T two{2}; + optional<T&> opt1; + optional<T&> opt2(two); + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == false); + assert(static_cast<bool>(opt2) == true); + assert(std::addressof(*opt2) == std::addressof(two)); + opt1.swap(opt2); + assert(static_cast<bool>(opt1) == true); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == false); + } + + { + T one{1}; + T two{2}; + + optional<T&> opt1(one); + optional<T&> opt2(two); + static_assert(noexcept(opt1.swap(opt2)) == true); + assert(static_cast<bool>(opt1) == true); + assert(*opt1 == 1); + assert(std::addressof(*opt1) == std::addressof(one)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 2); + assert(std::addressof(*opt2) == std::addressof(two)); + 
opt1.swap(opt2); + assert(static_cast<bool>(opt1) == true); + assert(*opt1 == 2); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 1); + assert(std::addressof(*opt2) == std::addressof(one)); + } + + return true; +} +#endif + int main(int, char**) { check_swap<int>(); @@ -134,6 +203,12 @@ int main(int, char**) #if TEST_STD_VER > 17 static_assert(check_swap<int>()); static_assert(check_swap<W>()); +#endif +#if TEST_STD_VER >= 26 + static_assert(check_swap_ref<int>()); + static_assert(check_swap_ref<W>()); + check_swap_ref<int>(); + check_swap_ref<W>(); #endif { optional<X> opt1; diff --git a/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp b/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp index a96c3c648f939..a956ab3a219cf 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional_requires_destructible_object.verify.cpp @@ -13,6 +13,8 @@ #include <optional> +#include "test_macros.h" + using std::optional; struct X @@ -25,9 +27,13 @@ int main(int, char**) { using std::optional; { - // expected-error-re@optional:* 2 {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}} - optional<int&> opt1; - optional<int&&> opt2; +#if TEST_STD_VER >= 26 + // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with an rvalue reference type is ill-formed}} +#else + // expected-error-re@optional:* 2 {{static assertion failed{{.*}}instantiation of optional with a reference type is ill-formed}} +#endif + optional<int&> opt1; + optional<int&&> opt2; } { // expected-error-re@optional:* {{static assertion failed{{.*}}instantiation of optional with a non-destructible type is ill-formed}} diff --git a/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp index d097559877267..ecbc6b4548ee6 100644 --- a/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/types.pass.cpp @@ -36,6 +36,11 @@ int main(int, char**) test<optional<const int>, const int>(); test<optional<double>, double>(); test<optional<const double>, const double>(); - - return 0; +#if TEST_STD_VER >= 26 + test<optional<int&>, int>(); + test<optional<const int&>, const int>(); + test<optional<double&>, double>(); + test<optional<const double&>, const double>(); +#endif + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp index e325a7af558eb..c27645165d20e 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional.pass.cpp @@ -13,10 +13,10 @@ // template <class T> // constexpr optional<decay_t<T>> make_optional(T&& v); +#include <cassert> +#include <memory> #include <optional> #include <string> -#include <memory> -#include <cassert> #include "test_macros.h" diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index 23f131d2fc499..5dd1d6f0b3380 100644 --- 
a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
@@ -15,13 +15,31 @@
 // GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
 // XFAIL: gcc-15
 
+#include <cassert>
+#include <memory>
 #include <optional>
 #include <string>
-#include <memory>
-#include <cassert>
+#include <string_view>
+#include <type_traits>
 
 #include "test_macros.h"
 
+template <typename T>
+constexpr bool test_ref() {
+  T i{0};
+  auto opt = std::make_optional<T&>(i);
+
+#if TEST_STD_VER < 26
+  assert((std::is_same_v<decltype(opt), std::optional<T>>));
+#else
+  assert((std::is_same_v<decltype(opt), std::optional<T&>>));
+#endif
+
+  assert(*opt == 0);
+
+  return true;
+}
+
 int main(int, char**)
 {
     {
@@ -43,6 +61,12 @@ int main(int, char**)
         auto opt = std::make_optional<std::string>(4u, 'X');
         assert(*opt == "XXXX");
     }
+    using namespace std::string_view_literals;
+
+    static_assert(test_ref<int>());
+    assert((test_ref<int>()));
+    static_assert(test_ref<double>());
+    assert((test_ref<double>()));
 
-    return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
index 0da3a821e7961..c757120a1c146 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/swap.pass.cpp
@@ -12,9 +12,10 @@
 // template <class T> void swap(optional<T>& x, optional<T>& y)
 //   noexcept(noexcept(x.swap(y)));
 
+#include <cassert>
+#include <memory>
 #include <optional>
 #include <type_traits>
-#include <cassert>
 
 #include "test_macros.h"
 #include "archetypes.h"
@@ -109,9 +110,82 @@ void test_swap_sfinae() {
   }
 }
 
+#if TEST_STD_VER >= 26
+template <typename T>
+constexpr bool test_swap_ref() {
+  {
+    optional<T&> opt1;
+    optional<T&> opt2;
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == false);
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == false);
+  }
+  {
+    T one{1};
+    optional<T&> opt1(one);
+    optional<T&> opt2;
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 1);
+    assert(std::addressof(*opt1) == std::addressof(one));
+    assert(static_cast<bool>(opt2) == false);
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 1);
+    assert(std::addressof(*opt2) == std::addressof(one));
+  }
+  {
+    T two{2};
+    optional<T&> opt1;
+    optional<T&> opt2(two);
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == false);
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 2);
+    assert(std::addressof(*opt2) == std::addressof(two));
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 2);
+    assert(std::addressof(*opt1) == std::addressof(two));
+    assert(static_cast<bool>(opt2) == false);
+  }
+  {
+    T one{1};
+    T two{2};
+    optional<T&> opt1(one);
+    optional<T&> opt2(two);
+    static_assert(noexcept(swap(opt1, opt2)) == true);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 == 1);
+    assert(std::addressof(*opt1) == std::addressof(one));
+    assert(static_cast<bool>(opt2) == true);
+    assert(*opt2 == 2);
+    assert(std::addressof(*opt2) == std::addressof(two));
+    swap(opt1, opt2);
+    assert(static_cast<bool>(opt1) == true);
+    assert(*opt1 ==
2); + assert(std::addressof(*opt1) == std::addressof(two)); + assert(static_cast<bool>(opt2) == true); + assert(*opt2 == 1); + assert(std::addressof(*opt2) == std::addressof(one)); + } + return true; +} +#endif + int main(int, char**) { test_swap_sfinae(); +#if TEST_STD_VER >= 26 + static_assert(test_swap_ref<int>()); + static_assert(test_swap_ref<double>()); + test_swap_ref<int>(); + test_swap_ref<double>(); +#endif { optional<int> opt1; optional<int> opt2; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. - // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 diff --git a/libcxx/test/support/poisoned_hash_helper.h b/libcxx/test/support/poisoned_hash_helper.h index 93b579d2dfde3..cd71cd70d6a84 100644 --- a/libcxx/test/support/poisoned_hash_helper.h +++ b/libcxx/test/support/poisoned_hash_helper.h @@ -123,13 +123,9 @@ struct Class {}; // Each header that declares the std::hash template provides enabled // specializations of std::hash for std::nullptr_t and all cv-unqualified // arithmetic, enumeration, and pointer types. 
-#if TEST_STD_VER >= 17 -using MaybeNullptr = types::type_list<std::nullptr_t>; -#else -using MaybeNullptr = types::type_list<>; -#endif -using LibraryHashTypes = types:: - concatenate_t<types::arithmetic_types, types::type_list<Enum, EnumClass, void*, void const*, Class*>, MaybeNullptr>; +using LibraryHashTypes = + types::concatenate_t<types::arithmetic_types, + types::type_list<Enum, EnumClass, void*, void const*, Class*, std::nullptr_t>>; struct TestHashEnabled { template <class T> diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index 0335a4c561017..4fc8345c2dcef 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -59,6 +59,9 @@ class cpp17_output_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp17_output_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -109,6 +112,9 @@ class cpp17_input_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp17_input_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -157,6 +163,9 @@ class forward_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const forward_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -203,6 +212,9 @@ class bidirectional_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const bidirectional_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -261,6 +273,9 @@ class random_access_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -390,6 +405,9 @@ class three_way_random_access_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const three_way_random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template <class It> @@ -485,6 +503,9 @@ class cpp20_random_access_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_random_access_iterator&) = delete; }; template <class It> cpp20_random_access_iterator(It) -> cpp20_random_access_iterator<It>; @@ -578,6 +599,9 @@ class contiguous_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const contiguous_iterator&) = delete; }; template <class It> contiguous_iterator(It) -> contiguous_iterator<It>; @@ -635,6 +659,9 @@ class three_way_contiguous_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const three_way_contiguous_iterator&) = delete; }; template <class It> three_way_contiguous_iterator(It) -> three_way_contiguous_iterator<It>; @@ -746,7 +773,10 @@ struct ThrowingIterator { template <class T2> void operator,(T2 const &) = delete; -private: + template <class T2> + friend void operator,(const T2&, const ThrowingIterator&) = delete; + + private: const T* begin_; const T* end_; const T* current_; @@ -817,7 +847,10 @@ struct NonThrowingIterator { template <class T2> void operator,(T2 const &) = delete; -private: + template <class T2> + friend void operator,(const T2&, const NonThrowingIterator&) = 
delete; + + private: const T *begin_; const T *end_; const T *current_; @@ -847,6 +880,9 @@ class cpp20_input_iterator template <class T> void operator,(T const &) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_input_iterator&) = delete; }; template <class It> cpp20_input_iterator(It) -> cpp20_input_iterator<It>; @@ -884,6 +920,9 @@ class cpp20_output_iterator { template <class T> void operator,(T const&) = delete; + + template <class T> + friend void operator,(const T&, const cpp20_output_iterator&) = delete; }; template <class It> cpp20_output_iterator(It) -> cpp20_output_iterator<It>; @@ -1077,17 +1116,20 @@ class operation_counting_iterator { template <class T> void operator,(T const &) = delete; -private: - constexpr void moved_by(difference_type n) { - if (counts_ == nullptr) - return; - if (n > 0) - ++counts_->increments; - else if (n < 0) - ++counts_->decrements; - else - ++counts_->zero_moves; - } + template <class T> + friend void operator,(const T&, const operation_counting_iterator&) = delete; + + private: + constexpr void moved_by(difference_type n) { + if (counts_ == nullptr) + return; + if (n > 0) + ++counts_->increments; + else if (n < 0) + ++counts_->decrements; + else + ++counts_->zero_moves; + } decltype(base(std::declval<It>())) base_; IteratorOpCounts* counts_ = nullptr; diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index ca83af9824b83..2ac69c38ebffa 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -103,7 +103,6 @@ steps: queue: libcxx-builders os: aix <<: *common - skip: "https://github.com/llvm/llvm-project/issues/162516" - label: AIX (64-bit) command: libcxx/utils/ci/run-buildbot aix @@ -115,7 +114,6 @@ steps: queue: libcxx-builders os: aix <<: *common - skip: "https://github.com/llvm/llvm-project/issues/162516" - group: ':freebsd: FreeBSD' steps: diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 57ecf1e49dbf2..d265dddebe11f 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -35,8 +35,6 @@ CC The C compiler to use, this value is used by CMake. This CXX The C++ compiler to use, this value is used by CMake. This variable is optional. -CMAKE The CMake binary to use. This variable is optional. - CLANG_FORMAT The clang-format binary to use when generating the format ignore list. @@ -73,29 +71,6 @@ MONOREPO_ROOT="${MONOREPO_ROOT:="$(git rev-parse --show-toplevel)"}" BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build/${BUILDER}}" INSTALL_DIR="${BUILD_DIR}/install" -# If we can find Ninja/CMake provided by Xcode, use those since we know their -# version will generally work with the Clang shipped in Xcode (e.g. if Clang -# knows about -std=c++20, the CMake bundled in Xcode will probably know about -# that flag too). -if xcrun --find ninja &>/dev/null; then - NINJA="$(xcrun --find ninja)" -elif which ninja &>/dev/null; then - # The current implementation of modules needs the absolute path to the ninja - # binary. - # TODO MODULES Is this still needed when CMake has libc++ module support? - NINJA="$(which ninja)" -else - NINJA="ninja" -fi - -if [ -z "${CMAKE}" ]; then - if xcrun --find cmake &>/dev/null; then - CMAKE="$(xcrun --find cmake)" - else - CMAKE="cmake" - fi -fi - function step() { endstep set +x @@ -129,10 +104,10 @@ function generate-cmake-base() { step "Generating CMake" # We can remove -DCMAKE_INSTALL_MESSAGE=NEVER once https://gitlab.kitware.com/cmake/cmake/-/issues/26085 is fixed. 
- ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLIBCXX_ENABLE_WERROR=YES \ @@ -168,25 +143,25 @@ function generate-cmake-android() { function check-runtimes() { step "Building libc++ test dependencies" - ${NINJA} -vC "${BUILD_DIR}" cxx-test-depends + ninja -vC "${BUILD_DIR}" cxx-test-depends step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxxabi + ninja -vC "${BUILD_DIR}" check-cxxabi step "Running the libunwind tests" - ${NINJA} -vC "${BUILD_DIR}" check-unwind + ninja -vC "${BUILD_DIR}" check-unwind } # TODO: The goal is to test this against all configurations. We should also move # this to the Lit test suite instead of being a separate CMake target. function check-abi-list() { step "Running the libc++ ABI list test" - ${NINJA} -vC "${BUILD_DIR}" check-cxx-abilist || ( + ninja -vC "${BUILD_DIR}" check-cxx-abilist || ( error "Generating the libc++ ABI list after failed check" - ${NINJA} -vC "${BUILD_DIR}" generate-cxx-abilist + ninja -vC "${BUILD_DIR}" generate-cxx-abilist false ) } @@ -212,10 +187,10 @@ function test-armv7m-picolibc() { # architecture name, which is not what Clang's driver expects to find. # The install location will however be wrong with # LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, so we correct that below. - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/compiler-rt" \ -B "${BUILD_DIR}/compiler-rt" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DCMAKE_C_FLAGS="${flags}" \ @@ -233,7 +208,7 @@ function test-armv7m-picolibc() { "${@}" step "Installing compiler-rt" - ${NINJA} -vC "${BUILD_DIR}/compiler-rt" install + ninja -vC "${BUILD_DIR}/compiler-rt" install # Move compiler-rt libs into the same directory as all the picolib objects. mv "${INSTALL_DIR}/lib/armv7m-unknown-none-eabi"/* "${INSTALL_DIR}/lib" @@ -242,8 +217,8 @@ function test-armv7m-picolibc() { # Print the version of a few tools to aid diagnostics in some cases step "Diagnose tools in use" -${CMAKE} --version -${NINJA} --version +cmake --version +ninja --version if [ ! -z "${CXX}" ]; then ${CXX} --version; fi case "${BUILDER}" in @@ -256,7 +231,7 @@ check-generated-output) # Reject patches that forgot to re-run the generator scripts. step "Making sure the generator scripts were run" set +x # Printing all the commands below just creates extremely confusing output - ${NINJA} -vC "${BUILD_DIR}" libcxx-generate-files + ninja -vC "${BUILD_DIR}" libcxx-generate-files git diff | tee ${BUILD_DIR}/generated_output.patch git ls-files -o --exclude-standard | tee ${BUILD_DIR}/generated_output.status ! 
grep -q '^--- a' ${BUILD_DIR}/generated_output.patch || false @@ -383,10 +358,10 @@ bootstrapping-build) clean step "Generating CMake" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/llvm" \ -B "${BUILD_DIR}" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ @@ -400,13 +375,13 @@ bootstrapping-build) -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" step "Running the libc++ and libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-runtimes + ninja -vC "${BUILD_DIR}" check-runtimes step "Installing libc++ and libc++abi to a fake location" - ${NINJA} -vC "${BUILD_DIR}" install-runtimes + ninja -vC "${BUILD_DIR}" install-runtimes step "Running the LLDB libc++ data formatter tests" - ${NINJA} -vC "${BUILD_DIR}" lldb-api-test-deps + ninja -vC "${BUILD_DIR}" lldb-api-test-deps ${BUILD_DIR}/bin/llvm-lit -sv --param dotest-args='--category libc++' "${MONOREPO_ROOT}/lldb/test/API" ccache -s @@ -572,10 +547,10 @@ apple-system|apple-system-hardened) # In the Apple system configuration, we build libc++ and libunwind separately. step "Installing libc++ and libc++abi in Apple-system configuration" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}/cxx" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/cxx" \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ @@ -588,10 +563,10 @@ apple-system|apple-system-hardened) -DLIBCXXABI_TEST_PARAMS="${params}" step "Installing libunwind in Apple-system configuration" - ${CMAKE} \ + cmake \ -S "${MONOREPO_ROOT}/runtimes" \ -B "${BUILD_DIR}/unwind" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -GNinja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/unwind" \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ @@ -601,13 +576,13 @@ apple-system|apple-system-hardened) -DCMAKE_INSTALL_NAME_DIR="/usr/lib/system" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxx + ninja -vC "${BUILD_DIR}/cxx" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxxabi + ninja -vC "${BUILD_DIR}/cxx" check-cxxabi step "Running the libunwind tests" - ${NINJA} -vC "${BUILD_DIR}/unwind" check-unwind + ninja -vC "${BUILD_DIR}/unwind" check-unwind ;; aarch64) clean @@ -665,13 +640,13 @@ clang-cl-dll) # setting when cmake and the test driver does the right thing automatically. 
generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-static) clean generate-cmake-libcxx-win -DLIBCXX_ENABLE_SHARED=OFF step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-no-vcruntime) clean @@ -682,14 +657,14 @@ clang-cl-no-vcruntime) generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" \ -DLIBCXX_TEST_CONFIG="llvm-libc++-shared-no-vcruntime-clangcl.cfg.in" step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-debug) clean generate-cmake-libcxx-win -DLIBCXX_TEST_PARAMS="enable_experimental=False" \ -DCMAKE_BUILD_TYPE=Debug step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; clang-cl-static-crt) clean @@ -698,7 +673,7 @@ clang-cl-static-crt) generate-cmake-libcxx-win -DLIBCXX_ENABLE_SHARED=OFF \ -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx ;; mingw-dll) clean @@ -744,7 +719,7 @@ mingw-incomplete-sysroot) # Only test that building succeeds; there's not much extra value in running # the tests here, as it would be equivalent to the mingw-dll config above. step "Building the runtimes" - ${NINJA} -vC "${BUILD_DIR}" + ninja -vC "${BUILD_DIR}" ;; aix) clean @@ -781,7 +756,7 @@ android-ndk-*) -DLIBCXX_TEST_PARAMS="${PARAMS}" \ -DLIBCXXABI_TEST_PARAMS="${PARAMS}" check-abi-list - ${NINJA} -vC "${BUILD_DIR}" install-cxx install-cxxabi + ninja -vC "${BUILD_DIR}" install-cxx install-cxxabi # Start the emulator and make sure we can connect to the adb server running # inside of it. @@ -794,9 +769,9 @@ android-ndk-*) adb shell mkdir -p /data/local/tmp/adb_run adb push "${BUILD_DIR}/lib/libc++_shared.so" /data/local/tmp/libc++/libc++_shared.so step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxx + ninja -vC "${BUILD_DIR}" check-cxx step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}" check-cxxabi + ninja -vC "${BUILD_DIR}" check-cxxabi ;; ################################################################# # Insert vendor-specific internal configurations below. 
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index f6f252751b3e3..0802f865f9406 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -14,7 +14,7 @@ ) import functools import json -from libcxx.header_information import module_c_headers, module_headers, header_restrictions, headers_not_available, libcxx_root +from libcxx.header_information import headers_not_available def get_libcxx_paths(): @@ -368,6 +368,16 @@ def add_version_header(tc): "values": {"c++20": 201907}, "headers": ["memory"], }, + { + "name": "__cpp_lib_constexpr_flat_map", + "values": {"c++26": 202502}, + "headers": ["flat_map"], + }, + { + "name": "__cpp_lib_constexpr_flat_set", + "values": {"c++26": 202502}, + "headers": ["flat_set"], + }, { "name": "__cpp_lib_constexpr_forward_list", "values": {"c++26": 202502}, @@ -863,7 +873,8 @@ def add_version_header(tc): "c++26": 202306 # P2641R4 Checking if a union alternative is active }, "headers": ["type_traits"], - "unimplemented": True, + "test_suite_guard": "__has_builtin(__builtin_is_within_lifetime)", + "libcxx_guard": "__has_builtin(__builtin_is_within_lifetime)", }, { "name": "__cpp_lib_jthread", @@ -1006,6 +1017,7 @@ def add_version_header(tc): "c++17": 201606, "c++20": 202106, # P2231R1 Missing constexpr in std::optional and std::variant "c++23": 202110, # P0798R8 Monadic operations for std::optional + LWG3621 Remove feature-test macro __cpp_lib_monadic_optional + "c++26": 202506, # P2988R12: std::optional<T&> }, "headers": ["optional"], }, diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index 0840c46d7bfae..00fab6a73ba68 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -22,6 +22,7 @@ def _appendToSubstitution(substitutions, key, value): def configure(parameters, features, config, lit_config): note = lambda s: lit_config.note("({}) {}".format(config.name, s)) + debug = lambda s: lit_config.dbg("({}) {}".format(config.name, s)) config.environment = dict(os.environ) # Apply the actions supplied by parameters to the configuration first, since @@ -31,25 +32,23 @@ def configure(parameters, features, config, lit_config): actions = param.getActions(config, lit_config.params) for action in actions: action.applyTo(config) - if lit_config.debug: - note( - "Applied '{}' as a result of parameter '{}'".format( - action.pretty(config, lit_config.params), - param.pretty(config, lit_config.params), - ) + debug( + "Applied '{}' as a result of parameter '{}'".format( + action.pretty(config, lit_config.params), + param.pretty(config, lit_config.params), ) + ) # Then, apply the automatically-detected features. 
for feature in features: actions = feature.getActions(config) for action in actions: action.applyTo(config) - if lit_config.debug: - note( - "Applied '{}' as a result of implicitly detected feature '{}'".format( - action.pretty(config, lit_config.params), feature.pretty(config) - ) + debug( + "Applied '{}' as a result of implicitly detected feature '{}'".format( + action.pretty(config, lit_config.params), feature.pretty(config) ) + ) # Print the basic substitutions for sub in ("%{cxx}", "%{flags}", "%{compile_flags}", "%{link_flags}", "%{benchmark_flags}", "%{exec}"): diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index 3fb30d82e0d24..88fc49160c56b 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -88,7 +88,7 @@ def _executeWithFakeConfig(test, commands): litConfig = lit.LitConfig.LitConfig( progname="lit", path=[], - quiet=False, + diagnostic_level="note", useValgrind=False, valgrindLeakCheck=False, valgrindArgs=[], diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py deleted file mode 100644 index 7d6e78de343c5..0000000000000 --- a/libcxx/utils/libcxx/test/features.py +++ /dev/null @@ -1,892 +0,0 @@ -# ===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -from libcxx.test.dsl import * -from lit.BooleanExpression import BooleanExpression -import re -import shutil -import subprocess -import sys - -_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) -_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) -_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) -_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) -_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) -_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) -_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) -_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) -_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) - -def _getAndroidDeviceApi(cfg): - return int( - programOutput( - cfg, - r""" - #include <android/api-level.h> - #include <stdio.h> - int main(int, char**) { - printf("%d\n", android_get_device_api_level()); - return 0; - } - """, - ) - ) - - -def _mingwSupportsModules(cfg): - # Only mingw headers are known to work with libc++ built as a module, - # at the moment. - if not "__MINGW32__" in compilerMacros(cfg): - return False - # For mingw headers, check for a version known to support being built - # as a module. - return sourceBuilds( - cfg, - """ - #include <_mingw_mac.h> - #if __MINGW64_VERSION_MAJOR < 12 - #error Headers known to be incompatible - #elif __MINGW64_VERSION_MAJOR == 12 - // The headers were fixed to work with libc++ modules during - // __MINGW64_VERSION_MAJOR == 12. The headers became compatible - // with libc++ built as a module in - // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the - // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed - // removed the now unused __mingw_static_ovr define. Use this - // as indicator for whether we've got new enough headers. 
- #ifdef __mingw_static_ovr - #error Headers too old - #endif - #else - // __MINGW64_VERSION_MAJOR > 12 should be ok. - #endif - int main(int, char**) { return 0; } - """, - ) - - -# Lit features are evaluated in order. Some checks may require the compiler detection to have -# run first in order to work properly. -DEFAULT_FEATURES = [ - # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. - Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), - Feature(name="cl-style-warnings", when=_isClExe), - Feature(name="apple-clang", when=_isAppleClang), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature(name="clang", when=_isClang), - Feature( - name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - # Note: Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings - # on GCC or spurious diagnostics are issued. - # - # TODO: - # - Enable -Wplacement-new with GCC. - # - Enable -Wclass-memaccess with GCC. - Feature( - name="gcc", - when=_isGCC, - actions=[ - AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), - AddCompileFlag("-Wno-placement-new"), - AddCompileFlag("-Wno-class-memaccess"), - AddFeature("GCC-ALWAYS_INLINE-FIXME"), - ], - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature(name="msvc", when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), - - Feature( - name="diagnose-if-support", - when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), - actions=[AddCompileFlag("-Wuser-defined-warnings")], - ), - Feature( - name="character-conversion-warnings", - when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), - ), - # Tests to validate whether the compiler has a way to set the maximum number - # of steps during constant evaluation. Since the flag differs per compiler - # store the "valid" flag as a feature. 
This allows passing the proper compile - # flag to the compiler: - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 - Feature( - name="has-fconstexpr-steps", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), - ), - Feature( - name="has-fconstexpr-ops-limit", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), - ), - Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), - Feature( - name="fdelayed-template-parsing", - when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), - ), - Feature( - name="has-fobjc-arc", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") - and sys.platform.lower().strip() == "darwin", - ), # TODO: this doesn't handle cross-compiling to Apple platforms. - Feature( - name="objective-c++", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), - ), - Feature( - name="verify-support", - when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), - ), - Feature( - name="add-latomic-workaround", # https://llvm.org/PR73361 - when=lambda cfg: sourceBuilds( - cfg, "int main(int, char**) { return 0; }", ["-latomic"] - ), - actions=[AddLinkFlag("-latomic")], - ), - Feature( - name="has-64-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <atomic> - struct Large { char storage[64/8]; }; - std::atomic<Large> x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - Feature( - name="has-1024-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <atomic> - struct Large { char storage[1024/8]; }; - std::atomic<Large> x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - # Tests that require 64-bit architecture - Feature( - name="32-bit-pointer", - when=lambda cfg: sourceBuilds( - cfg, - """ - int main(int, char**) { - static_assert(sizeof(void *) == 4); - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): - # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 - Feature( - name="win32-broken-utf8-wchar-ctype", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <locale.h> - #include <wctype.h> - int main(int, char**) { - setlocale(LC_ALL, "en_US.UTF-8"); - return towlower(L'\\xDA') != L'\\xFA'; - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). - # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 - Feature( - name="win32-broken-printf-g-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <stdio.h> - #include <string.h> - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); - return strcmp(buf, "0."); - } - """, - ), - ), - # Check for a Windows UCRT bug (not fixed upstream yet). - # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", - # while other C runtimes produce just "0x0p+0". 
- # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 - Feature( - name="win32-broken-printf-a-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include <stdio.h> - #include <string.h> - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%a", 0.0); - return strcmp(buf, "0x0p+0"); - } - """, - ), - ), - # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had - # mon_decimal_point == ".", which our tests don't handle. - Feature( - name="glibc-old-ru_RU-decimal-point", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and not programSucceeds( - cfg, - """ - #include <locale.h> - #include <string.h> - int main(int, char**) { - setlocale(LC_ALL, "ru_RU.UTF-8"); - return strcmp(localeconv()->mon_decimal_point, ","); - } - """, - ), - ), - Feature( - name="has-unix-headers", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <unistd.h> - #include <sys/wait.h> - int main(int, char**) { - int fd[2]; - return pipe(fd); - } - """, - ), - ), - # Whether Bash can run on the executor. - # This is not always the case, for example when running on embedded systems. - # - # For the corner case of bash existing, but it being missing in the path - # set in %{exec} as "--env PATH=one-single-dir", the executor does find - # and executes bash, but bash then can't find any other common shell - # utilities. Test executing "bash -c 'bash --version'" to see if bash - # manages to find binaries to execute. - Feature( - name="executor-has-no-bash", - when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, - ), - # Whether module support for the platform is available. - Feature( - name="has-no-cxx-module-support", - # The libc of these platforms have functions with internal linkage. - # This is not allowed per C11 7.1.2 Standard headers/6 - # Any declaration of a library function shall have external linkage. - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) - or "__FreeBSD__" in compilerMacros(cfg) - or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) - or platform.system().lower().startswith("aix") - # Avoid building on platforms that don't support modules properly. - or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") - # older versions don't support extern "C++", newer versions don't support main in named module. - or not ( - sourceBuilds( - cfg, - """ - export module test; - extern "C++" int main(int, char**) { return 0; } - """, - ) - or sourceBuilds( - cfg, - """ - export module test; - int main(int, char**) { return 0; } - """, - ) - ), - ), - # The time zone validation tests compare the output of zdump against the - # output generated by <chrono>'s time zone support. - Feature( - name="has-no-zdump", - when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, - ), -] - -# Deduce and add the test features that that are implied by the #defines in -# the <__config> header. -# -# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that -# is defined after including <__config>, add a Lit feature called -# `libcpp-xxx-yyy-zzz`. When a macro is defined to a specific value -# (e.g. `_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=<value>`. -# -# Note that features that are more strongly tied to libc++ are named libcpp-foo, -# while features that are more general in nature are not prefixed with 'libcpp-'. 
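# (Illustrative aside, not part of the patch.) The `macros` table below turns
# each `_LIBCPP_*` define detected in <__config> into a Lit feature name,
# appending `=<value>` only when the macro expands to something. A sketch of
# that derivation, with a hand-written stand-in for compilerMacros(cfg):
detected = {"_LIBCPP_ABI_VERSION": "2", "_LIBCPP_NO_VCRUNTIME": ""}
table = {
    "_LIBCPP_ABI_VERSION": "libcpp-abi-version",
    "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime",
}
for macro, feature in table.items():
    if macro in detected:
        value = detected[macro]
        print(feature + ("={}".format(value) if value else ""))
# Prints "libcpp-abi-version=2" and "libcpp-no-vcruntime"; the full table follows.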
-macros = { - "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", - "_LIBCPP_ABI_VERSION": "libcpp-abi-version", - "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", - "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", - "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", - "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", - "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", - "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", -} -for macro, feature in macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), - when=lambda cfg, m=macro: m in compilerMacros(cfg), - ) - ) - -true_false_macros = { - "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", - "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", -} -for macro, feature in true_false_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "1", - ) - ) - -inverted_macros = { - "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", - "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", - "_LIBCPP_HAS_LOCALIZATION": "no-localization", - "_LIBCPP_HAS_THREADS": "no-threads", - "_LIBCPP_HAS_MONOTONIC_CLOCK": "no-monotonic-clock", - "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", - "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", - "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", - "_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", - "_LIBCPP_HAS_TERMINAL": "no-terminal", -} -for macro, feature in inverted_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "0", - ) - ) - -# Mapping from canonical locale names (used in the tests) to possible locale -# names on various systems. Each locale is considered supported if any of the -# alternative names is supported. -locales = { - "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], - "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], - "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], - "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], - "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], - "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], - "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], -} -provide_locale_conversions = { - "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], - "ru_RU.UTF-8": ["mon_thousands_sep"], -} -for locale, alts in locales.items(): - # Note: Using alts directly in the lambda body here will bind it to the value at the - # end of the loop. Assigning it to a default argument works around this issue. 
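# (Illustrative aside, not part of the patch.) The `alts=alts` default argument
# in the append below is the standard fix for Python's late-binding closures: a
# lambda defined inside a loop reads the loop variable's final value unless the
# current value is frozen as a default. A short demonstration:
late = [lambda: i for i in range(3)]
early = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2] -- every closure sees the final i
print([f() for f in early])  # [0, 1, 2] -- each default captured per iteration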
- DEFAULT_FEATURES.append( - Feature( - name="locale.{}".format(locale), - when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), - actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( - cfg, locale, alts, provide_locale_conversions[locale] - ) - if locale in provide_locale_conversions - and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or - compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") - else [], - ), - ) - - -# Provide environment locale conversions through substitutions to avoid platform specific -# maintenance. -def _getLocaleFlagsAction(cfg, locale, alts, members): - alts_list = ",".join([f'"{l}"' for l in alts]) - get_member_list = ",".join([f"lc->{m}" for m in members]) - - localeconv_info = programOutput( - cfg, - r""" - #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) - #define _CRT_SECURE_NO_WARNINGS - #endif - #include <stdio.h> - #include <locale.h> - #include <stdlib.h> - #include <wchar.h> - - // Print each requested locale conversion member on separate lines. - int main(int, char**) { - const char* locales[] = { %s }; - for (int loc_i = 0; loc_i < %d; ++loc_i) { - if (!setlocale(LC_ALL, locales[loc_i])) { - continue; // Choose first locale name that is recognized. - } - - lconv* lc = localeconv(); - const char* members[] = { %s }; - for (size_t m_i = 0; m_i < %d; ++m_i) { - if (!members[m_i]) { - printf("\n"); // member value is an empty string - continue; - } - - size_t len = mbstowcs(nullptr, members[m_i], 0); - if (len == static_cast<size_t>(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - return 1; - } - // Include room for null terminator. Use malloc as these features - // are also used by lit configs that don't use -lc++ (libunwind tests). - wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); - size_t ret = mbstowcs(dst, members[m_i], len + 1); - if (ret == static_cast<size_t>(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - free(dst); - return 1; - } - - for (size_t i = 0; i < len; ++i) { - if (dst[i] > 0x7F) { - printf("\\u%%04x", dst[i]); - } else { - // c++03 does not allow basic ascii-range characters in UCNs - printf("%%c", (char)dst[i]); - } - } - printf("\n"); - free(dst); - } - return 0; - } - - return 1; - } - """ - % (alts_list, len(alts), get_member_list, len(members)), - ) - valid_define_name = re.sub(r"[.-]", "_", locale).upper() - return [ - # Provide locale conversion through a substitution. - # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" - AddSubstitution( - f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", - lambda cfg, value=value: f"'L\"{value}\"'", - ) - for member, value in zip(members, localeconv_info.split("\n")) - ] - - -# Add features representing the target platform name: darwin, linux, windows, etc... -DEFAULT_FEATURES += [ - Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), - Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), - Feature( - name="windows-dll", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and sourceBuilds( - cfg, - """ - #include <iostream> - int main(int, char**) { return 0; } - """, - ) - and programSucceeds( - cfg, - """ - #include <iostream> - #include <windows.h> - #include <winnt.h> - int main(int, char**) { - // Get a pointer to a data member that gets linked from the C++ - // library. This must be a data member (functions can get - // thunk inside the calling executable), and must not be - // something that is defined inline in headers. 
- void *ptr = &std::cout; - // Get a handle to the current main executable. - void *exe = GetModuleHandle(NULL); - // The handle points at the PE image header. Navigate through - // the header structure to find the size of the PE image (the - // executable). - PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; - PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); - PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; - void *exeend = (BYTE*)exe + peheader->SizeOfImage; - // Check if the tested pointer - the data symbol from the - // C++ library - is located within the exe. - if (ptr >= exe && ptr <= exeend) - return 1; - // Return success if it was outside of the executable, i.e. - // loaded from a DLL. - return 0; - } - """, - ), - actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], - ), - Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), - Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), - Feature( - name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-ANDROID-FIXME", - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), - Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), - Feature( - name="LIBCXX-FREEBSD-FIXME", - when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-PICOLIBC-FIXME", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include <string.h> - #ifndef __PICOLIBC__ - #error not picolibc - #endif - int main(int, char**) { return 0; } - """, - ), - ), - Feature( - name="LIBCXX-AMDGPU-FIXME", - when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-NVPTX-FIXME", - when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), - ), - Feature( - name="can-create-symlinks", - when=lambda cfg: "_WIN32" not in compilerMacros(cfg) - or programSucceeds( - cfg, - # Creation of symlinks require elevated privileges on Windows unless - # Windows developer mode is enabled. - """ - #include <stdio.h> - #include <windows.h> - int main(int, char**) { - CHAR tempDirPath[MAX_PATH]; - DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); - if (tempPathRet == 0 || tempPathRet > MAX_PATH) { - return 1; - } - - CHAR tempFilePath[MAX_PATH]; - UINT uRetVal = GetTempFileNameA( - tempDirPath, - "cxx", // Prefix - 0, // Unique=0 also implies file creation. - tempFilePath); - if (uRetVal == 0) { - return 1; - } - - CHAR symlinkFilePath[MAX_PATH]; - int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); - if (ret == -1) { - DeleteFileA(tempFilePath); - return 1; - } - - // Requires either administrator, or developer mode enabled. - BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, - tempFilePath, - SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); - if (!bCreatedSymlink) { - DeleteFileA(tempFilePath); - return 1; - } - - DeleteFileA(tempFilePath); - DeleteFileA(symlinkFilePath); - return 0; - } - """, - ), - ), -] - -# Add features representing the build host platform name. -# The build host could differ from the target platform for cross-compilation. -DEFAULT_FEATURES += [ - Feature(name="buildhost={}".format(sys.platform.lower().strip())), - # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. 
- # We define a consolidated feature on a few platforms. - Feature( - name="buildhost=windows", - when=lambda cfg: platform.system().lower().startswith("windows"), - ), - Feature( - name="buildhost=freebsd", - when=lambda cfg: platform.system().lower().startswith("freebsd"), - ), - Feature( - name="buildhost=aix", - when=lambda cfg: platform.system().lower().startswith("aix"), - ), -] - -# Detect whether GDB is on the system, has Python scripting and supports -# adding breakpoint commands. If so add a substitution to access it. -def check_gdb(cfg): - gdb_path = shutil.which("gdb") - if gdb_path is None: - return False - - # Check that we can set breakpoint commands, which was added in 8.3. - # Using the quit command here means that gdb itself exits, not just - # the "python <...>" command. - test_src = """\ -try: - gdb.Breakpoint(\"main\").commands=\"foo\" -except AttributeError: - gdb.execute(\"quit 1\") -gdb.execute(\"quit\")""" - - try: - stdout = subprocess.check_output( - [gdb_path, "-ex", "python " + test_src, "--batch"], - stderr=subprocess.DEVNULL, - universal_newlines=True, - ) - except subprocess.CalledProcessError: - # We can't set breakpoint commands - return False - - # Check we actually ran the Python - return not "Python scripting is not supported" in stdout - - -DEFAULT_FEATURES += [ - Feature( - name="host-has-gdb-with-python", - when=check_gdb, - actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], - ) -] - -# Helpers to define correspondances between LLVM versions and vendor system versions. -# Those are used for backdeployment features below, do not use directly in tests. -DEFAULT_FEATURES += [ - Feature( - name="_target-has-llvm-18", - when=lambda cfg: BooleanExpression.evaluate( - "target={{.+}}-apple-macosx{{15(.[0-9]+)?(.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-17", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.[0-9]+)?}} || target={{.+}}-apple-macosx{{1[5-9]([.].+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-16", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-15", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-14", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-15", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-13", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-12", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), -] - -# Define features for back-deployment testing. -# -# These features can be used to XFAIL tests that fail when deployed on (or compiled -# for) an older system. For example, if a test exhibits a bug in the libc++ on a -# particular system version, or if it uses a symbol that is not available on an -# older version of the dylib, it can be marked as XFAIL with these features. -# -# We have two families of Lit features: -# -# The first one is `using-built-library-before-llvm-XYZ`. 
These features encode the -# fact that the test suite is being *run* against a version of the shared/static library -# that predates LLVM version XYZ. This is useful to represent the use case of compiling -# a program against the latest libc++ but then deploying it and running it on an older -# system with an older version of the (usually shared) library. -# -# This feature is built up using the target triple passed to the compiler and the -# `stdlib=system` Lit feature, which encodes that we're running against the same library -# as described by the target triple. -# -# The second set of features is `availability-<FEATURE>-missing`. This family of Lit -# features encodes the presence of availability markup in the libc++ headers. This is -# useful to check that a test fails specifically when compiled for a given deployment -# target, such as when testing availability markup where we want to make sure that -# using the annotated facility on a deployment target that doesn't support it will fail -# at compile time. This can be achieved by creating a `.verify.cpp` test that checks for -# the right errors and marking the test as `REQUIRES: availability-<FEATURE>-missing`. -# -# This feature is built up using the presence of availability markup detected inside -# __config, the flavor of the library being tested and the target triple passed to the -# compiler. -# -# Note that both families of Lit features are similar but different in important ways. -# For example, tests for availability markup should be expected to produce diagnostics -# regardless of whether we're running against a system library, as long as we're using -# a libc++ flavor that enables availability markup. Similarly, a test could fail when -# run against the system library of an older version of FreeBSD, even though FreeBSD -# doesn't provide availability markup at the time of writing this. -for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20"): - DEFAULT_FEATURES.append( - Feature( - name="using-built-library-before-llvm-{}".format(version), - when=lambda cfg, v=version: BooleanExpression.evaluate( - "stdlib=system && !_target-has-llvm-{}".format(v), - cfg.available_features, - ), - ) - ) - -DEFAULT_FEATURES += [ - # Tests that require https://wg21.link/P0482 support in the built library - Feature( - name="availability-char8_t_support-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)", - cfg.available_features, - ), - ), - # Tests that require std::to_chars(floating-point) in the built library - Feature( - name="availability-fp_to_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", - cfg.available_features, - ), - ), - # Tests that require __libcpp_verbose_abort support in the built library - Feature( - name="availability-verbose_abort-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", - cfg.available_features, - ), - ), - # Tests that require std::pmr support in the built library - Feature( - name="availability-pmr-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", - cfg.available_features, - ), - ), - # Tests that require support for <print> and std::print in <ostream> in the built library. 
- Feature( - name="availability-print-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", - cfg.available_features, - ), - ), - # Tests that require time zone database support in the built library - Feature( - name="availability-tzdb-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", - cfg.available_features, - ), - ), - # Tests that require std::from_chars(floating-point) in the built library - Feature( - name="availability-fp_from_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", - cfg.available_features, - ), - ), -] diff --git a/libcxx/utils/libcxx/test/features/__init__.py b/libcxx/utils/libcxx/test/features/__init__.py new file mode 100644 index 0000000000000..5c0d1f3aaafc6 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/__init__.py @@ -0,0 +1,21 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from . import availability, compiler, gdb, libcxx_macros, localization, misc, platform + +# Lit features are evaluated in order. Some features depend on other features, so +# we are careful to define them in the correct order. For example, several features +# require the compiler detection to have been performed. +DEFAULT_FEATURES = [] +DEFAULT_FEATURES += compiler.features +DEFAULT_FEATURES += libcxx_macros.features +DEFAULT_FEATURES += platform.features +DEFAULT_FEATURES += localization.features +DEFAULT_FEATURES += gdb.features +DEFAULT_FEATURES += misc.features +DEFAULT_FEATURES += availability.features diff --git a/libcxx/utils/libcxx/test/features/availability.py b/libcxx/utils/libcxx/test/features/availability.py new file mode 100644 index 0000000000000..c312a7cf830ed --- /dev/null +++ b/libcxx/utils/libcxx/test/features/availability.py @@ -0,0 +1,199 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature +from lit.BooleanExpression import BooleanExpression + +# Helpers to define correspondances between LLVM versions and vendor system versions. +# Those are used for backdeployment features below, do not use directly in tests. 
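# (Illustrative aside, not part of the patch.) Each `_target-has-llvm-N`
# feature below reads "`_target-has-llvm-(N+1)` holds, or the target triple
# names the macOS release that shipped LLVM N". The disjunctions chain, so one
# triple match implies every older feature too. A toy model of that chaining,
# condensed from the table that follows:
shipped = {20: "macosx26", 19: "macosx15.4"}  # LLVM release -> first macOS with it

def target_has_llvm(n, triple, newest=20):
    if n > newest:
        return False
    # True if a newer link in the chain matched, or this release's own range does.
    return target_has_llvm(n + 1, triple, newest) or shipped.get(n, "\0") in triple

print(target_has_llvm(19, "arm64-apple-macosx15.4"))  # True
print(target_has_llvm(20, "arm64-apple-macosx15.4"))  # False -- needs macOS 26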
+features = [ + Feature( + name="_target-has-llvm-22", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-21", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-20", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-21 || target={{.+}}-apple-macosx{{26.[0-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-19", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-20 || target={{.+}}-apple-macosx{{15.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-18", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-19 || target={{.+}}-apple-macosx{{15.[0-3](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-17", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-16", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-15", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-14", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-15", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-13", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-12", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", + cfg.available_features, + ), + ), +] + +# Define features for back-deployment testing. +# +# These features can be used to XFAIL tests that fail when deployed on (or compiled +# for) an older system. For example, if a test exhibits a bug in the libc++ on a +# particular system version, or if it uses a symbol that is not available on an +# older version of the dylib, it can be marked as XFAIL with these features. +# +# We have two families of Lit features: +# +# The first one is `using-built-library-before-llvm-XYZ`. These features encode the +# fact that the test suite is being *run* against a version of the shared/static library +# that predates LLVM version XYZ. This is useful to represent the use case of compiling +# a program against the latest libc++ but then deploying it and running it on an older +# system with an older version of the (usually shared) library. +# +# This feature is built up using the target triple passed to the compiler and the +# `stdlib=system` Lit feature, which encodes that we're running against the same library +# as described by the target triple. +# +# The second set of features is `availability-<FEATURE>-missing`. This family of Lit +# features encodes the presence of availability markup in the libc++ headers. This is +# useful to check that a test fails specifically when compiled for a given deployment +# target, such as when testing availability markup where we want to make sure that +# using the annotated facility on a deployment target that doesn't support it will fail +# at compile time. 
This can be achieved by creating a `.verify.cpp` test that checks for +# the right errors and marking the test as `REQUIRES: availability-<FEATURE>-missing`. +# +# This feature is built up using the presence of availability markup detected inside +# __config, the flavor of the library being tested and the target triple passed to the +# compiler. +# +# Note that both families of Lit features are similar but different in important ways. +# For example, tests for availability markup should be expected to produce diagnostics +# regardless of whether we're running against a system library, as long as we're using +# a libc++ flavor that enables availability markup. Similarly, a test could fail when +# run against the system library of an older version of FreeBSD, even though FreeBSD +# doesn't provide availability markup at the time of writing this. +for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22"): + features.append( + Feature( + name="using-built-library-before-llvm-{}".format(version), + when=lambda cfg, v=version: BooleanExpression.evaluate( + "stdlib=system && !_target-has-llvm-{}".format(v), + cfg.available_features, + ), + ) + ) + +features += [ + # Tests that require https://wg21.link/P0482 support in the built library + Feature( + name="availability-char8_t_support-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)", + cfg.available_features, + ), + ), + # Tests that require std::to_chars(floating-point) in the built library + Feature( + name="availability-fp_to_chars-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", + cfg.available_features, + ), + ), + # Tests that require __libcpp_verbose_abort support in the built library + Feature( + name="availability-verbose_abort-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", + cfg.available_features, + ), + ), + # Tests that require std::pmr support in the built library + Feature( + name="availability-pmr-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", + cfg.available_features, + ), + ), + # Tests that require support for <print> and std::print in <ostream> in the built library. 
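# (Illustrative aside, not part of the patch.) Every availability-*-missing
# feature below is the same three-part guard: availability markup must be
# enabled, the flavor must be apple-libc++, and the back-deployment chain must
# say the built library predates the LLVM release that added the facility. The
# guard is an ordinary lit boolean expression evaluated over the feature set:
from lit.BooleanExpression import BooleanExpression

available = {"stdlib=apple-libc++"}  # note: no _target-has-llvm-18 in the set
print(BooleanExpression.evaluate(
    "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)",
    available))  # True: markup on, apple flavor, library older than LLVM 18
# The availability-print-missing Feature below uses exactly this expression.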
+ Feature( + name="availability-print-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", + cfg.available_features, + ), + ), + # Tests that require time zone database support in the built library + Feature( + name="availability-tzdb-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", + cfg.available_features, + ), + ), + # Tests that require std::from_chars(floating-point) in the built library + Feature( + name="availability-fp_from_chars-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", + cfg.available_features, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/compiler.py b/libcxx/utils/libcxx/test/features/compiler.py new file mode 100644 index 0000000000000..2fb2d4b1502ad --- /dev/null +++ b/libcxx/utils/libcxx/test/features/compiler.py @@ -0,0 +1,82 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, AddCompileFlag, AddFeature + +_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) +_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) +_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) +_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) +_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) +_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) +_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) +_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) +_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) + +features = [ + # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. + Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), + Feature(name="cl-style-warnings", when=_isClExe), + + Feature(name="apple-clang", when=_isAppleClang), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature(name="clang", when=_isClang), + Feature( + name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + # Note: Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings + # on GCC or spurious diagnostics are issued. + # + # TODO: + # - Enable -Wplacement-new with GCC. + # - Enable -Wclass-memaccess with GCC. 
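# (Illustrative aside, not part of the patch.) The version-specific feature
# names in this file come from formatting the compiler's predefined macros into
# a template, so one detection pass yields `clang`, `clang-21`, `clang-21.1`,
# and `clang-21.1.0` style features. With a hand-written stand-in for
# compilerMacros(cfg) and a made-up version:
macros = {"__clang_major__": "21", "__clang_minor__": "1", "__clang_patchlevel__": "0"}
name = lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**macros)
print(name(None))  # -> clang-21.1
# The gcc Feature just below additionally attaches compile-flag actions that
# run whenever the feature fires.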
+ Feature( + name="gcc", + when=_isGCC, + actions=[ + AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), + AddCompileFlag("-Wno-placement-new"), + AddCompileFlag("-Wno-class-memaccess"), + AddFeature("GCC-ALWAYS_INLINE-FIXME"), + ], + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature(name="msvc", when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), +] diff --git a/libcxx/utils/libcxx/test/features/gdb.py b/libcxx/utils/libcxx/test/features/gdb.py new file mode 100644 index 0000000000000..459a59afc32f4 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/gdb.py @@ -0,0 +1,50 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, AddSubstitution +import shutil +import subprocess + +# Detect whether GDB is on the system, has Python scripting and supports +# adding breakpoint commands. If so add a substitution to access it. +def check_gdb(cfg): + gdb_path = shutil.which("gdb") + if gdb_path is None: + return False + + # Check that we can set breakpoint commands, which was added in 8.3. + # Using the quit command here means that gdb itself exits, not just + # the "python <...>" command. + test_src = """\ +try: + gdb.Breakpoint(\"main\").commands=\"foo\" +except AttributeError: + gdb.execute(\"quit 1\") +gdb.execute(\"quit\")""" + + try: + stdout = subprocess.check_output( + [gdb_path, "-ex", "python " + test_src, "--batch"], + stderr=subprocess.DEVNULL, + universal_newlines=True, + ) + except subprocess.CalledProcessError: + # We can't set breakpoint commands + return False + + # Check we actually ran the Python + return not "Python scripting is not supported" in stdout + + +features = [ + Feature( + name="host-has-gdb-with-python", + when=check_gdb, + actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], + ) +] diff --git a/libcxx/utils/libcxx/test/features/libcxx_macros.py b/libcxx/utils/libcxx/test/features/libcxx_macros.py new file mode 100644 index 0000000000000..7a465f2e87866 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/libcxx_macros.py @@ -0,0 +1,76 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, compilerMacros + +features = [] + +# Deduce and add the test features that that are implied by the #defines in +# the <__config> header. +# +# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that +# is defined after including <__config>, add a Lit feature called +# `libcpp-xxx-yyy-zzz`. 
When a macro is defined to a specific value +# (e.g. `_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=<value>`. +# +# Note that features that are more strongly tied to libc++ are named libcpp-foo, +# while features that are more general in nature are not prefixed with 'libcpp-'. +macros = { + "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", + "_LIBCPP_ABI_VERSION": "libcpp-abi-version", + "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", + "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", + "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", + "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", + "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", +} +for macro, feature in macros.items(): + features.append( + Feature( + name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), + when=lambda cfg, m=macro: m in compilerMacros(cfg), + ) + ) + +true_false_macros = { + "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", + "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", +} +for macro, feature in true_false_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "1", + ) + ) + +inverted_macros = { + "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", + "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", + "_LIBCPP_HAS_LOCALIZATION": "no-localization", + "_LIBCPP_HAS_THREADS": "no-threads", + "_LIBCPP_HAS_MONOTONIC_CLOCK": "no-monotonic-clock", + "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", + "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", + "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", + "_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", + "_LIBCPP_HAS_TERMINAL": "no-terminal", +} +for macro, feature in inverted_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "0", + ) + ) diff --git a/libcxx/utils/libcxx/test/features/localization.py b/libcxx/utils/libcxx/test/features/localization.py new file mode 100644 index 0000000000000..157c250429d27 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/localization.py @@ -0,0 +1,142 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, programSucceeds, hasAnyLocale, programOutput, AddSubstitution +import re + +features = [ + # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had + # mon_decimal_point == ".", which our tests don't handle. 
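# (Illustrative aside, not part of the patch.) The `when` lambda in the Feature
# below chains `not A in B or C and not D`. Python parses that as
# `(A not in B) or (C and (not D))`, since `and` binds tighter than `or`, so
# the cheap localization check can short-circuit the compile-and-run probe:
macro_absent, enabled, probe_failed = False, True, True
print(macro_absent or (enabled and probe_failed))  # explicit grouping: True
print(macro_absent or enabled and probe_failed)    # identical parse: True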
+ Feature( + name="glibc-old-ru_RU-decimal-point", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and not programSucceeds( + cfg, + """ + #include <locale.h> + #include <string.h> + int main(int, char**) { + setlocale(LC_ALL, "ru_RU.UTF-8"); + return strcmp(localeconv()->mon_decimal_point, ","); + } + """, + ), + ), +] + +# Mapping from canonical locale names (used in the tests) to possible locale +# names on various systems. Each locale is considered supported if any of the +# alternative names is supported. +_locales = { + "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], + "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], + "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], + "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], + "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], + "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], + "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], +} +_provide_locale_conversions = { + "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], + "ru_RU.UTF-8": ["mon_thousands_sep"], +} +for locale, alts in _locales.items(): + # Note: Using alts directly in the lambda body here will bind it to the value at the + # end of the loop. Assigning it to a default argument works around this issue. + features.append( + Feature( + name="locale.{}".format(locale), + when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), + actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( + cfg, locale, alts, _provide_locale_conversions[locale] + ) + if locale in _provide_locale_conversions + and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or + compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") + else [], + ), + ) + +# Provide environment locale conversions through substitutions to avoid platform specific +# maintenance. +def _getLocaleFlagsAction(cfg, locale, alts, members): + alts_list = ",".join([f'"{l}"' for l in alts]) + get_member_list = ",".join([f"lc->{m}" for m in members]) + + localeconv_info = programOutput( + cfg, + r""" + #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) + #define _CRT_SECURE_NO_WARNINGS + #endif + #include <stdio.h> + #include <locale.h> + #include <stdlib.h> + #include <wchar.h> + + // Print each requested locale conversion member on separate lines. + int main(int, char**) { + const char* locales[] = { %s }; + for (int loc_i = 0; loc_i < %d; ++loc_i) { + if (!setlocale(LC_ALL, locales[loc_i])) { + continue; // Choose first locale name that is recognized. + } + + lconv* lc = localeconv(); + const char* members[] = { %s }; + for (size_t m_i = 0; m_i < %d; ++m_i) { + if (!members[m_i]) { + printf("\n"); // member value is an empty string + continue; + } + + size_t len = mbstowcs(nullptr, members[m_i], 0); + if (len == static_cast<size_t>(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + return 1; + } + // Include room for null terminator. Use malloc as these features + // are also used by lit configs that don't use -lc++ (libunwind tests). 
+ wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); + size_t ret = mbstowcs(dst, members[m_i], len + 1); + if (ret == static_cast<size_t>(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + free(dst); + return 1; + } + + for (size_t i = 0; i < len; ++i) { + if (dst[i] > 0x7F) { + printf("\\u%%04x", dst[i]); + } else { + // c++03 does not allow basic ascii-range characters in UCNs + printf("%%c", (char)dst[i]); + } + } + printf("\n"); + free(dst); + } + return 0; + } + + return 1; + } + """ + % (alts_list, len(alts), get_member_list, len(members)), + ) + valid_define_name = re.sub(r"[.-]", "_", locale).upper() + return [ + # Provide locale conversion through a substitution. + # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" + AddSubstitution( + f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", + lambda cfg, value=value: f"'L\"{value}\"'", + ) + for member, value in zip(members, localeconv_info.split("\n")) + ] diff --git a/libcxx/utils/libcxx/test/features/misc.py b/libcxx/utils/libcxx/test/features/misc.py new file mode 100644 index 0000000000000..738e3d8bb207c --- /dev/null +++ b/libcxx/utils/libcxx/test/features/misc.py @@ -0,0 +1,299 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, sourceBuilds, hasCompileFlag, programSucceeds, runScriptExitCode +from libcxx.test.dsl import Feature, AddCompileFlag, AddLinkFlag +import platform +import sys + +def _mingwSupportsModules(cfg): + # Only mingw headers are known to work with libc++ built as a module, + # at the moment. + if not "__MINGW32__" in compilerMacros(cfg): + return False + # For mingw headers, check for a version known to support being built + # as a module. + return sourceBuilds( + cfg, + """ + #include <_mingw_mac.h> + #if __MINGW64_VERSION_MAJOR < 12 + #error Headers known to be incompatible + #elif __MINGW64_VERSION_MAJOR == 12 + // The headers were fixed to work with libc++ modules during + // __MINGW64_VERSION_MAJOR == 12. The headers became compatible + // with libc++ built as a module in + // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the + // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed + // removed the now unused __mingw_static_ovr define. Use this + // as indicator for whether we've got new enough headers. + #ifdef __mingw_static_ovr + #error Headers too old + #endif + #else + // __MINGW64_VERSION_MAJOR > 12 should be ok. + #endif + int main(int, char**) { return 0; } + """, + ) + +features = [ + Feature( + name="diagnose-if-support", + when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), + actions=[AddCompileFlag("-Wuser-defined-warnings")], + ), + Feature( + name="character-conversion-warnings", + when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), + ), + # Tests to validate whether the compiler has a way to set the maximum number + # of steps during constant evaluation. Since the flag differs per compiler + # store the "valid" flag as a feature. 
This allows passing the proper compile + # flag to the compiler: + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 + Feature( + name="has-fconstexpr-steps", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), + ), + Feature( + name="has-fconstexpr-ops-limit", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), + ), + Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), + Feature( + name="fdelayed-template-parsing", + when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), + ), + Feature( + name="has-fobjc-arc", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") + and sys.platform.lower().strip() == "darwin", + ), # TODO: this doesn't handle cross-compiling to Apple platforms. + Feature( + name="objective-c++", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), + ), + Feature( + name="verify-support", + when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), + ), + Feature( + name="add-latomic-workaround", # https://llvm.org/PR73361 + when=lambda cfg: sourceBuilds( + cfg, "int main(int, char**) { return 0; }", ["-latomic"] + ), + actions=[AddLinkFlag("-latomic")], + ), + Feature( + name="has-64-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <atomic> + struct Large { char storage[64/8]; }; + std::atomic<Large> x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + Feature( + name="has-1024-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <atomic> + struct Large { char storage[1024/8]; }; + std::atomic<Large> x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + # Tests that require 64-bit architecture + Feature( + name="32-bit-pointer", + when=lambda cfg: sourceBuilds( + cfg, + """ + int main(int, char**) { + static_assert(sizeof(void *) == 4); + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): + # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 + Feature( + name="win32-broken-utf8-wchar-ctype", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <locale.h> + #include <wctype.h> + int main(int, char**) { + setlocale(LC_ALL, "en_US.UTF-8"); + return towlower(L'\\xDA') != L'\\xFA'; + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). + # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 + Feature( + name="win32-broken-printf-g-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <stdio.h> + #include <string.h> + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); + return strcmp(buf, "0."); + } + """, + ), + ), + # Check for a Windows UCRT bug (not fixed upstream yet). + # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", + # while other C runtimes produce just "0x0p+0". 
+ # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 + Feature( + name="win32-broken-printf-a-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include <stdio.h> + #include <string.h> + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%a", 0.0); + return strcmp(buf, "0x0p+0"); + } + """, + ), + ), + Feature( + name="has-unix-headers", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <unistd.h> + #include <sys/wait.h> + int main(int, char**) { + int fd[2]; + return pipe(fd); + } + """, + ), + ), + # Whether Bash can run on the executor. + # This is not always the case, for example when running on embedded systems. + # + # For the corner case of bash existing, but it being missing in the path + # set in %{exec} as "--env PATH=one-single-dir", the executor does find + # and executes bash, but bash then can't find any other common shell + # utilities. Test executing "bash -c 'bash --version'" to see if bash + # manages to find binaries to execute. + Feature( + name="executor-has-no-bash", + when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, + ), + # Whether module support for the platform is available. + Feature( + name="has-no-cxx-module-support", + # The libc of these platforms have functions with internal linkage. + # This is not allowed per C11 7.1.2 Standard headers/6 + # Any declaration of a library function shall have external linkage. + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) + or "__FreeBSD__" in compilerMacros(cfg) + or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) + or platform.system().lower().startswith("aix") + # Avoid building on platforms that don't support modules properly. + or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") + # older versions don't support extern "C++", newer versions don't support main in named module. + or not ( + sourceBuilds( + cfg, + """ + export module test; + extern "C++" int main(int, char**) { return 0; } + """, + ) + or sourceBuilds( + cfg, + """ + export module test; + int main(int, char**) { return 0; } + """, + ) + ), + ), + # The time zone validation tests compare the output of zdump against the + # output generated by <chrono>'s time zone support. + Feature( + name="has-no-zdump", + when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, + ), + Feature( + name="can-create-symlinks", + when=lambda cfg: "_WIN32" not in compilerMacros(cfg) + or programSucceeds( + cfg, + # Creation of symlinks require elevated privileges on Windows unless + # Windows developer mode is enabled. + """ + #include <stdio.h> + #include <windows.h> + int main(int, char**) { + CHAR tempDirPath[MAX_PATH]; + DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); + if (tempPathRet == 0 || tempPathRet > MAX_PATH) { + return 1; + } + + CHAR tempFilePath[MAX_PATH]; + UINT uRetVal = GetTempFileNameA( + tempDirPath, + "cxx", // Prefix + 0, // Unique=0 also implies file creation. + tempFilePath); + if (uRetVal == 0) { + return 1; + } + + CHAR symlinkFilePath[MAX_PATH]; + int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); + if (ret == -1) { + DeleteFileA(tempFilePath); + return 1; + } + + // Requires either administrator, or developer mode enabled. 
+ BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, + tempFilePath, + SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); + if (!bCreatedSymlink) { + DeleteFileA(tempFilePath); + return 1; + } + + DeleteFileA(tempFilePath); + DeleteFileA(symlinkFilePath); + return 0; + } + """, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/platform.py b/libcxx/utils/libcxx/test/features/platform.py new file mode 100644 index 0000000000000..db9d3931da7ff --- /dev/null +++ b/libcxx/utils/libcxx/test/features/platform.py @@ -0,0 +1,132 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import programOutput, Feature, compilerMacros, programSucceeds, AddCompileFlag, sourceBuilds +import platform +import sys + +def _getAndroidDeviceApi(cfg): + return int( + programOutput( + cfg, + r""" + #include <android/api-level.h> + #include <stdio.h> + int main(int, char**) { + printf("%d\n", android_get_device_api_level()); + return 0; + } + """, + ) + ) + +# Add features representing the target platform name: darwin, linux, windows, etc... +features = [ + Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), + Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), + Feature( + name="windows-dll", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and sourceBuilds( + cfg, + """ + #include <iostream> + int main(int, char**) { return 0; } + """, + ) + and programSucceeds( + cfg, + """ + #include <iostream> + #include <windows.h> + #include <winnt.h> + int main(int, char**) { + // Get a pointer to a data member that gets linked from the C++ + // library. This must be a data member (functions can get + // thunk inside the calling executable), and must not be + // something that is defined inline in headers. + void *ptr = &std::cout; + // Get a handle to the current main executable. + void *exe = GetModuleHandle(NULL); + // The handle points at the PE image header. Navigate through + // the header structure to find the size of the PE image (the + // executable). + PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; + PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); + PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; + void *exeend = (BYTE*)exe + peheader->SizeOfImage; + // Check if the tested pointer - the data symbol from the + // C++ library - is located within the exe. + if (ptr >= exe && ptr <= exeend) + return 1; + // Return success if it was outside of the executable, i.e. + // loaded from a DLL. 
+ return 0; + } + """, + ), + actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], + ), + Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), + Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), + Feature( + name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-ANDROID-FIXME", + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), + Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), + Feature( + name="LIBCXX-FREEBSD-FIXME", + when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-PICOLIBC-FIXME", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include <string.h> + #ifndef __PICOLIBC__ + #error not picolibc + #endif + int main(int, char**) { return 0; } + """, + ), + ), + Feature( + name="LIBCXX-AMDGPU-FIXME", + when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-NVPTX-FIXME", + when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), + ), +] + +# Add features representing the build host platform name. +# The build host could differ from the target platform for cross-compilation. +features += [ + Feature(name="buildhost={}".format(sys.platform.lower().strip())), + # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. + # We define a consolidated feature on a few platforms. + Feature( + name="buildhost=windows", + when=lambda cfg: platform.system().lower().startswith("windows"), + ), + Feature( + name="buildhost=freebsd", + when=lambda cfg: platform.system().lower().startswith("freebsd"), + ), + Feature( + name="buildhost=aix", + when=lambda cfg: platform.system().lower().startswith("aix"), + ), +] diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index c02d6df1c47a4..299aa28777fd5 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -11,7 +11,7 @@ from pathlib import Path from libcxx.test.dsl import * -from libcxx.test.features import _isClang, _isAppleClang, _isGCC, _isMSVC +from libcxx.test.features.compiler import _isClang, _isAppleClang, _isGCC, _isMSVC _warningFlags = [ diff --git a/libcxxabi/include/__cxxabi_config.h b/libcxxabi/include/__cxxabi_config.h index f5101dbc9e599..e4fd845b1fb35 100644 --- a/libcxxabi/include/__cxxabi_config.h +++ b/libcxxabi/include/__cxxabi_config.h @@ -14,10 +14,6 @@ #define _LIBCXXABI_ARM_EHABI #endif -#if !defined(__has_attribute) -#define __has_attribute(_attribute_) 0 -#endif - #if defined(__clang__) # define _LIBCXXABI_COMPILER_CLANG # ifndef __apple_build_version__ @@ -25,10 +21,6 @@ # endif #elif defined(__GNUC__) # define _LIBCXXABI_COMPILER_GCC -#elif defined(_MSC_VER) -# define _LIBCXXABI_COMPILER_MSVC -#elif defined(__IBMCPP__) -# define _LIBCXXABI_COMPILER_IBM #endif #if defined(_WIN32) @@ -66,17 +58,7 @@ #endif #endif -#if defined(_LIBCXXABI_COMPILER_MSVC) -#define _LIBCXXABI_WEAK -#else #define _LIBCXXABI_WEAK __attribute__((__weak__)) -#endif - -#if defined(__clang__) -#define _LIBCXXABI_COMPILER_CLANG -#elif defined(__GNUC__) -#define _LIBCXXABI_COMPILER_GCC -#endif #if __has_attribute(__no_sanitize__) && defined(_LIBCXXABI_COMPILER_CLANG) #define _LIBCXXABI_NO_CFI __attribute__((__no_sanitize__("cfi"))) @@ -89,11 +71,7 @@ # define _LIBCXXABI_GUARD_ABI_ARM #endif -#if 
defined(_LIBCXXABI_COMPILER_CLANG) -# if !__has_feature(cxx_exceptions) -# define _LIBCXXABI_NO_EXCEPTIONS -# endif -#elif defined(_LIBCXXABI_COMPILER_GCC) && !defined(__EXCEPTIONS) +#if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L # define _LIBCXXABI_NO_EXCEPTIONS #endif diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 6f27da7b9cadf..b999438ff2ca8 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -1366,7 +1366,7 @@ class TemplateTemplateParamDecl final : public Node { template <typename Fn> void match(Fn F) const { F(Name, Params, Requires); } void printLeft(OutputBuffer &OB) const override { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "template<"; Params.printWithComma(OB); OB += "> typename "; @@ -1550,7 +1550,7 @@ class TemplateArgs final : public Node { NodeArray getParams() { return Params; } void printLeft(OutputBuffer &OB) const override { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; Params.printWithComma(OB); OB += ">"; @@ -1824,7 +1824,7 @@ class ClosureTypeName : public Node { void printDeclarator(OutputBuffer &OB) const { if (!TemplateParams.empty()) { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; TemplateParams.printWithComma(OB); OB += ">"; @@ -1885,7 +1885,9 @@ class BinaryExpr : public Node { } void printLeft(OutputBuffer &OB) const override { - bool ParenAll = OB.isGtInsideTemplateArgs() && + // If we're printing a '<' inside of a template argument, and we haven't + // yet parenthesized the expression, do so now. + bool ParenAll = !OB.isInParensInTemplateArgs() && (InfixOperator == ">" || InfixOperator == ">>"); if (ParenAll) OB.printOpen(); @@ -2061,7 +2063,7 @@ class CastExpr : public Node { void printLeft(OutputBuffer &OB) const override { OB += CastKind; { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; OB.printLeft(*To); OB += ">"; diff --git a/libcxxabi/src/demangle/Utility.h b/libcxxabi/src/demangle/Utility.h index 8829f3fa13a93..df5b54dca492d 100644 --- a/libcxxabi/src/demangle/Utility.h +++ b/libcxxabi/src/demangle/Utility.h @@ -81,7 +81,7 @@ class OutputBuffer { OutputBuffer(const OutputBuffer &) = delete; OutputBuffer &operator=(const OutputBuffer &) = delete; - virtual ~OutputBuffer() {} + virtual ~OutputBuffer() = default; operator std::string_view() const { return std::string_view(Buffer, CurrentPosition); @@ -104,18 +104,32 @@ class OutputBuffer { unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max(); unsigned CurrentPackMax = std::numeric_limits<unsigned>::max(); - /// When zero, we're printing template args and '>' needs to be parenthesized. - /// Use a counter so we can simply increment inside parentheses. - unsigned GtIsGt = 1; + struct { + /// The depth of '(' and ')' inside the currently printed template + /// arguments. + unsigned ParenDepth = 0; - bool isGtInsideTemplateArgs() const { return GtIsGt == 0; } + /// True if we're currently printing a template argument. + bool InsideTemplate = false; + } TemplateTracker; + + /// Returns true if we're currently between a '(' and ')' when printing + /// template args. 
+ bool isInParensInTemplateArgs() const { + return TemplateTracker.ParenDepth > 0; + } + + /// Returns true if we're printing template args. + bool isInsideTemplateArgs() const { return TemplateTracker.InsideTemplate; } void printOpen(char Open = '(') { - GtIsGt++; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth++; *this += Open; } void printClose(char Close = ')') { - GtIsGt--; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth--; *this += Close; } diff --git a/libcxxabi/src/demangle/cp-to-llvm.sh b/libcxxabi/src/demangle/cp-to-llvm.sh index f773dff9f0a8b..9c1db6fec29a6 100755 --- a/libcxxabi/src/demangle/cp-to-llvm.sh +++ b/libcxxabi/src/demangle/cp-to-llvm.sh @@ -42,6 +42,7 @@ copy_files() { chmod -w $dst/README.txt for I in $hdrs ; do + echo "Copying ${src}/$I to ${dst}/$I" rm -f $dst/$I dash=$(echo "$I---------------------------" | cut -c -27 |\ sed 's|[^-]*||') @@ -53,6 +54,6 @@ copy_files() { } if [[ $ANSWER =~ ^[Yy]$ ]]; then - copy_files . $LLVM_DEMANGLE_DIR $HDRS - copy_files ../../test $LLVM_TESTING_DIR $TEST_HDRS + copy_files . $LLVM_DEMANGLE_DIR "${HDRS}" + copy_files ../../test $LLVM_TESTING_DIR "${TEST_HDRS}" fi diff --git a/libcxxabi/test/namespace.compile.pass.cpp b/libcxxabi/test/namespace.compile.pass.cpp new file mode 100644 index 0000000000000..076c75c635718 --- /dev/null +++ b/libcxxabi/test/namespace.compile.pass.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <cxxabi.h> + +// Make sure the `abi` namespace already exists +namespace abi_should_exist = abi; + +// Make sure `abi` is an alias for `__cxxabiv1` +namespace abi = __cxxabiv1; diff --git a/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s b/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s index e18134cf88639..7f1da22971223 100644 --- a/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s +++ b/libcxxabi/test/native/x86_64/lpstart-zero.pass.sh.s @@ -23,6 +23,7 @@ ## The exception table is modified to use udata4 encoding for LPStart and ## sdata4 encoding for call sites. + .att_syntax .text .globl main # -- Begin function main .p2align 4, 0x90 diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index 858347bedce15..6790d7074a8b7 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -13,6 +13,10 @@ // dd8b266ef. // UNSUPPORTED: using-built-library-before-llvm-20 +// This test exercises support for BitInt demangling introduced in +// 20f56d140909a01c74e9981835373eaab6021af9. +// UNSUPPORTED: using-built-library-before-llvm-21 + // XFAIL: win32-broken-printf-a-precision #include "support/timer.h" diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp index e97732006e110..cace88a309d0b 100644 --- a/libcxxabi/test/uncaught_exception.pass.cpp +++ b/libcxxabi/test/uncaught_exception.pass.cpp @@ -15,8 +15,7 @@ // to undefined symbols when linking against a libc++ that re-exports the symbols, // but running against a libc++ that doesn't. Fortunately, usage of __cxa_uncaught_exception() // in the wild seems to be close to non-existent. 
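To illustrate the `TemplateTracker` change in the `Utility.h` hunk above: the sketch below (not the demangler's real code; `Printer` and all of its members are invented for illustration) models the rule the new state implements. Inside template arguments, a bare `>` could be mistaken for the end of the argument list, so a greater-than expression is parenthesized -- unless the printer is already nested inside `(` `)`, where `>` is unambiguous.

```
// Minimal model of OutputBuffer's TemplateTracker; all names are invented.
#include <iostream>
#include <string>

struct Printer {
  std::string out;
  bool insideTemplate = false; // like TemplateTracker.InsideTemplate
  unsigned parenDepth = 0;     // like TemplateTracker.ParenDepth

  void open() {
    if (insideTemplate)
      ++parenDepth;
    out += '(';
  }
  void close() {
    if (insideTemplate)
      --parenDepth;
    out += ')';
  }

  // Parenthesize "lhs > rhs" only when the '>' could be read as closing the
  // surrounding template argument list.
  void printGreater(const std::string &lhs, const std::string &rhs) {
    bool parenthesize = insideTemplate && parenDepth == 0;
    if (parenthesize)
      open();
    out += lhs + " > " + rhs;
    if (parenthesize)
      close();
  }
};

int main() {
  Printer p;
  p.insideTemplate = true; // entering "A<...>"
  p.out += "A<";
  p.printGreater("x", "y"); // emits "(x > y)" because parenDepth is 0
  p.out += ">";
  std::cout << p.out << "\n"; // prints: A<(x > y)>
  return 0;
}
```

The old `GtIsGt` counter encoded both facts in a single integer; splitting it into an explicit `InsideTemplate` flag plus a paren depth makes the two things the printer tracks explicit and harder to misuse.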
-// TODO: Remove && !darwin once availability markup for LLVM 19 on macOS has been added -// XFAIL: using-built-library-before-llvm-19 && !darwin +// XFAIL: using-built-library-before-llvm-19 #include <cxxabi.h> #include <cassert> diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index 5f4b0902d522c..97edff0b87ea3 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -332,6 +332,10 @@ if (C_SUPPORTS_COMMENT_LIB_PRAGMA) endif() endif() +if (RUNTIMES_EXECUTE_ONLY_CODE) + add_compile_definitions(_LIBUNWIND_EXECUTE_ONLY_CODE) +endif() + #=============================================================================== # Setup Source Code #=============================================================================== diff --git a/libunwind/include/libunwind.h b/libunwind/include/libunwind.h index 18684ce311f95..56ca7110274a3 100644 --- a/libunwind/include/libunwind.h +++ b/libunwind/include/libunwind.h @@ -234,6 +234,7 @@ extern int unw_is_fpreg(unw_cursor_t *, unw_regnum_t) LIBUNWIND_AVAIL; extern int unw_is_signal_frame(unw_cursor_t *) LIBUNWIND_AVAIL; extern int unw_get_proc_name(unw_cursor_t *, char *, size_t, unw_word_t *) LIBUNWIND_AVAIL; //extern int unw_get_save_loc(unw_cursor_t*, int, unw_save_loc_t*); +extern const char *unw_strerror(int) LIBUNWIND_AVAIL; extern unw_addr_space_t unw_local_addr_space; diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp index 5a5b57835379a..28649fafb23d5 100644 --- a/libunwind/src/Registers.hpp +++ b/libunwind/src/Registers.hpp @@ -20,6 +20,11 @@ #include "libunwind_ext.h" #include "shadow_stack_unwind.h" +#if __has_include(<sys/auxv.h>) +#include <sys/auxv.h> +#define HAVE_SYS_AUXV_H +#endif + namespace libunwind { // For emulating 128-bit registers @@ -1827,7 +1832,9 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) { /// Registers_arm64 holds the register state of a thread in a 64-bit arm /// process. 
class _LIBUNWIND_HIDDEN Registers_arm64; -extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); +extern "C" int64_t __libunwind_Registers_arm64_za_disable(); +extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *, + unsigned walkedFrames); #if defined(_LIBUNWIND_USE_GCS) extern "C" void *__libunwind_shstk_get_jump_target() { @@ -1837,7 +1844,7 @@ extern "C" void *__libunwind_shstk_get_jump_target() { class _LIBUNWIND_HIDDEN Registers_arm64 { public: - Registers_arm64(); + Registers_arm64() = default; Registers_arm64(const void *registers); Registers_arm64(const Registers_arm64 &); Registers_arm64 &operator=(const Registers_arm64 &); @@ -1855,7 +1862,17 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto() { __libunwind_Registers_arm64_jumpto(this); } +#ifdef _LIBUNWIND_TRACE_RET_INJECT + _LIBUNWIND_TRACE_NO_INLINE + void returnto(unsigned walkedFrames) { + __libunwind_Registers_arm64_jumpto(this, walkedFrames); + } +#else + void jumpto() { + zaDisable(); + __libunwind_Registers_arm64_jumpto(this, 0); + } +#endif static constexpr int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64; } @@ -1908,25 +1925,43 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { private: uint64_t lazyGetVG() const; + void zaDisable() const { + if (!_misc_registers.__has_sme) + return; + if (__libunwind_Registers_arm64_za_disable() != 0) + _LIBUNWIND_ABORT("SME ZA disable failed"); + } + + static bool checkHasSME() { +#if defined(HAVE_SYS_AUXV_H) + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +#endif + // TODO: Support other platforms. + return false; + } + struct GPRs { - uint64_t __x[29]; // x0-x28 - uint64_t __fp; // Frame pointer x29 - uint64_t __lr; // Link register x30 - uint64_t __sp; // Stack pointer x31 - uint64_t __pc; // Program counter - uint64_t __ra_sign_state; // RA sign state register + uint64_t __x[29] = {}; // x0-x28 + uint64_t __fp = 0; // Frame pointer x29 + uint64_t __lr = 0; // Link register x30 + uint64_t __sp = 0; // Stack pointer x31 + uint64_t __pc = 0; // Program counter + uint64_t __ra_sign_state = 0; // RA sign state register }; struct Misc { - mutable uint64_t __vg = 0; // Vector Granule + mutable uint32_t __vg = 0; // Vector Granule + bool __has_sme = checkHasSME(); }; - GPRs _registers; + GPRs _registers = {}; // Currently only the lower double in 128-bit vectore registers // is perserved during unwinding. We could define new register // numbers (> 96) which mean whole vector registers, then this // struct would need to change to contain whole vector registers. - double _vectorHalfRegisters[32]; + double _vectorHalfRegisters[32] = {}; // Miscellaneous/virtual registers. 
These are stored below the GPRs and FPRs // as they do not correspond to physical registers, so do not need to be @@ -1971,10 +2006,6 @@ Registers_arm64::operator=(const Registers_arm64 &other) { return *this; } -inline Registers_arm64::Registers_arm64() { - memset(static_cast<void *>(this), 0, sizeof(*this)); -} - inline bool Registers_arm64::validRegister(int regNum) const { if (regNum == UNW_REG_IP) return true; diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 7ec5f9e91578a..d7348254af07b 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -472,7 +472,9 @@ class _LIBUNWIND_HIDDEN AbstractUnwindCursor { virtual void getInfo(unw_proc_info_t *) { _LIBUNWIND_ABORT("getInfo not implemented"); } - virtual void jumpto() { _LIBUNWIND_ABORT("jumpto not implemented"); } + _LIBUNWIND_TRACE_NO_INLINE virtual void jumpto() { + _LIBUNWIND_ABORT("jumpto not implemented"); + } virtual bool isSignalFrame() { _LIBUNWIND_ABORT("isSignalFrame not implemented"); } @@ -489,6 +491,12 @@ class _LIBUNWIND_HIDDEN AbstractUnwindCursor { virtual void saveVFPAsX() { _LIBUNWIND_ABORT("saveVFPAsX not implemented"); } #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + virtual void setWalkedFrames(unsigned) { + _LIBUNWIND_ABORT("setWalkedFrames not implemented"); + } +#endif + #ifdef _AIX virtual uintptr_t getDataRelBase() { _LIBUNWIND_ABORT("getDataRelBase not implemented"); @@ -965,7 +973,8 @@ class UnwindCursor : public AbstractUnwindCursor{ virtual void setFloatReg(int, unw_fpreg_t); virtual int step(bool stage2 = false); virtual void getInfo(unw_proc_info_t *); - virtual void jumpto(); + _LIBUNWIND_TRACE_NO_INLINE + virtual void jumpto(); virtual bool isSignalFrame(); virtual bool getFunctionName(char *buf, size_t len, unw_word_t *off); virtual void setInfoBasedOnIPRegister(bool isReturnAddress = false); @@ -974,6 +983,10 @@ class UnwindCursor : public AbstractUnwindCursor{ virtual void saveVFPAsX(); #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + virtual void setWalkedFrames(unsigned); +#endif + #ifdef _AIX virtual uintptr_t getDataRelBase(); #endif @@ -1356,6 +1369,9 @@ class UnwindCursor : public AbstractUnwindCursor{ defined(_LIBUNWIND_TARGET_HAIKU) bool _isSigReturn = false; #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + uint32_t _walkedFrames; +#endif }; @@ -1410,7 +1426,46 @@ void UnwindCursor<A, R>::setFloatReg(int regNum, unw_fpreg_t value) { } template <typename A, typename R> void UnwindCursor<A, R>::jumpto() { +#ifdef _LIBUNWIND_TRACE_RET_INJECT + /* + + The value of `_walkedFrames` is computed in `unwind_phase2` and represents the + number of frames walked starting `unwind_phase2` to get to the landing pad. + + ``` + // uc is initialized by __unw_getcontext in the parent frame. + // The first stack frame walked is unwind_phase2. + unsigned framesWalked = 1; + ``` + + To that, we need to add the number of function calls in libunwind between + `unwind_phase2` & `__libunwind_Registers_arm64_jumpto` which performs the long + jump, to rebalance the execution flow. 
+
+  ```
+  frame #0: libunwind.1.dylib`__libunwind_Registers_arm64_jumpto at UnwindRegistersRestore.S:646
+  frame #1: libunwind.1.dylib`libunwind::Registers_arm64::returnto at Registers.hpp:2291:3
+  frame #2: libunwind.1.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::jumpto at UnwindCursor.hpp:1474:14
+  frame #3: libunwind.1.dylib`__unw_resume at libunwind.cpp:375:7
+  frame #4: libunwind.1.dylib`__unw_resume_with_frames_walked at libunwind.cpp:363:10
+  frame #5: libunwind.1.dylib`unwind_phase2 at UnwindLevel1.c:328:9
+  frame #6: libunwind.1.dylib`_Unwind_RaiseException at UnwindLevel1.c:480:10
+  frame #7: libc++abi.dylib`__cxa_throw at cxa_exception.cpp:295:5
+  ...
+  ```
+
+  If we look at the backtrace from `__libunwind_Registers_arm64_jumpto`, we see
+  there are 5 frames on the stack to reach `unwind_phase2`. However, only 4 of
+  them will never return, since `__libunwind_Registers_arm64_jumpto` returns
+  to the landing pad, so we subtract 1 when computing
+  `_EXTRA_LIBUNWIND_FRAMES_WALKED`.
+  */
+
+  static constexpr size_t _EXTRA_LIBUNWIND_FRAMES_WALKED = 5 - 1;
+  _registers.returnto(_walkedFrames + _EXTRA_LIBUNWIND_FRAMES_WALKED);
+#else
   _registers.jumpto();
+#endif
 }

 #ifdef __arm__
@@ -1419,6 +1474,13 @@ template <typename A, typename R> void UnwindCursor<A, R>::saveVFPAsX() {
 }
 #endif

+#ifdef _LIBUNWIND_TRACE_RET_INJECT
+template <typename A, typename R>
+void UnwindCursor<A, R>::setWalkedFrames(unsigned walkedFrames) {
+  _walkedFrames = walkedFrames;
+}
+#endif
+
 #ifdef _AIX
 template <typename A, typename R>
 uintptr_t UnwindCursor<A, R>::getDataRelBase() {
diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index b0cd60dfb9141..73a27928e91d1 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -48,16 +48,15 @@
 // avoided when invoking the `jumpto()` function. To do this, we use inline
 // assemblies to "goto" the `jumpto()` for these architectures.
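The frame accounting described in the `jumpto()` comment above is easy to get off by one, so here is a compilable restatement of it. This is a sketch only: the three unwound user frames below are an invented example, not a number taken from the patch.

```
// Sketch of the ret-injection frame accounting described in jumpto().
#include <cassert>

int main() {
  // unwind_phase2 starts counting at 1 (itself) and adds one frame per
  // unw_step until it reaches the frame with the landing pad. Assume 3 user
  // frames were unwound on the way (an invented number for this example):
  unsigned framesWalked = 1 + 3;

  // Five libunwind frames separate unwind_phase2 from the final jump
  // (__unw_resume_with_frames_walked, __unw_resume, jumpto, returnto,
  // __libunwind_Registers_arm64_jumpto), but the last one effectively
  // "returns" to the landing pad, so only four of them never return:
  const unsigned extraLibunwindFrames = 5 - 1;

  // Number of synthetic `ret`s the assembly stub executes so a hardware
  // trace of calls and returns stays balanced across the unwind:
  unsigned retsToInject = framesWalked + extraLibunwindFrames;
  assert(retsToInject == 8);
  return 0;
}
```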
#if !defined(_LIBUNWIND_USE_CET) && !defined(_LIBUNWIND_USE_GCS) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - (void)fn; \ - __unw_resume((cursor)); \ + __unw_resume_with_frames_walked((cursor), (payload)); \ } while (0) #elif defined(_LIBUNWIND_TARGET_I386) #define __shstk_step_size (4) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("push %%edi\n\t" \ @@ -67,9 +66,9 @@ } while (0) #elif defined(_LIBUNWIND_TARGET_X86_64) #define __shstk_step_size (8) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("jmpq *%%rdx\n\t" ::"D"(shstkRegContext), \ @@ -77,16 +76,17 @@ } while (0) #elif defined(_LIBUNWIND_TARGET_AARCH64) #define __shstk_step_size (8) -#define __unw_phase2_resume(cursor, fn) \ +#define __unw_phase2_resume(cursor, payload) \ do { \ - _LIBUNWIND_POP_SHSTK_SSP((fn)); \ + _LIBUNWIND_POP_SHSTK_SSP((payload)); \ void *shstkRegContext = __libunwind_shstk_get_registers((cursor)); \ void *shstkJumpAddress = __libunwind_shstk_get_jump_target(); \ __asm__ volatile("mov x0, %0\n\t" \ + "mov x1, #0\n\t" \ "br %1\n\t" \ : \ : "r"(shstkRegContext), "r"(shstkJumpAddress) \ - : "x0"); \ + : "x0", "x1"); \ } while (0) #endif @@ -205,6 +205,8 @@ extern int __unw_step_stage2(unw_cursor_t *); #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. __attribute__((target("+gcs"))) +#else +_LIBUNWIND_TRACE_NO_INLINE #endif static _Unwind_Reason_Code unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, @@ -349,6 +351,8 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. 
__attribute__((target("+gcs"))) +#else +_LIBUNWIND_TRACE_NO_INLINE #endif static _Unwind_Reason_Code unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 198735fa800a9..76a80344034f7 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -18,6 +18,8 @@ #if defined(_AIX) .toc +#elif defined(__aarch64__) && defined(__ELF__) && defined(_LIBUNWIND_EXECUTE_ONLY_CODE) + .section .text,"axy",@progbits,unique,0 #else .text #endif @@ -643,13 +645,26 @@ Lnovec: #endif // -// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); +// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *, unsigned); // // On entry: // thread_state pointer is in x0 +// walked_frames counter is in x1 // .p2align 2 DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) + + #if defined(_LIBUNWIND_TRACE_RET_INJECT) + cbz w1, 1f + 0: + subs w1, w1, #1 + adr x16, #8 + ret x16 + + b.ne 0b + 1: + #endif + // skip restore of x0,x1 for now ldp x2, x3, [x0, #0x010] ldp x4, x5, [x0, #0x020] diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index 619a59751151e..f988fd461def1 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -18,6 +18,8 @@ #if defined(_AIX) .toc +#elif defined(__aarch64__) && defined(__ELF__) && defined(_LIBUNWIND_EXECUTE_ONLY_CODE) + .section .text,"axy",@progbits,unique,0 #else .text #endif @@ -827,6 +829,68 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) ret #endif +// +// extern "C" int64_t __libunwind_Registers_arm64_za_disable() +// +// This function implements the requirements of the __arm_za_disable ABI +// routine, except that it will not abort; it will return a non-zero value +// to signify the routine failed. +// +// Note: This function uses SME instructions. It must only be called if SME +// has been confirmed to be available. +// +// On return: +// +// A status is placed in x0. A zero value indicates success; any non-zero +// value indicates failure. +// + .p2align 2 +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_za_disable) + .variant_pcs __libunwind_Registers_arm64_za_disable +#if __has_feature(ptrauth_calls) + pacibsp +#endif + // If TPIDR2_EL0 is null, the subroutine just disables ZA. + .inst 0xd53bd0b0 // mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, return a non-zero value (libunwind will then abort). + ldrh w0, [x16, #10] + cbnz w0, 2f + ldr w0, [x16, #12] + cbnz w0, 2f + + // If num_za_save_slices is zero, the subroutine just disables ZA. + ldrh w0, [x16, #8] + cbz x0, 1f + + // If za_save_buffer is NULL, the subroutine just disables ZA. + ldr x16, [x16] + cbz x16, 1f + + // Store ZA to za_save_buffer. + mov x15, xzr +0: + .inst 0xe1206200 // str za[w15,0], [x16] + .inst 0x04305830 // addsvl x16, x16, #1 + add x15, x15, #1 + cmp x0, x15 + b.ne 0b +1: + // * Set TPIDR2_EL0 to null. + .inst 0xd51bd0bf // msr TPIDR2_EL0, xzr + // * Set PSTATE.ZA to 0. 
+ .inst 0xd503447f // smstop za + // * Return zero (success) + mov x0, xzr +2: +#if __has_feature(ptrauth_calls) + retab +#else + ret +#endif + #elif defined(__arm__) && !defined(__APPLE__) #if !defined(__ARM_ARCH_ISA_ARM) diff --git a/libunwind/src/assembly.h b/libunwind/src/assembly.h index f0fcd006f2073..84c9d526f1d75 100644 --- a/libunwind/src/assembly.h +++ b/libunwind/src/assembly.h @@ -132,6 +132,10 @@ #if defined(__APPLE__) +#if defined(__aarch64__) || defined(__arm64__) || defined(__arm64e__) +#define _LIBUNWIND_TRACE_RET_INJECT 1 +#endif + #define SYMBOL_IS_FUNC(name) #define HIDDEN_SYMBOL(name) .private_extern name #if defined(_LIBUNWIND_HIDE_SYMBOLS) diff --git a/libunwind/src/config.h b/libunwind/src/config.h index deb5a4d4d73d4..f017403fa2234 100644 --- a/libunwind/src/config.h +++ b/libunwind/src/config.h @@ -28,6 +28,9 @@ #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif + #if defined(__aarch64__) || defined(__arm64__) || defined(__arm64e__) + #define _LIBUNWIND_TRACE_RET_INJECT 1 + #endif #elif defined(_WIN32) #ifdef __SEH__ #define _LIBUNWIND_SUPPORT_SEH_UNWIND 1 @@ -61,6 +64,12 @@ #endif #endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT +#define _LIBUNWIND_TRACE_NO_INLINE __attribute__((noinline, disable_tail_calls)) +#else +#define _LIBUNWIND_TRACE_NO_INLINE +#endif + #if defined(_LIBUNWIND_HIDE_SYMBOLS) // The CMake file passes -fvisibility=hidden to control ELF/Mach-O visibility. #define _LIBUNWIND_EXPORT diff --git a/libunwind/src/libunwind.cpp b/libunwind/src/libunwind.cpp index 951d87db868bc..b3036396c379d 100644 --- a/libunwind/src/libunwind.cpp +++ b/libunwind/src/libunwind.cpp @@ -247,7 +247,27 @@ _LIBUNWIND_HIDDEN int __unw_get_proc_info(unw_cursor_t *cursor, } _LIBUNWIND_WEAK_ALIAS(__unw_get_proc_info, unw_get_proc_info) -/// Resume execution at cursor position (aka longjump). +/// Rebalance the execution flow by injecting the right amount of `ret` +/// instruction relatively to the amount of `walkedFrames` then resume execution +/// at cursor position (aka longjump). +_LIBUNWIND_HIDDEN int __unw_resume_with_frames_walked(unw_cursor_t *cursor, + unsigned walkedFrames) { + _LIBUNWIND_TRACE_API("__unw_resume(cursor=%p, walkedFrames=%u)", + static_cast<void *>(cursor), walkedFrames); +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) + // Inform the ASan runtime that now might be a good time to clean stuff up. + __asan_handle_no_return(); +#endif +#ifdef _LIBUNWIND_TRACE_RET_INJECT + AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; + co->setWalkedFrames(walkedFrames); +#endif + return __unw_resume(cursor); +} +_LIBUNWIND_WEAK_ALIAS(__unw_resume_with_frames_walked, + unw_resume_with_frames_walked) + +/// Legacy function. Resume execution at cursor position (aka longjump). 
_LIBUNWIND_HIDDEN int __unw_resume(unw_cursor_t *cursor) { _LIBUNWIND_TRACE_API("__unw_resume(cursor=%p)", static_cast<void *>(cursor)); #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) @@ -389,6 +409,41 @@ void __unw_remove_dynamic_eh_frame_section(unw_word_t eh_frame_start) { } #endif // defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) + +/// Maps the UNW_* error code to a textual representation +_LIBUNWIND_HIDDEN const char *__unw_strerror(int error_code) { + switch (error_code) { + case UNW_ESUCCESS: + return "no error"; + case UNW_EUNSPEC: + return "unspecified (general) error"; + case UNW_ENOMEM: + return "out of memory"; + case UNW_EBADREG: + return "bad register number"; + case UNW_EREADONLYREG: + return "attempt to write read-only register"; + case UNW_ESTOPUNWIND: + return "stop unwinding"; + case UNW_EINVALIDIP: + return "invalid IP"; + case UNW_EBADFRAME: + return "bad frame"; + case UNW_EINVAL: + return "unsupported operation or bad value"; + case UNW_EBADVERSION: + return "unwind info has unsupported version"; + case UNW_ENOINFO: + return "no unwind info found"; +#if defined(_LIBUNWIND_TARGET_AARCH64) && !defined(_LIBUNWIND_IS_NATIVE_ONLY) + case UNW_ECROSSRASIGNING: + return "cross unwind with return address signing"; +#endif + } + return "invalid error code"; +} +_LIBUNWIND_WEAK_ALIAS(__unw_strerror, unw_strerror) + #endif // !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) #ifdef __APPLE__ diff --git a/libunwind/src/libunwind_ext.h b/libunwind/src/libunwind_ext.h index 28db43a4f6eef..f5da90d7bd3b7 100644 --- a/libunwind/src/libunwind_ext.h +++ b/libunwind/src/libunwind_ext.h @@ -30,7 +30,11 @@ extern int __unw_get_reg(unw_cursor_t *, unw_regnum_t, unw_word_t *); extern int __unw_get_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t *); extern int __unw_set_reg(unw_cursor_t *, unw_regnum_t, unw_word_t); extern int __unw_set_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t); -extern int __unw_resume(unw_cursor_t *); +_LIBUNWIND_TRACE_NO_INLINE + extern int __unw_resume_with_frames_walked(unw_cursor_t *, unsigned); +// `__unw_resume` is a legacy function. Use `__unw_resume_with_frames_walked` instead. +_LIBUNWIND_TRACE_NO_INLINE + extern int __unw_resume(unw_cursor_t *); #ifdef __arm__ /* Save VFP registers in FSTMX format (instead of FSTMD). */ @@ -42,6 +46,7 @@ extern int __unw_get_proc_info(unw_cursor_t *, unw_proc_info_t *); extern int __unw_is_fpreg(unw_cursor_t *, unw_regnum_t); extern int __unw_is_signal_frame(unw_cursor_t *); extern int __unw_get_proc_name(unw_cursor_t *, char *, size_t, unw_word_t *); +extern const char *__unw_strerror(int); #if defined(_AIX) extern uintptr_t __unw_get_data_rel_base(unw_cursor_t *); diff --git a/libunwind/test/aarch64_za_unwind.pass.cpp b/libunwind/test/aarch64_za_unwind.pass.cpp new file mode 100644 index 0000000000000..2985bb8d298de --- /dev/null +++ b/libunwind/test/aarch64_za_unwind.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: linux && target={{aarch64-.+}} + +#include <libunwind.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/auxv.h> + +// Basic test of unwinding with SME lazy saves. 
This tests libunwind disables ZA +// (and commits a lazy save of ZA) before resuming from unwinding. + +// Note: This test requires SME (and is setup to pass on targets without SME). + +static bool checkHasSME() { + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +} + +struct TPIDR2Block { + void *za_save_buffer; + uint64_t num_save_slices; +}; + +__attribute__((noinline)) void private_za() { + // Note: Lazy save active on entry to function. + unw_context_t context; + unw_cursor_t cursor; + + unw_getcontext(&context); + unw_init_local(&cursor, &context); + unw_step(&cursor); + unw_resume(&cursor); +} + +bool isZAOn() { + register uint64_t svcr asm("x20"); + asm(".inst 0xd53b4254" : "=r"(svcr)); + return (svcr & 0b10) != 0; +} + +__attribute__((noinline)) void za_function_with_lazy_save() { + register uint64_t tmp asm("x8"); + + // SMSTART ZA (should zero ZA) + asm(".inst 0xd503457f"); + + // RDSVL x8, #1 (read streaming vector length) + asm(".inst 0x04bf5828" : "=r"(tmp)); + + // Allocate and fill ZA save buffer with 0xAA. + size_t buffer_size = tmp * tmp; + uint8_t *za_save_buffer = (uint8_t *)alloca(buffer_size); + memset(za_save_buffer, 0xAA, buffer_size); + + TPIDR2Block block = {za_save_buffer, tmp}; + tmp = reinterpret_cast<uint64_t>(&block); + + // MRS TPIDR2_EL0, x8 (setup lazy save of ZA) + asm(".inst 0xd51bd0a8" ::"r"(tmp)); + + // ZA should be on before unwinding. + if (!isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA not on before call)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA on before call)\n"); + } + + private_za(); + + // ZA should be off after unwinding. + if (isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA on after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA off after unwinding)\n"); + } + + // MRS x8, TPIDR2_EL0 (read TPIDR2_EL0) + asm(".inst 0xd53bd0a8" : "=r"(tmp)); + // ZA should have been saved (TPIDR2_EL0 zero). + if (tmp != 0) { + fprintf(stderr, __FILE__ ": fail (TPIDR2_EL0 non-null after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (TPIDR2_EL0 null after unwinding)\n"); + } + + // ZA (all zero) should have been saved to the buffer. + for (unsigned i = 0; i < buffer_size; ++i) { + if (za_save_buffer[i] != 0) { + fprintf(stderr, + __FILE__ ": fail (za_save_buffer non-zero after unwinding)\n"); + abort(); + } + } + fprintf(stderr, __FILE__ ": pass (za_save_buffer zero'd after unwinding)\n"); +} + +int main(int, char **) { + if (!checkHasSME()) { + fprintf(stderr, __FILE__ ": pass (no SME support)\n"); + return 0; // Pass (SME is required for this test to run). + } + za_function_with_lazy_save(); + return 0; +} diff --git a/libunwind/test/remember_state_leak.pass.sh.s b/libunwind/test/remember_state_leak.pass.sh.s index 63beb7e4701ec..d3335cf82290b 100644 --- a/libunwind/test/remember_state_leak.pass.sh.s +++ b/libunwind/test/remember_state_leak.pass.sh.s @@ -38,6 +38,7 @@ SIZEOF_UNWIND_EXCEPTION = 32 + .att_syntax .text callback: xorl %eax, %eax diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 91a673f13d68e..6c4290ff1e448 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -472,7 +472,7 @@ bool ARM::inBranchRange(RelType type, uint64_t src, uint64_t dst) const { // Bit 0 == 1 denotes Thumb state, it is not part of the range. 
dst &= ~0x1; - int64_t offset = dst - src; + int64_t offset = llvm::SignExtend64<32>(dst - src); switch (type) { case R_ARM_PC24: case R_ARM_PLT32: diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index e52d3a0e11113..8647752be31fe 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -156,23 +156,23 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(Ctx &ctx, std::pair<ELFKind, uint16_t> ret = StringSwitch<std::pair<ELFKind, uint16_t>>(s) - .Cases("aarch64elf", "aarch64linux", {ELF64LEKind, EM_AARCH64}) - .Cases("aarch64elfb", "aarch64linuxb", {ELF64BEKind, EM_AARCH64}) - .Cases("armelf", "armelf_linux_eabi", {ELF32LEKind, EM_ARM}) - .Cases("armelfb", "armelfb_linux_eabi", {ELF32BEKind, EM_ARM}) + .Cases({"aarch64elf", "aarch64linux"}, {ELF64LEKind, EM_AARCH64}) + .Cases({"aarch64elfb", "aarch64linuxb"}, {ELF64BEKind, EM_AARCH64}) + .Cases({"armelf", "armelf_linux_eabi"}, {ELF32LEKind, EM_ARM}) + .Cases({"armelfb", "armelfb_linux_eabi"}, {ELF32BEKind, EM_ARM}) .Case("elf32_x86_64", {ELF32LEKind, EM_X86_64}) - .Cases("elf32btsmip", "elf32btsmipn32", {ELF32BEKind, EM_MIPS}) - .Cases("elf32ltsmip", "elf32ltsmipn32", {ELF32LEKind, EM_MIPS}) + .Cases({"elf32btsmip", "elf32btsmipn32"}, {ELF32BEKind, EM_MIPS}) + .Cases({"elf32ltsmip", "elf32ltsmipn32"}, {ELF32LEKind, EM_MIPS}) .Case("elf32lriscv", {ELF32LEKind, EM_RISCV}) - .Cases("elf32ppc", "elf32ppclinux", {ELF32BEKind, EM_PPC}) - .Cases("elf32lppc", "elf32lppclinux", {ELF32LEKind, EM_PPC}) + .Cases({"elf32ppc", "elf32ppclinux"}, {ELF32BEKind, EM_PPC}) + .Cases({"elf32lppc", "elf32lppclinux"}, {ELF32LEKind, EM_PPC}) .Case("elf32loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64btsmip", {ELF64BEKind, EM_MIPS}) .Case("elf64ltsmip", {ELF64LEKind, EM_MIPS}) .Case("elf64lriscv", {ELF64LEKind, EM_RISCV}) .Case("elf64ppc", {ELF64BEKind, EM_PPC64}) .Case("elf64lppc", {ELF64LEKind, EM_PPC64}) - .Cases("elf_amd64", "elf_x86_64", {ELF64LEKind, EM_X86_64}) + .Cases({"elf_amd64", "elf_x86_64"}, {ELF64LEKind, EM_X86_64}) .Case("elf_i386", {ELF32LEKind, EM_386}) .Case("elf_iamcu", {ELF32LEKind, EM_IAMCU}) .Case("elf64_sparc", {ELF64BEKind, EM_SPARCV9}) diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 4b9c941eb9d69..b61dc647401a3 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -450,7 +450,7 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) { .Case("elf64-powerpc", {ELF64BEKind, EM_PPC64}) .Case("elf64-powerpcle", {ELF64LEKind, EM_PPC64}) .Case("elf64-x86-64", {ELF64LEKind, EM_X86_64}) - .Cases("elf32-tradbigmips", "elf32-bigmips", {ELF32BEKind, EM_MIPS}) + .Cases({"elf32-tradbigmips", "elf32-bigmips"}, {ELF32BEKind, EM_MIPS}) .Case("elf32-ntradbigmips", {ELF32BEKind, EM_MIPS}) .Case("elf32-tradlittlemips", {ELF32LEKind, EM_MIPS}) .Case("elf32-ntradlittlemips", {ELF32LEKind, EM_MIPS}) @@ -463,7 +463,8 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) { .Case("elf32-loongarch", {ELF32LEKind, EM_LOONGARCH}) .Case("elf64-loongarch", {ELF64LEKind, EM_LOONGARCH}) .Case("elf64-s390", {ELF64BEKind, EM_S390}) - .Cases("elf32-hexagon", "elf32-littlehexagon", {ELF32LEKind, EM_HEXAGON}) + .Cases({"elf32-hexagon", "elf32-littlehexagon"}, + {ELF32LEKind, EM_HEXAGON}) .Default({ELFNoneKind, EM_NONE}); } @@ -745,7 +746,7 @@ StringMatcher ScriptParser::readFilePatterns() { SortSectionPolicy ScriptParser::peekSortKind() { return StringSwitch<SortSectionPolicy>(peek()) .Case("REVERSE", SortSectionPolicy::Reverse) - .Cases("SORT", "SORT_BY_NAME", SortSectionPolicy::Name) + 
.Cases({"SORT", "SORT_BY_NAME"}, SortSectionPolicy::Name) .Case("SORT_BY_ALIGNMENT", SortSectionPolicy::Alignment) .Case("SORT_BY_INIT_PRIORITY", SortSectionPolicy::Priority) .Case("SORT_NONE", SortSectionPolicy::None) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index a4150ebfa1653..9a70c0d19c41d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -54,8 +54,6 @@ using llvm::support::endian::read32le; using llvm::support::endian::write32le; using llvm::support::endian::write64le; -constexpr size_t MergeNoTailSection::numShards; - static uint64_t readUint(Ctx &ctx, uint8_t *buf) { return ctx.arg.is64 ? read64(ctx, buf) : read32(ctx, buf); } diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index a7c4b452f990b..111c4d9846d28 100644 --- a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -104,7 +104,7 @@ int64_t X86_64::getEmbeddedAddend(MemoryBufferRef mb, uint64_t offset, void X86_64::relocateOne(uint8_t *loc, const Reloc &r, uint64_t value, uint64_t relocVA) const { if (r.pcrel) { - uint64_t pc = relocVA + (1 << r.length) + pcrelOffset(r.type); + uint64_t pc = relocVA + (1ull << r.length) + pcrelOffset(r.type); value -= pc; } diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index d50abc22fc6c1..328c33e6cfb65 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -118,6 +118,10 @@ DenseMap<const InputSection *, int> lld::macho::runBalancedPartitioning( auto *isec = subsec.isec; if (!isec || isec->data.empty() || !isec->data.data()) continue; + // CString section order is handled by + // {Deduplicated}CStringSection::finalizeContents() + if (isa<CStringInputSection>(isec) || isec->isFinal) + continue; // ConcatInputSections are entirely live or dead, so the offset is // irrelevant. 
if (isa<ConcatInputSection>(isec) && !isec->isLive(0)) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 9b67db9fa55cf..32b20993af67c 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -841,18 +841,18 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) { // TODO(compnerd) see if we can generate this case list via XMACROS platformVersion.platform = StringSwitch<PlatformType>(lowerDash(platformStr)) - .Cases("macos", "1", PLATFORM_MACOS) - .Cases("ios", "2", PLATFORM_IOS) - .Cases("tvos", "3", PLATFORM_TVOS) - .Cases("watchos", "4", PLATFORM_WATCHOS) - .Cases("bridgeos", "5", PLATFORM_BRIDGEOS) - .Cases("mac-catalyst", "6", PLATFORM_MACCATALYST) - .Cases("ios-simulator", "7", PLATFORM_IOSSIMULATOR) - .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR) - .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR) - .Cases("driverkit", "10", PLATFORM_DRIVERKIT) - .Cases("xros", "11", PLATFORM_XROS) - .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR) + .Cases({"macos", "1"}, PLATFORM_MACOS) + .Cases({"ios", "2"}, PLATFORM_IOS) + .Cases({"tvos", "3"}, PLATFORM_TVOS) + .Cases({"watchos", "4"}, PLATFORM_WATCHOS) + .Cases({"bridgeos", "5"}, PLATFORM_BRIDGEOS) + .Cases({"mac-catalyst", "6"}, PLATFORM_MACCATALYST) + .Cases({"ios-simulator", "7"}, PLATFORM_IOSSIMULATOR) + .Cases({"tvos-simulator", "8"}, PLATFORM_TVOSSIMULATOR) + .Cases({"watchos-simulator", "9"}, PLATFORM_WATCHOSSIMULATOR) + .Cases({"driverkit", "10"}, PLATFORM_DRIVERKIT) + .Cases({"xros", "11"}, PLATFORM_XROS) + .Cases({"xros-simulator", "12"}, PLATFORM_XROS_SIMULATOR) .Default(PLATFORM_UNKNOWN); if (platformVersion.platform == PLATFORM_UNKNOWN) error(Twine("malformed platform: ") + platformStr); @@ -948,7 +948,7 @@ getUndefinedSymbolTreatment(const ArgList &args) { StringRef treatmentStr = args.getLastArgValue(OPT_undefined); auto treatment = StringSwitch<UndefinedSymbolTreatment>(treatmentStr) - .Cases("error", "", UndefinedSymbolTreatment::error) + .Cases({"error", ""}, UndefinedSymbolTreatment::error) .Case("warning", UndefinedSymbolTreatment::warning) .Case("suppress", UndefinedSymbolTreatment::suppress) .Case("dynamic_lookup", UndefinedSymbolTreatment::dynamic_lookup) @@ -972,7 +972,7 @@ getUndefinedSymbolTreatment(const ArgList &args) { static ICFLevel getICFLevel(const ArgList &args) { StringRef icfLevelStr = args.getLastArgValue(OPT_icf_eq); auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr) - .Cases("none", "", ICFLevel::none) + .Cases({"none", ""}, ICFLevel::none) .Case("safe", ICFLevel::safe) .Case("safe_thunks", ICFLevel::safe_thunks) .Case("all", ICFLevel::all) diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index 7b31378c3781e..e0fc89782a419 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -173,14 +173,37 @@ bool ICF::equalsConstant(const ConcatInputSection *ia, // a valid offset in the literal section. return isecA->getOffset(valueA) == isecB->getOffset(valueB) && ra.addend == rb.addend; - else { - assert(valueA == 0 && valueB == 0); - // For section relocs, we compare the content at the section offset. - return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend); - } + assert(valueA == 0 && valueB == 0); + // For section relocs, we compare the content at the section offset. 
+ return isecA->getOffset(ra.addend) == isecB->getOffset(rb.addend); }; - return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), - f); + if (!llvm::equal(ia->relocs, ib->relocs, f)) + return false; + + // Check unwind info structural compatibility: if there are symbols with + // associated unwind info, check that both sections have compatible symbol + // layouts. For simplicity, we only attempt folding when all symbols are at + // offset zero within the section (which is typically the case with + // .subsections_via_symbols.) + auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; }; + const auto *itA = llvm::find_if(ia->symbols, hasUnwind); + const auto *itB = llvm::find_if(ib->symbols, hasUnwind); + if (itA == ia->symbols.end()) + return itB == ib->symbols.end(); + if (itB == ib->symbols.end()) + return false; + const Defined *da = *itA; + const Defined *db = *itB; + if (da->value != 0 || db->value != 0) + return false; + auto isZero = [](Defined *d) { return d->value == 0; }; + // Since symbols are stored in order of value, and since we have already + // checked that da/db have value zero, we just need to do the isZero check on + // the subsequent symbols. + return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) == + ia->symbols.end() && + std::find_if_not(std::next(itB), ib->symbols.end(), isZero) == + ib->symbols.end(); } // Compare the "moving" parts of two ConcatInputSections -- i.e. everything not @@ -217,31 +240,19 @@ bool ICF::equalsVariable(const ConcatInputSection *ia, } return isecA->icfEqClass[icfPass % 2] == isecB->icfEqClass[icfPass % 2]; }; - if (!std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), f)) + if (!llvm::equal(ia->relocs, ib->relocs, f)) return false; - // If there are symbols with associated unwind info, check that the unwind - // info matches. For simplicity, we only handle the case where there are only - // symbols at offset zero within the section (which is typically the case with - // .subsections_via_symbols.) + // Compare unwind info equivalence classes. auto hasUnwind = [](Defined *d) { return d->unwindEntry() != nullptr; }; const auto *itA = llvm::find_if(ia->symbols, hasUnwind); - const auto *itB = llvm::find_if(ib->symbols, hasUnwind); if (itA == ia->symbols.end()) - return itB == ib->symbols.end(); - if (itB == ib->symbols.end()) - return false; + return true; const Defined *da = *itA; - const Defined *db = *itB; - if (da->unwindEntry()->icfEqClass[icfPass % 2] != - db->unwindEntry()->icfEqClass[icfPass % 2] || - da->value != 0 || db->value != 0) - return false; - auto isZero = [](Defined *d) { return d->value == 0; }; - return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) == - ia->symbols.end() && - std::find_if_not(std::next(itB), ib->symbols.end(), isZero) == - ib->symbols.end(); + // equalsConstant() guarantees that both sections have unwind info. 
+ const Defined *db = *llvm::find_if(ib->symbols, hasUnwind); + return da->unwindEntry()->icfEqClass[icfPass % 2] == + db->unwindEntry()->icfEqClass[icfPass % 2]; } // Find the first InputSection after BEGIN whose equivalence class differs diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 20e4a1d755229..d0128d03a9eab 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -808,6 +808,17 @@ void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders, continue; if ((sym.n_type & N_TYPE) == N_SECT) { + if (sym.n_sect == 0) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [0]"); + } + if (sym.n_sect > sections.size()) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [" + + Twine(static_cast<unsigned>(sym.n_sect)) + + "] greater than the total number of sections [" + + Twine(sections.size()) + "]"); + } Subsections &subsections = sections[sym.n_sect - 1]->subsections; // parseSections() may have chosen not to parse this section. if (subsections.empty()) diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index b173e14cc86a8..2b2d28ef63e2d 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -348,6 +348,9 @@ WordLiteralInputSection::WordLiteralInputSection(const Section §ion, } uint64_t WordLiteralInputSection::getOffset(uint64_t off) const { + if (off >= data.size()) + fatal(toString(this) + ": offset is outside the section"); + auto *osec = cast<WordLiteralSection>(parent); const uintptr_t buf = reinterpret_cast<uintptr_t>(data.data()); switch (sectionType(getFlags())) { diff --git a/lld/MachO/Sections.cpp b/lld/MachO/Sections.cpp index a27d902c0a227..47169c7e14ed0 100644 --- a/lld/MachO/Sections.cpp +++ b/lld/MachO/Sections.cpp @@ -27,7 +27,7 @@ bool isCodeSection(StringRef name, StringRef segName, uint32_t flags) { if (segName == segment_names::text) return StringSwitch<bool>(name) - .Cases(section_names::textCoalNt, section_names::staticInit, true) + .Cases({section_names::textCoalNt, section_names::staticInit}, true) .Default(false); return false; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 29db1cdf9e9c4..60db612e33956 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -52,5 +52,8 @@ MachO Improvements WebAssembly Improvements ------------------------ +* The ``--stack-first`` flag is now enabled by default. The old + behavior can be enabled using ``--no-stack-first``. + Fixes ##### diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index bb1a53ad1112a..cfdde0a6c2299 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -719,6 +719,19 @@ a given pattern are handled as if they were given as arguments of Creates a separate output section for every orphan input section. .It Fl -unresolved-symbols Ns = Ns Ar value Determine how to handle unresolved symbols. +.Ar value +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm report-all +Report unresolved symbols (default). +.It Cm ignore-in-object-files +Only report unresolved symbols contained in shared libraries. +.It Cm ignore-in-shared-libs +Only report unresolved symbols contained in object files. +.It Cm ignore-all +Do not report unresolved symbols. +.El .It Fl -use-android-relr-tags Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*. 
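Stepping back to the `ICF.cpp` refactor above: the symbol-layout test that moved into `equalsConstant()` is compact but subtle, so here is a stand-alone sketch of it. The `Sym` type is invented for illustration; lld's real code works on `Defined *` symbols and `unwindEntry()`.

```
// Sketch of the "foldable unwind layout" check from equalsConstant().
#include <algorithm>
#include <vector>

struct Sym {
  unsigned value; // offset of the symbol within its section
  bool hasUnwind; // whether the symbol carries an unwind entry
};

static bool unwindLayoutFoldable(const std::vector<Sym> &a,
                                 const std::vector<Sym> &b) {
  auto hasUnwind = [](const Sym &s) { return s.hasUnwind; };
  auto ita = std::find_if(a.begin(), a.end(), hasUnwind);
  auto itb = std::find_if(b.begin(), b.end(), hasUnwind);
  if (ita == a.end())
    return itb == b.end(); // fine if neither section has unwind info
  if (itb == b.end())
    return false;
  // Only attempt folding when the first unwind-bearing symbols sit at
  // offset zero...
  if (ita->value != 0 || itb->value != 0)
    return false;
  // ...and, since symbols are ordered by value, when every later symbol is
  // also at offset zero.
  auto isZero = [](const Sym &s) { return s.value == 0; };
  return std::all_of(std::next(ita), a.end(), isZero) &&
         std::all_of(std::next(itb), b.end(), isZero);
}
```

With that structural check done once in `equalsConstant()`, `equalsVariable()` only has to compare the unwind entries' ICF equivalence classes, which is exactly what the second hunk reduces it to.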
.It Fl v , Fl V diff --git a/lld/test/ELF/aarch64-build-attributes.s b/lld/test/ELF/aarch64-build-attributes.s index f2d542150897e..3d333bf6ccf2f 100644 --- a/lld/test/ELF/aarch64-build-attributes.s +++ b/lld/test/ELF/aarch64-build-attributes.s @@ -1,11 +1,11 @@ // REQUIRES: aarch64 // RUN: rm -rf %t && split-file %s %t && cd %t -// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o %t1.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o %t2.o -// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o %t3.o -// RUN: ld.lld -r %t1.o %t2.o %t3.o -o %t.merged.o -// RUN: llvm-readelf -n %t.merged.o | FileCheck %s --check-prefix=NOTE +// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o 1.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-gcs.s -o 2.o +// RUN: llvm-mc -triple=aarch64 -filetype=obj pauth-bti-pac.s -o 3.o +// RUN: ld.lld -r 1.o 2.o 3.o -o merged.o +// RUN: llvm-readelf -n merged.o | FileCheck %s --check-prefix=NOTE /// This test merges three object files with AArch64 build attributes. /// All contain identical PAuth ABI info (platform/version), which must be preserved. diff --git a/lld/test/ELF/arm-wraparound-veneer.s b/lld/test/ELF/arm-wraparound-veneer.s new file mode 100644 index 0000000000000..74dd6f29d8170 --- /dev/null +++ b/lld/test/ELF/arm-wraparound-veneer.s @@ -0,0 +1,102 @@ +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=armv7-none-eabi code.s -o code.o +// RUN: ld.lld -T unsigned1.ld code.o -o unsigned1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned1.elf | FileCheck %s --check-prefix=UNSIGNED1 +// RUN: ld.lld -T unsigned2.ld code.o -o unsigned2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d unsigned2.elf | FileCheck %s --check-prefix=UNSIGNED2 +// RUN: ld.lld -T signed1.ld code.o -o signed1.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed1.elf | FileCheck %s --check-prefix=SIGNED1 +// RUN: ld.lld -T signed2.ld code.o -o signed2.elf +// RUN: llvm-objdump --triple=armv7 --no-show-raw-insn -d signed2.elf | FileCheck %s --check-prefix=SIGNED2 + +/// The aim of this test is to ensure that a BL instruction near one end of the +/// address space can reach a function at the extreme other end, directly, +/// using a branch offset that makes the address wrap round. We check this at +/// both the unsigned wraparound point (one address near 0 and the other near +/// 0xFFFFFFFF) and the signed wraparound point (addresses either side of +/// 0x80000000), crossing the boundary in both directions. In all four cases we +/// expect a direct branch with no veneer. 
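These expectations only hold because of the `llvm::SignExtend64<32>` fix in `ARM.cpp` earlier in this patch. A minimal sketch of the arithmetic using the `unsigned1` addresses (not lld code; `signExtend32` is a local stand-in for `llvm::SignExtend64<32>`):

```
// Why the branch-range check must sign-extend the low 32 bits of dst - src.
#include <cassert>
#include <cstdint>

static int64_t signExtend32(uint64_t x) { return (int64_t)(int32_t)x; }

int main() {
  uint64_t src = 0xffff0000; // address of the bl (caller, high)
  uint64_t dst = 0x00010000; // address of func (callee, low)

  // Plain 64-bit subtraction gives roughly -4 GiB, far outside the +/-32 MiB
  // range of an A32 BL, so lld would wrongly route this through a veneer:
  int64_t naive = (int64_t)(dst - src);
  assert(naive < -(int64_t(1) << 25));

  // Truncating to 32 bits and sign-extending models the wraparound of the
  // 32-bit address space: the callee is only 128 KiB "forward" of the
  // caller, so a direct branch is in range and no veneer is needed.
  int64_t wrapped = signExtend32(dst - src);
  assert(wrapped == 0x20000);
  return 0;
}
```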
+ +// UNSIGNED1: Disassembly of section .text.lowaddr: +// UNSIGNED1: <func>: +// UNSIGNED1: 10000: bx lr +// +// UNSIGNED1: Disassembly of section .text.highaddr: +// UNSIGNED1: <_start>: +// UNSIGNED1: ffff0000: bl 0x10000 +// UNSIGNED1-NEXT: bx lr + +// UNSIGNED2: Disassembly of section .text.lowaddr: +// UNSIGNED2: <_start>: +// UNSIGNED2: 10000: bl 0xffff0000 +// UNSIGNED2-NEXT: bx lr +// +// UNSIGNED2: Disassembly of section .text.highaddr: +// UNSIGNED2: <func>: +// UNSIGNED2: ffff0000: bx lr + +// SIGNED1: Disassembly of section .text.posaddr: +// SIGNED1: <_start>: +// SIGNED1: 7fff0000: bl 0x80010000 +// SIGNED1-NEXT: bx lr +// +// SIGNED1: Disassembly of section .text.negaddr: +// SIGNED1: <func>: +// SIGNED1: 80010000: bx lr + +// SIGNED2: Disassembly of section .text.posaddr: +// SIGNED2: <func>: +// SIGNED2: 7fff0000: bx lr +// +// SIGNED2: Disassembly of section .text.negaddr: +// SIGNED2: <_start>: +// SIGNED2: 80010000: bl 0x7fff0000 +// SIGNED2-NEXT: bx lr + +//--- code.s + + .section .text.callee, "ax", %progbits + .global func + .type func, %function +func: + bx lr + + .section .text.caller, "ax", %progbits + .global _start + .type _start, %function +_start: + bl func + bx lr + +//--- unsigned1.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.callee) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.caller) } +} + +//--- unsigned2.ld + +ENTRY(_start) +SECTIONS { + .text.lowaddr 0x00010000 : AT(0x00010000) { *(.text.caller) } + .text.highaddr 0xffff0000 : AT(0xffff0000) { *(.text.callee) } +} + +//--- signed1.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.caller) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.callee) } +} + +//--- signed2.ld + +ENTRY(_start) +SECTIONS { + .text.posaddr 0x7fff0000 : AT(0x7fff0000) { *(.text.callee) } + .text.negaddr 0x80010000 : AT(0x80010000) { *(.text.caller) } +} diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s index 90924e5797b64..d7de90d6cd7b3 100644 --- a/lld/test/MachO/bp-section-orderer.s +++ b/lld/test/MachO/bp-section-orderer.s @@ -106,6 +106,11 @@ r3: r4: .quad s2 +# cstrings are ignored by runBalancedPartitioning() +.cstring +cstr: + .asciz "this is cstr" + .bss bss0: .zero 10 diff --git a/lld/test/MachO/handle-invalid-section-reference-too-big.test b/lld/test/MachO/handle-invalid-section-reference-too-big.test new file mode 100644 index 0000000000000..1642d63e50af4 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-too-big.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set n_sect value of ltmp1 symbol to 10 which is greater than the number of sections 2. 
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [10] greater than the total number of sections [2] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 10 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/handle-invalid-section-reference-zero.test b/lld/test/MachO/handle-invalid-section-reference-zero.test new file mode 100644 index 0000000000000..ab636705198e5 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-zero.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set n_sect value of ltmp1 symbol to 0 instead of 1. 
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [0] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/invalid/bad-offsets.s b/lld/test/MachO/invalid/bad-offsets.s new file mode 100644 index 0000000000000..e1244ee501960 --- /dev/null +++ b/lld/test/MachO/invalid/bad-offsets.s @@ -0,0 +1,45 @@ +## Test that we properly detect and report out-of-bounds offsets in literal sections. +## We're intentionally testing fatal errors (for malformed input files), and +## fatal errors aren't supported for testing when main is run twice. +# XFAIL: main-run-twice + +# REQUIRES: x86 +# RUN: rm -rf %t; split-file %s %t + +## Test WordLiteralInputSection bounds checking +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/word-literal.s -o %t/word-literal.o +# RUN: not %lld -dylib %t/word-literal.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=WORD + +## Test CStringInputSection bounds checking +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/cstring.s -o %t/cstring.o +# RUN: not %lld -dylib %t/cstring.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=CSTRING + +# WORD: error: {{.*}}word-literal.o:(__literal4): offset is outside the section +# CSTRING: error: {{.*}}cstring.o:(__cstring): offset is outside the section + +#--- word-literal.s +.section __TEXT,__literal4,4byte_literals +L_literal: + .long 0x01020304 + +.text +.globl _main +_main: + # We use a subtractor expression to force a section relocation. Symbol relocations + # don't trigger the error. 
+ .long L_literal - _main + 4 + +.subsections_via_symbols + +#--- cstring.s +## Create a cstring section with a reference that points past the end +.cstring +L_str: + .asciz "foo" + +.text +.globl _main +_main: + .long L_str - _main + 4 + +.subsections_via_symbols \ No newline at end of file diff --git a/lld/test/wasm/alias.s b/lld/test/wasm/alias.s index 0bb035b92f29c..83f40a8369921 100644 --- a/lld/test/wasm/alias.s +++ b/lld/test/wasm/alias.s @@ -24,7 +24,7 @@ _start: # CHECK-NEXT: FunctionTypes: [ 0 ] # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -32,7 +32,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/bss-only.s b/lld/test/wasm/bss-only.s index 1c0500f172ca4..bec7592129e7b 100644 --- a/lld/test/wasm/bss-only.s +++ b/lld/test/wasm/bss-only.s @@ -26,13 +26,13 @@ b: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 67568 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 2028 +# CHECK-NEXT: Value: 66540 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/build-id.test b/lld/test/wasm/build-id.test index dd6e2237108f3..5fafd21d32240 100644 --- a/lld/test/wasm/build-id.test +++ b/lld/test/wasm/build-id.test @@ -43,12 +43,12 @@ foo: # DEFAULT: Contents of section build_id: -# DEFAULT-NEXT: 0079 10299168 1e3c845a 3c8f80ae 2f16cc22 .).h.<.Z<.../.." -# DEFAULT-NEXT: 0089 2d +# DEFAULT-NEXT: 0079 103f86e6 3bb81959 2e99ffa9 acfed331 .?..;..Y.......1 +# DEFAULT-NEXT: 0089 3a # SHA1: Contents of section build_id: -# SHA1-NEXT: 0079 145abdda 387a9bc4 e3aed3c3 3319cd37 .Z..8z......3..7 -# SHA1-NEXT: 0089 0212237c e4 ..#|. 
+# SHA1-NEXT: 0079 1410ade4 e75d1c9d 71023465 03b7572f .....]..q.4e..W/ +# SHA1-NEXT: 0089 c06c5ae0 74 .lZ.t # UUID: Contents of section build_id: # UUID-NEXT: 0079 10 diff --git a/lld/test/wasm/call-indirect.s b/lld/test/wasm/call-indirect.s index 7bf39a9f5aec9..64eaa593731be 100644 --- a/lld/test/wasm/call-indirect.s +++ b/lld/test/wasm/call-indirect.s @@ -82,13 +82,13 @@ indirect_func: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1032 +# CHECK-NEXT: Value: 65544 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -125,23 +125,23 @@ indirect_func: # CHECK-NEXT: Body: 42010B # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Locals: -# CHECK-NEXT: Body: 410028028088808000118080808000001A410028028488808000118180808000001A0B +# CHECK-NEXT: Body: 410028028080848000118080808000001A410028028480848000118180808000001A0B # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Locals: # CHECK-NEXT: Body: 41020B # CHECK-NEXT: - Index: 3 # CHECK-NEXT: Locals: -# CHECK-NEXT: Body: 410028028888808000118180808000001A0B +# CHECK-NEXT: Body: 410028028880848000118180808000001A0B # CHECK-NEXT: - Index: 4 # CHECK-NEXT: Locals: # CHECK-NEXT: Body: 42012000118280808000001A0B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '010000000200000002000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/comdats.ll b/lld/test/wasm/comdats.ll index 2dd687fbad1ef..1662a983698ac 100644 --- a/lld/test/wasm/comdats.ll +++ b/lld/test/wasm/comdats.ll @@ -23,13 +23,13 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -69,7 +69,7 @@ entry: ; CHECK-NEXT: Body: 1080808080001082808080001A0B ; CHECK-NEXT: - Index: 2 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4180888080000B +; CHECK-NEXT: Body: 4180808480000B ; CHECK-NEXT: - Index: 3 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 0B @@ -81,9 +81,9 @@ entry: ; CHECK-NEXT: Body: 4181808080000B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '616263' diff --git a/lld/test/wasm/compress-relocs.s b/lld/test/wasm/compress-relocs.s index 41d4ff567d501..37f1b3b170ff7 100644 --- a/lld/test/wasm/compress-relocs.s +++ b/lld/test/wasm/compress-relocs.s @@ -47,16 +47,16 @@ test_memory_and_indirect_call_relocs: end_function # CHECK: test_memory_and_indirect_call_relocs -# CHECK: 41 90 88 80 80 00 i32.const 1040 +# CHECK: 41 90 80 84 80 00 i32.const 65552 # CHECK: 11 80 80 80 80 00 80 80 80 80 00 call_indirect 0 -# CHECK: 28 02 94 88 80 80 00 i32.load 1044 +# CHECK: 28 02 94 80 84 80 00 i32.load 65556 # 
CHECK: 11 81 80 80 80 00 80 80 80 80 00 call_indirect 1 # CHECK: 41 81 80 80 80 00 i32.const 1 # CHECK: 11 80 80 80 80 00 80 80 80 80 00 call_indirect 0 # COMPRESS: test_memory_and_indirect_call_relocs -# COMPRESS: 41 90 08 i32.const 1040 +# COMPRESS: 41 90 80 04 i32.const 65552 # COMPRESS: 11 00 00 call_indirect 0 -# COMPRESS: 28 02 94 08 i32.load 1044 +# COMPRESS: 28 02 94 80 04 i32.load 65556 # COMPRESS: 11 01 00 call_indirect 1 # COMPRESS: 41 01 i32.const 1 # COMPRESS: 11 00 00 call_indirect 0 @@ -91,11 +91,11 @@ test_relative_relocs: end_function # CHECK: test_relative_relocs -# CHECK: 41 90 88 80 80 00 i32.const 1040 +# CHECK: 41 90 80 84 80 00 i32.const 65552 # CHECK: 41 81 80 80 80 00 i32.const 1 # CHECK: 41 83 80 80 80 00 i32.const 3 # COMPRESS: test_relative_relocs -# COMPRESS: 41 90 08 i32.const 1040 +# COMPRESS: 41 90 80 04 i32.const 65552 # COMPRESS: 41 01 i32.const 1 # COMPRESS: 41 03 i32.const 3 diff --git a/lld/test/wasm/compress-relocs64.s b/lld/test/wasm/compress-relocs64.s index 44e7a089275bb..f3ff646cc3b1c 100644 --- a/lld/test/wasm/compress-relocs64.s +++ b/lld/test/wasm/compress-relocs64.s @@ -36,12 +36,12 @@ test_memory_and_indirect_call_relocs: end_function # CHECK: test_memory_and_indirect_call_relocs -# CHECK: 42 90 88 80 80 80 80 80 80 80 00 i64.const 1040 -# CHECK: 29 03 98 88 80 80 80 80 80 80 80 00 i64.load 1048 +# CHECK: 42 90 80 84 80 80 80 80 80 80 00 i64.const 65552 +# CHECK: 29 03 98 80 84 80 80 80 80 80 80 00 i64.load 65560 # CHECK: 42 81 80 80 80 80 80 80 80 80 00 i64.const 1 # COMPRESS: test_memory_and_indirect_call_relocs -# COMPRESS: 42 90 08 i64.const 1040 -# COMPRESS: 29 03 98 08 i64.load 1048 +# COMPRESS: 42 90 80 04 i64.const 65552 +# COMPRESS: 29 03 98 80 04 i64.load 65560 # COMPRESS: 42 01 i64.const 1 .globl test_relative_relocs @@ -56,11 +56,11 @@ test_relative_relocs: end_function # CHECK: test_relative_relocs -# CHECK: 42 90 88 80 80 80 80 80 80 80 00 i64.const 1040 +# CHECK: 42 90 80 84 80 80 80 80 80 80 00 i64.const 65552 # CHECK: 42 81 80 80 80 80 80 80 80 80 00 i64.const 1 # CHECK: 42 83 80 80 80 80 80 80 80 80 00 i64.const 3 # COMPRESS: test_relative_relocs -# COMPRESS: 42 90 08 i64.const 1040 +# COMPRESS: 42 90 80 04 i64.const 65552 # COMPRESS: 42 01 i64.const 1 # COMPRESS: 42 03 i64.const 3 diff --git a/lld/test/wasm/custom-section-name.ll b/lld/test/wasm/custom-section-name.ll index 8799fbf36056d..89cb72fe3cf99 100644 --- a/lld/test/wasm/custom-section-name.ll +++ b/lld/test/wasm/custom-section-name.ll @@ -16,29 +16,29 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '00000000' -; CHECK-NEXT: - SectionOffset: 17 +; CHECK-NEXT: - SectionOffset: 19 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1028 +; CHECK-NEXT: Value: 65540 ; CHECK-NEXT: Content: 2A000000 -; CHECK-NEXT: - SectionOffset: 27 +; CHECK-NEXT: - SectionOffset: 30 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1032 +; CHECK-NEXT: Value: 65544 ; CHECK-NEXT: Content: '07000000' -; BSS-NEXT: - SectionOffset: 37 +; BSS-NEXT: - SectionOffset: 41 ; BSS-NEXT: InitFlags: 0 ; BSS-NEXT: Offset: ; BSS-NEXT: Opcode: I32_CONST -; BSS-NEXT: Value: 1036 +; BSS-NEXT: Value: 65548 ; BSS-NEXT: Content: 
'00000000' ; NO-BSS-NOT: - SectionOffset: diff --git a/lld/test/wasm/data-layout.s b/lld/test/wasm/data-layout.s index a68bc032e4840..8df834d6ea8c4 100644 --- a/lld/test/wasm/data-layout.s +++ b/lld/test/wasm/data-layout.s @@ -63,33 +63,33 @@ local_struct_internal_ptr: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 66624 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: [[PTR]] # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1080 +# CHECK-NEXT: Value: 65592 # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Type: [[PTR]] # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 66624 +# CHECK-NEXT: Value: 65600 # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 68656C6C6F0A00 -# CHECK-NEXT: - SectionOffset: 20 +# CHECK-NEXT: - SectionOffset: 22 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: [[PTR]]_CONST -# CHECK-NEXT: Value: 1040 +# CHECK-NEXT: Value: 65552 # RUN: wasm-ld -no-gc-sections --allow-undefined --no-entry \ diff --git a/lld/test/wasm/data-segment-merging.ll b/lld/test/wasm/data-segment-merging.ll index e6f3c5ee469f2..34c49e8b901f6 100644 --- a/lld/test/wasm/data-segment-merging.ll +++ b/lld/test/wasm/data-segment-merging.ll @@ -15,11 +15,11 @@ ; MERGE-LABEL: - Type: DATA ; MERGE-NEXT: Segments: -; MERGE-NEXT: - SectionOffset: 7 +; MERGE-NEXT: - SectionOffset: 8 ; MERGE-NEXT: InitFlags: 0 ; MERGE-NEXT: Offset: ; MERGE: Content: 636F6E7374616E74000000002B -; MERGE-NEXT: - SectionOffset: 26 +; MERGE-NEXT: - SectionOffset: 28 ; MERGE-NEXT: InitFlags: 0 ; MERGE-NEXT: Offset: ; MERGE: Content: 68656C6C6F00676F6F6462796500776861746576657200002A000000 @@ -41,27 +41,27 @@ ; SEPARATE-NOT: DATACOUNT ; SEPARATE-LABEL: - Type: DATA ; SEPARATE-NEXT: Segments: -; SEPARATE-NEXT: - SectionOffset: 7 +; SEPARATE-NEXT: - SectionOffset: 8 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 636F6E7374616E7400 -; SEPARATE-NEXT: - SectionOffset: 22 +; SEPARATE-NEXT: - SectionOffset: 24 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 2B -; SEPARATE-NEXT: - SectionOffset: 29 +; SEPARATE-NEXT: - SectionOffset: 32 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 68656C6C6F00 -; SEPARATE-NEXT: - SectionOffset: 41 +; SEPARATE-NEXT: - SectionOffset: 45 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 676F6F6462796500 -; SEPARATE-NEXT: - SectionOffset: 55 +; SEPARATE-NEXT: - SectionOffset: 60 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: '776861746576657200' -; SEPARATE-NEXT: - SectionOffset: 70 +; SEPARATE-NEXT: - SectionOffset: 76 ; SEPARATE-NEXT: InitFlags: 0 ; SEPARATE-NEXT: Offset: ; SEPARATE: Content: 2A000000 diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 6c401c4873910..237f4285e3763 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -61,20 +61,20 @@ ; ACTIVE-NEXT: Body: 0B ; ACTIVE-NEXT: - Type: DATA ; ACTIVE-NEXT: Segments: -; ACTIVE-NEXT: - SectionOffset: 7 +; ACTIVE-NEXT: - SectionOffset: 8 ; ACTIVE-NEXT: InitFlags: 0 ; ACTIVE-NEXT: Offset: ; ACTIVE32-NEXT: Opcode: 
I32_CONST ; ACTIVE64-NEXT: Opcode: I64_CONST -; ACTIVE-NEXT: Value: 1024 +; ACTIVE-NEXT: Value: 65536 ; ACTIVE-NEXT: Content: 636F6E7374616E74000000002B -; ACTIVE-NEXT: - SectionOffset: 26 +; ACTIVE-NEXT: - SectionOffset: 28 ; ACTIVE-NEXT: InitFlags: 0 ; ACTIVE-NEXT: Offset: ; ACTIVE32-NEXT: Opcode: I32_CONST ; ACTIVE64-NEXT: Opcode: I64_CONST -; ACTIVE-NEXT: Value: 1040 -; ACTIVE-NEXT: Content: 68656C6C6F00676F6F646279650000002A000000 +; ACTIVE-NEXT: Value: 65552 +; ACTIVE-NEXT: Content: 68656C6C6F00676F6F646279650000002A00000063000000 ; ACTIVE-NEXT: - Type: CUSTOM ; ACTIVE-NEXT: Name: name ; ACTIVE-NEXT: FunctionNames: @@ -201,7 +201,7 @@ ; DIS-NEXT: block ; DIS-NEXT: block -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 0 @@ -211,8 +211,8 @@ ; DIS-NEXT: # 2: down to label0 ; DIS-NEXT: end -; NOPIC-DIS-NEXT: [[PTR]].const 1024 -; NOPIC-DIS-NEXT: [[PTR]].const 1024 +; NOPIC-DIS-NEXT: [[PTR]].const 65536 +; NOPIC-DIS-NEXT: [[PTR]].const 65536 ; NOPIC-DIS-NEXT: global.set 1 ; PIC-DIS-NEXT: [[PTR]].const 0 ; PIC-DIS-NEXT: global.get 1 @@ -224,7 +224,7 @@ ; DIS-NEXT: i32.const 4 ; DIS-NEXT: memory.init 0, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1028 +; NOPIC-DIS-NEXT: [[PTR]].const 65540 ; PIC-DIS-NEXT: [[PTR]].const 4 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -233,7 +233,7 @@ ; DIS-NEXT: i32.const 13 ; DIS-NEXT: memory.init 1, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1044 +; NOPIC-DIS-NEXT: [[PTR]].const 65556 ; PIC-DIS-NEXT: [[PTR]].const 20 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -241,7 +241,7 @@ ; DIS-NEXT: i32.const 0 ; DIS-NEXT: i32.const 20 ; DIS-NEXT: memory.init 2, 0 -; NOPIC-DIS-NEXT: [[PTR]].const 1064 +; NOPIC-DIS-NEXT: [[PTR]].const 65576 ; PIC-DIS-NEXT: [[PTR]].const 40 ; PIC-DIS-NEXT: global.get 1 ; PIC-DIS-NEXT: [[PTR]].add @@ -249,13 +249,13 @@ ; DIS-NEXT: [[PTR]].const 10000 ; DIS-NEXT: memory.fill 0 -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 2 ; DIS-NEXT: i32.atomic.store 0 -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const -1 @@ -264,7 +264,7 @@ ; DIS-NEXT: br 1 # 1: down to label1 ; DIS-NEXT: end -; NOPIC-DIS-NEXT: [[PTR]].const 11064 +; NOPIC-DIS-NEXT: [[PTR]].const 75576 ; PIC-DIS-NEXT: local.get 0 ; DIS-NEXT: i32.const 1 diff --git a/lld/test/wasm/debuginfo.test b/lld/test/wasm/debuginfo.test index 9cb1cc31e515a..7e6bd51cb35c9 100644 --- a/lld/test/wasm/debuginfo.test +++ b/lld/test/wasm/debuginfo.test @@ -50,7 +50,7 @@ CHECK-NEXT: DW_AT_type (0x000000ac "int[2]") CHECK-NEXT: DW_AT_external (true) CHECK-NEXT: DW_AT_decl_file ("{{.*}}hi_foo.c") CHECK-NEXT: DW_AT_decl_line (1) -CHECK: DW_AT_location (DW_OP_addr 0x400) +CHECK: DW_AT_location (DW_OP_addr 0x10000) CHECK: DW_TAG_array_type diff --git a/lld/test/wasm/dylink-non-pie.s b/lld/test/wasm/dylink-non-pie.s index 3157b8c32120f..fddfddb4df658 100755 --- a/lld/test/wasm/dylink-non-pie.s +++ b/lld/test/wasm/dylink-non-pie.s @@ -32,7 +32,7 @@ f_p: # DIS: <__wasm_apply_data_relocs>: # DIS-EMPTY: -# DIS-NEXT: i32.const 1024 +# DIS-NEXT: i32.const 65536 # DIS-NEXT: global.get 0 # DIS-NEXT: i32.store 0 # DIS-NEXT: end diff --git a/lld/test/wasm/emit-relocs.s b/lld/test/wasm/emit-relocs.s index 385344cb23321..3df345c0c7038 100644 --- a/lld/test/wasm/emit-relocs.s +++ b/lld/test/wasm/emit-relocs.s @@ -41,11 +41,11 @@ foo: # CHECK: - Type: DATA # CHECK-NEXT: 
Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '00000000' # There should be a single relocation in this section (just the live symbol) @@ -75,5 +75,5 @@ foo: # CHECK-NEXT: Kind: DATA # CHECK-NEXT: Name: __stack_low # CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN, ABSOLUTE ] -# CHECK-NEXT: Offset: 1040 # CHECK-NEXT: Size: 0 +# CHECK-NEXT: - Index: 3 diff --git a/lld/test/wasm/externref.s b/lld/test/wasm/externref.s index ffc63a6d3d0be..1443e5f7fda5f 100644 --- a/lld/test/wasm/externref.s +++ b/lld/test/wasm/externref.s @@ -35,7 +35,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: EXTERNREF # CHECK-NEXT: Mutable: true diff --git a/lld/test/wasm/gc-sections.ll b/lld/test/wasm/gc-sections.ll index e709ab7ba2c3d..69d7ed2105cf7 100644 --- a/lld/test/wasm/gc-sections.ll +++ b/lld/test/wasm/gc-sections.ll @@ -57,7 +57,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I64 ; CHECK-NEXT: Mutable: true @@ -67,11 +67,11 @@ entry: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '02000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name @@ -123,7 +123,7 @@ entry: ; NO-GC-NEXT: Mutable: true ; NO-GC-NEXT: InitExpr: ; NO-GC-NEXT: Opcode: I32_CONST -; NO-GC-NEXT: Value: 66576 +; NO-GC-NEXT: Value: 65536 ; NO-GC-NEXT: - Index: 1 ; NO-GC-NEXT: Type: I64 ; NO-GC-NEXT: Mutable: true @@ -139,11 +139,11 @@ entry: ; NO-GC: - Type: DATA ; NO-GC-NEXT: Segments: -; NO-GC-NEXT: - SectionOffset: 7 +; NO-GC-NEXT: - SectionOffset: 8 ; NO-GC-NEXT: InitFlags: 0 ; NO-GC-NEXT: Offset: ; NO-GC-NEXT: Opcode: I32_CONST -; NO-GC-NEXT: Value: 1024 +; NO-GC-NEXT: Value: 65536 ; NO-GC-NEXT: Content: '010000000000000002000000' ; NO-GC-NEXT: - Type: CUSTOM ; NO-GC-NEXT: Name: name diff --git a/lld/test/wasm/global-base.test b/lld/test/wasm/global-base.test index 0e65f0cce8f49..e84b8ec3ef9ce 100644 --- a/lld/test/wasm/global-base.test +++ b/lld/test/wasm/global-base.test @@ -19,19 +19,19 @@ CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: true CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 66560 +CHECK-1024-NEXT: Value: 65536 CHECK-1024-NEXT: - Index: 1 CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: false CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 1024 +CHECK-1024-NEXT: Value: 65536 CHECK-1024-NEXT: - Index: 2 CHECK-1024-NEXT: Type: I32 CHECK-1024-NEXT: Mutable: false CHECK-1024-NEXT: InitExpr: CHECK-1024-NEXT: Opcode: I32_CONST -CHECK-1024-NEXT: Value: 1024 +CHECK-1024-NEXT: Value: 65536 CHECK-1024: - Type: EXPORT CHECK-1024: - Name: __data_end @@ -50,7 +50,7 @@ CHECK-16777216-NEXT: Type: I32 CHECK-16777216-NEXT: Mutable: true CHECK-16777216-NEXT: InitExpr: CHECK-16777216-NEXT: Opcode: I32_CONST -CHECK-16777216-NEXT: Value: 16842752 +CHECK-16777216-NEXT: Value: 65536 CHECK-16777216-NEXT: - Index: 1 CHECK-16777216-NEXT: Type: I32 
CHECK-16777216-NEXT: Mutable: false diff --git a/lld/test/wasm/globals.s b/lld/test/wasm/globals.s index 6e049e1e73f91..47d9ba82818b7 100644 --- a/lld/test/wasm/globals.s +++ b/lld/test/wasm/globals.s @@ -42,7 +42,7 @@ immutable_global: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false diff --git a/lld/test/wasm/import-memory.test b/lld/test/wasm/import-memory.test index dd7066dec0059..9b8148e4d4495 100644 --- a/lld/test/wasm/import-memory.test +++ b/lld/test/wasm/import-memory.test @@ -10,7 +10,7 @@ # CHECK-NEXT: Field: memory # CHECK-NEXT: Kind: MEMORY # CHECK-NEXT: Memory: -# CHECK-NEXT: Minimum: 0x2 +# CHECK-NEXT: Minimum: 0x1 # CHECK-NEXT: - Type: diff --git a/lld/test/wasm/init-fini.ll b/lld/test/wasm/init-fini.ll index ef2f41f96e89b..7471ebb5d8147 100644 --- a/lld/test/wasm/init-fini.ll +++ b/lld/test/wasm/init-fini.ll @@ -78,7 +78,7 @@ entry: ; CHECK-NEXT: Body: 10041005100A100F1012100F10141004100C100F10161002100E0B ; CHECK: - Index: 22 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 02404186808080004100418088808000108080808000450D00000B0B +; CHECK-NEXT: Body: 02404186808080004100418080848000108080808000450D00000B0B ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name ; CHECK-NEXT: FunctionNames: diff --git a/lld/test/wasm/large-memory.test b/lld/test/wasm/large-memory.test index 5b737e4154963..a2888c61702ac 100644 --- a/lld/test/wasm/large-memory.test +++ b/lld/test/wasm/large-memory.test @@ -12,7 +12,7 @@ RUN: obj2yaml %t2.wasm | FileCheck %s --check-prefixes=CHECK,CHECK-4G CHECK: - Type: MEMORY CHECK-NEXT: Memories: CHECK-NEXT: - Flags: [ HAS_MAX ] -CHECK-NEXT: Minimum: 0x2 +CHECK-NEXT: Minimum: 0x1 CHECK-2G-NEXT: Maximum: 0x8000 CHECK-4G-NEXT: Maximum: 0x10000 diff --git a/lld/test/wasm/local-symbols.ll b/lld/test/wasm/local-symbols.ll index 8faee647c44c8..6c639a83dbf51 100644 --- a/lld/test/wasm/local-symbols.ll +++ b/lld/test/wasm/local-symbols.ll @@ -45,13 +45,13 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -67,17 +67,17 @@ entry: ; CHECK-NEXT: Functions: ; CHECK-NEXT: - Index: 0 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4100280284888080000B +; CHECK-NEXT: Body: 4100280284808480000B ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 1080808080001A0B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '0100000003000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name diff --git a/lld/test/wasm/locals-duplicate.test b/lld/test/wasm/locals-duplicate.test index 5c3135a424e69..88819b2707dde 100644 --- a/lld/test/wasm/locals-duplicate.test +++ b/lld/test/wasm/locals-duplicate.test @@ -26,7 +26,7 @@ ; CHECK-NEXT: Maximum: 0x7 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x2 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - 
Index: 0 @@ -34,19 +34,19 @@ ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66592 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1028 +; CHECK-NEXT: Value: 65540 ; CHECK-NEXT: - Index: 2 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1036 +; CHECK-NEXT: Value: 65548 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory @@ -119,13 +119,13 @@ ; CHECK-NEXT: Body: 41020B ; CHECK-NEXT: - Index: 3 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4180888080000B +; CHECK-NEXT: Body: 4180808480000B ; CHECK-NEXT: - Index: 4 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4184888080000B +; CHECK-NEXT: Body: 4184808480000B ; CHECK-NEXT: - Index: 5 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4188888080000B +; CHECK-NEXT: Body: 4188808480000B ; CHECK-NEXT: - Index: 6 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 4181808080000B @@ -146,13 +146,13 @@ ; CHECK-NEXT: Body: 41020B ; CHECK-NEXT: - Index: 12 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 418C888080000B +; CHECK-NEXT: Body: 418C808480000B ; CHECK-NEXT: - Index: 13 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4190888080000B +; CHECK-NEXT: Body: 4190808480000B ; CHECK-NEXT: - Index: 14 ; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 4194888080000B +; CHECK-NEXT: Body: 4194808480000B ; CHECK-NEXT: - Index: 15 ; CHECK-NEXT: Locals: ; CHECK-NEXT: Body: 4184808080000B @@ -164,11 +164,11 @@ ; CHECK-NEXT: Body: 4186808080000B ; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '010000000100000001000000010000000100000001000000' ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: name diff --git a/lld/test/wasm/lto/tls.ll b/lld/test/wasm/lto/tls.ll index b61edfba6146f..9c1642eb84535 100644 --- a/lld/test/wasm/lto/tls.ll +++ b/lld/test/wasm/lto/tls.ll @@ -30,13 +30,13 @@ attributes #0 = { noinline nounwind optnone "target-features"="+atomics,+bulk-me ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK: GlobalNames: ; CHECK-NEXT: - Index: 0 diff --git a/lld/test/wasm/lto/used.ll b/lld/test/wasm/lto/used.ll index a1851035fa247..dd36259d92e38 100644 --- a/lld/test/wasm/lto/used.ll +++ b/lld/test/wasm/lto/used.ll @@ -26,11 +26,11 @@ return: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: '01000000' ; CHECK: - Type: CUSTOM diff --git a/lld/test/wasm/map-file.s b/lld/test/wasm/map-file.s index 2757f50187ffe..380ab57ca8ee4 100644 --- a/lld/test/wasm/map-file.s +++ b/lld/test/wasm/map-file.s @@ -59,15 +59,15 @@ somezeroes: # CHECK-NEXT: - 66 b write_global # CHECK-NEXT: - 71 f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) # CHECK-NEXT: - 71 f _start -# CHECK-NEXT: - 82 11 
DATA -# CHECK-NEXT: 400 83 8 .data -# CHECK-NEXT: 400 89 8 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) -# CHECK-NEXT: 400 89 8 somedata -# CHECK-NEXT: 408 82 4 .bss -# CHECK-NEXT: 408 0 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) -# CHECK-NEXT: 408 0 4 somezeroes -# CHECK-NEXT: - 93 12 CUSTOM(.debug_info) -# CHECK-NEXT: - a5 61 CUSTOM(name) +# CHECK-NEXT: - 82 12 DATA +# CHECK-NEXT: 10000 83 8 .data +# CHECK-NEXT: 10000 8a 8 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) +# CHECK-NEXT: 10000 8a 8 somedata +# CHECK-NEXT: 10008 82 4 .bss +# CHECK-NEXT: 10008 0 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) +# CHECK-NEXT: 10008 0 4 somezeroes +# CHECK-NEXT: - 94 12 CUSTOM(.debug_info) +# CHECK-NEXT: - a6 61 CUSTOM(name) # RUN: not wasm-ld %t1.o -o /dev/null -Map=/ 2>&1 \ # RUN: | FileCheck -check-prefix=FAIL %s diff --git a/lld/test/wasm/memory-naming.test b/lld/test/wasm/memory-naming.test index 766d9cd59050b..66143c317798c 100644 --- a/lld/test/wasm/memory-naming.test +++ b/lld/test/wasm/memory-naming.test @@ -57,7 +57,7 @@ # CHECK-IMPORT-NEXT: Field: bar # CHECK-IMPORT-NEXT: Kind: MEMORY # CHECK-IMPORT-NEXT: Memory: -# CHECK-IMPORT-NEXT: Minimum: 0x2 +# CHECK-IMPORT-NEXT: Minimum: 0x1 # CHECK-IMPORT: - Type: EXPORT # CHECK-IMPORT-NEXT: Exports: # CHECK-IMPORT-NEXT: - Name: _start @@ -77,7 +77,7 @@ # CHECK-IMPORT-DEFAULT-NEXT: Field: foo # CHECK-IMPORT-DEFAULT-NEXT: Kind: MEMORY # CHECK-IMPORT-DEFAULT-NEXT: Memory: -# CHECK-IMPORT-DEFAULT-NEXT: Minimum: 0x2 +# CHECK-IMPORT-DEFAULT-NEXT: Minimum: 0x1 # CHECK-IMPORT-DEFAULT-NEXT: - Type: # RUN:wasm-ld --import-memory=foo,bar --export-memory=qux -o %t.both.wasm %t.start.o @@ -91,7 +91,7 @@ # CHECK-BOTH-NEXT: Field: bar # CHECK-BOTH-NEXT: Kind: MEMORY # CHECK-BOTH-NEXT: Memory: -# CHECK-BOTH-NEXT: Minimum: 0x2 +# CHECK-BOTH-NEXT: Minimum: 0x1 # CHECK-BOTH: - Type: EXPORT # CHECK-BOTH-NEXT: Exports: # CHECK-BOTH-NEXT: - Name: qux diff --git a/lld/test/wasm/merge-string.s b/lld/test/wasm/merge-string.s index a4b89abfc46d7..229f3933f1d89 100644 --- a/lld/test/wasm/merge-string.s +++ b/lld/test/wasm/merge-string.s @@ -41,21 +41,21 @@ negative_addend: // COMMON-NEXT: Mutable: true // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// COMMON-NEXT: Value: 66576 +// COMMON-NEXT: Value: 65536 // COMMON-NEXT: - Index: 1 // COMMON-NEXT: Type: I32 // COMMON-NEXT: Mutable: false // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// MERGE-NEXT: Value: 1024 -// NOMERGE-NEXT: Value: 1028 +// MERGE-NEXT: Value: 65536 +// NOMERGE-NEXT: Value: 65540 // COMMON-NEXT: - Index: 2 // COMMON-NEXT: Type: I32 // COMMON-NEXT: Mutable: false // COMMON-NEXT: InitExpr: // COMMON-NEXT: Opcode: I32_CONST -// MERGE-NEXT: Value: 1025 -// NOMERGE-NEXT: Value: 1029 +// MERGE-NEXT: Value: 65537 +// NOMERGE-NEXT: Value: 65541 // COMMON-NEXT: - Type: EXPORT // COMMON-NEXT: Exports: // COMMON-NEXT: - Name: memory @@ -71,11 +71,11 @@ negative_addend: // // COMMON: - Type: DATA // COMMON-NEXT: Segments: -// COMMON-NEXT: - SectionOffset: 7 +// COMMON-NEXT: - SectionOffset: 8 // COMMON-NEXT: InitFlags: 0 // COMMON-NEXT: Offset: // COMMON-NEXT: Opcode: I32_CONST -// COMMON-NEXT: Value: 1024 +// COMMON-NEXT: Value: 65536 // MERGE-NEXT: Content: '61626300' // NOMERGE-NEXT: Content: '6162630061626300626300' diff --git a/lld/test/wasm/multi-table.s b/lld/test/wasm/multi-table.s index afe8ddac49768..31cba9ff77c2d 100644 --- a/lld/test/wasm/multi-table.s +++ b/lld/test/wasm/multi-table.s @@ -87,7 +87,7 @@ call_indirect_explicit_tables: # CHECK-NEXT: 
Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -127,14 +127,14 @@ call_indirect_explicit_tables: # CHECK-NEXT: Body: 42010B # CHECK-NEXT: - Index: 3 # CHECK-NEXT: Locals: [] -# CHECK-NEXT: Body: 41002802808880800011818080800083808080001A41002802848880800011828080800083808080001A0B +# CHECK-NEXT: Body: 41002802808084800011818080800083808080001A41002802848084800011828080800083808080001A0B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '0100000002000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/no-strip-segment.s b/lld/test/wasm/no-strip-segment.s index e70acae296d1a..2b79ed625ea8a 100644 --- a/lld/test/wasm/no-strip-segment.s +++ b/lld/test/wasm/no-strip-segment.s @@ -47,16 +47,16 @@ grab_liba: # "greetings" section # CHECK: - Type: DATA # CHECK: Segments: -# CHECK: - SectionOffset: 7 +# CHECK: - SectionOffset: 8 # CHECK: InitFlags: 0 # CHECK: Offset: # CHECK: Opcode: I32_CONST -# CHECK: Value: 1024 +# CHECK: Value: 65536 # CHECK: Content: 68656C6C6F00776F726C6400 # "weahters" section. -# CHECK: - SectionOffset: 25 +# CHECK: - SectionOffset: 27 # CHECK: InitFlags: 0 # CHECK: Offset: # CHECK: Opcode: I32_CONST -# CHECK: Value: 1036 +# CHECK: Value: 65548 # CHECK: Content: 636C6F75647900 diff --git a/lld/test/wasm/no-tls.s b/lld/test/wasm/no-tls.s index c0786c83ffe70..c082c1ef2dc9f 100644 --- a/lld/test/wasm/no-tls.s +++ b/lld/test/wasm/no-tls.s @@ -28,7 +28,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/page-size.s b/lld/test/wasm/page-size.s index a2bf694936476..17850b5b17d30 100644 --- a/lld/test/wasm/page-size.s +++ b/lld/test/wasm/page-size.s @@ -19,7 +19,7 @@ foo: # CHECK-CUSTOM: - Type: MEMORY # CHECK-CUSTOM-NEXT: Memories: # CHECK-CUSTOM-NEXT: - Flags: [ HAS_PAGE_SIZE ] -# CHECK-CUSTOM-NEXT: Minimum: 0x10410 +# CHECK-CUSTOM-NEXT: Minimum: 0x10004 # CHECK-CUSTOM-NEXT: PageSize: 0x1 # RUN: llvm-objdump --disassemble-symbols=_start %t.custom.wasm | FileCheck %s --check-prefix=CHECK-CUSTOM-DIS @@ -51,7 +51,7 @@ foo: # CHECK-CUSTOM-IMPORT-NEXT: Kind: MEMORY # CHECK-CUSTOM-IMPORT-NEXT: Memory: # CHECK-CUSTOM-IMPORT-NEXT: Flags: [ HAS_PAGE_SIZE ] -# CHECK-CUSTOM-IMPORT-NEXT: Minimum: 0x10410 +# CHECK-CUSTOM-IMPORT-NEXT: Minimum: 0x10004 # CHECK-CUSTOM-IMPORT-NEXT: PageSize: 0x1 # RUN: llvm-objdump --disassemble-symbols=_start %t.custom-import.wasm | FileCheck %s --check-prefix=CHECK-CUSTOM-IMPORT-DIS diff --git a/lld/test/wasm/pic-static.ll b/lld/test/wasm/pic-static.ll index 794b7218880b8..12ac4c3a5544e 100644 --- a/lld/test/wasm/pic-static.ll +++ b/lld/test/wasm/pic-static.ll @@ -62,7 +62,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 +; CHECK-NEXT: Value: 65536 ; GOT.func.ret32 ; CHECK-NEXT: - Index: 1 @@ -70,7 +70,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1 +; CHECK-NEXT: Value: 1 ; GOT.func.missing_function ; CHECK-NEXT: - Index: 2 @@ -102,7 
+102,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; GOT.mem.ret32_ptr ; CHECK-NEXT: - Index: 6 @@ -110,7 +110,7 @@ entry: ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1032 +; CHECK-NEXT: Value: 65544 ; __memory_base ; CHECK-NEXT: - Index: 7 diff --git a/lld/test/wasm/reloc-relative.s b/lld/test/wasm/reloc-relative.s index fde1d1dd08247..5aab061c63a0b 100644 --- a/lld/test/wasm/reloc-relative.s +++ b/lld/test/wasm/reloc-relative.s @@ -50,40 +50,40 @@ far: # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 68656C6C6F0A00 -# CHECK-NEXT: - SectionOffset: 20 +# CHECK-NEXT: - SectionOffset: 22 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1031 +# CHECK-NEXT: Value: 65543 # CHECK-NEXT: Content: 000000002A000000 -# CHECK-NEXT: - SectionOffset: 34 +# CHECK-NEXT: - SectionOffset: 37 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1039 +# CHECK-NEXT: Value: 65551 # CHECK-NEXT: Content: FCFFFFFFFCFFFFFF -# CHECK-NEXT: - SectionOffset: 48 +# CHECK-NEXT: - SectionOffset: 52 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1047 +# CHECK-NEXT: Value: 65559 # CHECK-NEXT: Content: E9FFFFFFE9FFFFFF -# CHECK-NEXT: - SectionOffset: 62 +# CHECK-NEXT: - SectionOffset: 67 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1055 +# CHECK-NEXT: Value: 65567 # CHECK-NEXT: Content: '0800000008000000' -# CHECK-NEXT: - SectionOffset: 76 +# CHECK-NEXT: - SectionOffset: 82 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1063 +# CHECK-NEXT: Value: 65575 # CHECK-NEXT: Content: '15000000' diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s new file mode 100644 index 0000000000000..a12a93a6cb933 --- /dev/null +++ b/lld/test/wasm/runtime-relocations-himem.s @@ -0,0 +1,60 @@ +## Verifies runtime relocation code for addresses over 2gb works correctly. +## We have had issues with LEB encoding of address over 2gb in i32.const +## instruction leading to invalid binaries. 
+ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# XUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s -- + +.globl tls_sym +.globl data_sym +.globl _start +.globaltype __tls_base, i32 + +_start: + .functype _start () -> () + global.get __tls_base + i32.const tls_sym@TLSREL + i32.add + drop + i32.const data_sym + drop + end_function + +.section tls_sec,"T",@ +.p2align 2 +tls_sym: + .int32 0 + .int32 extern_sym + .size tls_sym, 8 + +.section data_sec,"",@ +.p2align 2 +data_sym: + .int32 0 + .int32 extern_sym + .size data_sym, 8 + +.section .custom_section.target_features,"",@ + .int8 2 + .int8 43 + .int8 7 + .ascii "atomics" + .int8 43 + .int8 11 + .ascii "bulk-memory" + +# CHECK: <__wasm_apply_data_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483636 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end + +# CHECK: <__wasm_apply_tls_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483644 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end diff --git a/lld/test/wasm/shared-memory-no-atomics.yaml b/lld/test/wasm/shared-memory-no-atomics.yaml index 942c69053a4b2..62f4ac91822de 100644 --- a/lld/test/wasm/shared-memory-no-atomics.yaml +++ b/lld/test/wasm/shared-memory-no-atomics.yaml @@ -55,7 +55,7 @@ Sections: # NO-SHARED: - Type: MEMORY # NO-SHARED-NEXT: Memories: -# NO-SHARED-NEXT: - Minimum: 0x2 +# NO-SHARED-NEXT: - Minimum: 0x1 # NO-SHARED-NOT: Maximum: # SHARED: --shared-memory is disallowed by {{.*}}shared-memory-no-atomics.yaml.tmp1.o because it was not compiled with 'atomics' or 'bulk-memory' features. 
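The himem test above pins down the encoding detail its header comment mentions: a wasm i32.const operand is a signed 32-bit value in SLEB128 form, so an address at or above 2^31 must be written as its negative two's-complement equivalent (hence the `i32.const -2147483636` CHECK lines). A self-contained sketch of the difference, using a standard SLEB128 encoder; this is illustrative and is not lld's writePtrConst, which the InputChunks.cpp hunk below switches to:

// Demonstrates why sign-extending through int32_t matters for i32.const.
#include <cstdint>
#include <cstdio>
#include <vector>

// Standard signed LEB128 encoder.
std::vector<uint8_t> encodeSleb128(int64_t v) {
  std::vector<uint8_t> out;
  bool more = true;
  while (more) {
    uint8_t byte = v & 0x7f;
    v >>= 7; // arithmetic shift preserves the sign
    more = !((v == 0 && !(byte & 0x40)) || (v == -1 && (byte & 0x40)));
    out.push_back(byte | (more ? 0x80 : 0));
  }
  return out;
}

void dump(const char *label, const std::vector<uint8_t> &bytes) {
  printf("%s:", label);
  for (uint8_t b : bytes)
    printf(" %02x", b);
  printf("\n");
}

int main() {
  uint64_t addr = 2147483660; // a data address just past 2 GiB
  // Naive: encodes +2147483660, which is outside int32 range, so a wasm
  // validator rejects the resulting i32.const.
  dump("naive", encodeSleb128(static_cast<int64_t>(addr)));
  // Correct: sign-extend through int32_t first; same low 32 bits, and
  // objdump prints the operand as "i32.const -2147483636".
  dump("fixed", encodeSleb128(static_cast<int32_t>(addr)));
}

The two encodings differ only in the final byte (8c 80 80 80 08 versus 8c 80 80 80 78), but the first is malformed as a 32-bit constant, which is how the invalid binaries mentioned in the test header came about.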
diff --git a/lld/test/wasm/shared-memory.yaml b/lld/test/wasm/shared-memory.yaml index 4cdbb951eab9c..b3490c8785f00 100644 --- a/lld/test/wasm/shared-memory.yaml +++ b/lld/test/wasm/shared-memory.yaml @@ -1,16 +1,16 @@ # RUN: yaml2obj %s -o %t1.o -# RUN: wasm-ld --no-entry --shared-memory --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED +# RUN: wasm-ld --no-entry --no-gc-sections --shared-memory --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=100000 %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-UNALIGNED +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=100000 %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-UNALIGNED -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=131072 --features=bulk-memory %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-ATOMICS +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=bulk-memory %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-ATOMICS -# RUN: not wasm-ld --no-entry --shared-memory --max-memory=131072 --features=atomics %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-BULK-MEM +# RUN: not wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=atomics %t1.o -o - 2>&1 | FileCheck %s --check-prefix SHARED-NO-BULK-MEM # RUN: wasm-ld --relocatable --features=atomics %t1.o -o - | obj2yaml | FileCheck %s --check-prefix ATOMICS-RELOCATABLE -# RUN: wasm-ld --no-entry --shared-memory --max-memory=131072 --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED +# XUN: wasm-ld --no-entry --no-gc-sections --shared-memory --max-memory=131072 --features=atomics,bulk-memory %t1.o -o - | obj2yaml | FileCheck %s --check-prefix SHARED --- !WASM FileHeader: @@ -22,7 +22,7 @@ Sections: Field: __linear_memory Kind: MEMORY Memory: - Minimum: 0x00000001 + Minimum: 0x00000009 - Module: env Field: __indirect_function_table Kind: TABLE diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..91f06a47070a0 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -5,9 +5,20 @@ ; Also test that __heap_base is still aligned with the --stack-first option. 
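All of the address churn in the wasm tests in this patch (data moving from 1024 to 65536, __stack_pointer dropping from 66560/66576 to 65536) follows from the flipped --stack-first default, which the updated stack-first.test below exercises under all three flag combinations. A rough model of the two layouts, assuming the defaults these tests use (64 KiB stack, --global-base=1024, a token 8 bytes of data); this is an illustration, not lld's actual layout code:

// Models wasm-ld's two linear-memory layouts to explain the new constants.
#include <cstdint>
#include <cstdio>

uint64_t alignTo(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }

struct Layout {
  uint64_t stackPointer; // initial __stack_pointer (stack grows down)
  uint64_t dataStart;    // where static data is placed
};

Layout layout(bool stackFirst, uint64_t stackSize, uint64_t globalBase,
              uint64_t dataSize) {
  if (stackFirst) {
    // Stack occupies the bottom of memory; data sits immediately above it.
    uint64_t top = alignTo(stackSize, 16);
    return {top, top};
  }
  // Old default: data at --global-base, stack placed after the data.
  uint64_t dataEnd = alignTo(globalBase + dataSize, 16);
  return {dataEnd + stackSize, globalBase};
}

int main() {
  Layout now = layout(true, 65536, 1024, 8);  // new default
  Layout old = layout(false, 65536, 1024, 8); // --no-stack-first
  printf("stack-first:    sp=%llu data=%llu\n",
         (unsigned long long)now.stackPointer,
         (unsigned long long)now.dataStart);
  printf("no-stack-first: sp=%llu data=%llu\n",
         (unsigned long long)old.stackPointer,
         (unsigned long long)old.dataStart);
}

This reproduces the before/after pairs seen throughout the diff (66576/1024 versus 65536/65536), and with the test's -z stack-size=512 the no-stack-first branch gives 1040 + 512 = 1552, consistent with the NOT-FIRST expectations below.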
RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first.s -o %t.o -RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o + +; Check that the default is `--stack-first` +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s + +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check `--stack-first` +RUN: wasm-ld -z stack-size=512 --no-stack-first --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +62,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/test/wasm/startstop.ll b/lld/test/wasm/startstop.ll index e7a5c80f91f28..c22956f2759e4 100644 --- a/lld/test/wasm/startstop.ll +++ b/lld/test/wasm/startstop.ll @@ -27,19 +27,19 @@ entry: ; CHECK: - Type: DATA ; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 +; CHECK-NEXT: - SectionOffset: 8 ; CHECK-NEXT: InitFlags: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: Content: 03000000040000002A0000002B000000 -; ASM: 0000006e <get_start>: +; ASM: 00000070 <get_start>: ; ASM-EMPTY: -; ASM-NEXT: 70: i32.const 1024 -; ASM-NEXT: 76: end +; ASM-NEXT: 72: i32.const 65536 +; ASM-NEXT: 78: end -; ASM: 00000077 <get_end>: +; ASM: 00000079 <get_end>: ; ASM-EMPTY: -; ASM-NEXT: 79: i32.const 1040 -; ASM-NEXT: 7f: end +; ASM-NEXT: 7b: i32.const 65552 +; ASM-NEXT: 81: end diff --git a/lld/test/wasm/table-base.s b/lld/test/wasm/table-base.s index 56fff414fd31d..92156c49cd538 100644 --- a/lld/test/wasm/table-base.s +++ b/lld/test/wasm/table-base.s @@ -29,7 +29,7 @@ _start: # CHECK-DEFAULT-NEXT: Mutable: true # CHECK-DEFAULT-NEXT: InitExpr: # CHECK-DEFAULT-NEXT: Opcode: I32_CONST -# CHECK-DEFAULT-NEXT: Value: 66560 +# CHECK-DEFAULT-NEXT: Value: 65536 # CHECK-DEFAULT-NEXT: - Index: 1 # CHECK-DEFAULT-NEXT: Type: I32 # CHECK-DEFAULT-NEXT: Mutable: false @@ -58,7 +58,7 @@ _start: # CHECK-100-NEXT: Mutable: true # CHECK-100-NEXT: InitExpr: # CHECK-100-NEXT: Opcode: I32_CONST -# CHECK-100-NEXT: Value: 66560 +# CHECK-100-NEXT: Value: 65536 # CHECK-100-NEXT: - Index: 1 # CHECK-100-NEXT: Type: I32 # CHECK-100-NEXT: Mutable: false diff --git a/lld/test/wasm/tls-align.s b/lld/test/wasm/tls-align.s index 4fd296e1ef7fd..3b51165aea30e 100644 --- a/lld/test/wasm/tls-align.s +++ b/lld/test/wasm/tls-align.s @@ -65,7 +65,7 @@ tls2: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66592 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/tls-non-shared-memory-basic.s b/lld/test/wasm/tls-non-shared-memory-basic.s index 
8ef0173ba72d7..66ccf8f4f945c 100644 --- a/lld/test/wasm/tls-non-shared-memory-basic.s +++ b/lld/test/wasm/tls-non-shared-memory-basic.s @@ -27,11 +27,11 @@ tls1: # CHECK: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 2B000000 # CHECK-NEXT: - Type: CUSTOM # CHECK-NOT: - Type: IMPORT diff --git a/lld/test/wasm/tls-non-shared-memory.s b/lld/test/wasm/tls-non-shared-memory.s index 04fbb62228a7e..0d73acb429b18 100644 --- a/lld/test/wasm/tls-non-shared-memory.s +++ b/lld/test/wasm/tls-non-shared-memory.s @@ -63,38 +63,38 @@ tls1: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # GOT.data.internal.tls1 # CHECK-NEXT: - Index: 2 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK: - Type: DATA # .data # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: 2B000000 # .tdata -# CHECK-NEXT: - SectionOffset: 17 +# CHECK-NEXT: - SectionOffset: 19 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1028 +# CHECK-NEXT: Value: 65540 # CHECK-NEXT: Content: 2A000000 # CHECK-NEXT: - Type: CUSTOM diff --git a/lld/test/wasm/tls.s b/lld/test/wasm/tls.s index b1f47f6769927..21f25f5ee7d18 100644 --- a/lld/test/wasm/tls.s +++ b/lld/test/wasm/tls.s @@ -98,7 +98,7 @@ tls3: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66592 +# CHECK-NEXT: Value: 65536 # __tls_base # CHECK-NEXT: - Index: 1 diff --git a/lld/test/wasm/undefined-weak-call.s b/lld/test/wasm/undefined-weak-call.s index 7490104afe516..47775c86f28ea 100644 --- a/lld/test/wasm/undefined-weak-call.s +++ b/lld/test/wasm/undefined-weak-call.s @@ -61,7 +61,7 @@ callWeakFuncs: # CHECK-NEXT: Maximum: 0x1 # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -69,7 +69,7 @@ callWeakFuncs: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-alias-overide.ll b/lld/test/wasm/weak-alias-overide.ll index ca6f4bf4230a2..30bf7cf3f9dff 100644 --- a/lld/test/wasm/weak-alias-overide.ll +++ b/lld/test/wasm/weak-alias-overide.ll @@ -44,7 +44,7 @@ entry: ; CHECK-NEXT: Maximum: 0x3 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x1 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - Index: 0 @@ -52,7 +52,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 
66560 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-alias.ll b/lld/test/wasm/weak-alias.ll index 1768b8fd5b385..86e42a788b492 100644 --- a/lld/test/wasm/weak-alias.ll +++ b/lld/test/wasm/weak-alias.ll @@ -41,7 +41,7 @@ entry: ; CHECK-NEXT: Maximum: 0x2 ; CHECK-NEXT: - Type: MEMORY ; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 +; CHECK-NEXT: - Minimum: 0x1 ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - Index: 0 @@ -49,7 +49,7 @@ entry: ; CHECK-NEXT: Mutable: true ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66560 +; CHECK-NEXT: Value: 65536 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: ; CHECK-NEXT: - Name: memory diff --git a/lld/test/wasm/weak-symbols.s b/lld/test/wasm/weak-symbols.s index 165ec174aaa50..ed85851729b95 100644 --- a/lld/test/wasm/weak-symbols.s +++ b/lld/test/wasm/weak-symbols.s @@ -48,13 +48,13 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Type: I32 # CHECK-NEXT: Mutable: false # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory @@ -97,11 +97,11 @@ _start: # CHECK-NEXT: Body: 4181808080000B # CHECK-NEXT: - Type: DATA # CHECK-NEXT: Segments: -# CHECK-NEXT: - SectionOffset: 7 +# CHECK-NEXT: - SectionOffset: 8 # CHECK-NEXT: InitFlags: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: Content: '01000000' # CHECK-NEXT: - Type: CUSTOM # CHECK-NEXT: Name: name diff --git a/lld/test/wasm/weak-undefined-pic.s b/lld/test/wasm/weak-undefined-pic.s index 5937380ee4e96..1a3a1715b4bb9 100644 --- a/lld/test/wasm/weak-undefined-pic.s +++ b/lld/test/wasm/weak-undefined-pic.s @@ -45,7 +45,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # Global 'undefined_weak:foo' representing the GOT entry for foo # Unlike other internal GOT entries that need to be mutable this one # is immutable and not updated by `__wasm_apply_global_relocs` diff --git a/lld/test/wasm/weak-undefined.s b/lld/test/wasm/weak-undefined.s index e1f551d6d30b6..558cac527d702 100644 --- a/lld/test/wasm/weak-undefined.s +++ b/lld/test/wasm/weak-undefined.s @@ -67,7 +67,7 @@ _start: # CHECK-NEXT: Maximum: 0x1 # CHECK-NEXT: - Type: MEMORY # CHECK-NEXT: Memories: -# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Minimum: 0x1 # CHECK-NEXT: - Type: GLOBAL # CHECK-NEXT: Globals: # CHECK-NEXT: - Index: 0 @@ -75,7 +75,7 @@ _start: # CHECK-NEXT: Mutable: true # CHECK-NEXT: InitExpr: # CHECK-NEXT: Opcode: I32_CONST -# CHECK-NEXT: Value: 66560 +# CHECK-NEXT: Value: 65536 # CHECK-NEXT: - Type: EXPORT # CHECK-NEXT: Exports: # CHECK-NEXT: - Name: memory diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..fac166587cb9b 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, 
OPT_no_stack_first, true); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 44927e7a432bc..14e02e6009318 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { bool is64 = ctx.arg.is64.value_or(false); bool generated = false; - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { << " output offset=" << offset << "\n"); // Calculate the address at which to apply the relocation - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, offset, "offset"); + writePtrConst(os, offset, is64, "offset"); // In PIC mode we need to add the __memory_base if (ctx.isPic) { @@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { // Now figure out what we want to store at this location bool is64 = relocIs64(rel.Type); - unsigned opcode_reloc_const = - is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_reloc_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; unsigned opcode_reloc_store = @@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, sym->getGOTIndex(), "global index"); if (rel.Addend) { - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, rel.Addend, "addend"); + writePtrConst(os, rel.Addend, is64, "addend"); writeU8(os, opcode_reloc_add, "ADD"); } } else { @@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { baseSymbol = ctx.sym.tlsBase; writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, baseSymbol->getGlobalIndex(), "base"); - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset"); + writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64, + "offset"); writeU8(os, opcode_reloc_add, "ADD"); } diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..33ecf03176d36 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear memory rather than after data">; +defm stack_first: B<"stack-first", + "Place stack at start of linear memory (default)", + "Place the stack after static data region">; def table_base: JJ<"table-base=">, HelpText<"Table offset at which to place address taken functions (Defaults to 1)">; diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index e1192706ea913..399a5084e6595 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { assert(!ctx.arg.extendedConst); bool is64 = ctx.arg.is64.value_or(false); - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? 
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); // Add the virtual address of the data symbol - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, d->getVA(), "offset"); + writePtrConst(os, d->getVA(), is64, "offset"); } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) { if (f->isStub) continue; @@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base"); // Add the table index to __table_base - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, f->getTableIndex(), "offset"); + writePtrConst(os, f->getTableIndex(), is64, "offset"); } else { assert(isa<UndefinedData>(sym) || isa<SharedData>(sym)); continue; diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..0736e6ba132c8 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -62,11 +62,16 @@ if (LLDB_ENABLE_PYTHON) set(cachestring_LLDB_PYTHON_EXT_SUFFIX "Filename extension for native code python modules") + if (LLDB_ENABLE_PYTHON_LIMITED_API) + set(stable_abi "--stable-abi") + endif() + foreach(var LLDB_PYTHON_RELATIVE_PATH LLDB_PYTHON_EXE_RELATIVE_PATH LLDB_PYTHON_EXT_SUFFIX) if(NOT DEFINED ${var} AND NOT CMAKE_CROSSCOMPILING) execute_process( COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/bindings/python/get-python-config.py + ${stable_abi} ${var} OUTPUT_VARIABLE value OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -87,6 +92,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/bindings/interface/SBFrameListExtensions.i b/lldb/bindings/interface/SBFrameListExtensions.i new file mode 100644 index 0000000000000..1c6ac8d50a54c --- /dev/null +++ b/lldb/bindings/interface/SBFrameListExtensions.i @@ -0,0 +1,41 @@ +%extend lldb::SBFrameList { + +#ifdef SWIGPYTHON + %nothreadallow; +#endif + std::string lldb::SBFrameList::__str__ (){ + lldb::SBStream description; + if (!$self->GetDescription(description)) + return std::string("<empty> lldb.SBFrameList()"); + const char *desc = description.GetData(); + size_t desc_len = description.GetSize(); + if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) + --desc_len; + return std::string(desc, desc_len); + } +#ifdef SWIGPYTHON + %clearnothreadallow; +#endif + +#ifdef SWIGPYTHON + %pythoncode %{ + def __iter__(self): + '''Iterate over all frames in a lldb.SBFrameList object.''' + return lldb_iter(self, 'GetSize', 'GetFrameAtIndex') + + def __len__(self): + return int(self.GetSize()) + + def __getitem__(self, key): + if type(key) is not int: + return None + if key < 0: + count = len(self) + if -count <= key < count: + key %= count + + frame = self.GetFrameAtIndex(key) + return frame if frame.IsValid() else None + %} +#endif +} diff --git a/lldb/bindings/interface/SBSectionDocstrings.i b/lldb/bindings/interface/SBSectionDocstrings.i index 231e9e89da116..9c9cb813158d9 100644 --- a/lldb/bindings/interface/SBSectionDocstrings.i +++ b/lldb/bindings/interface/SBSectionDocstrings.i @@ -4,7 +4,7 @@ SBSection supports iteration through 
its subsections, represented as SBSection as well. For example, :: - for sec in exe_module: + for sec in exe_module.section_iter(): if sec.GetName() == '__TEXT': print sec break diff --git a/lldb/bindings/interface/SBThreadExtensions.i b/lldb/bindings/interface/SBThreadExtensions.i index 4ec9f10b1a256..c9ae4103d7b60 100644 --- a/lldb/bindings/interface/SBThreadExtensions.i +++ b/lldb/bindings/interface/SBThreadExtensions.i @@ -41,7 +41,8 @@ STRING_EXTENSION_OUTSIDE(SBThread) def get_thread_frames(self): '''An accessor function that returns a list() that contains all frames in a lldb.SBThread object.''' frames = [] - for frame in self: + frame_list = self.GetFrames() + for frame in frame_list: frames.append(frame) return frames diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index b3d44979c916c..fddbedf02e835 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -119,6 +119,7 @@ %include "lldb/API/SBFileSpecList.h" %include "lldb/API/SBFormat.h" %include "lldb/API/SBFrame.h" +%include "lldb/API/SBFrameList.h" %include "lldb/API/SBFunction.h" %include "lldb/API/SBHostOS.h" %include "lldb/API/SBInstruction.h" @@ -193,6 +194,7 @@ %include "./interface/SBFileSpecExtensions.i" %include "./interface/SBFileSpecListExtensions.i" %include "./interface/SBFrameExtensions.i" +%include "./interface/SBFrameListExtensions.i" %include "./interface/SBFunctionExtensions.i" %include "./interface/SBInstructionExtensions.i" %include "./interface/SBInstructionListExtensions.i" diff --git a/lldb/bindings/lua/lua-typemaps.swig b/lldb/bindings/lua/lua-typemaps.swig index 56756936a532c..f2a7401419368 100644 --- a/lldb/bindings/lua/lua-typemaps.swig +++ b/lldb/bindings/lua/lua-typemaps.swig @@ -121,9 +121,27 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); $1 = (char *)malloc($2); } +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// See https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In Lua, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function.
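The dispatch problem described in the comment above is easiest to see from the scripting side. The sketch below shows the two call shapes the generated dispatcher has to distinguish once these typemaps are applied; it assumes `thread` is a valid, stopped `lldb.SBThread`, and the buffer size of 256 is an arbitrary example value.

```python
import lldb

def print_stop_description(thread):
    # Overload 1: the (char *dst_or_null, size_t dst_len) form. The binding
    # typemaps turn this into "pass a maximum length, get a string back".
    text = thread.GetStopDescription(256)
    print("buffer form:", text)

    # Overload 2: the SBStream form added by this patch.
    stream = lldb.SBStream()
    if thread.GetStopDescription(stream):
        print("stream form:", stream.GetData())
```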
+%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { lua_pop(L, 1); // Blow away the previous result - lua_pushlstring(L, (const char *)$1, $result); + llvm::StringRef ref($1); + lua_pushlstring(L, (const char *)$1, ref.size()); free($1); // SWIG_arg was already incremented } diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt index ef6def3f26872..28a8af8f06319 100644 --- a/lldb/bindings/python/CMakeLists.txt +++ b/lldb/bindings/python/CMakeLists.txt @@ -107,6 +107,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar "plugins" FILES "${LLDB_SOURCE_DIR}/examples/python/templates/parsed_cmd.py" + "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_frame_provider.py" "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_process.py" "${LLDB_SOURCE_DIR}/examples/python/templates/scripted_platform.py" "${LLDB_SOURCE_DIR}/examples/python/templates/operating_system.py" diff --git a/lldb/bindings/python/get-python-config.py b/lldb/bindings/python/get-python-config.py index ae84cbb1215a9..bf8cc48b013e1 100755 --- a/lldb/bindings/python/get-python-config.py +++ b/lldb/bindings/python/get-python-config.py @@ -18,6 +18,9 @@ def relpath_nodots(path, base): def main(): parser = argparse.ArgumentParser(description="extract cmake variables from python") parser.add_argument("variable_name") + parser.add_argument( + "--stable-abi", action="store_true", help="Target the Stable C ABI" + ) args = parser.parse_args() if args.variable_name == "LLDB_PYTHON_RELATIVE_PATH": # LLDB_PYTHON_RELATIVE_PATH is the relative path from lldb's prefix @@ -68,7 +71,10 @@ def main(): print("sys.prefix:", sys.prefix, file=sys.stderr) sys.exit(1) elif args.variable_name == "LLDB_PYTHON_EXT_SUFFIX": - print(sysconfig.get_config_var("EXT_SUFFIX")) + if args.stable_abi: + print(".abi3%s" % sysconfig.get_config_var("SHLIB_SUFFIX")) + else: + print(sysconfig.get_config_var("EXT_SUFFIX")) else: parser.error(f"unknown variable {args.variable_name}") diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig index 3ea24f1a31414..a86dc44ce4106 100644 --- a/lldb/bindings/python/python-swigsafecast.swig +++ b/lldb/bindings/python/python-swigsafecast.swig @@ -37,6 +37,11 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp) { SWIGTYPE_p_lldb__SBThreadPlan); } +PythonObject SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP frames_sp) { + return ToSWIGHelper(new lldb::SBFrameList(std::move(frames_sp)), + SWIGTYPE_p_lldb__SBFrameList); +} + PythonObject SWIGBridge::ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp) { return ToSWIGHelper(new lldb::SBBreakpoint(std::move(breakpoint_sp)), SWIGTYPE_p_lldb__SBBreakpoint); diff --git a/lldb/bindings/python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig index 715914fe745f8..4d3a95768f2f3 100644 --- a/lldb/bindings/python/python-typemaps.swig +++ b/lldb/bindings/python/python-typemaps.swig @@ -224,6 +224,24 @@ AND call SWIG_fail at the same time, because it will result in a double free. } $1 = (char *)malloc($2); } + +// Disable default type checking for this method to avoid SWIG dispatch issues. +// +// Problem: SBThread::GetStopDescription has two overloads: +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 2. 
GetStopDescription(lldb::SBStream& stream) +// +// SWIG generates a dispatch function to select the correct overload based on argument types. +// see https://www.swig.org/Doc4.0/SWIGDocumentation.html#Typemaps_overloading. +// However, this dispatcher doesn't consider typemaps that transform function signatures. +// +// In Python, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). +// The dispatcher still checks against the original (char*, size_t) signature instead of +// the transformed (int) signature, causing type matching to fail. +// This only affects SBThread::GetStopDescription since the type check also matches +// the argument name, which is unique to this function. +%typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" + %typemap(argout) (char *dst_or_null, size_t dst_len) { Py_XDECREF($result); /* Blow away any previous result */ llvm::StringRef ref($1); diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index e7acba5b95d89..3a0995e84f643 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -556,6 +556,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyOb return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) { + lldb::SBFrameList *sb_ptr = NULL; + + int valid_cast = SWIG_ConvertPtr(data, (void **)&sb_ptr, + SWIGTYPE_p_lldb__SBFrameList, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallCommand( const char *python_function_name, const char *session_dictionary_name, lldb::DebuggerSP debugger, const char *args, diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md index 285d2d3dea9ea..b1a3441ffe2ee 100644 --- a/lldb/docs/use/tutorials/implementing-standalone-scripts.md +++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md @@ -147,3 +147,20 @@ SBFunction: id = 0x0000002e, name = main, type = main a.out[0x714]: mov w0, #0x0 ; =0 a.out[0x718]: ret ``` + +### Troubleshooting + +You can use all the usual Python tools to debug scripts, and on top of that +you can enable LLDB's log channels. To do this in the script shown above, add +this line right after `debugger` has been assigned: + +```python +debugger.EnableLog("lldb", ["all"]) +``` + +`lldb` `all` enables a lot of different channels, so you will probably want +to enable only a few channels once you know what you are interested in. + +This API call is the equivalent of `log enable lldb all` when using LLDB +interactively. All channels available to `log enable` can be enabled using +`EnableLog` too. \ No newline at end of file diff --git a/lldb/examples/python/templates/scripted_frame_provider.py b/lldb/examples/python/templates/scripted_frame_provider.py new file mode 100644 index 0000000000000..20f4d76d188c2 --- /dev/null +++ b/lldb/examples/python/templates/scripted_frame_provider.py @@ -0,0 +1,113 @@ +from abc import ABCMeta, abstractmethod + +import lldb + + +class ScriptedFrameProvider(metaclass=ABCMeta): + """ + The base class for a scripted frame provider. + + A scripted frame provider allows you to provide custom stack frames for a + thread, which can be used to augment or replace the standard unwinding + mechanism. 
This is useful for: + + - Providing frames for custom calling conventions or languages + - Reconstructing missing frames from crash dumps or core files + - Adding diagnostic or synthetic frames for debugging + - Visualizing state machines or async execution contexts + + Most of the base class methods are marked `@abstractmethod` and need to be + overridden by the inheriting class. + + Example usage: + + .. code-block:: python + + # Attach a frame provider to a thread + thread = process.GetSelectedThread() + error = thread.SetScriptedFrameProvider( + "my_module.MyFrameProvider", + lldb.SBStructuredData() + ) + """ + + @abstractmethod + def __init__(self, input_frames, args): + """Construct a scripted frame provider. + + Args: + input_frames (lldb.SBFrameList): The frame list to use as input. + This allows you to access frames by index. The frames are + materialized lazily as you access them. + args (lldb.SBStructuredData): A dictionary holding arbitrary + key/value pairs used by the scripted frame provider. + """ + self.input_frames = None + self.args = None + self.thread = None + self.target = None + self.process = None + + if isinstance(input_frames, lldb.SBFrameList) and input_frames.IsValid(): + self.input_frames = input_frames + self.thread = input_frames.GetThread() + if self.thread and self.thread.IsValid(): + self.process = self.thread.GetProcess() + if self.process and self.process.IsValid(): + self.target = self.process.GetTarget() + + if isinstance(args, lldb.SBStructuredData) and args.IsValid(): + self.args = args + + @abstractmethod + def get_frame_at_index(self, index): + """Get a single stack frame at the given index. + + This method is called lazily when a specific frame is needed in the + thread's backtrace (e.g., via the 'bt' command). Each frame is + requested individually as needed. + + Args: + index (int): The frame index to retrieve (0 for youngest/top frame). + + Returns: + Dict or None: A frame dictionary describing the stack frame, or None + if no frame exists at this index. The dictionary should contain: + + Required fields: + - idx (int): The synthetic frame index (0 for youngest/top frame) + - pc (int): The program counter address for the synthetic frame + + Alternatively, you can return: + - A ScriptedFrame object for full control over frame behavior + - An integer representing an input frame index to reuse + - None to indicate no more frames exist + + Example: + + .. code-block:: python + + def get_frame_at_index(self, index): + # Return None when there are no more frames + if index >= self.total_frames: + return None + + # Re-use an input frame by returning its index + if self.should_use_input_frame(index): + return index # Returns input frame at this index + + # Or create a custom frame dictionary + if index == 0: + return { + "idx": 0, + "pc": 0x100001234, + } + + return None + + Note: + The frames are indexed from 0 (youngest/top) to N (oldest/bottom). + This method will be called repeatedly with increasing indices until + None is returned.
+ """ + pass diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index 6485f35302a1c..6ac35bb4a364b 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -37,6 +37,7 @@ #include "lldb/API/SBFileSpecList.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBFunction.h" #include "lldb/API/SBHostOS.h" #include "lldb/API/SBInstruction.h" diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index 85f6bbeea5bf9..5fcc685050c0b 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -76,6 +76,7 @@ class LLDB_API SBFileSpec; class LLDB_API SBFileSpecList; class LLDB_API SBFormat; class LLDB_API SBFrame; +class LLDB_API SBFrameList; class LLDB_API SBFunction; class LLDB_API SBHostOS; class LLDB_API SBInstruction; diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h index 92917e57fc125..5283cdfe53faa 100644 --- a/lldb/include/lldb/API/SBFrame.h +++ b/lldb/include/lldb/API/SBFrame.h @@ -222,6 +222,7 @@ class LLDB_API SBFrame { protected: friend class SBBlock; friend class SBExecutionContext; + friend class SBFrameList; friend class SBInstruction; friend class SBThread; friend class SBValue; diff --git a/lldb/include/lldb/API/SBFrameList.h b/lldb/include/lldb/API/SBFrameList.h new file mode 100644 index 0000000000000..0039ffb1f863f --- /dev/null +++ b/lldb/include/lldb/API/SBFrameList.h @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBFRAMELIST_H +#define LLDB_API_SBFRAMELIST_H + +#include "lldb/API/SBDefines.h" + +namespace lldb_private { +class ScriptInterpreter; +namespace python { +class SWIGBridge; +} +namespace lua { +class SWIGBridge; +} +} // namespace lldb_private + +namespace lldb { + +/// Represents a list of SBFrame objects. +/// +/// SBFrameList provides a way to iterate over stack frames lazily, +/// materializing frames on-demand as they are accessed. This is more +/// efficient than eagerly creating all frames upfront. +class LLDB_API SBFrameList { +public: + SBFrameList(); + + SBFrameList(const lldb::SBFrameList &rhs); + + ~SBFrameList(); + + const lldb::SBFrameList &operator=(const lldb::SBFrameList &rhs); + + explicit operator bool() const; + + bool IsValid() const; + + /// Returns the number of frames in the list. + uint32_t GetSize() const; + + /// Returns the frame at the given index. + /// + /// \param[in] idx + /// The index of the frame to retrieve (0-based). + /// + /// \return + /// An SBFrame object for the frame at the specified index. + /// Returns an invalid SBFrame if idx is out of range. + lldb::SBFrame GetFrameAtIndex(uint32_t idx) const; + + /// Get the thread associated with this frame list. + /// + /// \return + /// An SBThread object representing the thread. + lldb::SBThread GetThread() const; + + /// Clear all frames from this list. + void Clear(); + + /// Get a description of this frame list. + /// + /// \param[in] description + /// The stream to write the description to. + /// + /// \return + /// True if the description was successfully written. 
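SBFrameList gives scripts list-like access to a thread's frames; the `__iter__`, `__len__`, and `__getitem__` helpers come from the `SBFrameListExtensions.i` file earlier in this patch. A minimal usage sketch, assuming `thread` is a valid `lldb.SBThread`:

```python
frames = thread.GetFrames()  # returns an lldb.SBFrameList
print("frame count:", len(frames))

# Iteration materializes each SBFrame lazily via GetFrameAtIndex().
for frame in frames:
    print(frame.GetFunctionName())

# __getitem__ supports negative keys and returns None for invalid frames.
youngest = frames[0]
if youngest is not None:
    print("pc:", hex(youngest.GetPC()))
```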
+ bool GetDescription(lldb::SBStream &description) const; + +protected: + friend class SBThread; + + friend class lldb_private::python::SWIGBridge; + friend class lldb_private::lua::SWIGBridge; + friend class lldb_private::ScriptInterpreter; + +private: + SBFrameList(const lldb::StackFrameListSP &frame_list_sp); + + void SetFrameList(const lldb::StackFrameListSP &frame_list_sp); + + // This needs to be a shared_ptr since an SBFrameList can be passed to + // scripting affordances like ScriptedFrameProviders but also out of + // convenience because Thread::GetStackFrameList returns a StackFrameListSP. + lldb::StackFrameListSP m_opaque_sp; +}; + +} // namespace lldb + +#endif // LLDB_API_SBFRAMELIST_H diff --git a/lldb/include/lldb/API/SBModuleSpec.h b/lldb/include/lldb/API/SBModuleSpec.h index 8d1ecfe6e6f8b..b80a52b7a235f 100644 --- a/lldb/include/lldb/API/SBModuleSpec.h +++ b/lldb/include/lldb/API/SBModuleSpec.h @@ -87,6 +87,16 @@ class LLDB_API SBModuleSpec { bool GetDescription(lldb::SBStream &description); + lldb::SBTarget GetTarget(); + + /// Set the target to be used when resolving a module. + /// + /// A target can help locate a module specified by an SBModuleSpec. The + /// target settings, like the executable and debug info search paths, can + /// be essential. The target's platform can also be used to locate or download + /// the specified module. + void SetTarget(lldb::SBTarget target); + private: friend class SBModuleSpecList; friend class SBModule; diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h index d230da6123fb3..21f9d21e0e717 100644 --- a/lldb/include/lldb/API/SBStream.h +++ b/lldb/include/lldb/API/SBStream.h @@ -81,6 +81,7 @@ class LLDB_API SBStream { friend class SBFileSpec; friend class SBFileSpecList; friend class SBFrame; + friend class SBFrameList; friend class SBFunction; friend class SBInstruction; friend class SBInstructionList; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 173fd05b54a13..379a0bb7e9513 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -999,6 +999,7 @@ class LLDB_API SBTarget { friend class SBFunction; friend class SBInstruction; friend class SBModule; + friend class SBModuleSpec; friend class SBPlatform; friend class SBProcess; friend class SBSection; diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h index e9fe5858d125e..f6a6d19935b83 100644 --- a/lldb/include/lldb/API/SBThread.h +++ b/lldb/include/lldb/API/SBThread.h @@ -81,6 +81,14 @@ class LLDB_API SBThread { SBThreadCollection GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type); + /// Gets a human-readable description of why the thread stopped. + /// + /// \param stream Output stream to receive the stop description text. + /// \return + /// true if the description was obtained and written to the stream, + /// false if there was an error retrieving the description.
+ bool GetStopDescription(lldb::SBStream &stream) const; + size_t GetStopDescription(char *dst_or_null, size_t dst_len); SBValue GetStopReturnValue(); @@ -178,6 +186,8 @@ class LLDB_API SBThread { lldb::SBFrame GetFrameAtIndex(uint32_t idx); + lldb::SBFrameList GetFrames(); + lldb::SBFrame GetSelectedFrame(); lldb::SBFrame SetSelectedFrame(uint32_t frame_idx); @@ -236,6 +246,7 @@ class LLDB_API SBThread { friend class SBSaveCoreOptions; friend class SBExecutionContext; friend class SBFrame; + friend class SBFrameList; friend class SBProcess; friend class SBDebugger; friend class SBValue; diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h index 124cb55eaf723..57acb82dd96e9 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h @@ -32,7 +32,8 @@ class BreakpointLocationCollection { ~BreakpointLocationCollection(); - BreakpointLocationCollection &operator=(const BreakpointLocationCollection &rhs); + BreakpointLocationCollection & + operator=(const BreakpointLocationCollection &rhs); /// Add the breakpoint \a bp_loc_sp to the list. /// @@ -172,17 +173,18 @@ class BreakpointLocationCollection { lldb::break_id_t break_loc_id) const; collection m_break_loc_collection; - mutable std::mutex m_collection_mutex; + mutable std::recursive_mutex m_collection_mutex; /// These are used if we're preserving breakpoints in this list: const bool m_preserving_bkpts = false; std::map<std::pair<lldb::break_id_t, lldb::break_id_t>, lldb::BreakpointSP> m_preserved_bps; public: - typedef llvm::iterator_range<collection::const_iterator> + typedef LockingAdaptedIterable<std::recursive_mutex, collection> BreakpointLocationCollectionIterable; BreakpointLocationCollectionIterable BreakpointLocations() { - return BreakpointLocationCollectionIterable(m_break_loc_collection); + return BreakpointLocationCollectionIterable(m_break_loc_collection, + m_collection_mutex); } }; } // namespace lldb_private diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index e71f3b2bad6b4..df473dff091f8 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -476,9 +476,9 @@ class ModuleList { static Status GetSharedModule(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, - bool *did_create_ptr, bool always_create = false); + bool *did_create_ptr, bool always_create = false, + bool invoke_locate_callback = true); static bool RemoveSharedModule(lldb::ModuleSP &module_sp); diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 86be0383f8b47..acbc85b48f02c 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -16,9 +16,11 @@ #include "lldb/Utility/Iterable.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/UUID.h" +#include "lldb/lldb-forward.h" #include "llvm/Support/Chrono.h" +#include <memory> #include <mutex> #include <vector> @@ -126,6 +128,16 @@ class ModuleSpec { lldb::DataBufferSP GetData() const { return m_data; } + lldb::TargetSP GetTargetSP() const { return m_target_wp.lock(); } + + /// Set the target to be used when resolving a module. + /// + /// A target can help locate a module specified by a ModuleSpec. The target + /// settings, like the executable and debug info search paths, can be + /// essential. 
The target's platform can also be used to locate or download + /// the specified module. + void SetTarget(std::shared_ptr<Target> target) { m_target_wp = target; } + void Clear() { m_file.Clear(); m_platform_file.Clear(); @@ -137,6 +149,7 @@ class ModuleSpec { m_object_size = 0; m_source_mappings.Clear(false); m_object_mod_time = llvm::sys::TimePoint<>(); + m_target_wp.reset(); } explicit operator bool() const { @@ -265,6 +278,11 @@ class ModuleSpec { ArchSpec m_arch; UUID m_uuid; ConstString m_object_name; + /// The target used when resolving a module. A target can help locate a module + /// specified by a ModuleSpec. The target settings, like the executable and + /// debug info search paths, can be essential. The target's platform can also + /// be used to locate or download the specified module. + std::weak_ptr<Target> m_target_wp; uint64_t m_object_offset = 0; uint64_t m_object_size = 0; llvm::sys::TimePoint<> m_object_mod_time; diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index aa60b7c6693ca..ab2ca58a88ddd 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -356,6 +356,24 @@ class PluginManager { GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang, Debugger &debugger); + // SyntheticFrameProvider + static bool + RegisterPlugin(llvm::StringRef name, llvm::StringRef description, + SyntheticFrameProviderCreateInstance create_native_callback, + ScriptedFrameProviderCreateInstance create_scripted_callback); + + static bool + UnregisterPlugin(SyntheticFrameProviderCreateInstance create_callback); + + static bool + UnregisterPlugin(ScriptedFrameProviderCreateInstance create_callback); + + static SyntheticFrameProviderCreateInstance + GetSyntheticFrameProviderCreateCallbackForPluginName(llvm::StringRef name); + + static ScriptedFrameProviderCreateInstance + GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx); + // StructuredDataPlugin /// Register a StructuredDataPlugin class along with optional diff --git a/lldb/include/lldb/Core/Section.h b/lldb/include/lldb/Core/Section.h index f0f5a0b3499c0..3c5586c489da5 100644 --- a/lldb/include/lldb/Core/Section.h +++ b/lldb/include/lldb/Core/Section.h @@ -46,6 +46,8 @@ class SectionList { /// Create an empty list. SectionList() = default; + SectionList(const SectionList &lhs); + SectionList &operator=(const SectionList &rhs); size_t AddSection(const lldb::SectionSP §ion_sp); @@ -96,6 +98,17 @@ class SectionList { /// information. uint64_t GetDebugInfoSize() const; + // Callback to decide which of two matching sections should be used in the + // merged output. + using MergeCallback = + std::function<lldb::SectionSP(lldb::SectionSP, lldb::SectionSP)>; + + // Function that merges two different sections into a new output list. All + // unique sections will be checked for conflict and resolved using the + // supplied merging callback. + static SectionList Merge(SectionList &lhs, SectionList &rhs, + MergeCallback filter); + protected: collection m_sections; }; @@ -273,6 +286,9 @@ class Section : public std::enable_shared_from_this<Section>, /// return true. bool ContainsOnlyDebugInfo() const; + /// Returns true if this is a global offset table section. 
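`SectionList::Merge` passes unique sections through and hands conflicting pairs to the supplied callback. As a rough Python model of that contract only (not the C++ implementation; how two sections are matched up is simplified to name equality here):

```python
def merge_section_lists(lhs, rhs, resolve):
    """Toy model of Merge: resolve(a, b) picks the section to keep."""
    merged = {sec.name: sec for sec in lhs}
    for sec in rhs:
        if sec.name in merged:
            merged[sec.name] = resolve(merged[sec.name], sec)
        else:
            merged[sec.name] = sec
    return list(merged.values())

# Example policy: keep whichever section has more content.
# merged = merge_section_lists(a, b, lambda x, y: x if x.size >= y.size else y)
```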
+ bool IsGOTSection() const; + protected: ObjectFile *m_obj_file; // The object file that data for this section should // be read from diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 1244291596b73..83dc74768733d 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -109,6 +109,8 @@ class SourceManager { private: void CommonInitializer(lldb::SupportFileSP support_file_sp, lldb::TargetSP target_sp); + void CommonInitializerImpl(lldb::SupportFileSP support_file_sp, + lldb::TargetSP target_sp); }; typedef std::shared_ptr<File> FileSP; diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h new file mode 100644 index 0000000000000..2d9f713676f90 --- /dev/null +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H +#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H + +#include "lldb/lldb-private.h" + +#include "ScriptedInterface.h" + +namespace lldb_private { +class ScriptedFrameProviderInterface : public ScriptedInterface { +public: + virtual llvm::Expected<StructuredData::GenericSP> + CreatePluginObject(llvm::StringRef class_name, + lldb::StackFrameListSP input_frames, + StructuredData::DictionarySP args_sp) = 0; + + virtual StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) { + return {}; + } +}; +} // namespace lldb_private + +#endif // LLDB_INTERPRETER_INTERFACES_SCRIPTEDFRAMEPROVIDERINTERFACE_H diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index edb80dc66aca7..7fed4940b85bf 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -16,6 +16,7 @@ #include "lldb/API/SBError.h" #include "lldb/API/SBEvent.h" #include "lldb/API/SBExecutionContext.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/API/SBStream.h" @@ -28,6 +29,7 @@ #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedProcessInterface.h" #include "lldb/Interpreter/Interfaces/ScriptedThreadInterface.h" @@ -537,6 +539,11 @@ class ScriptInterpreter : public PluginInterface { return {}; } + virtual lldb::ScriptedFrameProviderInterfaceSP + CreateScriptedFrameProviderInterface() { + return {}; + } + virtual lldb::ScriptedThreadPlanInterfaceSP CreateScriptedThreadPlanInterface() { return {}; @@ -596,6 +603,9 @@ class ScriptInterpreter : public PluginInterface { lldb::ExecutionContextRefSP GetOpaqueTypeFromSBExecutionContext( const lldb::SBExecutionContext &exe_ctx) const; + lldb::StackFrameListSP + GetOpaqueTypeFromSBFrameList(const 
lldb::SBFrameList &exe_ctx) const; + protected: Debugger &m_debugger; lldb::ScriptLanguage m_script_lang; diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index df8489a7fe582..869c5076ee0a7 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -144,7 +144,7 @@ class CompilerType { bool IsDefined() const; - bool IsFloatingPointType(uint32_t &count, bool &is_complex) const; + bool IsFloatingPointType(bool &is_complex) const; bool IsFunctionType() const; @@ -400,7 +400,7 @@ class CompilerType { /// Return the size of the type in bits. llvm::Expected<uint64_t> GetBitSize(ExecutionContextScope *exe_scope) const; - lldb::Encoding GetEncoding(uint64_t &count) const; + lldb::Encoding GetEncoding() const; lldb::Format GetFormat() const; diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h index 1b9ae1fb31a69..1de08a8576507 100644 --- a/lldb/include/lldb/Symbol/ObjectFile.h +++ b/lldb/include/lldb/Symbol/ObjectFile.h @@ -758,6 +758,12 @@ class ObjectFile : public std::enable_shared_from_this<ObjectFile>, return false; } + /// Returns true if the section is a global offset table section. + virtual bool IsGOTSection(const lldb_private::Section §ion) const { + assert(section.GetObjectFile() == this && "Wrong object file!"); + return false; + } + /// Get a hash that can be used for caching object file releated information. /// /// Data for object files can be cached between runs of debug sessions and diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index e657357b942f1..02b43e300a83e 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -507,7 +507,7 @@ class Type : public std::enable_shared_from_this<Type>, public UserID { lldb::Format GetFormat(); - lldb::Encoding GetEncoding(uint64_t &count); + lldb::Encoding GetEncoding(); SymbolContextScope *GetSymbolContextScope() { return m_context; } const SymbolContextScope *GetSymbolContextScope() const { return m_context; } diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 0ec3a28898329..25b208a65349b 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -163,7 +163,7 @@ class TypeSystem : public PluginInterface, virtual bool IsDefined(lldb::opaque_compiler_type_t type) = 0; virtual bool IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) = 0; + bool &is_complex) = 0; virtual bool IsFunctionType(lldb::opaque_compiler_type_t type) = 0; @@ -317,8 +317,7 @@ class TypeSystem : public PluginInterface, GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) = 0; - virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) = 0; + virtual lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) = 0; virtual lldb::Format GetFormat(lldb::opaque_compiler_type_t type) = 0; diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. 
+ /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index 9958b6ea2f815..9292f790333a1 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -318,7 +318,9 @@ class Language : public PluginInterface { /// /// This function should only return true if there is a high confidence /// that the name actually belongs to this language. - virtual bool SymbolNameFitsToLanguage(Mangled name) const { return false; } + virtual bool SymbolNameFitsToLanguage(const Mangled &name) const { + return false; + } /// An individual data formatter may apply to several types and cross language /// boundaries. Each of those languages may want to customize the display of diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 35ffdabf907e7..1104722f52c70 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -127,8 +127,7 @@ class Platform : public PluginInterface { /// Returns \b true if this Platform plug-in was able to find /// a suitable executable, \b false otherwise. virtual Status ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr); + lldb::ModuleSP &exe_module_sp); /// Find a symbol file given a symbol file module specification. /// @@ -304,10 +303,11 @@ class Platform : public PluginInterface { /// \return /// The Status object for any errors found while searching for /// the binary. - virtual Status GetSharedModule( - const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); + virtual Status + GetSharedModule(const ModuleSpec &module_spec, Process *process, + lldb::ModuleSP &module_sp, + llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, + bool *did_create_ptr); void CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, @@ -1039,8 +1039,8 @@ class Platform : public PluginInterface { /// predefined trap handlers, this method may be a no-op. 
virtual void CalculateTrapHandlerSymbolNames() = 0; - Status GetCachedExecutable(ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr); + Status GetCachedExecutable(ModuleSpec &module_spec, + lldb::ModuleSP &module_sp); virtual Status DownloadModuleSlice(const FileSpec &src_file_spec, const uint64_t src_offset, diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 8f5892e16cedf..c1f9785e76f90 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -127,10 +127,7 @@ class ProcessAttachInfo : public ProcessInstanceInfo { public: ProcessAttachInfo() = default; - ProcessAttachInfo(const ProcessLaunchInfo &launch_info) - : m_resume_count(0), m_wait_for_launch(false), m_ignore_existing(true), - m_continue_once_attached(false), m_detach_on_error(true), - m_async(false) { + ProcessAttachInfo(const ProcessLaunchInfo &launch_info) { ProcessInfo::operator=(launch_info); SetProcessPluginName(launch_info.GetProcessPluginName()); SetResumeCount(launch_info.GetResumeCount()); diff --git a/lldb/include/lldb/Target/RemoteAwarePlatform.h b/lldb/include/lldb/Target/RemoteAwarePlatform.h index fb2eecfaa23a8..de13b18f30d85 100644 --- a/lldb/include/lldb/Target/RemoteAwarePlatform.h +++ b/lldb/include/lldb/Target/RemoteAwarePlatform.h @@ -20,10 +20,8 @@ class RemoteAwarePlatform : public Platform { public: using Platform::Platform; - virtual Status - ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) override; + virtual Status ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) override; bool GetModuleSpec(const FileSpec &module_file_spec, const ArchSpec &arch, ModuleSpec &module_spec) override; diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index ea9aab86b8ea1..5b0df0ddb3e29 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -101,6 +101,9 @@ class StackFrameList { /// Returns whether we have currently fetched all the frames of a stack. bool WereAllFramesFetched() const; + /// Get the thread associated with this frame list. + Thread &GetThread() const { return m_thread; } + protected: friend class Thread; friend class ScriptedThread; diff --git a/lldb/include/lldb/Target/SyntheticFrameProvider.h b/lldb/include/lldb/Target/SyntheticFrameProvider.h new file mode 100644 index 0000000000000..61a492f356ece --- /dev/null +++ b/lldb/include/lldb/Target/SyntheticFrameProvider.h @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H +#define LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H + +#include "lldb/Core/PluginInterface.h" +#include "lldb/Target/StackFrameList.h" +#include "lldb/Target/ThreadSpec.h" +#include "lldb/Utility/ScriptedMetadata.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" +#include "llvm/Support/Error.h" + +#include <optional> +#include <vector> + +namespace lldb_private { + +/// This struct contains the metadata needed to instantiate a frame provider +/// and optional filters to control which threads it applies to. +struct SyntheticFrameProviderDescriptor { + /// Metadata for instantiating the provider (e.g. script class name and args). + lldb::ScriptedMetadataSP scripted_metadata_sp; + + /// Optional list of thread specifications to which this provider applies. + /// If empty, the provider applies to all threads. A thread matches if it + /// satisfies ANY of the specs in this vector (OR logic). + std::vector<ThreadSpec> thread_specs; + + SyntheticFrameProviderDescriptor() = default; + + SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) + : scripted_metadata_sp(metadata_sp) {} + + SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, + const std::vector<ThreadSpec> &specs) + : scripted_metadata_sp(metadata_sp), thread_specs(specs) {} + + /// Get the name of this descriptor (the scripted class name). + llvm::StringRef GetName() const { + return scripted_metadata_sp ? scripted_metadata_sp->GetClassName() : ""; + } + + /// Check if this descriptor applies to the given thread. + bool AppliesToThread(Thread &thread) const { + // If no thread specs specified, applies to all threads. + if (thread_specs.empty()) + return true; + + // Check if the thread matches any of the specs (OR logic). + for (const auto &spec : thread_specs) { + if (spec.ThreadPassesBasicTests(thread)) + return true; + } + return false; + } + + /// Check if this descriptor has valid metadata for script-based providers. + bool IsValid() const { return scripted_metadata_sp != nullptr; } + + void Dump(Stream *s) const; +}; + +/// Base class for all synthetic frame providers. +/// +/// Synthetic frame providers allow modifying or replacing the stack frames +/// shown for a thread. This is useful for: +/// - Providing frames for custom calling conventions or languages. +/// - Reconstructing missing frames from crash dumps or core files. +/// - Adding diagnostic or synthetic frames for debugging. +/// - Visualizing state machines or async execution contexts. +class SyntheticFrameProvider : public PluginInterface { +public: + /// Try to create a SyntheticFrameProvider instance for the given input + /// frames and descriptor. + /// + /// This method iterates through all registered SyntheticFrameProvider + /// plugins and returns the first one that can handle the given descriptor. + /// + /// \param[in] input_frames + /// The input stack frame list that this provider will transform. + /// This could be real unwound frames or output from another provider. + /// + /// \param[in] descriptor + /// The descriptor containing metadata for the provider. + /// + /// \return + /// A shared pointer to a SyntheticFrameProvider if one could be created, + /// otherwise an \a llvm::Error. 
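On the scripting side, the provider's `args` eventually arrive in `CreatePluginObject` (and the template's `__init__`) as a structured-data dictionary. A hedged sketch of packaging them with `SBStructuredData`; the class path and JSON keys are placeholders:

```python
import lldb

args = lldb.SBStructuredData()
# SetFromJSON accepts a JSON string; these keys are invented for the example.
args.SetFromJSON('{"max_depth": 32, "prepend_marker_frame": true}')

# SetScriptedFrameProvider is the attach point documented in the
# scripted_frame_provider template earlier in this patch.
error = thread.SetScriptedFrameProvider("my_module.MyFrameProvider", args)
if error.Fail():
    print(error.GetCString())
```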
+ static llvm::Expected<lldb::SyntheticFrameProviderSP> + CreateInstance(lldb::StackFrameListSP input_frames, + const SyntheticFrameProviderDescriptor &descriptor); + + /// Try to create a SyntheticFrameProvider instance for the given input + /// frames using a specific C++ plugin. + /// + /// This method directly invokes a specific SyntheticFrameProvider plugin + /// by name, bypassing the descriptor-based plugin iteration. This is useful + /// for C++ plugins that don't require scripted metadata. + /// + /// \param[in] input_frames + /// The input stack frame list that this provider will transform. + /// This could be real unwound frames or output from another provider. + /// + /// \param[in] plugin_name + /// The name of the plugin to use for creating the provider. + /// + /// \param[in] thread_specs + /// Optional list of thread specifications to which this provider applies. + /// If empty, the provider applies to all threads. + /// + /// \return + /// A shared pointer to a SyntheticFrameProvider if one could be created, + /// otherwise an \a llvm::Error. + static llvm::Expected<lldb::SyntheticFrameProviderSP> + CreateInstance(lldb::StackFrameListSP input_frames, + llvm::StringRef plugin_name, + const std::vector<ThreadSpec> &thread_specs = {}); + + ~SyntheticFrameProvider() override; + + /// Get a single stack frame at the specified index. + /// + /// This method is called lazily - frames are only created when requested. + /// The provider can access its input frames via GetInputFrames() if needed. + /// + /// \param[in] idx + /// The index of the frame to create. + /// + /// \return + /// An Expected containing the StackFrameSP if successful. Returns an + /// error when the index is beyond the last frame to signal the end of + /// the frame list. + virtual llvm::Expected<lldb::StackFrameSP> GetFrameAtIndex(uint32_t idx) = 0; + + /// Get the thread associated with this provider. + Thread &GetThread() { return m_input_frames->GetThread(); } + + /// Get the input frames that this provider transforms. + lldb::StackFrameListSP GetInputFrames() const { return m_input_frames; } + +protected: + SyntheticFrameProvider(lldb::StackFrameListSP input_frames); + + lldb::StackFrameListSP m_input_frames; +}; + +} // namespace lldb_private + +#endif // LLDB_TARGET_SYNTHETICFRAMEPROVIDER_H diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index c375df248154f..40f9c9bea1c12 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -1346,6 +1346,13 @@ class Target : public std::enable_shared_from_this<Target>, const lldb_private::RegisterFlags &flags, uint32_t byte_size); + /// Sends a breakpoint notification event. + void NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType event_kind); + /// Sends a breakpoint notification event. + void NotifyBreakpointChanged(Breakpoint &bp, + const lldb::EventDataSP &breakpoint_data_sp); + llvm::Expected<lldb::DisassemblerSP> ReadInstructions(const Address &start_addr, uint32_t count, const char *flavor_string = nullptr); diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 688c056da2633..841f80cd1b1eb 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1295,6 +1295,8 @@ class Thread : public std::enable_shared_from_this<Thread>, /// an empty std::optional is returned in that case. 
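Putting `GetFrameAtIndex` together with the Python template from earlier in the patch, a minimal end-to-end provider might look like the sketch below: it prepends one synthetic marker frame and then reuses the real input frames. The module name, class name, and pc value are illustrative only.

```python
import lldb
from lldb.plugins.scripted_frame_provider import ScriptedFrameProvider


class MarkerFrameProvider(ScriptedFrameProvider):
    """Prepends one synthetic frame, then defers to the input frames."""

    def __init__(self, input_frames, args):
        super().__init__(input_frames, args)

    def get_frame_at_index(self, index):
        if index == 0:
            # Synthetic youngest frame; the pc is a placeholder address.
            return {"idx": 0, "pc": 0x100001234}
        # Shift the real frames down by one by returning input-frame indices.
        if index - 1 < len(self.input_frames):
            return index - 1
        return None  # signals the end of the frame list


# Attach from an interactive session (module path is hypothetical):
# error = thread.SetScriptedFrameProvider(
#     "marker_provider.MarkerFrameProvider", lldb.SBStructuredData())
```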
std::optional<lldb::addr_t> GetPreviousFrameZeroPC(); + lldb::StackFrameListSP GetStackFrameList(); + protected: friend class ThreadPlan; friend class ThreadList; @@ -1336,8 +1338,6 @@ class Thread : public std::enable_shared_from_this<Thread>, return StructuredData::ObjectSP(); } - lldb::StackFrameListSP GetStackFrameList(); - void SetTemporaryResumeState(lldb::StateType new_state) { m_temporary_resume_state = new_state; } diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. /// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream. 
/// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index af5656b3dcad1..8b8d081ca2113 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -188,6 +188,7 @@ class Scalar; class ScriptInterpreter; class ScriptInterpreterLocker; class ScriptedFrameInterface; +class ScriptedFrameProviderInterface; class ScriptedMetadata; class ScriptedBreakpointInterface; class ScriptedPlatformInterface; @@ -235,6 +236,7 @@ class SymbolVendor; class Symtab; class SyntheticChildren; class SyntheticChildrenFrontEnd; +class SyntheticFrameProvider; class SystemRuntime; class Progress; class Target; @@ -411,6 +413,10 @@ typedef std::shared_ptr<lldb_private::ScriptSummaryFormat> typedef std::shared_ptr<lldb_private::ScriptInterpreter> ScriptInterpreterSP; typedef std::shared_ptr<lldb_private::ScriptedFrameInterface> ScriptedFrameInterfaceSP; +typedef std::shared_ptr<lldb_private::ScriptedFrameProviderInterface> + ScriptedFrameProviderInterfaceSP; +typedef std::shared_ptr<lldb_private::SyntheticFrameProvider> + SyntheticFrameProviderSP; typedef std::shared_ptr<lldb_private::ScriptedMetadata> ScriptedMetadataSP; typedef std::unique_ptr<lldb_private::ScriptedPlatformInterface> ScriptedPlatformInterfaceUP; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 249b25c251ac2..5fc5c14c52f9e 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -15,6 +15,7 @@ #include "lldb/lldb-types.h" #include <memory> #include <set> +#include <vector> namespace llvm { namespace json { @@ -25,6 +26,7 @@ class Value; namespace lldb_private { class ScriptedInterfaceUsages; +struct SyntheticFrameProviderDescriptor; typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp, const ArchSpec &arch); typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)( @@ -86,6 +88,14 @@ typedef lldb::RegisterTypeBuilderSP (*RegisterTypeBuilderCreateInstance)( Target &target); typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)( Debugger &debugger); +typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( + *ScriptedFrameProviderCreateInstance)( + lldb::StackFrameListSP input_frames, + const lldb_private::SyntheticFrameProviderDescriptor &descriptor); +typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( + *SyntheticFrameProviderCreateInstance)( + lldb::StackFrameListSP input_frames, + const std::vector<lldb_private::ThreadSpec> &thread_specs); typedef SymbolFile *(*SymbolFileCreateInstance)(lldb::ObjectFileSP objfile_sp); typedef SymbolVendor *(*SymbolVendorCreateInstance)( const lldb::ModuleSP &module_sp, diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h index b82a2b8aa0574..185467e91bf62 100644 --- a/lldb/include/lldb/lldb-private-types.h +++ b/lldb/include/lldb/lldb-private-types.h @@ -102,13 +102,18 @@ struct RegisterSet { /// A type-erased pair of llvm::dwarf::SourceLanguageName and version. struct SourceLanguage { SourceLanguage() = default; - SourceLanguage(lldb::LanguageType language_type); + explicit SourceLanguage(lldb::LanguageType language_type); + SourceLanguage(uint16_t name, uint32_t version) : name(name), version(version) {} - SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers) + + explicit SourceLanguage( + std::optional<std::pair<uint16_t, uint32_t>> name_vers) : name(name_vers ? 
name_vers->first : 0),
        version(name_vers ? name_vers->second : 0) {}
-  operator bool() const { return name > 0; }
+
+  explicit operator bool() const { return name > 0; }
+
   lldb::LanguageType AsLanguageType() const;
   llvm::StringRef GetDescription() const;
   bool IsC() const;
diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py
index 96c7b3987d8a1..024c9f1c7e435 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/builder.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py
@@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info):
             "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"},
             "debug_names": {"MAKE_DEBUG_NAMES": "YES"},
             "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"},
+            "pdb": {"MAKE_PDB": "YES"},
         }

         # Collect all flags, with later options overriding earlier ones
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 454196e1b0264..23d2165e07f7e 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -647,6 +647,31 @@ def is_out_of_tree_debugserver():
     return skipTestIfFn(is_out_of_tree_debugserver)(func)


+def skipIfOutOfTreeLibunwind(func):
+    """Decorate the item to skip tests if libunwind was not built in-tree."""
+
+    def is_out_of_tree_libunwind():
+        if not configuration.llvm_tools_dir:
+            return "out-of-tree libunwind"
+
+        # llvm_tools_dir is typically <build>/bin, so lib is a sibling.
+        llvm_lib_dir = os.path.join(
+            os.path.dirname(configuration.llvm_tools_dir), "lib"
+        )
+
+        if not os.path.isdir(llvm_lib_dir):
+            return "out-of-tree libunwind"
+
+        # Check for libunwind library (any extension).
+        for filename in os.listdir(llvm_lib_dir):
+            if filename.startswith("libunwind.") or filename.startswith("unwind."):
+                return None
+
+        return "out-of-tree libunwind"
+
+    return skipTestIfFn(is_out_of_tree_libunwind)(func)
+
+
 def skipIfRemote(func):
     """Decorate the item to skip tests if testing remotely."""
     return unittest.skipIf(lldb.remote_platform, "skip on remote platform")(func)
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index b92de941c4124..8c1eea97620e2 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1791,6 +1791,11 @@ def no_reason(_):
             if can_replicate
         ]

+        # PDB is off by default, because it has a lot of failures right now.
+        # See llvm.org/pr149498
+        if original_testcase.TEST_WITH_PDB_DEBUG_INFO:
+            dbginfo_categories.append("pdb")
+
         xfail_for_debug_info_cat_fn = getattr(
             attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason
         )
@@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory):
     # test multiple times with various debug info types.
     NO_DEBUG_INFO_TESTCASE = False

+    TEST_WITH_PDB_DEBUG_INFO = False
+    """
+    Subclasses can set this to True to test with PDB in addition to the other debug info
+    types. This is off by default because many tests will fail due to missing functionality in PDB.
+    See llvm.org/pr149498.
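Returning to the SourceLanguage changes above: with the constructors and operator bool now explicit, accidental implicit conversions stop compiling while direct initialization and boolean tests keep working. An illustrative sketch (the Pick helper is hypothetical):

#include "lldb/lldb-private-types.h"

// Illustrative: what the explicit qualifiers allow and forbid.
lldb_private::SourceLanguage Pick(lldb::LanguageType type) {
  lldb_private::SourceLanguage lang(type); // direct initialization: OK
  if (lang)                                // contextual bool test: OK
    return lang;
  return lldb_private::SourceLanguage(lldb::eLanguageTypeC_plus_plus);
  // lldb_private::SourceLanguage implicit = type; // no longer compiles
}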
+ """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 09939e29e5b75..0122fe8409c29 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf. ifeq "$(OS)" "Windows_NT" @@ -290,6 +294,11 @@ ifeq "$(MAKE_DEBUG_NAMES)" "YES" CFLAGS += -gpubnames endif +# Enable GNU POSIX extensions (e.g. kill(), usleep(), getpgid(), ...) +ifeq "$(OS)" "Linux" + CFLAGS += -D_DEFAULT_SOURCE +endif + ifeq "$(USE_PRIVATE_MODULE_CACHE)" "YES" THE_CLANG_MODULE_CACHE_DIR := $(BUILDDIR)/private-module-cache else @@ -322,6 +331,17 @@ ifeq (,$(filter $(OS), Windows_NT Android Darwin)) LDFLAGS += -pthread endif endif + +# macOS forbids injecting the ASAN runtime into system processes when +# SIP is enabled. That includes the just-built libLTO that the +# just-built clang injects into the system linker. Since we don't +# test the compiler here, just use the system (non-asanified) LTO +# library to make ASAN tests work for most users, including the bots. +ifeq "$(OS)" "Darwin" +ifneq "$(ASAN_OPTIONS)" "" +LDFLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +endif +endif OBJECTS = EXE ?= a.out diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 8f3652172dfdf..ac550962cfb85 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -10,8 +10,8 @@ import subprocess import signal import sys +import threading import warnings -import 
selectors
 import time
 from typing import (
     Any,
@@ -32,6 +32,10 @@
 # timeout by a factor of 10 if ASAN is enabled.
 DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)

+# See lldbtest.Base.spawnSubprocess, which should help ensure any processes
+# created by the DAP client are terminated correctly when the test ends.
+SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen]
+

 ## DAP type references

@@ -139,6 +143,35 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")


+def read_packet(
+    f: IO[bytes], trace_file: Optional[IO[str]] = None
+) -> Optional[ProtocolMessage]:
+    """Decode a JSON packet that starts with the content length and is
+    followed by the JSON bytes from a file 'f'. Returns None on EOF.
+    """
+    line = f.readline().decode("utf-8")
+    if len(line) == 0:
+        return None  # EOF.
+
+    # Watch for line that starts with the prefix
+    prefix = "Content-Length: "
+    if line.startswith(prefix):
+        # Decode length of JSON bytes
+        length = int(line[len(prefix) :])
+        # Skip empty line (readline returns "\r\n" for the separator)
+        separator = f.readline().decode()
+        if separator.strip() != "":
+            raise Exception("malformed DAP content header, unexpected line: " + separator)
+        # Read JSON bytes
+        json_str = f.read(length).decode()
+        if trace_file:
+            trace_file.write("from adapter:\n%s\n" % (json_str))
+        # Decode the JSON bytes into a python dictionary
+        return json.loads(json_str)
+
+    raise Exception("unexpected malformed message from lldb-dap: " + line)
+
+
 def packet_type_is(packet, packet_type):
     return "type" in packet and packet["type"] == packet_type

@@ -162,19 +195,29 @@ def __init__(
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: list[str],
-        log_file: Optional[TextIO] = None,
+        init_commands: Optional[List[str]] = None,
+        log_file: Optional[str] = None,
+        spawn_helper: Optional[SpawnHelperCallback] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
         self.trace_file: Optional[TextIO] = None
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        self.selector = selectors.DefaultSelector()
-        self.selector.register(recv, selectors.EVENT_READ)
+        self.spawn_helper = spawn_helper
+
+        # Packets that have been received and processed but have not yet been
+        # requested by a test case.
+        self._pending_packets: List[Optional[ProtocolMessage]] = []
+        # Received packets that have not yet been processed.
+        self._recv_packets: List[Optional[ProtocolMessage]] = []
+        # Used as a mutex for _recv_packets and for notify when _recv_packets
+        # changes.
+        self._recv_condition = threading.Condition()
+        self._recv_thread = threading.Thread(target=self._read_packet_thread)

         # session state
-        self.init_commands = init_commands
+        self.init_commands = init_commands if init_commands else []
         self.exit_status: Optional[int] = None
         self.capabilities: Dict = {}
         self.initialized: bool = False
@@ -197,6 +240,9 @@ def __init__(
         # keyed by breakpoint id
         self.resolved_breakpoints: dict[str, Breakpoint] = {}

+        # trigger enqueue thread
+        self._recv_thread.start()
+
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
@@ -212,46 +258,17 @@ def validate_response(cls, command, response):
                 f"seq mismatch in response {command['seq']} != {response['request_seq']}"
             )

-    def _read_packet(
-        self,
-        timeout: float = DEFAULT_TIMEOUT,
-    ) -> Optional[ProtocolMessage]:
-        """Decode a JSON packet that starts with the content length and is
-        followed by the JSON bytes from self.recv. Returns None on EOF.
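The extracted read_packet helper parses standard DAP wire framing: a Content-Length header line, a blank separator, then exactly that many bytes of JSON. A rough C++ rendering of the same framing, for illustration only (std::istream stands in for the adapter pipe, and error handling is simplified):

#include <istream>
#include <optional>
#include <string>

// Illustrative: parse one "Content-Length: N\r\n\r\n<N bytes of JSON>" frame.
std::optional<std::string> ReadPacket(std::istream &in) {
  std::string header;
  if (!std::getline(in, header))
    return std::nullopt; // EOF.

  const std::string prefix = "Content-Length: ";
  if (header.compare(0, prefix.size(), prefix) != 0)
    return std::nullopt; // Malformed header.
  const size_t length = std::stoul(header.substr(prefix.size()));

  std::string separator;
  std::getline(in, separator); // Skip the empty line after the header.

  std::string json(length, '\0');
  if (!in.read(json.data(), static_cast<std::streamsize>(length)))
    return std::nullopt; // Truncated body.
  return json; // The caller decodes the JSON text.
}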
- """ - - ready = self.selector.select(timeout) - if not ready: - warnings.warn( - "timeout occurred waiting for a packet, check if the test has a" - " negative assertion and see if it can be inverted.", - stacklevel=4, - ) - return None # timeout - - line = self.recv.readline().decode("utf-8") - if len(line) == 0: - return None # EOF. - - # Watch for line that starts with the prefix - prefix = "Content-Length: " - if line.startswith(prefix): - # Decode length of JSON bytes - length = int(line[len(prefix) :]) - # Skip empty line - separator = self.recv.readline().decode() - if separator != "": - Exception("malformed DAP content header, unexpected line: " + separator) - # Read JSON bytes - json_str = self.recv.read(length).decode() - if self.trace_file: - self.trace_file.write( - "%s from adapter:\n%s\n" % (time.time(), json_str) - ) - # Decode the JSON bytes into a python dictionary - return json.loads(json_str) - - raise Exception("unexpected malformed message from lldb-dap: " + line) + def _read_packet_thread(self): + try: + while True: + packet = read_packet(self.recv, trace_file=self.trace_file) + # `packet` will be `None` on EOF. We want to pass it down to + # handle_recv_packet anyway so the main thread can handle unexpected + # termination of lldb-dap and stop waiting for new packets. + if not self._handle_recv_packet(packet): + break + finally: + dump_dap_log(self.log_file) def get_modules( self, start_module: Optional[int] = None, module_count: Optional[int] = None @@ -299,6 +316,29 @@ def collect_output( output += self.get_output(category, clear=clear) return output + def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: + """Handles an incoming packet. + + Called by the read thread that is waiting for all incoming packets + to store the incoming packet in "self._recv_packets" in a thread safe + way. This function will then signal the "self._recv_condition" to + indicate a new packet is available. + + Args: + packet: A new packet to store. + + Returns: + True if the caller should keep calling this function for more + packets. + """ + with self._recv_condition: + self._recv_packets.append(packet) + self._recv_condition.notify() + # packet is None on EOF + return packet is not None and not ( + packet["type"] == "response" and packet["command"] == "disconnect" + ) + def _recv_packet( self, *, @@ -322,34 +362,46 @@ def _recv_packet( The first matching packet for the given predicate, if specified, otherwise None. """ - deadline = time.time() + timeout - - while time.time() < deadline: - packet = self._read_packet(timeout=deadline - time.time()) - if packet is None: - return None - self._process_recv_packet(packet) - if not predicate or predicate(packet): - return packet - - def _process_recv_packet(self, packet) -> None: + assert ( + threading.current_thread != self._recv_thread + ), "Must not be called from the _recv_thread" + + def process_until_match(): + self._process_recv_packets() + for i, packet in enumerate(self._pending_packets): + if packet is None: + # We need to return a truthy value to break out of the + # wait_for, use `EOFError` as an indicator of EOF. 
+ return EOFError() + if predicate and predicate(packet): + self._pending_packets.pop(i) + return packet + + with self._recv_condition: + packet = self._recv_condition.wait_for(process_until_match, timeout) + return None if isinstance(packet, EOFError) else packet + + def _process_recv_packets(self) -> None: """Process received packets, updating the session state.""" - if packet and ("seq" not in packet or packet["seq"] == 0): - warnings.warn( - f"received a malformed packet, expected 'seq != 0' for {packet!r}" - ) - # Handle events that may modify any stateful properties of - # the DAP session. - if packet and packet["type"] == "event": - self._handle_event(packet) - elif packet and packet["type"] == "request": - # Handle reverse requests and keep processing. - self._handle_reverse_request(packet) + with self._recv_condition: + for packet in self._recv_packets: + if packet and ("seq" not in packet or packet["seq"] == 0): + warnings.warn( + f"received a malformed packet, expected 'seq != 0' for {packet!r}" + ) + # Handle events that may modify any stateful properties of + # the DAP session. + if packet and packet["type"] == "event": + self._handle_event(packet) + elif packet and packet["type"] == "request": + # Handle reverse requests and keep processing. + self._handle_reverse_request(packet) + # Move the packet to the pending queue. + self._pending_packets.append(packet) + self._recv_packets.clear() def _handle_event(self, packet: Event) -> None: """Handle any events that modify debug session state we track.""" - self.events.append(packet) - event = packet["event"] body: Optional[Dict] = packet.get("body", None) @@ -402,8 +454,6 @@ def _handle_event(self, packet: Event) -> None: self.invalidated_event = packet elif event == "memory": self.memory_event = packet - elif event == "module": - self.module_events.append(packet) def _handle_reverse_request(self, request: Request) -> None: if request in self.reverse_requests: @@ -411,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -472,14 +511,18 @@ def send_packet(self, packet: ProtocolMessage) -> int: Returns the seq number of the request. """ - packet["seq"] = self.sequence - self.sequence += 1 + # Set the seq for requests. 
+ if packet["type"] == "request": + packet["seq"] = self.sequence + self.sequence += 1 + else: + packet["seq"] = 0 # Encode our command dictionary as a JSON string json_str = json.dumps(packet, separators=(",", ":")) if self.trace_file: - self.trace_file.write("%s to adapter:\n%s\n" % (time.time(), json_str)) + self.trace_file.write("to adapter:\n%s\n" % (json_str)) length = len(json_str) if length > 0: @@ -860,8 +903,6 @@ def request_restart(self, restartArguments=None): if restartArguments: command_dict["arguments"] = restartArguments - # Clear state, the process is about to restart... - self._process_continued(True) response = self._send_recv(command_dict) # Caller must still call wait_for_stopped. return response @@ -1428,10 +1469,8 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - self.recv.close() - self.selector.close() - if self.log_file: - dump_dap_log(self.log_file) + if self._recv_thread.is_alive(): + self._recv_thread.join() def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1452,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1483,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + log_file, + spawn_helper, ) @classmethod @@ -1497,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1512,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") @@ -1528,7 +1578,6 @@ def launch( stdout=subprocess.PIPE, stderr=sys.stderr, env=adapter_env, - bufsize=0, ) if connection is None: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 405e91fc2dc36..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ 
b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -15,8 +15,6 @@
 # DAP tests as a whole have been flakey on the Windows on Arm bot. See:
 # https://github.com/llvm/llvm-project/issues/137660
 @skipIf(oslist=["windows"], archs=["aarch64"])
-# The Arm Linux bot needs stable resources before it can run these tests reliably.
-@skipIf(oslist=["linux"], archs=["arm$"])
 class DAPTestCaseBase(TestBase):
     # set timeout based on whether ASAN was enabled or not. Increase
     # timeout by a factor of 10 if ASAN is enabled.
@@ -41,6 +39,7 @@ def create_debug_adapter(
             log_file=log_file_path,
             env=lldbDAPEnv,
             additional_args=additional_args or [],
+            spawn_helper=self.spawnSubprocess,
         )

     def build_and_create_debug_adapter(
@@ -225,6 +224,16 @@ def verify_stop_exception_info(self, expected_description):
                 return True
         return False

+    def verify_stop_on_entry(self) -> None:
+        """Waits for the process to be stopped and then verifies at least one
+        thread has the stop reason 'entry'."""
+        self.dap_server.wait_for_stopped()
+        self.assertIn(
+            "entry",
+            (t["reason"] for t in self.dap_server.thread_stop_reasons.values()),
+            f"Expected at least one thread to report stop reason 'entry' in {self.dap_server.thread_stop_reasons}",
+        )
+
     def verify_commands(self, flavor: str, output: str, commands: list[str]):
         self.assertTrue(output and len(output) > 0, "expect console output")
         lines = output.splitlines()
@@ -418,7 +427,7 @@ def continue_to_next_stop(self):
         return self.dap_server.wait_for_stopped()

     def continue_to_breakpoint(self, breakpoint_id: str):
-        self.continue_to_breakpoints([breakpoint_id])
+        self.continue_to_breakpoints((breakpoint_id,))

     def continue_to_breakpoints(self, breakpoint_ids):
         self.do_continue()
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index ce59ee505cd3d..ac47580d60840 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -69,6 +69,7 @@ add_lldb_library(liblldb SHARED ${option_framework}
   SBFileSpecList.cpp
   SBFormat.cpp
   SBFrame.cpp
+  SBFrameList.cpp
   SBFunction.cpp
   SBHostOS.cpp
   SBInstruction.cpp
diff --git a/lldb/source/API/SBFrameList.cpp b/lldb/source/API/SBFrameList.cpp
new file mode 100644
index 0000000000000..d5fa955c10f70
--- /dev/null
+++ b/lldb/source/API/SBFrameList.cpp
@@ -0,0 +1,97 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception.
+// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBFrameList.h" +#include "lldb/API/SBFrame.h" +#include "lldb/API/SBStream.h" +#include "lldb/API/SBThread.h" +#include "lldb/Target/StackFrameList.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/Instrumentation.h" + +using namespace lldb; +using namespace lldb_private; + +SBFrameList::SBFrameList() : m_opaque_sp() { LLDB_INSTRUMENT_VA(this); } + +SBFrameList::SBFrameList(const SBFrameList &rhs) + : m_opaque_sp(rhs.m_opaque_sp) { + LLDB_INSTRUMENT_VA(this, rhs); +} + +SBFrameList::~SBFrameList() = default; + +const SBFrameList &SBFrameList::operator=(const SBFrameList &rhs) { + LLDB_INSTRUMENT_VA(this, rhs); + + if (this != &rhs) + m_opaque_sp = rhs.m_opaque_sp; + return *this; +} + +SBFrameList::SBFrameList(const lldb::StackFrameListSP &frame_list_sp) + : m_opaque_sp(frame_list_sp) {} + +void SBFrameList::SetFrameList(const lldb::StackFrameListSP &frame_list_sp) { + m_opaque_sp = frame_list_sp; +} + +SBFrameList::operator bool() const { + LLDB_INSTRUMENT_VA(this); + + return m_opaque_sp.get() != nullptr; +} + +bool SBFrameList::IsValid() const { + LLDB_INSTRUMENT_VA(this); + return this->operator bool(); +} + +uint32_t SBFrameList::GetSize() const { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + return m_opaque_sp->GetNumFrames(); + return 0; +} + +SBFrame SBFrameList::GetFrameAtIndex(uint32_t idx) const { + LLDB_INSTRUMENT_VA(this, idx); + + SBFrame sb_frame; + if (m_opaque_sp) + sb_frame.SetFrameSP(m_opaque_sp->GetFrameAtIndex(idx)); + return sb_frame; +} + +SBThread SBFrameList::GetThread() const { + LLDB_INSTRUMENT_VA(this); + + SBThread sb_thread; + if (m_opaque_sp) + sb_thread.SetThread(m_opaque_sp->GetThread().shared_from_this()); + return sb_thread; +} + +void SBFrameList::Clear() { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + m_opaque_sp->Clear(); +} + +bool SBFrameList::GetDescription(SBStream &description) const { + LLDB_INSTRUMENT_VA(this, description); + + if (!m_opaque_sp) + return false; + + Stream &strm = description.ref(); + m_opaque_sp->Dump(&strm); + return true; +} diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp index 5a57f45f0d475..32067ac1c650f 100644 --- a/lldb/source/API/SBModule.cpp +++ b/lldb/source/API/SBModule.cpp @@ -37,8 +37,8 @@ SBModule::SBModule(const SBModuleSpec &module_spec) { LLDB_INSTRUMENT_VA(this, module_spec); ModuleSP module_sp; - Status error = ModuleList::GetSharedModule( - *module_spec.m_opaque_up, module_sp, nullptr, nullptr, nullptr); + Status error = ModuleList::GetSharedModule(*module_spec.m_opaque_up, + module_sp, nullptr, nullptr); if (module_sp) SetSP(module_sp); } diff --git a/lldb/source/API/SBModuleSpec.cpp b/lldb/source/API/SBModuleSpec.cpp index fbbcfeac20178..031ba1256d18a 100644 --- a/lldb/source/API/SBModuleSpec.cpp +++ b/lldb/source/API/SBModuleSpec.cpp @@ -9,6 +9,7 @@ #include "lldb/API/SBModuleSpec.h" #include "Utils.h" #include "lldb/API/SBStream.h" +#include "lldb/API/SBTarget.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Host/Host.h" @@ -174,6 +175,18 @@ void SBModuleSpec::SetObjectSize(uint64_t object_size) { m_opaque_up->SetObjectSize(object_size); } +SBTarget SBModuleSpec::GetTarget() { + LLDB_INSTRUMENT_VA(this); + + return SBTarget(m_opaque_up->GetTargetSP()); +} + +void SBModuleSpec::SetTarget(SBTarget target) { + LLDB_INSTRUMENT_VA(this, target); + + m_opaque_up->SetTarget(target.GetSP()); +} + 
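As a usage sketch for the new SBFrameList, here is how a client might walk a thread's frames through the SB API (using the SBThread::GetFrames accessor added later in this patch; PrintFrames is a hypothetical helper):

#include <cstdio>
#include "lldb/API/LLDB.h"

// Illustrative: iterate a thread's frames via the new SBFrameList.
void PrintFrames(lldb::SBThread thread) {
  lldb::SBFrameList frames = thread.GetFrames();
  if (!frames.IsValid())
    return;
  for (uint32_t i = 0; i < frames.GetSize(); ++i) {
    lldb::SBFrame frame = frames.GetFrameAtIndex(i);
    if (const char *name = frame.GetFunctionName())
      printf("frame #%u: %s\n", i, name);
  }
}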
SBModuleSpecList::SBModuleSpecList() : m_opaque_up(new ModuleSpecList()) { LLDB_INSTRUMENT_VA(this); } diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e4aa48bc9a2e..f32c5c56444cd 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -14,6 +14,7 @@ #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBFrameList.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBStructuredData.h" @@ -239,11 +240,34 @@ SBThread::GetStopReasonExtendedBacktraces(InstrumentationRuntimeType type) { return threads; } -size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { - LLDB_INSTRUMENT_VA(this, dst, dst_len); +bool SBThread::GetStopDescription(lldb::SBStream &stream) const { + LLDB_INSTRUMENT_VA(this, stream); - if (dst) - *dst = 0; + if (!m_opaque_sp) + return false; + + llvm::Expected<StoppedExecutionContext> exe_ctx = + GetStoppedExecutionContext(m_opaque_sp); + if (!exe_ctx) { + LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}"); + return false; + } + + if (!exe_ctx->HasThreadScope()) + return false; + + Stream &strm = stream.ref(); + const std::string stop_desc = exe_ctx->GetThreadPtr()->GetStopDescription(); + strm.PutCString(stop_desc); + + return true; +} + +size_t SBThread::GetStopDescription(char *dst_or_null, size_t dst_len) { + LLDB_INSTRUMENT_VA(this, dst_or_null, dst_len); + + if (dst_or_null) + *dst_or_null = 0; llvm::Expected<StoppedExecutionContext> exe_ctx = GetStoppedExecutionContext(m_opaque_sp); @@ -259,8 +283,8 @@ size_t SBThread::GetStopDescription(char *dst, size_t dst_len) { if (thread_stop_desc.empty()) return 0; - if (dst) - return ::snprintf(dst, dst_len, "%s", thread_stop_desc.c_str()) + 1; + if (dst_or_null) + return ::snprintf(dst_or_null, dst_len, "%s", thread_stop_desc.c_str()) + 1; // NULL dst passed in, return the length needed to contain the // description. 
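The new SBStream overload sidesteps the size-then-fill dance of the char buffer API shown above. A usage sketch (PrintStopDescription is a hypothetical helper):

#include <cstdio>
#include "lldb/API/LLDB.h"

// Illustrative: fetch the stop description in a single call.
void PrintStopDescription(lldb::SBThread thread) {
  lldb::SBStream description;
  if (thread.GetStopDescription(description))
    printf("stop reason: %s\n", description.GetData());
}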
@@ -1079,6 +1103,26 @@ SBFrame SBThread::GetFrameAtIndex(uint32_t idx) { return sb_frame; } +lldb::SBFrameList SBThread::GetFrames() { + LLDB_INSTRUMENT_VA(this); + + SBFrameList sb_frame_list; + llvm::Expected<StoppedExecutionContext> exe_ctx = + GetStoppedExecutionContext(m_opaque_sp); + if (!exe_ctx) { + LLDB_LOG_ERROR(GetLog(LLDBLog::API), exe_ctx.takeError(), "{0}"); + return SBFrameList(); + } + + if (exe_ctx->HasThreadScope()) { + StackFrameListSP frame_list_sp = + exe_ctx->GetThreadPtr()->GetStackFrameList(); + sb_frame_list.SetFrameList(frame_list_sp); + } + + return sb_frame_list; +} + lldb::SBFrame SBThread::GetSelectedFrame() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index b23d1143d60c4..201d8d20c4901 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -1098,14 +1098,9 @@ bool Breakpoint::EvaluatePrecondition(StoppointCallbackContext &context) { } void Breakpoint::SendBreakpointChangedEvent( - lldb::BreakpointEventType eventKind) { - if (!IsInternal() && GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { - std::shared_ptr<BreakpointEventData> data = - std::make_shared<BreakpointEventData>(eventKind, shared_from_this()); - - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data); - } + lldb::BreakpointEventType event_kind) { + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, event_kind); } void Breakpoint::SendBreakpointChangedEvent( @@ -1113,10 +1108,8 @@ void Breakpoint::SendBreakpointChangedEvent( if (!breakpoint_data_sp) return; - if (!IsInternal() && - GetTarget().EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) - GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - breakpoint_data_sp); + if (!IsInternal()) + GetTarget().NotifyBreakpointChanged(*this, breakpoint_data_sp); } const char *Breakpoint::BreakpointEventTypeAsCString(BreakpointEventType type) { diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp index 779490ae0316a..e3dd62bfa329d 100644 --- a/lldb/source/Breakpoint/BreakpointList.cpp +++ b/lldb/source/Breakpoint/BreakpointList.cpp @@ -16,13 +16,7 @@ using namespace lldb; using namespace lldb_private; static void NotifyChange(const BreakpointSP &bp, BreakpointEventType event) { - Target &target = bp->GetTarget(); - if (target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { - auto event_data_sp = - std::make_shared<Breakpoint::BreakpointEventData>(event, bp); - target.BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - event_data_sp); - } + bp->GetTarget().NotifyBreakpointChanged(*bp, event); } BreakpointList::BreakpointList(bool is_internal) diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 22c98acda8c59..25285beb7ffd5 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -251,7 +251,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, } m_user_expression_sp.reset(GetTarget().GetUserExpressionForLanguage( - condition.GetText(), llvm::StringRef(), language, + condition.GetText(), llvm::StringRef(), SourceLanguage{language}, Expression::eResultTypeAny, EvaluateExpressionOptions(), nullptr, error)); if (error.Fail()) { @@ -749,13 +749,11 @@ void BreakpointLocation::Dump(Stream *s) const { void BreakpointLocation::SendBreakpointLocationChangedEvent( 
lldb::BreakpointEventType eventKind) { - if (!m_owner.IsInternal() && m_owner.GetTarget().EventTypeHasListeners( - Target::eBroadcastBitBreakpointChanged)) { + if (!m_owner.IsInternal()) { auto data_sp = std::make_shared<Breakpoint::BreakpointEventData>( eventKind, m_owner.shared_from_this()); data_sp->GetBreakpointLocationCollection().Add(shared_from_this()); - m_owner.GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - data_sp); + m_owner.GetTarget().NotifyBreakpointChanged(m_owner, data_sp); } } diff --git a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp index 97715836ec104..adff4299a5289 100644 --- a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp +++ b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp @@ -24,7 +24,7 @@ BreakpointLocationCollection::BreakpointLocationCollection(bool preserving) BreakpointLocationCollection::~BreakpointLocationCollection() = default; void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP old_bp_loc = FindByIDPair(bp_loc->GetBreakpoint().GetID(), bp_loc->GetID()); if (!old_bp_loc.get()) { @@ -44,7 +44,7 @@ void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { bool BreakpointLocationCollection::Remove(lldb::break_id_t bp_id, lldb::break_id_t bp_loc_id) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos = GetIDPairIterator(bp_id, bp_loc_id); // Predicate if (pos != m_break_loc_collection.end()) { if (m_preserving_bkpts) { @@ -117,7 +117,7 @@ const BreakpointLocationSP BreakpointLocationCollection::FindByIDPair( } BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -127,7 +127,7 @@ BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { const BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) const { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -168,7 +168,7 @@ bool BreakpointLocationCollection::ShouldStop( } bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -180,7 +180,7 @@ bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { } bool BreakpointLocationCollection::IsInternal() const { - std::lock_guard<std::mutex> guard(m_collection_mutex); + std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::const_iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -197,7 +197,7 @@ bool BreakpointLocationCollection::IsInternal() const { void BreakpointLocationCollection::GetDescription( Stream *s, lldb::DescriptionLevel level) { - std::lock_guard<std::mutex> guard(m_collection_mutex); + 
std::lock_guard<std::recursive_mutex> guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -212,8 +212,10 @@ BreakpointLocationCollection &BreakpointLocationCollection::operator=( const BreakpointLocationCollection &rhs) { if (this != &rhs) { std::lock(m_collection_mutex, rhs.m_collection_mutex); - std::lock_guard<std::mutex> lhs_guard(m_collection_mutex, std::adopt_lock); - std::lock_guard<std::mutex> rhs_guard(rhs.m_collection_mutex, std::adopt_lock); + std::lock_guard<std::recursive_mutex> lhs_guard(m_collection_mutex, + std::adopt_lock); + std::lock_guard<std::recursive_mutex> rhs_guard(rhs.m_collection_mutex, + std::adopt_lock); m_break_loc_collection = rhs.m_break_loc_collection; } return *this; diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index 0d9eb45732161..40f00c90bbbfb 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -95,9 +95,9 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, StackFrame *frame = m_exe_ctx.GetFramePtr(); // Either the language was explicitly specified, or we check the frame. - lldb::LanguageType language = m_expr_options.language; - if (language == lldb::eLanguageTypeUnknown && frame) - language = frame->GuessLanguage().AsLanguageType(); + SourceLanguage language{m_expr_options.language}; + if (!language && frame) + language = frame->GuessLanguage(); // Add a hint if object description was requested, but no description // function was implemented. @@ -119,8 +119,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, "^<\\S+: 0x[[:xdigit:]]{5,}>\\s*$"); if (GetDebugger().GetShowDontUsePoHint() && target_ptr && - (language == lldb::eLanguageTypeSwift || - language == lldb::eLanguageTypeObjC) && + (language.AsLanguageType() == lldb::eLanguageTypeSwift || + language.IsObjC()) && std::regex_match(output.data(), swift_class_regex)) { result.AppendNote( @@ -193,7 +193,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, // Second, try `expr` as a persistent variable. if (expr.starts_with("$")) - if (auto *state = target.GetPersistentExpressionStateForLanguage(language)) + if (auto *state = target.GetPersistentExpressionStateForLanguage( + language.AsLanguageType())) if (auto var_sp = state->GetVariable(expr)) if (auto valobj_sp = var_sp->GetValueObject()) { dump_val_object(*valobj_sp); diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame. 
+        break;
+      }
+      candidate_idx = UINT32_MAX;
+      break;
+    }
+    if (candidate_idx != UINT32_MAX)
+      m_options.relative_frame_offset = candidate_idx - frame_idx;
+  }
+
 protected:
   void DoExecute(Args &command, CommandReturnObject &result) override {
     // No need to check "thread" for validity as eCommandRequiresThread ensures
@@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed {
       if (frame_idx == UINT32_MAX)
         frame_idx = 0;

-      // If moving up/down by one, skip over hidden frames.
-      if (*m_options.relative_frame_offset == 1 ||
-          *m_options.relative_frame_offset == -1) {
-        uint32_t candidate_idx = frame_idx;
-        const unsigned max_depth = 12;
-        for (unsigned num_try = 0; num_try < max_depth; ++num_try) {
-          if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) {
-            candidate_idx = UINT32_MAX;
-            break;
-          }
-          candidate_idx += *m_options.relative_frame_offset;
-          if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) {
-            if (candidate_sp->IsHidden())
-              continue;
-            // Now candidate_idx is the first non-hidden frame.
-            break;
-          }
-          candidate_idx = UINT32_MAX;
-          break;
-        };
-        if (candidate_idx != UINT32_MAX)
-          m_options.relative_frame_offset = candidate_idx - frame_idx;
+      // If moving up/down by one, skip over hidden frames, unless we started
+      // in a hidden frame.
+      if ((*m_options.relative_frame_offset == 1 ||
+           *m_options.relative_frame_offset == -1)) {
+        if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx);
+            current_frame_sp && !current_frame_sp->IsHidden())
+          SkipHiddenFrames(*thread, frame_idx);
       }

       if (*m_options.relative_frame_offset < 0) {
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 8de6521e65b25..30bca639060e6 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -5121,6 +5121,15 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed {
       : CommandObjectParsed(interpreter, "target stop-hook delete",
                             "Delete a stop-hook.",
                             "target stop-hook delete [<idx>]") {
+    SetHelpLong(
+        R"(
+Deletes the stop hook by index.
+
+At any given stop, all enabled stop hooks that pass the stop filter will
+get a chance to run. That means if one stop-hook deletes another stop hook
+while executing, the deleted stop hook will still fire for the stop at which
+it was deleted.
+ )"); AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar); } diff --git a/lldb/source/Core/DemangledNameInfo.cpp b/lldb/source/Core/DemangledNameInfo.cpp index 76f8987c5149c..16fbfda299b21 100644 --- a/lldb/source/Core/DemangledNameInfo.cpp +++ b/lldb/source/Core/DemangledNameInfo.cpp @@ -16,7 +16,7 @@ bool TrackingOutputBuffer::shouldTrack() const { if (!isPrintingTopLevelFunctionType()) return false; - if (isGtInsideTemplateArgs()) + if (isInsideTemplateArgs()) return false; if (NameInfo.ArgumentsRange.first > 0) @@ -29,7 +29,7 @@ bool TrackingOutputBuffer::canFinalize() const { if (!isPrintingTopLevelFunctionType()) return false; - if (isGtInsideTemplateArgs()) + if (isInsideTemplateArgs()) return false; if (NameInfo.ArgumentsRange.first == 0) diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7580b15c02ce1..b309e0f0a72fd 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -227,6 +227,7 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( } } ModuleSpec module_spec; + module_spec.SetTarget(target.shared_from_this()); module_spec.GetUUID() = uuid; FileSpec name_filespec(name); if (FileSystem::Instance().Exists(name_filespec)) @@ -238,8 +239,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( // Has lldb already seen a module with this UUID? // Or have external lookup enabled in DebugSymbols on macOS. if (!module_sp) - error = ModuleList::GetSharedModule(module_spec, module_sp, nullptr, - nullptr, nullptr); + error = + ModuleList::GetSharedModule(module_spec, module_sp, nullptr, nullptr); // Can lldb's symbol/executable location schemes // find an executable and symbol file. diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index c40612c1ced5e..d9f845681e701 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -19,6 +19,8 @@ #include "lldb/Symbol/SymbolContext.h" #include "lldb/Symbol/TypeList.h" #include "lldb/Symbol/VariableList.h" +#include "lldb/Target/Platform.h" +#include "lldb/Target/Target.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/FileSpecList.h" @@ -1038,9 +1040,9 @@ size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) { Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, - bool *did_create_ptr, bool always_create) { + bool *did_create_ptr, bool always_create, + bool invoke_locate_callback) { SharedModuleList &shared_module_list = GetSharedModuleList(); std::lock_guard<std::recursive_mutex> guard(shared_module_list.GetMutex()); char path[PATH_MAX]; @@ -1095,6 +1097,22 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, if (module_sp) return error; + // Try target's platform locate module callback before second attempt. + if (invoke_locate_callback) { + TargetSP target_sp = module_spec.GetTargetSP(); + if (target_sp && target_sp->IsValid()) { + if (PlatformSP platform_sp = target_sp->GetPlatform()) { + FileSpec symbol_file_spec; + platform_sp->CallLocateModuleCallbackIfSet( + module_spec, module_sp, symbol_file_spec, did_create_ptr); + if (module_sp) { + // The callback found a module. 
+ return error; + } + } + } + } + module_sp = std::make_shared<Module>(module_spec); // Make sure there are a module and an object file since we can specify a // valid file path with an architecture that might not be in that file. By @@ -1122,10 +1140,16 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, module_sp.reset(); } - if (module_search_paths_ptr) { - const auto num_directories = module_search_paths_ptr->GetSize(); + // Get module search paths from the target if available. + lldb::TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + + if (!module_search_paths.IsEmpty()) { + const auto num_directories = module_search_paths.GetSize(); for (size_t idx = 0; idx < num_directories; ++idx) { - auto search_path_spec = module_search_paths_ptr->GetFileSpecAtIndex(idx); + auto search_path_spec = module_search_paths.GetFileSpecAtIndex(idx); FileSystem::Instance().Resolve(search_path_spec); namespace fs = llvm::sys::fs; if (!FileSystem::Instance().IsDirectory(search_path_spec)) diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 588736715f817..4e3563cf419fe 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -1300,6 +1300,61 @@ PluginManager::GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang, return none_instance(debugger); } +#pragma mark SyntheticFrameProvider + +typedef PluginInstance<SyntheticFrameProviderCreateInstance> + SyntheticFrameProviderInstance; +typedef PluginInstance<ScriptedFrameProviderCreateInstance> + ScriptedFrameProviderInstance; +typedef PluginInstances<SyntheticFrameProviderInstance> + SyntheticFrameProviderInstances; +typedef PluginInstances<ScriptedFrameProviderInstance> + ScriptedFrameProviderInstances; + +static SyntheticFrameProviderInstances &GetSyntheticFrameProviderInstances() { + static SyntheticFrameProviderInstances g_instances; + return g_instances; +} + +static ScriptedFrameProviderInstances &GetScriptedFrameProviderInstances() { + static ScriptedFrameProviderInstances g_instances; + return g_instances; +} + +bool PluginManager::RegisterPlugin( + llvm::StringRef name, llvm::StringRef description, + SyntheticFrameProviderCreateInstance create_native_callback, + ScriptedFrameProviderCreateInstance create_scripted_callback) { + if (create_native_callback) + return GetSyntheticFrameProviderInstances().RegisterPlugin( + name, description, create_native_callback); + else if (create_scripted_callback) + return GetScriptedFrameProviderInstances().RegisterPlugin( + name, description, create_scripted_callback); + return false; +} + +bool PluginManager::UnregisterPlugin( + SyntheticFrameProviderCreateInstance create_callback) { + return GetSyntheticFrameProviderInstances().UnregisterPlugin(create_callback); +} + +bool PluginManager::UnregisterPlugin( + ScriptedFrameProviderCreateInstance create_callback) { + return GetScriptedFrameProviderInstances().UnregisterPlugin(create_callback); +} + +SyntheticFrameProviderCreateInstance +PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName( + llvm::StringRef name) { + return GetSyntheticFrameProviderInstances().GetCallbackForName(name); +} + +ScriptedFrameProviderCreateInstance +PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex(uint32_t idx) { + return GetScriptedFrameProviderInstances().GetCallbackAtIndex(idx); +} + #pragma mark StructuredDataPlugin struct 
StructuredDataPluginInstance diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 02d9d86fe5374..f16035b5649e1 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -471,8 +471,14 @@ bool Section::ContainsOnlyDebugInfo() const { return false; } +bool Section::IsGOTSection() const { + return GetObjectFile()->IsGOTSection(*this); +} + #pragma mark SectionList +SectionList::SectionList(const SectionList &rhs) : m_sections(rhs.m_sections) {} + SectionList &SectionList::operator=(const SectionList &rhs) { if (this != &rhs) m_sections = rhs.m_sections; @@ -683,6 +689,33 @@ uint64_t SectionList::GetDebugInfoSize() const { return debug_info_size; } +SectionList SectionList::Merge(SectionList &lhs, SectionList &rhs, + MergeCallback filter) { + SectionList output_list; + + // Iterate through all the sections in lhs and see if we have matches in + // the rhs list. + for (const auto &lhs_section : lhs) { + auto rhs_section = rhs.FindSectionByName(lhs_section->GetName()); + if (rhs_section) + output_list.AddSection(filter(lhs_section, rhs_section)); + else + output_list.AddSection(lhs_section); + } + + // Now that we've visited all possible duplicates, we can iterate over + // the rhs and take any values not in lhs. + for (const auto &rhs_section : rhs) { + auto lhs_section = lhs.FindSectionByName(rhs_section->GetName()); + // Because we already visited everything overlapping between rhs + // and lhs, any section not in lhs is unique and can be output. + if (!lhs_section) + output_list.AddSection(rhs_section); + } + + return output_list; +} + namespace llvm { namespace json { diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index f786866a18137..097173ffe678e 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -34,6 +34,7 @@ #include "llvm/ADT/Twine.h" +#include <future> #include <memory> #include <optional> #include <utility> @@ -54,8 +55,7 @@ using namespace lldb_private; static inline bool is_newline_char(char ch) { return ch == '\n' || ch == '\r'; } static void resolve_tilde(FileSpec &file_spec) { - if (!FileSystem::Instance().Exists(file_spec) && - file_spec.GetDirectory() && + if (!FileSystem::Instance().Exists(file_spec) && file_spec.GetDirectory() && file_spec.GetDirectory().GetCString()[0] == '~') { FileSystem::Instance().Resolve(file_spec); } @@ -477,6 +477,28 @@ SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, TargetSP target_sp) { + // It might take a while to read a source file, for example because it's + // coming from a virtual file system that's fetching the data on demand. When + // reading the data exceeds a certain threshold, show a progress event to let + // the user know what's going on. + static constexpr auto g_progress_delay = std::chrono::milliseconds(500); + + std::future<void> future = std::async(std::launch::async, [=]() { + CommonInitializerImpl(support_file_sp, target_sp); + }); + + std::optional<Progress> progress; + if (future.wait_for(g_progress_delay) == std::future_status::timeout) { + Debugger *debugger = target_sp ? 
&target_sp->GetDebugger() : nullptr;
+    progress.emplace("Loading source file",
+                     support_file_sp->GetSpecOnly().GetFilename().GetString(),
+                     1, debugger);
+  }
+  future.wait();
+}
+
+void SourceManager::File::CommonInitializerImpl(SupportFileSP support_file_sp,
+                                                TargetSP target_sp) {
   // Set the file and update the modification time.
   SetSupportFile(support_file_sp);
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index af4b477660eeb..5563eba21777e 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -246,7 +246,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   // language in the target's properties if specified, else default to the
   // language for the frame.
   if (!language) {
-    if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
+    if (target->GetLanguage())
       language = target->GetLanguage();
     else if (StackFrame *frame = exe_ctx.GetFramePtr())
       language = frame->GetLanguage();
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index 1b1922e710764..e2995b37429fd 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1626,6 +1626,9 @@ bool Editline::GetLine(std::string &line, bool &interrupted) {
   m_editor_status = EditorStatus::Editing;
   m_revert_cursor_index = -1;

+  lldbassert(m_output_stream_sp);
+  fprintf(m_locked_output->GetFile().GetStream(), "\r" ANSI_CLEAR_RIGHT);
+
   int count;
   auto input = el_wgets(m_editline, &count);

diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index e8bf04e308447..b5831f013ba62 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -149,11 +149,11 @@ ConnectionFileDescriptor::Connect(llvm::StringRef path,
       llvm::StringSwitch<ConnectionStatus (ConnectionFileDescriptor::*)(
           llvm::StringRef, socket_id_callback_type, Status *)>(scheme)
           .Case("listen", &ConnectionFileDescriptor::AcceptTCP)
-          .Cases("accept", "unix-accept",
+          .Cases({"accept", "unix-accept"},
                  &ConnectionFileDescriptor::AcceptNamedSocket)
           .Case("unix-abstract-accept",
                 &ConnectionFileDescriptor::AcceptAbstractSocket)
-          .Cases("connect", "tcp-connect",
+          .Cases({"connect", "tcp-connect"},
                  &ConnectionFileDescriptor::ConnectTCP)
           .Case("udp", &ConnectionFileDescriptor::ConnectUDP)
           .Case("unix-connect", &ConnectionFileDescriptor::ConnectNamedSocket)
diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp
index ca768db1199c1..211868b51facb 100644
--- a/lldb/source/Interpreter/ScriptInterpreter.cpp
+++ b/lldb/source/Interpreter/ScriptInterpreter.cpp
@@ -150,6 +150,11 @@ ScriptInterpreter::GetOpaqueTypeFromSBExecutionContext(
   return exe_ctx.m_exe_ctx_sp;
 }

+lldb::StackFrameListSP ScriptInterpreter::GetOpaqueTypeFromSBFrameList(
+    const lldb::SBFrameList &frame_list) const {
+  return frame_list.m_opaque_sp;
+}
+
 lldb::ScriptLanguage
 ScriptInterpreter::StringToLanguage(const llvm::StringRef &language) {
   if (language.equals_insensitive(LanguageToString(eScriptLanguageNone)))
diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
index e40d2c5fc121a..8bfb4327a5f73 100644
--- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
+++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp
@@ -86,9 +86,9 @@ std::string ABIAArch64::GetMCName(std::string reg) {
 uint32_t
ABIAArch64::GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("lr", "x30", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "x31", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "x29", LLDB_REGNUM_GENERIC_FP) + .Cases({"lr", "x30"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "x31"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "x29"}, LLDB_REGNUM_GENERIC_FP) .Case("cpsr", LLDB_REGNUM_GENERIC_FLAGS) .Case("x0", LLDB_REGNUM_GENERIC_ARG1) .Case("x1", LLDB_REGNUM_GENERIC_ARG2) diff --git a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp index f9c249d7fec1c..e41a28bd21c36 100644 --- a/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp +++ b/lldb/source/Plugins/ABI/ARC/ABISysV_arc.cpp @@ -480,11 +480,10 @@ ABISysV_arc::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - 1 == float_count && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !compiler_type.IsVectorType() && !is_complex) { const size_t byte_size = llvm::expectedToOptional(compiler_type.GetByteSize(&thread)) .value_or(0); diff --git a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp index 5b5f6facc924c..8e690218843fa 100644 --- a/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABIMacOSX_arm.cpp @@ -1695,7 +1695,6 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1767,7 +1766,7 @@ Status ABIMacOSX_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp index bb0c4ba3f1b57..7258f5cc9acb5 100644 --- a/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp +++ b/lldb/source/Plugins/ABI/ARM/ABISysV_arm.cpp @@ -1550,7 +1550,6 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( bool is_signed; bool is_complex; - uint32_t float_count; bool is_vfp_candidate = false; uint8_t vfp_count = 0; uint8_t vfp_byte_size = 0; @@ -1634,8 +1633,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( if (!GetReturnValuePassedInMemory(thread, reg_ctx, *byte_size, value)) return return_valobj_sp; } - } else if (compiler_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. + if (!is_complex) { switch (*bit_width) { default: return return_valobj_sp; @@ -1681,7 +1681,7 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( break; } } - } else if (is_complex && float_count == 2) { + } else if (is_complex) { if (IsArmHardFloat(thread)) { is_vfp_candidate = true; vfp_byte_size = *byte_size / 2; @@ -1709,8 +1709,9 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( vfp_count = (*base_byte_size == 8 ? 
homogeneous_count : homogeneous_count * 2); } - } else if (base_type.IsFloatingPointType(float_count, is_complex)) { - if (float_count == 1 && !is_complex) { + } else if (base_type.IsFloatingPointType(is_complex)) { + // Vector types are handled above. + if (!is_complex) { is_vfp_candidate = true; if (base_byte_size) vfp_byte_size = *base_byte_size; @@ -1727,10 +1728,10 @@ ValueObjectSP ABISysV_arm::GetReturnValueObjectImpl( base_type = compiler_type.GetFieldAtIndex(index, name, nullptr, nullptr, nullptr); - if (base_type.IsFloatingPointType(float_count, is_complex)) { + if (base_type.IsFloatingPointType(is_complex)) { std::optional<uint64_t> base_byte_size = llvm::expectedToOptional(base_type.GetByteSize(&thread)); - if (float_count == 2 && is_complex) { + if (is_complex) { if (index != 0 && base_byte_size && vfp_byte_size != *base_byte_size) break; @@ -1841,7 +1842,6 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -1884,7 +1884,7 @@ Status ABISysV_arm::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp index 7bf99ce7bddee..91b965d3b5715 100644 --- a/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp +++ b/lldb/source/Plugins/ABI/LoongArch/ABISysV_loongarch.cpp @@ -510,11 +510,10 @@ ValueObjectSP ABISysV_loongarch::GetReturnValueObjectSimple( value, ConstString("")); } if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { return_valobj_sp = GetValObjFromFPRegs(thread, reg_ctx, machine, type_flags, byte_size); return return_valobj_sp; @@ -623,17 +622,17 @@ void ABISysV_loongarch::Terminate() { static uint32_t GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("ra", "r1", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "r3", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "r22", LLDB_REGNUM_GENERIC_FP) - .Cases("a0", "r4", LLDB_REGNUM_GENERIC_ARG1) - .Cases("a1", "r5", LLDB_REGNUM_GENERIC_ARG2) - .Cases("a2", "r6", LLDB_REGNUM_GENERIC_ARG3) - .Cases("a3", "r7", LLDB_REGNUM_GENERIC_ARG4) - .Cases("a4", "r8", LLDB_REGNUM_GENERIC_ARG5) - .Cases("a5", "r9", LLDB_REGNUM_GENERIC_ARG6) - .Cases("a6", "r10", LLDB_REGNUM_GENERIC_ARG7) - .Cases("a7", "r11", LLDB_REGNUM_GENERIC_ARG8) + .Cases({"ra", "r1"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "r3"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "r22"}, LLDB_REGNUM_GENERIC_FP) + .Cases({"a0", "r4"}, LLDB_REGNUM_GENERIC_ARG1) + .Cases({"a1", "r5"}, LLDB_REGNUM_GENERIC_ARG2) + .Cases({"a2", "r6"}, LLDB_REGNUM_GENERIC_ARG3) + .Cases({"a3", "r7"}, LLDB_REGNUM_GENERIC_ARG4) + .Cases({"a4", "r8"}, LLDB_REGNUM_GENERIC_ARG5) + .Cases({"a5", "r9"}, LLDB_REGNUM_GENERIC_ARG6) + .Cases({"a6", "r10"}, LLDB_REGNUM_GENERIC_ARG7) + .Cases({"a7", 
"r11"}, LLDB_REGNUM_GENERIC_ARG8) .Default(LLDB_INVALID_REGNUM); } diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp index dd91a05534e37..e03604467ceec 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips.cpp @@ -708,7 +708,6 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -750,7 +749,7 @@ Status ABISysV_mips::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -797,7 +796,6 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( bool is_signed = false; bool is_complex = false; - uint32_t count = 0; // In MIPS register "r2" (v0) holds the integer function return values const RegisterInfo *r2_reg_info = reg_ctx->GetRegisterInfoByName("r2", 0); @@ -860,10 +858,10 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( return_valobj_sp = ValueObjectMemory::Create( &thread, "", Address(mem_address, nullptr), return_compiler_type); return return_valobj_sp; - } else if (return_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (return_compiler_type.IsFloatingPointType(is_complex)) { if (IsSoftFloat(fp_flag)) { uint64_t raw_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0); - if (count != 1 && is_complex) + if (is_complex) return return_valobj_sp; switch (*bit_width) { default: @@ -896,7 +894,7 @@ ValueObjectSP ABISysV_mips::GetReturnValueObjectImpl( f0_value.GetData(f0_data); lldb::offset_t offset = 0; - if (count == 1 && !is_complex) { + if (!return_compiler_type.IsVectorType() && !is_complex) { switch (*bit_width) { default: return return_valobj_sp; diff --git a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp index baefbfc363d99..0dd9db0948220 100644 --- a/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp +++ b/lldb/source/Plugins/ABI/Mips/ABISysV_mips64.cpp @@ -923,7 +923,6 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( bool sucess = false; std::string name; bool is_complex; - uint32_t count; const uint32_t num_children = return_compiler_type.GetNumFields(); // A structure consisting of one or two FP values (and nothing else) will @@ -937,7 +936,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( return_compiler_type.GetFieldAtIndex(idx, name, &field_bit_offset, nullptr, nullptr); - if (field_compiler_type.IsFloatingPointType(count, is_complex)) + if (field_compiler_type.IsFloatingPointType(is_complex)) use_fp_regs = true; else found_non_fp_field = true; @@ -1044,7 +1043,7 @@ ValueObjectSP ABISysV_mips64::GetReturnValueObjectImpl( if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { padding = field_byte_offset - integer_bytes; if (integer_bytes < 8) { diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp index e4bdc44c59c10..0d25faef1c659 100644 --- 
a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc.cpp @@ -426,7 +426,6 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -454,7 +453,7 @@ Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -695,7 +694,6 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( uint64_t field_bit_offset = 0; bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = return_compiler_type.GetFieldAtIndex( idx, name, &field_bit_offset, nullptr, nullptr); @@ -741,7 +739,7 @@ ValueObjectSP ABISysV_ppc::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. if (*field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index f5327a1f403c0..63357618774d4 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -309,7 +309,6 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -339,7 +338,7 @@ Status ABISysV_ppc64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index 822c93dbbec3d..a5547a4699ca9 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -643,11 +643,10 @@ ABISysV_riscv::GetReturnValueObjectSimple(Thread &thread, } // Floating point return type. else if (type_flags & eTypeIsFloat) { - uint32_t float_count = 0; bool is_complex = false; - if (compiler_type.IsFloatingPointType(float_count, is_complex) && - float_count == 1 && !is_complex) { + if (compiler_type.IsFloatingPointType(is_complex) && + !(type_flags & eTypeIsVector) && !is_complex) { const uint32_t arch_fp_flags = arch.GetFlags() & ArchSpec::eRISCV_float_abi_mask; return_valobj_sp = GetValObjFromFPRegs( @@ -799,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) { .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"}, is_hw_fp) .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp) + // vlenb is constant and needed for vector unwinding. 
+ .Case("vlenb", true) .Default(false); return is_callee_saved; @@ -816,9 +817,9 @@ void ABISysV_riscv::Terminate() { static uint32_t GetGenericNum(llvm::StringRef name) { return llvm::StringSwitch<uint32_t>(name) .Case("pc", LLDB_REGNUM_GENERIC_PC) - .Cases("ra", "x1", LLDB_REGNUM_GENERIC_RA) - .Cases("sp", "x2", LLDB_REGNUM_GENERIC_SP) - .Cases("fp", "s0", LLDB_REGNUM_GENERIC_FP) + .Cases({"ra", "x1"}, LLDB_REGNUM_GENERIC_RA) + .Cases({"sp", "x2"}, LLDB_REGNUM_GENERIC_SP) + .Cases({"fp", "s0"}, LLDB_REGNUM_GENERIC_FP) .Case("a0", LLDB_REGNUM_GENERIC_ARG1) .Case("a1", LLDB_REGNUM_GENERIC_ARG2) .Case("a2", LLDB_REGNUM_GENERIC_ARG3) diff --git a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp index 5e52b6e4db499..301c3b309ffd5 100644 --- a/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp +++ b/lldb/source/Plugins/ABI/SystemZ/ABISysV_s390x.cpp @@ -393,7 +393,6 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -423,7 +422,7 @@ Status ABISysV_s390x::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp index eaeed6c04590c..ee79abe55ead0 100644 --- a/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIMacOSX_i386.cpp @@ -198,7 +198,6 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -240,7 +239,7 @@ Status ABIMacOSX_i386::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index effb3de8215d6..29fd9f0eceb93 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -307,7 +307,6 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -337,7 +336,7 @@ Status ABISysV_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -587,7 +586,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool 
is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -606,7 +604,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -696,7 +694,6 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( is_memory = false; for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; - uint32_t count; bool is_complex; CompilerType field_compiler_type = aggregate_compiler_types[idx]; @@ -736,7 +733,7 @@ ValueObjectSP ABISysV_x86_64::GetReturnValueObjectImpl( // return a nullptr return value object. return return_valobj_sp; } - } else if (field_compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (field_compiler_type.IsFloatingPointType(is_complex)) { // Structs with long doubles are always passed in memory. if (field_bit_width == 128) { is_memory = true; diff --git a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp index 339012cffb688..6520af2f643ee 100644 --- a/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIWindows_x86_64.cpp @@ -312,7 +312,6 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, Thread *thread = frame_sp->GetThread().get(); bool is_signed; - uint32_t count; bool is_complex; RegisterContext *reg_ctx = thread->GetRegisterContext().get(); @@ -342,7 +341,7 @@ Status ABIWindows_x86_64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, "We don't support returning longer than 64 bit " "integer values at present."); } - } else if (compiler_type.IsFloatingPointType(count, is_complex)) { + } else if (compiler_type.IsFloatingPointType(is_complex)) { if (is_complex) error = Status::FromErrorString( "We don't support returning complex values at present"); @@ -558,7 +557,6 @@ static bool FlattenAggregateType( for (uint32_t idx = 0; idx < num_children; ++idx) { std::string name; bool is_signed; - uint32_t count; bool is_complex; uint64_t field_bit_offset = 0; @@ -582,7 +580,7 @@ static bool FlattenAggregateType( const uint32_t field_type_flags = field_compiler_type.GetTypeInfo(); if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { aggregate_field_offsets.push_back(field_byte_offset); aggregate_compiler_types.push_back(field_compiler_type); } else if (field_type_flags & eTypeHasChildren) { @@ -672,7 +670,6 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( for (uint32_t idx = 0; idx < num_children; idx++) { bool is_signed; bool is_complex; - uint32_t count; CompilerType field_compiler_type = aggregate_compiler_types[idx]; uint32_t field_byte_width = @@ -691,7 +688,7 @@ ValueObjectSP ABIWindows_x86_64::GetReturnValueObjectImpl( uint32_t copy_from_offset = 0; if (field_compiler_type.IsIntegerOrEnumerationType(is_signed) || field_compiler_type.IsPointerType() || - field_compiler_type.IsFloatingPointType(count, is_complex)) { + field_compiler_type.IsFloatingPointType(is_complex)) { copy_from_extractor = &rax_data; 
copy_from_offset = used_bytes; used_bytes += field_byte_width; diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 1d210ea78df1a..2d0a4f67499ee 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -789,6 +789,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( // Search for the kext on the local filesystem via the UUID if (!m_module_sp && m_uuid.IsValid()) { ModuleSpec module_spec; + module_spec.SetTarget(target.shared_from_this()); module_spec.GetUUID() = m_uuid; if (!m_uuid.IsValid()) module_spec.GetArchitecture() = target.GetArchitecture(); @@ -801,9 +802,8 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( // system. PlatformSP platform_sp(target.GetPlatform()); if (platform_sp) { - FileSpecList search_paths = target.GetExecutableSearchPaths(); - platform_sp->GetSharedModule(module_spec, process, m_module_sp, - &search_paths, nullptr, nullptr); + platform_sp->GetSharedModule(module_spec, process, m_module_sp, nullptr, + nullptr); } // Ask the Target to find this file on the local system, if possible. diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index 326b6910b5267..470fc2a2fdbb9 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -901,10 +901,9 @@ void DynamicLoaderPOSIXDYLD::ResolveExecutableModule( if (module_sp && module_sp->MatchesModuleSpec(module_spec)) return; + module_spec.SetTarget(target.shared_from_this()); const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths()); - auto error = platform_sp->ResolveExecutable( - module_spec, module_sp, - !executable_search_paths.IsEmpty() ? &executable_search_paths : nullptr); + auto error = platform_sp->ResolveExecutable(module_spec, module_sp); if (error.Fail()) { StreamString stream; module_spec.Dump(stream); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 990074566be7e..c99f6c6e5e2c5 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -115,6 +115,7 @@ class ClangExpressionParser::LLDBPreprocessorCallbacks : public PPCallbacks { ClangModulesDeclVendor &m_decl_vendor; ClangPersistentVariables &m_persistent_vars; clang::SourceManager &m_source_mgr; + /// Accumulates error messages across all moduleImport calls. 
StreamString m_error_stream; bool m_has_errors = false; @@ -140,11 +141,12 @@ class ClangExpressionParser::LLDBPreprocessorCallbacks : public PPCallbacks { module.path.push_back( ConstString(component.getIdentifierInfo()->getName())); - StreamString error_stream; - ClangModulesDeclVendor::ModuleVector exported_modules; - if (!m_decl_vendor.AddModule(module, &exported_modules, m_error_stream)) + if (auto err = m_decl_vendor.AddModule(module, &exported_modules)) { m_has_errors = true; + m_error_stream.PutCString(llvm::toString(std::move(err))); + m_error_stream.PutChar('\n'); + } for (ClangModulesDeclVendor::ModuleID module : exported_modules) m_persistent_vars.AddHandLoadedClangModule(module); @@ -169,9 +171,9 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { : m_options(opts), m_filename(filename) { m_options.ShowPresumedLoc = true; m_options.ShowLevel = false; - m_os = std::make_shared<llvm::raw_string_ostream>(m_output); + m_os = std::make_unique<llvm::raw_string_ostream>(m_output); m_passthrough = - std::make_shared<clang::TextDiagnosticPrinter>(*m_os, m_options); + std::make_unique<clang::TextDiagnosticPrinter>(*m_os, m_options); } void ResetManager(DiagnosticManager *manager = nullptr) { @@ -313,11 +315,11 @@ class ClangDiagnosticManagerAdapter : public clang::DiagnosticConsumer { private: DiagnosticManager *m_manager = nullptr; DiagnosticOptions m_options; - std::shared_ptr<clang::TextDiagnosticPrinter> m_passthrough; - /// Output stream of m_passthrough. - std::shared_ptr<llvm::raw_string_ostream> m_os; /// Output string filled by m_os. std::string m_output; + /// Output stream of m_passthrough. + std::unique_ptr<llvm::raw_string_ostream> m_os; + std::unique_ptr<clang::TextDiagnosticPrinter> m_passthrough; StringRef m_filename; }; @@ -1502,7 +1504,7 @@ lldb_private::Status ClangExpressionParser::DoPrepareForExecution( LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__, lang.GetDescription().data()); lldb::ProcessSP process_sp = exe_ctx.GetProcessSP(); - if (process_sp && lang != lldb::eLanguageTypeUnknown) { + if (process_sp && lang) { auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType()); if (runtime) runtime->GetIRPasses(custom_passes); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp index ff9ed9c27f70f..ad48d293ab8f0 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp @@ -383,10 +383,11 @@ bool ClangExpressionSourceCode::GetText( block->CalculateSymbolContext(&sc); if (sc.comp_unit) { - StreamString error_stream; - - decl_vendor->AddModulesForCompileUnit( - *sc.comp_unit, modules_for_macros, error_stream); + if (auto err = decl_vendor->AddModulesForCompileUnit( + *sc.comp_unit, modules_for_macros)) + LLDB_LOG_ERROR( + GetLog(LLDBLog::Expressions), std::move(err), + "Error while loading hand-imported modules:\n{0}"); } } } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index b77e2690deb06..e37c84efefdc9 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -67,13 +67,13 @@ class StoringDiagnosticConsumer : public clang::DiagnosticConsumer { IDAndDiagnostic; std::vector<IDAndDiagnostic> 
m_diagnostics; std::unique_ptr<clang::DiagnosticOptions> m_diag_opts; + /// Output string filled by m_os. Will be reused for different diagnostics. + std::string m_output; + /// Output stream of m_diag_printer. + std::unique_ptr<llvm::raw_string_ostream> m_os; /// The DiagnosticPrinter used for creating the full diagnostic messages /// that are stored in m_diagnostics. std::unique_ptr<clang::TextDiagnosticPrinter> m_diag_printer; - /// Output stream of m_diag_printer. - std::unique_ptr<llvm::raw_string_ostream> m_os; - /// Output string filled by m_os. Will be reused for different diagnostics. - std::string m_output; /// A Progress with explicitly managed lifetime. std::unique_ptr<Progress> m_current_progress_up; std::vector<std::string> m_module_build_stack; @@ -92,11 +92,11 @@ class ClangModulesDeclVendorImpl : public ClangModulesDeclVendor { ~ClangModulesDeclVendorImpl() override = default; - bool AddModule(const SourceModule &module, ModuleVector *exported_modules, - Stream &error_stream) override; + llvm::Error AddModule(const SourceModule &module, + ModuleVector *exported_modules) override; - bool AddModulesForCompileUnit(CompileUnit &cu, ModuleVector &exported_modules, - Stream &error_stream) override; + llvm::Error AddModulesForCompileUnit(CompileUnit &cu, + ModuleVector &exported_modules) override; uint32_t FindDecls(ConstString name, bool append, uint32_t max_matches, std::vector<CompilerDecl> &decls) override; @@ -273,16 +273,14 @@ void ClangModulesDeclVendorImpl::ReportModuleExports( exports.push_back(module); } -bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, - ModuleVector *exported_modules, - Stream &error_stream) { +llvm::Error +ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, + ModuleVector *exported_modules) { // Fail early. - if (m_compiler_instance->hadModuleLoaderFatalFailure()) { - error_stream.PutCString("error: Couldn't load a module because the module " - "loader is in a fatal state.\n"); - return false; - } + if (m_compiler_instance->hadModuleLoaderFatalFailure()) + return llvm::createStringError( + "couldn't load a module because the module loader is in a fatal state"); // Check if we've already imported this module. @@ -297,7 +295,7 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, if (mi != m_imported_modules.end()) { if (exported_modules) ReportModuleExports(*exported_modules, mi->second); - return true; + return llvm::Error::success(); } } @@ -315,30 +313,30 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, std::equal(sysroot_begin, sysroot_end, path_begin); // No need to inject search paths to modules in the sysroot. 
if (!is_system_module) { - auto error = [&]() { - error_stream.Printf("error: No module map file in %s\n", - module.search_path.AsCString()); - return false; - }; - bool is_system = true; bool is_framework = false; auto dir = HS.getFileMgr().getOptionalDirectoryRef( module.search_path.GetStringRef()); if (!dir) - return error(); + return llvm::createStringError( + "couldn't find module search path directory %s", + module.search_path.GetCString()); + auto file = HS.lookupModuleMapFile(*dir, is_framework); if (!file) - return error(); + return llvm::createStringError("couldn't find modulemap file in %s", + module.search_path.GetCString()); + if (HS.parseAndLoadModuleMapFile(*file, is_system)) - return error(); + return llvm::createStringError( + "failed to parse and load modulemap file in %s", + module.search_path.GetCString()); } } - if (!HS.lookupModule(module.path.front().GetStringRef())) { - error_stream.Printf("error: Header search couldn't locate module '%s'\n", - module.path.front().AsCString()); - return false; - } + + if (!HS.lookupModule(module.path.front().GetStringRef())) + return llvm::createStringError("header search couldn't locate module '%s'", + module.path.front().AsCString()); llvm::SmallVector<clang::IdentifierLoc, 4> clang_path; @@ -364,22 +362,29 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, clang::Module *top_level_module = DoGetModule(clang_path.front(), false); if (!top_level_module) { + lldb_private::StreamString error_stream; diagnostic_consumer->DumpDiagnostics(error_stream); - error_stream.Printf("error: Couldn't load top-level module %s\n", - module.path.front().AsCString()); - return false; + + return llvm::createStringError(llvm::formatv( + "couldn't load top-level module {0}:\n{1}", + module.path.front().GetStringRef(), error_stream.GetString())); } clang::Module *submodule = top_level_module; for (auto &component : llvm::ArrayRef<ConstString>(module.path).drop_front()) { - submodule = submodule->findSubmodule(component.GetStringRef()); - if (!submodule) { + clang::Module *found = submodule->findSubmodule(component.GetStringRef()); + if (!found) { + lldb_private::StreamString error_stream; diagnostic_consumer->DumpDiagnostics(error_stream); - error_stream.Printf("error: Couldn't load submodule %s\n", - component.GetCString()); - return false; + + return llvm::createStringError(llvm::formatv( + "couldn't load submodule '{0}' of module '{1}':\n{2}", + component.GetStringRef(), submodule->getFullModuleName(), + error_stream.GetString())); } + + submodule = found; } // If we didn't make the submodule visible here, Clang wouldn't allow LLDB to @@ -399,10 +404,12 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, m_enabled = true; - return true; + return llvm::Error::success(); } - return false; + return llvm::createStringError( + llvm::formatv("unknown error while loading module {0}\n", + module.path.front().GetStringRef())); } bool ClangModulesDeclVendor::LanguageSupportsClangModules( @@ -424,15 +431,18 @@ bool ClangModulesDeclVendor::LanguageSupportsClangModules( } } -bool ClangModulesDeclVendorImpl::AddModulesForCompileUnit( - CompileUnit &cu, ClangModulesDeclVendor::ModuleVector &exported_modules, - Stream &error_stream) { - if (LanguageSupportsClangModules(cu.GetLanguage())) { - for (auto &imported_module : cu.GetImportedModules()) - if (!AddModule(imported_module, &exported_modules, error_stream)) - return false; - } - return true; +llvm::Error ClangModulesDeclVendorImpl::AddModulesForCompileUnit( + 
CompileUnit &cu, ClangModulesDeclVendor::ModuleVector &exported_modules) { + if (!LanguageSupportsClangModules(cu.GetLanguage())) + return llvm::Error::success(); + + llvm::Error errors = llvm::Error::success(); + + for (auto &imported_module : cu.GetImportedModules()) + if (auto err = AddModule(imported_module, &exported_modules)) + errors = llvm::joinErrors(std::move(errors), std::move(err)); + + return errors; } // ClangImporter::lookupValue diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h index ad4d060319e31..043632007b7d3 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.h @@ -41,21 +41,16 @@ class ClangModulesDeclVendor : public DeclVendor { /// The path to the exact module to be loaded. E.g., if the desired /// module is std.io, then this should be { "std", "io" }. /// - /// \param[in] exported_modules + /// \param[out] exported_modules /// If non-NULL, a pointer to a vector to populate with the ID of every /// module that is re-exported by the specified module. /// - /// \param[in] error_stream - /// A stream to populate with the output of the Clang parser when - /// it tries to load the module. - /// /// \return - /// True if the module could be loaded; false if not. If the - /// compiler encountered a fatal error during a previous module - /// load, then this will always return false for this ModuleImporter. + /// llvm::Error::success() if the module could be loaded; otherwise an + /// llvm::Error describing why it could not be. If the compiler + /// encountered a fatal error during a previous module load, then this + /// will always fail for this ModuleImporter. - virtual bool AddModule(const SourceModule &module, - ModuleVector *exported_modules, - Stream &error_stream) = 0; + virtual llvm::Error AddModule(const SourceModule &module, + ModuleVector *exported_modules) = 0; /// Add all modules referred to in a given compilation unit to the list /// of modules to search. /// /// \param[in] cu /// The compilation unit to scan for imported modules. /// - /// \param[in] exported_modules + /// \param[out] exported_modules /// A vector to populate with the ID of each module loaded (directly /// and via re-exports) in this way. /// - /// \param[in] error_stream - /// A stream to populate with the output of the Clang parser when - /// it tries to load the modules. - /// /// \return - /// True if all modules referred to by the compilation unit could be - /// loaded; false if one could not be loaded. If the compiler - /// encountered a fatal error during a previous module - /// load, then this will always return false for this ModuleImporter. + /// llvm::Error::success() if all modules referred to by the compilation + /// unit could be loaded; otherwise, the joined llvm::Error for every + /// module that could not. If the compiler encountered a fatal error + /// during a previous module load, then this will always fail for this + /// ModuleImporter. - virtual bool AddModulesForCompileUnit(CompileUnit &cu, - ModuleVector &exported_modules, - Stream &error_stream) = 0; + virtual llvm::Error + AddModulesForCompileUnit(CompileUnit &cu, ModuleVector &exported_modules) = 0; /// Enumerate all the macros that are defined by a given set of modules /// that are already imported.
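Aside: the hunks above migrate ClangModulesDeclVendor's module-import API from the bool-plus-error_stream convention to llvm::Error, accumulating per-module failures with llvm::joinErrors so a caller can drain them all at once. What follows is a minimal, self-contained sketch of that pattern built only on llvm/Support/Error.h; loadOneModule and loadAllModules are illustrative stand-ins for AddModule and AddModulesForCompileUnit, not LLDB API.

// Sketch: accumulate failures across a batch instead of stopping at the
// first one, then drain them non-fatally, mirroring the handleAllErrors
// call in SetupDeclVendor.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

// Stand-in for AddModule: fails for any module whose name starts with "bad".
static llvm::Error loadOneModule(llvm::StringRef name) {
  if (name.starts_with("bad"))
    return llvm::createStringError("couldn't load module '%s'",
                                   name.str().c_str());
  return llvm::Error::success();
}

// Stand-in for AddModulesForCompileUnit: keep going past failures and hand
// the caller every error joined into one llvm::Error.
static llvm::Error loadAllModules(llvm::ArrayRef<llvm::StringRef> names) {
  llvm::Error errors = llvm::Error::success();
  for (llvm::StringRef name : names)
    if (llvm::Error err = loadOneModule(name))
      errors = llvm::joinErrors(std::move(errors), std::move(err));
  return errors;
}

int main() {
  // Treat the failures as non-fatal: log each joined StringError rather than
  // aborting, the way SetupDeclVendor routes them to the expression log.
  if (llvm::Error err = loadAllModules({"std", "bad_module", "bad_other"}))
    llvm::handleAllErrors(std::move(err), [](const llvm::StringError &e) {
      llvm::errs() << e.getMessage() << '\n';
    });
  return 0;
}

The payoff of joinErrors over returning on the first failure is visible in AddModulesForCompileUnit above: one import failure no longer hides the others, and the caller still gets a single llvm::Error to check.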
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp index e8d5ec3c7fd96..d1feda1f0b629 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp @@ -371,26 +371,20 @@ static void SetupDeclVendor(ExecutionContext &exe_ctx, Target *target, if (!sc.comp_unit) return; - StreamString error_stream; - ClangModulesDeclVendor::ModuleVector modules_for_macros = persistent_state->GetHandLoadedClangModules(); - if (decl_vendor->AddModulesForCompileUnit(*sc.comp_unit, modules_for_macros, - error_stream)) - return; - // Failed to load some modules, so emit the error stream as a diagnostic. - if (!error_stream.Empty()) { - // The error stream already contains several Clang diagnostics that might - // be either errors or warnings, so just print them all as one remark - // diagnostic to prevent that the message starts with "error: error:". - diagnostic_manager.PutString(lldb::eSeverityInfo, error_stream.GetString()); + auto err = + decl_vendor->AddModulesForCompileUnit(*sc.comp_unit, modules_for_macros); + if (!err) return; - } - diagnostic_manager.PutString(lldb::eSeverityError, - "Unknown error while loading modules needed for " - "current compilation unit."); + // Module load errors aren't fatal to the expression evaluator. Printing + // them as diagnostics to the console would be too noisy and misleading. + // Hence, just print them to the expression log. + llvm::handleAllErrors(std::move(err), [](const llvm::StringError &e) { + LLDB_LOG(GetLog(LLDBLog::Expressions), "{0}", e.getMessage()); + }); } ClangExpressionSourceCode::WrapKind ClangUserExpression::GetWrapKind() const { diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index 5c1b7d4943b3f..2957cb716041d 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1328,32 +1328,36 @@ class Executor { m_emu, inst, 8, ZextD, [](uint64_t a, uint64_t b) { return std::max(a, b); }); } - template <typename T> - bool F_Load(T inst, const fltSemantics &(*semantics)(), - unsigned int numBits) { + template <typename I, typename T> + bool F_Load(I inst, const fltSemantics &(*semantics)()) { return transformOptional(inst.rs1.Read(m_emu), [&](auto &&rs1) { - uint64_t addr = rs1 + uint64_t(inst.imm); - uint64_t bits = *m_emu.ReadMem<uint64_t>(addr); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); + uint64_t bits = *m_emu.ReadMem<T>(addr); + unsigned numBits = sizeof(T) * 8; APFloat f(semantics(), APInt(numBits, bits)); return inst.rd.WriteAPFloat(m_emu, f); }) .value_or(false); } - bool operator()(FLW inst) { return F_Load(inst, &APFloat::IEEEsingle, 32); } - template <typename T> bool F_Store(T inst, bool isDouble) { + bool operator()(FLW inst) { + return F_Load<FLW, uint32_t>(inst, &APFloat::IEEEsingle); + } + template <typename I, typename T> bool F_Store(I inst, bool isDouble) { return transformOptional(zipOpt(inst.rs1.Read(m_emu), inst.rs2.ReadAPFloat(m_emu, isDouble)), [&](auto &&tup) { auto [rs1, rs2] = tup; - uint64_t addr = rs1 + uint64_t(inst.imm); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); uint64_t bits = rs2.bitcastToAPInt().getZExtValue(); - return m_emu.WriteMem<uint64_t>(addr, bits); + return m_emu.WriteMem<T>(addr, bits); }) .value_or(false); } - bool
operator()(FSW inst) { return F_Store(inst, false); } + bool operator()(FSW inst) { return F_Store<FSW, uint32_t>(inst, false); } std::tuple<bool, APFloat> FusedMultiplyAdd(APFloat rs1, APFloat rs2, APFloat rs3) { auto opStatus = rs1.fusedMultiplyAdd(rs2, rs3, m_emu.GetRoundingMode()); @@ -1616,8 +1620,10 @@ class Executor { bool operator()(FCVT_S_LU inst) { return FCVT_f2i(inst, &Rs::Read, APFloat::IEEEsingle()); } - bool operator()(FLD inst) { return F_Load(inst, &APFloat::IEEEdouble, 64); } - bool operator()(FSD inst) { return F_Store(inst, true); } + bool operator()(FLD inst) { + return F_Load<FLD, uint64_t>(inst, &APFloat::IEEEdouble); + } + bool operator()(FSD inst) { return F_Store<FSD, uint64_t>(inst, true); } bool operator()(FMADD_D inst) { return FMA(inst, true, 1.0f, 1.0f); } bool operator()(FMSUB_D inst) { return FMA(inst, true, 1.0f, -1.0f); } bool operator()(FNMSUB_D inst) { return FMA(inst, true, -1.0f, 1.0f); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index a2199cb65cd35..e935ea8fab813 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -103,7 +103,7 @@ CPlusPlusLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, ConstString(basename)}; } -bool CPlusPlusLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool CPlusPlusLanguage::SymbolNameFitsToLanguage(const Mangled &mangled) const { auto mangling_scheme = Mangled::GetManglingScheme(mangled.GetMangledName().GetStringRef()); return mangling_scheme == Mangled::eManglingSchemeItanium || diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h index 9a528ca7b03f9..13d436a68c691 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.h @@ -92,7 +92,7 @@ class CPlusPlusLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "cplusplus"; } - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; bool DemangledNameContainsPath(llvm::StringRef path, ConstString demangled) const override; diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 3b8e21cbb9269..c0dcb958ad85f 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -235,7 +235,7 @@ ObjCLanguage::GetFunctionNameInfo(ConstString name) const { return {func_name_type, std::nullopt}; } -bool ObjCLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { +bool ObjCLanguage::SymbolNameFitsToLanguage(const Mangled &mangled) const { ConstString demangled_name = mangled.GetDemangledName(); if (!demangled_name) return false; @@ -1065,3 +1065,10 @@ ObjCLanguage::GetBooleanFromString(llvm::StringRef str) const { .Case("NO", {false}) .Default({}); } + +bool ObjCLanguage::IsPossibleObjCMethodName(llvm::StringRef name) { + if (!name.starts_with("-[") && !name.starts_with("+[")) + return false; + + return name.ends_with("]"); +} diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index a68ea41c723de..ced6bd3290a86 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ 
b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -145,7 +145,7 @@ class ObjCLanguage : public Language { std::pair<lldb::FunctionNameType, std::optional<ConstString>> GetFunctionNameInfo(ConstString name) const override; - bool SymbolNameFitsToLanguage(Mangled mangled) const override; + bool SymbolNameFitsToLanguage(const Mangled &mangled) const override; lldb::TypeCategoryImplSP GetFormatters() override; @@ -175,13 +175,7 @@ class ObjCLanguage : public Language { static llvm::StringRef GetPluginNameStatic() { return "objc"; } - static bool IsPossibleObjCMethodName(const char *name) { - if (!name) - return false; - bool starts_right = (name[0] == '+' || name[0] == '-') && name[1] == '['; - bool ends_right = (name[strlen(name) - 1] == ']'); - return (starts_right && ends_right); - } + static bool IsPossibleObjCMethodName(llvm::StringRef name); static bool IsPossibleObjCSelector(const char *name) { if (!name) diff --git a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp index 0489f4d6ada32..faa0dd0d87321 100644 --- a/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjCPlusPlus/ObjCPlusPlusLanguage.cpp @@ -47,7 +47,7 @@ Language *ObjCPlusPlusLanguage::CreateInstance(lldb::LanguageType language) { std::optional<bool> ObjCPlusPlusLanguage::GetBooleanFromString(llvm::StringRef str) const { return llvm::StringSwitch<std::optional<bool>>(str) - .Cases("true", "YES", {true}) - .Cases("false", "NO", {false}) + .Cases({"true", "YES"}, {true}) + .Cases({"false", "NO"}, {false}) .Default({}); } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index 1717b0a896669..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -1,10 +1,13 @@ add_lldb_library(lldbPluginCPPRuntime CPPLanguageRuntime.cpp + VerboseTrapFrameRecognizer.cpp LINK_LIBS lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 21a5ebe53073a..913678b629f2f 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -12,6 +12,7 @@ #include <memory> #include "CPPLanguageRuntime.h" +#include "VerboseTrapFrameRecognizer.h" #include "llvm/ADT/StringRef.h" @@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { CPPLanguageRuntime::CPPLanguageRuntime(Process *process) : LanguageRuntime(process) { - if (process) + if (process) { process->GetTarget().GetFrameRecognizerManager().AddRecognizer( StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, std::make_shared<RegularExpression>("^std::__[^:]*::"), /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments, /*first_instruction_only=*/false); + + RegisterVerboseTrapFrameRecognizer(*process); + } } bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 81% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to 
lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..2b6bf2cd470e6 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9beb133f5595f..83e39f37d8dcf 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -320,7 +320,7 @@ extern "C" static const char *g_get_shared_cache_class_info_name = "__lldb_apple_objc_v2_get_shared_cache_class_info"; -static const char *g_get_shared_cache_class_info_body = R"( +static const char *g_get_shared_cache_class_info_definitions = R"( extern "C" { @@ -411,6 +411,9 @@ struct ClassInfo Class isa; uint32_t hash; } __attribute__((__packed__)); +)"; + +static const char *g_get_shared_cache_class_info_body = R"( uint32_t __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, @@ -418,6 +421,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, void *class_infos_ptr, uint64_t *relative_selector_offset, uint32_t class_infos_byte_size, + uint32_t *start_idx, uint32_t should_log) { *relative_selector_offset = 0; @@ -426,6 +430,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, DEBUG_PRINTF ("shared_cache_base_ptr = %p\n", shared_cache_base_ptr); DEBUG_PRINTF ("class_infos_ptr = %p\n", class_infos_ptr); DEBUG_PRINTF ("class_infos_byte_size = %u (%llu class infos)\n", class_infos_byte_size, (uint64_t)(class_infos_byte_size/sizeof(ClassInfo))); + DEBUG_PRINTF ("start_idx = %u\n", *start_idx); if (objc_opt_ro_ptr) { const objc_opt_t *objc_opt = (objc_opt_t *)objc_opt_ro_ptr; @@ -480,7 +485,11 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, DEBUG_PRINTF ("clsopt->mask = 0x%8.8x\n", clsopt->mask); DEBUG_PRINTF ("classOffsets = %p\n", classOffsets); - for (uint32_t i=0; i<clsopt->capacity; ++i) + const uint32_t original_start_idx = *start_idx; + + // Always start at the start_idx here. If it's greater than the capacity, + // it will skip the loop entirely and go to the duplicate handling below. 
+ for (uint32_t i=*start_idx; i<clsopt->capacity; ++i) { const uint64_t objectCacheOffset = classOffsets[i].objectCacheOffset; DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); @@ -524,59 +533,78 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, else { DEBUG_PRINTF("not(class_infos && idx < max_class_infos)\n"); + *start_idx = i; + break; } ++idx; } - const uint32_t *duplicate_count_ptr = (uint32_t *)&classOffsets[clsopt->capacity]; - const uint32_t duplicate_count = *duplicate_count_ptr; - const objc_classheader_v16_t *duplicateClassOffsets = (const objc_classheader_v16_t *)(&duplicate_count_ptr[1]); - - DEBUG_PRINTF ("duplicate_count = %u\n", duplicate_count); - DEBUG_PRINTF ("duplicateClassOffsets = %p\n", duplicateClassOffsets); - - for (uint32_t i=0; i<duplicate_count; ++i) - { - const uint64_t objectCacheOffset = classOffsets[i].objectCacheOffset; - DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); + if (idx < max_class_infos) { + const uint32_t *duplicate_count_ptr = (uint32_t *)&classOffsets[clsopt->capacity]; + const uint32_t duplicate_count = *duplicate_count_ptr; + const objc_classheader_v16_t *duplicateClassOffsets = (const objc_classheader_v16_t *)(&duplicate_count_ptr[1]); - if (classOffsets[i].isDuplicate) { - DEBUG_PRINTF("isDuplicate = true\n"); - continue; // duplicate - } + DEBUG_PRINTF ("duplicate_count = %u\n", duplicate_count); + DEBUG_PRINTF ("duplicateClassOffsets = %p\n", duplicateClassOffsets); - if (objectCacheOffset == 0) { - DEBUG_PRINTF("objectCacheOffset == invalidEntryOffset\n"); - continue; // invalid offset - } + const uint32_t duplicate_start_idx = + *start_idx < clsopt->capacity ? + 0 : + *start_idx - clsopt->capacity; - if (class_infos && idx < max_class_infos) + for (uint32_t i=duplicate_start_idx; i<duplicate_count; ++i) { - class_infos[idx].isa = (Class)((uint8_t *)shared_cache_base_ptr + objectCacheOffset); + const uint64_t objectCacheOffset = duplicateClassOffsets[i].objectCacheOffset; + DEBUG_PRINTF("objectCacheOffset[%u] = %u\n", i, objectCacheOffset); - // Lookup the class name. - const char *name = class_name_lookup_func(class_infos[idx].isa); - DEBUG_PRINTF("[%u] isa = %8p %s\n", idx, class_infos[idx].isa, name); + if (duplicateClassOffsets[i].isDuplicate) { + DEBUG_PRINTF("isDuplicate = true\n"); + continue; // duplicate + } - // Hash the class name so we don't have to read it. - const char *s = name; - uint32_t h = 5381; - for (unsigned char c = *s; c; c = *++s) + if (objectCacheOffset == 0) { + DEBUG_PRINTF("objectCacheOffset == invalidEntryOffset\n"); + continue; // invalid offset + } + + if (class_infos && idx < max_class_infos) { - // class_getName demangles swift names and the hash must - // be calculated on the mangled name. hash==0 means lldb - // will fetch the mangled name and compute the hash in - // ParseClassInfoArray. - if (c == '.') + class_infos[idx].isa = (Class)((uint8_t *)shared_cache_base_ptr + objectCacheOffset); + + // Lookup the class name. + const char *name = class_name_lookup_func(class_infos[idx].isa); + DEBUG_PRINTF("[%u] isa = %8p %s\n", idx, class_infos[idx].isa, name); + + // Hash the class name so we don't have to read it. + const char *s = name; + uint32_t h = 5381; + for (unsigned char c = *s; c; c = *++s) { - h = 0; - break; + // class_getName demangles swift names and the hash must + // be calculated on the mangled name. hash==0 means lldb + // will fetch the mangled name and compute the hash in + // ParseClassInfoArray. 
+ if (c == '.') + { + h = 0; + break; + } + h = ((h << 5) + h) + c; } - h = ((h << 5) + h) + c; + class_infos[idx].hash = h; + } else { + DEBUG_PRINTF("not(class_infos && idx < max_class_infos)\n"); + *start_idx = i; + break; } - class_infos[idx].hash = h; + ++idx; } } + // Always make sure start_idx gets updated. Otherwise we have an infinite + // loop if there are exactly max_class_infos number of classes. + if (*start_idx == original_start_idx) { + *start_idx = idx; + } } else if (objc_opt->version >= 12 && objc_opt->version <= 15) { @@ -1937,6 +1965,7 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: class_name_getter_function_name.AsCString(), class_name_getter_function_name.AsCString()); + shared_class_expression += g_get_shared_cache_class_info_definitions; shared_class_expression += g_get_shared_cache_class_info_body; auto utility_fn_or_error = exe_ctx.GetTargetRef().CreateUtilityFunction( @@ -1958,6 +1987,9 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: CompilerType clang_uint64_t_pointer_type = scratch_ts_sp->GetBuiltinTypeForEncodingAndBitSize(eEncodingUint, 64) .GetPointerType(); + CompilerType clang_uint32_t_pointer_type = + scratch_ts_sp->GetBuiltinTypeForEncodingAndBitSize(eEncodingUint, 32) + .GetPointerType(); // Next make the function caller for our implementation utility function. ValueList arguments; @@ -1975,6 +2007,13 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor:: value.SetValueType(Value::ValueType::Scalar); value.SetCompilerType(clang_uint32_t_type); arguments.PushValue(value); + + value.SetValueType(Value::ValueType::Scalar); + value.SetCompilerType(clang_uint32_t_pointer_type); + arguments.PushValue(value); + + value.SetValueType(Value::ValueType::Scalar); + value.SetCompilerType(clang_uint32_t_type); arguments.PushValue(value); std::unique_ptr<UtilityFunction> utility_fn = std::move(*utility_fn_or_error); @@ -2312,10 +2351,7 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { // The number of entries to pre-allocate room for. // Each entry is (addrsize + 4) bytes - // FIXME: It is not sustainable to continue incrementing this value every time - // the shared cache grows. This is because it requires allocating memory in - // the inferior process and some inferior processes have small memory limits. 
- const uint32_t max_num_classes = 212992; + const uint32_t max_num_classes_in_buffer = 212992; UtilityFunction *get_class_info_code = GetClassInfoUtilityFunction(exe_ctx); if (!get_class_info_code) { @@ -2337,15 +2373,22 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { DiagnosticManager diagnostics; const uint32_t class_info_byte_size = addr_size + 4; - const uint32_t class_infos_byte_size = max_num_classes * class_info_byte_size; + const uint32_t class_infos_byte_size = + max_num_classes_in_buffer * class_info_byte_size; lldb::addr_t class_infos_addr = process->AllocateMemory( class_infos_byte_size, ePermissionsReadable | ePermissionsWritable, err); const uint32_t relative_selector_offset_addr_size = 64; lldb::addr_t relative_selector_offset_addr = process->AllocateMemory(relative_selector_offset_addr_size, ePermissionsReadable | ePermissionsWritable, err); + constexpr uint32_t class_info_start_idx_byte_size = sizeof(uint32_t); + lldb::addr_t class_info_start_idx_addr = + process->AllocateMemory(class_info_start_idx_byte_size, + ePermissionsReadable | ePermissionsWritable, err); - if (class_infos_addr == LLDB_INVALID_ADDRESS) { + if (class_infos_addr == LLDB_INVALID_ADDRESS || + relative_selector_offset_addr == LLDB_INVALID_ADDRESS || + class_info_start_idx_addr == LLDB_INVALID_ADDRESS) { LLDB_LOGF(log, "unable to allocate %" PRIu32 " bytes in process for shared cache read", @@ -2353,6 +2396,17 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { return DescriptorMapUpdateResult::Fail(); } + const uint32_t start_idx_init_value = 0; + size_t bytes_written = process->WriteMemory( + class_info_start_idx_addr, &start_idx_init_value, sizeof(uint32_t), err); + if (bytes_written != sizeof(uint32_t)) { + LLDB_LOGF(log, + "unable to write %" PRIu32 + " bytes in process for shared cache read", + class_info_start_idx_byte_size); + return DescriptorMapUpdateResult::Fail(); + } + std::lock_guard<std::mutex> guard(m_mutex); // Fill in our function argument values @@ -2361,12 +2415,13 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { arguments.GetValueAtIndex(2)->GetScalar() = class_infos_addr; arguments.GetValueAtIndex(3)->GetScalar() = relative_selector_offset_addr; arguments.GetValueAtIndex(4)->GetScalar() = class_infos_byte_size; + arguments.GetValueAtIndex(5)->GetScalar() = class_info_start_idx_addr; // Only dump the runtime classes from the expression evaluation if the log is // verbose: Log *type_log = GetLog(LLDBLog::Types); bool dump_log = type_log && type_log->GetVerbose(); - arguments.GetValueAtIndex(5)->GetScalar() = dump_log ? 1 : 0; + arguments.GetValueAtIndex(6)->GetScalar() = dump_log ? 1 : 0; bool success = false; @@ -2393,78 +2448,80 @@ AppleObjCRuntimeV2::SharedCacheClassInfoExtractor::UpdateISAToDescriptorMap() { diagnostics.Clear(); - // Run the function - ExpressionResults results = - get_shared_cache_class_info_function->ExecuteFunction( - exe_ctx, &m_args, options, diagnostics, return_value); - - if (results == eExpressionCompleted) { - // The result is the number of ClassInfo structures that were filled in - num_class_infos = return_value.GetScalar().ULong(); - LLDB_LOG(log, "Discovered {0} Objective-C classes in the shared cache", - num_class_infos); - // Assert if there were more classes than we pre-allocated - // room for.
- assert(num_class_infos <= max_num_classes); - if (num_class_infos > 0) { - if (num_class_infos > max_num_classes) { - num_class_infos = max_num_classes; - - success = false; - } else { + uint32_t num_class_infos_read = 0; + bool already_read_relative_selector_offset = false; + + do { + // Run the function. + ExpressionResults results = + get_shared_cache_class_info_function->ExecuteFunction( + exe_ctx, &m_args, options, diagnostics, return_value); + + if (results == eExpressionCompleted) { + // The result is the number of ClassInfo structures that were filled in. + num_class_infos_read = return_value.GetScalar().ULong(); + num_class_infos += num_class_infos_read; + LLDB_LOG(log, "Discovered {0} Objective-C classes in the shared cache", + num_class_infos_read); + if (num_class_infos_read > 0) { success = true; - } - // Read the relative selector offset. - DataBufferHeap relative_selector_offset_buffer(64, 0); - if (process->ReadMemory(relative_selector_offset_addr, - relative_selector_offset_buffer.GetBytes(), - relative_selector_offset_buffer.GetByteSize(), - err) == - relative_selector_offset_buffer.GetByteSize()) { - DataExtractor relative_selector_offset_data( - relative_selector_offset_buffer.GetBytes(), - relative_selector_offset_buffer.GetByteSize(), - process->GetByteOrder(), addr_size); - lldb::offset_t offset = 0; - uint64_t relative_selector_offset = - relative_selector_offset_data.GetU64(&offset); - if (relative_selector_offset > 0) { - // The offset is relative to the objc_opt struct. - m_runtime.SetRelativeSelectorBaseAddr(objc_opt_ptr + - relative_selector_offset); + // Read the relative selector offset. This only needs to occur once no + // matter how many times the function is called. + if (!already_read_relative_selector_offset) { + DataBufferHeap relative_selector_offset_buffer(64, 0); + if (process->ReadMemory( + relative_selector_offset_addr, + relative_selector_offset_buffer.GetBytes(), + relative_selector_offset_buffer.GetByteSize(), + err) == relative_selector_offset_buffer.GetByteSize()) { + DataExtractor relative_selector_offset_data( + relative_selector_offset_buffer.GetBytes(), + relative_selector_offset_buffer.GetByteSize(), + process->GetByteOrder(), addr_size); + lldb::offset_t offset = 0; + uint64_t relative_selector_offset = + relative_selector_offset_data.GetU64(&offset); + if (relative_selector_offset > 0) { + // The offset is relative to the objc_opt struct. 
+ m_runtime.SetRelativeSelectorBaseAddr(objc_opt_ptr + + relative_selector_offset); + } + } + already_read_relative_selector_offset = true; } - } - - // Read the ClassInfo structures - DataBufferHeap class_infos_buffer( - num_class_infos * class_info_byte_size, 0); - if (process->ReadMemory(class_infos_addr, class_infos_buffer.GetBytes(), - class_infos_buffer.GetByteSize(), - err) == class_infos_buffer.GetByteSize()) { - DataExtractor class_infos_data(class_infos_buffer.GetBytes(), - class_infos_buffer.GetByteSize(), - process->GetByteOrder(), addr_size); - m_runtime.ParseClassInfoArray(class_infos_data, num_class_infos); + // Read the ClassInfo structures + DataBufferHeap class_infos_buffer( + num_class_infos_read * class_info_byte_size, 0); + if (process->ReadMemory(class_infos_addr, + class_infos_buffer.GetBytes(), + class_infos_buffer.GetByteSize(), + err) == class_infos_buffer.GetByteSize()) { + DataExtractor class_infos_data(class_infos_buffer.GetBytes(), + class_infos_buffer.GetByteSize(), + process->GetByteOrder(), addr_size); + + m_runtime.ParseClassInfoArray(class_infos_data, + num_class_infos_read); + } } - } else { - success = true; - } - } else { - if (log) { + } else if (log) { LLDB_LOGF(log, "Error evaluating our find class name function."); diagnostics.Dump(log); + break; } - } - } else { - if (log) { - LLDB_LOGF(log, "Error writing function arguments."); - diagnostics.Dump(log); - } + } while (num_class_infos_read == max_num_classes_in_buffer); + } else if (log) { + LLDB_LOGF(log, "Error writing function arguments."); + diagnostics.Dump(log); } - // Deallocate the memory we allocated for the ClassInfo array + LLDB_LOG(log, "Processed {0} Objective-C classes total from the shared cache", + num_class_infos); + // Cleanup memory we allocated in the process. 
+ process->DeallocateMemory(relative_selector_offset_addr); + process->DeallocateMemory(class_info_start_idx_addr); process->DeallocateMemory(class_infos_addr); return DescriptorMapUpdateResult(success, false, num_class_infos); diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp index d40f87b1a7b42..945b70fcb96ec 100644 --- a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp +++ b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp @@ -70,7 +70,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) { using llvm::Triple; return llvm::StringSwitch<Triple::ArchType>(Str) .Case("arm", Triple::arm) - .Cases("arm64", "arm64e", Triple::aarch64) + .Cases({"arm64", "arm64e"}, Triple::aarch64) .Case("mips", Triple::mips) .Case("msp430", Triple::msp430) .Case("ppc", Triple::ppc) @@ -79,7 +79,7 @@ llvm::Triple::ArchType stringTo<llvm::Triple::ArchType>(llvm::StringRef Str) { .Case("sparc", Triple::sparc) .Case("sparcv9", Triple::sparcv9) .Case("x86", Triple::x86) - .Cases("x86_64", "x86_64h", Triple::x86_64) + .Cases({"x86_64", "x86_64h"}, Triple::x86_64) .Default(Triple::UnknownArch); } diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 097c91b623e8f..3968715a6d215 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -130,6 +130,29 @@ class ELFRelocation { RelocUnion reloc; }; + +lldb::SectionSP MergeSections(lldb::SectionSP lhs, lldb::SectionSP rhs) { + assert(lhs && rhs); + + lldb::ModuleSP lhs_module_parent = lhs->GetModule(); + lldb::ModuleSP rhs_module_parent = rhs->GetModule(); + assert(lhs_module_parent && rhs_module_parent); + + // Sanity check: the two sections should have the same file address. + if (lhs->GetFileAddress() != rhs->GetFileAddress()) + lhs_module_parent->ReportWarning( + "Mismatched addresses for section {0} when " + "merging with {1}, expected: {2:x}, " + "actual: {3:x}", + lhs->GetTypeAsCString(), + rhs_module_parent->GetFileSpec().GetPathAsConstString().GetCString(), + lhs->GetFileAddress(), rhs->GetFileAddress()); + + // We want to take the greater of the two sections. If LHS and RHS are both + // SHT_NOBITS, we should default to LHS. If RHS has a bigger file size, + // indicating it has data that wasn't stripped, we should take that instead. + return rhs->GetFileSize() > lhs->GetFileSize() ? rhs : lhs; +} } // end anonymous namespace ELFRelocation::ELFRelocation(unsigned type) { @@ -1678,7 +1701,7 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) { .Case(".ARM.exidx", eSectionTypeARMexidx) .Case(".ARM.extab", eSectionTypeARMextab) .Case(".ctf", eSectionTypeDebug) - .Cases(".data", ".tdata", eSectionTypeData) + .Cases({".data", ".tdata"}, eSectionTypeData) .Case(".eh_frame", eSectionTypeEHFrame) .Case(".gnu_debugaltlink", eSectionTypeDWARFGNUDebugAltLink) .Case(".gosymtab", eSectionTypeGoSymtab) @@ -1967,10 +1990,10 @@ void ObjectFileELF::CreateSections(SectionList &unified_section_list) { provider.AddSection(std::move(*InfoOr), std::move(section_sp)); } - // For eTypeDebugInfo files, the Symbol Vendor will take care of updating the - // unified section list.
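Several hunks in this patch migrate llvm::StringSwitch from the variadic .Cases("a", "b", value) spelling to the overload taking an initializer list of literals. A minimal sketch of the new form (assuming the headers these files already use; ToArch is an illustrative name):

#include "llvm/ADT/StringSwitch.h"
#include "llvm/TargetParser/Triple.h"

// Initializer-list .Cases groups all aliases for one result in a single call.
static llvm::Triple::ArchType ToArch(llvm::StringRef name) {
  return llvm::StringSwitch<llvm::Triple::ArchType>(name)
      .Case("arm", llvm::Triple::arm)
      .Cases({"arm64", "arm64e"}, llvm::Triple::aarch64)
      .Default(llvm::Triple::UnknownArch);
}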
- if (GetType() != eTypeDebugInfo) - unified_section_list = *m_sections_up; + // Merge the two, adding any new sections and overwriting any existing + // sections that are SHT_NOBITS. + unified_section_list = + SectionList::Merge(unified_section_list, *m_sections_up, MergeSections); // If there's a .gnu_debugdata section, we'll try to read the .symtab that's // embedded in there and replace the one in the original object file (if any). @@ -2735,9 +2758,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2784,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint32_t *dst = reinterpret_cast<uint32_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset32(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset32(rel); memcpy(dst, &truncated_addr, sizeof(uint32_t)); } } diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 9cdb8467bfc60..2218c23db5a95 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -1674,6 +1674,10 @@ void ObjectFileMachO::ProcessSegmentCommand( uint32_t segment_sect_idx; const lldb::user_id_t first_segment_sectID = context.NextSectionIdx + 1; + // 64-bit mach-o files have sections with 32-bit file offsets. If any + // section's data ends beyond UINT32_MAX, we need to do some bookkeeping to + // ensure we can still access this data correctly. + uint64_t section_offset_adjust = 0; const uint32_t num_u32s = load_cmd.cmd == LC_SEGMENT ? 7 : 8; for (segment_sect_idx = 0; segment_sect_idx < load_cmd.nsects; ++segment_sect_idx) { @@ -1697,6 +1701,16 @@ void ObjectFileMachO::ProcessSegmentCommand( // isn't stored in the abstracted Sections. m_mach_sections.push_back(sect64); + // Make sure we can load sections in mach-o files where some sections cross + // a 4GB boundary. llvm::MachO::section_64 has only a 32-bit file offset + // for the section contents, so we need to track any sections that + // overflow and adjust the offsets accordingly. + const uint64_t section_file_offset = + (uint64_t)sect64.offset + section_offset_adjust; + const uint64_t end_section_offset = (uint64_t)sect64.offset + sect64.size; + if (end_section_offset >= UINT32_MAX) + section_offset_adjust += end_section_offset & 0xFFFFFFFF00000000ull; + if (add_section) { ConstString section_name( sect64.sectname, strnlen(sect64.sectname, sizeof(sect64.sectname))); @@ -1736,13 +1750,13 @@ void ObjectFileMachO::ProcessSegmentCommand( } // Grow the section size as needed.
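The 4 GB rollover bookkeeping added above is easier to see in isolation. A hedged sketch of the same arithmetic (AdjustSectionFileOffset is a hypothetical helper mirroring the logic in ProcessSegmentCommand, not part of the patch):

#include <cstdint>

// Applies the running adjustment to a section's 32-bit file offset and grows
// the adjustment once a section's end crosses a 4 GB boundary, so later
// sections keep their recovered high bits.
inline uint64_t AdjustSectionFileOffset(uint32_t raw_offset, uint64_t size,
                                        uint64_t &offset_adjust) {
  const uint64_t file_offset = (uint64_t)raw_offset + offset_adjust;
  const uint64_t end_offset = (uint64_t)raw_offset + size;
  if (end_offset >= UINT32_MAX)
    offset_adjust += end_offset & 0xFFFFFFFF00000000ull;
  return file_offset;
}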
- if (sect64.offset) { + if (section_file_offset) { const lldb::addr_t segment_min_file_offset = segment->GetFileOffset(); const lldb::addr_t segment_max_file_offset = segment_min_file_offset + segment->GetFileSize(); - const lldb::addr_t section_min_file_offset = sect64.offset; + const lldb::addr_t section_min_file_offset = section_file_offset; const lldb::addr_t section_max_file_offset = section_min_file_offset + sect64.size; const lldb::addr_t new_file_offset = @@ -1769,10 +1783,10 @@ void ObjectFileMachO::ProcessSegmentCommand( // other sections. sect64.addr, // File VM address == addresses as they are // found in the object file - sect64.size, // VM size in bytes of this section - sect64.offset, // Offset to the data for this section in + sect64.size, // VM size in bytes of this section + section_file_offset, // Offset to the data for this section in // the file - sect64.offset ? sect64.size : 0, // Size in bytes of + section_file_offset ? sect64.size : 0, // Size in bytes of // this section as // found in the file sect64.align, @@ -1792,14 +1806,14 @@ void ObjectFileMachO::ProcessSegmentCommand( SectionSP section_sp(new Section( segment_sp, module_sp, this, ++context.NextSectionIdx, section_name, sect_type, sect64.addr - segment_sp->GetFileAddress(), sect64.size, - sect64.offset, sect64.offset == 0 ? 0 : sect64.size, sect64.align, - sect64.flags)); + section_file_offset, section_file_offset == 0 ? 0 : sect64.size, + sect64.align, sect64.flags)); // Set the section to be encrypted to match the segment bool section_is_encrypted = false; if (!segment_is_encrypted && load_cmd.filesize != 0) section_is_encrypted = context.EncryptedRanges.FindEntryThatContains( - sect64.offset) != nullptr; + section_file_offset) != nullptr; section_sp->SetIsEncrypted(segment_is_encrypted || section_is_encrypted); section_sp->SetPermissions(segment_permissions); @@ -5922,6 +5936,20 @@ Section *ObjectFileMachO::GetMachHeaderSection() { return nullptr; } +bool ObjectFileMachO::IsGOTSection(const lldb_private::Section &section) const { + assert(section.GetObjectFile() == this && "Wrong object file!"); + SectionSP segment = section.GetParent(); + if (!segment) + return false; + + const bool is_data_const_got = + segment->GetName() == "__DATA_CONST" && section.GetName() == "__got"; + const bool is_auth_const_ptr = + segment->GetName() == "__AUTH_CONST" && + (section.GetName() == "__auth_got" || section.GetName() == "__auth_ptr"); + return is_data_const_got || is_auth_const_ptr; +} + bool ObjectFileMachO::SectionIsLoadable(const Section *section) { if (!section) return false; diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h index 25643aacb3d2d..5456f0315c942 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h @@ -162,6 +162,8 @@ class ObjectFileMachO : public lldb_private::ObjectFile { lldb_private::Section *GetMachHeaderSection(); + bool IsGOTSection(const lldb_private::Section &section) const override; + // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 4984445dcbab9..244489ae06d65 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -985,7 +985,7 @@ SectionType
ObjectFilePECOFF::GetSectionType(llvm::StringRef sect_name, .Case(".stabstr", eSectionTypeDataCString) .Case(".reloc", eSectionTypeOther) // .eh_frame can be truncated to 8 chars. - .Cases(".eh_frame", ".eh_fram", eSectionTypeEHFrame) + .Cases({".eh_frame", ".eh_fram"}, eSectionTypeEHFrame) .Case(".gosymtab", eSectionTypeGoSymtab) .Case(".lldbsummaries", lldb::eSectionTypeLLDBTypeSummaries) .Case(".lldbformatters", lldb::eSectionTypeLLDBFormatters) diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 57d88f615e2b3..22b9711fda480 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -15,6 +15,8 @@ #include "lldb/Utility/UriParser.h" #include "lldb/ValueObject/ValueObject.h" +#include "llvm/ADT/DenseMap.h" + #include "AdbClient.h" #include "PlatformAndroid.h" #include "PlatformAndroidRemoteGDBServer.h" @@ -479,136 +481,90 @@ std::string PlatformAndroid::GetRunAs() { return run_as.str(); } -// Helper function to populate process status information from -// /proc/[pid]/status -void PlatformAndroid::PopulateProcessStatusInfo( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/status to get parent PID, UIDs, and GIDs - Status error; - AdbClientUP status_adb = GetAdbClient(error); - if (error.Fail()) - return; - - std::string status_output; - StreamString status_cmd; - status_cmd.Printf( - "cat /proc/%llu/status 2>/dev/null | grep -E '^(PPid|Uid|Gid):'", - static_cast<unsigned long long>(pid)); - Status status_error = - status_adb->Shell(status_cmd.GetData(), seconds(5), &status_output); +static bool NeedsCmdlineSupplement(const ProcessInstanceInfo &proc_info) { + llvm::StringRef name = + proc_info.GetExecutableFile().GetFilename().GetStringRef(); + return name.contains("app_process") || name.contains("zygote"); +} - if (status_error.Fail() || status_output.empty()) +// Fetch /proc/PID/cmdline for processes to get actual package names. +// Android apps often show as "zygote" or "app_process" without this. 
+static void SupplementWithCmdlineInfo(ProcessInstanceInfoList &proc_infos, + AdbClient *adb) { + if (proc_infos.empty()) return; - llvm::SmallVector<llvm::StringRef, 16> lines; - llvm::StringRef(status_output).split(lines, '\n'); - - for (llvm::StringRef line : lines) { - line = line.trim(); - if (line.starts_with("PPid:")) { - llvm::StringRef ppid_str = line.substr(5).trim(); - lldb::pid_t ppid; - if (llvm::to_integer(ppid_str, ppid)) - process_info.SetParentProcessID(ppid); - } else if (line.starts_with("Uid:")) { - llvm::SmallVector<llvm::StringRef, 4> uid_parts; - line.substr(4).trim().split(uid_parts, '\t', -1, false); - if (uid_parts.size() >= 2) { - uint32_t uid, euid; - if (llvm::to_integer(uid_parts[0].trim(), uid)) - process_info.SetUserID(uid); - if (llvm::to_integer(uid_parts[1].trim(), euid)) - process_info.SetEffectiveUserID(euid); - } - } else if (line.starts_with("Gid:")) { - llvm::SmallVector<llvm::StringRef, 4> gid_parts; - line.substr(4).trim().split(gid_parts, '\t', -1, false); - if (gid_parts.size() >= 2) { - uint32_t gid, egid; - if (llvm::to_integer(gid_parts[0].trim(), gid)) - process_info.SetGroupID(gid); - if (llvm::to_integer(gid_parts[1].trim(), egid)) - process_info.SetEffectiveGroupID(egid); - } + llvm::DenseMap<lldb::pid_t, ProcessInstanceInfo *> pid_map; + std::string pid_list; + for (auto &proc_info : proc_infos) { + if (NeedsCmdlineSupplement(proc_info)) { + lldb::pid_t pid = proc_info.GetProcessID(); + pid_map[pid] = &proc_info; + if (!pid_list.empty()) + pid_list += " "; + pid_list += std::to_string(pid); } } -} -// Helper function to populate command line arguments from /proc/[pid]/cmdline -void PlatformAndroid::PopulateProcessCommandLine( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/cmdline to get command line arguments - Status error; - AdbClientUP cmdline_adb = GetAdbClient(error); - if (error.Fail()) + if (pid_list.empty()) return; + Log *log = GetLog(LLDBLog::Platform); + + // Use xargs -P to parallelize cmdline fetching (up to 8 concurrent reads) + StreamString cmd; + cmd.Printf( + "echo '%s' | xargs -n 1 -P 8 sh -c " + "'echo \"$1:$(cat /proc/$1/cmdline 2>/dev/null | tr \"\\0\" \" \")\"' sh", + pid_list.c_str()); + std::string cmdline_output; - StreamString cmdline_cmd; - cmdline_cmd.Printf("cat /proc/%llu/cmdline 2>/dev/null | tr '\\000' ' '", - static_cast<unsigned long long>(pid)); - Status cmdline_error = - cmdline_adb->Shell(cmdline_cmd.GetData(), seconds(5), &cmdline_output); + Status error = adb->Shell(cmd.GetData(), seconds(5), &cmdline_output); - if (cmdline_error.Fail() || cmdline_output.empty()) + if (error.Fail() || cmdline_output.empty()) return; - cmdline_output = llvm::StringRef(cmdline_output).trim().str(); - if (cmdline_output.empty()) - return; + llvm::SmallVector<llvm::StringRef, 256> lines; + llvm::StringRef(cmdline_output).split(lines, '\n', -1, false); - llvm::SmallVector<llvm::StringRef, 16> args; - llvm::StringRef(cmdline_output).split(args, ' ', -1, false); - if (args.empty()) - return; + for (llvm::StringRef line : lines) { + line = line.trim(); + auto [pid_str, cmdline] = line.split(':'); + if (pid_str.empty() || cmdline.empty()) + continue; - process_info.SetArg0(args[0]); - Args process_args; - for (size_t i = 1; i < args.size(); i++) { - if (!args[i].empty()) - process_args.AppendArgument(args[i]); - } - process_info.SetArguments(process_args, false); -} + cmdline = cmdline.trim(); -// Helper function to populate architecture from /proc/[pid]/exe -void 
PlatformAndroid::PopulateProcessArchitecture( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/exe to get executable path for architecture detection - Status error; - AdbClientUP exe_adb = GetAdbClient(error); - if (error.Fail()) - return; + lldb::pid_t pid; + if (!llvm::to_integer(pid_str, pid) || cmdline.empty()) + continue; - std::string exe_output; - StreamString exe_cmd; - exe_cmd.Printf("readlink /proc/%llu/exe 2>/dev/null", - static_cast<unsigned long long>(pid)); - Status exe_error = exe_adb->Shell(exe_cmd.GetData(), seconds(5), &exe_output); + auto it = pid_map.find(pid); + if (it == pid_map.end()) + continue; - if (exe_error.Fail() || exe_output.empty()) - return; + ProcessInstanceInfo *proc_info = it->second; + llvm::SmallVector<llvm::StringRef, 16> args; + cmdline.split(args, ' ', -1, false); - exe_output = llvm::StringRef(exe_output).trim().str(); - - // Determine architecture from exe path - ArchSpec arch; - if (exe_output.find("64") != std::string::npos || - exe_output.find("arm64") != std::string::npos || - exe_output.find("aarch64") != std::string::npos) { - arch.SetTriple("aarch64-unknown-linux-android"); - } else if (exe_output.find("x86_64") != std::string::npos) { - arch.SetTriple("x86_64-unknown-linux-android"); - } else if (exe_output.find("x86") != std::string::npos || - exe_output.find("i686") != std::string::npos) { - arch.SetTriple("i686-unknown-linux-android"); - } else { - // Default to armv7 for 32-bit ARM (most common on Android) - arch.SetTriple("armv7-unknown-linux-android"); - } + if (!args.empty()) { + proc_info->GetExecutableFile().SetFile(args[0], FileSpec::Style::posix); + + if (args.size() > 1) { + Args process_args; + for (size_t i = 1; i < args.size(); ++i) { + if (!args[i].empty()) + process_args.AppendArgument(args[i]); + } + proc_info->SetArguments(process_args, false); + } - if (arch.IsValid()) - process_info.SetArchitecture(arch); + LLDB_LOGF(log, + "PlatformAndroid::%s supplemented PID %llu with cmdline: %s", + __FUNCTION__, static_cast<unsigned long long>(pid), + cmdline.str().c_str()); + } + } } uint32_t @@ -616,109 +572,39 @@ PlatformAndroid::FindProcesses(const ProcessInstanceInfoMatch &match_info, ProcessInstanceInfoList &proc_infos) { proc_infos.clear(); - // When LLDB is running natively on an Android device (IsHost() == true), - // use the parent class's standard Linux /proc enumeration. IsHost() is only - // true when compiled for Android (#if defined(__ANDROID__)), so calling - // PlatformLinux methods is safe (Android is Linux-based). if (IsHost()) return PlatformLinux::FindProcesses(match_info, proc_infos); - // Remote Android platform: implement process name lookup using 'pidof' over - // adb. - - // LLDB stores the search name in GetExecutableFile() (even though it's - // actually a process name like "com.android.chrome" rather than an - // executable path). If no search name is provided, we can't use - // 'pidof', so return early with no results. 
- const ProcessInstanceInfo &match_process_info = match_info.GetProcessInfo(); - if (!match_process_info.GetExecutableFile() || - match_info.GetNameMatchType() == NameMatch::Ignore) { - return 0; - } - - // Extract the process name to search for (typically an Android package name - // like "com.example.app" or binary name like "app_process64") - std::string process_name = match_process_info.GetExecutableFile().GetPath(); - if (process_name.empty()) - return 0; - - // Use adb to find the process by name - Status error; - AdbClientUP adb(GetAdbClient(error)); - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s failed to get ADB client: %s", - __FUNCTION__, error.AsCString()); - return 0; - } - - // Use 'pidof' command to get PIDs for the process name. - // Quote the process name to handle special characters (spaces, etc.) - std::string pidof_output; - StreamString command; - command.Printf("pidof '%s'", process_name.c_str()); - error = adb->Shell(command.GetData(), seconds(5), &pidof_output); - - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOG(log, "PlatformAndroid::{} 'pidof {}' failed: {}", __FUNCTION__, - process_name.c_str(), error.AsCString()); - return 0; - } - - // Parse PIDs from pidof output. - // Note: pidof can return multiple PIDs (space-separated) if multiple - // instances of the same executable are running. - pidof_output = llvm::StringRef(pidof_output).trim().str(); - if (pidof_output.empty()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s no process found with name '%s'", - __FUNCTION__, process_name.c_str()); + if (!m_remote_platform_sp) return 0; - } - - // Split the output by whitespace to handle multiple PIDs - llvm::SmallVector<llvm::StringRef, 8> pid_strings; - llvm::StringRef(pidof_output).split(pid_strings, ' ', -1, false); - - Log *log = GetLog(LLDBLog::Platform); - - // Process each PID and gather information - uint32_t num_matches = 0; - for (llvm::StringRef pid_str : pid_strings) { - pid_str = pid_str.trim(); - if (pid_str.empty()) - continue; - - lldb::pid_t pid; - if (!llvm::to_integer(pid_str, pid)) { - LLDB_LOGF(log, "PlatformAndroid::%s failed to parse PID from: '%s'", - __FUNCTION__, pid_str.str().c_str()); - continue; - } - - ProcessInstanceInfo process_info; - process_info.SetProcessID(pid); - process_info.GetExecutableFile().SetFile(process_name, - FileSpec::Style::posix); - - // Populate additional process information - PopulateProcessStatusInfo(pid, process_info); - PopulateProcessCommandLine(pid, process_info); - PopulateProcessArchitecture(pid, process_info); - - // Check if this process matches the criteria - if (match_info.Matches(process_info)) { - proc_infos.push_back(process_info); - num_matches++; - LLDB_LOGF(log, "PlatformAndroid::%s found process '%s' with PID %llu", - __FUNCTION__, process_name.c_str(), - static_cast<unsigned long long>(pid)); + // Android-specific process name handling: + // Apps spawned from zygote initially appear as "app_process" or "zygote" + // in the process list, but their actual package names (e.g., + // "com.example.app") are only available in /proc/PID/cmdline. To support + // name-based matching, we must first fetch cmdline info for all processes, + // then apply the original name filter. 
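SupplementWithCmdlineInfo above consumes lines of the form "<pid>:<cmdline>", with NUL separators already converted to spaces by the tr stage of the shell pipeline. Parsing one such line in isolation (a hedged sketch; ParseCmdlineLine is a hypothetical helper, not part of the patch):

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Splits "<pid>:<cmdline>" and converts the pid; returns false for the
// empty or malformed lines the loop above skips with `continue`.
static bool ParseCmdlineLine(llvm::StringRef line, uint64_t &pid,
                             llvm::StringRef &cmdline) {
  auto [pid_str, rest] = line.trim().split(':');
  cmdline = rest.trim();
  return !pid_str.empty() && !cmdline.empty() &&
         llvm::to_integer(pid_str, pid);
}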
+ ProcessInstanceInfoMatch broad_match_info = match_info; + broad_match_info.SetNameMatchType(NameMatch::Ignore); + + ProcessInstanceInfoList all_procs; + uint32_t count = + m_remote_platform_sp->FindProcesses(broad_match_info, all_procs); + + if (count > 0) { + Status error; + AdbClientUP adb(GetAdbClient(error)); + if (error.Success()) + SupplementWithCmdlineInfo(all_procs, adb.get()); + + // Apply the original name matching against supplemented process info. + for (auto &proc_info : all_procs) { + if (match_info.Matches(proc_info)) + proc_infos.push_back(proc_info); } } - return num_matches; + return proc_infos.size(); } std::unique_ptr<AdbSyncService> PlatformAndroid::GetSyncService(Status &error) { diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h index e771c6ae97d4d..c6a412b39d410 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h @@ -60,7 +60,7 @@ class PlatformAndroid : public platform_linux::PlatformLinux { uint32_t GetDefaultMemoryCacheLineSize() override; uint32_t FindProcesses(const ProcessInstanceInfoMatch &match_info, - ProcessInstanceInfoList &proc_infos) override; + ProcessInstanceInfoList &process_infos) override; protected: const char *GetCacheHostname() override; @@ -86,17 +86,8 @@ class PlatformAndroid : public platform_linux::PlatformLinux { protected: virtual std::unique_ptr<AdbSyncService> GetSyncService(Status &error); -private: std::string m_device_id; uint32_t m_sdk_version; - - // Helper functions for process information gathering - void PopulateProcessStatusInfo(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessCommandLine(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessArchitecture(lldb::pid_t pid, - ProcessInstanceInfo &process_info); }; } // namespace platform_android diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp index 4cfb0a81dc6e4..47111c97927c1 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp @@ -90,7 +90,7 @@ void PlatformAppleSimulator::GetStatus(Stream &strm) { if (!sdk.empty()) strm << " SDK Path: \"" << sdk << "\"\n"; else - strm << " SDK Path: error: unable to locate SDK\n"; + strm << " SDK Path: <unable to locate SDK>\n"; #if defined(__APPLE__) // This will get called by subclasses, so just output status on the current @@ -420,7 +420,6 @@ Status PlatformAppleSimulator::GetSymbolFile(const FileSpec &platform_file, Status PlatformAppleSimulator::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { // For iOS/tvOS/watchOS, the SDK files are all cached locally on the // host system. 
So first we ask for the file in the cached SDK, then @@ -432,12 +431,10 @@ Status PlatformAppleSimulator::GetSharedModule( error = GetSymbolFile(platform_file, module_spec.GetUUIDPtr(), platform_module_spec.GetFileSpec()); if (error.Success()) { - error = ResolveExecutable(platform_module_spec, module_sp, - module_search_paths_ptr); + error = ResolveExecutable(platform_module_spec, module_sp); } else { const bool always_create = false; - error = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, always_create); } if (module_sp) @@ -660,4 +657,3 @@ void PlatformAppleSimulator::Terminate() { PlatformDarwin::Terminate(); } } - diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h index 7fcf2c502ca6a..77d2a3b4e1cce 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h @@ -89,7 +89,6 @@ class PlatformAppleSimulator : public PlatformDarwin { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 5aad4470091bc..bfbd85ea34203 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static Status ExceptionMaskValidator(const char *string, void *unused) { @@ -331,7 +331,6 @@ Status PlatformDarwin::ResolveSymbolFile(Target &target, Status PlatformDarwin::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -341,19 +340,22 @@ Status PlatformDarwin::GetSharedModule( // module first. if (m_remote_platform_sp) { error = m_remote_platform_sp->GetSharedModule( - module_spec, process, module_sp, module_search_paths_ptr, old_modules, - did_create_ptr); + module_spec, process, module_sp, old_modules, did_create_ptr); } } if (!module_sp) { // Fall back to the local platform and find the file locally error = Platform::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); const FileSpec &platform_file = module_spec.GetFileSpec(); - if (!module_sp && module_search_paths_ptr && platform_file) { + // Get module search paths from the target if available. + TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + if (!module_sp && !module_search_paths.IsEmpty() && platform_file) { // We can try to pull off part of the file path up to the bundle // directory level and try any module search paths... 
FileSpec bundle_directory; @@ -362,9 +364,9 @@ Status PlatformDarwin::GetSharedModule( ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = bundle_directory; if (Host::ResolveExecutableInBundle(new_module_spec.GetFileSpec())) { - Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_modules, - did_create_ptr)); + Status new_error(Platform::GetSharedModule(new_module_spec, process, + module_sp, old_modules, + did_create_ptr)); if (module_sp) return new_error; @@ -376,10 +378,10 @@ Status PlatformDarwin::GetSharedModule( const size_t bundle_directory_len = bundle_directory.GetPath(bundle_dir, sizeof(bundle_dir)); char new_path[PATH_MAX]; - size_t num_module_search_paths = module_search_paths_ptr->GetSize(); + size_t num_module_search_paths = module_search_paths.GetSize(); for (size_t i = 0; i < num_module_search_paths; ++i) { const size_t search_path_len = - module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath( + module_search_paths.GetFileSpecAtIndex(i).GetPath( new_path, sizeof(new_path)); if (search_path_len < sizeof(new_path)) { snprintf(new_path + search_path_len, @@ -390,7 +392,7 @@ Status PlatformDarwin::GetSharedModule( ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = new_file_spec; Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_modules, + new_module_spec, process, module_sp, old_modules, did_create_ptr)); if (module_sp) { @@ -1122,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION minimum_version_option << '-'; switch (sdk_type) { @@ -1303,12 +1305,15 @@ PlatformDarwin::LaunchProcess(lldb_private::ProcessLaunchInfo &launch_info) { lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { const FileSpec &platform_file = module_spec.GetFileSpec(); - // See if the file is present in any of the module_search_paths_ptr + TargetSP target_sp = module_spec.GetTargetSP(); + FileSpecList module_search_paths; + if (target_sp) + module_search_paths = target_sp->GetExecutableSearchPaths(); + // See if the file is present in any of the module_search_paths // directories. - if (!module_sp && module_search_paths_ptr && platform_file) { + if (!module_sp && !module_search_paths.IsEmpty() && platform_file) { // create a vector of all the file / directory names in platform_file e.g. 
// this might be // /System/Library/PrivateFrameworks/UIFoundation.framework/UIFoundation @@ -1322,21 +1327,21 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( std::reverse(path_parts.begin(), path_parts.end()); const size_t path_parts_size = path_parts.size(); - size_t num_module_search_paths = module_search_paths_ptr->GetSize(); + size_t num_module_search_paths = module_search_paths.GetSize(); for (size_t i = 0; i < num_module_search_paths; ++i) { Log *log_verbose = GetLog(LLDBLog::Host); LLDB_LOGF( log_verbose, "PlatformRemoteDarwinDevice::GetSharedModule searching for binary in " "search-path %s", - module_search_paths_ptr->GetFileSpecAtIndex(i).GetPath().c_str()); + module_search_paths.GetFileSpecAtIndex(i).GetPath().c_str()); // Create a new FileSpec with this module_search_paths_ptr plus just the // filename ("UIFoundation"), then the parent dir plus filename // ("UIFoundation.framework/UIFoundation") etc - up to four names (to // handle "Foo.framework/Contents/MacOS/Foo") for (size_t j = 0; j < 4 && j < path_parts_size - 1; ++j) { - FileSpec path_to_try(module_search_paths_ptr->GetFileSpecAtIndex(i)); + FileSpec path_to_try(module_search_paths.GetFileSpecAtIndex(i)); // Add the components backwards. For // .../PrivateFrameworks/UIFoundation.framework/UIFoundation path_parts @@ -1356,9 +1361,9 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( if (FileSystem::Instance().Exists(path_to_try)) { ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = path_to_try; - Status new_error( - Platform::GetSharedModule(new_module_spec, process, module_sp, - nullptr, old_modules, did_create_ptr)); + Status new_error(Platform::GetSharedModule(new_module_spec, process, + module_sp, old_modules, + did_create_ptr)); if (module_sp) { module_sp->SetPlatformFileSpec(path_to_try); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h index f8a62ceb958fe..82e69e36dca0c 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h @@ -73,7 +73,6 @@ class PlatformDarwin : public PlatformPOSIX { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; @@ -189,7 +188,7 @@ class PlatformDarwin : public PlatformPOSIX { Status FindBundleBinaryInExecSearchPaths( const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, + lldb::ModuleSP &module_sp, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); // The OSType where lldb is running. 
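The recurring change across these Platform hunks drops the threaded FileSpecList *module_search_paths_ptr parameter: callers now attach the target to the ModuleSpec (SetTarget, as in the core-file hunks below), and each implementation recovers the executable search paths from it. The shared pattern, sketched as a hedged free function (GetModuleSearchPaths is illustrative, not an API added by the patch; assumes lldb's ModuleSpec/Target headers as used in the files above):

static lldb_private::FileSpecList
GetModuleSearchPaths(const lldb_private::ModuleSpec &module_spec) {
  lldb_private::FileSpecList paths;
  // The target, if any, was stashed on the spec by the caller via SetTarget.
  if (lldb::TargetSP target_sp = module_spec.GetTargetSP())
    paths = target_sp->GetExecutableSearchPaths();
  return paths;
}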
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp index 68ef81789b089..a72d94ea79c49 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp @@ -295,7 +295,6 @@ BringInRemoteFile(Platform *platform, lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( const lldb_private::ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const lldb_private::FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { Log *log = GetLog(LLDBLog::Platform); @@ -329,8 +328,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( ModuleSpec shared_cache_spec(module_spec.GetFileSpec(), image_info.uuid, image_info.data_sp); err = ModuleList::GetSharedModule(shared_cache_spec, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (module_sp) { LLDB_LOGF(log, "[%s] module %s was found in the in-memory shared cache", (IsHost() ? "host" : "remote"), @@ -348,8 +346,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( FileSystem::Instance().Resolve(device_support_spec); if (FileSystem::Instance().Exists(device_support_spec)) { ModuleSpec local_spec(device_support_spec, module_spec.GetUUID()); - err = ModuleList::GetSharedModule(local_spec, module_sp, - module_search_paths_ptr, old_modules, + err = ModuleList::GetSharedModule(local_spec, module_sp, old_modules, did_create_ptr); if (module_sp) { LLDB_LOGF(log, @@ -363,8 +360,7 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache( } } - err = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + err = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr); if (module_sp) return err; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h index e1eba08fb5584..e0142ab7ca4cb 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.h @@ -26,7 +26,6 @@ class PlatformDarwinDevice : public PlatformDarwin { protected: virtual Status GetSharedModuleWithLocalCache( const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); struct SDKDirectoryInfo { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index 07c5a523161ed..04e87b9dea699 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -719,7 +719,6 @@ void PlatformDarwinKernel::UpdateKextandKernelsLocalScan() { Status PlatformDarwinKernel::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -734,14 +733,12 @@ Status PlatformDarwinKernel::GetSharedModule( // UUID search can get here with no name - and it may be a kernel. 
if (kext_bundle_id == "mach_kernel" || kext_bundle_id.empty()) { error = GetSharedModuleKernel(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp) { return error; } } else { - return GetSharedModuleKext(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, + return GetSharedModuleKext(module_spec, process, module_sp, old_modules, did_create_ptr); } } @@ -749,13 +746,11 @@ Status PlatformDarwinKernel::GetSharedModule( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. return PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); } Status PlatformDarwinKernel::GetSharedModuleKext( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -782,8 +777,7 @@ Status PlatformDarwinKernel::GetSharedModuleKext( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp.get()) { return error; } @@ -793,7 +787,6 @@ Status PlatformDarwinKernel::GetSharedModuleKext( Status PlatformDarwinKernel::GetSharedModuleKernel( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { assert(module_sp.get() == nullptr); UpdateKextandKernelsLocalScan(); @@ -848,8 +841,7 @@ Status PlatformDarwinKernel::GetSharedModuleKernel( // Give the generic methods, including possibly calling into DebugSymbols // framework on macOS systems, a chance. 
return PlatformDarwin::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, - did_create_ptr); + old_modules, did_create_ptr); } std::vector<lldb_private::FileSpec> @@ -888,8 +880,8 @@ Status PlatformDarwinKernel::ExamineKextForMatchingUUID( ModuleSP module_sp(new Module(exe_spec)); if (module_sp && module_sp->GetObjectFile() && module_sp->MatchesModuleSpec(exe_spec)) { - Status error = ModuleList::GetSharedModule(exe_spec, exe_module_sp, - NULL, NULL, NULL); + Status error = + ModuleList::GetSharedModule(exe_spec, exe_module_sp, NULL, NULL); if (exe_module_sp && exe_module_sp->GetObjectFile()) { return error; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h index 9db9c0065613d..b5cf701a76b4d 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h @@ -60,7 +60,6 @@ class PlatformDarwinKernel : public PlatformDarwin { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; @@ -142,14 +141,14 @@ class PlatformDarwinKernel : public PlatformDarwin { Status GetSharedModuleKext(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); - Status GetSharedModuleKernel( - const ModuleSpec &module_spec, Process *process, - lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr); + Status + GetSharedModuleKernel(const ModuleSpec &module_spec, Process *process, + lldb::ModuleSP &module_sp, + llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, + bool *did_create_ptr); Status ExamineKextForMatchingUUID(const FileSpec &kext_bundle_path, const UUID &uuid, const ArchSpec &arch, diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index dad6dcd133955..e6ea75a35f921 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -182,10 +182,8 @@ PlatformMacOSX::GetSupportedArchitectures(const ArchSpec &process_host_arch) { lldb_private::Status PlatformMacOSX::GetSharedModule( const lldb_private::ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const lldb_private::FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { Status error = GetSharedModuleWithLocalCache(module_spec, module_sp, - module_search_paths_ptr, old_modules, did_create_ptr); if (module_sp) { @@ -199,9 +197,9 @@ lldb_private::Status PlatformMacOSX::GetSharedModule( lldb::ModuleSP x86_64_module_sp; llvm::SmallVector<lldb::ModuleSP, 1> old_x86_64_modules; bool did_create = false; - Status x86_64_error = GetSharedModuleWithLocalCache( - module_spec_x86_64, x86_64_module_sp, module_search_paths_ptr, - &old_x86_64_modules, &did_create); + Status x86_64_error = + GetSharedModuleWithLocalCache(module_spec_x86_64, x86_64_module_sp, + &old_x86_64_modules, &did_create); if (x86_64_module_sp && x86_64_module_sp->GetObjectFile()) { module_sp = x86_64_module_sp; if (old_modules) @@ -217,7 +215,6 @@ lldb_private::Status 
PlatformMacOSX::GetSharedModule( if (!module_sp) { error = FindBundleBinaryInExecSearchPaths(module_spec, process, module_sp, - module_search_paths_ptr, old_modules, did_create_ptr); } return error; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h index be844856ef923..9555b16551d5a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h @@ -48,7 +48,6 @@ class PlatformMacOSX : public PlatformDarwinDevice { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp index b83d07b19235c..53fab93f5e705 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp @@ -53,7 +53,7 @@ void PlatformRemoteDarwinDevice::GetStatus(Stream &strm) { if (sdk_directory) strm.Printf(" SDK Path: \"%s\"\n", sdk_directory); else - strm.PutCString(" SDK Path: error: unable to locate SDK\n"); + strm.PutCString(" SDK Path: <unable to locate SDK>\n"); const uint32_t num_sdk_infos = m_sdk_directory_infos.size(); for (uint32_t i = 0; i < num_sdk_infos; ++i) { @@ -158,7 +158,6 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file, Status PlatformRemoteDarwinDevice::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<ModuleSP> *old_modules, bool *did_create_ptr) { // For iOS, the SDK files are all cached locally on the host system. 
So first // we ask for the file in the cached SDK, then we attempt to get a shared @@ -185,7 +184,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, connected_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { m_last_module_sdk_idx = connected_sdk_idx; error.Clear(); @@ -202,7 +201,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, m_last_module_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { error.Clear(); return error; @@ -224,7 +223,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( if (GetFileInSDK(platform_file_path, current_sdk_idx, platform_module_spec.GetFileSpec())) { module_sp.reset(); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { m_last_module_sdk_idx = current_sdk_idx; error.Clear(); @@ -245,7 +244,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( platform_module_spec.GetFileSpec())) { // printf ("sdk[%u]: '%s'\n", sdk_idx, local_file.GetPath().c_str()); - error = ResolveExecutable(platform_module_spec, module_sp, nullptr); + error = ResolveExecutable(platform_module_spec, module_sp); if (module_sp) { // Remember the index of the last SDK that we found a file in in case // the wrong SDK was selected. @@ -261,8 +260,7 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( // This may not be an SDK-related module. Try whether we can bring in the // thing to our local cache. - error = GetSharedModuleWithLocalCache(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = GetSharedModuleWithLocalCache(module_spec, module_sp, old_modules, did_create_ptr); if (error.Success()) return error; @@ -271,15 +269,13 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( // directories. 
if (!module_sp) error = PlatformDarwin::FindBundleBinaryInExecSearchPaths( - module_spec, process, module_sp, module_search_paths_ptr, old_modules, - did_create_ptr); + module_spec, process, module_sp, old_modules, did_create_ptr); if (error.Success()) return error; const bool always_create = false; - error = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, always_create); if (module_sp) diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h index 557f4876e91ab..4abd74ed07584 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h @@ -47,7 +47,6 @@ class PlatformRemoteDarwinDevice : public PlatformDarwinDevice { Status GetSharedModule(const ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) override; diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index b7029fb3a95b3..f8e33eac614a4 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -84,8 +84,9 @@ bool ProcessElfCore::CanDebug(lldb::TargetSP target_sp, // For now we are just making sure the file exists for a given module if (!m_core_module_sp && FileSystem::Instance().Exists(m_core_file)) { ModuleSpec core_module_spec(m_core_file, target_sp->GetArchitecture()); + core_module_spec.SetTarget(target_sp); Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp, - nullptr, nullptr, nullptr)); + nullptr, nullptr)); if (m_core_module_sp) { ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); if (core_objfile && core_objfile->GetType() == ObjectFile::eTypeCoreFile) diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index a780b3f59aded..83d684e9ca528 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -95,8 +95,9 @@ bool ProcessMachCore::CanDebug(lldb::TargetSP target_sp, // header but we should still try to use it - // ModuleSpecList::FindMatchingModuleSpec enforces a strict arch mach. 
ModuleSpec core_module_spec(m_core_file); + core_module_spec.SetTarget(target_sp); Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp, - nullptr, nullptr, nullptr)); + nullptr, nullptr)); if (m_core_module_sp) { ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h index 6e01e2fd7653e..b6b77c4a7d160 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h @@ -9,7 +9,6 @@ #ifndef LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H #define LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H -#include "Plugins/Process/Utility/RegisterContextMemory.h" #include "ScriptedThread.h" #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Target/DynamicRegisterInfo.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt index 09103573b89c5..50569cdefaafa 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt @@ -23,6 +23,7 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces PLUGIN OperatingSystemPythonInterface.cpp ScriptInterpreterPythonInterfaces.cpp ScriptedFramePythonInterface.cpp + ScriptedFrameProviderPythonInterface.cpp ScriptedPlatformPythonInterface.cpp ScriptedProcessPythonInterface.cpp ScriptedPythonInterface.cpp diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h index 3814f46615078..b2a347951d0f2 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.h @@ -17,6 +17,7 @@ #include "OperatingSystemPythonInterface.h" #include "ScriptedBreakpointPythonInterface.h" +#include "ScriptedFrameProviderPythonInterface.h" #include "ScriptedFramePythonInterface.h" #include "ScriptedPlatformPythonInterface.h" #include "ScriptedProcessPythonInterface.h" diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp new file mode 100644 index 0000000000000..b866bf332b7b6 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/Config.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/Log.h" +#include "lldb/lldb-enumerations.h" + +#if LLDB_ENABLE_PYTHON + +// LLDB Python header must be included first +#include "../lldb-python.h" + +#include "../SWIGPythonBridge.h" +#include "../ScriptInterpreterPythonImpl.h" +#include "ScriptedFrameProviderPythonInterface.h" +#include <optional> + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::python; +using Locker = ScriptInterpreterPythonImpl::Locker; + +ScriptedFrameProviderPythonInterface::ScriptedFrameProviderPythonInterface( + ScriptInterpreterPythonImpl &interpreter) + : ScriptedFrameProviderInterface(), ScriptedPythonInterface(interpreter) {} + +llvm::Expected<StructuredData::GenericSP> +ScriptedFrameProviderPythonInterface::CreatePluginObject( + const llvm::StringRef class_name, lldb::StackFrameListSP input_frames, + StructuredData::DictionarySP args_sp) { + if (!input_frames) + return llvm::createStringError("Invalid frame list"); + + StructuredDataImpl sd_impl(args_sp); + return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr, + input_frames, sd_impl); +} + +StructuredData::ObjectSP +ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) { + Status error; + StructuredData::ObjectSP obj = Dispatch("get_frame_at_index", error, index); + + if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj, + error)) + return {}; + + return obj; +} + +#endif diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h new file mode 100644 index 0000000000000..fd163984028d3 --- /dev/null +++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
+
+#include "lldb/Host/Config.h"
+
+#if LLDB_ENABLE_PYTHON
+
+#include "ScriptedPythonInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
+#include <optional>
+
+namespace lldb_private {
+class ScriptedFrameProviderPythonInterface
+    : public ScriptedFrameProviderInterface,
+      public ScriptedPythonInterface {
+public:
+  ScriptedFrameProviderPythonInterface(
+      ScriptInterpreterPythonImpl &interpreter);
+
+  llvm::Expected<StructuredData::GenericSP>
+  CreatePluginObject(llvm::StringRef class_name,
+                     lldb::StackFrameListSP input_frames,
+                     StructuredData::DictionarySP args_sp) override;
+
+  llvm::SmallVector<AbstractMethodRequirement>
+  GetAbstractMethodRequirements() const override {
+    return llvm::SmallVector<AbstractMethodRequirement>(
+        {{"get_frame_at_index"}});
+  }
+
+  StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) override;
+};
+} // namespace lldb_private
+
+#endif // LLDB_ENABLE_PYTHON
+#endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDFRAMEPROVIDERPYTHONINTERFACE_H
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 4fdf2b12a5500..af2e0b5df4d22 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -243,4 +243,21 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
   return static_cast<lldb::DescriptionLevel>(unsigned_val);
 }
 
+template <>
+lldb::StackFrameListSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>(
+    python::PythonObject &p, Status &error) {
+
+  lldb::SBFrameList *sb_frame_list = reinterpret_cast<lldb::SBFrameList *>(
+      python::LLDBSWIGPython_CastPyObjectToSBFrameList(p.get()));
+
+  if (!sb_frame_list) {
+    error = Status::FromErrorStringWithFormat(
+        "couldn't cast lldb::SBFrameList to lldb::StackFrameListSP.");
+    return {};
+  }
+
+  return m_interpreter.GetOpaqueTypeFromSBFrameList(*sb_frame_list);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index 2335b2ef0f171..af88a69e34a13 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -188,8 +188,13 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
       // This addresses the cases where the embedded interpreter session
       // dictionary is passed to the extension initializer which is not used
       // most of the time.
+      // Note: though none of our APIs suggest defining the interfaces with
+      // varargs, we have some extant clients that were doing that. To keep
+      // from breaking them, we just say that putting varargs in these
+      // signatures turns off argument checking.
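[Editor's note: to make the interface above concrete, here is a minimal, hypothetical Python sketch of a scripted frame provider class. `get_frame_at_index` is the one required method per `GetAbstractMethodRequirements()` above; the initializer arguments mirror `CreatePluginObject(class_name, input_frames, args_sp)`, and the second class shows the varargs shape that, per the note above, disables the arity check. The return payload of `get_frame_at_index` is a placeholder, not a documented schema.]

```python
# Hypothetical sketch only -- names and payloads are illustrative.

class MyFrameProvider:
    def __init__(self, input_frames, args):
        self.input_frames = input_frames  # wraps the thread's real frames
        self.args = args                  # structured-data dictionary

    # Required abstract method (see GetAbstractMethodRequirements above).
    # Return a description of the synthetic frame at `index`, or None
    # once the provider has no more frames to offer.
    def get_frame_at_index(self, index):
        if index >= self.input_frames.GetSize():
            return None
        return {"idx": index}


class LegacyFrameProvider:
    # A varargs initializer reports UNBOUNDED max_positional_args, so the
    # arity check in the interface is skipped for classes like this one.
    def __init__(self, *args):
        self.args = args

    def get_frame_at_index(self, index):
        return None
```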
size_t num_args = sizeof...(Args); - if (num_args != arg_info->max_positional_args) { + if (arg_info->max_positional_args != PythonCallable::ArgInfo::UNBOUNDED && + num_args != arg_info->max_positional_args) { if (num_args != arg_info->max_positional_args - 1) return create_error("Passed arguments ({0}) doesn't match the number " "of expected arguments ({1}).", @@ -444,6 +449,14 @@ class ScriptedPythonInterface : virtual public ScriptedInterface { return python::SWIGBridge::ToSWIGWrapper(arg); } + python::PythonObject Transform(lldb::ThreadSP arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + + python::PythonObject Transform(lldb::StackFrameListSP arg) { + return python::SWIGBridge::ToSWIGWrapper(arg); + } + python::PythonObject Transform(lldb::ThreadPlanSP arg) { return python::SWIGBridge::ToSWIGWrapper(arg); } @@ -628,6 +641,11 @@ lldb::DescriptionLevel ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>( python::PythonObject &p, Status &error); +template <> +lldb::StackFrameListSP +ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameListSP>( + python::PythonObject &p, Status &error); + } // namespace lldb_private #endif // LLDB_ENABLE_PYTHON diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 27f5d2ee471c0..2c971262fc34e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -93,6 +93,7 @@ class SWIGBridge { static PythonObject ToSWIGWrapper(const StructuredDataImpl &data_impl); static PythonObject ToSWIGWrapper(lldb::ThreadSP thread_sp); static PythonObject ToSWIGWrapper(lldb::StackFrameSP frame_sp); + static PythonObject ToSWIGWrapper(lldb::StackFrameListSP frames_sp); static PythonObject ToSWIGWrapper(lldb::DebuggerSP debugger_sp); static PythonObject ToSWIGWrapper(lldb::WatchpointSP watchpoint_sp); static PythonObject ToSWIGWrapper(lldb::BreakpointLocationSP bp_loc_sp); @@ -269,6 +270,7 @@ void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data); void *LLDBSWIGPython_CastPyObjectToSBExecutionContext(PyObject *data); +void *LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data); } // namespace python } // namespace lldb_private diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index d257a08a2c62c..3493fa9fef635 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1526,6 +1526,11 @@ ScriptInterpreterPythonImpl::CreateScriptedFrameInterface() { return std::make_shared<ScriptedFramePythonInterface>(*this); } +ScriptedFrameProviderInterfaceSP +ScriptInterpreterPythonImpl::CreateScriptedFrameProviderInterface() { + return std::make_shared<ScriptedFrameProviderPythonInterface>(*this); +} + ScriptedThreadPlanInterfaceSP ScriptInterpreterPythonImpl::CreateScriptedThreadPlanInterface() { return std::make_shared<ScriptedThreadPlanPythonInterface>(*this); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index 00ae59c1c4241..ad2ddd2219e8a 100644 --- 
a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -101,6 +101,9 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { lldb::ScriptedFrameInterfaceSP CreateScriptedFrameInterface() override; + lldb::ScriptedFrameProviderInterfaceSP + CreateScriptedFrameProviderInterface() override; + lldb::ScriptedThreadPlanInterfaceSP CreateScriptedThreadPlanInterface() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index c049829f37219..63b2dc4ab82b0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -814,13 +814,18 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, // there... [[fallthrough]]; - case DW_TAG_base_type: + case DW_TAG_base_type: { resolve_state = Type::ResolveState::Full; + // If a builtin type's size isn't a multiple of a byte, DWARF producers may + // add a precise bit-size to the type. Use the most precise bit-size + // possible. + const uint64_t bit_size = attrs.data_bit_size + ? *attrs.data_bit_size + : attrs.byte_size.value_or(0) * 8; clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( - attrs.name.GetStringRef(), attrs.encoding, - attrs.byte_size.value_or(0) * 8); + attrs.name.GetStringRef(), attrs.encoding, bit_size); break; - + } case DW_TAG_pointer_type: encoding_data_type = Type::eEncodingIsPointerUID; break; @@ -2047,11 +2052,10 @@ static std::optional<clang::APValue> MakeAPValue(const clang::ASTContext &ast, if (is_integral) return clang::APValue(apint); - uint32_t count; bool is_complex; // FIXME: we currently support a limited set of floating point types. // E.g., 16-bit floats are not supported. - if (!clang_type.IsFloatingPointType(count, is_complex)) + if (!clang_type.IsFloatingPointType(is_complex)) return std::nullopt; return clang::APValue(llvm::APFloat( diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp index d90108f687f84..36dee1470e0a2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -22,7 +22,6 @@ #include "lldb/Utility/Stream.h" #include "lldb/Utility/Timer.h" #include "lldb/lldb-private-enumerations.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ThreadPool.h" #include <atomic> #include <optional> @@ -33,10 +32,10 @@ using namespace lldb_private::plugin::dwarf; using namespace llvm::dwarf; void ManualDWARFIndex::Index() { - if (m_indexed) - return; - m_indexed = true; + std::call_once(m_indexed_flag, [this]() { IndexImpl(); }); +} +void ManualDWARFIndex::IndexImpl() { ElapsedTime elapsed(m_index_time); LLDB_SCOPED_TIMERF("%p", static_cast<void *>(m_dwarf)); if (LoadFromCache()) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h index 0b5b2f3e84309..41e0e620a4896 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -66,8 +66,14 @@ class ManualDWARFIndex : public DWARFIndex { void Dump(Stream &s) override; private: + /// Reads the DWARF debug info to build the index once. + /// + /// Should be called before attempting to retrieve symbols. 
  void Index();
 
+  /// Call `ManualDWARFIndex::Index()` instead.
+  void IndexImpl();
+
   /// Decode a serialized version of this object from data.
   ///
   /// \param data
@@ -170,7 +176,7 @@ class ManualDWARFIndex : public DWARFIndex {
   llvm::DenseSet<uint64_t> m_type_sigs_to_avoid;
 
   IndexSet<NameToDIE> m_set;
-  bool m_indexed = false;
+  std::once_flag m_indexed_flag;
 };
 } // namespace dwarf
 } // namespace lldb_private::plugin
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 881268bc4ca03..f00e94aee9847 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -2018,7 +2018,7 @@ void SymbolFileDWARF::UpdateExternalModuleListIfNeeded() {
       }
 
       Status error = ModuleList::GetSharedModule(dwo_module_spec, module_sp,
-                                                 nullptr, nullptr, nullptr);
+                                                 nullptr, nullptr);
       if (!module_sp) {
         // ReportWarning also rate-limits based on the warning string,
         // but in a -gmodules build, each object file has a similar DAG
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
index e76b7a3cf274a..aaec1600dacff 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
@@ -1130,7 +1130,35 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) {
   if (!section_list)
     return;
 
-  for (auto pid : m_index->publics().getPublicsTable()) {
+  PublicSym32 last_sym;
+  size_t last_sym_idx = 0;
+  lldb::SectionSP section_sp;
+
+  // To estimate the size of a symbol, we use the distance to the next symbol.
+  // If there's no next symbol or the section/segment changed, the symbol
+  // takes the remaining space. The estimate can be too high if there's
+  // padding between symbols. This is similar to the algorithm used by the
+  // DIA SDK.
+  auto finish_last_symbol = [&](const PublicSym32 *next) {
+    if (!section_sp)
+      return;
+    Symbol *last = symtab.SymbolAtIndex(last_sym_idx);
+    if (!last)
+      return;
+
+    if (next && last_sym.Segment == next->Segment) {
+      assert(last_sym.Offset <= next->Offset);
+      last->SetByteSize(next->Offset - last_sym.Offset);
+    } else {
+      // The last symbol was the last one in its section.
+      assert(section_sp->GetByteSize() >= last_sym.Offset);
+      assert(!next || next->Segment > last_sym.Segment);
+      last->SetByteSize(section_sp->GetByteSize() - last_sym.Offset);
+    }
+  };
+
+  // The address map is sorted by the address of a symbol.
+ for (auto pid : m_index->publics().getAddressMap()) { PdbGlobalSymId global{pid, true}; CVSymbol sym = m_index->ReadSymbolRecord(global); auto kind = sym.kind(); @@ -1138,8 +1166,11 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { continue; PublicSym32 pub = llvm::cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(sym)); + finish_last_symbol(&pub); + + if (!section_sp || last_sym.Segment != pub.Segment) + section_sp = section_list->FindSectionByID(pub.Segment); - auto section_sp = section_list->FindSectionByID(pub.Segment); if (!section_sp) continue; @@ -1148,20 +1179,24 @@ void SymbolFileNativePDB::AddSymbols(Symtab &symtab) { (pub.Flags & PublicSymFlags::Code) != PublicSymFlags::None) type = eSymbolTypeCode; - symtab.AddSymbol(Symbol(/*symID=*/pid, - /*name=*/pub.Name, - /*type=*/type, - /*external=*/true, - /*is_debug=*/true, - /*is_trampoline=*/false, - /*is_artificial=*/false, - /*section_sp=*/section_sp, - /*value=*/pub.Offset, - /*size=*/0, - /*size_is_valid=*/false, - /*contains_linker_annotations=*/false, - /*flags=*/0)); - } + last_sym_idx = + symtab.AddSymbol(Symbol(/*symID=*/pid, + /*name=*/pub.Name, + /*type=*/type, + /*external=*/true, + /*is_debug=*/true, + /*is_trampoline=*/false, + /*is_artificial=*/false, + /*section_sp=*/section_sp, + /*value=*/pub.Offset, + /*size=*/0, + /*size_is_valid=*/false, + /*contains_linker_annotations=*/false, + /*flags=*/0)); + last_sym = pub; + } + + finish_last_symbol(nullptr); } size_t SymbolFileNativePDB::ParseFunctions(CompileUnit &comp_unit) { diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp index 1c575e90bd72c..46cf9b8524ede 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp @@ -442,6 +442,10 @@ void UdtRecordCompleter::Record::ConstructRecord() { // The end offset to a vector of field/struct that ends at the offset. std::map<uint64_t, std::vector<Member *>> end_offset_map; + auto is_last_end_offset = [&](auto it) { + return it != end_offset_map.end() && ++it == end_offset_map.end(); + }; + for (auto &pair : fields_map) { uint64_t offset = pair.first; auto &fields = pair.second; @@ -462,8 +466,23 @@ void UdtRecordCompleter::Record::ConstructRecord() { } if (iter->second.empty()) continue; - parent = iter->second.back(); - iter->second.pop_back(); + + // If the new fields come after the already added ones + // without overlap, go back to the root. + if (iter->first <= offset && is_last_end_offset(iter)) { + if (record.kind == Member::Struct) { + parent = &record; + } else { + assert(record.kind == Member::Union && + "Current record must be a union"); + assert(!record.fields.empty()); + // For unions, append the field to the last struct + parent = record.fields.back().get(); + } + } else { + parent = iter->second.back(); + iter->second.pop_back(); + } } // If it's a field, then the field is inside a union, so we can safely // increase its size by converting it to a struct to hold multiple fields. 
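[Editor's note: the size-estimation scheme used in `AddSymbols` above is compact but easy to misread in diff form. Here is a standalone Python sketch of the same idea, assuming input sorted by (segment, offset) as the address map guarantees; like the C++ code, it may overestimate when there is padding between symbols.]

```python
def estimate_sizes(publics, section_size_for_segment):
    """publics: list of (segment, offset, name) sorted by address.

    Each symbol's size is the distance to the next symbol in the same
    segment; the last symbol in a segment takes the rest of the section.
    """
    sizes = {}
    for cur, nxt in zip(publics, publics[1:] + [None]):
        seg, off, name = cur
        if nxt is not None and nxt[0] == seg:
            sizes[name] = nxt[1] - off
        else:
            sizes[name] = section_size_for_segment(seg) - off
    return sizes

# Example: two symbols in segment 1, one in segment 2 (sizes 16, 16, 24).
print(estimate_sizes([(1, 0, "a"), (1, 16, "b"), (2, 8, "c")],
                     lambda seg: 32))
```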
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 6ec054d5eac05..51cb883748514 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1000,6 +1000,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_signed: if (!type_name.empty()) { + if (type_name.starts_with("_BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/false, bit_size)); if (type_name == "wchar_t" && QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy) && (getTargetInfo() && @@ -1056,6 +1058,8 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize( case DW_ATE_unsigned: if (!type_name.empty()) { + if (type_name.starts_with("unsigned _BitInt")) + return GetType(ast.getBitIntType(/*Unsigned=*/true, bit_size)); if (type_name == "wchar_t") { if (QualTypeMatchesBitSize(bit_size, ast, ast.WCharTy)) { if (!(getTargetInfo() && @@ -3488,7 +3492,7 @@ bool TypeSystemClang::IsReferenceType(lldb::opaque_compiler_type_t type, } bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, - uint32_t &count, bool &is_complex) { + bool &is_complex) { if (type) { clang::QualType qual_type(GetCanonicalQualType(type)); @@ -3497,30 +3501,26 @@ bool TypeSystemClang::IsFloatingPointType(lldb::opaque_compiler_type_t type, clang::BuiltinType::Kind kind = BT->getKind(); if (kind >= clang::BuiltinType::Float && kind <= clang::BuiltinType::LongDouble) { - count = 1; is_complex = false; return true; } } else if (const clang::ComplexType *CT = llvm::dyn_cast<clang::ComplexType>( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(CT->getElementType().getAsOpaquePtr(), is_complex)) { - count = 2; is_complex = true; return true; } } else if (const clang::VectorType *VT = llvm::dyn_cast<clang::VectorType>( qual_type->getCanonicalTypeInternal())) { - if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), count, + if (IsFloatingPointType(VT->getElementType().getAsOpaquePtr(), is_complex)) { - count = VT->getNumElements(); is_complex = false; return true; } } } - count = 0; is_complex = false; return false; } @@ -3893,6 +3893,13 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, ->getModifiedType() .getAsOpaquePtr(), pointee_or_element_clang_type); + case clang::Type::BitInt: { + uint32_t type_flags = eTypeIsScalar | eTypeIsInteger | eTypeHasValue; + if (qual_type->isSignedIntegerType()) + type_flags |= eTypeIsSigned; + + return type_flags; + } case clang::Type::Builtin: { const clang::BuiltinType *builtin_type = llvm::cast<clang::BuiltinType>(qual_type->getCanonicalTypeInternal()); @@ -3965,9 +3972,9 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, if (complex_type) { clang::QualType complex_element_type(complex_type->getElementType()); if (complex_element_type->isIntegerType()) - complex_type_flags |= eTypeIsFloat; - else if (complex_element_type->isFloatingType()) complex_type_flags |= eTypeIsInteger; + else if (complex_element_type->isFloatingType()) + complex_type_flags |= eTypeIsFloat; } return complex_type_flags; } break; @@ -4062,12 +4069,17 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type, uint32_t vector_type_flags = eTypeHasChildren | eTypeIsVector; const clang::VectorType *vector_type = llvm::dyn_cast<clang::VectorType>( qual_type->getCanonicalTypeInternal()); - if 
(vector_type) { - if (vector_type->isIntegerType()) - vector_type_flags |= eTypeIsFloat; - else if (vector_type->isFloatingType()) - vector_type_flags |= eTypeIsInteger; - } + if (!vector_type) + return 0; + + QualType element_type = vector_type->getElementType(); + if (element_type.isNull()) + return 0; + + if (element_type->isIntegerType()) + vector_type_flags |= eTypeIsInteger; + else if (element_type->isFloatingType()) + vector_type_flags |= eTypeIsFloat; return vector_type_flags; } default: @@ -4864,12 +4876,10 @@ TypeSystemClang::GetTypeBitAlign(lldb::opaque_compiler_type_t type, return {}; } -lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) { +lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) { if (!type) return lldb::eEncodingInvalid; - count = 1; clang::QualType qual_type = RemoveWrappingTypes(GetCanonicalQualType(type)); switch (qual_type->getTypeClass()) { @@ -4903,7 +4913,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::DependentVector: case clang::Type::ExtVector: case clang::Type::Vector: - // TODO: Set this to more than one??? break; case clang::Type::BitInt: @@ -5104,11 +5113,10 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, const clang::ComplexType *complex_type = qual_type->getAsComplexIntegerType(); if (complex_type) - encoding = GetType(complex_type->getElementType()).GetEncoding(count); + encoding = GetType(complex_type->getElementType()).GetEncoding(); else encoding = lldb::eEncodingSint; } - count = 2; return encoding; } @@ -5165,7 +5173,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::SubstBuiltinTemplatePack: break; } - count = 0; + return lldb::eEncodingInvalid; } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 9e0a54209345d..375891b3cfd2f 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -651,7 +651,7 @@ class TypeSystemClang : public TypeSystem { bool IsDefined(lldb::opaque_compiler_type_t type) override; - bool IsFloatingPointType(lldb::opaque_compiler_type_t type, uint32_t &count, + bool IsFloatingPointType(lldb::opaque_compiler_type_t type, bool &is_complex) override; unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) override; @@ -837,8 +837,7 @@ class TypeSystemClang : public TypeSystem { GetBitSize(lldb::opaque_compiler_type_t type, ExecutionContextScope *exe_scope) override; - lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type, - uint64_t &count) override; + lldb::Encoding GetEncoding(lldb::opaque_compiler_type_t type) override; lldb::Format GetFormat(lldb::opaque_compiler_type_t type) override; diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 62c0ddf51c012..c999ab256fc98 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -240,13 +240,11 @@ bool CompilerType::ShouldTreatScalarValueAsAddress() const { return false; } -bool CompilerType::IsFloatingPointType(uint32_t &count, - bool &is_complex) const { +bool CompilerType::IsFloatingPointType(bool &is_complex) const { if (IsValid()) { if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->IsFloatingPointType(m_type, count, is_complex); + return type_system_sp->IsFloatingPointType(m_type, is_complex); } - count = 0; 
is_complex = false; return false; } @@ -331,9 +329,8 @@ bool CompilerType::IsInteger() const { } bool CompilerType::IsFloat() const { - uint32_t count = 0; bool is_complex = false; - return IsFloatingPointType(count, is_complex); + return IsFloatingPointType(is_complex); } bool CompilerType::IsEnumerationType() const { @@ -793,10 +790,10 @@ CompilerType::GetTypeBitAlign(ExecutionContextScope *exe_scope) const { return {}; } -lldb::Encoding CompilerType::GetEncoding(uint64_t &count) const { +lldb::Encoding CompilerType::GetEncoding() const { if (IsValid()) if (auto type_system_sp = GetTypeSystem()) - return type_system_sp->GetEncoding(m_type, count); + return type_system_sp->GetEncoding(m_type); return lldb::eEncodingInvalid; } @@ -1093,10 +1090,10 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data, if (IsAggregateType()) { return false; // Aggregate types don't have scalar values } else { - uint64_t count = 0; - lldb::Encoding encoding = GetEncoding(count); + // FIXME: check that type is scalar instead of checking encoding? + lldb::Encoding encoding = GetEncoding(); - if (encoding == lldb::eEncodingInvalid || count != 1) + if (encoding == lldb::eEncodingInvalid || (GetTypeInfo() & eTypeIsComplex)) return false; auto byte_size_or_err = GetByteSize(exe_scope); diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 9a79b3c627623..6f5348c153030 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -647,14 +647,14 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) { .Case("frame", eSectionTypeDWARFDebugFrame) .Case("info", eSectionTypeDWARFDebugInfo) .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo) - .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine) - .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr) + .Cases({"line", "line.dwo"}, eSectionTypeDWARFDebugLine) + .Cases({"line_str", "line_str.dwo"}, eSectionTypeDWARFDebugLineStr) .Case("loc", eSectionTypeDWARFDebugLoc) .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo) .Case("loclists", eSectionTypeDWARFDebugLocLists) .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo) .Case("macinfo", eSectionTypeDWARFDebugMacInfo) - .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro) + .Cases({"macro", "macro.dwo"}, eSectionTypeDWARFDebugMacro) .Case("names", eSectionTypeDWARFDebugNames) .Case("pubnames", eSectionTypeDWARFDebugPubNames) .Case("pubtypes", eSectionTypeDWARFDebugPubTypes) @@ -663,7 +663,7 @@ ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) { .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo) .Case("str", eSectionTypeDWARFDebugStr) .Case("str.dwo", eSectionTypeDWARFDebugStrDwo) - .Cases("str_offsets", "str_offs", eSectionTypeDWARFDebugStrOffsets) + .Cases({"str_offsets", "str_offs"}, eSectionTypeDWARFDebugStrOffsets) .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo) .Case("tu_index", eSectionTypeDWARFDebugTuIndex) .Case("types", eSectionTypeDWARFDebugTypes) diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index 952b2bdee1886..0c3246d238701 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -531,9 +531,9 @@ lldb::TypeSP Type::GetTypedefType() { lldb::Format Type::GetFormat() { return GetForwardCompilerType().GetFormat(); } -lldb::Encoding Type::GetEncoding(uint64_t &count) { +lldb::Encoding Type::GetEncoding() { // Make sure we resolve our type if it already hasn't been. 
- return GetForwardCompilerType().GetEncoding(count); + return GetForwardCompilerType().GetEncoding(); } bool Type::ReadFromMemory(ExecutionContext *exe_ctx, lldb::addr_t addr, diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..cff59049cdce5 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -38,6 +38,7 @@ add_lldb_library(lldbTarget RegisterNumber.cpp RemoteAwarePlatform.cpp ScriptedThreadPlan.cpp + SyntheticFrameProvider.cpp SectionLoadHistory.cpp SectionLoadList.cpp StackFrame.cpp @@ -80,7 +81,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); diff --git a/lldb/source/Target/ModuleCache.cpp b/lldb/source/Target/ModuleCache.cpp index f737836e0d971..9978946105456 100644 --- a/lldb/source/Target/ModuleCache.cpp +++ b/lldb/source/Target/ModuleCache.cpp @@ -255,7 +255,7 @@ Status ModuleCache::Get(const FileSpec &root_dir_spec, const char *hostname, cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec(); error = ModuleList::GetSharedModule(cached_module_spec, cached_module_sp, - nullptr, nullptr, did_create_ptr, false); + nullptr, did_create_ptr, false); if (error.Fail()) return error; diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 8681adaf5ea76..5b0930cf26b77 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -163,11 +163,12 @@ Platform::LocateExecutableScriptingResources(Target *target, Module &module, Status Platform::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) { if (IsHost()) - return ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_modules, + // Note: module_search_paths_ptr functionality is now handled internally + // by getting target from module_spec and calling + // target->GetExecutableSearchPaths() + return ModuleList::GetSharedModule(module_spec, module_sp, old_modules, did_create_ptr, false); // Module resolver lambda. @@ -180,16 +181,14 @@ Status Platform::GetSharedModule( resolved_spec = spec; resolved_spec.GetFileSpec().PrependPathComponent(m_sdk_sysroot); // Try to get shared module with resolved spec. - error = ModuleList::GetSharedModule(resolved_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules, did_create_ptr, false); } // If we don't have sysroot or it didn't work then // try original module spec. 
if (!error.Success()) { resolved_spec = spec; - error = ModuleList::GetSharedModule(resolved_spec, module_sp, - module_search_paths_ptr, old_modules, + error = ModuleList::GetSharedModule(resolved_spec, module_sp, old_modules, did_create_ptr, false); } if (error.Success() && module_sp) @@ -731,10 +730,8 @@ bool Platform::SetOSVersion(llvm::VersionTuple version) { return false; } -Status -Platform::ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) { +Status Platform::ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) { // We may connect to a process and use the provided executable (Don't use // local $PATH). @@ -750,9 +747,8 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec, if (resolved_module_spec.GetArchitecture().IsValid() || resolved_module_spec.GetUUID().IsValid()) { - Status error = - ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, - module_search_paths_ptr, nullptr, nullptr); + Status error = ModuleList::GetSharedModule(resolved_module_spec, + exe_module_sp, nullptr, nullptr); if (exe_module_sp && exe_module_sp->GetObjectFile()) return error; @@ -767,9 +763,9 @@ Platform::ResolveExecutable(const ModuleSpec &module_spec, Status error; for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) { resolved_module_spec.GetArchitecture() = arch; - error = - ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, - module_search_paths_ptr, nullptr, nullptr); + + error = ModuleList::GetSharedModule(resolved_module_spec, exe_module_sp, + nullptr, nullptr); if (error.Success()) { if (exe_module_sp && exe_module_sp->GetObjectFile()) break; @@ -1446,16 +1442,13 @@ const std::vector<ConstString> &Platform::GetTrapHandlerSymbolNames() { return m_trap_handlers; } -Status -Platform::GetCachedExecutable(ModuleSpec &module_spec, - lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr) { +Status Platform::GetCachedExecutable(ModuleSpec &module_spec, + lldb::ModuleSP &module_sp) { FileSpec platform_spec = module_spec.GetFileSpec(); Status error = GetRemoteSharedModule( module_spec, nullptr, module_sp, [&](const ModuleSpec &spec) { - return Platform::ResolveExecutable(spec, module_sp, - module_search_paths_ptr); + return Platform::ResolveExecutable(spec, module_sp); }, nullptr); if (error.Success()) { @@ -1497,7 +1490,7 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec, for (const ArchSpec &arch : GetSupportedArchitectures(process_host_arch)) { arch_module_spec.GetArchitecture() = arch; error = ModuleList::GetSharedModule(arch_module_spec, module_sp, nullptr, - nullptr, nullptr); + nullptr); // Did we find an executable using one of the if (error.Success() && module_sp) break; @@ -1673,11 +1666,12 @@ void Platform::CallLocateModuleCallbackIfSet(const ModuleSpec &module_spec, cached_module_spec.GetUUID().Clear(); // Clear UUID since it may contain md5 // content hash instead of real UUID. cached_module_spec.GetFileSpec() = module_file_spec; + cached_module_spec.GetSymbolFileSpec() = symbol_file_spec; cached_module_spec.GetPlatformFileSpec() = module_spec.GetFileSpec(); cached_module_spec.SetObjectOffset(0); error = ModuleList::GetSharedModule(cached_module_spec, module_sp, nullptr, - nullptr, did_create_ptr, false); + did_create_ptr, false, false); if (error.Success() && module_sp) { // Succeeded to load the module file. 
LLDB_LOGF(log, "%s: locate module callback succeeded: module=%s symbol=%s", diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..69edea503002e 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. RegisterAssertFrameRecognizer(this); - RegisterVerboseTrapFrameRecognizer(*this); } Process::~Process() { @@ -3258,6 +3256,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { if (state == eStateStopped || state == eStateCrashed) { // If we attached and actually have a process on the other end, then // this ended up being the equivalent of an attach. + SetShouldDetach(true); CompleteAttach(); // This delays passing the stopped event to listeners till diff --git a/lldb/source/Target/RemoteAwarePlatform.cpp b/lldb/source/Target/RemoteAwarePlatform.cpp index cac738ea67b4c..89b946ba75162 100644 --- a/lldb/source/Target/RemoteAwarePlatform.cpp +++ b/lldb/source/Target/RemoteAwarePlatform.cpp @@ -29,9 +29,8 @@ bool RemoteAwarePlatform::GetModuleSpec(const FileSpec &module_file_spec, return false; } -Status RemoteAwarePlatform::ResolveExecutable( - const ModuleSpec &module_spec, lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) { +Status RemoteAwarePlatform::ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) { ModuleSpec resolved_module_spec(module_spec); // The host platform can resolve the path more aggressively. 
@@ -47,12 +46,10 @@ Status RemoteAwarePlatform::ResolveExecutable( if (!FileSystem::Instance().Exists(resolved_file_spec)) FileSystem::Instance().ResolveExecutableLocation(resolved_file_spec); } else if (m_remote_platform_sp) { - return GetCachedExecutable(resolved_module_spec, exe_module_sp, - module_search_paths_ptr); + return GetCachedExecutable(resolved_module_spec, exe_module_sp); } - return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp, - module_search_paths_ptr); + return Platform::ResolveExecutable(resolved_module_spec, exe_module_sp); } Status RemoteAwarePlatform::RunShellCommand( diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 2ed58c5331df4..95b515412d693 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1344,18 +1344,18 @@ const char *StackFrame::GetDisplayFunctionName() { SourceLanguage StackFrame::GetLanguage() { CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit; if (cu) - return cu->GetLanguage(); + return SourceLanguage{cu->GetLanguage()}; return {}; } SourceLanguage StackFrame::GuessLanguage() { SourceLanguage lang_type = GetLanguage(); - if (lang_type == eLanguageTypeUnknown) { + if (!lang_type) { SymbolContext sc = GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol); if (sc.function) - lang_type = LanguageType(sc.function->GetMangled().GuessLanguage()); + lang_type = SourceLanguage(sc.function->GetMangled().GuessLanguage()); else if (sc.symbol) lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage()); } diff --git a/lldb/source/Target/SyntheticFrameProvider.cpp b/lldb/source/Target/SyntheticFrameProvider.cpp new file mode 100644 index 0000000000000..241ce82c39be3 --- /dev/null +++ b/lldb/source/Target/SyntheticFrameProvider.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Target/SyntheticFrameProvider.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Target/Thread.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/Status.h" + +using namespace lldb; +using namespace lldb_private; + +SyntheticFrameProvider::SyntheticFrameProvider(StackFrameListSP input_frames) + : m_input_frames(std::move(input_frames)) {} + +SyntheticFrameProvider::~SyntheticFrameProvider() = default; + +void SyntheticFrameProviderDescriptor::Dump(Stream *s) const { + if (!s) + return; + + s->Printf(" Name: %s\n", GetName().str().c_str()); + + // Show thread filter information. 
+ if (thread_specs.empty()) { + s->PutCString(" Thread Filter: (applies to all threads)\n"); + } else { + s->Printf(" Thread Filter: %zu specification(s)\n", thread_specs.size()); + for (size_t i = 0; i < thread_specs.size(); ++i) { + const ThreadSpec &spec = thread_specs[i]; + s->Printf(" [%zu] ", i); + spec.GetDescription(s, lldb::eDescriptionLevelVerbose); + s->PutChar('\n'); + } + } +} + +llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance( + StackFrameListSP input_frames, + const SyntheticFrameProviderDescriptor &descriptor) { + if (!input_frames) + return llvm::createStringError( + "cannot create synthetic frame provider: invalid input frames"); + + // Iterate through all registered ScriptedFrameProvider plugins. + ScriptedFrameProviderCreateInstance create_callback = nullptr; + for (uint32_t idx = 0; + (create_callback = + PluginManager::GetScriptedFrameProviderCreateCallbackAtIndex( + idx)) != nullptr; + ++idx) { + auto provider_or_err = create_callback(input_frames, descriptor); + if (!provider_or_err) { + LLDB_LOG_ERROR(GetLog(LLDBLog::Target), provider_or_err.takeError(), + "Failed to create synthetic frame provider: {0}"); + continue; + } + + if (auto frame_provider_up = std::move(*provider_or_err)) + return std::move(frame_provider_up); + } + + return llvm::createStringError( + "cannot create synthetic frame provider: no suitable plugin found"); +} + +llvm::Expected<SyntheticFrameProviderSP> SyntheticFrameProvider::CreateInstance( + StackFrameListSP input_frames, llvm::StringRef plugin_name, + const std::vector<ThreadSpec> &thread_specs) { + if (!input_frames) + return llvm::createStringError( + "cannot create synthetic frame provider: invalid input frames"); + + // Look up the specific C++ plugin by name. + SyntheticFrameProviderCreateInstance create_callback = + PluginManager::GetSyntheticFrameProviderCreateCallbackForPluginName( + plugin_name); + + if (!create_callback) + return llvm::createStringError( + "cannot create synthetic frame provider: C++ plugin '%s' not found", + plugin_name.str().c_str()); + + auto provider_or_err = create_callback(input_frames, thread_specs); + if (!provider_or_err) + return provider_or_err.takeError(); + + if (auto frame_provider_sp = std::move(*provider_or_err)) + return std::move(frame_provider_sp); + + return llvm::createStringError( + "cannot create synthetic frame provider: C++ plugin '%s' returned null", + plugin_name.str().c_str()); +} diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index d070c3d953d4a..3b51e17d1c4e0 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Target/Target.h" +#include "lldb/Breakpoint/Breakpoint.h" #include "lldb/Breakpoint/BreakpointIDList.h" #include "lldb/Breakpoint/BreakpointPrecondition.h" #include "lldb/Breakpoint/BreakpointResolver.h" @@ -1778,9 +1779,9 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, arch_spec.GetArchitectureName(), arch_spec.GetTriple().getTriple().c_str()); ModuleSpec module_spec(executable_sp->GetFileSpec(), other); - FileSpecList search_paths = GetExecutableSearchPaths(); + module_spec.SetTarget(shared_from_this()); Status error = ModuleList::GetSharedModule(module_spec, executable_sp, - &search_paths, nullptr, nullptr); + nullptr, nullptr); if (!error.Fail() && executable_sp) { SetExecutableModule(executable_sp, eLoadDependentsYes); @@ -2349,6 +2350,7 @@ 
ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
 
   // Apply any remappings specified in target.object-map:
   ModuleSpec module_spec(orig_module_spec);
+  module_spec.SetTarget(shared_from_this());
   PathMappingList &obj_mapping = GetObjectPathMap();
   if (std::optional<FileSpec> remapped_obj_file =
           obj_mapping.RemapPath(orig_module_spec.GetFileSpec().GetPath(),
@@ -2407,9 +2409,9 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
           transformed_spec.GetFileSpec().SetDirectory(transformed_dir);
           transformed_spec.GetFileSpec().SetFilename(
               module_spec.GetFileSpec().GetFilename());
+          transformed_spec.SetTarget(shared_from_this());
           error = ModuleList::GetSharedModule(transformed_spec, module_sp,
-                                              &search_paths, &old_modules,
-                                              &did_create_module);
+                                              &old_modules, &did_create_module);
         }
       }
     }
@@ -2425,9 +2427,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
       // cache.
       if (module_spec.GetUUID().IsValid()) {
         // We have a UUID, it is OK to check the global module list...
-        error =
-            ModuleList::GetSharedModule(module_spec, module_sp, &search_paths,
-                                        &old_modules, &did_create_module);
+        error = ModuleList::GetSharedModule(module_spec, module_sp,
+                                            &old_modules, &did_create_module);
       }
 
       if (!module_sp) {
@@ -2435,8 +2436,8 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
         // module in the shared module cache.
         if (m_platform_sp) {
           error = m_platform_sp->GetSharedModule(
-              module_spec, m_process_sp.get(), module_sp, &search_paths,
-              &old_modules, &did_create_module);
+              module_spec, m_process_sp.get(), module_sp, &old_modules,
+              &did_create_module);
         } else {
           error = Status::FromErrorString("no platform is currently set");
         }
@@ -3206,6 +3207,11 @@ bool Target::RunStopHooks(bool at_initial_stop) {
   bool should_stop = false;
   bool requested_continue = false;
 
+  // A stop hook might get deleted while we are running the stop hooks. The
+  // rule we follow is that deleting a stop hook while processing stop hooks
+  // removes it for FUTURE stops, but not for this stop. Copying m_stop_hooks
+  // to the active_hooks list before iterating over the hooks has exactly
+  // this effect.
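[Editor's note: the snapshot semantics described in the comment above are a general pattern, not LLDB-specific. A tiny runnable Python sketch of the same copy-before-iterate rule, where a hook that deletes itself still runs on the current stop but not on later ones:]

```python
stop_hooks = {1: "hook-one", 2: "self-deleting-hook", 3: "hook-three"}

def run_stop_hooks():
    # Snapshot first: deletions made by a hook affect only FUTURE stops.
    active_hooks = list(stop_hooks.items())
    for hook_id, hook in active_hooks:
        if hook == "self-deleting-hook":
            del stop_hooks[hook_id]  # mutates the live table, not our copy
        print("ran", hook)

run_stop_hooks()           # all three hooks run on this stop
print(sorted(stop_hooks))  # [1, 3] -- hook 2 is gone for the next stop
```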
for (auto cur_hook_sp : active_hooks) { bool any_thread_matched = false; for (auto exc_ctx : exc_ctx_with_reasons) { @@ -3961,9 +3967,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3977,19 +3981,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4002,14 +4004,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4144,7 +4145,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4154,8 +4155,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { @@ -4951,7 +4950,7 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) { SourceLanguage TargetProperties::GetLanguage() const { const uint32_t idx = ePropertyLanguage; - return {GetPropertyAtIndexAs<LanguageType>(idx, {})}; + return SourceLanguage{GetPropertyAtIndexAs<LanguageType>(idx, {})}; } llvm::StringRef TargetProperties::GetExpressionPrefixContents() { @@ -5271,3 +5270,19 @@ void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); } void Target::DumpSectionLoadList(Stream &s) { GetSectionLoadList().Dump(s, this); } + +void Target::NotifyBreakpointChanged(Breakpoint &bp, + lldb::BreakpointEventType eventKind) { + if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { + std::shared_ptr<Breakpoint::BreakpointEventData> data_sp = + std::make_shared<Breakpoint::BreakpointEventData>( + eventKind, bp.shared_from_this()); + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data_sp); + } +} + +void Target::NotifyBreakpointChanged( + Breakpoint &bp, const lldb::EventDataSP &breakpoint_data_sp) { + if (EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) + BroadcastEvent(Target::eBroadcastBitBreakpointChanged, breakpoint_data_sp); +} diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp index 188c2508a71ed..2e03bc1e38ea0 100644 --- a/lldb/source/Target/TargetList.cpp +++ b/lldb/source/Target/TargetList.cpp @@ -304,13 +304,9 @@ Status TargetList::CreateTargetInternal(Debugger &debugger, ModuleSP exe_module_sp; if 
(platform_sp) { - FileSpecList executable_search_paths( - Target::GetDefaultExecutableSearchPaths()); ModuleSpec module_spec(file, arch); - error = platform_sp->ResolveExecutable(module_spec, exe_module_sp, - executable_search_paths.GetSize() - ? &executable_search_paths - : nullptr); + module_spec.SetTarget(target_sp); + error = platform_sp->ResolveExecutable(module_spec, exe_module_sp); } if (error.Success() && exe_module_sp) { diff --git a/lldb/source/Utility/Args.cpp b/lldb/source/Utility/Args.cpp index 8ba40bae4d67e..7eff9cf3ed591 100644 --- a/lldb/source/Utility/Args.cpp +++ b/lldb/source/Utility/Args.cpp @@ -445,7 +445,7 @@ uint32_t Args::StringToGenericRegister(llvm::StringRef s) { .Case("pc", LLDB_REGNUM_GENERIC_PC) .Case("sp", LLDB_REGNUM_GENERIC_SP) .Case("fp", LLDB_REGNUM_GENERIC_FP) - .Cases("ra", "lr", LLDB_REGNUM_GENERIC_RA) + .Cases({"ra", "lr"}, LLDB_REGNUM_GENERIC_RA) .Case("flags", LLDB_REGNUM_GENERIC_FLAGS) .Case("arg1", LLDB_REGNUM_GENERIC_ARG1) .Case("arg2", LLDB_REGNUM_GENERIC_ARG2) diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). 
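[Editor's note: `MakeIndentScope` above is classic RAII. For readers coming from the Python side of LLDB, the analogous restore-on-exit behavior is what a context manager gives you; the sketch below is an analogy only, not the SBStream API.]

```python
from contextlib import contextmanager

class Stream:
    def __init__(self):
        self.indent_level = 0

    @contextmanager
    def indent_scope(self, amount=2):
        # Raise the indent level, then restore the previous value when
        # the with-block exits -- even on an exception -- mirroring the
        # guarantee Stream::IndentScope provides via its destructor.
        saved = self.indent_level
        self.indent_level += amount
        try:
            yield self
        finally:
            self.indent_level = saved

s = Stream()
with s.indent_scope(4):
    print(s.indent_level)  # 4
print(s.indent_level)      # 0
```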
+Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp index 38b9f77e6ddda..aeea32f19ee2c 100644 --- a/lldb/source/ValueObject/ValueObject.cpp +++ b/lldb/source/ValueObject/ValueObject.cpp @@ -790,8 +790,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); @@ -1669,8 +1668,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { return false; } - uint64_t count = 0; - const Encoding encoding = GetCompilerType().GetEncoding(count); + const Encoding encoding = GetCompilerType().GetEncoding(); const size_t byte_size = llvm::expectedToOptional(GetByteSize()).value_or(0); diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} diff --git a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py index 954cac1592435..8e91781b87a39 100644 --- a/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py +++ b/lldb/test/API/commands/target/stop-hooks/TestStopHookScripted.py @@ -48,6 +48,39 @@ def test_bad_handler(self): "Got the right error", ) + def test_self_deleting(self): + """Test that we can handle a stop hook that deletes itself""" + self.script_setup() + # Run to the first breakpoint before setting the stop hook + # so we don't have to figure out where it showed up in the new + # target. + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Stop here first", self.main_source_file + ) + + # Now add our stop hook and register it: + result = lldb.SBCommandReturnObject() + command = "target stop-hook add -P stop_hook.self_deleting_stop" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, f"Added my stop hook: {result.GetError()}") + + result_str = result.GetOutput() + p = re.compile("Stop hook #([0-9]+) added.") + m = p.match(result_str) + current_stop_hook_id = m.group(1) + command = "command script add -o -f stop_hook.handle_stop_hook_id handle_id" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, "Added my command") + + command = f"handle_id {current_stop_hook_id}" + self.interp.HandleCommand(command, result) + self.assertCommandReturn(result, "Registered my stop ID") + + # Now step the process and make sure the stop hook was deleted. 
+ thread.StepOver() + self.interp.HandleCommand("target stop-hook list", result) + self.assertEqual(result.GetOutput().rstrip(), "No stop hooks.", "Deleted hook") + def test_stop_hooks_scripted(self): """Test that a scripted stop hook works with no specifiers""" self.stop_hooks_scripted(5, "-I false") diff --git a/lldb/test/API/commands/target/stop-hooks/stop_hook.py b/lldb/test/API/commands/target/stop-hooks/stop_hook.py index cb7a4337c40d4..a41190baeadf2 100644 --- a/lldb/test/API/commands/target/stop-hooks/stop_hook.py +++ b/lldb/test/API/commands/target/stop-hooks/stop_hook.py @@ -48,3 +48,28 @@ def handle_stop(self): class no_handle_stop: def __init__(self, target, extra_args, dict): print("I am okay") + + +class self_deleting_stop: + def __init__(self, target, extra_args, dict): + self.target = target + + def handle_stop(self, exe_ctx, stream): + interp = exe_ctx.target.debugger.GetCommandInterpreter() + result = lldb.SBCommandReturnObject() + interp.HandleCommand("handle_id", result) + id_str = result.GetOutput().rstrip() + + command = f"target stop-hook delete {id_str}" + interp.HandleCommand(command, result) + + +stop_hook_id = 0 + + +def handle_stop_hook_id(debugger, command, exe_ctx, result, extra_args): + global stop_hook_id + if command == "": + result.AppendMessage(str(stop_hook_id)) + else: + stop_hook_id = int(command) diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py index 1c7bb538d5df7..bc53feaafa635 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py @@ -54,6 +54,24 @@ def test_remaining_location_spec(self): self, target, bkpt ) + # After enabling locate_module callback for main executables, + # the number of locations may vary depending on the platform. + num_locs = bkpt.GetNumLocations() bkpt_id = bkpt.GetID() - loc_string = f"{bkpt_id}.3" - self.runCmd(f"break disable {loc_string}") + + self.assertGreater( + num_locs, + 0, + f"Expected at least one breakpoint location, but found {num_locs}", + ) + + # Iterate through all valid locations and verify we can disable each one. + # This tests that breakpoint location IDs remain valid after rebuilds. 
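+ # (Editor's note: "break disable" addresses a location as + # "<breakpoint-id>.<location-id>", e.g. "1.2"; the location ID comes from + # SBBreakpointLocation.GetID() and need not equal the list index.)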
+ for loc_idx in range(num_locs): + loc = bkpt.GetLocationAtIndex(loc_idx) + self.assertTrue(loc.IsValid(), f"Location at index {loc_idx} is not valid") + + # Get the actual location ID from the location object + loc_id = loc.GetID() + loc_string = f"{bkpt_id}.{loc_id}" + self.runCmd(f"break disable {loc_string}") diff --git a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile index b19e7818601eb..b508da24c6828 100644 --- a/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile +++ b/lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile @@ -4,16 +4,16 @@ LD_EXTRAS := ns1.o ns2.o ns3.o ns4.o a.out: main.o ns1.o ns2.o ns3.o ns4.o ns1.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns1 -o $@ $< ns2.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns2 -o $@ $< ns3.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns3 -o $@ $< ns4.o: common.cpp - $(CC) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< + $(CXX) -gdwarf -c -DNAMESPACE=ns4 -o $@ $< include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py index 07d6c963eb05d..ca2d2d6b49541 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/map/TestDataFormatterStdMap.py @@ -9,6 +9,8 @@ class StdMapDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) ns = "ndk" if lldbplatformutil.target_is_android() else "" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py index 7ac79714db88d..4b0854b180e0a 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multimap/TestDataFormatterGenericMultiMap.py @@ -11,6 +11,8 @@ class GenericMultiMapDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py index 7e922fccdf7d7..e846e072777f8 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py @@ -10,6 +10,8 @@ class GenericMultiSetDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py index 1ac5e323e23e3..355f0c6edba19 100644 --- 
a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/set/TestDataFormatterGenericSet.py @@ -10,6 +10,8 @@ class GenericSetDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.namespace = "std" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py index 6a27b5d2f0780..00047e419de37 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py @@ -11,6 +11,8 @@ class StdStringDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -18,6 +20,17 @@ def setUp(self): self.main_spec = lldb.SBFileSpec("main.cpp") self.namespace = "std" + def _makeStringName(self, typedef: str, char_type: str, allocator=None): + if allocator is None: + allocator = self.namespace + "::allocator" + + if self.getDebugInfo() == "pdb": + return f"{self.namespace}::basic_string<{char_type}, std::char_traits<{char_type}>, {allocator}<{char_type}>>" + + if typedef.startswith("::"): + return self.namespace + typedef + return typedef + def do_test(self): """Test that that file and class static variables display correctly.""" (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( @@ -36,10 +49,17 @@ def cleanup(): # Execute the cleanup function during test case tear down. self.addTearDownHook(cleanup) - ns = self.namespace + string_name = self._makeStringName("::string", "char") + wstring_name = self._makeStringName("::wstring", "wchar_t") + custom_string_name = self._makeStringName( + "CustomString", "char", allocator="CustomAlloc" + ) + custom_wstring_name = self._makeStringName( + "CustomWString", "wchar_t", allocator="CustomAlloc" + ) # Check 'S' pre-assignment. - self.expect("frame variable S", substrs=['(%s::wstring) S = L"!!!!"' % ns]) + self.expect("frame variable S", substrs=[f'({wstring_name}) S = L"!!!!"']) thread.StepOver() @@ -54,34 +74,31 @@ def cleanup(): ) self.expect_expr( - "s", result_type=ns + "::wstring", result_summary='L"hello world! מזל טוב!"' + "s", result_type=wstring_name, result_summary='L"hello world! מזל טוב!"' ) - self.expect_expr( - "q", result_type=ns + "::string", result_summary='"hello world"' - ) + self.expect_expr("q", result_type=string_name, result_summary='"hello world"') self.expect_expr( "Q", - result_type=ns + "::string", + result_type=string_name, result_summary='"quite a long std::strin with lots of info inside it"', ) self.expect( "frame variable", substrs=[ - '(%s::wstring) wempty = L""' % ns, - '(%s::wstring) s = L"hello world! מזל טוב!"' % ns, - '(%s::wstring) S = L"!!!!!"' % ns, + f'({wstring_name}) wempty = L""', + f'({wstring_name}) s = L"hello world! 
מזל טוב!"', + f'({wstring_name}) S = L"!!!!!"', "(const wchar_t *) mazeltov = 0x", 'L"מזל טוב"', - '(%s::string) empty = ""' % ns, - '(%s::string) q = "hello world"' % ns, - '(%s::string) Q = "quite a long std::strin with lots of info inside it"' - % ns, - "(%s::string *) null_str = nullptr" % ns, - '(CustomString) custom_str = "hello!"', - '(CustomWString) custom_wstr = L"hello!"', + f'({string_name}) empty = ""', + f'({string_name}) q = "hello world"', + f'({string_name}) Q = "quite a long std::strin with lots of info inside it"', + f"({string_name} *) null_str = nullptr", + f'({custom_string_name}) custom_str = "hello!"', + f'({custom_wstring_name}) custom_wstr = L"hello!"', ], ) @@ -136,19 +153,26 @@ def do_test_multibyte(self): self, "Set break point at this line.", self.main_spec ) - ns = self.namespace + u16string_name = self._makeStringName("::u16string", "char16_t") + u32string_name = self._makeStringName("::u32string", "char32_t") + custom_u16string_name = self._makeStringName( + "CustomStringU16", "char16_t", allocator="CustomAlloc" + ) + custom_u32string_name = self._makeStringName( + "CustomStringU32", "char32_t", allocator="CustomAlloc" + ) self.expect( "frame variable", substrs=[ - '(%s::u16string) u16_string = u"ß水氶"' % ns, - '(%s::u16string) u16_empty = u""' % ns, - '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns, - '(%s::u32string) u32_empty = U""' % ns, - '(CustomStringU16) custom_u16 = u"ß水氶"', - '(CustomStringU16) custom_u16_empty = u""', - '(CustomStringU32) custom_u32 = U"🍄🍅🍆🍌"', - '(CustomStringU32) custom_u32_empty = U""', + f'({u16string_name}) u16_string = u"ß水氶"', + f'({u16string_name}) u16_empty = u""', + f'({u32string_name}) u32_string = U"🍄🍅🍆🍌"', + f'({u32string_name}) u32_empty = U""', + f'({custom_u16string_name}) custom_u16 = u"ß水氶"', + f'({custom_u16string_name}) custom_u16_empty = u""', + f'({custom_u32string_name}) custom_u32 = U"🍄🍅🍆🍌"', + f'({custom_u32string_name}) custom_u32_empty = U""', ], ) @@ -271,9 +295,8 @@ def do_test_embedded_null(self): self.expect( "frame variable", substrs=[ - '(%s::string) IHaveEmbeddedZeros = "a\\0b\\0c\\0d"' % ns, - '(%s::wstring) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"' - % ns, + f'({self._makeStringName("::string", "char")}) IHaveEmbeddedZeros = "a\\0b\\0c\\0d"', + f'({self._makeStringName("::wstring", "wchar_t")}) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py index 181141886c5a2..5c915b6d9f588 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string_view/TestDataFormatterStdStringView.py @@ -11,6 +11,8 @@ class StdStringViewDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -20,6 +22,12 @@ def setUp(self): "main.cpp", "// Break here to look at bad string view." 
) + def _makeStringName(self, typedef: str, char_type: str): + if self.getDebugInfo() == "pdb": + return f"std::basic_string_view<{char_type}, std::char_traits<{char_type}>>" + + return typedef + def do_test(self): """Test that that file and class static variables display correctly.""" self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) @@ -51,39 +59,47 @@ def cleanup(): # Execute the cleanup function during test case tear down. self.addTearDownHook(cleanup) - self.expect_var_path("wempty", type="std::wstring_view", summary='L""') + string_view_name = self._makeStringName("std::string_view", "char") + wstring_view_name = self._makeStringName("std::wstring_view", "wchar_t") + u16string_view_name = self._makeStringName("std::u16string_view", "char16_t") + u32string_view_name = self._makeStringName("std::u32string_view", "char32_t") + string_name = ( + "std::basic_string<char, std::char_traits<char>, std::allocator<char>>" + if self.getDebugInfo() == "pdb" + else "std::string" + ) + + self.expect_var_path("wempty", type=wstring_view_name, summary='L""') self.expect_var_path( - "s", type="std::wstring_view", summary='L"hello world! מזל טוב!"' + "s", type=wstring_view_name, summary='L"hello world! מזל טוב!"' ) - self.expect_var_path("S", type="std::wstring_view", summary='L"!!!!"') - self.expect_var_path("empty", type="std::string_view", summary='""') - self.expect_var_path("q_source", type="std::string", summary='"hello world"') - self.expect_var_path("q", type="std::string_view", summary='"hello world"') + self.expect_var_path("S", type=wstring_view_name, summary='L"!!!!"') + self.expect_var_path("empty", type=string_view_name, summary='""') + self.expect_var_path("q_source", type=string_name, summary='"hello world"') + self.expect_var_path("q", type=string_view_name, summary='"hello world"') self.expect_var_path( "Q", - type="std::string_view", + type=string_view_name, summary='"quite a long std::strin with lots of info inside it"', ) self.expect_var_path( - "IHaveEmbeddedZeros", type="std::string_view", summary='"a\\0b\\0c\\0d"' + "IHaveEmbeddedZeros", type=string_view_name, summary='"a\\0b\\0c\\0d"' ) self.expect_var_path( "IHaveEmbeddedZerosToo", - type="std::wstring_view", + type=wstring_view_name, summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) - self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') - self.expect_var_path( - "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' - ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') + self.expect_var_path("u16_string", type=u16string_view_name, summary='u"ß水氶"') + self.expect_var_path("u16_empty", type=u16string_view_name, summary='u""') + self.expect_var_path("u32_string", type=u32string_view_name, summary='U"🍄🍅🍆🍌"') + self.expect_var_path("u32_empty", type=u32string_view_name, summary='U""') # GetSummary returns None so can't be checked by expect_var_path, so we # use the str representation instead null_obj = self.frame().GetValueForVariablePath("null_str") self.assertEqual(null_obj.GetSummary(), "Summary Unavailable") - self.assertEqual(str(null_obj), "(std::string_view *) null_str = nullptr") + self.assertEqual(str(null_obj), f"({string_view_name} *) null_str = nullptr") self.runCmd("n") @@ -108,37 +124,35 @@ def cleanup(): self.expect_expr( "s", - result_type="std::wstring_view", + result_type=wstring_view_name, result_summary='L"hello world! 
מזל טוב!"', ) - self.expect_var_path("wempty", type="std::wstring_view", summary='L""') + self.expect_var_path("wempty", type=wstring_view_name, summary='L""') self.expect_var_path( - "s", type="std::wstring_view", summary='L"hello world! מזל טוב!"' + "s", type=wstring_view_name, summary='L"hello world! מזל טוב!"' ) - self.expect_var_path("S", type="std::wstring_view", summary='L"!!!!"') - self.expect_var_path("empty", type="std::string_view", summary='""') - self.expect_var_path("q_source", type="std::string", summary='"Hello world"') - self.expect_var_path("q", type="std::string_view", summary='"Hello world"') + self.expect_var_path("S", type=wstring_view_name, summary='L"!!!!"') + self.expect_var_path("empty", type=string_view_name, summary='""') + self.expect_var_path("q_source", type=string_name, summary='"Hello world"') + self.expect_var_path("q", type=string_view_name, summary='"Hello world"') self.expect_var_path( "Q", - type="std::string_view", + type=string_view_name, summary='"quite a long std::strin with lots of info inside it"', ) self.expect_var_path( - "IHaveEmbeddedZeros", type="std::string_view", summary='"a\\0b\\0c\\0d"' + "IHaveEmbeddedZeros", type=string_view_name, summary='"a\\0b\\0c\\0d"' ) self.expect_var_path( "IHaveEmbeddedZerosToo", - type="std::wstring_view", + type=wstring_view_name, summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) - self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') - self.expect_var_path( - "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' - ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') + self.expect_var_path("u16_string", type=u16string_view_name, summary='u"ß水氶"') + self.expect_var_path("u16_empty", type=u16string_view_name, summary='u""') + self.expect_var_path("u32_string", type=u32string_view_name, summary='U"🍄🍅🍆🍌"') + self.expect_var_path("u32_empty", type=u32string_view_name, summary='U""') self.runCmd("cont") self.expect( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py index b23d549fe4c18..898438729ff8f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py @@ -9,6 +9,8 @@ class TestDataFormatterStdTuple(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.line = line_number("main.cpp", "// break here") diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py index b983ee175d389..dda97945f9b23 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string/TestDataFormatterStdU8String.py @@ -11,18 +11,26 @@ class StdU8StringDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): lldbutil.run_to_source_breakpoint( self, "Set break point at this line.", lldb.SBFileSpec("main.cpp") ) + string_name = ( + "std::basic_string<char8_t, 
std::char_traits<char8_t>, std::allocator<char8_t>>" + if self.getDebugInfo() == "pdb" + else "std::u8string" + ) + self.expect( "frame variable", substrs=[ - '(std::u8string) u8_string_small = u8"🍄"', - '(std::u8string) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', - '(std::u8string) u8_empty = u8""', - '(std::u8string) u8_text = u8"ABCd"', + f'({string_name}) u8_string_small = u8"🍄"', + f'({string_name}) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', + f'({string_name}) u8_empty = u8""', + f'({string_name}) u8_text = u8"ABCd"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py index 1e35a0f6bb040..6cf72d18a864f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/u8string_view/TestDataFormatterStdU8StringView.py @@ -11,18 +11,26 @@ class StdU8StringViewDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): lldbutil.run_to_source_breakpoint( self, "Set break point at this line.", lldb.SBFileSpec("main.cpp") ) + string_view_name = ( + "std::basic_string_view<char8_t, std::char_traits<char8_t>>" + if self.getDebugInfo() == "pdb" + else "std::u8string_view" + ) + self.expect( "frame variable", substrs=[ - '(std::u8string_view) u8_string_small = u8"🍄"', - '(std::u8string_view) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', - '(std::u8string_view) u8_empty = u8""', - '(std::u8string_view) u8_text = u8"ABCd"', + f'({string_view_name}) u8_string_small = u8"🍄"', + f'({string_view_name}) u8_string = u8"❤️👍📄📁😃🧑‍🌾"', + f'({string_view_name}) u8_empty = u8""', + f'({string_view_name}) u8_text = u8"ABCd"', ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py index dd142d2be193b..f74092ca3a0b8 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py @@ -9,6 +9,8 @@ class StdVBoolDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py b/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py new file mode 100644 index 0000000000000..4380455efc452 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestConnectRemoteDetach.py @@ -0,0 +1,67 @@ +""" +Test that ConnectRemote sets ShouldDetach flag correctly. + +When connecting to a remote process that stops after connection, +the process should be marked for detach (not kill) on destruction. 
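+ +The mock server below answers just enough packets (thread info, qC, halt +reason) for LLDB to see an already-stopped process; its D and k responders +then reveal which packet the teardown path actually sends.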
+""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase +from lldbsuite.test import lldbutil + + +class TestConnectRemoteDetach(GDBRemoteTestBase): + """Test that ConnectRemote properly sets ShouldDetach flag.""" + + class StoppedResponder(MockGDBServerResponder): + """A responder that returns a stopped process.""" + + def qfThreadInfo(self): + return "m1" + + def qsThreadInfo(self): + return "l" + + def qC(self): + return "QC1" + + def haltReason(self): + # Return that we're stopped + return "T05thread:1;" + + def cont(self): + # Stay stopped + return "T05thread:1;" + + def D(self): + # Detach packet: this is what we want to verify gets called. + return "OK" + + def k(self): + # Kill packet: this is what we want to verify doesn't get called. + raise RuntimeError("should not receive k(ill) packet") + + def test_connect_remote_sets_detach(self): + """Test that ConnectRemote to a stopped process sets ShouldDetach.""" + self.server.responder = self.StoppedResponder() + + target = self.createTarget("a.yaml") + process = self.connect(target) + + # Wait for the process to be in stopped state after connecting. + # When ConnectRemote connects to a remote process that is stopped, + # it should call SetShouldDetach(true) before CompleteAttach(). + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) + + # Now destroy the process. Because ShouldDetach was set to true + # during ConnectRemote, this should send a 'D' (detach) packet + # rather than a 'k' (kill) packet when the process is destroyed. + process.Destroy() + + # Verify that the (D)etach packet was sent. + self.assertPacketLogReceived(["D"]) diff --git a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py index 7fd2ff4229004..5fd2b767a6237 100644 --- a/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py +++ b/lldb/test/API/functionalities/multiple-slides/TestMultipleSlides.py @@ -12,10 +12,6 @@ class MultipleSlidesTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - # The intermediate object main.o is compiled without debug info, but - # a.out is linked with `-gdwarf` on Windows. This creates a PDB. - # However, in the native PDB plugin, the symbols don't have a size. - @expectedFailureWindows def test_mulitple_slides(self): """Test that a binary can be slid multiple times correctly.""" self.build() @@ -33,10 +29,13 @@ def test_mulitple_slides(self): first_sym.GetEndAddress().GetOffset() - first_sym.GetStartAddress().GetOffset() ) + int_size = target.FindFirstType("int").GetByteSize() + self.assertGreaterEqual(first_size, 2048 * int_size) second_size = ( second_sym.GetEndAddress().GetOffset() - second_sym.GetStartAddress().GetOffset() ) + self.assertGreaterEqual(second_size, 2048 * int_size) # View the first element of `first` and `second` while # they have no load address set. 
diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list index 5900fe8c35069..d8caa20ad3550 100644 --- a/lldb/test/API/functionalities/thread/step_until/function.list +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -1 +1,4 @@ -!call_me +v1 +f call_me +c 0 +c 1 diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile new file mode 100644 index 0000000000000..4698eaa815b83 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/Makefile @@ -0,0 +1,6 @@ +CXX_SOURCES := main.cpp + +# Build with C++ exceptions enabled +CXXFLAGS := -g -O0 -fexceptions + +include Makefile.rules diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py new file mode 100644 index 0000000000000..e03234d1b5077 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/TestLibUnwindRetInjection.py @@ -0,0 +1,177 @@ +""" +Test that libunwind correctly injects 'ret' instructions to rebalance execution flow +when unwinding C++ exceptions. This is important for Apple Processor Trace analysis. +""" + +import lldb +import os +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +from lldbsuite.test import configuration + + +class LibunwindRetInjectionTestCase(TestBase): + @skipIf(archs=no_match(["arm64", "arm64e", "aarch64"])) + @skipUnlessDarwin + @skipIfOutOfTreeLibunwind + def test_ret_injection_on_exception_unwind(self): + """Test that __libunwind_Registers_arm64_jumpto receives correct walkedFrames count and injects the right number of ret instructions.""" + self.build() + + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + # Find the just-built libunwind, not the system one. + # llvm_tools_dir is typically <build>/bin, so lib is a sibling. + self.assertIsNotNone( + configuration.llvm_tools_dir, + "llvm_tools_dir must be set to find in-tree libunwind", + ) + + llvm_lib_dir = os.path.join( + os.path.dirname(configuration.llvm_tools_dir), "lib" + ) + + # Find the libunwind library (platform-agnostic). + libunwind_path = None + for filename in os.listdir(llvm_lib_dir): + if filename.startswith("libunwind.") or filename.startswith("unwind."): + libunwind_path = os.path.join(llvm_lib_dir, filename) + break + + self.assertIsNotNone( + libunwind_path, f"Could not find libunwind in {llvm_lib_dir}" + ) + + # Set breakpoint in __libunwind_Registers_arm64_jumpto. + # This is the function that performs the actual jump and ret injection. + bp = target.BreakpointCreateByName("__libunwind_Registers_arm64_jumpto") + self.assertTrue(bp.IsValid()) + self.assertGreater(bp.GetNumLocations(), 0) + + # Set up DYLD_INSERT_LIBRARIES to use the just-built libunwind. + launch_info = lldb.SBLaunchInfo(None) + env = target.GetEnvironment() + env.Set("DYLD_INSERT_LIBRARIES", libunwind_path, True) + launch_info.SetEnvironment(env, False) + + # Launch the process with our custom libunwind. 
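+ # (Editor's note: DYLD_INSERT_LIBRARIES makes dyld load the just-built + # libunwind ahead of the system copy; this presumably works because the + # test binary is a local, non-hardened build, as hardened or SIP-protected + # binaries ignore the variable.)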
+ error = lldb.SBError() + process = target.Launch(launch_info, error) + self.assertSuccess( + error, f"Failed to launch process with libunwind at {libunwind_path}" + ) + self.assertTrue(process, PROCESS_IS_VALID) + + # We should hit the breakpoint in __libunwind_Registers_arm64_jumpto + # during the exception unwinding phase 2. + threads = lldbutil.get_threads_stopped_at_breakpoint(process, bp) + self.assertEqual(len(threads), 1, "Should have stopped at breakpoint") + + thread = threads[0] + frame = thread.GetFrameAtIndex(0) + + # Verify we're in __libunwind_Registers_arm64_jumpto. + function_name = frame.GetFunctionName() + self.assertTrue( + "__libunwind_Registers_arm64_jumpto" in function_name, + f"Expected to be in __libunwind_Registers_arm64_jumpto, got {function_name}", + ) + + # On ARM64, the walkedFrames parameter should be in register x1 (second parameter). + # According to the ARM64 calling convention, integer arguments are passed in x0-x7. + # x0 = Registers_arm64* pointer. + # x1 = unsigned walkedFrames. + error = lldb.SBError() + x1_value = frame.register["x1"].GetValueAsUnsigned(error) + self.assertSuccess(error, "Failed to read x1 register") + + # According to the code in UnwindCursor.hpp, the walkedFrames value represents: + # 1. The number of frames walked in unwind_phase2 to reach the landing pad. + # 2. Plus _EXTRA_LIBUNWIND_FRAMES_WALKED = 5 - 1 = 4 additional libunwind frames. + # + # From the comment in the code: + # frame #0: __libunwind_Registers_arm64_jumpto + # frame #1: Registers_arm64::returnto + # frame #2: UnwindCursor::jumpto + # frame #3: __unw_resume + # frame #4: __unw_resume_with_frames_walked + # frame #5: unwind_phase2 + # + # Since __libunwind_Registers_arm64_jumpto returns to the landing pad, + # we subtract 1, so _EXTRA_LIBUNWIND_FRAMES_WALKED = 4. + # + # For our test program: + # - unwind_phase2 starts walking (frame 0 counted here). + # - Walks through: func_d (throw site), func_c, func_b, func_a. + # - Finds landing pad in main. + # That's approximately 4-5 frames from the user code. + # Plus the 4 extra libunwind frames. + # + # So we expect x1 to be roughly 8-10. + expected_min_frames = 8 + expected_max_frames = 13 # Allow some variation for libc++abi frames. + + self.assertGreaterEqual( + x1_value, + expected_min_frames, + f"walkedFrames (x1) should be >= {expected_min_frames}, got {x1_value}. " + "This is the number of 'ret' instructions that will be executed.", + ) + + self.assertLessEqual( + x1_value, + expected_max_frames, + f"walkedFrames (x1) should be <= {expected_max_frames}, got {x1_value}. " + "Value seems too high.", + ) + + # Now step through the ret injection loop and count the actual number of 'ret' executions. + # The loop injects exactly x1_value ret instructions before continuing with register restoration. + # We step until we hit the first 'ldp' instruction (register restoration starts with 'ldp x2, x3, [x0, #0x010]'). + ret_executed_count = 0 + max_steps = 100 # Safety limit to prevent infinite loops. + + for step_count in range(max_steps): + # Get current instruction. + pc = frame.GetPC() + inst = process.ReadMemory(pc, 4, lldb.SBError()) + + # Disassemble current instruction. + current_inst = target.GetInstructions(lldb.SBAddress(pc, target), inst)[0] + mnemonic = current_inst.GetMnemonic(target) + operands = current_inst.GetOperands(target) + + # Check if we've reached the register restoration part (first ldp after the loop). + if mnemonic == "ldp": + # We've exited the ret injection loop. 
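+ # (Editor's note: at this point ret_executed_count holds every 'ret' we + # single-stepped; the assertion after the loop compares it against the + # walkedFrames value read from x1 above.)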
+ break + + # Count 'ret' instructions that get executed. + if mnemonic == "ret": + self.assertEqual(operands, "x16") + ret_executed_count += 1 + + # Step one instruction. + thread.StepInstruction(False) # False = do not step over; single-instruction step. + + # Update frame reference. + frame = thread.GetFrameAtIndex(0) + + # Verify we didn't hit the safety limit. + self.assertLess( + step_count, + max_steps - 1, + f"Stepped {max_steps} times without reaching 'ldp' instruction. Something is wrong.", + ) + + # The number of executed 'ret' instructions should match x1_value. + # According to the implementation, the loop executes exactly x1_value times. + self.assertEqual( + ret_executed_count, + x1_value, + f"Expected {x1_value} 'ret' instructions to be executed (matching x1 register), " + f"but counted {ret_executed_count} executed 'ret' instructions.", + ) diff --git a/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp new file mode 100644 index 0000000000000..00685e4d6b137 --- /dev/null +++ b/lldb/test/API/functionalities/unwind/libunwind_ret_injection/main.cpp @@ -0,0 +1,45 @@ +// Test program to verify the libunwind ret injection feature for execution +// flow rebalancing. +// +// This test creates a multi-frame call stack and throws a C++ exception to +// trigger libunwind's two-phase exception handling. The test verifies that +// libunwind correctly injects the right number of 'ret' instructions to +// rebalance the execution flow when returning to the landing pad, which is +// important for Apple Processor Trace analysis. + +#include <cstdio> +#include <exception> +#include <stdexcept> + +// Marker functions with noinline to ensure they appear in the stack. +static void __attribute__((noinline)) func_d() { + printf("In func_d, about to throw exception\n"); + throw std::runtime_error("test exception"); +} + +static void __attribute__((noinline)) func_c() { + printf("In func_c\n"); + func_d(); +} + +static void __attribute__((noinline)) func_b() { + printf("In func_b\n"); + func_c(); +} + +static void __attribute__((noinline)) func_a() { + printf("In func_a\n"); + func_b(); +} + +int main(int argc, char *argv[]) { + try { + printf("In main, about to call func_a\n"); + func_a(); + printf("ERROR: Should not reach here\n"); + return 1; + } catch (const std::exception &e) { + printf("Caught exception in main: %s\n", e.what()); + return 0; + } +} diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "21.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py index 245313d683774..75f6651a2845a 100644 --- a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py +++ b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py @@ -12,52 +12,6 @@
class FoundationDisassembleTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfAsan - def test_foundation_disasm(self): - """Do 'disassemble -n func' on each and every 'Code' symbol entry from the Foundation.framework.""" - self.build() - - # Enable synchronous mode - self.dbg.SetAsync(False) - - # Create a target by the debugger. - target = self.dbg.CreateTarget(self.getBuildArtifact("a.out")) - self.assertTrue(target, VALID_TARGET) - - # Now launch the process, and do not stop at entry point. - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - foundation_framework = None - for module in target.modules: - if module.file.basename == "Foundation": - foundation_framework = module.file.fullpath - break - - self.assertIsNotNone(foundation_framework, "Foundation.framework path located") - self.runCmd("image dump symtab '%s'" % foundation_framework) - raw_output = self.res.GetOutput() - # Now, grab every 'Code' symbol and feed it into the command: - # 'disassemble -n func'. - # - # The symbol name is on the last column and trails the flag column which - # looks like '0xhhhhhhhh', i.e., 8 hexadecimal digits. - codeRE = re.compile( - r""" - \ Code\ {9} # ' Code' followed by 9 SPCs, - .* # the wildcard chars, - 0x[0-9a-f]{8} # the flag column, and - \ (.+)$ # finally the function symbol. - """, - re.VERBOSE, - ) - for line in raw_output.split(os.linesep): - match = codeRE.search(line) - if match: - func = match.group(1) - self.runCmd('image lookup -s "%s"' % func) - self.runCmd('disassemble --force -n "%s"' % func) - @skipIfAsan def test_simple_disasm(self): """Test the lldb 'disassemble' command""" diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-compile-error/Makefile b/lldb/test/API/lang/objc/modules-compile-error/Makefile deleted file mode 100644 index e031aa0bbbb8d..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -OBJC_SOURCES := main.m - -CFLAGS_EXTRAS = $(MANDATORY_MODULE_BUILD_CFLAGS) -I$(BUILDDIR) -DONLY_CLANG=1 - -include Makefile.rules diff --git a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py deleted file mode 100644 index 36e302be2525b..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py +++ /dev/null @@ -1,28 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class TestCase(TestBase): - @skipIf(compiler="clang", compiler_version=["<", "11.0"]) - def test(self): - self.build() - lldbutil.run_to_source_breakpoint( - self, "// break here", lldb.SBFileSpec("main.m") - ) - - # Try importing our custom module. 
This will fail as LLDB won't define - # the CLANG_ONLY define when it compiles the module for the expression - # evaluator. - # Check that the error message shows file/line/column, prints the relevant - # line from the source code and mentions the module that failed to build. - self.expect( - "expr @import LLDBTestModule", - error=True, - substrs=[ - "module.h:4:1: error: use of undeclared identifier 'syntax_error_for_lldb_to_find'", - "syntax_error_for_lldb_to_find // comment that tests source printing", - "could not build module 'LLDBTestModule'", - ], - ) diff --git a/lldb/test/API/lang/objc/modules-compile-error/main.m b/lldb/test/API/lang/objc/modules-compile-error/main.m deleted file mode 100644 index 35259dd287b01..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/main.m +++ /dev/null @@ -1,5 +0,0 @@ -@import LLDBTestModule; - -int main() { - return foo(); // break here -} diff --git a/lldb/test/API/lang/objc/modules-compile-error/module.h b/lldb/test/API/lang/objc/modules-compile-error/module.h deleted file mode 100644 index 2edd13b0832db..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/module.h +++ /dev/null @@ -1,5 +0,0 @@ -int foo() { return 123; } - -#ifndef ONLY_CLANG -syntax_error_for_lldb_to_find // comment that tests source printing -#endif diff --git a/lldb/test/API/lang/objc/modules-compile-error/module.modulemap b/lldb/test/API/lang/objc/modules-compile-error/module.modulemap deleted file mode 100644 index 3d44faf3e9080..0000000000000 --- a/lldb/test/API/lang/objc/modules-compile-error/module.modulemap +++ /dev/null @@ -1 +0,0 @@ -module LLDBTestModule { header "module.h" export * } diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() diff --git a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py index 6cbb9ddec264d..9fb2bea93e9c2 100644 --- a/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py +++ b/lldb/test/API/lang/objc/real-definition/TestRealDefinition.py @@ -27,13 +27,11 @@ def test_frame_var_after_stop_at_interface(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) def test_frame_var_after_stop_at_implementation(self): @@ -54,11 +52,9 @@ def test_frame_var_after_stop_at_implementation(self): # Run at stop at main lldbutil.check_breakpoint(self, bpno=1, expected_hit_count=1) - self.runCmd("settings set target.prefer-dynamic-value no-dynamic-values") - # This should display correctly. 
self.expect( "frame variable foo->_bar->_hidden_ivar", VARIABLES_DISPLAYED_CORRECTLY, - substrs=["(NSString *)", "foo->_bar->_hidden_ivar = 0x"], + substrs=["foo->_bar->_hidden_ivar = 0x"], ) diff --git a/lldb/test/API/lua_api/TestThreadAPI.lua b/lldb/test/API/lua_api/TestThreadAPI.lua new file mode 100644 index 0000000000000..5a38d0ba9192f --- /dev/null +++ b/lldb/test/API/lua_api/TestThreadAPI.lua @@ -0,0 +1,25 @@ +_T = require('lua_lldb_test').create_test('TestThreadAPI') + +function _T:TestGetStopDescription() + local target = self:create_target() + local breakpoint = target:BreakpointCreateByName("main", "a.out") + assertTrue(breakpoint:IsValid() and breakpoint:GetNumLocations() == 1) + + local process = target:LaunchSimple({ 'arg1', 'arg2' }, nil, nil) + local thread = get_stopped_thread(process, lldb.eStopReasonBreakpoint) + assertNotNil(thread) + assertTrue(thread:IsValid()) + + assertEqual("breakpoint", thread:GetStopDescription(string.len("breakpoint") + 1)) + assertEqual("break", thread:GetStopDescription(string.len("break") + 1)) + assertEqual("b", thread:GetStopDescription(string.len("b") + 1)) + assertEqual("breakpoint 1.1", thread:GetStopDescription(string.len("breakpoint 1.1") + 100)) + + -- Test stream variation + local stream = lldb.SBStream() + assertTrue(thread:GetStopDescription(stream)) + assertNotNil(stream) + assertEqual("breakpoint 1.1", stream:GetData()) +end + +os.exit(_T:run()) diff --git a/lldb/test/API/python_api/default-constructor/sb_thread.py b/lldb/test/API/python_api/default-constructor/sb_thread.py index 34eb3db852c38..4252fa0321fff 100644 --- a/lldb/test/API/python_api/default-constructor/sb_thread.py +++ b/lldb/test/API/python_api/default-constructor/sb_thread.py @@ -10,6 +10,7 @@ def fuzz_obj(obj): obj.GetStopReasonDataCount() obj.GetStopReasonDataAtIndex(100) obj.GetStopDescription(256) + obj.GetStopDescription(lldb.SBStream()) obj.GetThreadID() obj.GetIndexID() obj.GetName() diff --git a/lldb/test/API/python_api/frame_list/Makefile b/lldb/test/API/python_api/frame_list/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/python_api/frame_list/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/python_api/frame_list/TestSBFrameList.py b/lldb/test/API/python_api/frame_list/TestSBFrameList.py new file mode 100644 index 0000000000000..f348ce492e547 --- /dev/null +++ b/lldb/test/API/python_api/frame_list/TestSBFrameList.py @@ -0,0 +1,194 @@ +""" +Test SBFrameList API. 
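+ +The cases below exercise SBThread.GetFrames() plus the Python sequence +behavior of the returned list (iteration, len() and indexing), presumably +provided by the SWIG-generated bindings.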
+""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class FrameListAPITestCase(TestBase): + def test_frame_list_api(self): + """Test SBThread.GetFrames() returns a valid SBFrameList.""" + self.build() + self.frame_list_api() + + def test_frame_list_iterator(self): + """Test SBFrameList iterator functionality.""" + self.build() + self.frame_list_iterator() + + def test_frame_list_indexing(self): + """Test SBFrameList indexing and length.""" + self.build() + self.frame_list_indexing() + + def test_frame_list_get_thread(self): + """Test SBFrameList.GetThread() returns correct thread.""" + self.build() + self.frame_list_get_thread() + + def setUp(self): + TestBase.setUp(self) + self.main_source = "main.cpp" + + def frame_list_api(self): + """Test SBThread.GetFrames() returns a valid SBFrameList.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + # Test GetFrames() returns a valid SBFrameList + frame_list = thread.GetFrames() + self.assertTrue(frame_list.IsValid(), "Frame list should be valid") + self.assertGreater( + frame_list.GetSize(), 0, "Frame list should have at least one frame" + ) + + # Verify frame list size matches thread frame count + self.assertEqual( + frame_list.GetSize(), + thread.GetNumFrames(), + "Frame list size should match thread frame count", + ) + + # Verify frames are the same + for i in range(frame_list.GetSize()): + frame_from_list = frame_list.GetFrameAtIndex(i) + frame_from_thread = thread.GetFrameAtIndex(i) + self.assertTrue( + frame_from_list.IsValid(), f"Frame {i} from list should be valid" + ) + self.assertEqual( + frame_from_list.GetPC(), + frame_from_thread.GetPC(), + f"Frame {i} PC should match", + ) + + def frame_list_iterator(self): + """Test SBFrameList iterator functionality.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + + # Test iteration + frame_count = 0 + for frame in frame_list: + self.assertTrue(frame.IsValid(), "Each frame should be valid") + frame_count += 1 + + self.assertEqual( + frame_count, + frame_list.GetSize(), + "Iterator should visit all frames", + ) + + # Test that we can iterate multiple times + second_count = 0 + for frame in frame_list: + second_count += 1 + + self.assertEqual( + frame_count, second_count, "Should be able to iterate multiple times" + ) + + def frame_list_indexing(self): + """Test SBFrameList indexing and length.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + + # Test len() + self.assertEqual( + len(frame_list), frame_list.GetSize(), "len() should return frame count" + ) + + # Test positive indexing + first_frame = frame_list[0] + self.assertTrue(first_frame.IsValid(), "First frame should be valid") + self.assertEqual( + first_frame.GetPC(), + 
thread.GetFrameAtIndex(0).GetPC(), + "Indexed frame should match", + ) + + # Test negative indexing + if len(frame_list) > 0: + last_frame = frame_list[-1] + self.assertTrue(last_frame.IsValid(), "Last frame should be valid") + self.assertEqual( + last_frame.GetPC(), + thread.GetFrameAtIndex(len(frame_list) - 1).GetPC(), + "Negative indexing should work", + ) + + # Test out of bounds returns None + out_of_bounds = frame_list[10000] + self.assertIsNone(out_of_bounds, "Out of bounds index should return None") + + # Test bool conversion + self.assertTrue(bool(frame_list), "Non-empty frame list should be truthy") + + # Test Clear() + frame_list.Clear() + # Note: Clear() clears the underlying StackFrameList cache, + # but the frame list object itself should still be valid + self.assertTrue( + frame_list.IsValid(), "Frame list should still be valid after Clear()" + ) + + def frame_list_get_thread(self): + """Test SBFrameList.GetThread() returns correct thread.""" + exe = self.getBuildArtifact("a.out") + + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Set break point at this line", lldb.SBFileSpec(self.main_source) + ) + + self.assertTrue( + thread.IsValid(), "There should be a thread stopped due to breakpoint" + ) + + frame_list = thread.GetFrames() + self.assertTrue(frame_list.IsValid(), "Frame list should be valid") + + # Test GetThread() returns the correct thread + thread_from_list = frame_list.GetThread() + self.assertTrue( + thread_from_list.IsValid(), "Thread from frame list should be valid" + ) + self.assertEqual( + thread_from_list.GetThreadID(), + thread.GetThreadID(), + "Frame list should return the correct thread", + ) + + # Verify it's the same thread object + self.assertEqual( + thread_from_list.GetProcess().GetProcessID(), + thread.GetProcess().GetProcessID(), + "Thread should belong to same process", + ) diff --git a/lldb/test/API/python_api/frame_list/main.cpp b/lldb/test/API/python_api/frame_list/main.cpp new file mode 100644 index 0000000000000..e39944654a23e --- /dev/null +++ b/lldb/test/API/python_api/frame_list/main.cpp @@ -0,0 +1,22 @@ +#include <stdio.h> + +int c(int val) { + // Set break point at this line + return val + 3; +} + +int b(int val) { + int result = c(val); + return result; +} + +int a(int val) { + int result = b(val); + return result; +} + +int main() { + int result = a(1); + printf("Result: %d\n", result); + return 0; +} diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py index 5583434a742a9..acad7583eec19 100644 --- a/lldb/test/API/python_api/thread/TestThreadAPI.py +++ b/lldb/test/API/python_api/thread/TestThreadAPI.py @@ -138,6 +138,11 @@ def get_stop_description(self): "breakpoint 1.1", thread.GetStopDescription(len("breakpoint 1.1") + 100) ) + # Test the stream variation + stream = lldb.SBStream() + self.assertTrue(thread.GetStopDescription(stream)) + self.assertEqual("breakpoint 1.1", stream.GetData()) + def step_out_of_malloc_into_function_b(self, exe_name): """Test Python SBThread.StepOut() API to step out of a malloc call where the call site is at function b().""" exe = self.getBuildArtifact(exe_name) diff --git a/lldb/test/API/python_api/unified_section_list/Makefile b/lldb/test/API/python_api/unified_section_list/Makefile new file mode 100644 index 0000000000000..431e716ab8f69 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/Makefile @@ -0,0 +1,5 @@ +CXX_SOURCES := main.cpp + +SPLIT_DEBUG_SYMBOLS := YES + +include Makefile.rules diff --git 
a/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py new file mode 100644 index 0000000000000..93b23d0ba81cb --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/TestModuleUnifiedSectionList.py @@ -0,0 +1,285 @@ +""" +Test Unified Section List merging. +""" + +import os +import shutil + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +from lldbsuite.test.lldbutil import symbol_type_to_str + + +class ModuleUnifiedSectionList(TestBase): + @skipUnlessPlatform(["linux", "freebsd", "netbsd"]) + def test_unified_section_list(self): + self.build() + exe = self.getBuildArtifact("a.out") + debug_info = self.getBuildArtifact("a.out.debug") + new_dir = os.path.join(os.path.dirname(debug_info), "new_dir") + os.mkdir(new_dir) + renamed_debug_info = os.path.join(new_dir, "renamed.debug") + os.rename(debug_info, renamed_debug_info) + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + self.assertGreater(target.GetNumModules(), 0) + + main_exe_module = target.GetModuleAtIndex(0) + eh_frame = main_exe_module.FindSection(".eh_frame") + self.assertTrue(eh_frame.IsValid()) + self.assertGreater(eh_frame.size, 0) + + # Should be stripped from the main executable. + debug_info_section = main_exe_module.FindSection(".debug_info") + self.assertFalse(debug_info_section.IsValid()) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {renamed_debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Should be stripped from the .debug file but still present in the main executable. + main_exe_module = target.GetModuleAtIndex(0) + eh_frame = main_exe_module.FindSection(".eh_frame") + self.assertTrue(eh_frame.IsValid()) + self.assertGreater(eh_frame.size, 0) + + # Should be unified and both sections should have contents.
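+ # (Editor's note: "target symbols add" merges the symbol file's sections + # into the module's unified section list, so .debug_info, absent from the + # stripped a.out, should now resolve, while .eh_frame still comes from the + # executable.)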
+        debug_info_section = main_exe_module.FindSection(".debug_info")
+        self.assertTrue(debug_info_section.IsValid())
+        self.assertGreater(debug_info_section.file_size, 0)
+
+    def test_unified_section_list_overwrite_larger_section(self):
+        """
+        Test merging an ELF file with another ELF file where all the new sections
+        are bigger, validating that we overwrite the SHT_NOBITS .comment with the
+        new SHT_PROGBITS section and the smaller .text with the larger .text.
+        """
+        exe = self.getBuildArtifact("a.out")
+        self.yaml2obj("main.yaml", exe)
+
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target, VALID_TARGET)
+        main_exe_module = target.GetModuleAtIndex(0)
+
+        # First we verify our .text section is the expected BEC0FFEE
+        text_before_merge = main_exe_module.FindSection(".text")
+        self.assertTrue(text_before_merge.IsValid())
+        error = lldb.SBError()
+        section_content = text_before_merge.data.ReadRawData(
+            error, 0, text_before_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content, bytes.fromhex("BEC0FFEE"))
+
+        # .comment in main.yaml should be SHT_NOBITS, and size 0
+        comment_before_merge = main_exe_module.FindSection(".comment")
+        self.assertTrue(comment_before_merge.IsValid())
+        self.assertEqual(comment_before_merge.data.size, 0)
+
+        # yamlize the main.largertext.yaml and force symbol loading
+        debug_info = self.getBuildArtifact("a.out.debug")
+        self.yaml2obj("main.largertext.yaml", debug_info)
+
+        ci = self.dbg.GetCommandInterpreter()
+        res = lldb.SBCommandReturnObject()
+        ci.HandleCommand(f"target symbols add {debug_info}", res)
+        self.assertTrue(res.Succeeded())
+
+        # verify we took the larger .text section
+        main_exe_module_after_merge = target.GetModuleAtIndex(0)
+        text_after_merge = main_exe_module_after_merge.FindSection(".text")
+        self.assertTrue(text_after_merge.IsValid())
+        self.assertGreater(text_after_merge.data.size, text_before_merge.data.size)
+        section_content_after_merge = text_after_merge.data.ReadRawData(
+            error, 0, text_after_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB"))
+
+        # in main.largertext.yaml .comment is not SHT_NOBITS, so we should see
+        # a size > 0 and contents equal to BAADF00D
+        comment_after_merge = main_exe_module_after_merge.FindSection(".comment")
+        self.assertTrue(comment_after_merge.IsValid())
+        comment_content_after_merge = comment_after_merge.data.ReadRawData(
+            error, 0, comment_after_merge.data.size
+        )
+
+        self.assertTrue(error.Success())
+        self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00D"))
+
+    def test_unified_section_list_overwrite_smaller_section(self):
+        """
+        Test merging an ELF file with another ELF file where all the existing sections
+        are bigger, validating that we don't overwrite .comment with the SHT_NOBITS
+        section or the larger .text with the smaller one.
+ """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.largertext.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # Same as above test but inverse, verify our larger .text section + # is the expected BEC0FFEE palindrome + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + # Comment is SHT_PROGBITS on the larger yaml and should remain + # the same after merge. + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + comment_content = comment_before_merge.data.ReadRawData( + error, 0, comment_before_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content, bytes.fromhex("BAADF00D")) + + debug_info = self.getBuildArtifact("a.out.debug") + self.yaml2obj("main.yaml", debug_info) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Verify we didn't replace the sections after merge.s + main_exe_module_after_merge = target.GetModuleAtIndex(0) + text_after_merge = main_exe_module_after_merge.FindSection(".text") + self.assertTrue(text_after_merge.IsValid()) + self.assertEqual(text_after_merge.data.size, text_before_merge.data.size) + section_content_after_merge = text_after_merge.data.ReadRawData( + error, 0, text_after_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + comment_after_merge = main_exe_module_after_merge.FindSection(".comment") + self.assertTrue(comment_after_merge.IsValid()) + comment_content_after_merge = comment_after_merge.data.ReadRawData( + error, 0, comment_after_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00D")) + + def test_unified_section_list_overwrite_mixed_merge(self): + """ + Test the merging of an ELF file with another ELF File where the lhs has a larger .comment section + and the RHS has a larger .text section. 
+ """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.largercomment.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # Verify we have the expected smaller BEC0FFEE + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEE")) + + # Verify we have the larger palindromic comment + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + comment_content = comment_before_merge.data.ReadRawData( + error, 0, comment_before_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content, bytes.fromhex("BAADF00DF00DBAAD")) + + debug_info = self.getBuildArtifact("a.out.debug") + self.yaml2obj("main.largertext.yaml", debug_info) + + ci = self.dbg.GetCommandInterpreter() + res = lldb.SBCommandReturnObject() + ci.HandleCommand(f"target symbols add {debug_info}", res) + self.assertTrue(res.Succeeded()) + + # Verify we replaced .text + main_exe_module_after_merge = target.GetModuleAtIndex(0) + text_after_merge = main_exe_module_after_merge.FindSection(".text") + self.assertTrue(text_after_merge.IsValid()) + section_content_after_merge = text_after_merge.data.ReadRawData( + error, 0, text_after_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEEEEFF0CEB")) + + # Verify .comment is still the same. + comment_after_merge = main_exe_module_after_merge.FindSection(".comment") + self.assertTrue(comment_after_merge.IsValid()) + comment_content_after_merge = comment_after_merge.data.ReadRawData( + error, 0, comment_after_merge.data.size + ) + + self.assertTrue(error.Success()) + self.assertEqual(comment_content_after_merge, bytes.fromhex("BAADF00DF00DBAAD")) + + def test_unified_section_list_overwrite_equal_size(self): + """ + Test the merging of an ELF file with an ELF file with sections of the same size with different values + .text + """ + exe = self.getBuildArtifact("a.out") + self.yaml2obj("main.yaml", exe) + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + main_exe_module = target.GetModuleAtIndex(0) + + # First we verify out .text section is the expected BEC0FFEE + text_before_merge = main_exe_module.FindSection(".text") + self.assertTrue(text_before_merge.IsValid()) + error = lldb.SBError() + section_content = text_before_merge.data.ReadRawData( + error, 0, text_before_merge.data.size + ) + self.assertTrue(error.Success()) + self.assertEqual(section_content, bytes.fromhex("BEC0FFEE")) + + # .comment in main.yaml should be SHT_NOBITS, and size 0 + comment_before_merge = main_exe_module.FindSection(".comment") + self.assertTrue(comment_before_merge.IsValid()) + self.assertEqual(comment_before_merge.data.size, 0) + + # yamlize the main with the .text reversed from BEC0FFEE + # to EEFF0CEB. 
+        debug_info = self.getBuildArtifact("a.out.debug")
+        self.yaml2obj("main.reversedtext.yaml", debug_info)
+
+        ci = self.dbg.GetCommandInterpreter()
+        res = lldb.SBCommandReturnObject()
+        ci.HandleCommand(f"target symbols add {debug_info}", res)
+        self.assertTrue(res.Succeeded())
+
+        # verify .text did not change
+        main_exe_module_after_merge = target.GetModuleAtIndex(0)
+        text_after_merge = main_exe_module_after_merge.FindSection(".text")
+        self.assertTrue(text_after_merge.IsValid())
+        section_content_after_merge = text_after_merge.data.ReadRawData(
+            error, 0, text_after_merge.data.size
+        )
+        self.assertTrue(error.Success())
+        self.assertEqual(section_content_after_merge, bytes.fromhex("BEC0FFEE"))
+
+        # verify .comment did not change
+        comment_after_merge = main_exe_module_after_merge.FindSection(".comment")
+        self.assertTrue(comment_after_merge.IsValid())
+        self.assertEqual(comment_after_merge.data.size, 0)
diff --git a/lldb/test/API/python_api/unified_section_list/main.cpp b/lldb/test/API/python_api/unified_section_list/main.cpp
new file mode 100644
index 0000000000000..45fd52eeeb303
--- /dev/null
+++ b/lldb/test/API/python_api/unified_section_list/main.cpp
@@ -0,0 +1,3 @@
+#include <stdio.h>
+
+int main() { printf("Hello World\n"); }
diff --git a/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml b/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml
new file mode 100644
index 0000000000000..f7860063e151a
--- /dev/null
+++ b/lldb/test/API/python_api/unified_section_list/main.largercomment.yaml
@@ -0,0 +1,46 @@
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_DYN
+  Machine: EM_X86_64
+  Entry: 0x1040
+ProgramHeaders:
+  - Type: PT_PHDR
+    Flags: [ PF_R ]
+    VAddr: 0x40
+    Align: 0x8
+    Offset: 0x40
+  - Type: PT_LOAD
+    Flags: [ PF_R ]
+    FirstSec: .text
+    LastSec: .fini
+    Align: 0x1000
+    Offset: 0x0
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x1040
+    AddressAlign: 0x10
+    Content: BEC0FFEE
+  - Name: .fini
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x1140
+    AddressAlign: 0x4
+    Content: DEADBEEF
+  - Name: .comment
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC ]
+    Address: 0x3140
+    AddressAlign: 0x4
+    Content: BAADF00DF00DBAAD
+Symbols:
+  - Name: main
+    Type: STT_FUNC
+    Section: .text
+    Binding: STB_GLOBAL
+    Value: 0x1130
+    Size: 0xF
+...
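The before/after assertions in the tests above all read section contents through the same FindSection/SBError/ReadRawData pattern. As a hedged sketch, that pattern can be factored into a small helper; the section_bytes name is ours, not part of the suite, but every call below mirrors one used verbatim in the tests:

    import lldb

    def section_bytes(module, name):
        # `module` is an lldb.SBModule, e.g. target.GetModuleAtIndex(0).
        # Returns the raw contents of the named section, or None on failure.
        section = module.FindSection(name)
        if not section.IsValid():
            return None
        error = lldb.SBError()
        data = section.data.ReadRawData(error, 0, section.data.size)
        return data if error.Success() else None

With it, the first .text assertion would read: self.assertEqual(section_bytes(main_exe_module, ".text"), bytes.fromhex("BEC0FFEE")).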
diff --git a/lldb/test/API/python_api/unified_section_list/main.largertext.yaml b/lldb/test/API/python_api/unified_section_list/main.largertext.yaml new file mode 100644 index 0000000000000..6450e6769db69 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.largertext.yaml @@ -0,0 +1,46 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEEEEFF0CEB + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 + Content: BAADF00D +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... diff --git a/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml b/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml new file mode 100644 index 0000000000000..57206666046a4 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.reversedtext.yaml @@ -0,0 +1,45 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEE + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... diff --git a/lldb/test/API/python_api/unified_section_list/main.yaml b/lldb/test/API/python_api/unified_section_list/main.yaml new file mode 100644 index 0000000000000..57206666046a4 --- /dev/null +++ b/lldb/test/API/python_api/unified_section_list/main.yaml @@ -0,0 +1,45 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .text + LastSec: .fini + Align: 0x1000 + Offset: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: BEC0FFEE + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: DEADBEEF + - Name: .comment + Type: SHT_NOBITS + Flags: [ SHF_ALLOC ] + Address: 0x3140 + AddressAlign: 0x4 +Symbols: + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF +... 
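Read together, the four merge tests above pin down a simple size-based policy for the unified section list. A minimal Python sketch of that policy as the tests encode it (illustrative only; the actual merging lives in LLDB's C++ Module/ObjectFile code, and the function below is a hypothetical distillation):

    def pick_section(existing_size, incoming_size):
        # An SHT_NOBITS section reads as data.size == 0, so one size
        # comparison covers all four tests: a strictly larger incoming
        # section wins, while ties and smaller sections keep whatever
        # is already in the unified list.
        return "incoming" if incoming_size > existing_size else "existing"

This is why the reversed-but-equal-size .text in main.reversedtext.yaml leaves the original BEC0FFEE contents untouched.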
diff --git a/lldb/test/API/terminal/TestEditline.py b/lldb/test/API/terminal/TestEditline.py index 38f4f34ed740b..4696b1e1b112e 100644 --- a/lldb/test/API/terminal/TestEditline.py +++ b/lldb/test/API/terminal/TestEditline.py @@ -94,7 +94,7 @@ def test_prompt_no_color(self): # after the prompt. self.child.send("foo") # Check that there are no escape codes. - self.child.expect(re.escape("\n(lldb) foo")) + self.child.expect(re.escape("\n\r\x1b[K(lldb) foo")) @skipIfAsan @skipIfEditlineSupportMissing diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + + def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index 7b78541fb4f8e..beab4d6c1f5a6 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -81,20 +81,24 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) + # Flush the breakpoint events. 
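+        # Note: anything queued while the breakpoints were being set is
+        # discarded by this drain, so the events collected after the
+        # continue below should only be the ones caused by resuming.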
+ self.dap_server.wait_for_breakpoint_events() + # Continue to the breakpoint - self.continue_to_breakpoint(foo_bp_id) - self.continue_to_next_stop() # foo_bp2 - self.continue_to_breakpoint(main_bp_id) - self.continue_to_exit() + self.continue_to_breakpoints(dap_breakpoint_ids) - bp_events = [e for e in self.dap_server.events if e["event"] == "breakpoint"] + verified_breakpoint_ids = [] + unverified_breakpoint_ids = [] + for breakpoint_event in self.dap_server.wait_for_breakpoint_events(): + breakpoint = breakpoint_event["body"]["breakpoint"] + id = breakpoint["id"] + if breakpoint["verified"]: + verified_breakpoint_ids.append(id) + else: + unverified_breakpoint_ids.append(id) - main_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == main_bp_id - ] - foo_bp_events = [ - e for e in bp_events if e["body"]["breakpoint"]["id"] == foo_bp_id - ] + self.assertIn(main_bp_id, unverified_breakpoint_ids) + self.assertIn(foo_bp_id, unverified_breakpoint_ids) - self.assertTrue(main_bp_events) - self.assertTrue(foo_bp_events) + self.assertIn(main_bp_id, verified_breakpoint_ids) + self.assertIn(foo_bp_id, verified_breakpoint_ids) diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 09b13223e0a78..ca881f1d817c5 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -156,7 +156,6 @@ def test_debuggerRoot(self): self.build_and_launch( program, debuggerRoot=program_parent_dir, initCommands=commands ) - self.continue_to_exit() output = self.get_console() self.assertTrue(output and len(output) > 0, "expect console output") lines = output.splitlines() @@ -172,6 +171,7 @@ def test_debuggerRoot(self): % (program_parent_dir, line[len(prefix) :]), ) self.assertTrue(found, "verified lldb-dap working directory") + self.continue_to_exit() def test_sourcePath(self): """ diff --git a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py index 9d1d17b704f76..1f4afabbd161e 100644 --- a/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py +++ 
b/lldb/test/API/tools/lldb-dap/module-event/TestDAP_module_event.py @@ -1,58 +1,58 @@ -""" -Test 'module' events for dynamically loaded libraries. -""" - +import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil import lldbdap_testcase +import re class TestDAP_module_event(lldbdap_testcase.DAPTestCaseBase): - def lookup_module_id(self, name): - """Returns the identifier for the first module event starting with the given name.""" - for event in self.dap_server.module_events: - if self.get_dict_value(event, ["body", "module", "name"]).startswith(name): - return self.get_dict_value(event, ["body", "module", "id"]) - self.fail(f"No module events matching name={name}") - - def module_events(self, id): - """Finds all module events by identifier.""" - return [ - event - for event in self.dap_server.module_events - if self.get_dict_value(event, ["body", "module", "id"]) == id - ] - - def module_reasons(self, events): - """Returns the list of 'reason' values from the given events.""" - return [event["body"]["reason"] for event in events] - @skipIfWindows def test_module_event(self): - """ - Test that module events are fired on target load and when the list of - dynamic libraries updates while running. - """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - # We can analyze the order of events after the process exits. - self.continue_to_exit() - a_out_id = self.lookup_module_id("a.out") - a_out_events = self.module_events(id=a_out_id) + source = "main.cpp" + breakpoint1_line = line_number(source, "// breakpoint 1") + breakpoint2_line = line_number(source, "// breakpoint 2") + breakpoint3_line = line_number(source, "// breakpoint 3") - self.assertIn( - "new", - self.module_reasons(a_out_events), - "Expected a.out to load during the debug session.", + breakpoint_ids = self.set_source_breakpoints( + source, [breakpoint1_line, breakpoint2_line, breakpoint3_line] ) + self.continue_to_breakpoints(breakpoint_ids) + + # We're now stopped at breakpoint 1 before the dlopen. Flush all the module events. + event = self.dap_server.wait_for_event(["module"]) + while event is not None: + event = self.dap_server.wait_for_event(["module"]) + + # Continue to the second breakpoint, before the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + module_name = event["body"]["module"]["name"] + module_id = event["body"]["module"]["id"] + self.assertEqual(event["body"]["reason"], "new") + self.assertIn("libother", module_name) + + # Continue to the third breakpoint, after the dlclose. + self.continue_to_breakpoints(breakpoint_ids) + + # Make sure we got a module event for libother. + event = self.dap_server.wait_for_event(["module"]) + self.assertIsNotNone(event, "didn't get a module event") + reason = event["body"]["reason"] + self.assertEqual(reason, "removed") + self.assertEqual(event["body"]["module"]["id"], module_id) + + # The removed module event should omit everything but the module id and name + # as they are required fields. + module_data = event["body"]["module"] + required_keys = ["id", "name"] + self.assertListEqual(list(module_data.keys()), required_keys) + self.assertEqual(module_data["name"], "", "expects empty name.") - libother_id = self.lookup_module_id( - "libother." # libother.so or libother.dylib based on OS. 
- ) - libother_events = self.module_events(id=libother_id) - self.assertEqual( - self.module_reasons(libother_events), - ["new", "removed"], - "Expected libother to be loaded then unloaded during the debug session.", - ) + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 2d00c512721c6..0ed53dac5d869 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -64,18 +64,19 @@ def check_symbols_loaded_with_size(): self.assertEqual(program, program_module["path"]) self.assertIn("addressRange", program_module) - self.continue_to_exit() - # Collect all the module names we saw as events. module_new_names = [] module_changed_names = [] - for module_event in self.dap_server.module_events: + module_event = self.dap_server.wait_for_event(["module"]) + while module_event is not None: reason = module_event["body"]["reason"] if reason == "new": module_new_names.append(module_event["body"]["module"]["name"]) elif reason == "changed": module_changed_names.append(module_event["body"]["module"]["name"]) + module_event = self.dap_server.wait_for_event(["module"]) + # Make sure we got an event for every active module. self.assertNotEqual(len(module_new_names), 0) for module in active_modules: @@ -85,6 +86,7 @@ def check_symbols_loaded_with_size(): # symbols got added. self.assertNotEqual(len(module_changed_names), 0) self.assertIn(program_module["name"], module_changed_names) + self.continue_to_exit() @skipIfWindows def test_modules(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py index 83faf276852f8..e8e07e1e86fc4 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py @@ -51,20 +51,8 @@ def test_stopOnEntry(self): self.build_and_launch(program, stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - self.dap_server.wait_for_stopped() - # Once the "configuration done" event is sent, we should get a stopped - # event immediately because of stopOnEntry. - self.assertTrue( - len(self.dap_server.thread_stop_reasons) > 0, - "expected stopped event during launch", - ) - for _, body in self.dap_server.thread_stop_reasons.items(): - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' - ) + self.continue_to_next_stop() + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.continue_to_breakpoints([bp_main]) @@ -73,17 +61,7 @@ def test_stopOnEntry(self): # main. 
resp = self.dap_server.request_restart() self.assertTrue(resp["success"]) - stopped_events = self.dap_server.wait_for_stopped() - for stopped_event in stopped_events: - if "body" in stopped_event: - body = stopped_event["body"] - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) + self.verify_stop_on_entry() @skipIfWindows def test_arguments(self): diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py index fa62ec243f5c5..7d4949907df0d 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py @@ -11,31 +11,6 @@ @skipIfBuildType(["debug"]) class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase): - def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]): - seen_stopped_event = 0 - for stopped_event in stopped_events: - body = stopped_event.get("body") - if body is None: - continue - - reason = body.get("reason") - if reason is None: - continue - - self.assertNotEqual( - reason, - "breakpoint", - 'verify stop after restart isn\'t "main" breakpoint', - ) - if reason == "entry": - seen_stopped_event += 1 - - self.assertEqual( - seen_stopped_event, - 1, - f"expect only one stopped entry event in {stopped_events}", - ) - @skipIfAsan @skipIfWindows @skipIf(oslist=["linux"], archs=["arm$"]) # Always times out on buildbot @@ -97,12 +72,7 @@ def test_stopOnEntry(self): [bp_main] = self.set_function_breakpoints(["main"]) self.dap_server.request_configurationDone() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." - ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.verify_stop_on_entry() # Then, if we continue, we should hit the breakpoint at main. self.dap_server.request_continue() @@ -111,12 +81,7 @@ def test_stopOnEntry(self): # Restart and check that we still get a stopped event before reaching # main. self.dap_server.request_restart() - stopped_threads = list(self.dap_server.thread_stop_reasons.values()) - # We should be stopped at the entry point. - self.assertEqual( - len(stopped_threads), 1, "Expected the main thread to be stopped on entry." 
- ) - self.assertEqual(stopped_threads[0]["reason"], "entry") + self.verify_stop_on_entry() # continue to main self.dap_server.request_continue() diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py index 0184020589176..a01845669666f 100644 --- a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -32,7 +32,7 @@ def test_send_event(self): ], ) self.set_source_breakpoints(source, [breakpoint_line]) - self.do_continue() + self.continue_to_next_stop() custom_event = self.dap_server.wait_for_event( filter=["my-custom-event-no-body"] diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py new file mode 100644 index 0000000000000..4ddf92402ad8a --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/TestDAP_stackTraceCompilerGeneratedCode.py @@ -0,0 +1,66 @@ +""" +Test lldb-dap stackTrace request for compiler generated code +""" + +import os + +import lldbdap_testcase +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestDAP_stackTraceCompilerGeneratedCode(lldbdap_testcase.DAPTestCaseBase): + def test_non_leaf_frame_compiler_generate_code(self): + """ + Test that non-leaf frames with compiler-generated code are properly resolved. + + This test verifies that LLDB correctly handles stack frames containing + compiler-generated code (code without valid source location information). + When a non-leaf frame contains compiler-generated code immediately after a + call instruction, LLDB should resolve the frame's source location to the + call instruction's line, rather than to the compiler-generated code that + follows, which lacks proper symbolication information. 
+ """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.c" + + # Set breakpoint inside bar() function + lines = [line_number(source, "// breakpoint here")] + breakpoint_ids = self.set_source_breakpoints(source, lines) + self.assertEqual( + len(breakpoint_ids), len(lines), "expect correct number of breakpoints" + ) + + self.continue_to_breakpoints(breakpoint_ids) + + # Get the stack frames: [0] = bar(), [1] = foo(), [2] = main() + stack_frames = self.get_stackFrames() + self.assertGreater(len(stack_frames), 2, "Expected more than 2 stack frames") + + # Examine the foo() frame (stack_frames[1]) + # This is the critical frame containing compiler-generated code + foo_frame = stack_frames[1] + + # Verify that the frame's line number points to the bar() call, + # not to the compiler-generated code after it + foo_call_bar_source_line = foo_frame.get("line") + self.assertEqual( + foo_call_bar_source_line, + line_number(source, "foo call bar"), + "Expected foo call bar to be the source line of the frame", + ) + + # Verify the source file name is correctly resolved + foo_source_name = foo_frame.get("source", {}).get("name") + self.assertEqual( + foo_source_name, "main.c", "Expected foo source name to be main.c" + ) + + # When lldb fails to symbolicate a frame it will emit a fake assembly + # source with path of format <module>`<symbol> or <module>`<address> with + # sourceReference to retrieve disassembly source file. + # Verify that this didn't happen - the path should be a real file path. + foo_path = foo_frame.get("source", {}).get("path") + self.assertNotIn("`", foo_path, "Expected foo source path to not contain `") + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c new file mode 100644 index 0000000000000..dd3fcc295d492 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTraceCompilerGeneratedCode/main.c @@ -0,0 +1,19 @@ +void bar() { + int val = 32; // breakpoint here +} + +void at_line_zero() {} + +int foo(); + +int main(int argc, char const *argv[]) { + foo(); + return 0; +} + +int foo() { + bar(); // foo call bar +#line 0 "test.cpp" + at_line_zero(); + return 0; +} diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. " "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " diff --git a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test index 52c86fa5530bf..9a972f1f1ece7 100644 --- a/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_jitlink_elf.test @@ -3,8 +3,8 @@ # JITLink is the Orc-specific JIT linker implementation. 
# -# RUN: %clang -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm -fPIC --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-linker=jitlink %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test index b34a5673936f5..ae9402a519494 100644 --- a/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test +++ b/lldb/test/Shell/Breakpoint/jit-loader_rtdyld_elf.test @@ -3,8 +3,8 @@ # RuntimeDyld can be used to link and load emitted code for both, MCJIT and Orc. # -# RUN: %clang -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ -# RUN: -o %t.ll %p/Inputs/jitbp.cpp +# RUN: %clangxx -g -S -emit-llvm --target=x86_64-unknown-unknown-elf \ +# RUN: -o %t.ll %p/Inputs/jitbp.cpp # # RUN: %lldb -b -o 'settings set plugin.jit-loader.gdb.enable on' -o 'b jitbp' \ # RUN: -o 'run --jit-kind=mcjit %t.ll' lli | FileCheck %s diff --git a/lldb/test/Shell/Commands/Inputs/sigchld.c b/lldb/test/Shell/Commands/Inputs/sigchld.c index ba8c5ef45365b..0121e70c1bdd0 100644 --- a/lldb/test/Shell/Commands/Inputs/sigchld.c +++ b/lldb/test/Shell/Commands/Inputs/sigchld.c @@ -1,3 +1,7 @@ +#if defined(__linux__) +#define _XOPEN_SOURCE 500 /* for CLD_EXITED */ +#endif + #include <assert.h> #include <signal.h> #include <stdio.h> diff --git a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test index 355ef6bb1d199..7fd70d234fbd4 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast-colored.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast-colored.test @@ -1,7 +1,7 @@ # Test AST dumping with and without color. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-image-dump-ast.test b/lldb/test/Shell/Commands/command-image-dump-ast.test index 3204022418cb8..86fe1836a2c6c 100644 --- a/lldb/test/Shell/Commands/command-image-dump-ast.test +++ b/lldb/test/Shell/Commands/command-image-dump-ast.test @@ -5,7 +5,7 @@ # UNSUPPORTED: system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test index fa4a93e5904aa..9987efedd8020 100644 --- a/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test +++ b/lldb/test/Shell/Commands/command-list-reach-beginning-of-file.test @@ -4,7 +4,7 @@ # RUN: %lldb %t.out -b -s %s 2>&1 | FileCheck %s list -# CHECK: note: No source available +# CHECK: note: No source available b main # CHECK: Breakpoint 1: @@ -18,7 +18,7 @@ list list - # CHECK: int main() -list -10 +list -13 # CHECK: #include <assert.h> list - diff --git a/lldb/test/Shell/Commands/list-header.test b/lldb/test/Shell/Commands/list-header.test index 53c4b786f1810..27eaa1a4f29c2 100644 --- a/lldb/test/Shell/Commands/list-header.test +++ b/lldb/test/Shell/Commands/list-header.test @@ -3,11 +3,11 @@ # XFAIL: target-windows ## Test that `list header.h:<line>` works correctly when header is available. -## +## # RUN: split-file %s %t -# RUN: %clang_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out -# RUN: %clang_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out +# RUN: %clangxx_host -g %t/main_with_inlined.cc %t/foo.cc -o %t/main_with_inlined.out +# RUN: %clangxx_host -g %t/main_no_inlined.cc %t/foo.cc -o %t/main_no_inlined.out # RUN: %lldb %t/main_with_inlined.out -o "list foo.h:2" -o "exit" 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-INLINED @@ -19,7 +19,7 @@ # CHECK-INLINED: 2 extern int* ptr; # CHECK-INLINED: 3 void f(int x); -# CHECK-INLINED: 4 +# CHECK-INLINED: 4 # CHECK-INLINED: 5 inline void g(int x) { # CHECK-INLINED: 6 *ptr = x; // should crash here # CHECK-INLINED: 7 } diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/test/Shell/Error/cleanup.cpp b/lldb/test/Shell/Error/cleanup.cpp index 6abc62dc4af99..1e83478a83337 100644 --- a/lldb/test/Shell/Error/cleanup.cpp +++ b/lldb/test/Shell/Error/cleanup.cpp @@ -1,5 +1,5 @@ // Test CommandObject is cleaned up even after commands fail due to not taking any argument. 
-// RUN: %clang_host -g %s -o %t
+// RUN: %clangxx_host -g %s -o %t
 // RUN: %lldb -f %t -o "settings set interpreter.stop-command-source-on-error false" -s \
 // RUN:     %S/Inputs/cleanup.lldbinit
 int main() { return 0; }
diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test
new file mode 100644
index 0000000000000..a9557801cc134
--- /dev/null
+++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test
@@ -0,0 +1,36 @@
+# Test the format (e.g., indentation) used when printing the list of stop hooks.
+#
+# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace
+
+# Create some stop hooks
+target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true
+target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file'
+target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread'
+
+# Print hooks
+target stop-hook list
+
+# CHECK:(lldb) target stop-hook list
+# CHECK:Hook: 1
+# CHECK:  State: enabled
+# CHECK:  AutoContinue on
+# CHECK:  Commands:
+# CHECK:    print "Hello"
+# CHECK-EMPTY:
+# CHECK:Hook: 2
+# CHECK:  State: enabled
+# CHECK:  Specifier:
+# CHECK:    File: my_file.
+# CHECK:  Commands:
+# CHECK:    print "world,"
+# CHECK:    print "nice"
+# CHECK-EMPTY:
+# CHECK:Hook: 3
+# CHECK:  State: enabled
+# CHECK:  Specifier:
+# CHECK:    Class name: MyClass.
+# CHECK:  Thread:
+# CHECK:    thread name: "my_thread"
+# CHECK:  Commands:
+# CHECK:    print "weather!"
+# CHECK-EMPTY:
diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test
new file mode 100644
index 0000000000000..49ee2778ea18b
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_CompileFailure.test
@@ -0,0 +1,46 @@
+## Tests the case where module compilation fails.
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -DSHOULD_COMPILE=1 \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s
+
+#--- main.m
+@import foo;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#ifndef SHOULD_COMPILE
+#error "Compilation failure."
+#endif
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+}
+
+#--- commands.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# CHECK: Finished building Clang module foo
+# CHECK: couldn't load top-level module foo:
+# CHECK: While building module 'foo' imported from LLDBModulesMemoryBuffer
+# CHECK: {{.*}}sources/foo.h{{.*}}: error: "Compilation failure."
+# CHECK: LLDBModulesMemoryBuffer:1:1: fatal error: could not build module 'foo' + +# CHECK: Error while loading hand-imported modules: +# CHECK: couldn't load top-level module foo: +# CHECK-NOT: Compilation failure diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test new file mode 100644 index 0000000000000..b964e9b27e914 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_FromExpression.test @@ -0,0 +1,54 @@ +## Tests the case where we fail to import modules from @import +## statements that are part of the expression being run. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# +# RUN: sed -i '' -e 's/foo\.h/baz\.h/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr @import Foo; @import Bar +expr @import foo + +# CHECK: error: while importing modules: +# CHECK-NEXT: header search couldn't locate module 'Foo' +# CHECK-NEXT: header search couldn't locate module 'Bar' +# +# CHECK: expr @import foo +# CHECK: error: while importing modules: +# CHECK-NEXT: couldn't load top-level module foo +## No mention of the previous import errors. +# CHECK-NOT: Foo +# CHECK-NOT: Bar diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test new file mode 100644 index 0000000000000..1e8075dd20fad --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidNestedSubmodule.test @@ -0,0 +1,70 @@ +## Tests the case where we fail to load a submodule of a submodule. We force this +## by removing the submodule 'module qux' of 'module baz' from the modulemap. 
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+# RUN: sed -i '' -e 's/module qux/module quz/' %t/sources/module.modulemap
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG
+
+#--- main.m
+@import foo.baz.qux;
+@import bar;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#--- bar.h
+struct bar {};
+
+#--- baz.h
+struct baz {};
+
+#--- qux.h
+struct qux {};
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+
+  module baz {
+    header "baz.h"
+    export *
+
+    module qux {
+      header "qux.h"
+      export *
+    }
+  }
+}
+
+module bar {
+  header "bar.h"
+  export *
+}
+
+#--- commands.input
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# NO_LOG-NOT: couldn't load submodule 'qux' of module 'foo.baz'
+
+#--- commands-with-log.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# LOG: couldn't load submodule 'qux' of module 'foo.baz'
diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test
new file mode 100644
index 0000000000000..35ba5802d2add
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSearchPath.test
@@ -0,0 +1,59 @@
+## Tests the case where the DW_AT_LLVM_include_path of the module is invalid.
+## We force this by simply removing that directory (which in our case is 'sources').
+#
+# REQUIRES: system-darwin
+#
+# RUN: split-file %s %t/sources
+# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \
+# RUN:   -fmodule-map-file=%t/sources/module.modulemap \
+# RUN:   -fmodules-cache-path=%t/ModuleCache -o %t.out
+#
+# RUN: cp %t/sources/commands.input %t/commands.input
+# RUN: cp %t/sources/commands-with-log.input %t/commands-with-log.input
+# RUN: rm -r %t/sources
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG
+#
+# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:   -s %t/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG
+
+#--- main.m
+@import foo;
+@import bar;
+
+int main() { __builtin_debugtrap(); }
+
+#--- foo.h
+struct foo {};
+
+#--- bar.h
+struct bar {};
+
+#--- module.modulemap
+module foo {
+  header "foo.h"
+  export *
+}
+
+module bar {
+  header "bar.h"
+  export *
+}
+
+#--- commands.input
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah
+
+# NO_LOG-NOT: couldn't find module search path directory {{.*}}sources
+# NO_LOG-NOT: couldn't find module search path directory {{.*}}sources
+
+#--- commands-with-log.input
+log enable lldb expr
+run
+## Make sure expression fails so the 'note' diagnostics get printed.
+expr blah + +# LOG: couldn't find module search path directory {{.*}}sources +# LOG: couldn't find module search path directory {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test new file mode 100644 index 0000000000000..1bfbbcf32ecae --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidSubmodule.test @@ -0,0 +1,62 @@ +## Tests the case where we fail to load a submodule. We force this by removing +## the submodule 'module baz' from the modulemap. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module baz/module qux/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo.baz; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- baz.h +struct baz {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * + + module baz { + header "baz.h" + export * + } +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: couldn't load submodule 'baz' of module 'foo' + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't load submodule 'baz' of module 'foo' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test new file mode 100644 index 0000000000000..ad181ee7e15e6 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_InvalidTopLevelModule.test @@ -0,0 +1,59 @@ +## Tests the case where a module fails to load. We force this by +## replacing the contents of the 'module foo' declaration with garbage. 
+# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/foo\.h/baz\.h/' %t/sources/module.modulemap +# RUN: sed -i '' -e 's/bar\.h/qux\.h/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: couldn't load top-level module foo +# NO_LOG-NOT: error: header + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't load top-level module foo +# LOG: error: header 'baz.h' +# LOG: couldn't load top-level module bar +# LOG: error: header 'qux.h' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test new file mode 100644 index 0000000000000..6d8e665102b17 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_ModulemapParsing.test @@ -0,0 +1,57 @@ +## Tests the case where the modulemap is semantically invalid and thus +## Clang fails to load it on behalf of LLDB. We force this error by +## creating a redefinition of 'module bar'. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module foo/module bar/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: failed to parse and load +# NO_LOG-NOT: failed to parse and load + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. 
+expr blah + +# LOG: failed to parse and load modulemap file in {{.*}}sources +# LOG: failed to parse and load modulemap file in {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test new file mode 100644 index 0000000000000..bcb8a7d2c5594 --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModule.test @@ -0,0 +1,58 @@ +## Tests the case where the module LLDB is trying to load isn't +## present in the modulemap. We force this by replacing 'module foo' +## in the modulemap. +# +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: sed -i '' -e 's/module foo/module baz/' %t/sources/module.modulemap +# RUN: sed -i '' -e 's/module bar/module qux/' %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# NO_LOG-NOT: header search couldn't locate module 'foo' +# NO_LOG-NOT: header search couldn't locate module 'bar' + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: header search couldn't locate module 'foo' +# LOG: header search couldn't locate module 'bar' diff --git a/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test new file mode 100644 index 0000000000000..57f7f16cc84dd --- /dev/null +++ b/lldb/test/Shell/Expr/TestClangModuleLoadError_NoModuleMap.test @@ -0,0 +1,53 @@ +# REQUIRES: system-darwin +# +# RUN: split-file %s %t/sources +# RUN: %clang_host -g %t/sources/main.m -fmodules -fcxx-modules \ +# RUN: -fmodule-map-file=%t/sources/module.modulemap \ +# RUN: -fmodules-cache-path=%t/ModuleCache -o %t.out +# RUN: rm %t/sources/module.modulemap +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=NO_LOG +# +# RUN: %lldb -x -o "settings set interpreter.stop-command-source-on-error false" \ +# RUN: -s %t/sources/commands-with-log.input %t.out -o exit 2>&1 | FileCheck %s --check-prefix=LOG + +#--- main.m +@import foo; +@import bar; + +int main() { __builtin_debugtrap(); } + +#--- foo.h +struct foo {}; + +#--- bar.h +struct bar {}; + +#--- module.modulemap +module foo { + header "foo.h" + export * +} + +module bar { + header "bar.h" + export * +} + +#--- commands.input +run +## Make sure expression fails so the 'note' diagnostics get printed. 
+expr blah + +# NO_LOG-NOT: couldn't find modulemap +# NO_LOG-NOT: couldn't find modulemap + +#--- commands-with-log.input +log enable lldb expr +run +## Make sure expression fails so the 'note' diagnostics get printed. +expr blah + +# LOG: couldn't find modulemap file in {{.*}}sources +# LOG: couldn't find modulemap file in {{.*}}sources diff --git a/lldb/test/Shell/Expr/TestExprLanguageNote.test b/lldb/test/Shell/Expr/TestExprLanguageNote.test index e8e4e1399e451..e7da30816319e 100644 --- a/lldb/test/Shell/Expr/TestExprLanguageNote.test +++ b/lldb/test/Shell/Expr/TestExprLanguageNote.test @@ -1,5 +1,5 @@ # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # # RUN: %lldb -x -b -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -s %t/no-target.input 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TARGET diff --git a/lldb/test/Shell/Expr/TestLambdaExprImport.test b/lldb/test/Shell/Expr/TestLambdaExprImport.test index c57ce06453fe2..b49a38036e566 100644 --- a/lldb/test/Shell/Expr/TestLambdaExprImport.test +++ b/lldb/test/Shell/Expr/TestLambdaExprImport.test @@ -3,7 +3,7 @@ # uses always). # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -o "settings set interpreter.stop-command-source-on-error false" \ # RUN: -x -b -s %t/commands.input %t.out 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test index 75a68edd2d349..170dc7682aab0 100644 --- a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test +++ b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test @@ -11,7 +11,7 @@ // - verify that "image dump objfile" will dump the dynamic section of the // memory elf file and find the .dynamic string table. 
-// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t +// RUN: %clangxx_host %p/Inputs/memory-elf.cpp -g -O0 -o %t // RUN: %lldb %t -b \ // RUN: -o "b main" \ diff --git a/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary new file mode 100644 index 0000000000000..19dc2f4ac9ffe Binary files /dev/null and b/lldb/test/Shell/ObjectFile/MachO/Inputs/section-overflow-binary differ diff --git a/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test new file mode 100644 index 0000000000000..76c335f65a76a --- /dev/null +++ b/lldb/test/Shell/ObjectFile/MachO/section-overflow-binary.test @@ -0,0 +1,13 @@ +RUN: %lldb -b %p/Inputs/section-overflow-binary \ +RUN: -o 'script dwarf = lldb.target.module[0].sections[0]' \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(0)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(1)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: -o 'script section = dwarf.GetSubSectionAtIndex(2)' \ +RUN: -o "script print(f'{section.GetName()} file_offset=0x{section.GetFileOffset():016x}')" \ +RUN: | FileCheck %s + +CHECK: __debug_abbrev file_offset=0x00000000fffffff0 +CHECK: __debug_info file_offset=0x0000000100000010 +CHECK: __debug_line file_offset=0x0000000300000010 diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m new file mode 100644 index 0000000000000..83a829a8c2fdd --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m @@ -0,0 +1,4 @@ +int main() { + __builtin_verbose_trap("Foo", "Bar"); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test new file mode 100644 index 0000000000000..e9641923faedf --- /dev/null +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -0,0 +1,56 @@ +# UNSUPPORTED: system-windows + +# Checks that the recognizers that should work across language runtimes +# are only registered once with the target. + +# RUN: split-file %s %t + +# RUN: %clangxx_host %t/main.cpp -g -o %t/cpp.out +# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s + +# RUN: %clangxx_host %t/main.mm -g -o %t/objcxx.out +# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s + +# RUN: %clang_host %t/main.c -g -o %t/c.out +# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s + +# RUN: %clang_host %t/main.m -g -o %t/objc.out +# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s + +#--- main.m +int main() {} + +#--- main.c +int main() {} + +#--- main.mm +int main() {} + +#--- main.cpp +int main() {} + +#--- commands.input + +b main +frame recognizer list +run +frame recognizer list +continue +run +frame recognizer list + +# CHECK: frame recognizer list +# CHECK-NEXT: no matching results found. 
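+
+# Before the first run nothing is registered with the target, hence the
+# "no matching results" above. After one run, each cross-runtime recognizer
+# must appear exactly once; the CHECK-NOT lines below enforce that.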
+ +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-NOT: Verbose Trap StackFrame Recognizer +# CHECK-NOT: Assert StackFrame Recognizer + +# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test index 5a84c163453cc..32b4095d9addd 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback-user-leaf.test @@ -12,7 +12,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback-user-leaf.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test index b15bcb3a384f9..c8c433c0a819a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-callback.test @@ -11,7 +11,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-callback.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test index 2ea6594643c9c..d0789ac7dc67a 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-max-depth.test @@ -4,7 +4,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-max-depth.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test index 81a492d1ed579..68a4ea612c0d1 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl-nested.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl-nested.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test index dd08290174e3a..bd4851146b40d 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test +++ b/lldb/test/Shell/Recognizer/verbose_trap-in-stl.test @@ -3,7 +3,7 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap-in-stl.cpp -o %t.out # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK run diff --git 
a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test new file mode 100644 index 0000000000000..0dbb04e0fd671 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test @@ -0,0 +1,12 @@ +# REQUIRES: system-darwin +# +# RUN: %clang_host -g %S/Inputs/verbose_trap.m -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s + +run +# CHECK: thread #{{.*}}stop reason = Foo: Bar +frame info +# CHECK: frame #{{.*}}`main at verbose_trap.m +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +q diff --git a/lldb/test/Shell/Recognizer/verbose_trap.test b/lldb/test/Shell/Recognizer/verbose_trap.test index dafab7bdea688..ab0df082cc032 100644 --- a/lldb/test/Shell/Recognizer/verbose_trap.test +++ b/lldb/test/Shell/Recognizer/verbose_trap.test @@ -1,15 +1,15 @@ # UNSUPPORTED: system-windows # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-BOTH # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"Bar\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-MESSAGE_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"Foo\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-CATEGORY_ONLY # -# RUN: %clang_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" +# RUN: %clangxx_host -g -O0 %S/Inputs/verbose_trap.cpp -o %t.out -DVERBOSE_TRAP_TEST_CATEGORY=\"\" -DVERBOSE_TRAP_TEST_MESSAGE=\"\" # RUN: %lldb -b -s %s %t.out | FileCheck %s --check-prefixes=CHECK,CHECK-NONE run diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp index c5f571fc1d2c4..0d2869c0c577c 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-read.cpp @@ -1,4 +1,5 @@ #include <cstdint> +#include <functional> #include <mutex> #include <thread> diff --git a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp index 320f9e938e5bf..1f4e91acc4c03 100644 --- a/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp +++ b/lldb/test/Shell/Register/Inputs/x86-multithread-write.cpp @@ -1,6 +1,7 @@ #include <cinttypes> #include <cstdint> #include <cstdio> +#include <functional> #include <mutex> #include <thread> diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index da6436cb5ca20..b66d0df983069 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-count wasn't explicitly set. 
# RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test index 12f5661600ae7..7e4fbbef9e458 100644 --- a/lldb/test/Shell/Settings/TestChildDepthTruncation.test +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-depth wasn't explicitly set. # RUN: split-file %s %t -# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %clangxx_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormat.test b/lldb/test/Shell/Settings/TestCxxFrameFormat.test index d70db582e9750..3ee92d53492fb 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormat.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormat.test @@ -3,7 +3,7 @@ # Test the plugin.cplusplus.display.function-name-format setting. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test index 0a6d2723ded34..a0550b733d781 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatEmpty.test @@ -5,7 +5,7 @@ # ${function.name-with-args}. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test index bafd36f5ae177..679d6e4d5abe4 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatMixedLanguages.test @@ -4,9 +4,9 @@ # when interoperating multiple languages. # RUN: split-file %s %t -# RUN: %clangxx_host -x c -c -g %t/lib.c -o %t.clib.o +# RUN: %clang_host -x c -c -g %t/lib.c -o %t.clib.o # RUN: %clangxx_host -c -g %t/lib.cpp -o %t.cxxlib.o -# RUN: %clangxx_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out +# RUN: %clang_host %t/main.m %t.cxxlib.o %t.clib.o -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 | FileCheck %s #--- lib.c @@ -47,7 +47,7 @@ break set -n method run bt -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'this affects C++ only' -# CHECK: custom-frame 'func' -# CHECK: custom-frame 'main' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'this affects C++ only' +# CHECK: custom-frame 'func' +# CHECK: custom-frame 'main' diff --git a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test index e914ff7a010dd..f279f07afcda2 100644 --- a/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test +++ b/lldb/test/Shell/Settings/TestCxxFrameFormatPartialFailure.test @@ -5,7 +5,7 @@ # were successful. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test index c0008e50927b1..56ec09e2f951d 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionBasename.test @@ -3,11 +3,11 @@ # Test the ${function.basename} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test index 04f51701a2a2d..f20fc8ca77aeb 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionFormattedArguments.test @@ -3,11 +3,11 @@ # Test the ${function.formatted-arguments} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-NODEBUG diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test index b1dfe834c1deb..d05e60b0e8d10 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionQualifiers.test @@ -3,11 +3,11 @@ # Test the ${function.qualifiers} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test index f913162a1aa66..bb78258aba753 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionReturn.test @@ -4,11 +4,11 @@ # frame-format variables. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test index a28c16f95a9e2..f4a17661c3602 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionScope.test @@ -3,11 +3,11 @@ # Test the ${function.scope} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test index 4609a0412a0ab..5883c722f3336 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionSuffix.test @@ -3,7 +3,7 @@ # Test the ${function.suffix} frame-format variable. # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test index ac8a32820c888..a09a9610f48db 100644 --- a/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test +++ b/lldb/test/Shell/Settings/TestFrameFormatFunctionTemplateArguments.test @@ -3,11 +3,11 @@ # Test the ${function.template-arguments} frame-format variable. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s # -# RUN: %clang_host -O0 %t/main.cpp -o %t-nodebug.out +# RUN: %clangxx_host -O0 %t/main.cpp -o %t-nodebug.out # RUN: %lldb -x -b -s %t/commands.input %t-nodebug.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test index 5db34b4160850..1bb7ab486bcf5 100644 --- a/lldb/test/Shell/Settings/TestFrameFunctionInlined.test +++ b/lldb/test/Shell/Settings/TestFrameFunctionInlined.test @@ -6,7 +6,7 @@ # REQUIRES: (system-windows && lld) || !system-windows # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} +# RUN: %clangxx_host -g -gdwarf %t/main.cpp -o %t.out %if system-windows %{-fuse-ld=lld%} # RUN: %lldb -x -b -s %t/commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp index 4a8004ddd287f..b02eea6bbc4f8 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/split-dwarf-expression-eval-bug.cpp @@ -7,10 +7,10 @@ // UNSUPPORTED: system-darwin, system-windows -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t1.o -DONE -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO -// RUN: %clang_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE -// RUN: %clang_host %t1.o %t2.o %t3.o -o %t +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t1.o -DONE +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t2.o -DTWO +// RUN: %clangxx_host -c -gsplit-dwarf -g %s -o %t3.o -DTHREE +// RUN: %clangxx_host %t1.o %t2.o %t3.o -o %t // RUN: %lldb %t -o "br set -n foo" -o run -o "expression bool_in_first_cu" -o exit \ // RUN: | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp index 5bcb2cbcbbe29..8ef2e56ba3d4d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/apple-index-is-used.cpp @@ -1,5 +1,5 @@ // Test that we use the apple indexes. 
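// (.apple_names and .apple_types are the pre-DWARF-v5 accelerator tables
// emitted for Apple targets; DWARF v5 replaces them with .debug_names.)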
-// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx -gdwarf-4 // RUN: lldb-test symbols %t | FileCheck %s // CHECK: .apple_names index present diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp index 4dcbb47152203..53c3d3daa40c5 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-compressed.cpp @@ -3,7 +3,7 @@ // REQUIRES: lld, zlib -// RUN: %clang -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s // RUN: ld.lld %t.o -o %t --compress-debug-sections=zlib // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp index 2b7a928c89a8f..acc34dd41688b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-debug-names.cpp @@ -6,7 +6,7 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ +// RUN: %clangxx %s -target x86_64-pc-linux -gdwarf-5 -fdebug-types-section \ // RUN: -gpubnames -fno-limit-debug-info -c -o %t.o // RUN: ld.lld %t.o -o %t // RUN: %lldb %t -o "type lookup stype" -b | FileCheck %s --check-prefix=BASE diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp index 0e29cb3e7f16e..bc863fb64a9cc 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-dwo-cross-reference.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t1.o -DONE -// RUN: %clang %s -target x86_64-pc-linux -fno-standalone-debug -g \ +// RUN: %clangxx %s -target x86_64-pc-linux -fno-standalone-debug -g \ // RUN: -fdebug-types-section -gsplit-dwarf -c -o %t2.o -DTWO // RUN: llvm-dwarfdump %t1.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s // RUN: llvm-dwarfdump %t2.dwo -debug-types -debug-info | FileCheck --check-prefix=ONEUNIT %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp index d6ac23716f6ce..2fdb1d8d7ca7d 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-index-is-used.cpp @@ -2,7 +2,7 @@ // REQUIRES: lld -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp index ab84415f61b27..a739dfde48aaf 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwarf5-partial-index.cpp @@ -3,9 +3,9 @@ // REQUIRES: lld -// RUN: %clang %s -c -o 
%t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t-1.o --target=x86_64-pc-linux -DONE -gdwarf-5 -gpubnames // RUN: llvm-readobj --sections %t-1.o | FileCheck %s --check-prefix NAMES -// RUN: %clang %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames +// RUN: %clangxx %s -c -o %t-2.o --target=x86_64-pc-linux -DTWO -gdwarf-5 -gno-pubnames // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --find=variable --name=foo %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp index 929e11f80e34e..36eb299f06630 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwo-not-found-warning.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o +// RUN: %clangxx --target=x86_64-pc-linux -g -gsplit-dwarf -c %s -o %t.o // RUN: rm %t.dwo // RUN: %lldb %t.o -o "br set -n main" -o exit 2>&1 | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp index 9251930d7d13c..7fbc4f98e7976 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-foreign-type-units.cpp @@ -16,9 +16,9 @@ // type unit comes from by looking at the DW_AT_dwo_name attribute in the // DW_TAG_type_unit. -// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ +// RUN: %clangxx -target x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -DVARIANT \ // RUN: -fdebug-types-section -gpubnames -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp index 3e97c3fb1ebc2..3edcd8f180a15 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp @@ -14,8 +14,8 @@ // complete DWARF index. // Test that if we don't have .debug_names, that we save a full DWARF index. -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.nonames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.nonames.dwp // RUN: rm %t.main.dwo %t.foo.dwo @@ -35,8 +35,8 @@ // Test that if we have one .o file with .debug_names and one without, that we // save a partial DWARF index. 
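// (A "partial" index means LLDB builds its manual DWARF index only for the
// units that lack .debug_names entries and uses the accelerator table for
// the rest.)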
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o // RUN: ld.lld %t.main.o %t.foo.o -o %t.somenames // RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.somenames.dwp // RUN: rm %t.main.dwo %t.foo.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp index 888e96bbb10af..f625fda2087db 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld, python // Now test with DWARF5 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp // RUN: rm %t.dwarf5.dwo @@ -64,7 +64,7 @@ // RUN: -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP // Now test with DWARF4 -// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4 // RUN: llvm-dwp %t.dwarf4.dwo -o %t.dwarf4.dwp // RUN: rm %t.dwarf4.dwo @@ -128,7 +128,7 @@ // Test if we have a GNU build ID in our main executable and in our debug file, // and we have a .dwp file that doesn't, that we can still load our .dwp file. 
-// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o +// RUN: %clangxx -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o // RUN: ld.lld %t.o --build-id=md5 -o %t // RUN: llvm-dwp %t.dwo -o %t.dwp // RUN: rm %t.dwo diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp index c42f9fe0b8b52..a00b2bd9506ef 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-function.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s @@ -19,7 +19,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ // RUN: FileCheck --check-prefix=BASE %s // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ @@ -39,7 +39,7 @@ // RUN: lldb-test symbols --name=not_there --find=function %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=base %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp index 13d50af7ef601..14c73c3e82efb 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-namespace.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s @@ -9,7 +9,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=namespace %t | \ // RUN: FileCheck --check-prefix=FOO %s // RUN: lldb-test symbols --name=foo --find=namespace --context=context %t | \ @@ -17,7 +17,7 @@ // RUN: lldb-test symbols --name=not_there --find=namespace %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=namespace %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp index af49206608723..315fab344dfee 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp +++ 
b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s // RUN: lldb-test symbols --name=::foo --find=type %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=type %t | \ // RUN: FileCheck --check-prefix=EMPTY %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=type %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp index e46fa14489d32..b6e2252c28402 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-variable.cpp @@ -1,6 +1,6 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s @@ -11,7 +11,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=foo --find=variable %t | \ @@ -21,7 +21,7 @@ // RUN: lldb-test symbols --name=not_there --find=variable %t | \ // RUN: FileCheck --check-prefix=EMPTY %s // -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=variable --context=context %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp index be267596fb372..5c7ad844f6603 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-function-regex.cpp @@ -1,13 +1,13 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s -// RUN: %clang %s -g 
-c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=f.o --regex --find=function %t | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp index 3da4a4a23f8a8..46553a83081e4 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method-local-struct.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp index 9f8b3df2f31a7..26faf8907b4a9 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-method.cpp @@ -1,15 +1,15 @@ // REQUIRES: lld -// RUN: %clang %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames +// RUN: %clangxx %s -g -c -o %t.o --target=x86_64-pc-linux -gno-pubnames // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s // -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ // RUN: FileCheck %s -// RUN: %clang %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames +// RUN: %clangxx %s -c -o %t.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames // RUN: ld.lld %t.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=function --function-flags=method %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp index 1ad3e7fbadf51..e3f9ce308b75c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-qualified-variable.cpp @@ -1,4 +1,4 @@ -// RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx +// RUN: %clangxx %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=A::foo --find=variable %t | FileCheck %s // CHECK: Found 1 variables: diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp index b5d35e4f7883f..250b34377acda 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-dwo.cpp @@ -1,9 +1,9 @@ // REQUIRES: lld -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DONE | \ // RUN: llc -filetype=obj -split-dwarf-file=%t-1.dwo -o %t-1.o // RUN: llvm-objcopy --split-dwo=%t-1.dwo %t-1.o -// RUN: %clang %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ +// RUN: %clangxx %s -gdwarf-5 -gpubnames -gsplit-dwarf -c -emit-llvm -o - --target=x86_64-pc-linux -DTWO | \ // RUN: llc -filetype=obj 
-split-dwarf-file=%t-2.dwo -o %t-2.o // RUN: llvm-objcopy --split-dwo=%t-2.dwo %t-2.o // RUN: ld.lld %t-1.o %t-2.o -o %t diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp index f1a9a4eb12d07..3a8cf89ac367b 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-variable-file.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld -// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gno-pubnames %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gno-pubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s @@ -10,16 +10,16 @@ // Run the same test with split-dwarf. This is interesting because the two // split compile units will have the same offset (0). -// RUN: %clang -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s -// RUN: %clang -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -g -c -o %t-1.o --target=x86_64-pc-linux -gsplit-dwarf %s +// RUN: %clangxx -g -c -o %t-2.o --target=x86_64-pc-linux -gsplit-dwarf %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=ONE %s // RUN: lldb-test symbols --file=find-variable-file-2.cpp --find=variable %t | \ // RUN: FileCheck --check-prefix=TWO %s -// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gpubnames %S/Inputs/find-variable-file-2.cpp // RUN: ld.lld %t-1.o %t-2.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ @@ -29,9 +29,9 @@ // Run the same test with split dwarf and pubnames to check whether we can find // the compile unit using the name index if it is split. 
-// RUN: %clang -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s -// RUN: %clang -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp -// RUN: %clang -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp +// RUN: %clangxx -c -o %t-1.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %s +// RUN: %clangxx -c -o %t-2.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-2.cpp +// RUN: %clangxx -c -o %t-3.o --target=x86_64-pc-linux -gdwarf-5 -gsplit-dwarf -gpubnames %S/Inputs/find-variable-file-3.cpp // RUN: ld.lld %t-1.o %t-2.o %t-3.o -o %t // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --file=find-variable-file.cpp --find=variable %t | \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp index a12892305798a..00805770af11e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/member-pointers.cpp @@ -1,7 +1,7 @@ // REQUIRES: lld // Itanium ABI: -// RUN: %clang --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s +// RUN: %clangxx --target=x86_64-pc-linux -gdwarf -c -o %t_linux.o %s // RUN: %lldb -f %t_linux.o -b -o "target variable s1 s2 m1 m2 v1 v2 v3 v4" | FileCheck --check-prefix=CHECK-GNU %s // // CHECK-GNU: (void (Single1::*)()) s1 = 0x00000000000000000000000000000000 diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm index 2dec109a781ca..27aa1365ab54c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/module-ownership.mm @@ -1,5 +1,5 @@ // RUN: rm -rf %t.cache -// RUN: %clang --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ +// RUN: %clangxx --target=x86_64-apple-macosx -g -gmodules -Wno-objc-root-class \ // RUN: -fmodules -fmodules-cache-path=%t.cache \ // RUN: -c -o %t.o %s -I%S/Inputs // RUN: lldb-test symbols -dump-clang-ast %t.o | FileCheck --check-prefix CHECK-ANON-S1 %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp index 297fb82caee5f..8f530c803a40c 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp @@ -1,4 +1,4 @@ -// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s +// RUN: %clangxx --target=x86_64-apple-macosx -c -gdwarf -o %t %s // RUN: %lldb %t \ // RUN: -o "target var global" \ // RUN: -o "target var global2" \ diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp index 5a40a6e0fbc27..5ab45eefd2211 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-definition-search.cpp @@ -4,18 +4,18 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-n-b.o -g -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-n-b.o -g 
-gsimple-template-names -DFILE_B // RUN: ld.lld %t-n-a.o %t-n-b.o -o %t-n // RUN: %lldb %t-n -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-a.o -g -fdebug-types-section -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-t-b.o -g -fdebug-types-section -DFILE_B // RUN: ld.lld %t-t-a.o %t-t-b.o -o %t-t // RUN: %lldb %t-t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-a.o -g -fdebug-types-section -gsimple-template-names -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-tn-b.o -g -fdebug-types-section -gsimple-template-names -DFILE_B // RUN: ld.lld %t-tn-a.o %t-tn-b.o -o %t-tn // RUN: %lldb %t-tn -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp index f7f5a30aaba9e..f9fd5b5e52250 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/type-unit-same-basename.cpp @@ -5,8 +5,8 @@ // REQUIRES: lld -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A -// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-a.o -g -fdebug-types-section -flimit-debug-info -DFILE_A +// RUN: %clangxx --target=x86_64-pc-linux -c %s -o %t-b.o -g -fdebug-types-section -flimit-debug-info -DFILE_B // RUN: ld.lld -z undefs %t-a.o %t-b.o -o %t // RUN: %lldb %t -o "target variable x" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp index 36bfdb9a8e565..83ed533eb13e3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp @@ -34,9 +34,6 @@ // CHECK-NEXT: s4 = { // CHECK-NEXT: x = ([0] = 67, [1] = 68, [2] = 99) // CHECK-NEXT: } -// CHECK-NEXT: s1 = { -// CHECK-NEXT: x = ([0] = 69, [1] = 70, [2] = 71) -// CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -47,6 +44,9 @@ // CHECK-NEXT: c2 = 'D' // CHECK-NEXT: } // CHECK-NEXT: } +// CHECK-NEXT: s1 = { +// CHECK-NEXT: x = ([0] = 69, [1] = 70, [2] = 71) +// CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: (lldb) type lookup C // CHECK-NEXT: struct C { @@ -63,7 +63,6 @@ // CHECK-NEXT: struct { // CHECK-NEXT: char c4; // CHECK-NEXT: S3 s4; -// CHECK-NEXT: S3 s1; // CHECK-NEXT: }; // CHECK-NEXT: }; // CHECK-NEXT: }; @@ -72,6 +71,7 @@ // CHECK-NEXT: char c2; // CHECK-NEXT: }; // CHECK-NEXT: }; +// CHECK-NEXT: S3 s1; // CHECK-NEXT: } diff --git a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp index beb5ae2f90256..75c59c560fad9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp +++ 
b/lldb/test/Shell/SymbolFile/NativePDB/symtab.cpp @@ -42,18 +42,18 @@ int main(int argc, char **argv) { return ns::a_function() + b.b_func(); } -// CHECK-DAG: Code {{.*}} main -// CHECK-DAG: Code {{.*}} ?b_func@?$B@F@ns@@QEBAHXZ -// CHECK-DAG: Code {{.*}} ?something@A@@QEAAXXZ -// CHECK-DAG: Code {{.*}} ??_GDyn@ns@@UEAAPEAXI@Z -// CHECK-DAG: Code {{.*}} ??2@YAPEAX_K@Z -// CHECK-DAG: Code {{.*}} ??3@YAXPEAX_K@Z -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@H@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ?a_function@ns@@YAHXZ -// CHECK-DAG: Code {{.*}} ?static_fn@C@?$B@_N@ns@@SAHXZ -// CHECK-DAG: Code {{.*}} ??1Dyn@ns@@UEAA@XZ -// CHECK-DAG: Code {{.*}} ??0Dyn@ns@@QEAA@XZ -// CHECK-DAG: Data {{.*}} ?global_int@@3HA -// CHECK-DAG: Data {{.*}} ??_7Dyn@ns@@6B@ -// CHECK-DAG: Data {{.*}} ?global_a@@3UA@@A -// CHECK-DAG: Data {{.*}} ?global_c@@3UC@?$B@_J@ns@@A +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 main +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?b_func@?$B@F@ns@@QEBAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?something@A@@QEAAXXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_GDyn@ns@@UEAAPEAXI@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??2@YAPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??3@YAXPEAX_K@Z +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@H@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?a_function@ns@@YAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?static_fn@C@?$B@_N@ns@@SAHXZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??1Dyn@ns@@UEAA@XZ +// CHECK-DAG: Code 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??0Dyn@ns@@QEAA@XZ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_int@@3HA +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ??_7Dyn@ns@@6B@ +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_a@@3UA@@A +// CHECK-DAG: Data 0x{{[0-9a-f]+}} 0x{{0*[1-9a-f][0-9a-f]*}} 0x00000000 ?global_c@@3UC@?$B@_J@ns@@A diff --git a/lldb/test/Shell/Unwind/Inputs/call-asm.c b/lldb/test/Shell/Unwind/Inputs/call-asm.c index b154c1ac1385d..778c16b36a761 100644 --- a/lldb/test/Shell/Unwind/Inputs/call-asm.c +++ b/lldb/test/Shell/Unwind/Inputs/call-asm.c @@ -1,3 +1,3 @@ -int asm_main() asm("asm_main"); - +// Explicit mangling is necessary as on Darwin an underscore is prepended to the symbol. +int asm_main() __asm("asm_main"); int main() { return asm_main(); } diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py index a5a7e997be044..1fa8aab92c128 100755 --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -804,7 +804,19 @@ def _get_link_command(self): args.extend(self._obj_file_names()) if sys.platform == "darwin": + # By default, macOS doesn't allow injecting the ASAN + # runtime into system processes. 
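+            # The just-built libLTO.dylib may itself be sanitized, and ld64
+            # runs as a system process, so it could fail to load it. Point
+            # the linker at the Xcode toolchain's libLTO.dylib instead,
+            # found relative to `xcrun -find clang`.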
+ system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]) + .strip() + .decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) args.extend(["-isysroot", self.apple_sdk]) + args.extend(["-Wl,-lto_library", "-Wl," + system_liblto]) + elif self.objc_gnustep_lib: args.extend(["-L", self.objc_gnustep_lib, "-lobjc"]) if sys.platform == "linux": diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 728f6347242f1..faa29d23387cc 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -250,6 +250,15 @@ def use_support_substitutions(config): "-L{}".format(config.libcxx_libs_dir), "-lc++", ] + # By default, macOS doesn't allow injecting the ASAN runtime into system processes. + if platform.system() in ["Darwin"] and config.llvm_use_sanitizer: + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) + host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto] host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3afaaa2f64c00..8df3f29a7e825 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -2853,12 +2853,6 @@ static uint64_t bits(uint64_t value, uint32_t msbit, uint32_t lsbit) { if (err.Success()) { m_flags |= eMachProcessFlagsAttached; - // Sleep a bit to let the exception get received and set our process - // status - // to stopped. 
- ::usleep(250000); - DNBLog("[LaunchAttach] (%d) Done napping after ptrace(PT_ATTACHEXC)'ing", - getpid()); DNBLogThreadedIf(LOG_PROCESS, "successfully attached to pid %d", pid); return m_pid; } else { diff --git a/lldb/tools/debugserver/source/MacOSX/MachTask.h b/lldb/tools/debugserver/source/MacOSX/MachTask.h index c4a20b80fda95..915f65a8160ee 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachTask.h +++ b/lldb/tools/debugserver/source/MacOSX/MachTask.h @@ -81,9 +81,7 @@ class MachTask { void TaskPortChanged(task_t task); task_t TaskPort() const { return m_task; } task_t TaskPortForProcessID(DNBError &err, bool force = false); - static task_t TaskPortForProcessID(pid_t pid, DNBError &err, - uint32_t num_retries = 10, - uint32_t usec_interval = 10000); + static task_t TaskPortForProcessID(pid_t pid, DNBError &err); MachProcess *Process() { return m_process; } const MachProcess *Process() const { return m_process; } diff --git a/lldb/tools/debugserver/source/MacOSX/MachTask.mm b/lldb/tools/debugserver/source/MacOSX/MachTask.mm index 21156feecba2c..e5bbab830b187 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachTask.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachTask.mm @@ -523,14 +523,15 @@ static void get_threads_profile_data(DNBProfileDataScanType scanType, //---------------------------------------------------------------------- // MachTask::TaskPortForProcessID //---------------------------------------------------------------------- -task_t MachTask::TaskPortForProcessID(pid_t pid, DNBError &err, - uint32_t num_retries, - uint32_t usec_interval) { +task_t MachTask::TaskPortForProcessID(pid_t pid, DNBError &err) { + static constexpr uint32_t k_num_retries = 10; + static constexpr uint32_t k_usec_delay = 10000; + if (pid != INVALID_NUB_PROCESS) { DNBError err; mach_port_t task_self = mach_task_self(); task_t task = TASK_NULL; - for (uint32_t i = 0; i < num_retries; i++) { + for (uint32_t i = 0; i < k_num_retries; i++) { DNBLog("[LaunchAttach] (%d) about to task_for_pid(%d)", getpid(), pid); err = ::task_for_pid(task_self, pid, &task); @@ -557,7 +558,7 @@ static void get_threads_profile_data(DNBProfileDataScanType scanType, } // Sleep a bit and try again - ::usleep(usec_interval); + ::usleep(k_usec_delay); } } return TASK_NULL; diff --git a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp index 9d0d60fdaaed9..c8dce75af05eb 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp +++ b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp @@ -14,6 +14,12 @@ #include "DNBLog.h" #include <cassert> #include <mach/mach_vm.h> +#include <mach/vm_statistics.h> + +// From <mach/vm_statistics.h>, but not on older OSs. 
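+// Defining a fallback keeps this building against SDKs that predate the
+// constant; 99 mirrors the value in current SDK headers.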
+#ifndef VM_MEMORY_SANITIZER
+#define VM_MEMORY_SANITIZER 99
+#endif
 
 MachVMRegion::MachVMRegion(task_t task)
     : m_task(task), m_addr(INVALID_NUB_ADDRESS), m_err(),
diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt
index 67956af7fe3fb..efe51506f3545 100644
--- a/lldb/tools/driver/CMakeLists.txt
+++ b/lldb/tools/driver/CMakeLists.txt
@@ -37,6 +37,9 @@ add_dependencies(lldb
 if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH)
   target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}")
 endif()
+if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME)
+  target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}")
+endif()
 
 if(LLDB_BUILD_FRAMEWORK)
   # In the build-tree, we know the exact path to the framework directory.
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index 733331f4ddac0..bebf1a70d50e9 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) {
   return error;
 }
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+#ifdef _WIN32
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
 /// Returns the full path to the lldb.exe executable.
 inline std::wstring GetPathToExecutableW() {
   // Iterate until we reach the Windows API maximum path length (32,767).
@@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() {
   return L"";
 }
 
-/// Resolve the full path of the directory defined by
+/// \brief Resolve the full path of the directory defined by
 /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL
 /// search directories.
-void AddPythonDLLToSearchPath() {
+/// \return `true` if the library was added to the search path.
+///         `false` otherwise.
+bool AddPythonDLLToSearchPath() {
   std::wstring modulePath = GetPathToExecutableW();
-  if (modulePath.empty()) {
-    llvm::errs() << "error: unable to find python.dll." << '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
 
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
 
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
 
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
 
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path, whether it is already in the search path or not.
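///
/// A rough sketch of the resulting lookup order (illustrative only, mirroring
/// the branches below):
///   1. python3xx.dll already found by SearchPathW            -> nothing to do
///   2. add <lldb.exe dir>/LLDB_PYTHON_DLL_RELATIVE_PATH      -> re-check
///   3. still not found                                       -> print an error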
+void SetupPythonRuntimeLibrary() { +#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME + if (IsPythonDLLInPath()) + return; +#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH + if (AddPythonDLLToSearchPath() && IsPythonDLLInPath()) + return; +#endif + llvm::errs() << "error: unable to find '" + << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n"; + return; +#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH) + if (!AddPythonDLLToSearchPath()) + llvm::errs() << "error: unable to find the Python runtime library.\n"; +#endif } #endif @@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) { "~/Library/Logs/DiagnosticReports/.\n"); #endif -#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH) - AddPythonDLLToSearchPath(); +#ifdef _WIN32 + SetupPythonRuntimeLibrary(); #endif // Parse arguments. diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index 7db334ca56bcf..fa940b7b73943 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -1,13 +1,11 @@ # We need to include the llvm components we depend on manually, as liblldb does # not re-export those. set(LLVM_LINK_COMPONENTS Support) -set(LLVM_TARGET_DEFINITIONS Options.td) -tablegen(LLVM Options.inc -gen-opt-parser-defs) -add_public_tablegen_target(LLDBDAPOptionsTableGen) add_lldb_library(lldbDAP Breakpoint.cpp BreakpointBase.cpp + ClientLauncher.cpp CommandPlugins.cpp DAP.cpp DAPError.cpp diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp new file mode 100644 index 0000000000000..4cac1d6346441 --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClientLauncher.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cctype>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace lldb_dap;
+
+std::optional<ClientLauncher::Client>
+ClientLauncher::GetClientFrom(llvm::StringRef str) {
+  return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower())
+      .Case("vscode", ClientLauncher::VSCode)
+      .Case("vscode-url", ClientLauncher::VSCodeURL)
+      .Default(std::nullopt);
+}
+
+std::unique_ptr<ClientLauncher>
+ClientLauncher::GetLauncher(ClientLauncher::Client client) {
+  switch (client) {
+  case ClientLauncher::VSCode:
+    return std::make_unique<VSCodeLauncher>();
+  case ClientLauncher::VSCodeURL:
+    return std::make_unique<VSCodeURLPrinter>();
+  }
+  return nullptr;
+}
+
+std::string VSCodeLauncher::URLEncode(llvm::StringRef str) {
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  for (char c : str) {
+    // Work on the raw byte value so bytes >= 0x80 (e.g. UTF-8 sequences) are
+    // neither sign-extended by utohexstr nor passed negative to isalnum.
+    const uint8_t byte = static_cast<uint8_t>(c);
+    if (std::isalnum(byte) || llvm::StringRef("-_.~").contains(c))
+      os << c;
+    else
+      os << '%' << llvm::utohexstr(byte, false, 2);
+  }
+  return os.str();
+}
+
+std::string
+VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const {
+  assert(!args.empty() && "empty launch args");
+
+  std::vector<std::string> encoded_launch_args;
+  for (llvm::StringRef arg : args)
+    encoded_launch_args.push_back(URLEncode(arg));
+
+  const std::string args_str = llvm::join(encoded_launch_args, "&args=");
+  return llvm::formatv(
+             "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}",
+             args_str)
+      .str();
+}
+
+llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) {
+  const std::string launch_url = GetLaunchURL(args);
+  const std::string command =
+      llvm::formatv("code --open-url {0}", launch_url).str();
+
+  std::system(command.c_str());
+  return llvm::Error::success();
+}
+
+llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) {
+  llvm::outs() << GetLaunchURL(args) << '\n';
+  return llvm::Error::success();
+}
diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h
new file mode 100644
index 0000000000000..780b178d2d6ef
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.h
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+#include <optional>
+#include <vector>
+
+namespace lldb_dap {
+
+class ClientLauncher {
+public:
+  enum Client {
+    VSCode,
+    VSCodeURL,
+  };
+
+  virtual ~ClientLauncher() = default;
+  virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0;
+
+  static std::optional<Client> GetClientFrom(llvm::StringRef str);
+  static std::unique_ptr<ClientLauncher> GetLauncher(Client client);
+};
+
+class VSCodeLauncher : public ClientLauncher {
+public:
+  using ClientLauncher::ClientLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+
+  std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const;
+  static std::string URLEncode(llvm::StringRef str);
+};
+
+class VSCodeURLPrinter : public VSCodeLauncher {
+  using VSCodeLauncher::VSCodeLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+};
+
+} // namespace lldb_dap
+
+#endif // LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index f009a902f79e7..d4203a2f00983 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -657,18 +657,20 @@ std::optional<protocol::Source> DAP::ResolveSource(const lldb::SBFrame &frame) {
   if (!frame.IsValid())
     return std::nullopt;
 
-  const lldb::SBAddress frame_pc = frame.GetPCAddress();
-  if (DisplayAssemblySource(debugger, frame_pc))
+  const lldb::SBLineEntry frame_line_entry = frame.GetLineEntry();
+  if (DisplayAssemblySource(debugger, frame_line_entry)) {
+    const lldb::SBAddress frame_pc = frame.GetPCAddress();
     return ResolveAssemblySource(frame_pc);
+  }
 
-  return CreateSource(frame.GetLineEntry().GetFileSpec());
+  return CreateSource(frame_line_entry.GetFileSpec());
 }
 
 std::optional<protocol::Source> DAP::ResolveSource(lldb::SBAddress address) {
-  if (DisplayAssemblySource(debugger, address))
+  lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, address);
+  if (DisplayAssemblySource(debugger, line_entry))
     return ResolveAssemblySource(address);
 
-  lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, address);
   if (!line_entry.IsValid())
     return std::nullopt;
 
@@ -1317,7 +1319,7 @@ void DAP::ProgressEventThread() {
   lldb::SBEvent event;
   bool done = false;
   while (!done) {
-    if (listener.WaitForEvent(1, event)) {
+    if (listener.WaitForEvent(UINT32_MAX, event)) {
       const auto event_mask = event.GetType();
       if (event.BroadcasterMatchesRef(broadcaster)) {
         if (event_mask & eBroadcastBitStopProgressThread) {
@@ -1375,7 +1377,6 @@ void DAP::ProgressEventThread() {
 // is required.
 void DAP::EventThread() {
   llvm::set_thread_name("lldb.DAP.client."
+ m_client_name + ".event_handler"); - lldb::SBEvent event; lldb::SBListener listener = debugger.GetListener(); broadcaster.AddListener(listener, eBroadcastBitStopEventThread); debugger.GetBroadcaster().AddListener( @@ -1386,169 +1387,176 @@ void DAP::EventThread() { debugger, lldb::SBThread::GetBroadcasterClassName(), lldb::SBThread::eBroadcastBitStackChanged); + lldb::SBEvent event; bool done = false; while (!done) { - if (listener.WaitForEvent(1, event)) { - const auto event_mask = event.GetType(); - if (lldb::SBProcess::EventIsProcessEvent(event)) { - lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); - if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { - auto state = lldb::SBProcess::GetStateFromEvent(event); - switch (state) { - case lldb::eStateConnected: - case lldb::eStateDetached: - case lldb::eStateInvalid: - case lldb::eStateUnloaded: - break; - case lldb::eStateAttaching: - case lldb::eStateCrashed: - case lldb::eStateLaunching: - case lldb::eStateStopped: - case lldb::eStateSuspended: - // Only report a stopped event if the process was not - // automatically restarted. - if (!lldb::SBProcess::GetRestartedFromEvent(event)) { - SendStdOutStdErr(*this, process); - if (llvm::Error err = SendThreadStoppedEvent(*this)) - DAP_LOG_ERROR(log, std::move(err), - "({1}) reporting thread stopped: {0}", - m_client_name); - } - break; - case lldb::eStateRunning: - case lldb::eStateStepping: - WillContinue(); - SendContinuedEvent(*this); - break; - case lldb::eStateExited: - lldb::SBStream stream; - process.GetStatus(stream); - SendOutput(OutputType::Console, stream.GetData()); - - // When restarting, we can get an "exited" event for the process we - // just killed with the old PID, or even with no PID. In that case - // we don't have to terminate the session. - if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID || - process.GetProcessID() == restarting_process_id) { - restarting_process_id = LLDB_INVALID_PROCESS_ID; - } else { - // Run any exit LLDB commands the user specified in the - // launch.json - RunExitCommands(); - SendProcessExitedEvent(*this, process); - SendTerminatedEvent(); - done = true; - } - break; - } - } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) || - (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) { - SendStdOutStdErr(*this, process); - } - } else if (lldb::SBTarget::EventIsTargetEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded || - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded || - event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) { - const uint32_t num_modules = - lldb::SBTarget::GetNumModulesFromEvent(event); - const bool remove_module = - event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded; - - // NOTE: Both mutexes must be acquired to prevent deadlock when - // handling `modules_request`, which also requires both locks. 
- lldb::SBMutex api_mutex = GetAPIMutex(); - const std::scoped_lock<lldb::SBMutex, std::mutex> guard( - api_mutex, modules_mutex); - for (uint32_t i = 0; i < num_modules; ++i) { - lldb::SBModule module = - lldb::SBTarget::GetModuleAtIndexFromEvent(i, event); - - std::optional<protocol::Module> p_module = - CreateModule(target, module, remove_module); - if (!p_module) - continue; - - llvm::StringRef module_id = p_module->id; - - const bool module_exists = modules.contains(module_id); - if (remove_module && module_exists) { - modules.erase(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonRemoved}}); - } else if (module_exists) { - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonChanged}}); - } else if (!remove_module) { - modules.insert(module_id); - Send(protocol::Event{ - "module", ModuleEventBody{std::move(p_module).value(), - ModuleEventBody::eReasonNew}}); - } - } - } - } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { - if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) { - auto event_type = - lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event); - auto bp = Breakpoint( - *this, lldb::SBBreakpoint::GetBreakpointFromEvent(event)); - // If the breakpoint was set through DAP, it will have the - // BreakpointBase::kDAPBreakpointLabel. Regardless of whether - // locations were added, removed, or resolved, the breakpoint isn't - // going away and the reason is always "changed". - if ((event_type & lldb::eBreakpointEventTypeLocationsAdded || - event_type & lldb::eBreakpointEventTypeLocationsRemoved || - event_type & lldb::eBreakpointEventTypeLocationsResolved) && - bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) { - // As the DAP client already knows the path of this breakpoint, we - // don't need to send it back as part of the "changed" event. This - // avoids sending paths that should be source mapped. Note that - // CreateBreakpoint doesn't apply source mapping and certain - // implementation ignore the source part of this event anyway. - protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); - - // "source" is not needed here, unless we add adapter data to be - // saved by the client. 
- if (protocol_bp.source && !protocol_bp.source->adapterData) - protocol_bp.source = std::nullopt; - - llvm::json::Object body; - body.try_emplace("breakpoint", protocol_bp); - body.try_emplace("reason", "changed"); - - llvm::json::Object bp_event = CreateEventObject("breakpoint"); - bp_event.try_emplace("body", std::move(body)); - - SendJSON(llvm::json::Value(std::move(bp_event))); - } - } + if (!listener.WaitForEvent(UINT32_MAX, event)) + continue; - } else if (lldb::SBThread::EventIsThreadEvent(event)) { - HandleThreadEvent(event); - } else if (event_mask & lldb::eBroadcastBitError || - event_mask & lldb::eBroadcastBitWarning) { - lldb::SBStructuredData data = - lldb::SBDebugger::GetDiagnosticFromEvent(event); - if (!data.IsValid()) - continue; - std::string type = GetStringValue(data.GetValueForKey("type")); - std::string message = GetStringValue(data.GetValueForKey("message")); - SendOutput(OutputType::Important, - llvm::formatv("{0}: {1}", type, message).str()); - } else if (event.BroadcasterMatchesRef(broadcaster)) { - if (event_mask & eBroadcastBitStopEventThread) { - done = true; - } + const uint32_t event_mask = event.GetType(); + if (lldb::SBProcess::EventIsProcessEvent(event)) { + HandleProcessEvent(event, /*&process_exited=*/done); + } else if (lldb::SBTarget::EventIsTargetEvent(event)) { + HandleTargetEvent(event); + } else if (lldb::SBBreakpoint::EventIsBreakpointEvent(event)) { + HandleBreakpointEvent(event); + } else if (lldb::SBThread::EventIsThreadEvent(event)) { + HandleThreadEvent(event); + } else if (event_mask & lldb::eBroadcastBitError || + event_mask & lldb::eBroadcastBitWarning) { + HandleDiagnosticEvent(event); + } else if (event.BroadcasterMatchesRef(broadcaster)) { + if (event_mask & eBroadcastBitStopEventThread) { + done = true; } } } } +void DAP::HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited) { + lldb::SBProcess process = lldb::SBProcess::GetProcessFromEvent(event); + const uint32_t event_mask = event.GetType(); + if (event_mask & lldb::SBProcess::eBroadcastBitStateChanged) { + auto state = lldb::SBProcess::GetStateFromEvent(event); + switch (state) { + case lldb::eStateConnected: + case lldb::eStateDetached: + case lldb::eStateInvalid: + case lldb::eStateUnloaded: + break; + case lldb::eStateAttaching: + case lldb::eStateCrashed: + case lldb::eStateLaunching: + case lldb::eStateStopped: + case lldb::eStateSuspended: + // Only report a stopped event if the process was not + // automatically restarted. + if (!lldb::SBProcess::GetRestartedFromEvent(event)) { + SendStdOutStdErr(*this, process); + if (llvm::Error err = SendThreadStoppedEvent(*this)) + DAP_LOG_ERROR(log, std::move(err), + "({1}) reporting thread stopped: {0}", m_client_name); + } + break; + case lldb::eStateRunning: + case lldb::eStateStepping: + WillContinue(); + SendContinuedEvent(*this); + break; + case lldb::eStateExited: + lldb::SBStream stream; + process.GetStatus(stream); + SendOutput(OutputType::Console, stream.GetData()); + + // When restarting, we can get an "exited" event for the process we + // just killed with the old PID, or even with no PID. In that case + // we don't have to terminate the session. 
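      // (Illustrative sequence, assuming a DAP restart request: the adapter
      // records the PID it is about to kill in `restarting_process_id`,
      // relaunches, and the stale eStateExited event for that PID is then
      // swallowed here instead of tearing the session down.)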
+      if (process.GetProcessID() == LLDB_INVALID_PROCESS_ID ||
+          process.GetProcessID() == restarting_process_id) {
+        restarting_process_id = LLDB_INVALID_PROCESS_ID;
+      } else {
+        // Run any exit LLDB commands the user specified in the
+        // launch.json
+        RunExitCommands();
+        SendProcessExitedEvent(*this, process);
+        SendTerminatedEvent();
+        process_exited = true;
+      }
+      break;
+    }
+  } else if ((event_mask & lldb::SBProcess::eBroadcastBitSTDOUT) ||
+             (event_mask & lldb::SBProcess::eBroadcastBitSTDERR)) {
+    SendStdOutStdErr(*this, process);
+  }
+}
+
+void DAP::HandleTargetEvent(const lldb::SBEvent &event) {
+  const uint32_t event_mask = event.GetType();
+  if (event_mask & lldb::SBTarget::eBroadcastBitModulesLoaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitSymbolsLoaded ||
+      event_mask & lldb::SBTarget::eBroadcastBitSymbolsChanged) {
+    const uint32_t num_modules = lldb::SBTarget::GetNumModulesFromEvent(event);
+    const bool remove_module =
+        event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded;
+
+    // NOTE: Both mutexes must be acquired to prevent deadlock when
+    // handling `modules_request`, which also requires both locks.
+    lldb::SBMutex api_mutex = GetAPIMutex();
+    const std::scoped_lock<lldb::SBMutex, std::mutex> guard(api_mutex,
+                                                            modules_mutex);
+    for (uint32_t i = 0; i < num_modules; ++i) {
+      lldb::SBModule module =
+          lldb::SBTarget::GetModuleAtIndexFromEvent(i, event);
+
+      std::optional<protocol::Module> p_module =
+          CreateModule(target, module, remove_module);
+      if (!p_module)
+        continue;
+
+      const llvm::StringRef module_id = p_module->id;
+
+      const bool module_exists = modules.contains(module_id);
+      if (remove_module && module_exists) {
+        modules.erase(module_id);
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonRemoved}});
+      } else if (module_exists) {
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonChanged}});
+      } else if (!remove_module) {
+        modules.insert(module_id);
+        Send(protocol::Event{"module",
+                             ModuleEventBody{std::move(p_module).value(),
+                                             ModuleEventBody::eReasonNew}});
+      }
+    }
+  }
+}
+
+void DAP::HandleBreakpointEvent(const lldb::SBEvent &event) {
+  const uint32_t event_mask = event.GetType();
+  if (!(event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged))
+    return;
+
+  auto event_type = lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event);
+  auto bp =
+      Breakpoint(*this, lldb::SBBreakpoint::GetBreakpointFromEvent(event));
+  // If the breakpoint was set through DAP, it will have the
+  // BreakpointBase::kDAPBreakpointLabel. Regardless of whether
+  // locations were added, removed, or resolved, the breakpoint isn't
+  // going away and the reason is always "changed".
+  if ((event_type & lldb::eBreakpointEventTypeLocationsAdded ||
+       event_type & lldb::eBreakpointEventTypeLocationsRemoved ||
+       event_type & lldb::eBreakpointEventTypeLocationsResolved) &&
+      bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) {
+    // As the DAP client already knows the path of this breakpoint, we
+    // don't need to send it back as part of the "changed" event. This
+    // avoids sending paths that should be source mapped. Note that
+    // CreateBreakpoint doesn't apply source mapping and certain
+    // implementations ignore the source part of this event anyway.
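    // (For orientation, the event assembled below has roughly this shape; the
    // breakpoint fields shown are illustrative:
    //   {"event":"breakpoint",
    //    "body":{"reason":"changed",
    //            "breakpoint":{"id":1,"verified":true,"line":42}}})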
+ protocol::Breakpoint protocol_bp = bp.ToProtocolBreakpoint(); + + // "source" is not needed here, unless we add adapter data to be + // saved by the client. + if (protocol_bp.source && !protocol_bp.source->adapterData) + protocol_bp.source = std::nullopt; + + llvm::json::Object body; + body.try_emplace("breakpoint", protocol_bp); + body.try_emplace("reason", "changed"); + + llvm::json::Object bp_event = CreateEventObject("breakpoint"); + bp_event.try_emplace("body", std::move(body)); + + SendJSON(llvm::json::Value(std::move(bp_event))); + } +} + void DAP::HandleThreadEvent(const lldb::SBEvent &event) { - uint32_t event_type = event.GetType(); + const uint32_t event_type = event.GetType(); if (event_type & lldb::SBThread::eBroadcastBitStackChanged) { const lldb::SBThread evt_thread = lldb::SBThread::GetThreadFromEvent(event); @@ -1557,6 +1565,18 @@ void DAP::HandleThreadEvent(const lldb::SBEvent &event) { } } +void DAP::HandleDiagnosticEvent(const lldb::SBEvent &event) { + const lldb::SBStructuredData data = + lldb::SBDebugger::GetDiagnosticFromEvent(event); + if (!data.IsValid()) + return; + + std::string type = GetStringValue(data.GetValueForKey("type")); + std::string message = GetStringValue(data.GetValueForKey("message")); + SendOutput(OutputType::Important, + llvm::formatv("{0}: {1}", type, message).str()); +} + std::vector<protocol::Breakpoint> DAP::SetSourceBreakpoints( const protocol::Source &source, const std::optional<std::vector<protocol::SourceBreakpoint>> &breakpoints) { diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index b4f111e4e720c..5d40341329f34 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -454,7 +454,11 @@ struct DAP final : public DAPTransport::MessageHandler { /// Event threads. /// @{ void EventThread(); + void HandleProcessEvent(const lldb::SBEvent &event, bool &process_exited); + void HandleTargetEvent(const lldb::SBEvent &event); + void HandleBreakpointEvent(const lldb::SBEvent &event); void HandleThreadEvent(const lldb::SBEvent &event); + void HandleDiagnosticEvent(const lldb::SBEvent &event); void ProgressEventThread(); std::thread event_thread; diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index c5d5f2bb59b42..12d9e21c52ab3 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -176,7 +176,7 @@ llvm::Error SendThreadStoppedEvent(DAP &dap, bool on_entry) { llvm::DenseSet<lldb::tid_t> old_thread_ids; old_thread_ids.swap(dap.thread_ids); - uint32_t stop_id = process.GetStopID(); + uint32_t stop_id = on_entry ? 
0 : process.GetStopID(); const uint32_t num_threads = process.GetNumThreads(); // First make a pass through the threads to see if the focused thread diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp index c1c2adb32a510..ddf55e6fb382d 100644 --- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp @@ -7,168 +7,75 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "EventHelper.h" -#include "JSONUtils.h" +#include "DAPError.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" #include "lldb/API/SBStream.h" +using namespace lldb_dap::protocol; + namespace lldb_dap { -// "ExceptionInfoRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Retrieves the details of the exception that -// caused this event to be raised. Clients should only call this request if -// the corresponding capability `supportsExceptionInfoRequest` is true.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "exceptionInfo" ] -// }, -// "arguments": { -// "$ref": "#/definitions/ExceptionInfoArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "ExceptionInfoArguments": { -// "type": "object", -// "description": "Arguments for `exceptionInfo` request.", -// "properties": { -// "threadId": { -// "type": "integer", -// "description": "Thread for which exception information should be -// retrieved." -// } -// }, -// "required": [ "threadId" ] -// }, -// "ExceptionInfoResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to `exceptionInfo` request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "exceptionId": { -// "type": "string", -// "description": "ID of the exception that was thrown." -// }, -// "description": { -// "type": "string", -// "description": "Descriptive text for the exception." -// }, -// "breakMode": { -// "$ref": "#/definitions/ExceptionBreakMode", -// "description": "Mode that caused the exception notification to -// be raised." -// }, -// "details": { -// "$ref": "#/definitions/ExceptionDetails", -// "description": "Detailed information about the exception." -// } -// }, -// "required": [ "exceptionId", "breakMode" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -// "ExceptionDetails": { -// "type": "object", -// "description": "Detailed information about an exception that has -// occurred.", "properties": { -// "message": { -// "type": "string", -// "description": "Message contained in the exception." -// }, -// "typeName": { -// "type": "string", -// "description": "Short type name of the exception object." -// }, -// "fullTypeName": { -// "type": "string", -// "description": "Fully-qualified type name of the exception object." -// }, -// "evaluateName": { -// "type": "string", -// "description": "An expression that can be evaluated in the current -// scope to obtain the exception object." -// }, -// "stackTrace": { -// "type": "string", -// "description": "Stack trace at the time the exception was thrown." -// }, -// "innerException": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/ExceptionDetails" -// }, -// "description": "Details of the exception contained by this exception, -// if any." 
-// } -// } -// }, -void ExceptionInfoRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - const auto *arguments = request.getObject("arguments"); - llvm::json::Object body; - lldb::SBThread thread = dap.GetLLDBThread(*arguments); - if (thread.IsValid()) { - auto stopReason = thread.GetStopReason(); - if (stopReason == lldb::eStopReasonSignal) - body.try_emplace("exceptionId", "signal"); - else if (stopReason == lldb::eStopReasonBreakpoint) { - ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); - if (exc_bp) { - EmplaceSafeString(body, "exceptionId", exc_bp->GetFilter()); - EmplaceSafeString(body, "description", exc_bp->GetLabel()); - } else { - body.try_emplace("exceptionId", "exception"); - } +/// Retrieves the details of the exception that caused this event to be raised. +/// +/// Clients should only call this request if the corresponding capability +/// `supportsExceptionInfoRequest` is true. +llvm::Expected<ExceptionInfoResponseBody> +ExceptionInfoRequestHandler::Run(const ExceptionInfoArguments &args) const { + + lldb::SBThread thread = dap.GetLLDBThread(args.threadId); + if (!thread.IsValid()) + return llvm::make_error<DAPError>( + llvm::formatv("Invalid thread id: {}", args.threadId).str()); + + ExceptionInfoResponseBody response; + response.breakMode = eExceptionBreakModeAlways; + const lldb::StopReason stop_reason = thread.GetStopReason(); + switch (stop_reason) { + case lldb::eStopReasonSignal: + response.exceptionId = "signal"; + break; + case lldb::eStopReasonBreakpoint: { + const ExceptionBreakpoint *exc_bp = + dap.GetExceptionBPFromStopReason(thread); + if (exc_bp) { + response.exceptionId = exc_bp->GetFilter(); + response.description = exc_bp->GetLabel(); } else { - body.try_emplace("exceptionId", "exception"); + response.exceptionId = "exception"; } - if (!ObjectContainsKey(body, "description")) { - char description[1024]; - if (thread.GetStopDescription(description, sizeof(description))) { - EmplaceSafeString(body, "description", description); - } + } break; + default: + response.exceptionId = "exception"; + } + + lldb::SBStream stream; + if (response.description.empty()) { + if (thread.GetStopDescription(stream)) { + response.description = {stream.GetData(), stream.GetSize()}; } - body.try_emplace("breakMode", "always"); - auto exception = thread.GetCurrentException(); - if (exception.IsValid()) { - llvm::json::Object details; - lldb::SBStream stream; - if (exception.GetDescription(stream)) { - EmplaceSafeString(details, "message", stream.GetData()); - } + } - auto exceptionBacktrace = thread.GetCurrentExceptionBacktrace(); - if (exceptionBacktrace.IsValid()) { - lldb::SBStream stream; - exceptionBacktrace.GetDescription(stream); - for (uint32_t i = 0; i < exceptionBacktrace.GetNumFrames(); i++) { - lldb::SBFrame frame = exceptionBacktrace.GetFrameAtIndex(i); - frame.GetDescription(stream); - } - EmplaceSafeString(details, "stackTrace", stream.GetData()); - } + if (lldb::SBValue exception = thread.GetCurrentException()) { + stream.Clear(); + response.details = ExceptionDetails{}; + if (exception.GetDescription(stream)) { + response.details->message = {stream.GetData(), stream.GetSize()}; + } + + if (lldb::SBThread exception_backtrace = + thread.GetCurrentExceptionBacktrace()) { + stream.Clear(); + exception_backtrace.GetDescription(stream); - body.try_emplace("details", std::move(details)); + for (uint32_t idx = 0; idx < exception_backtrace.GetNumFrames(); idx++) 
{ + lldb::SBFrame frame = exception_backtrace.GetFrameAtIndex(idx); + frame.GetDescription(stream); + } + response.details->stackTrace = {stream.GetData(), stream.GetSize()}; } - // auto excInfoCount = thread.GetStopReasonDataCount(); - // for (auto i=0; i<excInfoCount; ++i) { - // uint64_t exc_data = thread.GetStopReasonDataAtIndex(i); - // } - } else { - response["success"] = llvm::json::Value(false); } - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + return response; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index 977a247996750..bc22133d92453 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -302,14 +302,18 @@ class EvaluateRequestHandler : public LegacyRequestHandler { } }; -class ExceptionInfoRequestHandler : public LegacyRequestHandler { +class ExceptionInfoRequestHandler final + : public RequestHandler< + protocol::ExceptionInfoArguments, + llvm::Expected<protocol::ExceptionInfoResponseBody>> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "exceptionInfo"; } FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureExceptionInfoRequest}; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected<protocol::ExceptionInfoResponseBody> + Run(const protocol::ExceptionInfoArguments &args) const override; }; class InitializeRequestHandler diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 2780a5b7748e8..1a3a6701b194d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -711,7 +711,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread, break; } if (stop_id == 0) - body.try_emplace("reason", "entry"); + body["reason"] = "entry"; const lldb::tid_t tid = thread.GetThreadID(); body.try_emplace("threadId", (int64_t)tid); // If no description has been set, then set it to the default thread stopped diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index b9393356b4e01..44ae79f8b9f43 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -625,4 +625,22 @@ llvm::json::Value toJSON(const ModuleSymbolsResponseBody &DGMSR) { return result; } +bool fromJSON(const json::Value &Params, ExceptionInfoArguments &Args, + json::Path Path) { + json::ObjectMapper O(Params, Path); + return O && O.map("threadId", Args.threadId); +} + +json::Value toJSON(const ExceptionInfoResponseBody &ERB) { + json::Object result{{"exceptionId", ERB.exceptionId}, + {"breakMode", ERB.breakMode}}; + + if (!ERB.description.empty()) + result.insert({"description", ERB.description}); + if (ERB.details.has_value()) + result.insert({"details", *ERB.details}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index a85a68b87014c..b894f2b4ed44d 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1039,6 +1039,28 @@ struct ModuleSymbolsResponseBody { }; llvm::json::Value toJSON(const ModuleSymbolsResponseBody &); +struct ExceptionInfoArguments { + /// Thread for which 
exception information should be retrieved. + lldb::tid_t threadId = LLDB_INVALID_THREAD_ID; +}; +bool fromJSON(const llvm::json::Value &, ExceptionInfoArguments &, + llvm::json::Path); + +struct ExceptionInfoResponseBody { + /// ID of the exception that was thrown. + std::string exceptionId; + + /// Descriptive text for the exception. + std::string description; + + /// Mode that caused the exception notification to be raised. + ExceptionBreakMode breakMode = eExceptionBreakModeNever; + + /// Detailed information about the exception. + std::optional<ExceptionDetails> details; +}; +llvm::json::Value toJSON(const ExceptionInfoResponseBody &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index dc8edaadcd9bb..95007013742a0 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -1136,4 +1136,37 @@ bool fromJSON(const json::Value &Param, Variable &V, json::Path Path) { Path, /*required=*/false); } +json::Value toJSON(const ExceptionBreakMode Mode) { + switch (Mode) { + case eExceptionBreakModeNever: + return "never"; + case eExceptionBreakModeAlways: + return "always"; + case eExceptionBreakModeUnhandled: + return "unhandled"; + case eExceptionBreakModeUserUnhandled: + return "userUnhandled"; + } + llvm_unreachable("unhandled exception breakMode."); +} + +json::Value toJSON(const ExceptionDetails &ED) { + json::Object result; + + if (!ED.message.empty()) + result.insert({"message", ED.message}); + if (!ED.typeName.empty()) + result.insert({"typeName", ED.typeName}); + if (!ED.fullTypeName.empty()) + result.insert({"fullTypeName", ED.fullTypeName}); + if (!ED.evaluateName.empty()) + result.insert({"evaluateName", ED.evaluateName}); + if (!ED.stackTrace.empty()) + result.insert({"stackTrace", ED.stackTrace}); + if (!ED.innerException.empty()) + result.insert({"innerException", ED.innerException}); + + return result; +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 7077df90a85b5..6d85c74377bd3 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -1007,6 +1007,36 @@ struct Variable { llvm::json::Value toJSON(const Variable &); bool fromJSON(const llvm::json::Value &, Variable &, llvm::json::Path); +enum ExceptionBreakMode : unsigned { + eExceptionBreakModeNever, + eExceptionBreakModeAlways, + eExceptionBreakModeUnhandled, + eExceptionBreakModeUserUnhandled, +}; +llvm::json::Value toJSON(ExceptionBreakMode); + +struct ExceptionDetails { + /// Message contained in the exception. + std::string message; + + /// Short type name of the exception object. + std::string typeName; + + /// Fully-qualified type name of the exception object. + std::string fullTypeName; + + /// An expression that can be evaluated in the current scope to obtain the + /// exception object. + std::string evaluateName; + + /// Stack trace at the time the exception was thrown. + std::string stackTrace; + + /// Details of the exception contained by this exception, if any. 
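  /// (Given the toJSON overload earlier in this patch, a populated value
  /// serializes roughly as {"message":"...","innerException":[{...}]};
  /// members left empty are omitted entirely. Values are illustrative.)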
+ std::vector<ExceptionDetails> innerException; +}; +llvm::json::Value toJSON(const ExceptionDetails &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp index 868c67ca72986..acf31b03f7af0 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.cpp +++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp @@ -27,7 +27,7 @@ using namespace lldb_dap::protocol; namespace lldb_dap { static bool ShouldDisplayAssemblySource( - lldb::SBAddress address, + lldb::SBLineEntry line_entry, lldb::StopDisassemblyType stop_disassembly_display) { if (stop_disassembly_display == lldb::eStopDisassemblyTypeNever) return false; @@ -37,7 +37,6 @@ static bool ShouldDisplayAssemblySource( // A line entry of 0 indicates the line is compiler generated i.e. no source // file is associated with the frame. - auto line_entry = address.GetLineEntry(); auto file_spec = line_entry.GetFileSpec(); if (!file_spec.IsValid() || line_entry.GetLine() == 0 || line_entry.GetLine() == LLDB_INVALID_LINE_NUMBER) @@ -174,10 +173,10 @@ bool IsAssemblySource(const protocol::Source &source) { } bool DisplayAssemblySource(lldb::SBDebugger &debugger, - lldb::SBAddress address) { + lldb::SBLineEntry line_entry) { const lldb::StopDisassemblyType stop_disassembly_display = GetStopDisassemblyDisplay(debugger); - return ShouldDisplayAssemblySource(address, stop_disassembly_display); + return ShouldDisplayAssemblySource(line_entry, stop_disassembly_display); } std::string GetLoadAddressString(const lldb::addr_t addr) { diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h index a1f7ae0661914..f4d576ba9f608 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.h +++ b/lldb/tools/lldb-dap/ProtocolUtils.h @@ -53,7 +53,8 @@ std::optional<protocol::Source> CreateSource(const lldb::SBFileSpec &file); /// Checks if the given source is for assembly code. bool IsAssemblySource(const protocol::Source &source); -bool DisplayAssemblySource(lldb::SBDebugger &debugger, lldb::SBAddress address); +bool DisplayAssemblySource(lldb::SBDebugger &debugger, + lldb::SBLineEntry line_entry); /// Get the address as a 16-digit hex string, e.g. "0x0000000000012345" std::string GetLoadAddressString(const lldb::addr_t addr); diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts index 7060638a94864..433d48fab9d85 100644 --- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts +++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts @@ -6,6 +6,7 @@ import * as fs from "node:fs/promises"; import { ConfigureButton, OpenSettingsButton } from "./ui/show-error-message"; import { ErrorWithNotification } from "./ui/error-with-notification"; import { LogFilePathProvider, LogType } from "./logging"; +import { expandUser } from "./utils"; const exec = util.promisify(child_process.execFile); @@ -116,8 +117,9 @@ async function getDAPExecutable( configuration: vscode.DebugConfiguration, ): Promise<string> { // Check if the executable was provided in the launch configuration. - const launchConfigPath = configuration["debugAdapterExecutable"]; + let launchConfigPath = configuration["debugAdapterExecutable"]; if (typeof launchConfigPath === "string" && launchConfigPath.length !== 0) { + launchConfigPath = expandUser(launchConfigPath); if (!(await isExecutable(launchConfigPath))) { throw new ErrorWithNotification( `Debug adapter path "${launchConfigPath}" is not a valid file. 
The path comes from your launch configuration.`,
@@ -129,7 +131,7 @@ async function getDAPExecutable(
 
   // Check if the executable was provided in the extension's configuration.
   const config = vscode.workspace.getConfiguration("lldb-dap", workspaceFolder);
-  const configPath = config.get<string>("executable-path");
+  const configPath = expandUser(config.get<string>("executable-path") ?? "");
   if (configPath && configPath.length !== 0) {
     if (!(await isExecutable(configPath))) {
       throw new ErrorWithNotification(
diff --git a/lldb/tools/lldb-dap/src-ts/utils.ts b/lldb/tools/lldb-dap/src-ts/utils.ts
new file mode 100644
index 0000000000000..efebe0b0f42ba
--- /dev/null
+++ b/lldb/tools/lldb-dap/src-ts/utils.ts
@@ -0,0 +1,41 @@
+import * as os from "os";
+import * as path from "path";
+
+/**
+ * Expands the character `~` to the user's home directory
+ */
+export function expandUser(file_path: string): string {
+  if (os.platform() == "win32") {
+    return file_path;
+  }
+
+  if (!file_path) {
+    return "";
+  }
+
+  if (!file_path.startsWith("~")) {
+    return file_path;
+  }
+
+  const path_len = file_path.length;
+  if (path_len == 1) {
+    return os.homedir();
+  }
+
+  if (file_path.charAt(1) == path.sep) {
+    return path.join(os.homedir(), file_path.substring(1));
+  }
+
+  const sep_index = file_path.indexOf(path.sep);
+  const user_name_end = sep_index == -1 ? file_path.length : sep_index;
+  const user_name = file_path.substring(1, user_name_end);
+  try {
+    if (user_name == os.userInfo().username) {
+      return path.join(os.homedir(), file_path.substring(user_name_end));
+    }
+  } catch (err) {
+    return file_path;
+  }
+
+  return file_path;
+}
diff --git a/lldb/tools/lldb-dap/tool/CMakeLists.txt b/lldb/tools/lldb-dap/tool/CMakeLists.txt
index b39a4ed9c40e7..5335d25c5d450 100644
--- a/lldb/tools/lldb-dap/tool/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/tool/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS Options.td)
+tablegen(LLVM Options.inc -gen-opt-parser-defs)
+add_public_tablegen_target(LLDBDAPOptionsTableGen)
+
 add_lldb_tool(lldb-dap
   lldb-dap.cpp
 
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/tool/Options.td
similarity index 94%
rename from lldb/tools/lldb-dap/Options.td
rename to lldb/tools/lldb-dap/tool/Options.td
index 5e9dd7a1d6419..339a64fed6c32 100644
--- a/lldb/tools/lldb-dap/Options.td
+++ b/lldb/tools/lldb-dap/tool/Options.td
@@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">,
     "timeout is reached, the server will be closed and the process will exit. "
     "Not specifying this argument or specifying non-positive values will "
     "cause the server to wait for new connections indefinitely.">;
+
+def client
+    : S<"client">,
+      MetaVarName<"<client>">,
+      HelpText<
+          "Use lldb-dap as a launcher for a curated set of DAP clients.">;
+
+def REM : R<["--"], "">;
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 45caa1a81059b..f10ed12344cbd 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ClientLauncher.h"
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
@@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) {
     debugger to attach to the process.
 
     lldb-dap -g
+
+  You can also use lldb-dap to launch a supported client, for example the
+  LLDB-DAP Visual Studio Code extension.
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/Core/CMakeLists.txt b/lldb/unittests/Core/CMakeLists.txt index 6e609a63ad9b6..f0c9a9a9d5056 100644 --- a/lldb/unittests/Core/CMakeLists.txt +++ b/lldb/unittests/Core/CMakeLists.txt @@ -7,6 +7,7 @@ add_lldb_unittest(LLDBCoreTests DumpRegisterInfoTest.cpp FormatEntityTest.cpp MangledTest.cpp + ModuleListTest.cpp ModuleSpecTest.cpp PluginManagerTest.cpp ProgressReportTest.cpp diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp index cbc0c5d951b99..706e67801e01a 100644 --- a/lldb/unittests/Core/MangledTest.cpp +++ b/lldb/unittests/Core/MangledTest.cpp @@ -636,6 +636,16 @@ DemanglingPartsTestCase g_demangling_parts_test_cases[] = { /*.basename=*/"operator()", /*.scope=*/"dyld4::Loader::runInitializersBottomUpPlusUpwardLinks(dyld4::RuntimeState&) const::$_0::", /*.qualifiers=*/" const", + }, + {"_Z4funcILN3foo4EnumE1EEvv", + { + /*.BasenameRange=*/{5, 9}, /*.TemplateArgumentsRange=*/{9, 23}, /*.ScopeRange=*/{5, 5}, + /*.ArgumentsRange=*/{23, 25}, /*.QualifiersRange=*/{25, 25}, /*.NameQualifiersRange=*/{0, 0}, + /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0} + }, + /*.basename=*/"func", + /*.scope=*/"", + /*.qualifiers=*/"", } // clang-format on }; diff --git a/lldb/unittests/Core/ModuleListTest.cpp b/lldb/unittests/Core/ModuleListTest.cpp new file mode 100644 index 0000000000000..3c70b0a4b21b8 --- /dev/null +++ b/lldb/unittests/Core/ModuleListTest.cpp @@ -0,0 +1,178 @@ +//===-- ModuleListTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Core/ModuleList.h" +#include "TestingSupport/SubsystemRAII.h" +#include "TestingSupport/TestUtilities.h" +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/UUID.h" + +#include "Plugins/ObjectFile/ELF/ObjectFileELF.h" + +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +// Test that when we already have a module in the shared_module_list with a +// specific UUID, the next call to GetSharedModule with a module_spec with the +// same UUID should return the existing module instead of creating a new one. +TEST(ModuleListTest, GetSharedModuleReusesExistingModuleWithSameUUID) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // First, let's verify that calling GetSharedModule twice with the same + // module_spec returns the same module pointer + + ModuleSP first_module; + bool first_did_create = false; + Status error_first = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module, + nullptr, &first_did_create, false); + + // Second call with the same spec + ModuleSP second_module; + bool second_did_create = false; + Status error_second = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), second_module, + nullptr, &second_did_create, false); + + if (error_first.Success() && error_second.Success()) { + // If both succeeded, verify they're the same module + EXPECT_EQ(first_module.get(), second_module.get()) + << "GetSharedModule should return the same module for the same spec"; + EXPECT_TRUE(first_did_create) << "First call should create the module"; + EXPECT_FALSE(second_did_create) + << "Second call should reuse the existing module"; + } +} + +// Test that UUID-based lookup finds existing modules +TEST(ModuleListTest, FindSharedModuleByUUID) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... 
+)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // Create and add a module to the shared module list using the moduleSpec() + ModuleSP created_module; + bool did_create = false; + Status error = ModuleList::GetSharedModule( + ExpectedFile->moduleSpec(), created_module, nullptr, &did_create, false); + + if (error.Success() && created_module) { + // Get the UUID of the created module + UUID module_uuid = created_module->GetUUID(); + + if (module_uuid.IsValid()) { + // Now try to find the module by UUID + ModuleSP found_module = ModuleList::FindSharedModule(module_uuid); + + ASSERT_NE(found_module.get(), nullptr) + << "FindSharedModule should find the module by UUID"; + EXPECT_EQ(found_module.get(), created_module.get()) + << "FindSharedModule should return the same module instance"; + EXPECT_EQ(found_module->GetUUID(), module_uuid) + << "Found module should have the same UUID"; + } + } +} + +// Test that GetSharedModule with UUID finds existing module even with different +// path +TEST(ModuleListTest, GetSharedModuleByUUIDIgnoresPath) { + SubsystemRAII<FileSystem, ObjectFileELF> subsystems; + + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + // Create and add a module to the shared module list + ModuleSP first_module; + bool first_did_create = false; + Status first_error = + ModuleList::GetSharedModule(ExpectedFile->moduleSpec(), first_module, + nullptr, &first_did_create, false); + + if (first_error.Success() && first_module) { + UUID module_uuid = first_module->GetUUID(); + + if (module_uuid.IsValid()) { + // Now try to get a module with the same UUID but different path + ModuleSpec second_spec; + second_spec.GetFileSpec() = FileSpec("/different/path/to/module.so"); + second_spec.GetArchitecture() = ArchSpec("x86_64-pc-linux"); + second_spec.GetUUID() = module_uuid; + + ModuleSP second_module; + bool second_did_create = false; + Status second_error = ModuleList::GetSharedModule( + second_spec, second_module, nullptr, &second_did_create, false); + + if (second_error.Success() && second_module) { + // If we got a module back, check if it's the same one + bool is_same_module = (second_module.get() == first_module.get()); + + // Document the behavior: ideally UUID should take precedence + // and return the existing module + EXPECT_TRUE(is_same_module) + << "GetSharedModule with matching UUID should return existing " + "module, " + << "even with different path (per PR #160199)"; + + if (is_same_module) { + EXPECT_FALSE(second_did_create) + << "Should not create a new module when UUID matches"; + } + } + } + } +} diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..a478cf07eedb2 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp @@ -7,6 +8,7 @@ add_lldb_unittest(DAPTests Handler/ContinueTest.cpp JSONUtilsTest.cpp LLDBUtilsTest.cpp + ProtocolRequestsTest.cpp ProtocolTypesTest.cpp ProtocolUtilsTest.cpp TestBase.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ 
b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp new file mode 100644 index 0000000000000..498195dc09325 --- /dev/null +++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp @@ -0,0 +1,69 @@ +//===-- ProtocolRequestsTest.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" +#include "TestingSupport/TestUtilities.h" +#include "llvm/Testing/Support/Error.h" +#include <gtest/gtest.h> + +using namespace llvm; +using namespace lldb_dap::protocol; +using lldb_private::PrettyPrint; +using llvm::json::parse; + +TEST(ProtocolRequestsTest, ExceptionInfoArguments) { + llvm::Expected<ExceptionInfoArguments> expected = + parse<ExceptionInfoArguments>(R"({ + "threadId": 3434 + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->threadId, 3434U); + + // Check required keys. + EXPECT_THAT_EXPECTED(parse<ExceptionInfoArguments>(R"({})"), + FailedWithMessage("missing value at (root).threadId")); + + EXPECT_THAT_EXPECTED(parse<ExceptionInfoArguments>(R"({"id": 10})"), + FailedWithMessage("missing value at (root).threadId")); +} + +TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { + ExceptionInfoResponseBody body; + body.exceptionId = "signal"; + body.breakMode = eExceptionBreakModeAlways; + + // Check required keys. + Expected<json::Value> expected = parse( + R"({ + "exceptionId": "signal", + "breakMode": "always" + })"); + + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); + + // Check optional keys. + body.description = "SIGNAL SIGWINCH"; + body.breakMode = eExceptionBreakModeNever; + body.details = ExceptionDetails{}; + body.details->message = "some message"; + + Expected<json::Value> expected_opt = parse( + R"({ + "exceptionId": "signal", + "description": "SIGNAL SIGWINCH", + "breakMode": "never", + "details": { + "message": "some message" + } + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); +} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 8170abdd25bc6..6a4620a3f1e59 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1129,3 +1129,50 @@ TEST(ProtocolTypesTest, DataBreakpointInfoArguments) { EXPECT_THAT_EXPECTED(parse<DataBreakpointInfoArguments>(R"({"name":"data"})"), llvm::Succeeded()); } + +TEST(ProtocolTypesTest, ExceptionBreakMode) { + const std::vector<std::pair<ExceptionBreakMode, llvm::StringRef>> test_cases = + {{ExceptionBreakMode::eExceptionBreakModeAlways, "always"}, + {ExceptionBreakMode::eExceptionBreakModeNever, "never"}, + {ExceptionBreakMode::eExceptionBreakModeUnhandled, "unhandled"}, + {ExceptionBreakMode::eExceptionBreakModeUserUnhandled, "userUnhandled"}}; + + for (const auto [value, expected] : test_cases) { + json::Value const serialized = toJSON(value); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), expected); + } +} + +TEST(ProtocolTypesTest, ExceptionDetails) { + ExceptionDetails details; + + // Check required keys. + Expected<json::Value> expected = parse(R"({})"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(pp(*expected), pp(details)); + + // Check optional keys.
+ details.message = "SIGABRT exception"; + details.typeName = "signal"; + details.fullTypeName = "SIGABRT"; + details.evaluateName = "process handle SIGABRT"; + details.stackTrace = "some stacktrace"; + ExceptionDetails inner_details; + inner_details.message = "inner message"; + details.innerException = {std::move(inner_details)}; + + Expected<json::Value> expected_opt = parse(R"({ + "message": "SIGABRT exception", + "typeName": "signal", + "fullTypeName": "SIGABRT", + "evaluateName": "process handle SIGABRT", + "stackTrace": "some stacktrace", + "innerException": [{ + "message": "inner message" + }] + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(pp(*expected_opt), pp(details)); +} diff --git a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp index 70baa7e6bc135..4b018a29f3587 100644 --- a/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp +++ b/lldb/unittests/Language/ObjC/ObjCLanguageTest.cpp @@ -112,3 +112,46 @@ TEST(ObjCLanguage, InvalidMethodNameParsing) { EXPECT_FALSE(lax_method.has_value()); } } + +struct ObjCMethodTestCase { + llvm::StringRef name; + bool is_valid; +}; + +struct ObjCMethodNameTextFiture + : public testing::TestWithParam<ObjCMethodTestCase> {}; + +static ObjCMethodTestCase g_objc_method_name_test_cases[] = { + {"", false}, + {"+[Uh oh!", false}, + {"-[Definitely not...", false}, + {"[Nice try ] :)", false}, + {"+MaybeIfYouSquintYourEyes]", false}, + {"?[Tricky]", false}, + {"[]", false}, + {"-[a", false}, + {"+[a", false}, + {"-]a]", false}, + {"+]a]", false}, + + // FIXME: should these count as valid? + {"+[]", true}, + {"-[]", true}, + {"-[[]", true}, + {"+[[]", true}, + {"+[a ]", true}, + {"-[a ]", true}, + + // Valid names + {"+[a a]", true}, + {"-[a a]", true}, +}; + +TEST_P(ObjCMethodNameTextFiture, TestIsPossibleObjCMethodName) { + // Tests ObjCLanguage::IsPossibleObjCMethodName + auto [name, expect_valid] = GetParam(); + EXPECT_EQ(ObjCLanguage::IsPossibleObjCMethodName(name), expect_valid); +} + +INSTANTIATE_TEST_SUITE_P(ObjCMethodNameTests, ObjCMethodNameTextFiture, + testing::ValuesIn(g_objc_method_name_test_cases)); diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 3d0e2d8a62482..a63b740d9472f 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -161,6 +161,11 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBExecutionContext( return nullptr; } +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrameList(PyObject *data) { + return nullptr; +} + lldb::ValueObjectSP lldb_private::python::SWIGBridge::LLDBSWIGPython_GetValueObjectSPFromSBValue( void *data) { @@ -329,6 +334,11 @@ lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::ProcessSP) { return python::PythonObject(); } +python::PythonObject +lldb_private::python::SWIGBridge::ToSWIGWrapper(lldb::StackFrameListSP) { + return python::PythonObject(); +} + python::PythonObject lldb_private::python::SWIGBridge::ToSWIGWrapper( const lldb_private::StructuredDataImpl &) { return python::PythonObject(); diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 1981e912fa4fa..155fc743934c2 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -52,6 +52,12 @@ class TestTypeSystemClang : public testing::Test { return 
ClangUtil::GetQualType( m_ast->GetBuiltinTypeByName(ConstString(name))); } + + CompilerType GetBuiltinTypeForDWARFEncodingAndBitSize( + llvm::StringRef type_name, uint32_t encoding, uint32_t bit_size) const { + return m_ast->GetBuiltinTypeForDWARFEncodingAndBitSize(type_name, encoding, + bit_size); + } }; TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) { @@ -238,6 +244,91 @@ TEST_F(TestTypeSystemClang, TestBuiltinTypeForEncodingAndBitSize) { VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 64); } +TEST_F(TestTypeSystemClang, TestGetBuiltinTypeForDWARFEncodingAndBitSize) { + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitIn", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeName(), + "_BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt(129)", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 129) + .GetTypeName(), + "_BitInt(129)"); + + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitIn", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned_char, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + EXPECT_FALSE(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_signed, 2) + .IsValid()); + + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(2)", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeName(), + "unsigned _BitInt(2)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt(129)", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 129) + .GetTypeName(), + "unsigned _BitInt(129)"); +} + +TEST_F(TestTypeSystemClang, TestBitIntTypeInfo) { + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + "_BitInt", llvm::dwarf::DW_ATE_signed, 2) + .GetTypeInfo(), + eTypeIsSigned | eTypeIsScalar | eTypeHasValue | eTypeIsInteger); + EXPECT_EQ(GetBuiltinTypeForDWARFEncodingAndBitSize( + 
"unsigned _BitInt", llvm::dwarf::DW_ATE_unsigned, 2) + .GetTypeInfo(), + eTypeIsScalar | eTypeHasValue | eTypeIsInteger); +} + TEST_F(TestTypeSystemClang, TestBuiltinTypeForEmptyTriple) { // Test that we can access type-info of builtin Clang AST // types without crashing even when the target triple is @@ -1123,6 +1214,30 @@ TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) { EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it); } +TEST_F(TestTypeSystemClang, TestGetTypeInfo) { + // Tests TypeSystemClang::GetTypeInfo + + const ASTContext &ast = m_ast->getASTContext(); + + CompilerType complex_int = m_ast->GetType(ast.getComplexType(ast.IntTy)); + EXPECT_EQ(complex_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType complex_float = m_ast->GetType(ast.getComplexType(ast.FloatTy)); + EXPECT_EQ(complex_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsComplex | eTypeIsBuiltIn | eTypeHasValue)); + + CompilerType vector_of_int = + m_ast->GetType(ast.getVectorType(ast.IntTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_int.GetTypeInfo(), + (eTypeIsInteger | eTypeIsVector | eTypeHasChildren)); + + CompilerType vector_of_float = + m_ast->GetType(ast.getVectorType(ast.FloatTy, 1, VectorKind::Generic)); + EXPECT_EQ(vector_of_float.GetTypeInfo(), + (eTypeIsFloat | eTypeIsVector | eTypeHasChildren)); +} + TEST_F(TestTypeSystemClang, AsmLabel_CtorDtor) { // Tests TypeSystemClang::DeclGetMangledName for constructors/destructors // with and without AsmLabels. diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 064ed6d1d3e58..cef3a25a4a960 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1741,3 +1741,215 @@ TEST_F(DWARFASTParserClangTests, TestTypeBitSize) { EXPECT_EQ(llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), 1U); } + +TEST_F(DWARFASTParserClangTests, TestBitIntParsing) { + // Tests that we correctly parse the DW_AT_base_type for a _BitInt. + // Older versions of Clang only emit the `_BitInt` string into the + // DW_AT_name (not including the bitsize). Make sure we understand + // those too. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - _BitInt(2) + - _BitInt + - unsigned _BitInt(2) + - unsigned _BitInt + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_bit_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x0 + - Value: 0x05 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt(2)') + + - AbbrCode: 0x2 + Values: + - Value: 0x13 + - Value: 0x07 + - Value: 0x01 + - Value: 0x02 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('unsigned _BitInt') + + - AbbrCode: 0x2 + Values: + - Value: 0x27 + - Value: 0x07 + - Value: 0x08 + - Value: 0x34 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ('_BitInt') + + - AbbrCode: 0x3 + Values: + - Value: 0x0b + - Value: 0x05 + - Value: 0x08 +... 
+ +)"; + + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast"); + auto &ast_ctx = *holder->GetAST(); + DWARFASTParserClangStub ast_parser(ast_ctx); + + auto type_die = cu_die.GetFirstChild(); + ASSERT_TRUE(type_die.IsValid()); + + { + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 1U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt(2)"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(2)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingUint); + EXPECT_EQ(type_sp->GetName(), "unsigned _BitInt"); + EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), + "unsigned _BitInt(52)"); + } + + { + type_die = type_die.GetSibling(); + SymbolContext sc; + auto type_sp = ast_parser.ParseTypeFromDWARF(sc, type_die, + /*type_is_new_ptr=*/nullptr); + ASSERT_NE(type_sp, nullptr); + + EXPECT_EQ( + llvm::expectedToOptional(type_sp->GetByteSize(nullptr)).value_or(0), + 8U); + EXPECT_EQ(type_sp->GetEncoding(), lldb::eEncodingSint); + EXPECT_EQ(type_sp->GetName(), "_BitInt"); + + // Older versions of Clang didn't emit a DW_AT_bit_size for _BitInt. In + // those cases we would format the CompilerType name using the byte-size. 
+ EXPECT_EQ(type_sp->GetForwardCompilerType().GetTypeName(), "_BitInt(64)"); + } +} diff --git a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp index 17284b61b9a6e..cd6db5fcb1f4c 100644 --- a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp +++ b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp @@ -99,7 +99,7 @@ Member *AddField(Member *member, StringRef name, uint64_t byte_offset, std::make_unique<Member>(name, byte_offset * 8, byte_size * 8, clang::QualType(), lldb::eAccessPublic, 0); field->kind = kind; - field->base_offset = base_offset; + field->base_offset = base_offset * 8; member->fields.push_back(std::move(field)); return member->fields.back().get(); } @@ -111,6 +111,9 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { CollectMember("m2", 0, 4); CollectMember("m3", 0, 1); CollectMember("m4", 0, 8); + CollectMember("m5", 8, 8); + CollectMember("m6", 16, 4); + CollectMember("m7", 16, 8); ConstructRecord(); // struct { @@ -120,6 +123,11 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { // m3; // m4; // }; + // m5; + // union { + // m6; + // m7; + // }; // }; Record record; record.start_offset = 0; @@ -128,6 +136,10 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) { AddField(u, "m2", 0, 4, Member::Field); AddField(u, "m3", 0, 1, Member::Field); AddField(u, "m4", 0, 8, Member::Field); + AddField(&record.record, "m5", 8, 8, Member::Field); + Member *u2 = AddField(&record.record, "", 16, 0, Member::Union); + AddField(u2, "m6", 16, 4, Member::Field); + AddField(u2, "m7", 16, 8, Member::Field); EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); } @@ -243,3 +255,41 @@ TEST_F(UdtRecordCompleterRecordTests, TestNestedUnionStructInUnion) { AddField(s2, "m4", 2, 4, Member::Field); EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); } + +TEST_F(UdtRecordCompleterRecordTests, TestNestedStructInUnionInStructInUnion) { + SetKind(Member::Kind::Union); + CollectMember("m1", 0, 4); + CollectMember("m2", 0, 2); + CollectMember("m3", 0, 2); + CollectMember("m4", 2, 4); + CollectMember("m5", 6, 2); + CollectMember("m6", 6, 2); + CollectMember("m7", 8, 2); + ConstructRecord(); + + // union { + // m1; + // m2; + // struct { + // m3; + // m4; + // union { + // m5; + // m6; + // }; + // m7; + // }; + // }; + Record record; + record.start_offset = 0; + AddField(&record.record, "m1", 0, 4, Member::Field); + AddField(&record.record, "m2", 0, 2, Member::Field); + Member *s = AddField(&record.record, "", 0, 0, Member::Struct); + AddField(s, "m3", 0, 2, Member::Field); + AddField(s, "m4", 2, 4, Member::Field); + Member *u = AddField(s, "", 6, 0, Member::Union); + AddField(u, "m5", 6, 2, Member::Field); + AddField(u, "m6", 6, 2, Member::Field); + AddField(s, "m7", 8, 2, Member::Field); + EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record)); +} diff --git a/lldb/unittests/Target/LocateModuleCallbackTest.cpp b/lldb/unittests/Target/LocateModuleCallbackTest.cpp index 6ffa41b16b4ff..d727cea9f6eae 100644 --- a/lldb/unittests/Target/LocateModuleCallbackTest.cpp +++ b/lldb/unittests/Target/LocateModuleCallbackTest.cpp @@ -362,7 +362,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureNoCache) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } @@ -383,7 +383,7 @@ 
TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackFailureCached) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -409,7 +409,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNoFiles) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -435,7 +435,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentModule) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_FALSE(m_module_sp->GetSymbolFileFileSpec()); @@ -464,7 +464,7 @@ TEST_F(LocateModuleCallbackTest, GetOrCreateModuleCallbackNonExistentSymbol) { }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_TRUE(m_module_sp->GetSymbolFileFileSpec().GetPath().empty()); @@ -622,7 +622,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -650,7 +650,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -682,7 +682,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); CheckModule(m_module_sp); ASSERT_EQ(m_module_sp->GetFileSpec(), uuid_view); ASSERT_EQ(m_module_sp->GetSymbolFileFileSpec(), @@ -709,7 +709,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } @@ -731,7 +731,7 @@ TEST_F(LocateModuleCallbackTest, }); m_module_sp = m_target_sp->GetOrCreateModule(m_module_spec, /*notify=*/false); - ASSERT_EQ(callback_call_count, 2); + ASSERT_EQ(callback_call_count, 3); ASSERT_FALSE(m_module_sp); } diff --git a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp index 3278674ed0a05..cfcec693b8742 100644 --- a/lldb/unittests/Target/RemoteAwarePlatformTest.cpp +++ b/lldb/unittests/Target/RemoteAwarePlatformTest.cpp @@ -32,15 +32,12 @@ class RemoteAwarePlatformTester : public RemoteAwarePlatform { ProcessSP(ProcessAttachInfo &, Debugger &, Target *, Status &)); MOCK_METHOD0(CalculateTrapHandlerSymbolNames, void()); - MOCK_METHOD2(ResolveExecutable, - std::pair<bool, ModuleSP>(const ModuleSpec &, - 
const FileSpecList *)); - Status - ResolveExecutable(const ModuleSpec &module_spec, - lldb::ModuleSP &exe_module_sp, - const FileSpecList *module_search_paths_ptr) /*override*/ + MOCK_METHOD1(ResolveExecutable, + std::pair<bool, ModuleSP>(const ModuleSpec &)); + Status ResolveExecutable(const ModuleSpec &module_spec, + lldb::ModuleSP &exe_module_sp) /*override*/ { // NOLINT(modernize-use-override) - auto pair = ResolveExecutable(module_spec, module_search_paths_ptr); + auto pair = ResolveExecutable(module_spec); exe_module_sp = pair.second; return pair.first ? Status() : Status::FromErrorString("error"); } @@ -80,14 +77,14 @@ TEST_F(RemoteAwarePlatformTest, TestResolveExecutabelOnClientByPlatform) { static const ArchSpec process_host_arch; EXPECT_CALL(platform, GetSupportedArchitectures(process_host_arch)) .WillRepeatedly(Return(std::vector<ArchSpec>())); - EXPECT_CALL(platform, ResolveExecutable(_, _)) + EXPECT_CALL(platform, ResolveExecutable(_)) .WillRepeatedly(Return(std::make_pair(true, expected_executable))); platform.SetRemotePlatform(std::make_shared<TargetPlatformTester>(false)); ModuleSP resolved_sp; lldb_private::Status status = - platform.ResolveExecutable(executable_spec, resolved_sp, nullptr); + platform.ResolveExecutable(executable_spec, resolved_sp); ASSERT_TRUE(status.Success()); EXPECT_EQ(expected_executable.get(), resolved_sp.get()); diff --git a/lldb/unittests/TestingSupport/TestUtilities.cpp b/lldb/unittests/TestingSupport/TestUtilities.cpp index b53822e38324b..d164c227afb9e 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.cpp +++ b/lldb/unittests/TestingSupport/TestUtilities.cpp @@ -20,6 +20,11 @@ using namespace lldb_private; extern const char *TestMainArgv0; std::once_flag TestUtilities::g_debugger_initialize_flag; + +std::string lldb_private::PrettyPrint(const llvm::json::Value &value) { + return llvm::formatv("{0:2}", value).str(); +} + std::string lldb_private::GetInputFilePath(const llvm::Twine &name) { llvm::SmallString<128> result = llvm::sys::path::parent_path(TestMainArgv0); llvm::sys::fs::make_absolute(result); diff --git a/lldb/unittests/TestingSupport/TestUtilities.h b/lldb/unittests/TestingSupport/TestUtilities.h index cc93a68a6a431..f05d176618fa0 100644 --- a/lldb/unittests/TestingSupport/TestUtilities.h +++ b/lldb/unittests/TestingSupport/TestUtilities.h @@ -30,6 +30,10 @@ } namespace lldb_private { + +/// Returns a pretty printed json string of a `llvm::json::Value`. +std::string PrettyPrint(const llvm::json::Value &E); + std::string GetInputFilePath(const llvm::Twine &name); class TestUtilities { diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..c02c75f10b12f 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. 
@@ -1340,9 +1340,7 @@ if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/mlgo-utils) add_subdirectory(utils/llvm-test-mustache-spec) if( LLVM_INCLUDE_TESTS ) - set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test") add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) - set(LLVM_SUBPROJECT_TITLE) endif() else() if ( LLVM_INCLUDE_TESTS ) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e52259236fc19..1eba955f9d6ed 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -197,7 +197,7 @@ david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub Amara Emerson (esp. AArch64 GlobalISel) \ amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ Eli Friedman (esp. ARM64EC) \ -efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +efriedma@qti.qualcomm.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ Sjoerd Meijer \ smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ Nashe Mncube \ @@ -246,7 +246,7 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub) #### Hexagon backend Sundeep Kushwaha \ -sundeepk@quicinc.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) +sundeepk@qti.qualcomm.com (email), [SundeepKushwaha](https://github.com/SundeepKushwaha) (GitHub) #### Lanai backend diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index e411ed4326a36..6581f47012552 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -22,7 +22,7 @@ if(NOT LLVM_TOOL_LLVM_DRIVER_BUILD) get_host_tool_path(llvm-nm LLVM_NM llvm_nm_exe llvm_nm_target) get_host_tool_path(llc LLC llc_exe llc_target) - if(${llc_exe} AND ${llvm_nm_exe}) + if(llc_exe AND llvm_nm_exe) # Extract the list of symbols in a random utility as sample data. set(SYMBOL_TEST_DATA_FILE "sample_symbol_list.txt") set(SYMBOL_TEST_DATA_SOURCE_BINARY ${llc_exe}) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index c03ead400d0d5..b451d1079d29b 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -1,17 +1,17 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "benchmark/benchmark.h" -#include "llvm/Support/FormatVariadic.h" -#include <algorithm> -#include <string> -#include <vector> - +//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "benchmark/benchmark.h" +#include "llvm/Support/FormatVariadic.h" +#include <algorithm> +#include <string> +#include <vector> + using namespace llvm; using namespace std; diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp index b5d82682199db..7cf21431efecd 100644 --- a/llvm/benchmarks/SpecialCaseListBM.cpp +++ b/llvm/benchmarks/SpecialCaseListBM.cpp @@ -5,7 +5,6 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <iterator> #include <random> #include <string> #include <utility> diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 7d40d309d538e..3c3695a77cb7b 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1802,7 +1802,13 @@ function(add_unittest test_suite test_name) # libpthreads overrides some standard library symbols, so main # executable must be linked with it in order to provide consistent # API for all shared libaries loaded by this executable. - target_link_libraries(${test_name} PRIVATE llvm_gtest_main llvm_gtest ${LLVM_PTHREAD_LIB}) + # default_gtest should be an alias to either llvm_gtest or runtimes_gtest. + # If it is not defined, fall back to llvm_gtest. + if(TARGET default_gtest) + target_link_libraries(${test_name} PRIVATE default_gtest_main default_gtest ${LLVM_PTHREAD_LIB}) + else () + target_link_libraries(${test_name} PRIVATE llvm_gtest_main llvm_gtest ${LLVM_PTHREAD_LIB}) + endif () add_dependencies(${test_suite} ${test_name}) endfunction() diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..ba0e53bceade8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ @@ -1179,6 +1180,51 @@ is conservatively correct for OpenCL. other operations within the same address space. ======================= =================================================== +Target Types +------------ + +The AMDGPU backend implements some target extension types. + +.. _amdgpu-types-named-barriers: + +Named Barriers +~~~~~~~~~~~~~~ + +Named barriers are fixed function hardware barrier objects that are available +in gfx12.5+ in addition to the traditional default barriers. 
+ +In LLVM IR, named barriers are represented by global variables of type +``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier +global variables do not occupy actual LDS memory, but their lifetime and +allocation scope match those of global variables in LDS. Programs in LLVM IR +refer to named barriers using pointers. + +The following named barrier types are supported in global variables, defined +recursively: + +* a single, standalone ``target("amdgcn.named.barrier", 0)`` +* an array of supported types +* a struct containing a single element of supported type + +.. code-block:: llvm + + @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef + @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef + @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef + + ... + + %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0) + +Named barrier types may not be used in ``alloca``. + +Named barriers do not have an underlying byte representation. +It is undefined behavior to use a pointer to any part of a named barrier object +as the pointer operand of a regular memory access instruction or intrinsic. +Pointers to named barrier objects are intended to be used with dedicated +intrinsics. Reading from or writing to such pointers is undefined behavior. + LLVM IR Intrinsics ------------------ @@ -2644,7 +2690,7 @@ are deprecated and should not be used. ``vendor_name_size`` and ``architecture_name_size`` are the length of the vendor and architecture names respectively, including the NUL character. - ``vendor_and_architecture_name`` contains the NUL terminates string for the + ``vendor_and_architecture_name`` contains the NUL terminated string for the vendor, immediately followed by the NUL terminated string for the architecture. @@ -3336,7 +3382,7 @@ location. If the lane is inactive, but was active on entry to the subprogram, then this is the program location in the subprogram at which execution of the lane is -conceptual positioned. +conceptually positioned. If the lane was not active on entry to the subprogram, then this will be the undefined location. A client debugger can check if the lane is part of a valid @@ -4708,7 +4754,7 @@ same *vendor-name*. "image", or "pipe". This may be more restrictive than indicated by ".access" to reflect what the - kernel actual does. If not + kernel actually does. If not present then the runtime must assume what is implied by ".access" and ".is_const" . Values @@ -5087,7 +5133,7 @@ supported except by flat and scratch instructions in GFX9-GFX11. The generic address space uses the hardware flat address support available in GFX7-GFX11. This uses two fixed ranges of virtual addresses (the private and -local apertures), that are outside the range of addressible global memory, to +local apertures), that are outside the range of addressable global memory, to map from a flat address to a private or local address. FLAT instructions can take a flat address and access global, private (scratch) @@ -6540,7 +6586,7 @@ Acquire memory ordering is not meaningful on store atomic instructions and is treated as non-atomic. Release memory ordering is not meaningful on load atomic instructions and is -treated a non-atomic. +treated as non-atomic. Acquire-release memory ordering is not meaningful on load or store atomic instructions and is treated as acquire and release respectively.
diff --git a/llvm/docs/AddingConstrainedIntrinsics.rst b/llvm/docs/AddingConstrainedIntrinsics.rst index bd14f121144ca..41e7dece6dd8b 100644 --- a/llvm/docs/AddingConstrainedIntrinsics.rst +++ b/llvm/docs/AddingConstrainedIntrinsics.rst @@ -31,7 +31,7 @@ node ``FADD`` must be ``STRICT_FADD``. Update mappings =============== -Add new record to the mapping of instructions to constrained intrinsic and +Add new record to the mapping of instructions to constrained intrinsics and DAG nodes:: include/llvm/IR/ConstrainedOps.def diff --git a/llvm/docs/Atomics.rst b/llvm/docs/Atomics.rst index 522aed150bf62..1bcd864dd15bf 100644 --- a/llvm/docs/Atomics.rst +++ b/llvm/docs/Atomics.rst @@ -408,7 +408,7 @@ operations: MemoryDependencyAnalysis (which is also used by other passes like GVN). * Folding a load: Any atomic load from a constant global can be constant-folded, - because it cannot be observed. Similar reasoning allows sroa with + because it cannot be observed. Similar reasoning allows SROA with atomic loads and stores. Atomics and Codegen diff --git a/llvm/docs/BranchWeightMetadata.rst b/llvm/docs/BranchWeightMetadata.rst index 3fa21720d25fc..71d7a7d3a6c05 100644 --- a/llvm/docs/BranchWeightMetadata.rst +++ b/llvm/docs/BranchWeightMetadata.rst @@ -92,7 +92,7 @@ The second weight is optional and corresponds to the unwind branch. If only one weight is set, then it contains the execution count of the call and used in SamplePGO mode only as described for the call instruction. If both weights are specified then the second weight contains the count of unwind branch -taken and the first weights contains the execution count of the call minus +taken and the first weight contains the execution count of the call minus the count of unwind branch taken. Both weights specified are used to calculate BranchProbability as for BranchInst and for SamplePGO the sum of both weights is used. @@ -223,7 +223,7 @@ indicates that it was called 2,590 times at runtime. !1 = !{!"function_entry_count", i64 2590} If "function_entry_count" has more than 2 operands, the subsequent operands are -the GUID of the functions that needs to be imported by ThinLTO. This is only +the GUID of the functions that need to be imported by ThinLTO. This is only set by sampling-based profile. It is needed because the sampling-based profile was collected on a binary that had already imported and inlined these functions, and we need to ensure the IR matches in the ThinLTO backends for profile diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst index 855e2ccac8ece..a2270daf6cded 100644 --- a/llvm/docs/CIBestPractices.rst +++ b/llvm/docs/CIBestPractices.rst @@ -146,7 +146,7 @@ for LLVM infrastructure. Using Fully Qualified Container Names ------------------------------------- -When referencing container images from a registry, such as in Github Actions +When referencing container images from a registry, such as in GitHub Actions workflows, or in ``Dockerfile`` files used for building images, prefer fully qualified names (i.e., including the registry domain) over just the image. For example, prefer ``docker.io/ubuntu:24.04`` over ``ubuntu:24.04``. This diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst index fc704a3cdd51f..a74f16d7e9477 100644 --- a/llvm/docs/CodeGenerator.rst +++ b/llvm/docs/CodeGenerator.rst @@ -498,7 +498,7 @@ The ``MachineBasicBlock`` class The ``MachineBasicBlock`` class contains a list of machine instructions (:raw-html:`<tt>` `MachineInstr`_ :raw-html:`</tt>` instances). 
It roughly corresponds to the LLVM code input to the instruction selector, but there can be -a one-to-many mapping (i.e. one LLVM basic block can map to multiple machine +a one-to-many mapping (i.e., one LLVM basic block can map to multiple machine basic blocks). The ``MachineBasicBlock`` class has a "``getBasicBlock``" method, which returns the LLVM basic block that it comes from. @@ -522,7 +522,7 @@ LLVM code generator can model sequences of instructions as MachineInstr bundles. A MI bundle can model a VLIW group / pack which contains an arbitrary number of parallel instructions. It can also be used to model a sequential list of instructions (potentially with data dependencies) that cannot be legally -separated (e.g. ARM Thumb2 IT blocks). +separated (e.g., ARM Thumb2 IT blocks). Conceptually a MI bundle is a MI with a number of other MIs nested within: @@ -583,8 +583,8 @@ Packing / bundling of MachineInstrs for VLIW architectures should generally be done as part of the register allocation super-pass. More specifically, the pass which determines what MIs should be bundled together should be done after code generator exits SSA form -(i.e. after two-address pass, PHI elimination, and copy coalescing). -Such bundles should be finalized (i.e. adding BUNDLE MIs and input and +(i.e., after two-address pass, PHI elimination, and copy coalescing). +Such bundles should be finalized (i.e., adding BUNDLE MIs and input and output register MachineOperands) after virtual registers have been rewritten into physical registers. This eliminates the need to add virtual register operands to BUNDLE instructions which would @@ -615,7 +615,7 @@ The ``MCStreamer`` API ---------------------- MCStreamer is best thought of as an assembler API. It is an abstract API which -is *implemented* in different ways (e.g. to output a ``.s`` file, output an ELF ``.o`` +is *implemented* in different ways (e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) but whose API corresponds directly to what you see in a ``.s`` file. MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, emitValue (for .byte, .word), etc, which directly correspond to @@ -631,7 +631,7 @@ directives through MCStreamer. On the implementation side of MCStreamer, there are two major implementations: one for writing out a ``.s`` file (MCAsmStreamer), and one for writing out a ``.o`` file (MCObjectStreamer). MCAsmStreamer is a straightforward implementation -that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but +that prints out a directive for each method (e.g., ``EmitValue -> .byte``), but MCObjectStreamer implements a full assembler. For target-specific directives, the MCStreamer has a MCTargetStreamer instance. @@ -681,7 +681,7 @@ The ``MCSection`` class ----------------------- The ``MCSection`` class represents an object-file specific section. It is -subclassed by object file specific implementations (e.g. ``MCSectionMachO``, +subclassed by object file specific implementations (e.g., ``MCSectionMachO``, ``MCSectionCOFF``, ``MCSectionELF``) and these are created and uniqued by MCContext. The MCStreamer has a notion of the current section, which can be changed with the SwitchToSection method (which corresponds to a ".section" @@ -696,7 +696,7 @@ The ``MCInst`` class is a target-independent representation of an instruction. It is a simple class (much more so than `MachineInstr`_) that holds a target-specific opcode and a vector of MCOperands. 
MCOperand, in turn, is a simple discriminated union of three cases: 1) a simple immediate, 2) a target -register ID, 3) a symbolic expression (e.g. "``Lfoo-Lbar+42``") as an MCExpr. +register ID, 3) a symbolic expression (e.g., "``Lfoo-Lbar+42``") as an MCExpr. MCInst is the common currency used to represent machine instructions at the MC layer. It is the type used by the instruction encoder, the instruction printer, @@ -711,9 +711,9 @@ The MC layer's object writers support a variety of object formats. Because of target-specific aspects of object formats each target only supports a subset of the formats supported by the MC layer. Most targets support emitting ELF objects. Other vendor-specific objects are generally supported only on targets -that are supported by that vendor (i.e. MachO is only supported on targets +that are supported by that vendor (i.e., MachO is only supported on targets supported by Darwin, and XCOFF is only supported on targets that support AIX). -Additionally some targets have their own object formats (i.e. DirectX, SPIR-V +Additionally some targets have their own object formats (i.e., DirectX, SPIR-V and WebAssembly). The table below captures a snapshot of object file support in LLVM: @@ -769,7 +769,7 @@ Introduction to SelectionDAGs The SelectionDAG provides an abstraction for code representation in a way that is amenable to instruction selection using automatic techniques -(e.g. dynamic-programming based optimal pattern matching selectors). It is also +(e.g., dynamic-programming based optimal pattern matching selectors). It is also well-suited to other phases of code generation; in particular, instruction scheduling (SelectionDAG's are very close to scheduling DAGs post-selection). Additionally, the SelectionDAG provides a host representation where a large @@ -898,7 +898,7 @@ Initial SelectionDAG Construction The initial SelectionDAG is na\ :raw-html:`ï`\ vely peephole expanded from the LLVM input by the ``SelectionDAGBuilder`` class. The intent of this pass is to expose as much low-level, target-specific details to the SelectionDAG as -possible. This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an +possible. This pass is mostly hard-coded (e.g., an LLVM ``add`` turns into an ``SDNode add`` while a ``getelementptr`` is expanded into the obvious arithmetic). This pass requires target-specific hooks to lower calls, returns, varargs, etc. For these features, the :raw-html:`<tt>` `TargetLowering`_ @@ -944,7 +944,7 @@ The Legalize phase is in charge of converting a DAG to only use the operations that are natively supported by the target. Targets often have weird constraints, such as not supporting every operation on -every supported data type (e.g. X86 does not support byte conditional moves and +every supported data type (e.g., X86 does not support byte conditional moves and PowerPC does not support sign-extending loads from a 16-bit memory location). Legalize takes care of this by open-coding another sequence of operations to emulate the operation ("expansion"), by promoting one type to a larger type that @@ -995,7 +995,7 @@ SelectionDAG Optimization Phase: the DAG Combiner The SelectionDAG optimization phase is run multiple times for code generation, immediately after the DAG is built and once after each legalization. The first -run of the pass allows the initial code to be cleaned up (e.g. 
performing +run of the pass allows the initial code to be cleaned up (e.g., performing optimizations that depend on knowing that the operators have restricted type inputs). Subsequent runs of the pass clean up the messy code generated by the Legalize passes, which allows Legalize to be very simple (it can focus on making @@ -1120,10 +1120,10 @@ for your target. It has the following strengths: 16-bits of the immediate). * When using the 'Pat' class to map a pattern to an instruction that has one - or more complex operands (like e.g. `X86 addressing mode`_), the pattern may + or more complex operands (like e.g., `X86 addressing mode`_), the pattern may either specify the operand as a whole using a ``ComplexPattern``, or else it may specify the components of the complex operand separately. The latter is - done e.g. for pre-increment instructions by the PowerPC back end: + done e.g., for pre-increment instructions by the PowerPC back end: :: @@ -1145,13 +1145,13 @@ While it has many strengths, the system currently has some limitations, primarily because it is a work in progress and is not yet finished: * Overall, there is no way to define or match SelectionDAG nodes that define - multiple values (e.g. ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the + multiple values (e.g., ``SMUL_LOHI``, ``LOAD``, ``CALL``, etc). This is the biggest reason that you currently still *have to* write custom C++ code for your instruction selector. * There is no great way to support matching complex addressing modes yet. In the future, we will extend pattern fragments to allow them to define multiple - values (e.g. the four operands of the `X86 addressing mode`_, which are + values (e.g., the four operands of the `X86 addressing mode`_, which are currently matched with custom C++ code). In addition, we'll extend fragments so that a fragment can match multiple different patterns. @@ -1175,7 +1175,7 @@ SelectionDAG Scheduling and Formation Phase The scheduling phase takes the DAG of target instructions from the selection phase and assigns an order. The scheduler can pick an order depending on -various constraints of the machines (i.e. order for minimal register pressure or +various constraints of the machines (i.e., order for minimal register pressure or try to cover instruction latencies). Once an order is established, the DAG is converted to a list of :raw-html:`<tt>` `MachineInstr`_\s :raw-html:`</tt>` and the SelectionDAG is destroyed. @@ -1615,7 +1615,7 @@ Since the MC layer works at the level of abstraction of object files, it doesn't have a notion of functions, global variables etc. Instead, it thinks about labels, directives, and instructions. A key class used at this time is the MCStreamer class. This is an abstract API that is implemented in different ways -(e.g. to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an +(e.g., to output a ``.s`` file, output an ELF ``.o`` file, etc) that is effectively an "assembler API". MCStreamer has one method per directive, such as EmitLabel, EmitSymbolAttribute, switchSection, etc, which directly correspond to assembly level directives. diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. 
option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + .. option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/docs/CommandGuide/llc.rst b/llvm/docs/CommandGuide/llc.rst index cc670f6043656..ffcccfbaefffb 100644 --- a/llvm/docs/CommandGuide/llc.rst +++ b/llvm/docs/CommandGuide/llc.rst @@ -129,6 +129,12 @@ End-user Options Print statistics recorded by code-generation passes. +.. option:: --save-stats, --save-stats=cwd, --save-stats=obj + + Save LLVM statistics to a file in the current directory + (:option:`--save-stats`/"--save-stats=cwd") or the directory + of the output file ("--save-stats=obj") in JSON format. + .. option:: --time-passes Record the amount of time needed for each pass and print a report to standard diff --git a/llvm/docs/CommandGuide/llvm-config.rst b/llvm/docs/CommandGuide/llvm-config.rst index 63658d0d90452..1c5c9c7447902 100644 --- a/llvm/docs/CommandGuide/llvm-config.rst +++ b/llvm/docs/CommandGuide/llvm-config.rst @@ -126,6 +126,11 @@ OPTIONS Print the installation prefix for LLVM. +**--quote-paths** + + Quote and escape paths when needed, most notably when a quote, space, backslash + or dollar sign characters are present in the path. + **--shared-mode** Print how the provided components can be collectively linked (`shared` or `static`). diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst index 376d8ee550c99..0bd121a895028 100644 --- a/llvm/docs/CompileCudaWithLLVM.rst +++ b/llvm/docs/CompileCudaWithLLVM.rst @@ -36,7 +36,7 @@ CUDA installation on a handful of common Linux distributions, but in general the most reliable way to make it work is to install CUDA in a single directory from NVIDIA's `.run` package and specify its location via `--cuda-path=...` argument. -CUDA compilation is supported on Linux. Compilation on MacOS and Windows may or +CUDA compilation is supported on Linux. Compilation on macOS and Windows may or may not work and currently have no maintainers. Invoking clang @@ -64,7 +64,7 @@ brackets as described below: y[2] = 6 y[3] = 8 -On MacOS, replace `-lcudart_static` with `-lcudart`; otherwise, you may get +On macOS, replace `-lcudart_static` with `-lcudart`; otherwise, you may get "CUDA driver version is insufficient for CUDA runtime version" errors when you run your program. 
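The ``--save-stats`` option documented above for llc serializes LLVM's internal statistics counters as JSON. A minimal sketch of the underlying API, assuming a build with ``LLVM_ENABLE_STATS`` enabled (for example an asserts build); the ``demo`` debug type and ``NumWidgets`` counter are invented for illustration and are not part of this patch:

.. code-block:: cpp

   #include "llvm/ADT/Statistic.h"
   #include "llvm/Support/raw_ostream.h"

   #define DEBUG_TYPE "demo"
   // Hypothetical counter; real tools declare one per interesting event.
   STATISTIC(NumWidgets, "Number of widgets processed");

   int main() {
     // Collect statistics without printing them automatically at exit.
     llvm::EnableStatistics(/*DoPrintOnExit=*/false);
     NumWidgets += 3;
     // Emit the counters in the same JSON shape --save-stats writes to a file.
     llvm::PrintStatisticsJSON(llvm::outs());
   }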
diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 13d2da42eaca7..0e6b49c84acee 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -193,7 +193,7 @@ Values live across a suspend point need to be stored in the coroutine frame to be available in the continuation function. This frame is stored as a tail to the `async context`. -Every suspend point takes an `context projection function` argument which +Every suspend point takes a `context projection function` argument which describes how-to obtain the continuations `async context` and every suspend point has an associated `resume function` denoted by the `llvm.coro.async.resume` intrinsic. The coroutine is resumed by calling this @@ -221,7 +221,7 @@ a parameter to the `llvm.coro.suspend.async` intrinsic. ptr %resume_func_ptr, ptr %context_projection_function -The frontend should provide a `async function pointer` struct associated with +The frontend should provide an `async function pointer` struct associated with each async coroutine by `llvm.coro.id.async`'s argument. The initial size and alignment of the `async context` must be provided as arguments to the `llvm.coro.id.async` intrinsic. Lowering will update the size entry with the @@ -314,7 +314,7 @@ coroutine handle. The second parameter of `coro.begin` is given a block of memor to be used if the coroutine frame needs to be allocated dynamically. The `coro.id`_ intrinsic serves as coroutine identity useful in cases when the -`coro.begin`_ intrinsic get duplicated by optimization passes such as +`coro.begin`_ intrinsic gets duplicated by optimization passes such as jump-threading. The `cleanup` block destroys the coroutine frame. The `coro.free`_ intrinsic, @@ -2149,7 +2149,7 @@ CoroEarly The CoroEarly pass ensures later middle end passes correctly interpret coroutine semantics and lowers coroutine intrinsics that not needed to be preserved to help later coroutine passes. This pass lowers `coro.promise`_, `coro.frame`_ and -`coro.done`_ intrinsics. Afterwards, it replace uses of promise alloca with +`coro.done`_ intrinsics. Afterwards, it replaces uses of promise alloca with `coro.promise`_ intrinsic. .. _CoroSplit: @@ -2188,7 +2188,7 @@ Attributes coro_only_destroy_when_complete ------------------------------- -When the coroutine are marked with coro_only_destroy_when_complete, it indicates +When the coroutine is marked with coro_only_destroy_when_complete, it indicates the coroutine must reach the final suspend point when it get destroyed. This attribute only works for switched-resume coroutines now. @@ -2199,7 +2199,7 @@ coro_elide_safe When a Call or Invoke instruction to switch ABI coroutine `f` is marked with `coro_elide_safe`, CoroSplitPass generates a `f.noalloc` ramp function. `f.noalloc` has one more argument than its original ramp function `f`, which is -the pointer to the allocated frame. `f.noalloc` also suppressed any allocations +the pointer to the allocated frame. `f.noalloc` also suppresses any allocations or deallocations that may be guarded by `@llvm.coro.alloc` and `@llvm.coro.free`. CoroAnnotationElidePass performs the heap elision when possible. Note that for diff --git a/llvm/docs/Docker.rst b/llvm/docs/Docker.rst index 5f8e619d8b5eb..29078d1f79fdb 100644 --- a/llvm/docs/Docker.rst +++ b/llvm/docs/Docker.rst @@ -16,7 +16,7 @@ to fill out in order to produce Dockerfiles for a new docker image. Why? ---- Docker images provide a way to produce binary distributions of -software inside a controlled environment. 
Having Dockerfiles to builds docker images +software inside a controlled environment. Having Dockerfiles to build docker images inside LLVM repo makes them much more discoverable than putting them into any other place. @@ -35,7 +35,7 @@ A snapshot of a docker container filesystem is called a *docker image*. One can start a container from a prebuilt docker image. Docker images are built from a so-called *Dockerfile*, a source file written in -a specialized language that defines instructions to be used when build +a specialized language that defines instructions to be used when building the docker image (see `official documentation <https://docs.docker.com/engine/reference/builder/>`_ for more details). A minimal Dockerfile typically contains a base image and a number diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst index 91a3ac05ef0e5..0d7f599548fb7 100644 --- a/llvm/docs/Extensions.rst +++ b/llvm/docs/Extensions.rst @@ -274,13 +274,13 @@ This would be equivalent to the following raw assembly: The following directives are specified: - - lib + - ``lib`` The parameter identifies a library to be linked against. The library will be looked up in the default and any specified library search paths (specified to this point). - - libpath + - ``libpath`` The parameter identifies an additional library search path to be considered when looking up libraries after the inclusion of this option. @@ -327,13 +327,13 @@ The contents of the section shall be a sequence of ``Elf_CGProfile`` entries. Elf_Xword cgp_weight; } Elf_CGProfile; -cgp_from +``cgp_from`` The symbol index of the source of the edge. -cgp_to +``cgp_to`` The symbol index of the destination of the edge. -cgp_weight +``cgp_weight`` The weight of the edge. This is represented in assembly as: @@ -352,7 +352,7 @@ table. ``SHT_LLVM_ADDRSIG`` Section (address-significance table) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This section is used to mark symbols as address-significant, i.e. the address +This section is used to mark symbols as address-significant, i.e., the address of the symbol is used in a comparison or leaks outside the translation unit. It has the same meaning as the absence of the LLVM attributes ``unnamed_addr`` and ``local_unnamed_addr``. @@ -519,11 +519,11 @@ those bits are: #. Basic Block Frequencies - Encoded as raw block frequency value taken from MBFI analysis. This value is an integer that encodes the relative frequency compared to the entry block. More information can be found in - 'llvm/Support/BlockFrequency.h'. + ``llvm/Support/BlockFrequency.h``. #. Branch Probabilities - Encoded as raw numerator for branch probability taken from MBPI analysis. This value is the numerator for a fixed point ratio - defined in 'llvm/Support/BranchProbability.h'. It indicates the probability + defined in ``llvm/Support/BranchProbability.h``. It indicates the probability that the block is followed by a given successor block during execution. This extra data requires version 2 or above. This is necessary since successors @@ -726,7 +726,7 @@ Syntax: Syntax: ``.cv_fpo_data`` *procsym* -Target Specific Behaviour +Target-Specific Behaviour ========================= X86 @@ -792,7 +792,7 @@ emission of Variable Length Arrays (VLAs). The Windows ARM Itanium ABI extends the base ABI by adding support for emitting a dynamic stack allocation. When emitting a variable stack allocation, a call to ``__chkstk`` is emitted unconditionally to ensure that guard pages are setup -properly. 
The emission of this stack probe emission is handled similar to the +properly. The emission of this stack probe is handled similarly to the standard stack probe emission. The MSVC environment does not emit code for VLAs currently. @@ -813,7 +813,7 @@ in the following fashion: sub sp, sp, x15, lsl #4 However, this has the limitation of 256 MiB (±128MiB). In order to accommodate -larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow a 8GiB +larger binaries, LLVM supports the use of ``-mcmodel=large`` to allow an 8GiB (±4GiB) range via a slight deviation. It will generate an indirect jump as follows: diff --git a/llvm/docs/FatLTO.rst b/llvm/docs/FatLTO.rst index 5864944332fc0..c883513feb6ba 100644 --- a/llvm/docs/FatLTO.rst +++ b/llvm/docs/FatLTO.rst @@ -38,7 +38,7 @@ This pipeline will: Previously, we conservatively ran independent pipelines on separate copies of the LLVM module to generate the bitcode section and the object code, - which happen to be identical to those used outside of FatLTO. While that + which happened to be identical to those used outside of FatLTO. While that resulted in compiled artifacts that were identical to those produced by the default and (Thin)LTO pipelines, module cloning led to some cases of miscompilation, and we have moved away from trying to keep bitcode diff --git a/llvm/docs/FaultMaps.rst b/llvm/docs/FaultMaps.rst index a089a38fcb30c..5dc5e574fd2fa 100644 --- a/llvm/docs/FaultMaps.rst +++ b/llvm/docs/FaultMaps.rst @@ -9,7 +9,7 @@ FaultMaps and implicit checks Motivation ========== -Code generated by managed language runtimes tend to have checks that +Code generated by managed language runtimes tends to have checks that are required for safety but never fail in practice. In such cases, it is profitable to make the non-failing case cheaper even if it makes the failing case significantly more expensive. This asymmetry can be @@ -28,7 +28,7 @@ the same memory location. The Fault Map Section ===================== -Information about implicit checks generated by LLVM are put in a +Information about implicit checks generated by LLVM is put in a special "fault map" section. On Darwin this section is named ``__llvm_faultmaps``. diff --git a/llvm/docs/GarbageCollection.rst b/llvm/docs/GarbageCollection.rst index 67be080db1310..d5fdfbbb03f90 100644 --- a/llvm/docs/GarbageCollection.rst +++ b/llvm/docs/GarbageCollection.rst @@ -487,7 +487,7 @@ The 'Erlang' and 'OCaml' GCs LLVM ships with two example collectors which leverage the ``gcroot`` mechanisms. To our knowledge, these are not actually used by any language runtime, but they do provide a reasonable starting point for someone interested -in writing an ``gcroot`` compatible GC plugin. In particular, these are the +in writing a ``gcroot`` compatible GC plugin. In particular, these are the only in-tree examples of how to produce a custom binary stack map format using a ``gcroot`` strategy. diff --git a/llvm/docs/GetElementPtr.rst b/llvm/docs/GetElementPtr.rst index 6831a8e6e81eb..09389a0af751f 100644 --- a/llvm/docs/GetElementPtr.rst +++ b/llvm/docs/GetElementPtr.rst @@ -496,10 +496,10 @@ primitive integer expressions, which allows them to be combined with other integer expressions and/or split into multiple separate integer expressions. If they've made non-trivial changes, translating back into LLVM IR can involve reverse-engineering the structure of the addressing in order to fit it into the -static type of the original first operand. 
It isn't always possibly to fully +static type of the original first operand. It isn't always possible to fully reconstruct this structure; sometimes the underlying addressing doesn't correspond with the static type at all. In such cases the optimizer instead will -emit a GEP with the base pointer casted to a simple address-unit pointer, using +emit a GEP with the base pointer cast to a simple address-unit pointer, using the name "uglygep". This isn't pretty, but it's just as valid, and it's sufficient to preserve the pointer aliasing guarantees that GEP provides. diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 039d61624093d..ad544342de329 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -225,7 +225,7 @@ what to add to your calendar invite. - * - GlobalISel - Every 2nd Tuesday of the month - - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__ + - `gcal <https://calendar.google.com/calendar/u/0?cid=YWZjNzhmMzE4MDNlNTAyNGY1NmE1MDIyODY0YTYwZmJmYzRjYTEwNTE1NmUxODA2NzBkYTliY2ZhYTVkNjk0NUBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__ - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__ @@ -562,7 +562,7 @@ An example invite looks as follows .. code-block:: none This event is a meetup for all developers of LLDB. Meeting agendas are posted - on discourse before the event. + on Discourse before the event. Attendees must adhere to the LLVM Code of Conduct (https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports, diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst index e65fd8fde829d..b82a4a05b5213 100644 --- a/llvm/docs/GettingStartedVS.rst +++ b/llvm/docs/GettingStartedVS.rst @@ -244,7 +244,7 @@ Build the LLVM Suite: * The Fibonacci project is a sample program that uses the JIT. Modify the project's debugging properties to provide a numeric command-line argument or run it from the command line. The program will print the - corresponding fibonacci value. + corresponding Fibonacci value. Links diff --git a/llvm/docs/GoldPlugin.rst b/llvm/docs/GoldPlugin.rst index 07d2fc203eba5..606f9e0820e60 100644 --- a/llvm/docs/GoldPlugin.rst +++ b/llvm/docs/GoldPlugin.rst @@ -83,7 +83,7 @@ which is why you otherwise need gold to be the installed system linker in your path. ``ar`` and ``nm`` also accept the ``-plugin`` option and it's possible to -to install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup. +install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup. If you built your own gold, be sure to install the ``ar`` and ``nm-new`` you built to ``/usr/bin``. @@ -143,7 +143,7 @@ Quickstart for using LTO with autotooled projects ================================================= Once your system ``ld``, ``ar``, and ``nm`` all support LLVM bitcode, -everything is in place for an easy to use LTO build of autotooled projects: +everything is in place for an easy-to-use LTO build of autotooled projects: * Follow the instructions :ref:`on how to build LLVMgold.so <lto-how-to-build>`. 
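Once the toolchain pieces are in place, the build itself usually amounts to pointing the compiler variables at clang with ``-flto`` enabled. A minimal sketch, assuming clang and the plugin-aware binutils were installed under ``$PREFIX``:

.. code-block:: console

   $ export CC="$PREFIX/bin/clang -flto"
   $ export CXX="$PREFIX/bin/clang++ -flto"
   $ export RANLIB=/bin/true
   $ ./configure && make

Pointing ``RANLIB`` at ``/bin/true`` keeps a plugin-unaware ``ranlib`` from rewriting the archive index that the plugin-aware ``ar`` already produced for the bitcode members.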
diff --git a/llvm/docs/GwpAsan.rst b/llvm/docs/GwpAsan.rst index 675a61de00983..937956fdabea7 100644 --- a/llvm/docs/GwpAsan.rst +++ b/llvm/docs/GwpAsan.rst @@ -31,7 +31,7 @@ Unlike `AddressSanitizer <https://clang.llvm.org/docs/AddressSanitizer.html>`_, GWP-ASan does not induce a significant performance overhead. ASan often requires the use of dedicated canaries to be viable in production environments, and as such is often impractical. Moreover, ASan's runtime is not developed with -security consideration in mind, making compiled binaries more vulnerable to +security considerations in mind, making compiled binaries more vulnerable to exploits. However, GWP-ASan is only capable of finding a subset of the memory issues diff --git a/llvm/docs/HowToBuildWindowsItaniumPrograms.rst b/llvm/docs/HowToBuildWindowsItaniumPrograms.rst index 48ca7b25b11ef..d932d9dd00bfd 100644 --- a/llvm/docs/HowToBuildWindowsItaniumPrograms.rst +++ b/llvm/docs/HowToBuildWindowsItaniumPrograms.rst @@ -8,7 +8,7 @@ Introduction This document contains information describing how to create a Windows Itanium toolchain. Windows Itanium allows you to deploy Itanium C++ ABI applications on top of the MS VS CRT. -This environment can use the Windows SDK headers directly and does not required additional +This environment can use the Windows SDK headers directly and does not require additional headers or additional runtime machinery (such as is used by mingw). Windows Itanium Stack: diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst index 171bf889256cd..c269cc4c54bcc 100644 --- a/llvm/docs/HowToReleaseLLVM.rst +++ b/llvm/docs/HowToReleaseLLVM.rst @@ -311,10 +311,10 @@ This section describes how to triage bug reports: to backport. You should also review the bug yourself to ensure that it meets the requirements for committing to the release branch. -#. Once a bug has been reviewed, add the release:reviewed label and update the - issue's status to "Needs Merge". Check the pull request associated with the - issue. If all the tests pass, then the pull request can be merged. If not, - then add a comment on the issue asking someone to take a look at the failures. +#. Once a bug has been reviewed, update the status to "Needs Merge". Check the + pull request associated with the issue. If all the tests pass, then the pull + request can be merged. If not, then add a comment on the issue asking + someone to take a look at the failures. Release Patch Rules diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1c6823be44dcb..820cc1cfd02ee 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -159,7 +159,7 @@ There are two kinds of escapes. * ``\\`` represents a single ``\`` character. * ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F) - represents the byte with the given value (e.g. ``\00`` represents a + represents the byte with the given value (e.g., ``\00`` represents a null byte). To represent a ``"`` character, use ``\22``. (``\"`` will end the string @@ -168,7 +168,7 @@ with a trailing ``\``.) Newlines do not terminate string constants; strings can span multiple lines. -The interpretation of string constants (e.g. their character encoding) +The interpretation of string constants (e.g., their character encoding) depends on context. @@ -330,7 +330,7 @@ added in the future: the function (as does normal C). "``fastcc``" - The fast calling convention This calling convention attempts to make calls as fast as possible - (e.g. by passing things in registers). 
This calling convention + (e.g., by passing things in registers). This calling convention allows the target to use whatever tricks it wants to produce fast code for the target, without having to conform to an externally specified ABI (Application Binary Interface). `Tail calls can only @@ -465,7 +465,7 @@ added in the future: This calling convention doesn't preserve any general registers. So all general registers are caller saved registers. It also uses all general registers to pass arguments. This attribute doesn't impact non-general - purpose registers (e.g. floating point registers, on X86 XMMs/YMMs). + purpose registers (e.g., floating point registers, on X86 XMMs/YMMs). Non-general purpose registers still follow the standard C calling convention. Currently it is for x86_64 and AArch64 only. "``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions @@ -668,7 +668,7 @@ representation is not just an integer address are called "non-integral". Non-integral pointers have at least one of the following three properties: * the pointer representation contains non-address bits -* the pointer representation is unstable (may changed at any time in a +* the pointer representation is unstable (may change at any time in a target-specific way) * the pointer representation has external state @@ -700,7 +700,7 @@ Unstable pointer representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pointers in this address space have an *unspecified* bitwise representation -(i.e. not backed by a fixed integer). The bitwise pattern of such pointers is +(i.e., not backed by a fixed integer). The bitwise pattern of such pointers is allowed to change in a target-specific way. For example, this could be a pointer type used with copying garbage collection where the garbage collector could update the pointer at any time in the collection sweep. @@ -757,7 +757,7 @@ The following restrictions apply to IR level optimization passes: The ``inttoptr`` instruction does not recreate the external state and therefore it is target dependent whether it can be used to create a dereferenceable -pointer. In general passes should assume that the result of such an inttoptr +pointer. In general passes should assume that the result of such an ``inttoptr`` is not dereferenceable. For example, on CHERI targets an ``inttoptr`` will yield a capability with the external state (the validity tag bit) set to zero, which will cause any dereference to trap. @@ -784,7 +784,7 @@ be performed as loads and stores of the correct type since stores of other types may not propagate the external data. Therefore it is not legal to convert an existing load/store (or a ``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external -state to a load/store of an integer type with same bitwidth, as that may drop +state to a load/store of an integer type with the same bitwidth, as that may drop the external state. @@ -806,7 +806,7 @@ Global variables can optionally specify a :ref:`linkage type <linkage>`. Either global variable definitions or declarations may have an explicit section to be placed in and may have an optional explicit alignment specified. If there is a mismatch between the explicit or inferred section information for the -variable declaration and its definition the resulting behavior is undefined. +variable declaration and its definition, the resulting behavior is undefined. 
A variable may be defined as a global ``constant``, which indicates that the contents of the variable will **never** be modified (enabling better @@ -903,7 +903,7 @@ size is unknown at compile time. They are allowed in structs to facilitate intrinsics returning multiple values. Generally, structs containing scalable vectors are not considered "sized" and cannot be used in loads, stores, allocas, or GEPs. The only exception to this rule is for structs that contain scalable -vectors of the same type (e.g. ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` +vectors of the same type (e.g., ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}`` doesn't). These kinds of structs (we may call them homogeneous scalable vector structs) are considered sized and can be used in loads, stores, allocas, but @@ -1221,7 +1221,7 @@ sections. Note that certain IR constructs like global variables and functions may create COMDATs in the object file in addition to any which are specified using COMDAT IR. This arises when the code generator is configured to emit globals -in individual sections (e.g. when `-data-sections` or `-function-sections` +in individual sections (e.g., when `-data-sections` or `-function-sections` is supplied to `llc`). .. _namedmetadatastructure: @@ -1334,7 +1334,7 @@ Currently, only the following parameter attributes are defined: The byval type argument indicates the in-memory value type. The byval attribute also supports specifying an alignment with the - align attribute. It indicates the alignment of the stack slot to + ``align`` attribute. It indicates the alignment of the stack slot to form and the known alignment of the pointer specified to the call site. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1355,7 +1355,7 @@ Currently, only the following parameter attributes are defined: This is not a valid attribute for return values. - The alignment for an ``byref`` parameter can be explicitly + The alignment for a ``byref`` parameter can be explicitly specified by combining it with the ``align`` attribute, similar to ``byval``. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1382,7 +1382,7 @@ Currently, only the following parameter attributes are defined: The preallocated attribute requires a type argument. The preallocated attribute also supports specifying an alignment with the - align attribute. It indicates the alignment of the stack slot to + ``align`` attribute. It indicates the alignment of the stack slot to form and the known alignment of the pointer specified to the call site. If the alignment is not specified, then the code generator makes a target-specific assumption. @@ -1550,7 +1550,7 @@ Currently, only the following parameter attributes are defined: ``nonnull`` This indicates that the parameter or return pointer is not null. This - attribute may only be applied to pointer typed parameters. This is not + attribute may only be applied to pointer-typed parameters. This is not checked or enforced by LLVM; if the parameter or return pointer is null, :ref:`poison value <poisonvalues>` is returned or passed instead. The ``nonnull`` attribute should be combined with the ``noundef`` attribute @@ -1558,7 +1558,7 @@ Currently, only the following parameter attributes are defined: ``dereferenceable(<n>)`` This indicates that the parameter or return pointer is dereferenceable. 
This - attribute may only be applied to pointer typed parameters. A pointer that + attribute may only be applied to pointer-typed parameters. A pointer that is dereferenceable can be loaded from speculatively without a risk of trapping. The number of bytes known to be dereferenceable must be provided in parentheses. It is legal for the number of bytes to be less than the @@ -1584,7 +1584,7 @@ Currently, only the following parameter attributes are defined: implies that a pointer is at least one of ``dereferenceable(<n>)`` or ``null`` (i.e., it may be both ``null`` and ``dereferenceable(<n>)``). This attribute may only be applied to - pointer typed parameters. + pointer-typed parameters. ``swiftself`` This indicates that the parameter is the self/context parameter. This is not @@ -1601,7 +1601,7 @@ Currently, only the following parameter attributes are defined: ``swifterror`` This attribute is motivated to model and optimize Swift error handling. It - can be applied to a parameter with pointer to pointer type or a + can be applied to a parameter with pointer-to-pointer type or a pointer-sized alloca. At the call site, the actual argument that corresponds to a ``swifterror`` parameter has to come from a ``swifterror`` alloca or the ``swifterror`` parameter of the caller. A ``swifterror`` value (either @@ -1722,7 +1722,7 @@ Currently, only the following parameter attributes are defined: The function parameter marked with this attribute is the alignment in bytes of the newly allocated block returned by this function. The returned value must either have the specified alignment or be the null pointer. The return value MAY be more aligned - than the requested alignment, but not less aligned. Invalid (e.g. non-power-of-2) + than the requested alignment, but not less aligned. Invalid (e.g., non-power-of-2) alignments are permitted for the allocalign parameter, so long as the returned pointer is null. This attribute may only be applied to integer parameters. @@ -1989,7 +1989,7 @@ functions will use the same set of attributes. In the degenerate case of a group will capture the important command line flags used to build that file. An attribute group is a module-level object. To use an attribute group, an -object references the attribute group's ID (e.g. ``#37``). An object may refer +object references the attribute group's ID (e.g., ``#37``). An object may refer to more than one attribute group. In that situation, the attributes from the different groups are merged. @@ -2222,7 +2222,7 @@ For example: - ``errnomem``: This refers to accesses to the ``errno`` variable. - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that - don't currently have a dedicated location kind (e.g. accesses to globals + don't currently have a dedicated location kind (e.g., accesses to globals or captured pointers). If the ``memory`` attribute is not specified, then ``memory(readwrite)`` @@ -2713,7 +2713,7 @@ For example: ``mustprogress`` This attribute indicates that the function is required to return, unwind, - or interact with the environment in an observable way e.g. via a volatile + or interact with the environment in an observable way e.g., via a volatile memory access, I/O, or other synchronization. The ``mustprogress`` attribute is intended to model the requirements of the first section of [intro.progress] of the C++ Standard. 
As a consequence, a loop in a @@ -2741,6 +2741,32 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. +``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison if + all arguments are not undef and not poison. Otherwise, it is undefined + behavior. + +``"modular-format"="<type>,<string_idx>,<first_arg_idx>,<modular_impl_fn>,<impl_name>,<aspects...>"`` + This attribute indicates that the implementation is modular on a particular + format string argument. If the compiler can determine that not all aspects + of the implementation are needed, it can report which aspects were needed + and redirect the call to a modular implementation function instead. + + The compiler reports that an implementation aspect is needed by issuing a + relocation for the symbol ``<impl_name>_<aspect>``. This arranges for code + and data needed to support the aspect of the implementation to be brought + into the link to satisfy weak references in the modular implementation + function. + + The first three arguments have the same semantics as the arguments to the C + ``format`` attribute. + + The following aspects are currently supported: + + - ``float``: The call has a floating-point argument + + Call Site Attributes ---------------------- @@ -2851,7 +2877,7 @@ are grouped into a single :ref:`attribute group <attrgrp>`. with `__attribute__((no_sanitize("memtag")))`, `__attribute__((disable_sanitizer_instrumentation))`, or included in the `-fsanitize-ignorelist` file. The AArch64 Globals Tagging pass may remove - this attribute when it's not possible to tag the global (e.g. it's a TLS + this attribute when it's not possible to tag the global (e.g., it's a TLS variable). ``sanitize_address_dyninit`` This attribute indicates that the global variable, when instrumented with @@ -3076,7 +3102,7 @@ the behavior is undefined, unless one of the following exceptions applies: * ``dereferenceable(<n>)`` operand bundles only guarantee the pointer is dereferenceable at the point of the assumption. The pointer may not be - dereferenceable at later pointers, e.g. because it could have been freed. + dereferenceable at later points, e.g., because it could have been freed. In addition to allowing operand bundles encoding function and parameter attributes, an assume operand bundle may also encode a ``separate_storage`` @@ -3270,7 +3296,7 @@ as follows: address space 0. Note: variable declarations without an address space are always created in address space 0, this property only affects the default value to be used - when creating globals without additional contextual information (e.g. in + when creating globals without additional contextual information (e.g., in LLVM passes). .. _alloca_addrspace: @@ -3282,7 +3308,7 @@ as follows: This specifies the properties of a pointer in address space ``as``. The ``<size>`` parameter specifies the size of the bitwise representation. For :ref:`non-integral pointers <nointptrtype>` the representation size may - be larger than the address width of the underlying address space (e.g. to + be larger than the address width of the underlying address space (e.g., to accommodate additional metadata). The alignment requirements are specified via the ``<abi>`` and ``<pref>``\erred alignments parameters. @@ -3478,7 +3504,7 @@ variables) may *not* change their size. 
(``realloc``-style operations do not change the size of an existing allocated object; instead, they create a new allocated object. Even if the object is at the same location as the old one, old pointers cannot be used to access this new object.) However, allocated objects -can also be created by means not recognized by LLVM, e.g. by directly calling +can also be created by means not recognized by LLVM, e.g., by directly calling ``mmap``. Those allocated objects are allowed to grow to the right (i.e., keeping the same base address, but increasing their size) while maintaining the validity of existing pointers, as long as they always satisfy the properties @@ -3632,7 +3658,7 @@ through the return value only: } However, we always consider direct inspection of the pointer address -(e.g. using ``ptrtoint``) to be location-independent. The following example +(e.g., using ``ptrtoint``) to be location-independent. The following example is *not* considered a return-only capture, even though the ``ptrtoint`` ultimately only contributes to the return value: @@ -4145,7 +4171,7 @@ output, given the original flags. ``a * (c / b)`` can be rewritten into ``a / (b / c)``. ``contract`` - Allow floating-point contraction (e.g. fusing a multiply followed by an + Allow floating-point contraction (e.g., fusing a multiply followed by an addition into a fused multiply-and-add). This does not enable reassociation to form arbitrary contractions. For example, ``(a*b) + (c*d) + e`` can not be transformed into ``(a*b) + ((c*d) + e)`` to create two fma operations. @@ -4440,7 +4466,7 @@ the default globals address space and ``addrspace("P")`` the program address space. The representation of pointers can be different for each address space and does -not necessarily need to be a plain integer address (e.g. for +not necessarily need to be a plain integer address (e.g., for :ref:`non-integral pointers <nointptrtype>`). In addition to a representation bits size, pointers in each address space also have an index size which defines the bitwidth of indexing operations as well as the size of `integer addresses` @@ -4750,7 +4776,7 @@ is inserted as defined by the DataLayout string in the module, which is required to match what the underlying code generator expects. Structures can either be "literal" or "identified". A literal structure -is defined inline with other types (e.g. ``[2 x {i32, i32}]``) whereas +is defined inline with other types (e.g., ``[2 x {i32, i32}]``) whereas identified types are always defined at the top level with a name. Literal types are uniqued by their contents and can never be recursive or opaque since there is no way to write one. Identified types can be @@ -4791,7 +4817,7 @@ Simple Constants Standard integers (such as '4') are constants of the :ref:`integer <t_integer>` type. They can be either decimal or hexadecimal. Decimal integers can be prefixed with - to represent - negative integers, e.g. '``-1234``'. Hexadecimal integers must be + negative integers, e.g., '``-1234``'. Hexadecimal integers must be prefixed with either u or s to indicate whether they are unsigned or signed respectively. e.g '``u0x8000``' gives 32768, whilst '``s0x8000``' gives -32768. @@ -4801,7 +4827,7 @@ Simple Constants zeros. So '``s0x0001``' of type '``i16``' will be -1, not 1. **Floating-point constants** Floating-point constants use standard decimal notation (e.g. - 123.421), exponential notation (e.g. 
1.23421e+2), or a more precise + 123.421), exponential notation (e.g., 1.23421e+2), or a more precise hexadecimal notation (see below). The assembler requires the exact decimal value of a floating-point constant. For example, the assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating @@ -4883,7 +4909,7 @@ constants and smaller complex constants. The string '``zeroinitializer``' can be used to zero initialize a value to zero of *any* type, including scalar and :ref:`aggregate <t_aggregate>` types. This is often used to avoid - having to print large zero initializers (e.g. for large arrays) and + having to print large zero initializers (e.g., for large arrays) and is always exactly equivalent to using explicit zero initializers. **Metadata node** A metadata node is a constant tuple without types. For example: @@ -5286,7 +5312,7 @@ Constant Expressions Constant expressions are used to allow expressions involving other constants to be used as constants. Constant expressions may be of any :ref:`first class <t_firstclass>` type and may involve any LLVM operation -that does not have side effects (e.g. load and call are not supported). +that does not have side effects (e.g., load and call are not supported). The following is the syntax for constant expressions: ``trunc (CST to TYPE)`` @@ -5472,7 +5498,7 @@ There are also three different categories of constraint codes: Output constraints """""""""""""""""" -Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This +Output constraints are specified by an "``=``" prefix (e.g., "``=r``"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the ``asm`` expression. Output constraints do not consume an argument from the call instruction. (Except, see @@ -5480,10 +5506,10 @@ below about indirect outputs). Normally, it is expected that no output locations are written to by the assembly expression until *all* of the inputs have been read. As such, LLVM may assign -the same register to an output and an input. If this is not safe (e.g. if the +the same register to an output and an input. If this is not safe (e.g., if the assembly contains two instructions, where the first writes to one output, and the second reads an input and writes to a second output), then the "``&``" -modifier must be used (e.g. "``=&r``") to specify that the output is an +modifier must be used (e.g., "``=&r``") to specify that the output is an "early-clobber" output. Marking an output as "early-clobber" ensures that LLVM will not use the same register for any inputs (other than an input tied to this output). @@ -5523,17 +5549,17 @@ However, this feature is often not as useful as you might think. Firstly, the registers are *not* guaranteed to be consecutive. So, on those architectures that have instructions which operate on multiple consecutive -instructions, this is not an appropriate way to support them. (e.g. the 32-bit +instructions, this is not an appropriate way to support them. (e.g., the 32-bit SparcV8 has a 64-bit load, which instruction takes a single 32-bit register. The hardware then loads into both the named register, and the next register. This feature of inline asm would not be useful to support that.) A few of the targets provide a template string modifier allowing explicit access -to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and +to the second register of a two-register operand (e.g., MIPS ``L``, ``M``, and ``D``). 
On such an architecture, you can actually access the second allocated register (yet, still, not any subsequent ones). But, in that case, you're still probably better off simply splitting the value into two separate operands, for -clarity. (e.g. see the description of the ``A`` constraint on X86, which, +clarity. (e.g., see the description of the ``A`` constraint on X86, which, despite existing only for use with this feature, is not really a good idea to use) @@ -5549,11 +5575,11 @@ rather than producing a return value. An indirect output constraint is an "output" only in that the asm is expected to write to the contents of the input memory location, instead of just read from it). -This is most typically used for memory constraint, e.g. "``=*m``", to pass the +This is most typically used for memory constraint, e.g., "``=*m``", to pass the address of a variable as a value. It is also possible to use an indirect *register* constraint, but only on output -(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output +(e.g., "``=*r``"). This will cause LLVM to allocate a register for an output value normally, and then, separately emit a store to the address provided as input, after the provided inline asm. (It's not clear what value this functionality provides, compared to writing the store explicitly after the asm @@ -5570,7 +5596,7 @@ Clobber constraints A clobber constraint is indicated by a "``~``" prefix. A clobber does not consume an input operand, nor generate an output. Clobbers cannot use any of the general constraint code letters -- they may use only explicit register -constraints, e.g. "``~{eax}``". The one exception is that a clobber string of +constraints, e.g., "``~{eax}``". The one exception is that a clobber string of "``~{memory}``" indicates that the assembly writes to arbitrary undeclared memory locations -- not only the memory pointed to by a declared indirect output. @@ -5594,9 +5620,9 @@ Constraint Codes """""""""""""""" After a potential prefix comes constraint code, or codes. -A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character -followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``" -(e.g. "``{eax}``"). +A Constraint Code is either a single letter (e.g., "``r``"), a "``^``" character +followed by two letters (e.g., "``^wc``"), or "``{``" register-name "``}``" +(e.g., "``{eax}``"). The one and two letter constraint codes are typically chosen to be the same as GCC's constraint codes. @@ -5973,11 +5999,11 @@ Target-independent: - ``a``: Print a memory reference. Targets might customize the output. - ``c``: Print an immediate integer constant unadorned, without - the target-specific immediate punctuation (e.g. no ``$`` prefix). + the target-specific immediate punctuation (e.g., no ``$`` prefix). - ``n``: Negate and print immediate integer constant unadorned, without the - target-specific immediate punctuation (e.g. no ``$`` prefix). + target-specific immediate punctuation (e.g., no ``$`` prefix). - ``l``: Print as an unadorned label, without the target-specific label - punctuation (e.g. no ``$`` prefix). + punctuation (e.g., no ``$`` prefix). AArch64: @@ -5998,7 +6024,7 @@ ARM: register). - ``P``: No effect. - ``q``: No effect. -- ``y``: Print a VFP single-precision register as an indexed double (e.g. print +- ``y``: Print a VFP single-precision register as an indexed double (e.g., print as ``d4[1]`` instead of ``s9``) - ``B``: Bitwise invert and print an immediate integer constant without ``#`` prefix. 
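To tie the constraint and modifier descriptions above to concrete IR, a minimal (x86) sketch follows; the function and value names are illustrative:

.. code-block:: llvm

    define i32 @swap_bytes(i32 %n) {
      ; "=r" requests a register output; "0" ties the input to the same
      ; register as output operand 0, which the template references as $0.
      %r = call i32 asm "bswap $0", "=r,0"(i32 %n)
      ret i32 %r
    }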
@@ -6114,18 +6140,18 @@ X86: - ``c``: Print an unadorned integer or symbol name. (The latter is target-specific behavior for this typically target-independent modifier). - ``A``: Print a register name with a '``*``' before it. -- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory +- ``b``: Print an 8-bit register name (e.g., ``al``); do nothing on a memory operand. -- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a +- ``h``: Print the upper 8-bit register name (e.g., ``ah``); do nothing on a memory operand. -- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory +- ``w``: Print the 16-bit register name (e.g., ``ax``); do nothing on a memory operand. -- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory +- ``k``: Print the 32-bit register name (e.g., ``eax``); do nothing on a memory operand. -- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are +- ``q``: Print the 64-bit register name (e.g., ``rax``), if 64-bit registers are available, otherwise the 32-bit register name; do nothing on a memory operand. - ``n``: Negate and print an unadorned integer, or, for operands other than an - immediate integer (e.g. a relocatable symbol expression), print a '-' before + immediate integer (e.g., a relocatable symbol expression), print a '-' before the operand. (The behavior for relocatable symbol expressions is a target-specific behavior for this typically target-independent modifier) - ``H``: Print a memory reference with additional offset +8. @@ -6883,7 +6909,7 @@ See :ref:`diexpression` for details. .. note:: ``DIExpression``\s are always printed and parsed inline; they can never be - referenced by an ID (e.g. ``!1``). + referenced by an ID (e.g., ``!1``). Some examples of expressions: @@ -8469,8 +8495,8 @@ that was typically cold and one allocating memory that was typically not cold. The format of the metadata describing a context specific profile (e.g. ``!1`` and ``!3`` above) requires a first operand that is a metadata node describing the context, followed by a list of string metadata tags describing -the profile behavior (e.g. ``cold`` and ``notcold``) above. The metadata nodes -describing the context (e.g. ``!2`` and ``!4`` above) are unique ids +the profile behavior (e.g., ``cold`` and ``notcold``) above. The metadata nodes +describing the context (e.g., ``!2`` and ``!4`` above) are unique ids corresponding to callsites, which can be matched to associated IR calls via :ref:`callsite metadata<md_callsite>`. In practice these ids are formed via a hash of the callsite's debug info, and the associated call may be in a @@ -8946,7 +8972,7 @@ in syntax by a caret ('``^``'). The summary is parsed into a bitcode output, along with the Module IR, via the "``llvm-as``" tool. Tools that parse the Module IR for the purposes -of optimization (e.g. "``clang -x ir``" and "``opt``"), will ignore the +of optimization (e.g., "``clang -x ir``" and "``opt``"), will ignore the summary entries (just as they currently ignore summary entries in a bitcode input file). @@ -9176,7 +9202,7 @@ The optional ``Refs`` field looks like: refs: ((Ref)[, (Ref)]*) where each ``Ref`` contains a reference to the summary id of the referenced -value (e.g. ``^1``). +value (e.g., ``^1``). .. _typeidinfo_summary: @@ -10385,7 +10411,7 @@ bit width of the result. 
Because LLVM integers use a two's complement representation, and the result is the same width as the operands, this instruction returns the correct result for both signed and unsigned integers. If a full product -(e.g. ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be +(e.g., ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be sign-extended or zero-extended as appropriate to the width of the full product. @@ -11378,7 +11404,7 @@ allocation on any convenient boundary compatible with the type. '``type``' may be any sized type. Structs containing scalable vectors cannot be used in allocas unless all -fields are the same scalable vector type (e.g. ``{<vscale x 2 x i32>, +fields are the same scalable vector type (e.g., ``{<vscale x 2 x i32>, <vscale x 2 x i32>}`` contains the same type while ``{<vscale x 2 x i32>, <vscale x 2 x i64>}`` doesn't). @@ -12766,7 +12792,7 @@ pointer then a truncation is done. If ``value`` is smaller than the size of a pointer then a zero extension is done. If they are the same size, nothing is done (*no-op cast*). The behavior is equivalent to a ``bitcast``, however, the resulting value is not -guaranteed to be dereferenceable (e.g. if the result type is a +guaranteed to be dereferenceable (e.g., if the result type is a :ref:`non-integral pointers <nointptrtype>`). Example: @@ -14697,7 +14723,7 @@ C++ object with a non-trivial destructor. ``llvm.seh.scope.begin`` is used to m the start of the region; it is always called with ``invoke``, with the unwind block being the desired unwind destination for any potentially-throwing instructions within the region. `llvm.seh.scope.end` is used to mark when the scope ends -and the EH cleanup is no longer required (e.g. because the destructor is being +and the EH cleanup is no longer required (e.g., because the destructor is being called). .. _int_read_register: @@ -14737,7 +14763,7 @@ return the current value of the register, where possible. The where possible. A call to '``llvm.read_volatile_register``' is assumed to have side-effects -and possibly return a different value each time (e.g. for a timer register). +and possibly return a different value each time (e.g., for a timer register). This is useful to implement named register global variables that need to always be mapped to a specific register, as is common practice on @@ -15008,9 +15034,9 @@ flushes the instruction cache. Semantics: """""""""" -On platforms with coherent instruction and data caches (e.g. x86), this +On platforms with coherent instruction and data caches (e.g., x86), this intrinsic is a nop. On platforms with non-coherent instruction and data -cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate +cache (e.g., ARM, MIPS), the intrinsic is lowered either to appropriate instructions or a system call, if cache flushing requires special privileges. @@ -15462,7 +15488,7 @@ A call to '``llvm.call.preallocated.arg``' must have a call site ``preallocated`` attribute. The type of the ``preallocated`` attribute must match the type used by the ``preallocated`` attribute of the corresponding argument at the preallocated call. The type is used in the case that an -``llvm.call.preallocated.setup`` does not have a corresponding call (e.g. due +``llvm.call.preallocated.setup`` does not have a corresponding call (e.g., due to DCE), where otherwise we cannot know how large the arguments are. It is undefined behavior if this is called with a token from an @@ -16656,7 +16682,7 @@ for large input values. .. 
note:: Currently, the default lowering of this intrinsic relies on the ``sincospi[f|l]`` - functions being available in the target's runtime (e.g. libc). + functions being available in the target's runtime (e.g., libc). When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. @@ -19719,7 +19745,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19758,7 +19784,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19794,7 +19820,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -19832,7 +19858,7 @@ Arguments: """""""""" The integer operand is the loop trip count of the hardware-loop, and thus -not e.g. the loop back-edge taken count. +not e.g., the loop back-edge taken count. Semantics: """""""""" @@ -20363,6 +20389,77 @@ Arguments: """""""""" The argument to this intrinsic must be a vector of floating-point values. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial reductions of vectors can be expressed using the intrinsics described in +this section. Each one reduces the concatenation of the two vector arguments +down to the number of elements of the result vector type. + +Other than the reduction operator (e.g., add, fadd), the way in which the +concatenated arguments are reduced is entirely unspecified. By their nature these +intrinsics are not expected to be useful in isolation but can instead be used to +implement the first phase of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase is required to calculate the final scalar +result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + +'``llvm.vector.partial.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b) + declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b) + +Arguments: +"""""""""" + +The first argument is an integer vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b) + declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b) + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +Semantics: +"""""""""" + +As the way in which the arguments to this floating-point intrinsic are reduced +is unspecified, this intrinsic will assume floating-point reassociation and +contraction can be leveraged to implement the reduction, which may result in +variations in the results due to reordering or lowering to different +instructions (including combining multiple instructions into a single one). + '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20736,50 +20833,6 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. -'``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) - declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b) - declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b) - -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - -Arguments: -"""""""""" - -The first argument is an integer vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -Semantics: -"""""""""" - -Other than the reduction operator (e.g. add) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -24286,7 +24339,7 @@ The arguments are scalar types to accommodate scalable vector types, for which it is unknown what the type of the step vector needs to be that enumerate its lanes without overflow. -This mask ``%m`` can e.g. be used in masked load/store instructions. These +This mask ``%m`` can e.g., be used in masked load/store instructions. These
I.e., for a vector loop, the back-edge taken count of the original scalar loop is explicit as the second argument. @@ -24624,7 +24677,7 @@ Examples: .. _int_vp_load_ff: -'``llvm.vp.load_ff``' Intrinsic +'``llvm.vp.load.ff``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -27966,7 +28019,7 @@ The quiet comparison operation performed by if either argument is a SNAN. The signaling comparison operation performed by '``llvm.experimental.constrained.fcmps``' will raise an exception if either argument is a NAN (QNAN or SNAN). Such an exception -does not preclude a result being produced (e.g. exception might only +does not preclude a result being produced (e.g., exception might only set a flag), therefore the distinction between ordered and unordered comparisons is also relevant for the '``llvm.experimental.constrained.fcmps``' intrinsic. @@ -29983,7 +30036,7 @@ Semantics: On some platforms, the value returned by this intrinsic remains unchanged between loads in the same thread. On other platforms, it returns the same -global variable value, if any, e.g. ``@__stack_chk_guard``. +global variable value, if any, e.g., ``@__stack_chk_guard``. Currently some platforms have IR-level customized stack guard loading (e.g. X86 Linux) that is not handled by ``llvm.stackguard()``, while they should be @@ -30963,6 +31016,37 @@ This intrinsic does nothing, but optimizers must consider it a use of its single operand and should try to preserve the intrinsic and its position in the function. +.. _llvm_reloc_none: + +'``llvm.reloc.none``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare void @llvm.reloc.none(metadata !<name_str>) + +Overview: +""""""""" + +The ``llvm.reloc.none`` intrinsic emits a no-op relocation against a given +operand symbol. This can bring the symbol definition into the link without +emitting any code or data to the binary for that purpose. + +Arguments: +"""""""""" + +The ``llvm.reloc.none`` intrinsic takes the symbol as a metadata string +argument. + +Semantics: +"""""""""" + +This intrinsic emits a no-op relocation for the symbol at the location of the +intrinsic call. + Stack Map Intrinsics -------------------- diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. 
-- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. _limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. 
code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. 
Consider the example below:

diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst
index d43b9c3a89091..d64c846687bae 100644
--- a/llvm/docs/MergeFunctions.rst
+++ b/llvm/docs/MergeFunctions.rst
@@ -8,9 +8,9 @@ MergeFunctions pass, how it works
 Introduction
 ============
 Sometimes code contains equal functions, or functions that do exactly the same
-thing even though they are non-equal on the IR level (e.g.: multiplication on 2
-and 'shl 1'). This can happen for several reasons: mainly, the usage of
-templates and automatic code generators. Though, sometimes the user itself could
+thing even though they are non-equal on the IR level (e.g., multiplication by 2
+and ``shl 1``). This can happen for several reasons: mainly, the usage of
+templates and automatic code generators. However, sometimes the user could
 write the same thing twice :-)
 
 The main purpose of this pass is to recognize such functions and merge them.
@@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could
 combine equal functions correctly to keep the module valid.
 
-Material is brought in a top-down form, so the reader could start to learn pass
+The material is presented in a top-down form, so the reader can start to learn the pass
 from high level ideas and end with low-level algorithm details, thus preparing
 him or her for reading the sources.
 
 The main goal is to describe the algorithm and logic here and the concept. If
 you *don't want* to read the source code, but want to understand pass
 algorithms, this document is good for you. The author tries not to repeat the
-source-code and covers only common cases to avoid the cases of needing to
+source code and covers only common cases to avoid needing to
 update this document after any minor code changes.
 
 What should I know to be able to follow along with this document?
 -----------------------------------------------------------------
 
-The reader should be familiar with common compile-engineering principles and
+The reader should be familiar with common compiler-engineering principles and
 LLVM code fundamentals. In this article, we assume the reader is familiar with
 `Single Static Assignment
 <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_
@@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are
 described a bit below.
 
 As another example, the reader may imagine two more functions. The first
-function performs a multiplication by 2, while the second one performs an
+function performs a multiplication by 2, while the second one performs a
 logical left shift by 1.
 
 Possible solutions
@@ -131,7 +131,7 @@ access lookup? The answer is: "yes".
 
 Random-access
 """""""""""""
 How can this be done? Just convert each function to a number, and gather
-all of them in a special hash-table. Functions with equal hashes are equal.
+all of them in a special hash table. Functions with equal hashes are equal.
 Good hashing means, that every function part must be taken into account. That
 means we have to convert every function part into some number, and then add it
 into the hash. The lookup-up time would be small, but such an approach adds some
@@ -175,7 +175,7 @@ merged with each other.
It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. 
Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index d99b5843c2133..270a635e0d153 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -113,7 +113,7 @@ rarely have to include this file directly). ``isa<>``: The ``isa<>`` operator works exactly like the Java "``instanceof``" operator. - It returns true or false depending on whether a reference or pointer points to + It returns ``true`` or ``false`` depending on whether a reference or pointer points to an instance of the specified class. This can be very useful for constraint checking of various sorts (example below). @@ -167,7 +167,7 @@ rarely have to include this file directly). ``isa_and_present<>``: The ``isa_and_present<>`` operator works just like the ``isa<>`` operator, except that it allows for a null pointer as an argument (which it then - returns false). This can sometimes be useful, allowing you to combine several + returns ``false``). This can sometimes be useful, allowing you to combine several null checks into one. ``cast_if_present<>``: @@ -402,7 +402,7 @@ doxygen documentation or by looking at the unit test suite. Error handling -------------- -Proper error handling helps us identify bugs in our code, and helps end-users +Proper error handling helps us identify bugs in our code, and helps end users understand errors in their tool usage. Errors fall into two broad categories: *programmatic* and *recoverable*, with different strategies for handling and reporting. @@ -449,10 +449,10 @@ violations even in builds that do not enable assertions: Recoverable Errors ^^^^^^^^^^^^^^^^^^ -Recoverable errors represent an error in the program's environment, for example +Recoverable errors represent an error in the program's environment, for example, a resource failure (a missing file, a dropped network connection, etc.), or malformed input. These errors should be detected and communicated to a level of -the program where they can be handled appropriately. Handling the error may be +the program that can handle them appropriately. Handling the error may be as simple as reporting the issue to the user, or it may involve attempts at recovery. @@ -668,7 +668,7 @@ Since the list of handlers passed to ``handleErrors`` may not cover every error type that can occur, the ``handleErrors`` function also returns an Error value that must be checked or propagated. If the error value that is passed to ``handleErrors`` does not match any of the handlers it will be returned from -handleErrors. Idiomatic use of ``handleErrors`` thus looks like: +``handleErrors``. Idiomatic use of ``handleErrors`` thus looks like: .. code-block:: c++ @@ -683,18 +683,18 @@ handleErrors. 
Idiomatic use of ``handleErrors`` thus looks like: })) return Err; -In cases where you truly know that the handler list is exhaustive the +In cases where you truly know that the handler list is exhaustive, the ``handleAllErrors`` function can be used instead. This is identical to ``handleErrors`` except that it will terminate the program if an unhandled error is passed in, and can therefore return void. The ``handleAllErrors`` function should generally be avoided: the introduction of a new error type elsewhere in the program can easily turn a formerly exhaustive list of errors into a non-exhaustive list, risking unexpected program termination. Where -possible, use handleErrors and propagate unknown errors up the stack instead. +possible, use ``handleErrors`` and propagate unknown errors up the stack instead. For tool code, where errors can be handled by printing an error message then exiting with an error code, the :ref:`ExitOnError <err_exitonerr>` utility -may be a better choice than handleErrors, as it simplifies control flow when +may be a better choice than ``handleErrors``, as it simplifies control flow when calling fallible functions. In situations where it is known that a particular call to a fallible function @@ -706,9 +706,9 @@ simplifying control flow. StringError """"""""""" -Many kinds of errors have no recovery strategy, the only action that can be +Many kinds of errors have no recovery strategy; the only action that can be taken is to report them to the user so that the user can attempt to fix the -environment. In this case representing the error as a string makes perfect +environment. In this case, representing the error as a string makes perfect sense. LLVM provides the ``StringError`` class for this purpose. It takes two arguments: A string error message, and an equivalent ``std::error_code`` for interoperability. It also provides a ``createStringError`` function to simplify @@ -721,7 +721,7 @@ common usage of this class: createStringError(errc::executable_format_error, "Bad executable"); If you're certain that the error you're building will never need to be converted -to a ``std::error_code`` you can use the ``inconvertibleErrorCode()`` function: +to a ``std::error_code``, you can use the ``inconvertibleErrorCode()`` function: .. code-block:: c++ @@ -791,18 +791,18 @@ actually recognises three different forms of handler signature: Error(std::unique_ptr<UserDefinedError> E); Any error returned from a handler will be returned from the ``handleErrors`` -function so that it can be handled itself, or propagated up the stack. +function so that it can be handled itself or propagated up the stack. .. _err_exitonerr: Using ExitOnError to simplify tool code """"""""""""""""""""""""""""""""""""""" -Library code should never call ``exit`` for a recoverable error, however in tool +Library code should never call ``exit`` for a recoverable error; however, in tool code (especially command line tools) this can be a reasonable approach. Calling ``exit`` upon encountering an error dramatically simplifies control flow as the error no longer needs to be propagated up the stack. This allows code to be -written in straight-line style, as long as each fallible call is wrapped in a +written in a straight-line style, as long as each fallible call is wrapped in a check and call to exit. The ``ExitOnError`` class supports this pattern by providing call operators that inspect ``Error`` values, stripping the error away in the success case and logging to ``stderr`` then exiting in the failure case. 
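+For example, a small tool might funnel every fallible call through a single
+``ExitOnError`` instance (a minimal sketch; the banner text and the
+file-loading helper are illustrative, not part of the API being documented):
+
+.. code-block:: c++
+
+  static ExitOnError ExitOnErr("mytool: ");
+
+  std::unique_ptr<MemoryBuffer> openInput(StringRef Path) {
+    // getFile returns ErrorOr<std::unique_ptr<MemoryBuffer>>; convert it to
+    // Expected and let ExitOnErr unwrap it, printing the banner and exiting
+    // on failure.
+    return ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(Path)));
+  }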
@@ -827,7 +827,7 @@ turning them into non-failing calls: } On failure, the error's log message will be written to ``stderr``, optionally -preceded by a string "banner" that can be set by calling the setBanner method. A +preceded by a string "banner" that can be set by calling the ``setBanner`` method. A mapping can also be supplied from ``Error`` values to exit codes using the ``setExitCodeMapper`` method: @@ -854,8 +854,8 @@ Some functions may only fail for a subset of their inputs, so calls using known safe inputs can be assumed to succeed. The cantFail functions encapsulate this by wrapping an assertion that their -argument is a success value and, in the case of Expected<T>, unwrapping the -T value: +argument is a success value and, in the case of ``Expected<T>``, unwrapping the +``T`` value: .. code-block:: c++ @@ -868,16 +868,16 @@ T value: ... } -Like the ExitOnError utility, cantFail simplifies control flow. Their treatment +Like the ExitOnError utility, ``cantFail`` simplifies control flow. Their treatment of error cases is very different, however: Where ExitOnError is guaranteed to -terminate the program on an error input, cantFail simply asserts that the result +terminate the program on an error input, ``cantFail`` simply asserts that the result is success. In debug builds this will result in an assertion failure if an error -is encountered. In release builds, the behavior of cantFail for failure values is -undefined. As such, care must be taken in the use of cantFail: clients must be -certain that a cantFail wrapped call really can not fail with the given +is encountered. In release builds, the behavior of ``cantFail`` for failure values is +undefined. As such, care must be taken in the use of ``cantFail``: clients must be +certain that a ``cantFail`` wrapped call really can not fail with the given arguments. -Use of the cantFail functions should be rare in library code, but they are +Use of the ``cantFail`` functions should be rare in library code, but they are likely to be of more use in tool and unit-test code where inputs and/or mocked-up classes or functions may be known to be safe. @@ -979,7 +979,7 @@ completing the walk over the archive they could use the ``joinErrors`` utility: } The ``joinErrors`` routine builds a special error type called ``ErrorList``, -which holds a list of user defined errors. The ``handleErrors`` routine +which holds a list of user-defined errors. The ``handleErrors`` routine recognizes this type and will attempt to handle each of the contained errors in order. If all contained errors can be handled, ``handleErrors`` will return ``Error::success()``; otherwise, ``handleErrors`` will concatenate the remaining @@ -1043,7 +1043,7 @@ compared to ``end`` and found to be unequal (in particular, this marks the error as checked throughout the body of a range-based for loop), enabling early exit from the loop without redundant error checking. -Instances of the fallible iterator interface (e.g. FallibleChildIterator above) +Instances of the fallible iterator interface (e.g., FallibleChildIterator above) are wrapped using the ``make_fallible_itr`` and ``make_fallible_end`` functions. E.g.: @@ -1146,7 +1146,7 @@ be passed by value. The ``LDBG`` and ``LLVM_DEBUG()`` macros and ``-debug`` option -------------------------------------------------------------- -Often when working on your pass you will put a bunch of debugging printouts and +Often, when working on your pass, you will put a bunch of debugging printouts and other code into your pass. 
After you get it working, you want to remove it, but you may need it again in the future (to work out new bugs that you run across). @@ -1183,7 +1183,7 @@ The debug output can be enabled by passing the ``-debug`` command line argument. $ opt < a.bc > /dev/null -mypass -debug [my-pass MyPass.cpp:123 2] I am here! -While `LDBG()` is useful to add debug output to your code, there are cases +While ``LDBG()`` is useful to add debug output to your code, there are cases where you may need to guard a block of code with a debug check. The ``llvm/Support/Debug.h`` (`doxygen <https://llvm.org/doxygen/Debug_8h_source.html>`__) file provides a macro named @@ -1220,7 +1220,7 @@ with ``-debug``. Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sometimes you may find yourself in a situation where enabling ``-debug`` just +Sometimes, you may find yourself in a situation where enabling ``-debug`` just turns on **too much** information (such as when working on the code generator). If you want to enable debug information with more fine-grained control, you can control the debug type and level with associate with each logging statement @@ -1389,7 +1389,7 @@ maintainable and useful. Adding debug counters to aid in debugging your code --------------------------------------------------- -Sometimes, when writing new passes, or trying to track down bugs, it +Sometimes, when writing new passes or trying to track down bugs, it is useful to be able to control whether certain things in your pass happen or not. For example, there are times the minimization tooling can only easily give you large testcases. You would like to narrow @@ -1640,7 +1640,7 @@ dynamically smaller than N, no malloc is performed. This can be a big win in cases where the malloc/free call is far more expensive than the code that fiddles around with the elements. -This is good for vectors that are "usually small" (e.g. the number of +This is good for vectors that are "usually small" (e.g., the number of predecessors/successors of a block is usually less than 8). On the other hand, this makes the size of the ``SmallVector`` itself large, so you don't want to allocate lots of them (doing so will waste a lot of space). As such, @@ -1684,7 +1684,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes). .. code-block:: c++ - // DISCOURAGED: Clients cannot pass e.g. raw arrays. + // DISCOURAGED: Clients cannot pass e.g., raw arrays. hardcodedContiguousStorage(const SmallVectorImpl<Foo> &In); // ENCOURAGED: Clients can pass any contiguous storage of Foo. allowsAnyContiguousStorage(ArrayRef<Foo> In); @@ -1695,7 +1695,7 @@ to keep ``sizeof(SmallVector<T>)`` around 64 bytes). allowsAnyContiguousStorage(Vec); // Works. } - // DISCOURAGED: Clients cannot pass e.g. SmallVector<Foo, 8>. + // DISCOURAGED: Clients cannot pass e.g., SmallVector<Foo, 8>. hardcodedSmallSize(SmallVector<Foo, 2> &Out); // ENCOURAGED: Clients can pass any SmallVector<Foo, N>. allowsAnySmallSize(SmallVectorImpl<Foo> &Out); @@ -1729,17 +1729,17 @@ page and one extra indirection when accessing elements with their positional index. 
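+For example (a minimal sketch; the element type and sizes are illustrative):
+
+.. code-block:: c++
+
+  #include "llvm/ADT/PagedVector.h"
+
+  llvm::PagedVector<int> V;
+  V.resize(1000000); // No page is allocated yet.
+  V[42] = 7;         // Materializes only the page containing element 42.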
In order to minimise the memory footprint of this container, it's important to -balance the ``PageSize`` so that it's not too small (otherwise the overhead of the -pointer per page might become too high) and not too big (otherwise the memory +balance the ``PageSize`` so that it's not too small (otherwise, the overhead of the +pointer per page might become too high) and not too big (otherwise, the memory is wasted if the page is not fully used). Moreover, while retaining the order of the elements based on their insertion index, like a vector, iterating over the elements via ``begin()`` and ``end()`` -is not provided in the API, due to the fact accessing the elements in order +is not provided in the API, due to the fact that accessing the elements in order would allocate all the iterated pages, defeating memory savings and the purpose of the ``PagedVector``. -Finally a ``materialized_begin()`` and ``materialized_end`` iterators are +Finally, ``materialized_begin()`` and ``materialized_end`` iterators are provided to access the elements associated to the accessed pages, which could speed up operations that need to iterate over initialized elements in a non-ordered manner. @@ -1782,9 +1782,9 @@ loop. ^^^^^^^ ``std::deque`` is, in some senses, a generalized version of ``std::vector``. -Like ``std::vector``, it provides constant time random access and other similar +Like ``std::vector``, it provides constant-time random access and other similar properties, but it also provides efficient access to the front of the list. It -does not guarantee continuity of elements within memory. +does not guarantee the continuity of elements within memory. In exchange for this extra flexibility, ``std::deque`` has significantly higher constant factor costs than ``std::vector``. If possible, use ``std::vector`` or @@ -1843,7 +1843,7 @@ Related classes of interest are explained in the following subsections: llvm/ADT/PackedVector.h ^^^^^^^^^^^^^^^^^^^^^^^ -Useful for storing a vector of values using only a few number of bits for each +Useful for storing a vector of values using only a few bits for each value. Apart from the standard operations of a vector-like container, it can also perform an 'or' set operation. @@ -1901,13 +1901,13 @@ non-empty ``ilist``\ s. The only sensible solution to this problem is to allocate a so-called *sentinel* along with the intrusive list, which serves as the ``end`` iterator, providing -the back-link to the last element. However conforming to the C++ convention it +the back-link to the last element. However, conforming to the C++ convention it is illegal to ``operator++`` beyond the sentinel and it also must not be dereferenced. These constraints allow for some implementation freedom to the ``ilist`` how to allocate and store the sentinel. The corresponding policy is dictated by -``ilist_traits<T>``. By default a ``T`` gets heap-allocated whenever the need +``ilist_traits<T>``. By default, a ``T`` gets heap-allocated whenever the need for a sentinel arises. While the default policy is sufficient in most cases, it may break down when @@ -1941,7 +1941,7 @@ String-like containers There are a variety of ways to pass around and use strings in C and C++, and LLVM adds a few new options to choose from. Pick the first option on this list -that will do what you need, they are ordered according to their relative cost. +that will do what you need; they are ordered according to their relative cost. Note that it is generally preferred to *not* pass strings around as ``const char*``'s. 
These have a number of problems, including the fact that they @@ -1973,12 +1973,12 @@ either because they are C string literals, ``std::string``, a C array, or a ``StringRef`` has a few major limitations which make more powerful string containers useful: -#. You cannot directly convert a ``StringRef`` to a 'const char*' because there is +#. You cannot directly convert a ``StringRef`` to a ``const char*`` because there is no way to add a trailing nul (unlike the ``.c_str()`` method on various stronger classes). #. ``StringRef`` doesn't own or keep alive the underlying string bytes. - As such it can easily lead to dangling pointers, and is not suitable for + As such, it can easily lead to dangling pointers, and is not suitable for embedding in datastructures in most cases (instead, use an ``std::string`` or something like that). @@ -2064,7 +2064,7 @@ so it can be embedded into heap data structures and returned by-value. On the other hand, ``std::string`` is highly inefficient for inline editing (e.g. concatenating a bunch of stuff together) and because it is provided by the standard library, its performance characteristics depend a lot of the host -standard library (e.g. libc++ and MSVC provide a highly optimized string class, +standard library (e.g., libc++ and MSVC provide a highly optimized string class, GCC contains a really slow implementation). The major disadvantage of ``std::string`` is that almost every operation that makes @@ -2198,7 +2198,7 @@ physical registers, virtual registers, or numbered basic blocks. ``SparseMultiSet`` is useful for algorithms that need very fast clear/find/insert/erase of the entire collection, and iteration over sets of elements sharing a key. It is often a more efficient choice than using composite -data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for +data structures (e.g., vector-of-vectors, map-of-vectors). It is not intended for building composite data structures. .. _dss_FoldingSet: @@ -2268,7 +2268,7 @@ iteration. The difference between ``SetVector`` and other sets is that the order of iteration is guaranteed to match the order of insertion into the ``SetVector``. This property is really important for things like sets of pointers. Because pointer values -are non-deterministic (e.g. vary across runs of the program on different +are non-deterministic (e.g., vary across runs of the program on different machines), iterating over the pointers in the set will not be in a well-defined order. @@ -2473,7 +2473,7 @@ pair in the map, etc. ``std::map`` is most useful when your keys or values are very large, if you need to iterate over the collection in sorted order, or if you need stable iterators -into the map (i.e. they don't get invalidated if an insertion or deletion of +into the map (i.e., they don't get invalidated if an insertion or deletion of another element takes place). .. _dss_mapvector: @@ -2542,7 +2542,7 @@ There are several bit storage containers, and choosing when to use each is relatively straightforward. One additional option is ``std::vector<bool>``: we discourage its use for two -reasons 1) the implementation in many common compilers (e.g. commonly +reasons 1) the implementation in many common compilers (e.g., commonly available versions of GCC) is extremely inefficient and 2) the C++ standards committee is likely to deprecate this container and/or change it significantly somehow. In any case, please don't use it. @@ -2557,7 +2557,7 @@ It supports individual bit setting/testing, as well as set operations. 
The set operations take time O(size of bitvector), but operations are performed one word at a time, instead of one bit at a time. This makes the ``BitVector`` very fast for set operations compared to other containers. Use the ``BitVector`` when you expect -the number of set bits to be high (i.e. a dense set). +the number of set bits to be high (i.e., a dense set). .. _dss_smallbitvector: @@ -3305,7 +3305,7 @@ naming value definitions. The symbol table can provide a name for any Value_. Note that the ``SymbolTable`` class should not be directly accessed by most clients. It should only be used when iteration over the symbol table names themselves are required, which is very special purpose. Note that not all LLVM -Value_\ s have names, and those without names (i.e. they have an empty name) do +Value_\ s have names, and those without names (i.e., they have an empty name) do not exist in the symbol table. Symbol tables support iteration over the values in the symbol table with @@ -3871,7 +3871,7 @@ Important Public Members of the ``Instruction`` class * ``bool mayWriteToMemory()`` - Returns true if the instruction writes to memory, i.e. it is a ``call``, + Returns true if the instruction writes to memory, i.e., it is a ``call``, ``free``, ``invoke``, or ``store``. * ``unsigned getOpcode()`` @@ -3881,7 +3881,7 @@ Important Public Members of the ``Instruction`` class * ``Instruction *clone() const`` Returns another instance of the specified instruction, identical in all ways - to the original except that the instruction has no parent (i.e. it's not + to the original except that the instruction has no parent (i.e., it's not embedded into a BasicBlock_), and it has no name. .. _Constant: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..a21f03d389444 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -351,6 +351,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvqdotq`` LLVM implements the `0.0.1 draft specification <https://github.com/riscv/riscv-dot-product/releases/tag/v0.0.1>`__. +``experimental-smpmpmt`` + LLVM implements the `0.6 draft specification <https://github.com/riscv/riscv-isa-manual/blob/smpmpmt/src/smpmpmt.adoc>`__. + To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. Vendor Extensions @@ -406,6 +409,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. 
+
+``Xsfvfexpa`` and ``Xsfvfexpa64e``
+  LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above.
+
 ``XSfvqmaccdod``, ``XSfvqmaccqoq``
   LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 49158fb4217b6..6f386b81476ac 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -67,6 +67,13 @@ Changes to the LLVM IR
   Instead, the `align` attribute should be placed on the pointer (or vector of
   pointers) argument.
 * A `load atomic` may now be used with vector types on x86.
+* Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This
+  emits an undefined symbol reference without adding any dedicated code or data
+  to bear the relocation.
+* Added `modular-format` attribute to dynamically pull in aspects of libc
+  format string function implementations from statically-linked libc's based on
+  the requirements of each call. Currently, only `float` is supported; this can
+  keep floating point support out of printf if it can be proven unused.
 
 Changes to LLVM infrastructure
 ------------------------------
@@ -97,6 +104,9 @@ Changes to the AArch64 Backend
 * Assembler/disassembler support has been added for Armv9.7-A (2025)
   architecture extensions.
 
+* Assembler/disassembler support has been added for the 'Virtual Tagging
+  Extension (vMTE)' Future Architecture Technologies extension.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
@@ -179,6 +189,11 @@ Changes to the LLVM tools
 * `llvm-readelf` now dumps all hex format values in lower-case mode.
 * Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
 * Support for `%T` in lit has been removed.
+* Added a `--save-stats` option to `llc` to save LLVM statistics to a file, compatible with the Clang option.
+
+* `llvm-config` gained a new flag `--quote-paths` which quotes and escapes paths
+  emitted on stdout, to account for spaces or other special characters in paths
+  ([#97305](https://github.com/llvm/llvm-project/pull/97305)).
 
 Changes to LLDB
 ---------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 85eeabf10244a..5ee3d83bd7aac 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -167,12 +167,16 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
      - Adds atomic add instruction on floating-point numbers.
    * - ``SPV_EXT_shader_atomic_float_min_max``
      - Adds atomic min and max instruction on floating-point numbers.
+   * - ``SPV_INTEL_16bit_atomics``
+     - Extends the SPV_EXT_shader_atomic_float_add and SPV_EXT_shader_atomic_float_min_max extensions to support addition, minimum and maximum on 16-bit ``bfloat16`` floating-point numbers in memory.
    * - ``SPV_INTEL_2d_block_io``
      - Adds additional subgroup block prefetch, load, load transposed, load transformed and store instructions to read two-dimensional blocks of data from a two-dimensional region of memory, or to write two-dimensional blocks of data to a two dimensional region of memory.
* - ``SPV_INTEL_arbitrary_precision_integers`` - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` @@ -187,6 +191,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +232,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. * - ``SPV_KHR_float_controls2`` @@ -237,6 +243,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate. * - ``SPV_KHR_maximal_reconvergence`` - Adds execution mode and capability to enable maximal reconvergence. + * - ``SPV_ALTERA_blocking_pipes`` + - Adds new pipe read and write functions that have blocking semantics instead of the non-blocking semantics of the existing pipe read/write functions. SPIR-V representation in LLVM IR ================================ diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md index d2b612ba95ef1..dbf488bba735c 100644 --- a/llvm/docs/SandboxIR.md +++ b/llvm/docs/SandboxIR.md @@ -8,8 +8,8 @@ Within your LLVM pass: ``` C++ // 1. Include the necessary Sandbox IR header files. -#include "llvm/SandboxIR/Context.h -#include "llvm/SandboxIR/Function.h +#include "llvm/SandboxIR/Context.h" +#include "llvm/SandboxIR/Function.h" // 2. Create a sandboxir::Context using LLVMContext `LLVMCtx`. 
sandboxir::Context Ctx(LLVMCtx); diff --git a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp index 51457a3c22ade..14081fb3c3b10 100644 --- a/llvm/examples/Kaleidoscope/Chapter9/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter9/toy.cpp @@ -203,7 +203,7 @@ class ExprAST { public: ExprAST(SourceLocation Loc = CurLoc) : Loc(Loc) {} - virtual ~ExprAST() {} + virtual ~ExprAST() = default; virtual Value *codegen() = 0; int getLine() const { return Loc.Line; } int getCol() const { return Loc.Col; } diff --git a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp index 8071f56cb3685..bcf433f2179c3 100644 --- a/llvm/examples/OptSubcommand/llvm-hello-sub.cpp +++ b/llvm/examples/OptSubcommand/llvm-hello-sub.cpp @@ -46,7 +46,7 @@ class HelloSubOptTable : public GenericOptTable { HelloSubOptTable() : GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable, /*IgnoreCase=*/false, OptionSubCommands, - OptionSubCommandIDsTable) {} + OptionSubCommandIDsTable) {}; }; } // namespace diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 15dca0abe52d1..6132149ce473b 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -20,7 +20,6 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ThreadPool.h" -#include <list> #include <string> using namespace llvm; diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 4e380d9bd5969..83dd1eba876e6 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -531,6 +531,13 @@ enum { */ typedef unsigned LLVMGEPNoWrapFlags; +typedef enum { + LLVMDbgRecordLabel, + LLVMDbgRecordDeclare, + LLVMDbgRecordValue, + LLVMDbgRecordAssign, +} LLVMDbgRecordKind; + /** * @} */ @@ -3896,6 +3903,37 @@ LLVM_C_ABI LLVMDbgRecordRef LLVMGetNextDbgRecord(LLVMDbgRecordRef DbgRecord); LLVM_C_ABI LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef DbgRecord); +/** + * Get the debug location attached to the debug record. + * + * @see llvm::DbgRecord::getDebugLoc() + */ +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec); + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec); + +/** + * Get the value of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getValue() + */ +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx); + +/** + * Get the debug info variable of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getVariable() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec); + +/** + * Get the debug info expression of the DbgVariableRecord. + * + * @see llvm::DbgVariableRecord::getExpression() + */ +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec); + /** * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations * diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. 
/// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h index 79ba5d5a3eddb..6ea097d544011 100644 --- a/llvm/include/llvm/ADT/AddressRanges.h +++ b/llvm/include/llvm/ADT/AddressRanges.h @@ -21,7 +21,7 @@ namespace llvm { /// a start and an end address: [Start, End). class AddressRange { public: - AddressRange() {} + AddressRange() = default; AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) { assert(Start <= End); } diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index 448d10013d371..d7ed2c78749f0 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -10,8 +10,8 @@ #define LLVM_ADT_ARRAYREF_H #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include <algorithm> #include <array> @@ -19,7 +19,6 @@ #include <cstddef> #include <initializer_list> #include <iterator> -#include <memory> #include <type_traits> #include <vector> @@ -66,10 +65,6 @@ namespace llvm { /// Construct an empty ArrayRef. /*implicit*/ ArrayRef() = default; - /// Construct an empty ArrayRef from std::nullopt. - /*implicit*/ LLVM_DEPRECATED("Use {} or ArrayRef<T>() instead", "{}") - ArrayRef(std::nullopt_t) {} - /// Construct an ArrayRef from a single element. 
/*implicit*/ ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND) : Data(&OneElt), Length(1) {} diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h index 9555fadda6e47..c10a38c8ce4cb 100644 --- a/llvm/include/llvm/ADT/BitmaskEnum.h +++ b/llvm/include/llvm/ADT/BitmaskEnum.h @@ -11,7 +11,6 @@ #include <cassert> #include <type_traits> -#include <utility> #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/bit.h" diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 0cc03cf7a692a..9ee5f594ea56a 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -24,7 +24,6 @@ #include <iomanip> #include <mutex> #include <sstream> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 22ef7ed64451e..d5b13e7731550 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -360,6 +360,12 @@ class DenseMapBase : public DebugEpochBase { return getBuckets(); } + void swap(DerivedT &RHS) { + this->incrementEpoch(); + RHS.incrementEpoch(); + derived().swapImpl(RHS); + } + protected: DenseMapBase() = default; @@ -736,7 +742,7 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, DenseMap(DenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template <typename InputIt> DenseMap(const InputIt &I, const InputIt &E) { @@ -756,15 +762,15 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, deallocateBuckets(); } - void swap(DenseMap &RHS) { - this->incrementEpoch(); - RHS.incrementEpoch(); +private: + void swapImpl(DenseMap &RHS) { std::swap(Buckets, RHS.Buckets); std::swap(NumEntries, RHS.NumEntries); std::swap(NumTombstones, RHS.NumTombstones); std::swap(NumBuckets, RHS.NumBuckets); } +public: DenseMap &operator=(const DenseMap &other) { if (&other != this) this->copyFrom(other); @@ -775,7 +781,7 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>, this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } @@ -895,7 +901,7 @@ class SmallDenseMap SmallDenseMap(SmallDenseMap &&other) : BaseT() { init(0); - swap(other); + this->swap(other); } template <typename InputIt> @@ -916,7 +922,8 @@ class SmallDenseMap deallocateBuckets(); } - void swap(SmallDenseMap &RHS) { +private: + void swapImpl(SmallDenseMap &RHS) { unsigned TmpNumEntries = RHS.NumEntries; RHS.NumEntries = NumEntries; NumEntries = TmpNumEntries; @@ -987,6 +994,7 @@ class SmallDenseMap new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep)); } +public: SmallDenseMap 
&operator=(const SmallDenseMap &other) { if (&other != this) this->copyFrom(other); @@ -997,7 +1005,7 @@ class SmallDenseMap this->destroyAll(); deallocateBuckets(); init(0); - swap(other); + this->swap(other); return *this; } diff --git a/llvm/include/llvm/ADT/FloatingPointMode.h b/llvm/include/llvm/ADT/FloatingPointMode.h index 0314b4cb1c38a..a9702c65e631f 100644 --- a/llvm/include/llvm/ADT/FloatingPointMode.h +++ b/llvm/include/llvm/ADT/FloatingPointMode.h @@ -191,7 +191,7 @@ inline DenormalMode::DenormalModeKind parseDenormalFPAttributeComponent(StringRef Str) { // Assume ieee on unspecified attribute. return StringSwitch<DenormalMode::DenormalModeKind>(Str) - .Cases("", "ieee", DenormalMode::IEEE) + .Cases({"", "ieee"}, DenormalMode::IEEE) .Case("preserve-sign", DenormalMode::PreserveSign) .Case("positive-zero", DenormalMode::PositiveZero) .Case("dynamic", DenormalMode::Dynamic) diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h index 2498cb7796f1f..807a2e769999c 100644 --- a/llvm/include/llvm/ADT/FunctionExtras.h +++ b/llvm/include/llvm/ADT/FunctionExtras.h @@ -39,7 +39,6 @@ #include "llvm/Support/MemAlloc.h" #include "llvm/Support/type_traits.h" #include <cstring> -#include <memory> #include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/PointerSumType.h b/llvm/include/llvm/ADT/PointerSumType.h index c4971bf3af87a..c8e6cffd796a6 100644 --- a/llvm/include/llvm/ADT/PointerSumType.h +++ b/llvm/include/llvm/ADT/PointerSumType.h @@ -15,7 +15,6 @@ #include <algorithm> #include <cassert> #include <cstdint> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). -/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. 
namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..b975a403cd042 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,19 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. +template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } +template <class T> constexpr T *to_address(T *P) { + static_assert(!std::is_function_v<T>); + return P; +} + //===----------------------------------------------------------------------===// // Features from C++23 //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index c129f3a695b9e..0fde14126c79b 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -28,7 +28,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include <cassert> -#include <iterator> namespace llvm { diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h index 01cbf2d3fff71..7901365daa462 100644 --- a/llvm/include/llvm/ADT/StringMap.h +++ b/llvm/include/llvm/ADT/StringMap.h @@ -302,7 +302,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap if (FindInRHS == RHS.end()) return false; - if constexpr (!std::is_same_v<ValueTy, std::nullopt_t>) { + if constexpr (!std::is_same_v<ValueTy, EmptyStringSetTag>) { if (!(KeyValue.getValue() == FindInRHS->getValue())) return false; } diff --git a/llvm/include/llvm/ADT/StringMapEntry.h b/llvm/include/llvm/ADT/StringMapEntry.h index 21be5ec343059..b0a3c8cd68abc 100644 --- a/llvm/include/llvm/ADT/StringMapEntry.h +++ b/llvm/include/llvm/ADT/StringMapEntry.h @@ -21,6 +21,9 @@ namespace llvm { +/// The "value type" of StringSet represented as an empty struct. +struct EmptyStringSetTag {}; + /// StringMapEntryBase - Shared base class of StringMapEntry instances. class StringMapEntryBase { size_t keyLength; @@ -85,14 +88,13 @@ class StringMapEntryStorage : public StringMapEntryBase { }; template <> -class StringMapEntryStorage<std::nullopt_t> : public StringMapEntryBase { +class StringMapEntryStorage<EmptyStringSetTag> : public StringMapEntryBase { public: - explicit StringMapEntryStorage(size_t keyLength, - std::nullopt_t = std::nullopt) + explicit StringMapEntryStorage(size_t keyLength, EmptyStringSetTag = {}) : StringMapEntryBase(keyLength) {} StringMapEntryStorage(StringMapEntryStorage &entry) = delete; - std::nullopt_t getValue() const { return std::nullopt; } + EmptyStringSetTag getValue() const { return {}; } }; /// StringMapEntry - This is used to represent one value that is inserted into diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h index c8be3f2a503e4..dc154af073f2f 100644 --- a/llvm/include/llvm/ADT/StringSet.h +++ b/llvm/include/llvm/ADT/StringSet.h @@ -22,8 +22,8 @@ namespace llvm { /// StringSet - A wrapper for StringMap that provides set-like functionality. 
template <class AllocatorTy = MallocAllocator> -class StringSet : public StringMap<std::nullopt_t, AllocatorTy> { - using Base = StringMap<std::nullopt_t, AllocatorTy>; +class StringSet : public StringMap<EmptyStringSetTag, AllocatorTy> { + using Base = StringMap<EmptyStringSetTag, AllocatorTy>; public: StringSet() = default; diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 98685de8573fa..5da6076c27390 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> #include <cstring> @@ -42,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -64,7 +65,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; @@ -89,6 +90,7 @@ class StringSwitch { return CasesImpl(CaseStrings, Value); } + [[deprecated("Pass cases in std::initializer_list instead")]] StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) { return CasesImpl({S0, S1}, Value); } @@ -156,7 +158,7 @@ class StringSwitch { StringSwitch &EndsWithLower(StringLiteral S, T Value) { if (!Result && Str.ends_with_insensitive(S)) - Result = Value; + Result = std::move(Value); return *this; } @@ -173,6 +175,7 @@ class StringSwitch { return CasesLowerImpl(CaseStrings, Value); } + [[deprecated("Pass cases in std::initializer_list instead")]] StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) { return CasesLowerImpl({S0, S1}, Value); } @@ -212,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. bool CaseImpl(StringLiteral S, T &Value) { - if (!Result && Str == S) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (Str != S) + return false; + + Result = std::move(Value); + return true; } - // Returns true when `Str` matches the `S` argument (case-insensitive), and - // stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument + // (case-insensitive), stores the result. 
bool CaseLowerImpl(StringLiteral S, T &Value) { - if (!Result && Str.equals_insensitive(S)) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (!Str.equals_insensitive(S)) + return false; + + Result = std::move(Value); + return true; } StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases, diff --git a/llvm/include/llvm/ADT/TinyPtrVector.h b/llvm/include/llvm/ADT/TinyPtrVector.h index 8d7a07b5e9eb5..ed08ec8a966c7 100644 --- a/llvm/include/llvm/ADT/TinyPtrVector.h +++ b/llvm/include/llvm/ADT/TinyPtrVector.h @@ -15,7 +15,6 @@ #include <cassert> #include <cstddef> #include <iterator> -#include <type_traits> namespace llvm { diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... 
Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index 1681079054b8b..878b7e7a1fb3b 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -861,7 +861,7 @@ class AAResultBase { // Provide all the copy and move constructors so that derived types aren't // constrained. 
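The ilist and ilist_node_options hunks above are mechanical typedef-to-using conversions. The two spellings name the same type; the alias form reads left-to-right and, unlike typedef, can carry template parameters, which is what the compute_node_options-style helpers exploit. A compiler-checkable sketch:

```cpp
#include <type_traits>
#include <vector>

typedef std::vector<int>::iterator IterTypedef; // old spelling
using IterUsing = std::vector<int>::iterator;   // new spelling
static_assert(std::is_same_v<IterTypedef, IterUsing>);

// Only the alias form can be templated.
template <typename T> using VecIter = typename std::vector<T>::iterator;
static_assert(std::is_same_v<VecIter<int>, IterUsing>);
```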
- AAResultBase(const AAResultBase &Arg) {} + AAResultBase(const AAResultBase &Arg) = default; AAResultBase(AAResultBase &&Arg) {} public: diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h index 5f91f9747bb97..ea22ed48ab763 100644 --- a/llvm/include/llvm/Analysis/ConstantFolding.h +++ b/llvm/include/llvm/Analysis/ConstantFolding.h @@ -119,12 +119,6 @@ ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, Constant *RHS, LLVM_ABI Constant *FlushFPConstant(Constant *Operand, const Instruction *I, bool IsOutput); -/// Attempt to constant fold a select instruction with the specified -/// operands. The constant result is returned if successful; if not, null is -/// returned. -LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, - Constant *V2); - /// Attempt to constant fold a cast with the specified operand. If it /// fails, it returns a constant expression of the specified operand. LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, @@ -135,40 +129,6 @@ LLVM_ABI Constant *ConstantFoldCastOperand(unsigned Opcode, Constant *C, LLVM_ABI Constant *ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL); -/// ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue -/// instruction with the specified operands and indices. The constant result is -/// returned if successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg, - Constant *Val, - ArrayRef<unsigned> Idxs); - -/// Attempt to constant fold an extractvalue instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg, - ArrayRef<unsigned> Idxs); - -/// Attempt to constant fold an insertelement instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, - Constant *Elt, - Constant *Idx); - -/// Attempt to constant fold an extractelement instruction with the -/// specified operands and indices. The constant result is returned if -/// successful; if not, null is returned. -LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, - Constant *Idx); - -/// Attempt to constant fold a shufflevector instruction with the -/// specified operands and mask. See class ShuffleVectorInst for a description -/// of the mask representation. The constant result is returned if successful; -/// if not, null is returned. -LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, - Constant *V2, - ArrayRef<int> Mask); - /// Extract value of C at the given Offset reinterpreted as Ty. If bits past /// the end of C are accessed, they are assumed to be poison. 
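The ConstantFolding.h hunk above drops the aggregate/vector constant-folding declarations from the analysis header; judging by the TargetFolder.h hunk later in this patch, llvm/IR/ConstantFold.h is the header that declares them now. A hedged sketch of an updated caller, assuming that relocation:

```cpp
#include "llvm/IR/ConstantFold.h" // assumed new home of the declarations
#include "llvm/IR/Constants.h"

// Returns the folded constant, or null if the select cannot be folded.
llvm::Constant *foldSelect(llvm::Constant *Cond, llvm::Constant *TrueC,
                           llvm::Constant *FalseC) {
  return llvm::ConstantFoldSelectInstruction(Cond, TrueC, FalseC);
}
```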
LLVM_ABI Constant *ConstantFoldLoadFromConst(Constant *C, Type *Ty, diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h index 307ad50e81fec..1d9ac49a54745 100644 --- a/llvm/include/llvm/Analysis/ConstraintSystem.h +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -64,7 +64,7 @@ class ConstraintSystem { SmallVector<std::string> getVarNamesList() const; public: - ConstraintSystem() {} + ConstraintSystem() = default; ConstraintSystem(ArrayRef<Value *> FunctionArgs) { NumVariables += FunctionArgs.size(); for (auto *Arg : FunctionArgs) { diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index 1c5329181ddb1..120bb46330a79 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -60,11 +60,7 @@ class LLVM_ABI DDGNode : public DDGNodeBase { DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {} virtual ~DDGNode() = 0; - DDGNode &operator=(const DDGNode &N) { - DGNode::operator=(N); - Kind = N.Kind; - return *this; - } + DDGNode &operator=(const DDGNode &N) = default; DDGNode &operator=(DDGNode &&N) { DGNode::operator=(std::move(N)); diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h index ba5ee1d7db487..19a202f78c6ce 100644 --- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -80,7 +80,7 @@ struct DOTGraphTraitsViewer /// virtual destructor needed. Making this dtor protected stops accidental /// invocation when the derived class destructor should have been called. /// Those derived classes should be marked final to avoid the warning. - ~DOTGraphTraitsViewer() {} + ~DOTGraphTraitsViewer() = default; private: StringRef Name; @@ -161,7 +161,7 @@ struct DOTGraphTraitsPrinter /// virtual destructor needed. Making this dtor protected stops accidental /// invocation when the derived class destructor should have been called. /// Those derived classes should be marked final to avoid the warning.
- ~DOTGraphTraitsPrinter() {} + ~DOTGraphTraitsPrinter() = default; private: StringRef Name; diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h index cb535ac14f1c6..a1b030c157eae 100644 --- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h +++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h @@ -27,6 +27,9 @@ struct EntryProperties { unsigned NumThreadsX{0}; // X component unsigned NumThreadsY{0}; // Y component unsigned NumThreadsZ{0}; // Z component + unsigned WaveSizeMin{0}; // Minimum component + unsigned WaveSizeMax{0}; // Maximum component + unsigned WaveSizePref{0}; // Preferred component EntryProperties(const Function *Fn = nullptr) : Entry(Fn) {}; }; diff --git a/llvm/include/llvm/Analysis/DominanceFrontier.h b/llvm/include/llvm/Analysis/DominanceFrontier.h index 68ddcf753b59f..787793501f98a 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -24,7 +24,6 @@ #include "llvm/Pass.h" #include "llvm/Support/GenericDomTree.h" #include <cassert> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h index e877b2c4749ab..1483588581f4e 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h @@ -24,8 +24,6 @@ #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <set> -#include <utility> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 71055dd16a378..7a68773a2643a 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -72,7 +72,7 @@ enum class IR2VecKind { Symbolic, FlowAware }; namespace ir2vec { -extern llvm::cl::OptionCategory IR2VecCategory; +LLVM_ABI extern llvm::cl::OptionCategory IR2VecCategory; LLVM_ABI extern cl::opt<float> OpcWeight; LLVM_ABI extern cl::opt<float> TypeWeight; LLVM_ABI extern cl::opt<float> ArgWeight; @@ -110,8 +110,8 @@ struct Embedding { return Data[Itr]; } - using iterator = typename std::vector<double>::iterator; - using const_iterator = typename std::vector<double>::const_iterator; + using iterator = std::vector<double>::iterator; + using const_iterator = std::vector<double>::const_iterator; iterator begin() { return Data.begin(); } iterator end() { return Data.end(); } diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index 09a8875e1e28c..693777483ade2 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -509,11 +509,10 @@ struct IRInstructionMapper { : InstDataAllocator(IDA), IDLAllocator(IDLA) { // Make sure that the implementation of DenseMapInfo<unsigned> hasn't // changed. 
- assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == - static_cast<unsigned>(-2) && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); IDL = new (IDLAllocator->Allocate()) IRInstructionDataList(); diff --git a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h b/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h deleted file mode 100644 index b44edd370dd1c..0000000000000 --- a/llvm/include/llvm/Analysis/InlineSizeEstimatorAnalysis.h +++ /dev/null @@ -1,47 +0,0 @@ -//===- InlineSizeEstimatorAnalysis.h - ML size estimator --------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// - -#ifndef LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H -#define LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H - -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Function; - -class TFModelEvaluator; -class InlineSizeEstimatorAnalysis - : public AnalysisInfoMixin<InlineSizeEstimatorAnalysis> { -public: - InlineSizeEstimatorAnalysis(); - InlineSizeEstimatorAnalysis(InlineSizeEstimatorAnalysis &&); - ~InlineSizeEstimatorAnalysis(); - - static AnalysisKey Key; - using Result = std::optional<size_t>; - Result run(const Function &F, FunctionAnalysisManager &FAM); - static bool isEvaluatorRequested(); - -private: - std::unique_ptr<TFModelEvaluator> Evaluator; -}; - -class InlineSizeEstimatorAnalysisPrinterPass - : public PassInfoMixin<InlineSizeEstimatorAnalysisPrinterPass> { - raw_ostream &OS; - -public: - explicit InlineSizeEstimatorAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} - - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - - static bool isRequired() { return true; } -}; -} // namespace llvm -#endif // LLVM_ANALYSIS_INLINESIZEESTIMATORANALYSIS_H diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 84b4ad7c1d5a9..c85ef3e131068 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// result of this function is undefined. 
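The IRSimilarityIdentifier hunk above promotes two runtime asserts to static_asserts, which only compiles because DenseMapInfo<unsigned>'s key getters are constexpr-evaluable. The same contract can be checked standalone:

```cpp
#include "llvm/ADT/DenseMapInfo.h"

// The empty and tombstone keys for unsigned are -1 and -2; encoding the
// contract at compile time removes the per-construction runtime check.
static_assert(llvm::DenseMapInfo<unsigned>::getEmptyKey() ==
              static_cast<unsigned>(-1));
static_assert(llvm::DenseMapInfo<unsigned>::getTombstoneKey() ==
              static_cast<unsigned>(-2));
```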
LLVM_ABI std::optional<int64_t> getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap = DenseMap<Value *, const SCEV *>(), bool Assume = false, bool ShouldCheckWrap = true); diff --git a/llvm/include/llvm/Analysis/LoopIterator.h b/llvm/include/llvm/Analysis/LoopIterator.h index 523d2a21825d0..1ac8e68bfa2f1 100644 --- a/llvm/include/llvm/Analysis/LoopIterator.h +++ b/llvm/include/llvm/Analysis/LoopIterator.h @@ -45,12 +45,12 @@ struct LoopBodyTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<succ_iterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { using BaseT = iterator_adaptor_base< WrappedSuccIterator, succ_iterator, - typename std::iterator_traits<succ_iterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + std::iterator_traits<succ_iterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef>; const Loop *L; diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index cbb942f022244..07d39ab3e10a9 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -1247,7 +1247,7 @@ class upward_defs_iterator return DefIterator == Other.DefIterator; } - typename std::iterator_traits<BaseT>::reference operator*() const { + std::iterator_traits<BaseT>::reference operator*() const { assert(DefIterator != OriginalAccess->defs_end() && "Tried to access past the end of our iterator"); return CurrentPair; diff --git a/llvm/include/llvm/Analysis/RegionPrinter.h b/llvm/include/llvm/Analysis/RegionPrinter.h index 3a1d11d8fd4bc..1d4ba0fd4ebc6 100644 --- a/llvm/include/llvm/Analysis/RegionPrinter.h +++ b/llvm/include/llvm/Analysis/RegionPrinter.h @@ -18,64 +18,64 @@ #include "llvm/Support/DOTGraphTraits.h" namespace llvm { - class FunctionPass; - class Function; - class RegionInfo; - class RegionNode; +class FunctionPass; +class Function; +class RegionInfo; +class RegionNode; - LLVM_ABI FunctionPass *createRegionViewerPass(); - LLVM_ABI FunctionPass *createRegionOnlyViewerPass(); - LLVM_ABI FunctionPass *createRegionPrinterPass(); - LLVM_ABI FunctionPass *createRegionOnlyPrinterPass(); +LLVM_ABI FunctionPass *createRegionViewerPass(); +LLVM_ABI FunctionPass *createRegionOnlyViewerPass(); +LLVM_ABI FunctionPass *createRegionPrinterPass(); +LLVM_ABI FunctionPass *createRegionOnlyPrinterPass(); - template <> - struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits { - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} +template <> struct DOTGraphTraits<RegionNode *> : public DefaultDOTGraphTraits { + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} - LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph); - }; + LLVM_ABI std::string getNodeLabel(RegionNode *Node, RegionNode *Graph); +}; #ifndef NDEBUG - /// Open a viewer to display the GraphViz vizualization of the analysis - /// result. - /// - /// Practical to call in the debugger. - /// Includes the instructions in each BasicBlock. - /// - /// @param RI The analysis to display. - void viewRegion(llvm::RegionInfo *RI); +/// Open a viewer to display the GraphViz visualization of the analysis +/// result.
+/// +/// Practical to call in the debugger. +/// Includes the instructions in each BasicBlock. +/// +/// @param RI The analysis to display. +void viewRegion(llvm::RegionInfo *RI); - /// Analyze the regions of a function and open its GraphViz - /// visualization in a viewer. - /// - /// Useful to call in the debugger. - /// Includes the instructions in each BasicBlock. - /// The result of a new analysis may differ from the RegionInfo the pass - /// manager currently holds. - /// - /// @param F Function to analyze. - void viewRegion(const llvm::Function *F); +/// Analyze the regions of a function and open its GraphViz +/// visualization in a viewer. +/// +/// Useful to call in the debugger. +/// Includes the instructions in each BasicBlock. +/// The result of a new analysis may differ from the RegionInfo the pass +/// manager currently holds. +/// +/// @param F Function to analyze. +void viewRegion(const llvm::Function *F); - /// Open a viewer to display the GraphViz vizualization of the analysis - /// result. - /// - /// Useful to call in the debugger. - /// Shows only the BasicBlock names without their instructions. - /// - /// @param RI The analysis to display. - void viewRegionOnly(llvm::RegionInfo *RI); +/// Open a viewer to display the GraphViz visualization of the analysis +/// result. +/// +/// Useful to call in the debugger. +/// Shows only the BasicBlock names without their instructions. +/// +/// @param RI The analysis to display. +void viewRegionOnly(llvm::RegionInfo *RI); - /// Analyze the regions of a function and open its GraphViz - /// visualization in a viewer. - /// - /// Useful to call in the debugger. - /// Shows only the BasicBlock names without their instructions. - /// The result of a new analysis may differ from the RegionInfo the pass - /// manager currently holds. - /// - /// @param F Function to analyze. - void viewRegionOnly(const llvm::Function *F); -#endif -} // End llvm namespace +/// Analyze the regions of a function and open its GraphViz +/// visualization in a viewer. +/// +/// Useful to call in the debugger. +/// Shows only the BasicBlock names without their instructions. +/// The result of a new analysis may differ from the RegionInfo the pass +/// manager currently holds. +/// +/// @param F Function to analyze. +void viewRegionOnly(const llvm::Function *F); +#endif // NDEBUG -#endif +} // namespace llvm + +#endif // LLVM_ANALYSIS_REGIONPRINTER_H diff --git a/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h new file mode 100644 index 0000000000000..a3e1014b417e5 --- /dev/null +++ b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h @@ -0,0 +1,60 @@ +//===-- RuntimeLibcallInfo.h - Runtime library information ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H +#define LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H + +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Pass.h" + +namespace llvm { + +class LLVM_ABI RuntimeLibraryAnalysis + : public AnalysisInfoMixin<RuntimeLibraryAnalysis> { +public: + using Result = RTLIB::RuntimeLibcallsInfo; + + RuntimeLibraryAnalysis() = default; + RuntimeLibraryAnalysis(RTLIB::RuntimeLibcallsInfo &&BaselineInfoImpl) + : LibcallsInfo(std::move(BaselineInfoImpl)) {} + explicit RuntimeLibraryAnalysis(const Triple &T) : LibcallsInfo(T) {} + + LLVM_ABI RTLIB::RuntimeLibcallsInfo run(const Module &M, + ModuleAnalysisManager &); + +private: + friend AnalysisInfoMixin<RuntimeLibraryAnalysis>; + LLVM_ABI static AnalysisKey Key; + + RTLIB::RuntimeLibcallsInfo LibcallsInfo; +}; + +class LLVM_ABI RuntimeLibraryInfoWrapper : public ImmutablePass { + RuntimeLibraryAnalysis RTLA; + std::optional<RTLIB::RuntimeLibcallsInfo> RTLCI; + +public: + static char ID; + RuntimeLibraryInfoWrapper(); + explicit RuntimeLibraryInfoWrapper(const Triple &T); + explicit RuntimeLibraryInfoWrapper(const RTLIB::RuntimeLibcallsInfo &RTLCI); + + const RTLIB::RuntimeLibcallsInfo &getRTLCI(const Module &M) { + ModuleAnalysisManager DummyMAM; + RTLCI = RTLA.run(M, DummyMAM); + return *RTLCI; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +LLVM_ABI ModulePass *createRuntimeLibraryInfoWrapperPass(); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h index d27455cf3505d..cbce482ef47ab 100644 --- a/llvm/include/llvm/Analysis/TargetFolder.h +++ b/llvm/include/llvm/Analysis/TargetFolder.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilderFolder.h" #include "llvm/IR/Operator.h" diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 014988299d37f..76b89dcb3f25d 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1951,6 +1951,36 @@ TLI_DEFINE_ENUM_INTERNAL(nearbyintl) TLI_DEFINE_STRING_INTERNAL("nearbyintl") TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) +/// double nextafter(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(nextafter) +TLI_DEFINE_STRING_INTERNAL("nextafter") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Dbl) + +/// float nextafterf(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(nextafterf) +TLI_DEFINE_STRING_INTERNAL("nextafterf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Flt) + +/// long double nextafterl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nextafterl) +TLI_DEFINE_STRING_INTERNAL("nextafterl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl) + +/// double nexttoward(double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttoward) +TLI_DEFINE_STRING_INTERNAL("nexttoward") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, LDbl) + +/// float nexttowardf(float x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttowardf) +TLI_DEFINE_STRING_INTERNAL("nexttowardf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, LDbl) + +/// long double nexttowardl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(nexttowardl) +TLI_DEFINE_STRING_INTERNAL("nexttowardl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl) + /// uint32_t ntohl(uint32_t netlong); 
TLI_DEFINE_ENUM_INTERNAL(ntohl) TLI_DEFINE_STRING_INTERNAL("ntohl") diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 3f39b4787eb11..78954431e81c3 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -23,6 +23,7 @@ namespace llvm { template <typename T> class ArrayRef; +enum class VectorLibrary; /// Provides info so a possible vectorization of a function can be /// computed. Function 'VectorFnName' is equivalent to 'ScalarFnName' @@ -117,25 +118,6 @@ class TargetLibraryInfoImpl { const Module &M) const; public: - /// List of known vector-functions libraries. - /// - /// The vector-functions library defines, which functions are vectorizable - /// and with which factor. The library can be specified by either frontend, - /// or a commandline option, and then used by - /// addVectorizableFunctionsFromVecLib for filling up the tables of - /// vectorizable functions. - enum VectorLibrary { - NoLibrary, // Don't use any vector library. - Accelerate, // Use Accelerate framework. - DarwinLibSystemM, // Use Darwin's libsystem_m. - LIBMVEC, // GLIBC Vector Math library. - MASSV, // IBM MASS vector library. - SVML, // Intel short vector math library. - SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. - ArmPL, // Arm Performance Libraries. - AMDLIBM // AMD Math Vector library. - }; - TargetLibraryInfoImpl() = delete; LLVM_ABI explicit TargetLibraryInfoImpl(const Triple &T); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7b7dc1b46dd80..0f17312b03827 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1764,7 +1764,7 @@ class TargetTransformInfo { /// \param Types List of types to check. LLVM_ABI bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const; + ArrayRef<Type *> Types) const; /// The type of load/store indexing. 
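The areTypesABICompatible hunks above (and the CombinerHelper.h one at the end of this section) change `const ArrayRef<T> &` parameters to plain `ArrayRef<T>`. ArrayRef is a two-word non-owning view, so by-value passing is as cheap as a reference and removes an indirection. A sketch with a hypothetical helper:

```cpp
#include "llvm/ADT/ArrayRef.h"

// Hypothetical helper: take ArrayRef by value; a const reference to it
// would add an indirection without saving a copy.
static int sumAll(llvm::ArrayRef<int> Values) {
  int Sum = 0;
  for (int V : Values)
    Sum += V;
  return Sum;
}

int demo() {
  int Data[] = {1, 2, 3};
  return sumAll(Data); // implicit conversion from a C array
}
```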
enum MemIndexedMode { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 4cd607c0d0c8d..aacb88d2f9684 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1028,7 +1028,7 @@ class TargetTransformInfoImplBase { virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { return (Caller->getFnAttribute("target-cpu") == Callee->getFnAttribute("target-cpu")) && (Caller->getFnAttribute("target-features") == diff --git a/llvm/include/llvm/Analysis/TensorSpec.h b/llvm/include/llvm/Analysis/TensorSpec.h index d432ce8a203c4..8b19b6bb976ec 100644 --- a/llvm/include/llvm/Analysis/TensorSpec.h +++ b/llvm/include/llvm/Analysis/TensorSpec.h @@ -15,7 +15,6 @@ #include "llvm/ADT/StringMap.h" #include "llvm/IR/LLVMContext.h" -#include <memory> #include <optional> #include <vector> diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index af218ba564081..093309cb8bbee 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume, LLVM_ABI Value *stripNullTest(Value *V); LLVM_ABI const Value *stripNullTest(const Value *V); +/// Enumerates all possible values of V and inserts them into the set \p +/// Constants. If \p AllowUndefOrPoison is false, it fails when V may contain +/// undef/poison elements. Returns true if the result is complete. Otherwise, +/// the result is incomplete (more than MaxCount values). +/// NOTE: The constant values are not distinct. +LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/include/llvm/AsmParser/SlotMapping.h b/llvm/include/llvm/AsmParser/SlotMapping.h index 2d2b8d8400bd7..aae2a863babd9 100644 --- a/llvm/include/llvm/AsmParser/SlotMapping.h +++ b/llvm/include/llvm/AsmParser/SlotMapping.h @@ -17,7 +17,6 @@ #include "llvm/AsmParser/NumberedValues.h" #include "llvm/IR/TrackingMDRef.h" #include <map> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 6ee6b666c1735..39e9611c7190e 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1125,6 +1125,8 @@ struct Elf64_Shdr { Elf64_Xword sh_entsize; }; +enum { PN_XNUM = 0xffff }; + // Special section indices. 
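collectPossibleValues, declared in the ValueTracking.h hunk above, enumerates a bounded set of candidate constants for a value. A hedged sketch of a caller, with the enclosing analysis context assumed:

```cpp
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/ValueTracking.h"

// Hypothetical query: true only if every possible value of V was
// enumerated (at most 8) and none may be undef or poison.
static bool allValuesKnown(const llvm::Value *V) {
  llvm::SmallPtrSet<const llvm::Constant *, 8> Constants;
  return llvm::collectPossibleValues(V, Constants, /*MaxCount=*/8,
                                     /*AllowUndefOrPoison=*/false);
}
```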
enum { SHN_UNDEF = 0, // Undefined, missing, irrelevant, or meaningless diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index e9b573733451b..d88e261f8c684 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -19,9 +19,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBufferRef.h" -#include <map> #include <memory> -#include <string> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. 
+/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. +/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. + virtual Error pruneStorageData() { return Error::success(); } + /// Validate the whole node tree. Error validateTree(ObjectRef Ref); + /// Import object from another CAS. This will import the full tree from the + /// other CAS. + Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other); + + /// Print the ObjectStore internals for debugging purpose. + virtual void print(raw_ostream &) const {} + void dump() const; + /// Get CASContext const CASContext &getContext() const { return Context; } @@ -290,8 +328,15 @@ class ObjectProxy { ObjectHandle H; }; +/// Create an in memory CAS. std::unique_ptr<ObjectStore> createInMemoryCAS(); +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. +bool isOnDiskCASEnabled(); + +/// Create a persistent on-disk path at \p Path. 
+/// Create a persistent on-disk CAS at \p Path. +Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path); + } // namespace cas } // namespace llvm diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h index 5f0ee0e131c0f..76cc528711b69 100644 --- a/llvm/include/llvm/CAS/OnDiskGraphDB.h +++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h @@ -340,13 +340,16 @@ class OnDiskGraphDB { /// \param HashByteSize Size for the object digest hash bytes. /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes /// if they don't exist in the primary store. The upstream store is only used - /// for reading nodes, new nodes are only written to the primary store. + /// for reading nodes; new nodes are only written to the primary store. Users + /// must ensure \p UpstreamDB outlives the current instance of + /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache + /// manage both. /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied /// to primary store. This is recorded at creation time and subsequent opens /// need to pass the same policy otherwise the \p open will fail. static Expected<std::unique_ptr<OnDiskGraphDB>> open(StringRef Path, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr, + OnDiskGraphDB *UpstreamDB = nullptr, FaultInPolicy Policy = FaultInPolicy::FullTree); ~OnDiskGraphDB(); @@ -438,8 +441,7 @@ class OnDiskGraphDB { // Private constructor. OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, - OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, + OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy); /// Mapping from hash to object reference. @@ -459,7 +461,7 @@ class OnDiskGraphDB { std::string RootPath; /// Optional on-disk store to be used for faulting-in nodes. - std::unique_ptr<OnDiskGraphDB> UpstreamDB; + OnDiskGraphDB *UpstreamDB = nullptr; /// The policy used to fault in data from upstream. FaultInPolicy FIPolicy; diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h index b762518366c21..17ae52f0307fc 100644 --- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h +++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h @@ -19,6 +19,8 @@ namespace llvm::cas::ondisk { +class UnifiedOnDiskCache; + /// An on-disk key-value data store with the following properties: /// * Keys are fixed length binary hashes with expected normal distribution. /// * Values are buffers of the same size, specified at creation time. @@ -59,9 +61,13 @@ class OnDiskKeyValueDB { /// \param KeySize Size for the key hash bytes. /// \param ValueName Identifier name for the values. /// \param ValueSize Size for the value bytes. + /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size + /// and lifetime of the CAS instance; once initialization completes, it must + /// own the newly created KeyValueDB.
static Expected<std::unique_ptr<OnDiskKeyValueDB>> open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize); + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *UnifiedCache = nullptr); using CheckValueT = function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>; @@ -70,11 +76,14 @@ class OnDiskKeyValueDB { Error validate(CheckValueT CheckValue) const; private: - OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache) - : ValueSize(ValueSize), Cache(std::move(Cache)) {} + OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache, + UnifiedOnDiskCache *UnifiedCache) + : ValueSize(ValueSize), Cache(std::move(Cache)), + UnifiedCache(UnifiedCache) {} const size_t ValueSize; OnDiskTrieRawHashMap Cache; + UnifiedOnDiskCache *UnifiedCache = nullptr; }; } // namespace llvm::cas::ondisk diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h new file mode 100644 index 0000000000000..6e0878a65fe72 --- /dev/null +++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h @@ -0,0 +1,172 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H +#define LLVM_CAS_UNIFIEDONDISKCACHE_H + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include <atomic> + +namespace llvm::cas::ondisk { + +class OnDiskKeyValueDB; + +/// A unified CAS nodes and key-value database, using on-disk storage for both. +/// It manages storage growth and provides APIs for garbage collection. +/// +/// High-level properties: +/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the +/// storage size in that directory will keep growing unrestricted. For data to +/// become eligible for garbage-collection there should be no open instances +/// of \p UnifiedOnDiskCache for that directory, by any process. +/// * Garbage-collection needs to be triggered explicitly by the client. It can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers, in the same process or other +/// processes. +/// +/// The intended usage pattern is that an instance of \p UnifiedOnDiskCache is +/// open for a limited period of time, e.g. for the duration of a build +/// operation. For long-lived processes that need periodic access to a +/// \p UnifiedOnDiskCache, the client should devise a scheme where access is +/// performed within some defined period. For example, if a service is designed +/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it +/// could keep the instance alive while new requests are coming in but close it +/// after a time period in which there are no new requests. +class UnifiedOnDiskCache { +public: + /// The \p OnDiskGraphDB instance for the open directory. + OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } + + /// The \p OnDiskKeyValueDB instance for the open directory. + OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; } + + /// Open a \p UnifiedOnDiskCache instance for a directory. + /// + /// \param Path directory for the on-disk database. The directory will be + /// created if it doesn't exist.
+ /// \param SizeLimit Optional size for limiting growth. This takes effect + /// when the instance is closed. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param FaultInPolicy Controls how nodes are copied to primary store. This + /// is recorded at creation time and subsequent opens need to pass the same + /// policy otherwise the \p open will fail. + static Expected<std::unique_ptr<UnifiedOnDiskCache>> + open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName, + unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy = + OnDiskGraphDB::FaultInPolicy::FullTree); + + /// Validate the data in \p Path, if needed to ensure correctness. + /// + /// Note: if invalid data is detected and \p AllowRecovery is true, then + /// recovery requires exclusive access to the CAS and it is an error to + /// attempt recovery if there is concurrent use of the CAS. + /// + /// \param Path directory for the on-disk database. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param CheckHash Whether to validate hashes match the data. + /// \param AllowRecovery Whether to automatically recover from invalid data by + /// marking the files for garbage collection. + /// \param ForceValidation Whether to force validation to occur even if it + /// should not be necessary. + /// \param LLVMCasBinary If provided, validation is performed out-of-process + /// using the given \c llvm-cas executable which protects against crashes + /// during validation. Otherwise validation is performed in-process. + /// + /// \returns \c Valid if the data is already valid, \c Recovered if data + /// was invalid but has been cleared, \c Skipped if validation is not needed, + /// or an \c Error if validation cannot be performed or if the data is left + /// in an invalid state because \p AllowRecovery is false. + static Expected<ValidationResult> + validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary); + + /// This is called implicitly at destruction time, so it is not required for a + /// client to call this. After calling \p close the only method that is valid + /// to call is \p needsGarbageCollection. + /// + /// \param CheckSizeLimit if true, it will check whether the primary store has + /// exceeded its intended size limit. If false the check is skipped even if a + /// \p SizeLimit was passed to the \p open call. + Error close(bool CheckSizeLimit = true); + + /// Set the size for limiting growth. This takes effect when the instance + /// is closed. + void setSizeLimit(std::optional<uint64_t> SizeLimit); + + /// \returns the storage size of the cache data. + uint64_t getStorageSize() const; + + /// \returns whether the primary store has exceeded the intended size limit. + /// This can return false even if the overall size of the opened directory is + /// over the \p SizeLimit passed to \p open. To know whether garbage + /// collection needs to be triggered or not, call \p needsGarbageCollection. + bool hasExceededSizeLimit() const;
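UnifiedOnDiskCache's documented pattern is open for a bounded period, close, then garbage-collect out of band. A hedged sketch wiring the entry points together; the hash name and sizes are placeholders, not prescribed values:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/CAS/UnifiedOnDiskCache.h"

llvm::Error useCache(llvm::StringRef Path) {
  using namespace llvm::cas::ondisk;

  // Placeholder hash configuration; a real client supplies the hash its
  // CAS objects are digested with.
  auto UniDB = UnifiedOnDiskCache::open(Path, /*SizeLimit=*/10ULL << 30,
                                        "sha256", /*HashByteSize=*/32);
  if (!UniDB)
    return UniDB.takeError();

  OnDiskGraphDB &GraphDB = (*UniDB)->getGraphDB();
  (void)GraphDB; // ... store/load nodes for the duration of the build ...

  // close() checks the size limit; afterwards only needsGarbageCollection()
  // may be called on the instance.
  if (llvm::Error E = (*UniDB)->close())
    return E;
  if ((*UniDB)->needsGarbageCollection())
    return UnifiedOnDiskCache::collectGarbage(Path);
  return llvm::Error::success();
}
```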
+ /// \returns whether there are unused data that can be deleted using a + /// \p collectGarbage call. + bool needsGarbageCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data, the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection be triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. + static Error collectGarbage(StringRef Path); + + /// Remove unused data from the current UnifiedOnDiskCache. + Error collectGarbage(); + + /// Helper functions to convert between the value stored in the KeyValueDB + /// and an ObjectID. + static ObjectID getObjectIDFromValue(ArrayRef<char> Value); + + using ValueBytes = std::array<char, sizeof(uint64_t)>; + static ValueBytes getValueFromObjectID(ObjectID ID); + + ~UnifiedOnDiskCache(); + +private: + friend class OnDiskGraphDB; + friend class OnDiskKeyValueDB; + + UnifiedOnDiskCache(); + + Expected<std::optional<ArrayRef<char>>> + faultInFromUpstreamKV(ArrayRef<uint8_t> Key); + + /// \returns the storage size of the primary directory. + uint64_t getPrimaryStorageSize() const; + + std::string RootPath; + std::atomic<uint64_t> SizeLimit; + + int LockFD = -1; + + std::atomic<bool> NeedsGarbageCollection; + std::string PrimaryDBDir; + + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/include/llvm/CGData/OutlinedHashTree.h b/llvm/include/llvm/CGData/OutlinedHashTree.h index d994b68f33ee4..8cbc50bc1b9ee 100644 --- a/llvm/include/llvm/CGData/OutlinedHashTree.h +++ b/llvm/include/llvm/CGData/OutlinedHashTree.h @@ -22,7 +22,6 @@ #include "llvm/Support/raw_ostream.h" #include <unordered_map> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h index 98b52579d03b7..2f1364d199710 100644 --- a/llvm/include/llvm/CodeGen/Analysis.h +++ b/llvm/include/llvm/CodeGen/Analysis.h @@ -71,7 +71,7 @@ void ComputeValueTypes(const DataLayout &DL, Type *Ty, /// void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<EVT> *MemVTs, + SmallVectorImpl<EVT> *MemVTs = nullptr, SmallVectorImpl<TypeSize> *Offsets = nullptr, TypeSize StartingOffset = TypeSize::getZero()); void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, @@ -80,20 +80,6 @@ void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl<uint64_t> *FixedOffsets, uint64_t StartingOffset); -/// Variant of ComputeValueVTs that don't produce memory VTs.
-inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<TypeSize> *Offsets = nullptr, - TypeSize StartingOffset = TypeSize::getZero()) { - ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offsets, StartingOffset); -} -inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<uint64_t> *FixedOffsets, - uint64_t StartingOffset) { - ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, FixedOffsets, StartingOffset); -} - /// computeValueLLTs - Given an LLVM IR type, compute a sequence of /// LLTs that represent all the individual underlying /// non-aggregate types that comprise it. diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 9ace2555b4b62..311f7df98cf8c 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -18,7 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StaticDataProfileInfo.h" @@ -207,9 +207,9 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { using CGTypeId = uint64_t; /// Unique target type IDs. - SmallSet<CGTypeId, 4> IndirectCalleeTypeIDs; + SmallSetVector<CGTypeId, 4> IndirectCalleeTypeIDs; /// Unique direct callees. - SmallSet<MCSymbol *, 4> DirectCallees; + SmallSetVector<MCSymbol *, 4> DirectCallees; }; enum CallGraphSectionFormatVersion : uint8_t { diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 48650a6df22ff..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -54,6 +54,10 @@ struct FunctionPathAndClusterInfo { DenseMap<UniqueBBID, uint64_t> NodeCounts; // Edge counts for each edge, stored as a nested map. DenseMap<UniqueBBID, DenseMap<UniqueBBID, uint64_t>> EdgeCounts; + // Hash for each basic block. The hashes are stored for every original block + // (not cloned blocks), hence the map key being unsigned instead of + // UniqueBBID. + DenseMap<unsigned, uint64_t> BBHashes; }; class BasicBlockSectionsProfileReader { @@ -62,19 +66,15 @@ class BasicBlockSectionsProfileReader { BasicBlockSectionsProfileReader(const MemoryBuffer *Buf) : MBuf(Buf), LineIt(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'){}; - BasicBlockSectionsProfileReader(){}; + BasicBlockSectionsProfileReader() = default; - // Returns true if basic block sections profile exist for function \p - // FuncName. + // Returns true if function \p FuncName is hot based on the basic block + // section profile. bool isFunctionHot(StringRef FuncName) const; - // Returns a pair with first element representing whether basic block sections - // profile exist for the function \p FuncName, and the second element - // representing the basic block sections profile (cluster info) for this - // function. If the first element is true and the second element is empty, it - // means unique basic block sections are desired for all basic blocks of the - // function. - std::pair<bool, SmallVector<BBClusterInfo>> + // Returns the cluster info for the function \p FuncName. Returns an empty + // vector if the function has no cluster info.
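The Analysis.h hunk above deletes the two inline ComputeValueVTs forwarders: defaulting MemVTs on the primary overload keeps the common call unchanged, while offset-collecting callers now spell out the null MemVTs slot. A sketch of both call shapes, with TLI, DL, and Ty assumed from surrounding lowering code:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"

void demo(const llvm::TargetLowering &TLI, const llvm::DataLayout &DL,
          llvm::Type *Ty) {
  llvm::SmallVector<llvm::EVT, 4> ValueVTs;
  llvm::SmallVector<llvm::TypeSize, 4> Offsets;

  // Common case: MemVTs defaults to nullptr, so no forwarder is needed.
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs);

  // Offset-collecting callers name the unused MemVTs slot explicitly.
  ValueVTs.clear();
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
}
```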
+ SmallVector<BBClusterInfo> getClusterInfoForFunction(StringRef FuncName) const; // Returns the path clonings for the given function. @@ -186,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { bool isFunctionHot(StringRef FuncName) const; - std::pair<bool, SmallVector<BBClusterInfo>> + SmallVector<BBClusterInfo> getClusterInfoForFunction(StringRef FuncName) const; SmallVector<SmallVector<unsigned>> diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..944e1714e8f98 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,26 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(VT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(VT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(VT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; - // Search for a corresponding vector variant. LLVMContext &Ctx = RetTy->getContext(); - ElementCount VF = getVectorizedTypeVF(RetTy); - VecDesc const *VD = nullptr; - for (bool Masked : {false, true}) { - if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked))) - break; - } - if (!VD) - return std::nullopt; // Cost the call + mask. 
auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind); - if (VD->isMasked()) { + + if (RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(LibcallImpl)) { + ElementCount VF = getVectorizedTypeVF(RetTy); auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF); Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, {}, CostKind, 0, nullptr, {}); @@ -1306,8 +1317,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { bool SplitDst = TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == TargetLowering::TypeSplitVector; - if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && - DstVTy->getElementCount().isVector()) { + if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isKnownEven() && + DstVTy->getElementCount().isKnownEven()) { Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); const T *TTI = thisT(); @@ -2137,22 +2148,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2155,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/include/llvm/CodeGen/DIE.h b/llvm/include/llvm/CodeGen/DIE.h index 32f46517677f2..92265fd86ebb9 100644 --- a/llvm/include/llvm/CodeGen/DIE.h +++ b/llvm/include/llvm/CodeGen/DIE.h @@ -653,7 +653,7 @@ template <class T> class IntrusiveBackList : IntrusiveBackListBase { public: const_iterator() = default; // Placate MSVC by explicitly scoping 'iterator'. - const_iterator(typename IntrusiveBackList<T>::iterator X) : N(X.N) {} + const_iterator(IntrusiveBackList<T>::iterator X) : N(X.N) {} explicit const_iterator(const T *N) : N(N) {} const_iterator &operator++() { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 36cb90b1bc134..96cb7cdf2d531 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -293,7 +293,7 @@ class CombinerHelper { SmallVectorImpl<Register> &Ops) const; /// Replace \p MI with a concat_vectors with \p Ops. void applyCombineShuffleVector(MachineInstr &MI, - const ArrayRef<Register> Ops) const; + ArrayRef<Register> Ops) const; /// Optimize memcpy intrinsics et al, e.g. constant len calls. /// /p MaxLen if non-zero specifies the max length of a mem libcall to inline. 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 268025e7018d3..9d6038db4391f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -297,6 +297,10 @@ class IRTranslator : public MachineFunctionPass {
   /// \pre \p U is a call instruction.
   bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateIntrinsic(
+      const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+      const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+
   /// When an invoke or a cleanupret unwinds to the next EH pad, there are
   /// many places it could ultimately go. In the IR, we have a single unwind
   /// destination, but in the machine CFG, we enumerate all the possible blocks.
diff --git a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
index caea5b62851ea..54ea68a418846 100644
--- a/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
@@ -58,7 +58,7 @@ class GlobalMergeFunc {
   /// The suffix used to identify the merged function that parameterizes
   /// the constant values. Note that the original function, without this suffix,
   /// becomes a thunk supplying contexts to the merged function via parameters.
-  static constexpr const char MergingInstanceSuffix[] = ".Tgm";
+  static constexpr char MergingInstanceSuffix[] = ".Tgm";
 
   GlobalMergeFunc(const ModuleSummaryIndex *Index) : Index(Index) {};
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d4c3c51..b3a2ced70e628 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1516,6 +1516,7 @@ enum NodeType {
   PARTIAL_REDUCE_SMLA,  // sext, sext
   PARTIAL_REDUCE_UMLA,  // zext, zext
   PARTIAL_REDUCE_SUMLA, // sext, zext
+  PARTIAL_REDUCE_FMLA,  // fpext, fpext
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
@@ -1537,6 +1538,9 @@ enum NodeType {
 #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID,
 #include "llvm/IR/VPIntrinsics.def"
 
+  // Issue a no-op relocation against a given symbol at the current location.
+  RELOC_NONE,
+
   // The `llvm.experimental.convergence.*` intrinsics.
   CONVERGENCECTRL_ANCHOR,
   CONVERGENCECTRL_ENTRY,
diff --git a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
new file mode 100644
index 0000000000000..e88079e796e7d
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
@@ -0,0 +1,71 @@
+//===- LibcallLoweringInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+#define LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+
+#include "llvm/IR/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class LibcallLoweringInfo {
+private:
+  const RTLIB::RuntimeLibcallsInfo &RTLCI;
+  /// Stores the implementation choice for each libcall.
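+  /// Indexed by RTLIB::Libcall; entries that have not been selected are
+  /// RTLIB::Unsupported.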
+  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
+      RTLIB::Unsupported};
+
+public:
+  LLVM_ABI LibcallLoweringInfo(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+  /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
+  LLVM_ABI const char *getLibcallName(RTLIB::Libcall Call) const {
+    // FIXME: Return StringRef
+    return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpls[Call])
+        .data();
+  }
+
+  /// Return the implementation selected by this lowering for \p Call.
+  LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+    return LibcallImpls[Call];
+  }
+
+  /// Set the implementation to use for the specified libcall.
+  LLVM_ABI void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
+    LibcallImpls[Call] = Impl;
+  }
+
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
+  LLVM_ABI CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
+    return RTLCI.LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall
+  /// implementation.
+  LLVM_ABI CallingConv::ID
+  getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+    return RTLCI.LibcallImplCallingConvs[Call];
+  }
+
+  /// Return a function impl compatible with RTLIB::MEMCPY, or
+  /// RTLIB::Unsupported if fully unsupported.
+  RTLIB::LibcallImpl getMemcpyImpl() const {
+    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
+    if (Memcpy == RTLIB::Unsupported) {
+      // Fallback to memmove if memcpy isn't available.
+      return getLibcallImpl(RTLIB::MEMMOVE);
+    }
+
+    return Memcpy;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index c252f9d99f2af..32027766e7093 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -412,7 +412,7 @@ class LiveIntervals {
 
   /// Return the live range for register unit \p Unit. It will be computed if
   /// it doesn't exist.
-  LiveRange &getRegUnit(unsigned Unit) {
+  LiveRange &getRegUnit(MCRegUnit Unit) {
     LiveRange *LR = RegUnitRanges[Unit];
     if (!LR) {
       // Compute missing ranges on demand.
@@ -425,15 +425,15 @@ class LiveIntervals {
 
   /// Return the live range for register unit \p Unit if it has already been
   /// computed, or nullptr if it hasn't been computed yet.
-  LiveRange *getCachedRegUnit(unsigned Unit) { return RegUnitRanges[Unit]; }
+  LiveRange *getCachedRegUnit(MCRegUnit Unit) { return RegUnitRanges[Unit]; }
 
-  const LiveRange *getCachedRegUnit(unsigned Unit) const {
+  const LiveRange *getCachedRegUnit(MCRegUnit Unit) const {
     return RegUnitRanges[Unit];
   }
 
   /// Remove computed live range for register unit \p Unit. Subsequent uses
   /// should rely on on-demand recomputation.
- void removeRegUnit(unsigned Unit) { + void removeRegUnit(MCRegUnit Unit) { delete RegUnitRanges[Unit]; RegUnitRanges[Unit] = nullptr; } @@ -489,7 +489,7 @@ class LiveIntervals { void dumpInstrs() const; void computeLiveInRegUnits(); - LLVM_ABI void computeRegUnitRange(LiveRange &, unsigned Unit); + LLVM_ABI void computeRegUnitRange(LiveRange &, MCRegUnit Unit); LLVM_ABI bool computeVirtRegInterval(LiveInterval &); using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo *>, 16>; diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h index 44f009cd7790e..18b12901c1862 100644 --- a/llvm/include/llvm/CodeGen/MIR2Vec.h +++ b/llvm/include/llvm/CodeGen/MIR2Vec.h @@ -73,7 +73,7 @@ namespace mir2vec { class MIREmbedder; class SymbolicMIREmbedder; -extern llvm::cl::OptionCategory MIR2VecCategory; +LLVM_ABI extern llvm::cl::OptionCategory MIR2VecCategory; extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight; using Embedding = ir2vec::Embedding; @@ -154,14 +154,14 @@ class MIRVocabulary { void buildRegisterOperandMapping(); /// Get canonical index for a machine opcode - unsigned getCanonicalOpcodeIndex(unsigned Opcode) const; + LLVM_ABI unsigned getCanonicalOpcodeIndex(unsigned Opcode) const; /// Get index for a common (non-register) machine operand unsigned getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const; /// Get index for a register machine operand - unsigned getRegisterOperandIndex(Register Reg) const; + LLVM_ABI unsigned getRegisterOperandIndex(Register Reg) const; // Accessors for operand types const Embedding & @@ -192,7 +192,7 @@ class MIRVocabulary { /// Get entity ID (flat index) for a common operand type /// This is used for triplet generation - unsigned getEntityIDForCommonOperand( + LLVM_ABI unsigned getEntityIDForCommonOperand( MachineOperand::MachineOperandType OperandType) const { return Layout.CommonOperandBase + getCommonOperandIndex(OperandType); } @@ -221,7 +221,7 @@ class MIRVocabulary { bool IsPhysical = true) const; /// Get the string key for a vocabulary entry at the given position - std::string getStringKey(unsigned Pos) const; + LLVM_ABI std::string getStringKey(unsigned Pos) const; unsigned getDimension() const { return Storage.getDimension(); } @@ -268,7 +268,7 @@ class MIRVocabulary { const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI); /// Create a dummy vocabulary for testing purposes. - static Expected<MIRVocabulary> + LLVM_ABI static Expected<MIRVocabulary> createDummyVocabForTest(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, unsigned Dim = 1); @@ -302,10 +302,10 @@ class MIREmbedder { RegOperandWeight(mir2vec::RegOperandWeight) {} /// Function to compute embeddings. - Embedding computeEmbeddings() const; + LLVM_ABI Embedding computeEmbeddings() const; /// Function to compute the embedding for a given machine basic block. - Embedding computeEmbeddings(const MachineBasicBlock &MBB) const; + LLVM_ABI Embedding computeEmbeddings(const MachineBasicBlock &MBB) const; /// Function to compute the embedding for a given machine instruction. /// Specific to the kind of embeddings being computed. @@ -316,9 +316,9 @@ class MIREmbedder { /// Factory method to create an Embedder object of the specified kind /// Returns nullptr if the requested kind is not supported. 
- static std::unique_ptr<MIREmbedder> create(MIR2VecKind Mode, - const MachineFunction &MF, - const MIRVocabulary &Vocab); + LLVM_ABI static std::unique_ptr<MIREmbedder> + create(MIR2VecKind Mode, const MachineFunction &MF, + const MIRVocabulary &Vocab); /// Computes and returns the embedding for a given machine instruction MI in /// the machine function MF. @@ -369,7 +369,7 @@ class MIR2VecVocabProvider { public: MIR2VecVocabProvider(const MachineModuleInfo &MMI) : MMI(MMI) {} - Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M); + LLVM_ABI Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M); private: Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab, @@ -454,7 +454,7 @@ class MIR2VecPrinterLegacyPass : public MachineFunctionPass { }; /// Create a machine pass that prints MIR2Vec embeddings -MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS); +LLVM_ABI MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS); } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 71739278cf513..fcf7bab09fcff 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -129,7 +129,7 @@ class MachineBasicBlock MCRegister PhysReg; LaneBitmask LaneMask; - RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask) + RegisterMaskPair(MCRegister PhysReg, LaneBitmask LaneMask) : PhysReg(PhysReg), LaneMask(LaneMask) {} bool operator==(const RegisterMaskPair &other) const { diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h index 41df86468aa37..faea0b7de525f 100644 --- a/llvm/include/llvm/CodeGen/MachineDominators.h +++ b/llvm/include/llvm/CodeGen/MachineDominators.h @@ -24,7 +24,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTree.h" #include <cassert> -#include <memory> #include <optional> namespace llvm { diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 27b30bd5929ff..6982dae4718d1 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -982,7 +982,7 @@ class MachineRegisterInfo { /// root registers, the root register and all super registers are reserved. /// This currently iterates the register hierarchy and may be slower than /// expected. - LLVM_ABI bool isReservedRegUnit(unsigned Unit) const; + LLVM_ABI bool isReservedRegUnit(MCRegUnit Unit) const; /// isAllocatable - Returns true when PhysReg belongs to an allocatable /// register class and it hasn't been reserved. 
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 5a2aee2fa7643..7b965d400ed08 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -829,7 +829,7 @@ class ResourceSegments { public: // constructor for empty set - explicit ResourceSegments(){}; + explicit ResourceSegments() = default; bool empty() const { return _Intervals.empty(); } explicit ResourceSegments(const std::list<IntervalTy> &Intervals) : _Intervals(Intervals) { @@ -1038,7 +1038,7 @@ class SchedBoundary { getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle); - bool isUnbufferedGroup(unsigned PIdx) const { + bool isReservedGroup(unsigned PIdx) const { return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin && !SchedModel->getProcResource(PIdx)->BufferSize; } diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h index 82027cad53bdb..4c15bf534d55f 100644 --- a/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -294,7 +294,7 @@ struct RegisterAggr { ref_iterator ref_begin() const { return ref_iterator(*this, false); } ref_iterator ref_end() const { return ref_iterator(*this, true); } - using unit_iterator = typename BitVector::const_set_bits_iterator; + using unit_iterator = BitVector::const_set_bits_iterator; unit_iterator unit_begin() const { return Units.set_bits_begin(); } unit_iterator unit_end() const { return Units.set_bits_end(); } @@ -361,13 +361,6 @@ template <> struct hash<llvm::rdf::RegisterAggr> { } }; -template <> struct equal_to<llvm::rdf::RegisterAggr> { - bool operator()(const llvm::rdf::RegisterAggr &A, - const llvm::rdf::RegisterAggr &B) const { - return A == B; - } -}; - } // namespace std namespace llvm::rdf { diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index d987a5cf1c3df..2893e5ce6647e 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -77,23 +77,23 @@ class MBBReachingDefsInfo { AllReachingDefs[MBBNumber].resize(NumRegUnits); } - void append(unsigned MBBNumber, unsigned Unit, int Def) { + void append(unsigned MBBNumber, MCRegUnit Unit, int Def) { AllReachingDefs[MBBNumber][Unit].push_back(Def); } - void prepend(unsigned MBBNumber, unsigned Unit, int Def) { + void prepend(unsigned MBBNumber, MCRegUnit Unit, int Def) { auto &Defs = AllReachingDefs[MBBNumber][Unit]; Defs.insert(Defs.begin(), Def); } - void replaceFront(unsigned MBBNumber, unsigned Unit, int Def) { + void replaceFront(unsigned MBBNumber, MCRegUnit Unit, int Def) { assert(!AllReachingDefs[MBBNumber][Unit].empty()); *AllReachingDefs[MBBNumber][Unit].begin() = Def; } void clear() { AllReachingDefs.clear(); } - ArrayRef<ReachingDef> defs(unsigned MBBNumber, unsigned Unit) const { + ArrayRef<ReachingDef> defs(unsigned MBBNumber, MCRegUnit Unit) const { if (AllReachingDefs[MBBNumber].empty()) // Block IDs are not necessarily dense. 
       return ArrayRef<ReachingDef>();
diff --git a/llvm/include/llvm/CodeGen/RegAllocRegistry.h b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
index cd81e084a859b..db6264085b8a1 100644
--- a/llvm/include/llvm/CodeGen/RegAllocRegistry.h
+++ b/llvm/include/llvm/CodeGen/RegAllocRegistry.h
@@ -67,7 +67,7 @@ class RegisterRegAlloc : public RegisterRegAllocBase<RegisterRegAlloc> {
 /// RegisterRegAlloc's global Registry tracks allocator registration.
 template <class T>
 MachinePassRegistry<typename RegisterRegAllocBase<T>::FunctionPassCtor>
-RegisterRegAllocBase<T>::Registry;
+    RegisterRegAllocBase<T>::Registry;
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h
index e462a814562dc..790db8a11e390 100644
--- a/llvm/include/llvm/CodeGen/Register.h
+++ b/llvm/include/llvm/CodeGen/Register.h
@@ -10,6 +10,7 @@
 #define LLVM_CODEGEN_REGISTER_H
 
 #include "llvm/MC/MCRegister.h"
+#include "llvm/Support/MathExtras.h"
 #include <cassert>
 
 namespace llvm {
@@ -35,19 +36,23 @@ class Register {
   // DenseMapInfo<unsigned> uses -1u and -2u.
   static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF,
                 "Reg isn't large enough to hold full range.");
-  static constexpr unsigned FirstStackSlot = 1u << 30;
-  static_assert(FirstStackSlot >= MCRegister::LastPhysicalReg);
+  static constexpr unsigned MaxFrameIndexBitwidth = 30;
+  static constexpr unsigned StackSlotZero = 1u << MaxFrameIndexBitwidth;
+  static constexpr unsigned StackSlotMask = StackSlotZero - 1;
+  static_assert(StackSlotZero >= MCRegister::LastPhysicalReg);
   static constexpr unsigned VirtualRegFlag = 1u << 31;
 
   /// Return true if this is a stack slot.
   constexpr bool isStack() const {
-    return Register::FirstStackSlot <= Reg && Reg < Register::VirtualRegFlag;
+    return Register::StackSlotZero <= Reg && Reg < Register::VirtualRegFlag;
   }
 
-  /// Convert a non-negative frame index to a stack slot register value.
+  /// Convert a frame index to a stack slot register value.
   static Register index2StackSlot(int FI) {
-    assert(FI >= 0 && "Cannot hold a negative frame index.");
-    return Register(FI + Register::FirstStackSlot);
+    assert(isInt<MaxFrameIndexBitwidth>(FI) &&
+           "Frame index must be at most 30 bits.");
+    unsigned FIMasked = FI & Register::StackSlotMask;
+    return Register(FIMasked | Register::StackSlotZero);
   }
 
   /// Return true if the specified register number is in
@@ -87,7 +92,7 @@ class Register {
   /// Compute the frame index from a register value representing a stack slot.
   int stackSlotIndex() const {
     assert(isStack() && "Not a stack slot");
-    return static_cast<int>(Reg - Register::FirstStackSlot);
+    return SignExtend32<MaxFrameIndexBitwidth>(Reg & Register::StackSlotMask);
  }
 
   constexpr operator unsigned() const { return Reg; }
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 9a6bf5ffdd227..511cb56f73dcb 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -1311,7 +1311,7 @@ template <typename... 
PatternTs> struct ReassociatableOpc_match { } [[nodiscard]] inline bool - reassociatableMatchHelper(const ArrayRef<SmallBitVector> Matches, + reassociatableMatchHelper(ArrayRef<SmallBitVector> Matches, SmallBitVector &Used, size_t Curr = 0) { if (Curr == Matches.size()) return true; diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index ab0d7e334df44..059a3444c609c 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -77,9 +77,9 @@ namespace llvm { struct PhysRegSUOper { SUnit *SU; int OpIdx; - unsigned RegUnit; + MCRegUnit RegUnit; - PhysRegSUOper(SUnit *su, int op, unsigned R) + PhysRegSUOper(SUnit *su, int op, MCRegUnit R) : SU(su), OpIdx(op), RegUnit(R) {} unsigned getSparseSetIndex() const { return RegUnit; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..b024e8a68bd6e 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1113,7 +1113,8 @@ class SelectionDAG { SDValue Mask, SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by + /// default. LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); @@ -1123,15 +1124,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw (or ptradd nuw inbounds) from the base + /// pointer to the offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which @@ -1256,9 +1260,15 @@ class SelectionDAG { /// stack arguments from being clobbered. LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain); - std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, - SDValue Dst, SDValue Src, SDValue Size, - const CallInst *CI); + /// Lower a memcmp operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. + LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, + SDValue Dst, SDValue Src, + SDValue Size, + const CallInst *CI); + + /// Lower a strlen operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl, SDValue Src, const CallInst *CI); @@ -1708,16 +1718,6 @@ class SelectionDAG { /// the target's desired shift amount type. 
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); - /// Expands a node with multiple results to an FP or vector libcall. The - /// libcall is expected to take all the operands of the \p Node followed by - /// output pointers for each of the results. \p CallRetResNo can be optionally - /// set to indicate that one of the results comes from the libcall's return - /// value. - LLVM_ABI bool - expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node, - SmallVectorImpl<SDValue> &Results, - std::optional<unsigned> CallRetResNo = {}); - /// Expand the specified \c ISD::VAARG node as the Legalize pass would. LLVM_ABI SDValue expandVAArg(SDNode *Node); @@ -2062,6 +2062,10 @@ class SelectionDAG { /// We use this predicate to simplify operations downstream. LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const; + /// Return true if the sign bit of Op is known to be zero, for a + /// floating-point value. + LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const; + /// Return true if 'Op & Mask' is known to be zero. We /// use this predicate to simplify operations downstream. Op and Mask are /// known to be the same type. diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 5241a51dd8cd8..27acc83369f02 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -46,6 +46,7 @@ class SelectionDAGISel { public: TargetMachine &TM; const TargetLibraryInfo *LibInfo; + const RTLIB::RuntimeLibcallsInfo *RuntimeLibCallInfo; std::unique_ptr<FunctionLoweringInfo> FuncInfo; std::unique_ptr<SwiftErrorValueTracking> SwiftError; MachineFunction *MF; @@ -473,6 +474,7 @@ class SelectionDAGISel { void Select_WRITE_REGISTER(SDNode *Op); void Select_UNDEF(SDNode *N); void Select_FAKE_USE(SDNode *N); + void Select_RELOC_NONE(SDNode *N); void CannotYetSelect(SDNode *N); void Select_FREEZE(SDNode *N); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 1759463ea7965..cd466dceb900f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1949,6 +1949,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false); /// be zero. LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false); +/// Return true if the value is a constant floating-point value, or a splatted +/// vector of a constant floating-point value, of 1.0 (with no undefs). +LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 175f205328361..18142c2c0adf3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -113,15 +113,18 @@ struct ExtAddrMode { /// class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: + const TargetRegisterInfo &TRI; + /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables /// (i.e. the table for the active HwMode). This should be indexed by /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands. 
const int16_t *const RegClassByHwMode; - TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u, - unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u, + TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u, + unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u, + unsigned ReturnOpcode = ~0u, const int16_t *const RegClassByHwModeTable = nullptr) - : RegClassByHwMode(RegClassByHwModeTable), + : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable), CallFrameSetupOpcode(CFSetupOpcode), CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode), ReturnOpcode(ReturnOpcode) {} @@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { TargetInstrInfo &operator=(const TargetInstrInfo &) = delete; virtual ~TargetInstrInfo(); + const TargetRegisterInfo &getRegisterInfo() const { return TRI; } + static bool isGenericOpcode(unsigned Opc) { return Opc <= TargetOpcode::GENERIC_OP_END; } @@ -154,9 +159,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. - virtual const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const; + virtual const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const; /// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). @@ -436,7 +440,10 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// MachineSink determines on its own whether the instruction is safe to sink; /// this gives the target a hook to override the default behavior with regards /// to which instructions should be sunk. + /// + /// shouldPostRASink() is used by PostRAMachineSink. virtual bool shouldSink(const MachineInstr &MI) const { return true; } + virtual bool shouldPostRASink(const MachineInstr &MI) const { return true; } /// Return false if the instruction should not be hoisted by MachineLICM. /// @@ -456,8 +463,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// SubIdx. virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const; + unsigned SubIdx, const MachineInstr &Orig) const; /// Clones instruction or the whole instruction bundle \p Orig and /// insert into \p MBB before \p InsertBefore. The target may update operands @@ -1190,8 +1196,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register spill instruction, part of prologue, during the frame lowering. virtual void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::storeRegToStackSlot!"); @@ -1209,8 +1214,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register reload instruction, part of epilogue, during the frame lowering. 
virtual void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::loadRegFromStackSlot!"); @@ -1761,6 +1765,17 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { return true; } + /// Return true if it's safe to move a machine instruction. + /// This allows the backend to prevent certain special instruction + /// sequences from being broken by instruction motion in optimization + /// passes. + /// By default, this returns true for every instruction. + virtual bool isSafeToMove(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + return true; + } + /// Test if the given instruction should be considered a scheduling boundary. /// This primarily includes labels and terminators. virtual bool isSchedulingBoundary(const MachineInstr &MI, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 1920b98c8a1ef..cec7d09f494d6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/LibcallLoweringInfo.h" #include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcallUtil.h" @@ -57,7 +58,6 @@ #include <cassert> #include <climits> #include <cstdint> -#include <iterator> #include <map> #include <string> #include <utility> @@ -1678,7 +1678,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2792,7 +2792,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; @@ -3597,7 +3597,7 @@ class LLVM_ABI TargetLoweringBase { } const RTLIB::RuntimeLibcallsInfo &getRuntimeLibcallsInfo() const { - return Libcalls; + return RuntimeLibcallInfo; } void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) { @@ -3610,9 +3610,9 @@ class LLVM_ABI TargetLoweringBase { } /// Get the libcall routine name for the specified libcall. + // FIXME: This should be removed. Only LibcallImpl should have a name. 
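+  // Prefer getLibcallImpl() combined with getLibcallImplName() in new code.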
const char *getLibcallName(RTLIB::Libcall Call) const { - // FIXME: Return StringRef - return Libcalls.getLibcallName(Call).data(); + return Libcalls.getLibcallName(Call); } /// Get the libcall routine name for the specified libcall implementation @@ -3620,15 +3620,12 @@ class LLVM_ABI TargetLoweringBase { return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Call); } - const char *getMemcpyName() const { - // FIXME: Return StringRef - return Libcalls.getMemcpyName().data(); - } + RTLIB::LibcallImpl getMemcpyImpl() const { return Libcalls.getMemcpyImpl(); } /// Check if this is valid libcall for the current module, otherwise /// RTLIB::Unsupported. RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const { - return Libcalls.getSupportedLibcallImpl(FuncName); + return RuntimeLibcallInfo.getSupportedLibcallImpl(FuncName); } /// Get the comparison predicate that's to be used to test the result of the @@ -3636,11 +3633,6 @@ class LLVM_ABI TargetLoweringBase { /// floating-point compare libcalls. ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const; - /// Set the CallingConv that should be used for the specified libcall. - void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { - Libcalls.setLibcallImplCallingConv(Call, CC); - } - /// Get the CallingConv that should be used for the specified libcall /// implementation. CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { @@ -3837,8 +3829,11 @@ class LLVM_ABI TargetLoweringBase { std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType> PromoteToType; + /// FIXME: This should not live here; it should come from an analysis. + const RTLIB::RuntimeLibcallsInfo RuntimeLibcallInfo; + /// The list of libcalls that the target will use. - RTLIB::RuntimeLibcallsInfo Libcalls; + LibcallLoweringInfo Libcalls; /// The bits of IndexedModeActions used to store the legalisation actions /// We store the data as | ML | MS | L | S | each taking 4 bits. @@ -5649,17 +5644,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// Get a pointer to vector element \p Idx located in memory for a vector of /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of /// bounds the returned pointer is unspecified, but will be within the vector - /// bounds. - SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - SDValue Index) const; + /// bounds. \p PtrArithFlags can be used to mark that arithmetic within the + /// vector in memory is known to not wrap or to be inbounds. + SDValue getVectorElementPointer( + SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; + + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. \p VecPtr is guaranteed to point to the beginning of a memory + /// location large enough for the vector. + SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index) const { + return getVectorElementPointer(DAG, VecPtr, VecVT, Index, + SDNodeFlags::NoUnsignedWrap | + SDNodeFlags::InBounds); + } /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located /// in memory for a vector of type \p VecVT starting at a base address of /// \p VecPtr. 
If \p Idx plus the size of \p SubVecVT is out of bounds the /// returned pointer is unspecified, but the value returned will be such that - /// the entire subvector would be within the vector bounds. - SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - EVT SubVecVT, SDValue Index) const; + /// the entire subvector would be within the vector bounds. \p PtrArithFlags + /// can be used to mark that arithmetic within the vector in memory is known + /// to not wrap or to be inbounds. + SDValue + getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This /// method accepts integers as its arguments. @@ -5744,6 +5757,16 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// consisting of zext/sext, extract_subvector, mul and add operations. SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const; + /// Expands a node with multiple results to an FP or vector libcall. The + /// libcall is expected to take all the operands of the \p Node followed by + /// output pointers for each of the results. \p CallRetResNo can be optionally + /// set to indicate that one of the results comes from the libcall's return + /// value. + bool expandMultipleResultFPLibCall( + SelectionDAG &DAG, RTLIB::Libcall LC, SDNode *Node, + SmallVectorImpl<SDValue> &Results, + std::optional<unsigned> CallRetResNo = {}) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index f031353422e40..dabf0dc5ec173 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -958,7 +958,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { TypeSize getRegSizeInBits(Register Reg, const MachineRegisterInfo &MRI) const; /// Get the weight in units of pressure for this register unit. - virtual unsigned getRegUnitWeight(unsigned RegUnit) const = 0; + virtual unsigned getRegUnitWeight(MCRegUnit RegUnit) const = 0; /// Get the number of dimensions of register pressure. virtual unsigned getNumRegPressureSets() const = 0; @@ -978,7 +978,7 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo { /// Get the dimensions of register pressure impacted by this register unit. /// Returns a -1 terminated array of pressure set IDs. - virtual const int *getRegUnitPressureSets(unsigned RegUnit) const = 0; + virtual const int *getRegUnitPressureSets(MCRegUnit RegUnit) const = 0; /// Get the scale factor of spill weight for this register class. virtual float getSpillWeightScaleFactor(const TargetRegisterClass *RC) const; @@ -1446,7 +1446,7 @@ LLVM_ABI Printable printReg(Register Reg, /// fp0~st7 - Dual roots. /// /// Usage: OS << printRegUnit(Unit, TRI) << '\n'; -LLVM_ABI Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); +LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI); /// Create Printable object to print virtual registers and physical /// registers on a \ref raw_ostream. 
diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index a8c7a8aff83cf..a1a130aa27798 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -210,6 +210,10 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
   /// can be overridden.
   virtual bool enableJoinGlobalCopies() const;
 
+  /// Hack to bring up this option. It should be unconditionally true; all
+  /// targets should enable it and then delete this hook.
+  virtual bool enableTerminalRule() const { return false; }
+
   /// True if the subtarget should run a scheduler after register allocation.
   ///
   /// By default this queries the PostRAScheduling bit in the scheduling model
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 9cea327819895..24d9de842645a 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,30 +34,9 @@ class ShapeT {
     if (MRI)
       deduceImm(MRI);
   }
-  // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col)
-  // and ImmShapes. Due to the most case is only one shape (just simply use
-  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
-  // Shapes to keep the speed and code simplicity.
-  // TODO: The upper solution is a temporary way to minimize current tile
-  // register allocation code changes. It can not handle both Reg shape and
-  // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
-  // is imm shape). Refine me when we have more multi-tile shape instructions!
-  ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
-         const MachineRegisterInfo *MRI = nullptr)
-      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
-        ColImm(InvalidImmShape) {
-    assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");
-
-    llvm::append_range(Shapes, ShapesOperands);
-
-    if (MRI)
-      deduceImm(MRI);
-  }
   ShapeT()
       : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
         ColImm(InvalidImmShape) {}
-  // TODO: We need to extern cmp operator for multi-shapes if
-  // we have requirement in the future.
   bool operator==(const ShapeT &Shape) const {
     MachineOperand *R = Shape.Row;
     MachineOperand *C = Shape.Col;
@@ -74,40 +53,11 @@ class ShapeT {
 
   bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); }
 
-  MachineOperand *getRow(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Row;
-    assert(Shapes.size() / 2 >= I && "Get invalid row from id!");
-    return Shapes[I * 2];
-  }
-
-  MachineOperand *getCol(unsigned I = 0) const {
-    if (Shapes.empty())
-      return Col;
-    assert(Shapes.size() / 2 >= I && "Get invalid col from id!");
-    return Shapes[I * 2 + 1];
-  }
-
-  int64_t getRowImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return RowImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!");
-    return ImmShapes[I * 2];
-  }
-
-  int64_t getColImm(unsigned I = 0) const {
-    if (ImmShapes.empty())
-      return ColImm;
-    assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!");
-    return ImmShapes[I * 2 + 1];
-  }
+  MachineOperand *getRow() const { return Row; }
+  MachineOperand *getCol() const { return Col; }
 
-  unsigned getShapeNum() {
-    if (Shapes.empty())
-      return isValid() ? 
1 : 0; - else - return Shapes.size() / 2; - } + int64_t getRowImm() const { return RowImm; } + int64_t getColImm() const { return ColImm; } bool isValid() { return (Row != nullptr) && (Col != nullptr); } @@ -120,35 +70,14 @@ class ShapeT { for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { const auto *MI = DefMO.getParent(); if (MI->isMoveImmediate()) { - assert(MI->getNumOperands() == 2 && - "Unsupported number of operands in instruction for setting " - "row/column."); - if (MI->getOperand(1).isImm()) { - Imm = MI->getOperand(1).getImm(); - } else { - assert(MI->getOperand(1).isImplicit() && - "Operand 1 is assumed to be implicit."); - Imm = 0; - } + Imm = MI->getOperand(1).getImm(); break; } } return Imm; }; - if (Shapes.empty()) { // Single Shape - RowImm = GetImm(Row->getReg()); - ColImm = GetImm(Col->getReg()); - // The number of rows of 2nd destination buffer is assigned by the one of - // 1st destination buffer. If the column size is equal to zero, the row - // size should be reset to zero too. - if (ColImm == 0) - Row = Col; - } else { // Multiple Shapes - for (auto *Shape : Shapes) { - int64_t ImmShape = GetImm(Shape->getReg()); - ImmShapes.push_back(ImmShape); - } - } + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); } private: @@ -157,9 +86,6 @@ class ShapeT { MachineOperand *Col; int64_t RowImm = -1; int64_t ColImm = -1; - // Multiple Shapes - SmallVector<MachineOperand *, 0> Shapes; - SmallVector<int64_t, 0> ImmShapes; }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/WindowScheduler.h b/llvm/include/llvm/CodeGen/WindowScheduler.h index 476d5ada27876..97776de353e3f 100644 --- a/llvm/include/llvm/CodeGen/WindowScheduler.h +++ b/llvm/include/llvm/CodeGen/WindowScheduler.h @@ -105,7 +105,7 @@ class WindowScheduler { public: WindowScheduler(MachineSchedContext *C, MachineLoop &ML); - virtual ~WindowScheduler() {} + virtual ~WindowScheduler() = default; bool run(); diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h index 4c1fe13790011..472a3f3e23b3f 100644 --- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h +++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h @@ -340,18 +340,18 @@ class LLT { /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. 
/// * Non-pointer scalar (isPointer == 0 && isVector == 0): /// SizeInBits: 32; - static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29}; + static constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29}; /// * Pointer (isPointer == 1 && isVector == 0): /// SizeInBits: 16; /// AddressSpace: 24; - static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 45}; - static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21}; + static constexpr BitFieldInfo PointerSizeFieldInfo{16, 45}; + static constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21}; /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): /// NumElements: 16; /// SizeOfElement: 32; /// Scalable: 1; - static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 5}; - static const constexpr BitFieldInfo VectorScalableFieldInfo{1, 0}; + static constexpr BitFieldInfo VectorElementsFieldInfo{16, 5}; + static constexpr BitFieldInfo VectorScalableFieldInfo{1, 0}; /// * Vector-of-pointer (isPointer == 1 && isVector == 1): /// NumElements: 16; /// SizeOfElement: 16; diff --git a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h index 5e0779157473e..8fde15d342a15 100644 --- a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h +++ b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h @@ -12,7 +12,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include <cstdint> -#include <utility> namespace llvm { namespace dwarf_linker { diff --git a/llvm/include/llvm/DWARFLinker/StringPool.h b/llvm/include/llvm/DWARFLinker/StringPool.h index d0f4e211fac3e..7838e3b8d6f20 100644 --- a/llvm/include/llvm/DWARFLinker/StringPool.h +++ b/llvm/include/llvm/DWARFLinker/StringPool.h @@ -20,7 +20,7 @@ namespace dwarf_linker { /// StringEntry keeps data of the string: the length, external offset /// and a string body which is placed right after StringEntry. -using StringEntry = StringMapEntry<std::nullopt_t>; +using StringEntry = StringMapEntry<EmptyStringSetTag>; class StringPoolEntryInfo { public: diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h index b769e53d80270..7a1008689296d 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h @@ -15,7 +15,6 @@ #include "llvm/Support/Compiler.h" #include <cinttypes> -#include <type_traits> #include "llvm/ADT/STLForwardCompat.h" #include "llvm/Support/Endian.h" diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. 
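+  /// An empty list means no filtering is applied.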
+ llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index be78647cf9fea..bd204f626ac01 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -28,7 +28,6 @@ #include <cstdint> #include <map> #include <memory> -#include <set> #include <utility> #include <vector> @@ -136,8 +135,8 @@ class DWARFUnitVector final : public SmallVector<std::unique_ptr<DWARFUnit>, 1> public: using UnitVector = SmallVectorImpl<std::unique_ptr<DWARFUnit>>; - using iterator = typename UnitVector::iterator; - using iterator_range = llvm::iterator_range<typename UnitVector::iterator>; + using iterator = UnitVector::iterator; + using iterator_range = llvm::iterator_range<UnitVector::iterator>; using compile_unit_range = decltype(make_filter_range(std::declval<iterator_range>(), isCompileUnit)); diff --git a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h index c571112344254..e636296b058fd 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h +++ b/llvm/include/llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h @@ -17,8 +17,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Triple.h" -#include <map> -#include <memory> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h index e3e9b2bb91e8a..f9382fa8d9577 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h @@ -12,7 +12,6 @@ #include "llvm/DebugInfo/DIContext.h" #include <cstdint> #include <memory> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h index 4caf1236dc0fb..7e3fd18422cdd 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h @@ -20,7 +20,6 @@ #include "llvm/DebugInfo/LogicalView/Core/LVSupport.h" #include "llvm/Support/Compiler.h" #include <limits> -#include <list> #include <string> namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h index 2e2619c55d58e..78978831dc641 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h @@ -20,7 +20,6 @@ #include "llvm/DebugInfo/LogicalView/Core/LVSort.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Compiler.h" -#include <list> #include <map> #include <set> diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h index 8992faead73bb..bbed56b517093 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h +++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleList.h @@ -32,7 +32,7 @@ struct FileInfoSubstreamHeader; class DbiModuleSourceFilesIterator : public iterator_facade_base<DbiModuleSourceFilesIterator, std::random_access_iterator_tag, StringRef> { - using BaseType = typename DbiModuleSourceFilesIterator::iterator_facade_base; + using BaseType = DbiModuleSourceFilesIterator::iterator_facade_base; public: LLVM_ABI DbiModuleSourceFilesIterator(const 
DbiModuleList &Modules,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
index 76a019ddf8f34..17b5bfac9ac31 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
@@ -19,7 +19,6 @@
 #include "llvm/Support/FormatVariadic.h"
 
 #include <string>
-#include <type_traits>
 
 namespace llvm {
 namespace pdb {
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 62d427c3966bb..67de123fdbad5 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -1366,7 +1366,7 @@ class TemplateTemplateParamDecl final : public Node {
   template <typename Fn> void match(Fn F) const { F(Name, Params, Requires); }
 
   void printLeft(OutputBuffer &OB) const override {
-    ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+    ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
     OB += "template<";
     Params.printWithComma(OB);
     OB += "> typename ";
@@ -1550,7 +1550,7 @@ class TemplateArgs final : public Node {
   NodeArray getParams() { return Params; }
 
   void printLeft(OutputBuffer &OB) const override {
-    ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+    ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
     OB += "<";
     Params.printWithComma(OB);
     OB += ">";
@@ -1824,7 +1824,7 @@ class ClosureTypeName : public Node {
 
   void printDeclarator(OutputBuffer &OB) const {
     if (!TemplateParams.empty()) {
-      ScopedOverride<unsigned> LT(OB.GtIsGt, 0);
+      ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true);
       OB += "<";
       TemplateParams.printWithComma(OB);
       OB += ">";
@@ -1885,7 +1885,9 @@ class BinaryExpr : public Node {
   }
 
   void printLeft(OutputBuffer &OB) const override {
-    bool ParenAll = OB.isGtInsideTemplateArgs() &&
+    // If we're printing a '>' inside of a template argument, and we haven't
+    // yet parenthesized the expression, do so now.
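+    // The same applies to '>>', which could otherwise be read as closing two
+    // template argument lists at once.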
+ bool ParenAll = OB.isInsideTemplateArgs() && !OB.isInParensInTemplateArgs() && (InfixOperator == ">" || InfixOperator == ">>"); if (ParenAll) OB.printOpen(); @@ -2061,7 +2063,7 @@ class CastExpr : public Node { void printLeft(OutputBuffer &OB) const override { OB += CastKind; { - ScopedOverride<unsigned> LT(OB.GtIsGt, 0); + ScopedOverride<bool> LT(OB.TemplateTracker.InsideTemplate, true); OB += "<"; OB.printLeft(*To); OB += ">"; diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h index 155cfe8dd3a98..711aa70a4a8d3 100644 --- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h +++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h @@ -708,7 +708,7 @@ struct DEMANGLE_ABI SpecialTableSymbolNode : public SymbolNode { return N->kind() == NodeKind::SpecialTableSymbol; } - QualifiedNameNode *TargetName = nullptr; + NodeArrayNode *TargetNames = nullptr; Qualifiers Quals = Qualifiers::Q_None; }; diff --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h index 002a1f55467d6..afdc1a397ca6f 100644 --- a/llvm/include/llvm/Demangle/Utility.h +++ b/llvm/include/llvm/Demangle/Utility.h @@ -81,7 +81,7 @@ class OutputBuffer { OutputBuffer(const OutputBuffer &) = delete; OutputBuffer &operator=(const OutputBuffer &) = delete; - virtual ~OutputBuffer() {} + virtual ~OutputBuffer() = default; operator std::string_view() const { return std::string_view(Buffer, CurrentPosition); } @@ -104,18 +104,32 @@ class OutputBuffer { unsigned CurrentPackIndex = std::numeric_limits<unsigned>::max(); unsigned CurrentPackMax = std::numeric_limits<unsigned>::max(); - /// When zero, we're printing template args and '>' needs to be parenthesized. - /// Use a counter so we can simply increment inside parentheses. - unsigned GtIsGt = 1; + struct { + /// The depth of '(' and ')' inside the currently printed template + /// arguments. + unsigned ParenDepth = 0; - bool isGtInsideTemplateArgs() const { return GtIsGt == 0; } + /// True if we're currently printing a template argument. + bool InsideTemplate = false; + } TemplateTracker; + + /// Returns true if we're currently between a '(' and ')' when printing + /// template args. + bool isInParensInTemplateArgs() const { + return TemplateTracker.ParenDepth > 0; + } + + /// Returns true if we're printing template args. + bool isInsideTemplateArgs() const { return TemplateTracker.InsideTemplate; } void printOpen(char Open = '(') { - GtIsGt++; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth++; *this += Open; } void printClose(char Close = ')') { - GtIsGt--; + if (isInsideTemplateArgs()) + TemplateTracker.ParenDepth--; *this += Close; } diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h new file mode 100644 index 0000000000000..a996dfd9543df --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h @@ -0,0 +1,37 @@ +//===--- ELF_systemz.h - JIT link functions for ELF/systemz --*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/systemz.
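To make the GtIsGt-to-TemplateTracker migration above easier to follow, here is a self-contained sketch of the tracking scheme. ScopedOverride is re-implemented locally so the snippet compiles on its own; it is an illustration of the idea, not the demangler's actual buffer:

```cpp
#include <cassert>

// Minimal stand-in for the demangler's ScopedOverride: set a value for one
// scope, restore the old value on exit.
template <typename T> struct ScopedOverride {
  T &Slot;
  T Saved;
  ScopedOverride(T &S, T NewVal) : Slot(S), Saved(S) { Slot = NewVal; }
  ~ScopedOverride() { Slot = Saved; }
};

struct TemplateTracker {
  unsigned ParenDepth = 0;   // '('/')' nesting inside template args
  bool InsideTemplate = false;
};

int main() {
  TemplateTracker T;
  {
    ScopedOverride<bool> Guard(T.InsideTemplate, true); // entering "<...>"
    T.ParenDepth++; // printOpen('(') counts only while inside template args
    // With ParenDepth > 0 a '>' cannot close the template list, so it needs
    // no extra parentheses.
    T.ParenDepth--; // printClose(')')
    // Now ParenDepth == 0: a '>' printed here must be parenthesized.
    assert(T.InsideTemplate && T.ParenDepth == 0);
  }
  assert(!T.InsideTemplate); // restored when the override goes out of scope
  return 0;
}
```

The paren depth replaces the old trick of incrementing the GtIsGt counter inside parentheses, and the boolean makes the "are we inside template args at all" question explicit.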
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/systemz relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr<orc::SymbolStringPool> SSP); + +/// jit-link the given object buffer, which must be an ELF/systemz relocatable +/// object file. +void link_ELF_systemz(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 98170f60f6e49..9479c107447d5 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -175,7 +175,7 @@ struct HalfWords { /// FixupInfo base class is required for dynamic lookups. struct FixupInfoBase { LLVM_ABI static const FixupInfoBase *getDynFixupInfo(Edge::Kind K); - virtual ~FixupInfoBase() {} + virtual ~FixupInfoBase() = default; }; /// FixupInfo checks for Arm edge kinds work on 32-bit words diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h new file mode 100644 index 0000000000000..dde3448cd5da7 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h @@ -0,0 +1,924 @@ +//=== systemz.h - Generic JITLink systemz edge kinds, utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H + +#include "TableManager.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +using namespace llvm::support::endian; + +namespace llvm { +namespace jitlink { +namespace systemz { + +/// Represents systemz fixups and other systemz-specific edge kinds. +enum EdgeKind_systemz : Edge::Kind { + + /// A plain 64-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint64 + /// + Pointer64 = Edge::FirstRelocation, + + /// A plain 32-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint32 + /// + /// Errors: + /// - The target must reside in the low 32-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer32, + + /// A plain 20-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint20 + /// + /// Errors: + /// - The target must reside in the low 20-bits of the address space, + /// otherwise an out-of-range error will be returned.
+ /// + Pointer20, + + /// A plain 16-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint16 + /// + /// Errors: + /// - The target must reside in the low 16-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer16, + + /// A plain 12-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint12 + /// + /// Errors: + /// - The target must reside in the low 12-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer12, + + /// A plain 8-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint8 + /// + /// Errors: + /// - The target must reside in the low 8-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer8, + + /// A 64-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + Delta64, + + /// A 32-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32, + + /// A 16-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int16 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16, + + /// A 32-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dbl, + + /// A 24-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta24dbl, + + /// A 16-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta16dbl, + + /// A 12-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned.
+ /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta12dbl, + + /// A 64-bit negative delta. + /// + /// Delta from target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int64 + /// + NegDelta64, + + /// A 32-bit negative delta. + /// + /// Delta from the target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + NegDelta32, + + /// A 32-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT32dbl, + + /// A 24-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT24dbl, + + /// A 16-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT16dbl, + + /// A 12-bit delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT12dbl, + + /// A 64-bit delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + DeltaPLT64, + + /// A 32-bit delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned.
+ /// + DeltaPLT32, + + /// A 64-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// + Delta64PLTFromGOT, + + /// A 32-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32PLTFromGOT, + + /// A 16-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16PLTFromGOT, + + /// A 64-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// + Delta64FromGOT, + + /// A 32-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32FromGOT, + + /// A 16-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16FromGOT, + + /// A 20-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int20 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int20, otherwise + /// an out-of-range error will be returned. + /// + Delta20FromGOT, + + /// A 12-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int12 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int12, otherwise + /// an out-of-range error will be returned. + /// + Delta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta64FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta64FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default.
+ /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta64FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta32FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta20FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta20FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta20FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta16FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta16FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta16FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta12FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta12FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32dbl pointing at + /// the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32dbl targeting + /// the GOT entry for the edge's current target, maintaining the same addend. + /// A GOT entry for the target should be created if one does not already + /// exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default.
+ /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta32dbl, + + /// A 32-bit delta to GOT base. + /// + /// Fixup expression: + /// Fixup <- GOTBase - Fixup + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32GOTBase, + + /// A 32-bit delta to GOT base shifted by 1. + /// + /// Fixup expression: + /// Fixup <- (GOTBase - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - *ASSERTION* Asserts if GOTSymbol is null, i.e. the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be a multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dblGOTBase, + +}; + +/// Returns a string name for the given systemz edge. For debugging purposes +/// only. +const char *getEdgeKindName(Edge::Kind K); + +/// Apply fixup expression for edge to block content. +inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, + const Symbol *GOTSymbol) { + using namespace support; + + char *BlockWorkingMem = B.getAlreadyMutableContent().data(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); + int64_t S = E.getTarget().getAddress().getValue(); + int64_t A = E.getAddend(); + int64_t P = FixupAddress.getValue(); + int64_t GOTBase = GOTSymbol ? GOTSymbol->getAddress().getValue() : 0; + Edge::Kind K = E.getKind(); + + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Applying fixup on " << G.getEdgeKindName(K) + << " edge, (S, A, P, .GOT.) = (" << formatv("{0:x}", S) << ", " + << formatv("{0:x}", A) << ", " << formatv("{0:x}", P) << ", " + << formatv("{0:x}", GOTBase) << ")\n"; + }); + + const auto isAlignmentCorrect = [](uint64_t Value, int N) { + return (Value & (N - 1)) == 0;
+ }; + + switch (K) { + case Pointer64: { + uint64_t Value = S + A; + write64be(FixupPtr, Value); + break; + } + case Pointer32: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Pointer20: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Pointer16: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Pointer12: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + case Pointer8: { + uint64_t Value = S + A; + if (LLVM_UNLIKELY(!isUInt<8>(Value))) + return makeTargetOutOfRangeError(G, B, E); + *(uint8_t *)FixupPtr = Value; + break; + } + case Delta64: + case DeltaPLT64: { + int64_t Value = S + A - P; + write64be(FixupPtr, Value); + break; + } + case Delta32: + case DeltaPLT32: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case NegDelta32: { + int64_t Value = P + A - S; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dbl: + case DeltaPLT32dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta24dbl: + case DeltaPLT24dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<25>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + FixupPtr[0] = Value >> 17; + FixupPtr[1] = Value >> 9; + FixupPtr[2] = Value >> 1; + break; + } + case Delta16dbl: + case DeltaPLT16dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<17>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, Value >> 1); + break; + } + case Delta12dbl: + case DeltaPLT12dbl: { + int64_t Value = S + A - P; + if (LLVM_UNLIKELY(!isInt<13>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, + (read16be(FixupPtr) & 0xF000) | ((Value >> 1) & 0x0FFF)); + break; + } + case Delta32GOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dblGOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (LLVM_UNLIKELY(!isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E);
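+ // Worked example with illustrative numbers (not from the patch): with + // GOTBase=0x2000, A=0, P=0x1FF6, Value is 0xA; it fits in 33 bits + // (checked above), must be even (checked below), and 0xA >> 1 == 0x5 is + // what the 32-bit field receives.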
+ if (LLVM_UNLIKELY(!isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta64PLTFromGOT: + case Delta64FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + write64be(FixupPtr, Value); + break; + } + case Delta32PLTFromGOT: + case Delta32FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + if (LLVM_UNLIKELY(!isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16PLTFromGOT: + case Delta16FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + if (LLVM_UNLIKELY(!isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Delta20FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (LLVM_UNLIKELY(!isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Delta12FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (LLVM_UNLIKELY(!isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + default: + return make_error<JITLinkError>( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " unsupported edge kind " + getEdgeKindName(E.getKind())); + } + + return Error::success(); +} + +/// SystemZ null pointer content. +extern const char NullPointerContent[8]; +inline ArrayRef<char> getGOTEntryBlockContent(LinkGraph &G) { + return {reinterpret_cast<const char *>(NullPointerContent), + G.getPointerSize()}; +} + +/// SystemZ pointer jump stub content. +/// +/// Contains the instruction sequence for an indirect jump via an in-memory +/// pointer: +/// lgrl %r1, ptr +/// br %r1 +constexpr size_t StubEntrySize = 8; +extern const char Pointer64JumpStubContent[StubEntrySize]; +inline ArrayRef<char> getStubBlockContent(LinkGraph &G) { + auto StubContent = Pointer64JumpStubContent; + return {reinterpret_cast<const char *>(StubContent), StubEntrySize}; +} + +/// Creates a new pointer block in the given section and returns an +/// anonymous symbol pointing to it. +/// +/// If InitialTarget is given then a Pointer64 relocation will be added to the +/// block pointing at InitialTarget. +inline Symbol &createAnonymousPointer(LinkGraph &G, Section &PointerSection, + Symbol *InitialTarget = nullptr, + uint64_t InitialAddend = 0) { + auto &B = G.createContentBlock(PointerSection, getGOTEntryBlockContent(G), + orc::ExecutorAddr(), G.getPointerSize(), 0); + if (InitialTarget) + B.addEdge(Pointer64, 0, *InitialTarget, InitialAddend); + return G.addAnonymousSymbol(B, 0, G.getPointerSize(), false, false); +} + +/// Create a jump stub block that jumps via the pointer at the given symbol.
+/// +/// The stub block will have the following default values: +/// alignment: 16 bytes +/// alignment-offset: 0 +inline Block &createPointerJumpStubBlock(LinkGraph &G, Section &StubSection, + Symbol &PointerSymbol) { + auto &B = G.createContentBlock(StubSection, getStubBlockContent(G), + orc::ExecutorAddr(), 16, 0); + B.addEdge(Delta32dbl, 2, PointerSymbol, 2); + return B; +} + +/// Create a jump stub that jumps via the pointer at the given symbol and +/// an anonymous symbol pointing to it. Return the anonymous symbol. +/// +/// The stub block will be created by createPointerJumpStubBlock. +inline Symbol &createAnonymousPointerJumpStub(LinkGraph &G, + Section &StubSection, + Symbol &PointerSymbol) { + return G.addAnonymousSymbol( + createPointerJumpStubBlock(G, StubSection, PointerSymbol), 0, + StubEntrySize, true, false); +} + +/// Global Offset Table Builder. +class GOTTableManager : public TableManager<GOTTableManager> { +public: + static StringRef getSectionName() { return "$__GOT"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + Edge::Kind KindToSet = Edge::Invalid; + switch (E.getKind()) { + case systemz::RequestGOTAndTransformToDelta12FromGOT: + KindToSet = systemz::Delta12FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta16FromGOT: + KindToSet = systemz::Delta16FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta20FromGOT: + KindToSet = systemz::Delta20FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32FromGOT: + KindToSet = systemz::Delta32FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta64FromGOT: + KindToSet = systemz::Delta64FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32dbl: + KindToSet = systemz::Delta32dbl; + break; + default: + return false; + } + assert(KindToSet != Edge::Invalid && + "Fell through switch, but no new kind to set"); + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointer(G, getGOTSection(G), &Target); + } + +private: + Section &getGOTSection(LinkGraph &G) { + if (!GOTSection) + GOTSection = &G.createSection(getSectionName(), orc::MemProt::Read); + return *GOTSection; + } + + Section *GOTSection = nullptr; +}; + +/// Procedure Linkage Table Builder.
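Before the PLT builder that follows: the two managers are intended to cooperate, with the PLT manager routing its pointer slots through the GOT manager. A minimal sketch of how a backend would typically wire them into a link, assuming the visitExistingEdges helper that JITLink's TableManager header provides on other targets (the pass name here is hypothetical):

```cpp
#include "llvm/ExecutionEngine/JITLink/systemz.h"

using namespace llvm;
using namespace llvm::jitlink;

// Hypothetical post-prune pass: walk every edge once and let each manager
// claim the edge kinds it understands, creating GOT entries and PLT stubs
// on demand.
static Error buildTables_ELF_systemz(LinkGraph &G) {
  systemz::GOTTableManager GOT;
  systemz::PLTTableManager PLT(GOT);
  visitExistingEdges(G, GOT, PLT);
  return Error::success();
}
```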
+class PLTTableManager : public TableManager<PLTTableManager> { +public: + PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} + + static StringRef getSectionName() { return "$__STUBS"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + + switch (E.getKind()) { + case systemz::DeltaPLT32: + case systemz::DeltaPLT64: + case systemz::DeltaPLT12dbl: + case systemz::DeltaPLT16dbl: + case systemz::DeltaPLT24dbl: + case systemz::DeltaPLT32dbl: + case systemz::Delta16PLTFromGOT: + case systemz::Delta32PLTFromGOT: + case systemz::Delta64PLTFromGOT: + break; + default: + return false; + } + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointerJumpStub(G, getStubsSection(G), + GOT.getEntryForTarget(G, Target)); + } + +public: + Section &getStubsSection(LinkGraph &G) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *StubsSection; + } + + GOTTableManager &GOT; + Section *StubsSection = nullptr; +}; + +} // namespace systemz +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index f964d006f4ae1..01e9cf914cb54 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -43,11 +43,7 @@ #include <algorithm> #include <cassert> #include <functional> -#include <iterator> -#include <list> #include <memory> -#include <optional> -#include <set> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h index 7b5d0f0eaba26..9dda1f94c75c7 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h @@ -18,7 +18,6 @@ #include <memory> #include <mutex> -#include <vector> namespace llvm::orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index dd4102599bdb5..1296e24fa4162 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -36,7 +36,7 @@ size_t writeMachOStruct(MutableArrayRef<char> Buf, size_t Offset, MachOStruct S, /// Base type for MachOBuilder load command wrappers. 
struct MachOBuilderLoadCommandBase { - virtual ~MachOBuilderLoadCommandBase() {} + virtual ~MachOBuilderLoadCommandBase() = default; virtual size_t size() const = 0; virtual size_t write(MutableArrayRef<char> Buf, size_t Offset, bool SwapStruct) = 0; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 8c6a8f5899c17..a0499f79704eb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -26,7 +26,6 @@ #include <algorithm> #include <cassert> #include <functional> -#include <list> #include <memory> #include <utility> #include <vector> diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h b/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h index e6384eb4b6d26..a30890aa17c60 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/SectCreate.h @@ -18,7 +18,6 @@ #include "llvm/Support/Compiler.h" #include <utility> -#include <vector> namespace llvm::orc { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h new file mode 100644 index 0000000000000..81c6a0b01530a --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h @@ -0,0 +1,172 @@ +//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H +#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H + +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" + +#include <cmath> +#include <vector> + +namespace llvm { +namespace orc { + +namespace shared { +using SPSBloomFilter = + SPSTuple<bool, uint32_t, uint32_t, uint32_t, SPSSequence<uint64_t>>; +} + +class BloomFilter { +public: + using HashFunc = std::function<uint32_t(StringRef)>; + + BloomFilter() = default; + BloomFilter(BloomFilter &&) noexcept = default; + BloomFilter &operator=(BloomFilter &&) noexcept = default; + BloomFilter(const BloomFilter &) = delete; + BloomFilter &operator=(const BloomFilter &) = delete; + + BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn) + : HashFn(std::move(hashFn)) { + initialize(SymbolCount, FalsePositiveRate); + } + bool isInitialized() const { return Initialized; } + + void add(StringRef Sym) { + assert(Initialized); + addHash(HashFn(Sym)); + } + + bool mayContain(StringRef Sym) const { + return !isEmpty() && testHash(HashFn(Sym)); + } + + bool isEmpty() const { return SymbolCount == 0; } + +private: + friend class shared::SPSSerializationTraits<shared::SPSBloomFilter, + BloomFilter>; + static constexpr uint32_t BitsPerEntry = 64; + + bool Initialized = false; + uint32_t SymbolCount = 0; + uint32_t BloomSize = 0; + uint32_t BloomShift = 0; + std::vector<uint64_t> BloomTable; + HashFunc HashFn; + + void initialize(uint32_t SymCount, float FalsePositiveRate) { + assert(SymCount > 0); + SymbolCount = SymCount; + Initialized = true; + + float ln2 = std::log(2.0f); + float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2); + BloomSize = static_cast<uint32_t>(std::ceil(M / 
BitsPerEntry)); + BloomShift = std::min(6u, log2ceil(SymbolCount)); + BloomTable.resize(BloomSize, 0); + } + + void addHash(uint32_t Hash) { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + BloomTable[N] |= Mask; + } + + bool testHash(uint32_t Hash) const { + uint32_t Hash2 = Hash >> BloomShift; + uint32_t N = (Hash / BitsPerEntry) % BloomSize; + uint64_t Mask = + (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry)); + return (BloomTable[N] & Mask) == Mask; + } + + static constexpr uint32_t log2ceil(uint32_t V) { + return V <= 1 ? 0 : 32 - countl_zero(V - 1); + } +}; + +class BloomFilterBuilder { +public: + using HashFunc = BloomFilter::HashFunc; + + BloomFilterBuilder() = default; + + BloomFilterBuilder &setFalsePositiveRate(float Rate) { + assert(Rate > 0.0f && Rate < 1.0f); + FalsePositiveRate = Rate; + return *this; + } + + BloomFilterBuilder &setHashFunction(HashFunc Fn) { + HashFn = std::move(Fn); + return *this; + } + + BloomFilter build(ArrayRef<StringRef> Symbols) const { + assert(!Symbols.empty() && "Cannot build filter from empty symbol list."); + BloomFilter F(static_cast<uint32_t>(Symbols.size()), FalsePositiveRate, + HashFn); + for (const auto &Sym : Symbols) + F.add(Sym); + + return F; + } + +private: + float FalsePositiveRate = 0.02f; + HashFunc HashFn = [](StringRef S) -> uint32_t { + uint32_t H = 5381; + for (char C : S) + H = ((H << 5) + H) + static_cast<uint8_t>(C); // H * 33 + C + return H; + }; +}; + +namespace shared { + +template <> class SPSSerializationTraits<SPSBloomFilter, BloomFilter> { +public: + static size_t size(const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::size( + Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) { + return SPSBloomFilter::AsArgList::serialize( + OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize, + Filter.BloomShift, Filter.BloomTable); + } + + static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) { + bool IsInitialized; + uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0; + std::vector<uint64_t> BloomTable; + + if (!SPSBloomFilter::AsArgList::deserialize( + IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable)) + return false; + + Filter.Initialized = IsInitialized; + Filter.SymbolCount = SymbolCount; + Filter.BloomSize = BloomSize; + Filter.BloomShift = BloomShift; + Filter.BloomTable = std::move(BloomTable); + + return true; + } +}; + +} // end namespace shared +} // end namespace orc +} // end namespace llvm +#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index ef0fed4f41556..e6058612de4b7 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -20,7 +20,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include <mutex> -#include <type_traits> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h index 2c385de48ddf6..8f876504eaf53 100644 --- 
a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h @@ -29,7 +29,7 @@ namespace rt_bootstrap { class LLVM_ABI ExecutorSharedMemoryMapperService final : public ExecutorBootstrapService { public: - ~ExecutorSharedMemoryMapperService() override {}; + ~ExecutorSharedMemoryMapperService() override = default; Expected<std::pair<ExecutorAddr, std::string>> reserve(uint64_t Size); Expected<ExecutorAddr> initialize(ExecutorAddr Reservation, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h new file mode 100644 index 0000000000000..79cfc4832fe9a --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h @@ -0,0 +1,525 @@ +//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides support for automatically searching symbols across +// dynamic libraries that have not yet been loaded. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/Support/Path.h" + +#include <atomic> +#include <shared_mutex> + +namespace llvm { +namespace orc { + +/// Manages library metadata and state for symbol resolution. +/// +/// Tracks libraries by load state and kind (user/system), and stores +/// associated Bloom filters and hash maps to speed up symbol lookups. +/// Thread-safe for concurrent access. 
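A note on the arithmetic in SymbolFilter.h's initialize() above: it is the textbook Bloom-filter sizing M = -n * ln(p) / ln^2(2) bits, rounded up to 64-bit words, with a second probe derived from the hash shifted right by min(6, ceil(log2 n)). Usage in miniature (symbol names are arbitrary):

```cpp
#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h"
#include <cassert>

using namespace llvm;
using namespace llvm::orc;

int main() {
  BloomFilterBuilder Builder;
  Builder.setFalsePositiveRate(0.02f); // ~2% false positives
  StringRef Syms[] = {"printf", "malloc", "_ZN3foo3barEv"};
  BloomFilter Filter = Builder.build(Syms);
  assert(Filter.mayContain("malloc"));   // Bloom filters never miss members
  (void)Filter.mayContain("absent_sym"); // true only ~2% of the time
  return 0;
}
```

The LibraryManager class that follows stores one such filter per library so that symbol probes can skip libraries without opening them.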
+class LibraryManager { +public: + enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 }; + + class LibraryInfo { + public: + LibraryInfo(const LibraryInfo &) = delete; + LibraryInfo &operator=(const LibraryInfo &) = delete; + + LibraryInfo(std::string FilePath, LibState S, PathType K, + std::optional<BloomFilter> Filter = std::nullopt) + : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) { + } + + StringRef getBasePath() const { return sys::path::parent_path(FilePath); } + StringRef getFileName() const { return sys::path::filename(FilePath); } + + std::string getFullPath() const { return FilePath; } + + void setFilter(BloomFilter F) { + std::lock_guard<std::shared_mutex> Lock(Mtx); + if (Filter) + return; + Filter.emplace(std::move(F)); + } + + void ensureFilterBuilt(const BloomFilterBuilder &FB, + ArrayRef<StringRef> Symbols) { + std::lock_guard<std::shared_mutex> Lock(Mtx); + if (Filter) + return; + Filter.emplace(FB.build(Symbols)); + } + + bool mayContain(StringRef Symbol) const { + assert(hasFilter()); + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Filter->mayContain(Symbol); + } + + bool hasFilter() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Filter.has_value(); + } + + LibState getState() const { return S.load(); } + PathType getKind() const { return K; } + + void setState(LibState s) { S.store(s); } + + bool operator==(const LibraryInfo &other) const { + return FilePath == other.FilePath; + } + + private: + std::string FilePath; + std::atomic<LibState> S; + PathType K; + std::optional<BloomFilter> Filter; + mutable std::shared_mutex Mtx; + }; + + /// A read-only view of libraries filtered by state and kind. + /// + /// Lets you loop over only the libraries in a map that match a given State + /// and PathType. 
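Concretely, the intended use of the FilteredView defined next looks like the following sketch (the helper name is ours; PathType comes from LibraryScanner.h further down):

```cpp
#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::orc;

// Hypothetical helper: list user-path libraries that have not been loaded
// yet, the way the resolver's search plan walks them.
static void dumpUnloadedUserLibs(LibraryManager &Mgr) {
  for (const auto &Lib :
       Mgr.getView(LibraryManager::LibState::Unloaded, PathType::User))
    dbgs() << Lib->getFullPath() << "\n";
}
```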
+ class FilteredView { + public: + using Map = StringMap<std::shared_ptr<LibraryInfo>>; + using Iterator = Map::const_iterator; + class FilterIterator { + public: + FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K) + : it(it_), end(end_), S(S), K(K) { + advance(); + } + + bool operator!=(const FilterIterator &other) const { + return it != other.it; + } + + const std::shared_ptr<LibraryInfo> &operator*() const { + return it->second; + } + + FilterIterator &operator++() { + ++it; + advance(); + return *this; + } + + private: + void advance() { + for (; it != end; ++it) + if (it->second->getState() == S && it->second->getKind() == K) + break; + } + Iterator it; + Iterator end; + LibState S; + PathType K; + }; + FilteredView(Iterator begin, Iterator end, LibState s, PathType k) + : mapBegin(begin), mapEnd(end), state(s), kind(k) {} + + FilterIterator begin() const { + return FilterIterator(mapBegin, mapEnd, state, kind); + } + + FilterIterator end() const { + return FilterIterator(mapEnd, mapEnd, state, kind); + } + + private: + Iterator mapBegin; + Iterator mapEnd; + LibState state; + PathType kind; + }; + +private: + StringMap<std::shared_ptr<LibraryInfo>> Libraries; + mutable std::shared_mutex Mtx; + +public: + using LibraryVisitor = std::function<bool(const LibraryInfo &)>; + + LibraryManager() = default; + ~LibraryManager() = default; + + bool addLibrary(std::string Path, PathType Kind, + std::optional<BloomFilter> Filter = std::nullopt) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (Libraries.count(Path) > 0) + return false; + // Build the entry before moving Path into the key: the previous + // one-liner both moved from and read Path in the same call, with + // unspecified evaluation order. + auto Entry = std::make_shared<LibraryInfo>(Path, LibState::Unloaded, + Kind, std::move(Filter)); + Libraries.insert({std::move(Path), std::move(Entry)}); + return true; + } + + bool hasLibrary(StringRef Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return Libraries.count(Path) > 0; + } + + void removeLibrary(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + auto I = Libraries.find(Path); + if (I == Libraries.end()) + return; + Libraries.erase(I); + } + + void markLoaded(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Loaded); + } + + void markQueried(StringRef Path) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + It->second->setState(LibState::Queried); + } + + std::shared_ptr<LibraryInfo> getLibrary(StringRef Path) { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path); It != Libraries.end()) + return It->second; + return nullptr; + } + + FilteredView getView(LibState S, PathType K) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + return FilteredView(Libraries.begin(), Libraries.end(), S, K); + } + + using LibraryFilterFn = std::function<bool(const LibraryInfo &)>; + void getLibraries(LibState S, PathType K, + std::vector<std::shared_ptr<LibraryInfo>> &Outs, + LibraryFilterFn Filter = nullptr) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[_, Entry] : Libraries) { + const auto &Info = *Entry; + if (Info.getKind() != K || Info.getState() != S) + continue; + if (Filter && !Filter(Info)) + continue; + Outs.push_back(Entry); + } + } + + void forEachLibrary(const LibraryVisitor &visitor) const { + std::unique_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[_, entry] : Libraries) { + if (!visitor(*entry)) + break; + } + } + + bool isLoaded(StringRef
Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Loaded; + return false; + } + + bool isQueried(StringRef Path) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + if (auto It = Libraries.find(Path.str()); It != Libraries.end()) + return It->second->getState() == LibState::Queried; + return false; + } + + void clear() { + std::unique_lock<std::shared_mutex> Lock(Mtx); + Libraries.clear(); + } +}; + +using LibraryInfo = LibraryManager::LibraryInfo; + +struct SearchPlanEntry { + LibraryManager::LibState State; // Loaded, Queried, Unloaded + PathType Type; // User, System +}; + +struct SearchPolicy { + std::vector<SearchPlanEntry> Plan; + + static SearchPolicy defaultPlan() { + return {{{LibraryManager::LibState::Loaded, PathType::User}, + {LibraryManager::LibState::Queried, PathType::User}, + {LibraryManager::LibState::Unloaded, PathType::User}, + {LibraryManager::LibState::Loaded, PathType::System}, + {LibraryManager::LibState::Queried, PathType::System}, + {LibraryManager::LibState::Unloaded, PathType::System}}}; + } +}; + +struct SymbolEnumeratorOptions { + enum Filter : uint32_t { + None = 0, + IgnoreUndefined = 1 << 0, + IgnoreWeak = 1 << 1, + IgnoreIndirect = 1 << 2, + IgnoreHidden = 1 << 3, + IgnoreNonGlobal = 1 << 4 + }; + + static SymbolEnumeratorOptions defaultOptions() { + return {Filter::IgnoreUndefined | Filter::IgnoreWeak | + Filter::IgnoreIndirect}; + } + uint32_t FilterFlags = Filter::None; +}; + +struct SearchConfig { + SearchPolicy Policy; + SymbolEnumeratorOptions Options; + + SearchConfig() + : Policy(SearchPolicy::defaultPlan()), // default plan + Options(SymbolEnumeratorOptions::defaultOptions()) {} +}; + +/// Scans libraries and resolves Symbols across user and system paths. +/// +/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks +/// symbol resolution results through SymbolQuery. Thread-safe and uses +/// LibraryScanHelper for efficient path resolution and caching. +class LibraryResolver { + friend class LibraryResolutionDriver; + +public: + class SymbolEnumerator { + public: + enum class EnumerateResult { Continue, Stop, Error }; + + using OnEachSymbolFn = std::function<EnumerateResult(StringRef Sym)>; + + static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts); + }; + + /// Tracks a set of symbols and the libraries where they are resolved. + /// + /// SymbolQuery is used to keep track of which symbols have been resolved + /// to which libraries. It supports concurrent read/write access using a + /// shared mutex, allowing multiple readers or a single writer at a time. + class SymbolQuery { + public: + /// Holds the result for a single symbol. 
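The query lifecycle that the members below implement, in miniature (paths and symbol names are made up):

```cpp
#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

using namespace llvm;
using namespace llvm::orc;

int main() {
  LibraryResolver::SymbolQuery Q({"printf", "dlopen"});
  Q.resolve("printf", "/usr/lib/libc.so.6"); // found while scanning libc
  assert(Q.hasUnresolved() && !Q.allResolved());
  if (auto Lib = Q.getResolvedLib("printf"))
    outs() << "printf -> " << *Lib << "\n";
  return 0;
}
```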
+ struct Result { + std::string Name; + std::string ResolvedLibPath; + }; + + private: + mutable std::shared_mutex Mtx; + StringMap<Result> Results; + std::atomic<size_t> ResolvedCount = 0; + + public: + explicit SymbolQuery(const std::vector<std::string> &Symbols) { + for (const auto &s : Symbols) { + if (!Results.contains(s)) + Results.insert({s, Result{s, ""}}); + } + } + + SmallVector<StringRef> getUnresolvedSymbols() const { + SmallVector<StringRef> Unresolved; + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &[name, res] : Results) { + if (res.ResolvedLibPath.empty()) + Unresolved.push_back(name); + } + return Unresolved; + } + + void resolve(StringRef Sym, const std::string &LibPath) { + std::unique_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && It->second.ResolvedLibPath.empty()) { + It->second.ResolvedLibPath = LibPath; + ResolvedCount.fetch_add(1, std::memory_order_relaxed); + } + } + + bool allResolved() const { + return ResolvedCount.load(std::memory_order_relaxed) == Results.size(); + } + + bool hasUnresolved() const { + return ResolvedCount.load(std::memory_order_relaxed) < Results.size(); + } + + std::optional<StringRef> getResolvedLib(StringRef Sym) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym); + if (It != Results.end() && !It->second.ResolvedLibPath.empty()) + return StringRef(It->second.ResolvedLibPath); + return std::nullopt; + } + + bool isResolved(StringRef Sym) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + auto It = Results.find(Sym.str()); + return It != Results.end() && !It->second.ResolvedLibPath.empty(); + } + + std::vector<const Result *> getAllResults() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + std::vector<const Result *> Out; + Out.reserve(Results.size()); + for (const auto &[_, res] : Results) + Out.push_back(&res); + return Out; + } + }; + + struct Setup { + std::vector<std::string> BasePaths; + std::shared_ptr<LibraryPathCache> Cache; + std::shared_ptr<PathResolver> PResolver; + + size_t ScanBatchSize = 0; + + LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) { + return true; + }; + + BloomFilterBuilder FilterBuilder = BloomFilterBuilder(); + + static Setup + create(std::vector<std::string> BasePaths, + std::shared_ptr<LibraryPathCache> existingCache = nullptr, + std::shared_ptr<PathResolver> existingResolver = nullptr, + LibraryScanner::ShouldScanFn customShouldScan = nullptr) { + Setup S; + S.BasePaths = std::move(BasePaths); + + S.Cache = + existingCache ? existingCache : std::make_shared<LibraryPathCache>(); + + S.PResolver = existingResolver ? existingResolver + : std::make_shared<PathResolver>(S.Cache); + + if (customShouldScan) + S.ShouldScanCall = std::move(customShouldScan); + + return S; + } + }; + + LibraryResolver() = delete; + explicit LibraryResolver(const Setup &S); + ~LibraryResolver() = default; + + using OnSearchComplete = unique_function<void(SymbolQuery &)>; + + void dump() { + int i = 0; + LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool { + dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:" + << " ({Type : (" + << (Lib.getKind() == PathType::User ? "User" : "System") + << ") }, { State : " + << (Lib.getState() == LibraryManager::LibState::Loaded + ? 
"Loaded" + : "Unloaded") + << "})\n"; + return true; + }); + } + + void searchSymbolsInLibraries(std::vector<std::string> &SymList, + OnSearchComplete OnComplete, + const SearchConfig &Config = SearchConfig()); + +private: + bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0); + void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q, + const SymbolEnumeratorOptions &Opts); + bool + symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym, + std::vector<std::string> *MatchedSymbols = nullptr); + + bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName, + std::vector<std::string> *AllSymbols, + const SymbolEnumeratorOptions &Opts); + + std::shared_ptr<LibraryPathCache> LibPathCache; + std::shared_ptr<PathResolver> LibPathResolver; + LibraryScanHelper ScanHelper; + BloomFilterBuilder FB; + LibraryManager LibMgr; + LibraryScanner::ShouldScanFn ShouldScanCall; + size_t scanBatchSize; +}; + +using SymbolEnumerator = LibraryResolver::SymbolEnumerator; +using SymbolQuery = LibraryResolver::SymbolQuery; +using EnumerateResult = SymbolEnumerator::EnumerateResult; + +class LibraryResolutionDriver { +public: + static std::unique_ptr<LibraryResolutionDriver> + create(const LibraryResolver::Setup &S); + + void addScanPath(const std::string &Path, PathType Kind); + bool markLibraryLoaded(StringRef Path); + bool markLibraryUnLoaded(StringRef Path); + bool isLibraryLoaded(StringRef Path) const { + return LR->LibMgr.isLoaded(Path); + } + + void resetAll() { + LR->LibMgr.clear(); + LR->ScanHelper.resetToScan(); + LR->LibPathCache->clear(); + } + + void scanAll(size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PathType::User, BatchSize); + LR->scanLibrariesIfNeeded(PathType::System, BatchSize); + } + + void scan(PathType PK, size_t BatchSize = 0) { + LR->scanLibrariesIfNeeded(PK, BatchSize); + } + + void resolveSymbols(std::vector<std::string> Symbols, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config = SearchConfig()); + + ~LibraryResolutionDriver() = default; + +private: + LibraryResolutionDriver(std::unique_ptr<LibraryResolver> L) + : LR(std::move(L)) {} + + std::unique_ptr<LibraryResolver> LR; +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h new file mode 100644 index 0000000000000..61aefbda35337 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h @@ -0,0 +1,472 @@ +//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides functionality for scanning dynamic (shared) libraries. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H +#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H + +#include "llvm/ADT/FunctionExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/StringSaver.h" + +#include <atomic> +#include <mutex> +#include <queue> +#include <shared_mutex> +#include <string> + +namespace llvm { +namespace orc { + +class LibraryManager; + +class LibraryPathCache { + friend class PathResolver; + +public: + LibraryPathCache() = default; + + void clear(bool isRealPathCache = false) { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.clear(); + if (isRealPathCache) { + RealPathCache.clear(); +#ifndef _WIN32 + ReadlinkCache.clear(); + LstatCache.clear(); +#endif + } + } + + void markSeen(const std::string &CanonPath) { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.insert(CanonPath); + } + + bool hasSeen(StringRef CanonPath) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + return Seen.contains(CanonPath); + } + + bool hasSeenOrMark(StringRef CanonPath) { + std::string s = CanonPath.str(); + { + std::shared_lock<std::shared_mutex> lock(Mtx); + if (Seen.contains(s)) + return true; + } + { + std::unique_lock<std::shared_mutex> lock(Mtx); + Seen.insert(s); + } + return false; + } + +private: + mutable std::shared_mutex Mtx; + + struct PathInfo { + std::string canonicalPath; + std::error_code ErrnoCode; + }; + + void insert_realpath(StringRef Path, const PathInfo &Info) { + std::unique_lock<std::shared_mutex> lock(Mtx); + RealPathCache.insert({Path, Info}); + } + + std::optional<PathInfo> read_realpath(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = RealPathCache.find(Path); + if (It != RealPathCache.end()) + return It->second; + + return std::nullopt; + } + + StringSet<> Seen; + StringMap<PathInfo> RealPathCache; + +#ifndef _WIN32 + StringMap<std::string> ReadlinkCache; + StringMap<mode_t> LstatCache; + + void insert_link(StringRef Path, const std::string &s) { + std::unique_lock<std::shared_mutex> lock(Mtx); + ReadlinkCache.insert({Path, s}); + } + + std::optional<std::string> read_link(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = ReadlinkCache.find(Path); + if (It != ReadlinkCache.end()) + return It->second; + + return std::nullopt; + } + + void insert_lstat(StringRef Path, mode_t m) { + std::unique_lock<std::shared_mutex> lock(Mtx); + LstatCache.insert({Path, m}); + } + + std::optional<mode_t> read_lstat(StringRef Path) const { + std::shared_lock<std::shared_mutex> lock(Mtx); + auto It = LstatCache.find(Path); + if (It != LstatCache.end()) + return It->second; + + return std::nullopt; + } + +#endif +}; + +/// Resolves file system paths with optional caching of results. +/// +/// Supports lstat, readlink, and realpath operations. Can resolve paths +/// relative to a base and handle symbolic links. Caches results to reduce +/// repeated system calls when enabled. 
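One subtlety in hasSeenOrMark above: the shared-lock check and the unique-lock insertion are two separate critical sections, so two threads racing on the same path may both observe "not seen" and both return false. The set itself stays consistent (StringSet::insert is idempotent), but a caller that needs exactly-once semantics could instead hold the writer lock across the whole operation and use the insertion result. A minimal sketch of such a member, not part of this patch:

  bool hasSeenOrMarkOnce(StringRef CanonPath) {
    std::unique_lock<std::shared_mutex> lock(Mtx);
    // StringSet::insert returns {iterator, inserted}; exactly one caller
    // observes inserted == true for a given key.
    return !Seen.insert(CanonPath).second;
  }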
+class PathResolver { +private: + std::shared_ptr<LibraryPathCache> LibPathCache; + +public: + PathResolver(std::shared_ptr<LibraryPathCache> cache) + : LibPathCache(std::move(cache)) {} + + std::optional<std::string> resolve(StringRef Path, std::error_code &ec) { + return realpathCached(Path, ec); + } +#ifndef _WIN32 + mode_t lstatCached(StringRef Path); + std::optional<std::string> readlinkCached(StringRef Path); +#endif + std::optional<std::string> realpathCached(StringRef Path, std::error_code &ec, + StringRef base = "", + bool baseIsResolved = false, + long symloopLevel = 40); +}; + +/// Performs placeholder substitution in dynamic library paths. +/// +/// Configures known placeholders (like @loader_path) and replaces them +/// in input paths with their resolved values. +class DylibSubstitutor { +public: + void configure(StringRef loaderPath); + + std::string substitute(StringRef input) const { + for (const auto &[ph, value] : Placeholders) { + if (input.starts_with_insensitive(ph)) + return (Twine(value) + input.drop_front(ph.size())).str(); + } + return input.str(); + } + +private: + StringMap<std::string> Placeholders; +}; + +/// Validates and normalizes dynamic library paths. +/// +/// Uses a `PathResolver` to resolve paths to their canonical form and +/// checks whether they point to valid shared libraries. +class DylibPathValidator { +public: + DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {} + + static bool isSharedLibrary(StringRef Path); + + std::optional<std::string> normalize(StringRef Path) const { + std::error_code ec; + auto real = LibPathResolver.resolve(Path, ec); + if (!real || ec) + return std::nullopt; + + return real; + } + + /// Validate the given path as a shared library. + std::optional<std::string> validate(StringRef Path) const { + auto realOpt = normalize(Path); + if (!realOpt) + return std::nullopt; + + if (!isSharedLibrary(*realOpt)) + return std::nullopt; + + return realOpt; + } + +private: + PathResolver &LibPathResolver; +}; + +enum class SearchPathType { + RPath, + UsrOrSys, + RunPath, +}; + +struct SearchPathConfig { + ArrayRef<StringRef> Paths; + SearchPathType type; +}; + +class SearchPathResolver { +public: + SearchPathResolver(const SearchPathConfig &Cfg, + StringRef PlaceholderPrefix = "") + : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) { + for (auto &path : Cfg.Paths) + Paths.emplace_back(path.str()); + } + + std::optional<std::string> resolve(StringRef libStem, + const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const; + SearchPathType searchPathType() const { return Kind; } + +private: + std::vector<std::string> Paths; + SearchPathType Kind; + std::string PlaceholderPrefix; +}; + +class DylibResolverImpl { +public: + DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator, + std::vector<SearchPathResolver> Resolvers) + : Substitutor(std::move(Substitutor)), Validator(Validator), + Resolvers(std::move(Resolvers)) {} + + std::optional<std::string> resolve(StringRef Stem, + bool VariateLibStem = false) const; + +private: + std::optional<std::string> tryWithExtensions(StringRef libstem) const; + + DylibSubstitutor Substitutor; + DylibPathValidator &Validator; + std::vector<SearchPathResolver> Resolvers; +}; + +class DylibResolver { +public: + DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {} + + void configure(StringRef loaderPath, + ArrayRef<SearchPathConfig> SearchPathCfg) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + + 
std::vector<SearchPathResolver> Resolvers; + for (const auto &cfg : SearchPathCfg) { + Resolvers.emplace_back(cfg, + cfg.type == SearchPathType::RPath ? "@rpath" : ""); + } + + impl_ = std::make_unique<DylibResolverImpl>( + std::move(Substitutor), Validator, std::move(Resolvers)); + } + + std::optional<std::string> resolve(StringRef libStem, + bool VariateLibStem = false) const { + if (!impl_) + return std::nullopt; + return impl_->resolve(libStem, VariateLibStem); + } + + static std::string resolvelinkerFlag(StringRef libStem, + StringRef loaderPath) { + DylibSubstitutor Substitutor; + Substitutor.configure(loaderPath); + return Substitutor.substitute(libStem); + } + +private: + DylibPathValidator &Validator; + std::unique_ptr<DylibResolverImpl> impl_; +}; + +enum class PathType : uint8_t { User, System, Unknown }; + +enum class ScanState : uint8_t { NotScanned, Scanning, Scanned }; + +struct LibrarySearchPath { + std::string BasePath; // Canonical base directory path + PathType Kind; // User or System + std::atomic<ScanState> State; + + LibrarySearchPath(std::string Base, PathType K) + : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {} +}; + +/// Scans and tracks libraries for symbol resolution. +/// +/// Maintains a list of library paths to scan, caches scanned units, +/// and resolves paths canonically for consistent tracking. +class LibraryScanHelper { +public: + explicit LibraryScanHelper(const std::vector<std::string> &SPaths, + std::shared_ptr<LibraryPathCache> LibPathCache, + std::shared_ptr<PathResolver> LibPathResolver) + : LibPathCache(std::move(LibPathCache)), + LibPathResolver(std::move(LibPathResolver)) { + DEBUG_WITH_TYPE( + "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : " + << SPaths.size() << "\n";); + for (const auto &p : SPaths) + addBasePath(p); + } + + void + addBasePath(const std::string &P, + PathType Kind = + PathType::Unknown); // Add a canonical directory for scanning + std::vector<std::shared_ptr<LibrarySearchPath>> + getNextBatch(PathType Kind, size_t batchSize); + + bool leftToScan(PathType K) const; + void resetToScan(); + + bool isTrackedBasePath(StringRef P) const; + std::vector<std::shared_ptr<LibrarySearchPath>> getAllUnits() const; + + SmallVector<StringRef> getSearchPaths() const { + SmallVector<StringRef> SearchPaths; + for (const auto &[_, SP] : LibSearchPaths) + SearchPaths.push_back(SP->BasePath); + return SearchPaths; + } + + PathResolver &getPathResolver() const { return *LibPathResolver; } + + LibraryPathCache &getCache() const { return *LibPathCache; } + + bool hasSeenOrMark(StringRef P) const { + return LibPathCache->hasSeenOrMark(P); + } + + std::optional<std::string> resolve(StringRef P, std::error_code &ec) const { + return LibPathResolver->resolve(P.str(), ec); + } + +private: + std::string resolveCanonical(StringRef P, std::error_code &ec) const; + PathType classifyKind(StringRef P) const; + + mutable std::shared_mutex Mtx; + std::shared_ptr<LibraryPathCache> LibPathCache; + std::shared_ptr<PathResolver> LibPathResolver; + + StringMap<std::shared_ptr<LibrarySearchPath>> + LibSearchPaths; // key: canonical path + std::deque<StringRef> UnscannedUsr; + std::deque<StringRef> UnscannedSys; +}; + +/// Loads an object file and provides access to it. +/// +/// Owns the underlying `ObjectFile` and ensures it is valid. +/// Any errors encountered during construction are stored and +/// returned when attempting to access the file. 
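A rough sketch of how the helper is meant to be driven (the LibraryScanner that consumes these batches is declared just below; the batch size here is arbitrary, and the loop assumes scanNext advances each directory's ScanState so that leftToScan eventually reports false):

static void scanUserDirs(llvm::orc::LibraryScanHelper &Helper,
                         llvm::orc::LibraryManager &Mgr) {
  llvm::orc::LibraryScanner Scanner(Helper, Mgr);
  while (Helper.leftToScan(llvm::orc::PathType::User))
    Scanner.scanNext(llvm::orc::PathType::User, /*batchSize=*/8);
}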
+class ObjectFileLoader { +public: + /// Construct an object file loader from the given path. + explicit ObjectFileLoader(StringRef Path) { + auto ObjOrErr = loadObjectFileWithOwnership(Path); + if (ObjOrErr) + Obj = std::move(*ObjOrErr); + else { + consumeError(std::move(Err)); + Err = ObjOrErr.takeError(); + } + } + + ObjectFileLoader(const ObjectFileLoader &) = delete; + ObjectFileLoader &operator=(const ObjectFileLoader &) = delete; + + ObjectFileLoader(ObjectFileLoader &&) = default; + ObjectFileLoader &operator=(ObjectFileLoader &&) = default; + + /// Get the loaded object file, or return an error if loading failed. + Expected<object::ObjectFile &> getObjectFile() { + if (Err) + return std::move(Err); + return *Obj.getBinary(); + } + + static bool isArchitectureCompatible(const object::ObjectFile &Obj); + +private: + object::OwningBinary<object::ObjectFile> Obj; + Error Err = Error::success(); + + static Expected<object::OwningBinary<object::ObjectFile>> + loadObjectFileWithOwnership(StringRef FilePath); +}; + +/// Scans libraries, resolves dependencies, and registers them. +class LibraryScanner { +public: + using ShouldScanFn = std::function<bool(StringRef)>; + + LibraryScanner( + LibraryScanHelper &H, LibraryManager &LibMgr, + ShouldScanFn ShouldScanCall = [](StringRef path) { return true; }) + : ScanHelper(H), LibMgr(LibMgr), + ShouldScanCall(std::move(ShouldScanCall)) {} + + void scanNext(PathType Kind, size_t batchSize = 1); + + /// Dependency info for a library. + struct LibraryDepsInfo { + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; + + SmallVector<StringRef, 2> rpath; + SmallVector<StringRef, 2> runPath; + SmallVector<StringRef, 4> deps; + bool isPIE = false; + + void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); } + + void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); } + + void addDep(StringRef s) { deps.push_back(Saver.save(s)); } + }; + +private: + LibraryScanHelper &ScanHelper; + LibraryManager &LibMgr; + ShouldScanFn ShouldScanCall; + + std::optional<std::string> shouldScan(StringRef FilePath); + Expected<LibraryDepsInfo> extractDeps(StringRef FilePath); + + void handleLibrary(StringRef P, PathType K, int level = 1); + + void scanBaseDir(std::shared_ptr<LibrarySearchPath> U); +}; + +using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index d7f0e3a3d49da..67ebafc89cf99 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -50,7 +50,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <iterator> #include <optional> #include <tuple> @@ -242,7 +241,7 @@ ENUM(MotionExpectation, Present); // V5.2: [15.9.1] `task-dependence-type` modifier ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, Source); -ENUM(Prescriptiveness, Strict, Fallback); +ENUM(Prescriptiveness, Strict); template <typename I, typename E> // struct LoopIterationT { @@ -446,7 +445,12 @@ struct CollapseT { N v; }; -// V5.2: [15.8.3] `extended-atomic` clauses +// [6.0:266] +template <typename T, typename I, typename E> // +struct CollectorT { + using IncompleteTrait = std::true_type; +}; + template <typename T, typename I, typename E> // struct CompareT { using EmptyTrait = std::true_type; @@ -587,10 
+591,10 @@ struct DynamicAllocatorsT { template <typename T, typename I, typename E> // struct DynGroupprivateT { ENUM(AccessGroup, Cgroup); - using Prescriptiveness = type::Prescriptiveness; + ENUM(Fallback, Abort, Default_Mem, Null); using Size = E; using TupleTrait = std::true_type; - std::tuple<OPT(AccessGroup), OPT(Prescriptiveness), Size> t; + std::tuple<OPT(AccessGroup), OPT(Fallback), Size> t; }; // V5.2: [5.8.4] `enter` clause @@ -736,6 +740,12 @@ struct IndirectT { OPT(InvokedByFptr) v; }; +// [6.0:265-266] +template <typename T, typename I, typename E> // +struct InductorT { + using IncompleteTrait = std::true_type; +}; + // V5.2: [14.1.2] `init` clause template <typename T, typename I, typename E> // struct InitT { @@ -1324,8 +1334,9 @@ using EmptyClausesT = std::variant< template <typename T, typename I, typename E> using IncompleteClausesT = - std::variant<AdjustArgsT<T, I, E>, AppendArgsT<T, I, E>, GraphIdT<T, I, E>, - GraphResetT<T, I, E>, MatchT<T, I, E>, OtherwiseT<T, I, E>, + std::variant<AdjustArgsT<T, I, E>, AppendArgsT<T, I, E>, + CollectorT<T, I, E>, GraphIdT<T, I, E>, GraphResetT<T, I, E>, + InductorT<T, I, E>, MatchT<T, I, E>, OtherwiseT<T, I, E>, ReplayableT<T, I, E>, TransparentT<T, I, E>, WhenT<T, I, E>>; template <typename T, typename I, typename E> diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 6d6eb5cda52de..36b49e69650d8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -68,17 +68,23 @@ find_unique(Container &&container, Predicate &&pred) { namespace tomp { -// ClauseType - Either instance of ClauseT, or a type derived from ClauseT. -// -// This is the clause representation in the code using this infrastructure. -// -// HelperType - A class that implements two member functions: +enum struct ErrorCode : int { + NoLeafAllowing, // No leaf that allows this clause + NoLeafPrivatizing, // No leaf that has a privatizing clause + InvalidDirNameMod, // Invalid directive name modifier + RedModNotApplied, // Reduction modifier not applied +}; + +// ClauseType: Either an instance of ClauseT, or a type derived from ClauseT. +// This is the clause representation in the code using this infrastructure. // +// HelperType: A class that implements two member functions: // // Return the base object of the given object, if any. // std::optional<Object> getBaseObject(const Object &object) const // // Return the iteration variable of the outermost loop associated // // with the construct being worked on, if any. // std::optional<Object> getLoopIterVar() const + template <typename ClauseType, typename HelperType> struct ConstructDecompositionT { using ClauseTy = ClauseType; @@ -115,10 +121,16 @@ struct ConstructDecompositionT { } tomp::ListT<DirectiveWithClauses<ClauseType>> output; + llvm::SmallVector<std::pair<const ClauseType *, ErrorCode>> errors; private: bool split(); + bool error(const ClauseTy *node, ErrorCode ec) { + errors.emplace_back(node, ec); + return false; + } + struct LeafReprInternal { llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown; tomp::type::ListT<const ClauseTy *> clauses; @@ -181,66 +193,71 @@ struct ConstructDecompositionT { std::enable_if_t<llvm::remove_cvref_t<U>::UnionTrait::value, void> addClauseSymsToMap(U &&item, const ClauseTy *); - // Apply a clause to the only directive that allows it. 
If there are no
+  // Apply the clause to the only directive that allows it. If there are no
   // directives that allow it, or if there is more than one, do not apply
   // anything and return false, otherwise return true.
   bool applyToUnique(const ClauseTy *node);

-  // Apply a clause to the first directive in given range that allows it.
+  // Apply the clause to the first directive in given range that allows it.
   // If such a directive does not exist, return false, otherwise return true.
   template <typename Iterator>
   bool applyToFirst(const ClauseTy *node, llvm::iterator_range<Iterator> range);

-  // Apply a clause to the innermost directive that allows it. If such a
+  // Apply the clause to the innermost directive that allows it. If such a
   // directive does not exist, return false, otherwise return true.
   bool applyToInnermost(const ClauseTy *node);

-  // Apply a clause to the outermost directive that allows it. If such a
+  // Apply the clause to the outermost directive that allows it. If such a
   // directive does not exist, return false, otherwise return true.
   bool applyToOutermost(const ClauseTy *node);

+  // Apply the clause to all directives that allow it, and which satisfy
+  // the predicate: bool shouldApply(LeafReprInternal). If no such
+  // directives exist, return false, otherwise return true.
   template <typename Predicate>
   bool applyIf(const ClauseTy *node, Predicate shouldApply);

+  // Apply the clause to all directives that allow it. If no such directives
+  // exist, return false, otherwise return true.
   bool applyToAll(const ClauseTy *node);

   template <typename Clause>
   bool applyClause(Clause &&clause, const ClauseTy *node);

+  bool applyClause(const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool applyClause(const tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
+  bool applyClause(const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool
-  applyClause(const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+  applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
               const ClauseTy *);
+  bool applyClause(const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+                   const ClauseTy *);
   bool applyClause(const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+  bool applyClause(const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
   bool applyClause(const tomp::clause::ReductionT<TypeTy, IdTy, ExprTy> &clause,
                    const ClauseTy *);
-  bool applyClause(const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause,
-                   const ClauseTy *);
-  bool applyClause(const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause,
-                   const ClauseTy *);
-  bool applyClause(const 
tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, + bool applyClause(const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *); bool - applyClause(const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause, + applyClause(const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *); - bool applyClause(const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *); uint32_t version; llvm::omp::Directive construct; @@ -452,10 +469,39 @@ bool ConstructDecompositionT<C, H>::applyClause(Specific &&specific, // S Some clauses are permitted only on a single leaf construct of the // S combined or composite construct, in which case the effect is as if // S the clause is applied to that specific construct. (p339, 31-33) - if (applyToUnique(node)) - return true; + if (!applyToUnique(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; +} - return false; +// --- Specific clauses ----------------------------------------------- + +// ALLOCATE +// [5.2:178:7-9] +// Directives: allocators, distribute, do, for, parallel, scope, sections, +// single, target, task, taskgroup, taskloop, teams +// +// [5.2:340:33-35] +// (33) The effect of the allocate clause is as if it is applied to all leaf +// constructs that permit the clause and to which a data-sharing attribute +// clause that may create a private copy of the same list item is applied. +template <typename C, typename H> +bool ConstructDecompositionT<C, H>::applyClause( + const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause, + const ClauseTy *node) { + // This one needs to be applied at the end, once we know which clauses are + // assigned to which leaf constructs. + + // [5.2:340:33] + bool applied = applyIf(node, [&](const auto &leaf) { + return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { + return llvm::omp::isPrivatizingClause(n->id); + }); + }); + + if (!applied) + return error(node, ErrorCode::NoLeafPrivatizing); + return true; } // COLLAPSE @@ -469,33 +515,26 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( const tomp::clause::CollapseT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // Apply "collapse" to the innermost directive. If it's not one that - // allows it flag an error. - if (!leafs.empty()) { - auto &last = leafs.back(); - - if (llvm::omp::isAllowedClauseForDirective(last.id, node->id, version)) { - last.clauses.push_back(node); - return true; - } - } - - return false; + if (!applyToInnermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } -// PRIVATE -// [5.2:111:5-7] -// Directives: distribute, do, for, loop, parallel, scope, sections, simd, -// single, target, task, taskloop, teams +// DEFAULT +// [5.2:109:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:340:1-2] -// (1) The effect of the 1 private clause is as if it is applied only to the -// innermost leaf construct that permits it. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - return applyToInnermost(node); + // [5.2:340:31] + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } // FIRSTPRIVATE @@ -623,7 +662,49 @@ bool ConstructDecompositionT<C, H>::applyClause( applied = true; } - return applied; + if (!applied) + return error(node, ErrorCode::NoLeafAllowing); + return true; +} + +// IF +// [5.2:72:7-9] +// Directives: cancel, parallel, simd, target, target data, target enter data, +// target exit data, target update, task, taskloop +// +// [5.2:72:15-18] +// (15) For combined or composite constructs, the if clause only applies to the +// semantics of the construct named in the directive-name-modifier. +// (16) For a combined or composite construct, if no directive-name-modifier is +// specified then the if clause applies to all constituent constructs to which +// an if clause can apply. +template <typename C, typename H> +bool ConstructDecompositionT<C, H>::applyClause( + const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause, + const ClauseTy *node) { + using DirectiveNameModifier = + typename clause::IfT<TypeTy, IdTy, ExprTy>::DirectiveNameModifier; + using IfExpression = typename clause::IfT<TypeTy, IdTy, ExprTy>::IfExpression; + auto &modifier = std::get<std::optional<DirectiveNameModifier>>(clause.t); + + if (modifier) { + llvm::omp::Directive dirId = *modifier; + auto *unmodified = + makeClause(llvm::omp::Clause::OMPC_if, + tomp::clause::IfT<TypeTy, IdTy, ExprTy>{ + {/*DirectiveNameModifier=*/std::nullopt, + /*IfExpression=*/std::get<IfExpression>(clause.t)}}); + + if (auto *hasDir = findDirective(dirId)) { + hasDir->clauses.push_back(unmodified); + return true; + } + return error(node, ErrorCode::InvalidDirNameMod); + } + + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } // LASTPRIVATE @@ -649,12 +730,9 @@ template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( const tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - bool applied = false; - // [5.2:340:21] - applied = applyToAll(node); - if (!applied) - return false; + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); auto inFirstprivate = [&](const ObjectTy &object) { if (ClauseSet *set = findClausesWith(object)) { @@ -680,7 +758,6 @@ bool ConstructDecompositionT<C, H>::applyClause( llvm::omp::Clause::OMPC_shared, tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects}); dirParallel->clauses.push_back(shared); - applied = true; } // [5.2:340:24] @@ -689,7 +766,6 @@ bool ConstructDecompositionT<C, H>::applyClause( llvm::omp::Clause::OMPC_shared, tomp::clause::SharedT<TypeTy, IdTy, ExprTy>{/*List=*/sharedObjects}); dirTeams->clauses.push_back(shared); - applied = true; } } @@ -713,56 +789,103 @@ bool ConstructDecompositionT<C, H>::applyClause( /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt, /*LocatorList=*/std::move(tofrom)}}); dirTarget->clauses.push_back(map); - applied = true; } } - return applied; + return true; } -// SHARED -// [5.2:110:5-6] -// Directives: parallel, task, taskloop, teams +// LINEAR +// [5.2:118:1-2] +// Directives: declare simd, do, for, simd // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf 
constructs that permit the clause. +// [5.2:341:15-22] +// (15.1) The effect of the linear clause is as if it is applied to the +// innermost leaf construct. +// (15.2) Additionally, if the list item is not the iteration variable of a simd +// or worksharing-loop SIMD construct, the effect on the outer leaf constructs +// is as if the list item was specified in firstprivate and lastprivate clauses +// on the combined or composite construct, with the rules specified above +// applied. +// (19) If a list item of the linear clause is the iteration variable of a simd +// or worksharing-loop SIMD construct and it is not declared in the construct, +// the effect on the outer leaf constructs is as if the list item was specified +// in a lastprivate clause on the combined or composite construct with the rules +// specified above applied. template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + // [5.2:341:15.1] + if (!applyToInnermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + + // [5.2:341:15.2], [5.2:341:19] + auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); + std::optional<ObjectTy> iterVar = helper.getLoopIterVar(); + const auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t); + + // Lists of objects that will be used to construct "firstprivate" and + // "lastprivate" clauses. + tomp::ObjectListT<IdTy, ExprTy> first, last; + + for (const ObjectTy &object : objects) { + last.push_back(object); + if (!dirSimd || !iterVar || object.id() != iterVar->id()) + first.push_back(object); + } + + if (!first.empty()) { + auto *firstp = makeClause( + llvm::omp::Clause::OMPC_firstprivate, + tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/first}); + nodes.push_back(firstp); // Appending to the main clause list. + } + if (!last.empty()) { + auto *lastp = + makeClause(llvm::omp::Clause::OMPC_lastprivate, + tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>{ + {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); + nodes.push_back(lastp); // Appending to the main clause list. + } + return true; } -// DEFAULT -// [5.2:109:5-6] -// Directives: parallel, task, taskloop, teams +// NOWAIT +// [5.2:308:11-13] +// Directives: dispatch, do, for, interop, scope, sections, single, target, +// target enter data, target exit data, target update, taskwait, workshare // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. +// [5.2:341:23] +// (23) The effect of the nowait clause is as if it is applied to the outermost +// leaf construct that permits it. template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::DefaultT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + if (!applyToOutermost(node)) + return error(node, ErrorCode::NoLeafAllowing); + return true; } -// THREAD_LIMIT -// [5.2:277:14-15] -// Directives: target, teams -// -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. 
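All of the rewritten applyClause overloads in this file now funnel failures through error(), which appends a (clause, ErrorCode) pair to the public errors vector instead of silently returning false. A caller could surface the recorded failures roughly like this (a sketch: the printing scheme is invented; getOpenMPClauseName is the existing LLVM helper):

template <typename ClauseTy, typename HelperTy>
static void dumpSplitErrors(
    const tomp::ConstructDecompositionT<ClauseTy, HelperTy> &Decomp) {
  for (const auto &[Node, EC] : Decomp.errors)
    llvm::errs() << "clause '" << llvm::omp::getOpenMPClauseName(Node->id)
                 << "' not applied (error code " << static_cast<int>(EC)
                 << ")\n";
}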
+// OMPX_ATTRIBUTE
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  // [5.2:340:31]
-  return applyToAll(node);
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
+}
+
+// OMPX_BARE
+template <typename C, typename H>
+bool ConstructDecompositionT<C, H>::applyClause(
+    const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+    const ClauseTy *node) {
+  if (!applyToOutermost(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

 // ORDER
@@ -777,33 +900,26 @@ bool ConstructDecompositionT<C, H>::applyClause(
     const tomp::clause::OrderT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
   // [5.2:340:31]
-  return applyToAll(node);
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

-// ALLOCATE
-// [5.2:178:7-9]
-// Directives: allocators, distribute, do, for, parallel, scope, sections,
-// single, target, task, taskgroup, taskloop, teams
+// PRIVATE
+// [5.2:111:5-7]
+// Directives: distribute, do, for, loop, parallel, scope, sections, simd,
+// single, target, task, taskloop, teams
 //
-// [5.2:340:33-35]
-// (33) The effect of the allocate clause is as if it is applied to all leaf
-// constructs that permit the clause and to which a data-sharing attribute
-// clause that may create a private copy of the same list item is applied.
+// [5.2:340:1-2]
+// (1) The effect of the private clause is as if it is applied only to the
+// innermost leaf construct that permits it.
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::AllocateT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::PrivateT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  // This one needs to be applied at the end, once we know which clauses are
-  // assigned to which leaf constructs.
-
-  // [5.2:340:33]
-  bool applied = applyIf(node, [&](const auto &leaf) {
-    return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) {
-      return llvm::omp::isPrivatizingClause(n->id);
-    });
-  });
-
-  return applied;
+  if (!applyToInnermost(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

 // REDUCTION
@@ -885,7 +1001,7 @@ bool ConstructDecompositionT<C, H>::applyClause(
     return dir == llvm::omp::Directive::OMPD_simd ||
            llvm::is_contained(getWorksharingLoop(), dir);
   case ReductionModifier::Task:
-    if (alreadyApplied)
+    if (alreadyApplied) // Not an error
       return false;
     // According to [5.2:135:16-18], "task" only applies to "parallel" and
     // worksharing constructs.
@@ -905,31 +1021,37 @@ bool ConstructDecompositionT<C, H>::applyClause(
                                  /*List=*/objects}});
   ReductionModifier effective = modifier.value_or(ReductionModifier::Default);
-  bool effectiveApplied = false;
+  bool modifierApplied = false;
+  bool allowingLeaf = false;
   // Walk over the leaf constructs starting from the innermost, and apply
   // the clause as required by the spec.
   for (auto &leaf : llvm::reverse(leafs)) {
     if (!llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version))
       continue;
+    // Found a leaf that allows this clause. Keep track of this for better
+    // error reporting.
+    allowingLeaf = true;
     if (!applyToParallel && &leaf == dirParallel)
       continue;
     if (!applyToTeams && &leaf == dirTeams)
       continue;
     // Some form of the clause will be applied past this point.
- if (isValidModifier(leaf.id, effective, effectiveApplied)) { + if (isValidModifier(leaf.id, effective, modifierApplied)) { // Apply clause with modifier. leaf.clauses.push_back(node); - effectiveApplied = true; + modifierApplied = true; } else { // Apply clause without modifier. leaf.clauses.push_back(unmodified); } // The modifier must be applied to some construct. - applied = effectiveApplied; + applied = modifierApplied; } + if (!allowingLeaf) + return error(node, ErrorCode::NoLeafAllowing); if (!applied) - return false; + return error(node, ErrorCode::RedModNotApplied); tomp::ObjectListT<IdTy, ExprTy> sharedObjects; llvm::transform(objects, std::back_inserter(sharedObjects), @@ -976,135 +1098,47 @@ bool ConstructDecompositionT<C, H>::applyClause( /*LocatorList=*/std::move(tofrom)}}); dirTarget->clauses.push_back(map); - applied = true; } } - return applied; -} - -// IF -// [5.2:72:7-9] -// Directives: cancel, parallel, simd, target, target data, target enter data, -// target exit data, target update, task, taskloop -// -// [5.2:72:15-18] -// (15) For combined or composite constructs, the if clause only applies to the -// semantics of the construct named in the directive-name-modifier. -// (16) For a combined or composite construct, if no directive-name-modifier is -// specified then the if clause applies to all constituent constructs to which -// an if clause can apply. -template <typename C, typename H> -bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::IfT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *node) { - using DirectiveNameModifier = - typename clause::IfT<TypeTy, IdTy, ExprTy>::DirectiveNameModifier; - using IfExpression = typename clause::IfT<TypeTy, IdTy, ExprTy>::IfExpression; - auto &modifier = std::get<std::optional<DirectiveNameModifier>>(clause.t); - - if (modifier) { - llvm::omp::Directive dirId = *modifier; - auto *unmodified = - makeClause(llvm::omp::Clause::OMPC_if, - tomp::clause::IfT<TypeTy, IdTy, ExprTy>{ - {/*DirectiveNameModifier=*/std::nullopt, - /*IfExpression=*/std::get<IfExpression>(clause.t)}}); - - if (auto *hasDir = findDirective(dirId)) { - hasDir->clauses.push_back(unmodified); - return true; - } - return false; - } - - return applyToAll(node); + return true; } -// LINEAR -// [5.2:118:1-2] -// Directives: declare simd, do, for, simd +// SHARED +// [5.2:110:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:341:15-22] -// (15.1) The effect of the linear clause is as if it is applied to the -// innermost leaf construct. -// (15.2) Additionally, if the list item is not the iteration variable of a simd -// or worksharing-loop SIMD construct, the effect on the outer leaf constructs -// is as if the list item was specified in firstprivate and lastprivate clauses -// on the combined or composite construct, with the rules specified above -// applied. -// (19) If a list item of the linear clause is the iteration variable of a simd -// or worksharing-loop SIMD construct and it is not declared in the construct, -// the effect on the outer leaf constructs is as if the list item was specified -// in a lastprivate clause on the combined or composite construct with the rules -// specified above applied. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template <typename C, typename H> bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::LinearT<TypeTy, IdTy, ExprTy> &clause, + const tomp::clause::SharedT<TypeTy, IdTy, ExprTy> &clause, const ClauseTy *node) { - // [5.2:341:15.1] - if (!applyToInnermost(node)) - return false; - - // [5.2:341:15.2], [5.2:341:19] - auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); - std::optional<ObjectTy> iterVar = helper.getLoopIterVar(); - const auto &objects = std::get<tomp::ObjectListT<IdTy, ExprTy>>(clause.t); - - // Lists of objects that will be used to construct "firstprivate" and - // "lastprivate" clauses. - tomp::ObjectListT<IdTy, ExprTy> first, last; - - for (const ObjectTy &object : objects) { - last.push_back(object); - if (!dirSimd || !iterVar || object.id() != iterVar->id()) - first.push_back(object); - } - - if (!first.empty()) { - auto *firstp = makeClause( - llvm::omp::Clause::OMPC_firstprivate, - tomp::clause::FirstprivateT<TypeTy, IdTy, ExprTy>{/*List=*/first}); - nodes.push_back(firstp); // Appending to the main clause list. - } - if (!last.empty()) { - auto *lastp = - makeClause(llvm::omp::Clause::OMPC_lastprivate, - tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>{ - {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); - nodes.push_back(lastp); // Appending to the main clause list. - } + // [5.2:340:31] + if (!applyToAll(node)) + return error(node, ErrorCode::NoLeafAllowing); return true; } -// NOWAIT -// [5.2:308:11-13] -// Directives: dispatch, do, for, interop, scope, sections, single, target, -// target enter data, target exit data, target update, taskwait, workshare +// THREAD_LIMIT +// [5.2:277:14-15] +// Directives: target, teams // -// [5.2:341:23] -// (23) The effect of the nowait clause is as if it is applied to the outermost -// leaf construct that permits it. -template <typename C, typename H> -bool ConstructDecompositionT<C, H>::applyClause( - const tomp::clause::NowaitT<TypeTy, IdTy, ExprTy> &clause, - const ClauseTy *node) { - return applyToOutermost(node); -} - +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
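Concretely, rule (31) means every permitting leaf receives its own copy of the clause. Since thread_limit is permitted on target and teams, an illustrative decomposition is:

//   #pragma omp target teams thread_limit(64)
// splits into:
//   #pragma omp target thread_limit(64)
//   #pragma omp teams thread_limit(64)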
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::OmpxBareT<TypeTy, IdTy, ExprTy> &clause,
+    const tomp::clause::ThreadLimitT<TypeTy, IdTy, ExprTy> &clause,
     const ClauseTy *node) {
-  return applyToOutermost(node);
+  // [5.2:340:31]
+  if (!applyToAll(node))
+    return error(node, ErrorCode::NoLeafAllowing);
+  return true;
 }

-template <typename C, typename H>
-bool ConstructDecompositionT<C, H>::applyClause(
-    const tomp::clause::OmpxAttributeT<TypeTy, IdTy, ExprTy> &clause,
-    const ClauseTy *node) {
-  return applyToAll(node);
-}
+// --- Splitting ------------------------------------------------------

 template <typename C, typename H> bool ConstructDecompositionT<C, H>::split() {
   bool success = true;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 208609f64f418..a01858fb220f1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -123,6 +123,8 @@ def OMPC_Collapse : Clause<[Spelling<"collapse">]> {
   let clangClass = "OMPCollapseClause";
   let flangClass = "ScalarIntConstantExpr";
 }
+def OMPC_Collector : Clause<[Spelling<"collector">]> {
+}
 def OMPC_Compare : Clause<[Spelling<"compare">]> {
   let clangClass = "OMPCompareClause";
 }
@@ -185,6 +187,7 @@ def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
   let isValueOptional = true;
 }
 def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
+  let clangClass = "OMPDynGroupprivateClause";
   let flangClass = "OmpDynGroupprivateClause";
 }
 def OMPC_Enter : Clause<[Spelling<"enter">]> {
@@ -264,6 +267,8 @@ def OMPC_Inclusive : Clause<[Spelling<"inclusive">]> {
 def OMPC_Indirect : Clause<[Spelling<"indirect">]> {
   let flangClass = "OmpIndirectClause";
 }
+def OMPC_Inductor : Clause<[Spelling<"inductor">]> {
+}
 def OMPC_Init : Clause<[Spelling<"init">]> {
   let clangClass = "OMPInitClause";
   let flangClass = "OmpInitClause";
@@ -749,6 +754,14 @@ def OMP_Critical : Directive<[Spelling<"critical">]> {
   let association = AS_Block;
   let category = CA_Executable;
 }
+def OMP_DeclareInduction : Directive<[Spelling<"declare_induction">]> {
+  let allowedOnceClauses = [
+    VersionedClause<OMPC_Collector, 60>,
+    VersionedClause<OMPC_Inductor, 60>,
+  ];
+  let association = AS_None;
+  let category = CA_Declarative;
+}
 def OMP_DeclareMapper : Directive<[Spelling<"declare mapper", 1, 52>,
                                    Spelling<"declare_mapper", 60>]> {
   let requiredClauses = [
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 7bec7e0c6736d..1ac9ac040468c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -190,6 +190,16 @@ enum class OMPScheduleType {
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
 };

+/// The fallback types for the dyn_groupprivate clause.
+enum class OMPDynGroupprivateFallbackType : uint64_t {
+  /// Abort the execution.
+  Abort = 0,
+  /// Return a null pointer.
+  Null = 1,
+  /// Allocate from an implementation-defined memory space.
+  DefaultMem = 2
+};
+
 // Default OpenMP mapper name suffix.
 inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper";
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5331cb5abdc6f..9f77c24d0b27b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2383,7 +2383,7 @@ class OpenMPIRBuilder {
     /// runtime library for debugging
     Value *MapNamesArray = nullptr;

-    explicit TargetDataRTArgs() {}
+    explicit TargetDataRTArgs() = default;
     explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                               Value *SizesArray, Value *MapTypesArray,
                               Value *MapTypesArrayEnd, Value *MappersArray,
@@ -2446,20 +2446,24 @@ class OpenMPIRBuilder {
     /// The number of threads.
     ArrayRef<Value *> NumThreads;
     /// The size of the dynamic shared memory.
-    Value *DynCGGroupMem = nullptr;
+    Value *DynCGroupMem = nullptr;
     /// True if the kernel has 'no wait' clause.
     bool HasNoWait = false;
+    /// The fallback mechanism for the dynamic shared memory.
+    omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+        omp::OMPDynGroupprivateFallbackType::Abort;

     // Constructors for TargetKernelArgs.
-    TargetKernelArgs() {}
+    TargetKernelArgs() = default;
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
-                     ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
-                     bool HasNoWait)
+                     ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
+                     bool HasNoWait,
+                     omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
           NumIterations(NumIterations), NumTeams(NumTeams),
-          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
-          HasNoWait(HasNoWait) {}
+          NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
+          HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
   };

   /// Create the kernel args vector used by emitTargetKernel. This function
@@ -2494,7 +2498,7 @@ class OpenMPIRBuilder {
     /// Whether the `target ... data` directive has a `nowait` clause.
     bool HasNoWait = false;

-    explicit TargetDataInfo() {}
+    explicit TargetDataInfo() = default;
     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                             bool SeparateBeginEndCalls)
         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
@@ -3244,6 +3248,10 @@ class OpenMPIRBuilder {
   /// dependency information as passed in the depend clause
   /// \param HasNowait Whether the target construct has a `nowait` clause or
   /// not.
+  /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
+  /// cgroup.
+  /// \param DynCGroupMemFallback The fallback mechanism to execute if the
+  /// requested cgroup memory cannot be provided.
   LLVM_ABI InsertPointOrErrorTy createTarget(
       const LocationDescription &Loc, bool IsOffloadEntry,
       OpenMPIRBuilder::InsertPointTy AllocaIP,
@@ -3255,7 +3263,10 @@ class OpenMPIRBuilder {
       TargetBodyGenCallbackTy BodyGenCB,
       TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
       CustomMapperCallbackTy CustomMapperCB,
-      const SmallVector<DependData> &Dependencies, bool HasNowait = false);
+      const SmallVector<DependData> &Dependencies, bool HasNowait = false,
+      Value *DynCGroupMem = nullptr,
+      omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+          omp::OMPDynGroupprivateFallbackType::Abort);

   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
@@ -3654,7 +3665,7 @@ class OpenMPIRBuilder {
   /// \param Name Name of the variable.
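For reference, a caller populating the widened TargetKernelArgs now supplies the fallback as the trailing constructor argument. Everything in this sketch is a placeholder provided by a hypothetical caller; only the parameter order follows the constructor above:

static llvm::OpenMPIRBuilder::TargetKernelArgs
makeKernelArgs(llvm::OpenMPIRBuilder::TargetDataRTArgs RTArgs,
               llvm::Value *TripCount, llvm::ArrayRef<llvm::Value *> NumTeams,
               llvm::ArrayRef<llvm::Value *> NumThreads,
               llvm::Value *DynMemSize) {
  // Abort mirrors the default used by the new createTarget parameters.
  return llvm::OpenMPIRBuilder::TargetKernelArgs(
      /*NumTargetItems=*/1, RTArgs, TripCount, NumTeams, NumThreads,
      DynMemSize, /*HasNoWait=*/false,
      llvm::omp::OMPDynGroupprivateFallbackType::Abort);
}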
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace = 0); + std::optional<unsigned> AddressSpace = {}); }; /// Class to represented the control flow structure of an OpenMP canonical loop. diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h index f9f2b3516a4ca..4056f1feb4dd3 100644 --- a/llvm/include/llvm/IR/ConstantFold.h +++ b/llvm/include/llvm/IR/ConstantFold.h @@ -26,42 +26,66 @@ #include <optional> namespace llvm { - template <typename T> class ArrayRef; - class Value; - class Constant; - class Type; +template <typename T> class ArrayRef; +class Value; +class Constant; +class Type; - // Constant fold various types of instruction... - LLVM_ABI Constant * - ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast - Constant *V, ///< The source constant - Type *DestTy ///< The destination type - ); - LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, - Constant *V2); - LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, - Constant *Idx); - LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, - Constant *Elt, - Constant *Idx); - LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, - Constant *V2, - ArrayRef<int> Mask); - LLVM_ABI Constant * - ConstantFoldExtractValueInstruction(Constant *Agg, ArrayRef<unsigned> Idxs); - LLVM_ABI Constant * - ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, - ArrayRef<unsigned> Idxs); - LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); - LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, - Constant *V1, Constant *V2); - LLVM_ABI Constant * - ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1, - Constant *C2); - LLVM_ABI Constant * - ConstantFoldGetElementPtr(Type *Ty, Constant *C, - std::optional<ConstantRange> InRange, - ArrayRef<Value *> Idxs); -} // End llvm namespace +// Constant fold various types of instruction... +LLVM_ABI Constant * +ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast + Constant *V, ///< The source constant + Type *DestTy ///< The destination type +); + +/// Attempt to constant fold a select instruction with the specified +/// operands. The constant result is returned if successful; if not, null is +/// returned. +LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1, + Constant *V2); + +/// Attempt to constant fold an extractelement instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. 
+LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val, + Constant *Idx); + +/// Attempt to constant fold an insertelement instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. +LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val, + Constant *Elt, + Constant *Idx); + +/// Attempt to constant fold a shufflevector instruction with the +/// specified operands and mask. See class ShuffleVectorInst for a description +/// of the mask representation. The constant result is returned if successful; +/// if not, null is returned. +LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, + Constant *V2, + ArrayRef<int> Mask); + +/// Attempt to constant fold an extractvalue instruction with the +/// specified operands and indices. The constant result is returned if +/// successful; if not, null is returned. +LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg, + ArrayRef<unsigned> Idxs); + +/// Attempt to constant fold an insertvalue instruction with the specified +/// operands and indices. The constant result is returned if successful; if +/// not, null is returned. +LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg, + Constant *Val, + ArrayRef<unsigned> Idxs); +LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V); +LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1, + Constant *V2); +LLVM_ABI Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, + Constant *C1, Constant *C2); +LLVM_ABI Constant * +ConstantFoldGetElementPtr(Type *Ty, Constant *C, + std::optional<ConstantRange> InRange, + ArrayRef<Value *> Idxs); +} // namespace llvm #endif diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 56fc749838ef9..54458201af0b3 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -590,7 +590,7 @@ class DataLayout { /// /// This is the amount that alloca reserves for this type. For example, /// returns 12 or 16 for x86_fp80, depending on alignment. - TypeSize getTypeAllocSize(Type *Ty) const; + LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const; /// Returns the offset in bits between successive objects of the /// specified type, including alignment padding; always a multiple of 8. diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 33e6df0ecb873..862293c9666a7 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -108,7 +108,7 @@ class DebugInfoFinder { LLVM_ABI void processInstruction(const Module &M, const Instruction &I); /// Process a DILocalVariable. - LLVM_ABI void processVariable(DILocalVariable *DVI); + LLVM_ABI void processVariable(const DILocalVariable *DVI); /// Process debug info location. LLVM_ABI void processLocation(const Module &M, const DILocation *Loc); /// Process a DbgRecord. 
@@ -124,7 +124,7 @@ class DebugInfoFinder { void processCompileUnit(DICompileUnit *CU); void processScope(DIScope *Scope); void processType(DIType *DT); - void processImportedEntity(DIImportedEntity *Import); + void processImportedEntity(const DIImportedEntity *Import); bool addCompileUnit(DICompileUnit *CU); bool addGlobalVariable(DIGlobalVariableExpression *DIG); bool addScope(DIScope *Scope); diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 7ade6b8e13308..6918b21d5b363 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2554,6 +2554,39 @@ class DISubprogram : public DILocalScope { replaceOperandWith(7, N.get()); } + /// For the given retained node of DISubprogram, applies one of the + /// given functions depending on the type of the node. + template <typename T, typename FuncLVT, typename FuncLabelT, + typename FuncImportedEntityT, typename FuncUnknownT> + static T + visitRetainedNode(const Metadata *N, FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE, FuncUnknownT &&FuncUnknown) { + if (const auto *LV = dyn_cast<DILocalVariable>(N)) + return FuncLV(LV); + if (const auto *L = dyn_cast<DILabel>(N)) + return FuncLabel(L); + if (const auto *IE = dyn_cast<DIImportedEntity>(N)) + return FuncIE(IE); + return FuncUnknown(N); + } + + /// Returns the scope of subprogram's retainedNodes. + static const DILocalScope *getRetainedNodeScope(const MDNode *N); + // For use in Verifier. + static const DIScope *getRawRetainedNodeScope(const MDNode *N); + + /// For each retained node, applies one of the given functions depending + /// on the type of a node. + template <typename FuncLVT, typename FuncLabelT, typename FuncImportedEntityT> + void forEachRetainedNode(FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE) const { + for (MDNode *N : getRetainedNodes()) + visitRetainedNode<void>(N, FuncLV, FuncLabel, FuncIE, + [](const Metadata *N) { + llvm_unreachable("Unexpected retained node!"); + }); + } + /// Check if this subprogram describes the given function. /// /// FIXME: Should this be looking through bitcasts? diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 457c60e3bc929..66f44fe34d3f6 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -589,7 +589,7 @@ filterDbgVars(iterator_range<simple_ilist<DbgRecord>::iterator> R) { /// date. class DbgMarker { public: - DbgMarker() {} + DbgMarker() = default; /// Link back to the Instruction that owns this marker. Can be null during /// operations that move a marker from one instruction to another. 
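Returning to the DISubprogram additions above: forEachRetainedNode dispatches each retained node to the callback matching its type, which keeps traversals compact. A sketch (only the visitor comes from this patch; the counting function is invented):

static unsigned countRetainedLocals(const llvm::DISubprogram *SP) {
  unsigned Count = 0;
  SP->forEachRetainedNode(
      [&Count](const llvm::DILocalVariable *) { ++Count; },
      [](const llvm::DILabel *) {},
      [](const llvm::DIImportedEntity *) {});
  return Count;
}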
Instruction *MarkedInstr = nullptr; diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index a426fb079ec04..8f6fb4da0c839 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -26,10 +26,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TypeSize.h" -#include <algorithm> #include <cstdint> #include <functional> -#include <iterator> #include <optional> #include <string> #include <utility> diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index bf128a3936cbd..1209def5ac0bd 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -32,7 +32,6 @@ #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTree.h" -#include <algorithm> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/IR/DroppedVariableStats.h b/llvm/include/llvm/IR/DroppedVariableStats.h index 42e86dd966751..8a1dbd6aeb60a 100644 --- a/llvm/include/llvm/IR/DroppedVariableStats.h +++ b/llvm/include/llvm/IR/DroppedVariableStats.h @@ -42,7 +42,7 @@ class DroppedVariableStats { public: LLVM_ABI DroppedVariableStats(bool DroppedVarStatsEnabled); - virtual ~DroppedVariableStats() {} + virtual ~DroppedVariableStats() = default; // We intend this to be unique per-compilation, thus no copies. DroppedVariableStats(const DroppedVariableStats &) = delete; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination by given case. Delegate the call to + /// the underlying SwitchInst::setDefaultDest and remove correspondent branch + /// weight. + LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..07aa2faffa7c5 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. 
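The IntrNoCreateUndefOrPoison property below, together with the nocreateundeforpoison function attribute added in Attributes.td, is meant to license exactly one inference: if no argument is undef or poison, neither is the result. A rough sketch of that reasoning, with an invented helper name and a string-based attribute lookup standing in for the real enum attribute:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

static bool callResultNotUndefOrPoison(const llvm::CallInst &CI) {
  if (!CI.hasFnAttr("nocreateundeforpoison"))
    return false;
  // If every argument is known not to be undef or poison, the attribute
  // guarantees the result is not either.
  for (const llvm::Use &U : CI.args())
    if (!llvm::isGuaranteedNotToBeUndefOrPoison(U.get()))
      return false;
  return true;
}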
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone. def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit an unspecified value is returned, but this + // is not poison so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone. 
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
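// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Scalar reference semantics for
// llvm.sadd.with.overflow, which the block below now also tags as
// IntrNoCreateUndefOrPoison: both the truncated sum and the overflow bit are
// fully defined for any pair of well-defined inputs.
#include <cstdint>
#include <utility>

static std::pair<int32_t, bool> saddWithOverflow(int32_t A, int32_t B) {
  // Add modulo 2^32 in the unsigned domain (no UB), then detect signed
  // overflow: it occurs iff the operands agree in sign and the result does
  // not.
  uint32_t Sum = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
  int32_t Res = static_cast<int32_t>(Sum);
  bool Ov = (A >= 0) == (B >= 0) && (Res >= 0) != (A >= 0);
  return {Res, Ov};
}
// ---------------------------------------------------------------------------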
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
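// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Address-level semantics of
// llvm.ptrmask, which gains IntrNoCreateUndefOrPoison just below. Unlike a
// ptrtoint/and/inttoptr round trip, the intrinsic keeps the provenance of the
// original pointer; only the address bits are masked.
#include <cstdint>

static void *ptrmask(void *P, uintptr_t Mask) {
  // Example: Mask = ~(uintptr_t)15 rounds P down to a 16-byte boundary.
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) & Mask);
}
// ---------------------------------------------------------------------------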
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], @@ -1900,6 +1913,9 @@ def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatch def int_stepvector : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [], [IntrNoMem]>; +def int_reloc_none : DefaultAttrsIntrinsic<[], [llvm_metadata_ty], + [IntrNoMem, IntrHasSideEffects]>; + //===---------------- Vector Predication Intrinsics --------------===// // Memory Intrinsics def int_vp_store : DefaultAttrsIntrinsic<[], @@ -2797,6 +2813,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index b81edc385cd43..4cab6e05ba79f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -499,6 +499,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfmmla diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..d7db935ee07f1 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; @@ -173,4 +176,10 @@ def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, l def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>; + +def int_dx_load_input + : DefaultAttrsIntrinsic<[llvm_any_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i8_ty, + llvm_i32_ty], + [IntrConvergent]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td index fe95377f8e1a5..dde4132791f06 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td @@ -6835,6 +6835,180 @@ Hexagon_v64i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vsub_hf_f8_128B">; // V81 HVX Instructions. 
+def int_hexagon_V6_vabs_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf">; + +def int_hexagon_V6_vabs_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_hf_128B">; + +def int_hexagon_V6_vabs_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16">; + +def int_hexagon_V6_vabs_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf16_qf16_128B">; + +def int_hexagon_V6_vabs_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32">; + +def int_hexagon_V6_vabs_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_qf32_128B">; + +def int_hexagon_V6_vabs_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf">; + +def int_hexagon_V6_vabs_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vabs_qf32_sf_128B">; + +def int_hexagon_V6_valign4 : +Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valign4">; + +def int_hexagon_V6_valign4_128B : +Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valign4_128B">; + +def int_hexagon_V6_vconv_bf_qf32 : +Hexagon_v16i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32">; + +def int_hexagon_V6_vconv_bf_qf32_128B : +Hexagon_v32i32_v64i32_Intrinsic<"HEXAGON_V6_vconv_bf_qf32_128B">; + +def int_hexagon_V6_vconv_f8_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16">; + +def int_hexagon_V6_vconv_f8_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_f8_qf16_128B">; + +def int_hexagon_V6_vconv_h_hf_rnd : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd">; + +def int_hexagon_V6_vconv_h_hf_rnd_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_h_hf_rnd_128B">; + +def int_hexagon_V6_vconv_qf16_f8 : +Hexagon_v32i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8">; + +def int_hexagon_V6_vconv_qf16_f8_128B : +Hexagon_v64i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_f8_128B">; + +def int_hexagon_V6_vconv_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf">; + +def int_hexagon_V6_vconv_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_hf_128B">; + +def int_hexagon_V6_vconv_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16">; + +def int_hexagon_V6_vconv_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf16_qf16_128B">; + +def int_hexagon_V6_vconv_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32">; + +def int_hexagon_V6_vconv_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_qf32_128B">; + +def int_hexagon_V6_vconv_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf">; + +def int_hexagon_V6_vconv_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vconv_qf32_sf_128B">; + +def int_hexagon_V6_veqhf : +Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf">; + +def int_hexagon_V6_veqhf_128B : +Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_128B">; + +def int_hexagon_V6_veqhf_and : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_and">; + +def int_hexagon_V6_veqhf_and_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_and_128B">; + +def int_hexagon_V6_veqhf_or : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_or">; + +def int_hexagon_V6_veqhf_or_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_or_128B">; + +def int_hexagon_V6_veqhf_xor : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqhf_xor">; + +def int_hexagon_V6_veqhf_xor_128B : 
+Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqhf_xor_128B">; + +def int_hexagon_V6_veqsf : +Hexagon_v64i1_v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf">; + +def int_hexagon_V6_veqsf_128B : +Hexagon_v128i1_v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_128B">; + +def int_hexagon_V6_veqsf_and : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_and">; + +def int_hexagon_V6_veqsf_and_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_and_128B">; + +def int_hexagon_V6_veqsf_or : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_or">; + +def int_hexagon_V6_veqsf_or_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_or_128B">; + +def int_hexagon_V6_veqsf_xor : +Hexagon_v64i1_v64i1v16i32v16i32_Intrinsic<"HEXAGON_V6_veqsf_xor">; + +def int_hexagon_V6_veqsf_xor_128B : +Hexagon_v128i1_v128i1v32i32v32i32_Intrinsic<"HEXAGON_V6_veqsf_xor_128B">; + +def int_hexagon_V6_vilog2_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_hf">; + +def int_hexagon_V6_vilog2_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_hf_128B">; + +def int_hexagon_V6_vilog2_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf16">; + +def int_hexagon_V6_vilog2_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf16_128B">; + +def int_hexagon_V6_vilog2_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_qf32">; + +def int_hexagon_V6_vilog2_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_qf32_128B">; + +def int_hexagon_V6_vilog2_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vilog2_sf">; + +def int_hexagon_V6_vilog2_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vilog2_sf_128B">; + +def int_hexagon_V6_vneg_qf16_hf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf">; + +def int_hexagon_V6_vneg_qf16_hf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_hf_128B">; + +def int_hexagon_V6_vneg_qf16_qf16 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16">; + +def int_hexagon_V6_vneg_qf16_qf16_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf16_qf16_128B">; + +def int_hexagon_V6_vneg_qf32_qf32 : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32">; + +def int_hexagon_V6_vneg_qf32_qf32_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_qf32_128B">; + +def int_hexagon_V6_vneg_qf32_sf : +Hexagon_v16i32_v16i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf">; + +def int_hexagon_V6_vneg_qf32_sf_128B : +Hexagon_v32i32_v32i32_Intrinsic<"HEXAGON_V6_vneg_qf32_sf_128B">; + def int_hexagon_V6_vsub_hf_mix : Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vsub_hf_mix">; diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def 
int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 719181a09f475..2710853e17688 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1334,15 +1334,8 @@ let TargetPrefix = "nvvm" in { // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in - def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - - def int_nvvm_ex2_approx_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - def int_nvvm_ex2_approx_f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty]>; - def int_nvvm_ex2_approx_f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty]>; + def int_nvvm_ex2_approx # ftz : + DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; foreach ftz = ["", "_ftz"] in def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin, diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 636e88898a55e..3907e864bed1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -387,6 +387,12 @@ class PowerPC_VSX_Sca_DDD_Intrinsic<string GCCIntSuffix> [llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem]>; +/// PowerPC_VSX_WWW_Intrinsic - A PowerPC intrinsic that takes two v4i32 +/// vectors and returns one. These intrinsics have no side effects. +class PowerPC_VSX_WWW_Intrinsic<string GCCIntSuffix> + : PowerPC_VSX_Intrinsic<GCCIntSuffix, + [llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; //===----------------------------------------------------------------------===// // PowerPC Altivec Intrinsic Definitions. 
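// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] With the IntrinsicsNVVM.td change
// above, ex2.approx is one intrinsic overloaded on llvm_anyfloat_ty rather
// than four fixed-type variants. A caller obtains a concrete declaration by
// supplying the type; the helper name here is hypothetical.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static Function *getEx2ApproxDecl(Module &M, Type *Ty) {
  // Mangles to e.g. llvm.nvvm.ex2.approx.f32 or llvm.nvvm.ex2.approx.v2f16,
  // replacing the removed fixed-type _f/_d/_f16/_f16x2 spellings.
  return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::nvvm_ex2_approx,
                                           {Ty});
}
// ---------------------------------------------------------------------------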
@@ -1214,6 +1220,7 @@ def int_ppc_altivec_vsraw : PowerPC_Vec_WWW_Intrinsic<"vsraw">; def int_ppc_altivec_vrlb : PowerPC_Vec_BBB_Intrinsic<"vrlb">; def int_ppc_altivec_vrlh : PowerPC_Vec_HHH_Intrinsic<"vrlh">; def int_ppc_altivec_vrlw : PowerPC_Vec_WWW_Intrinsic<"vrlw">; +def int_ppc_vsx_xvrlw : PowerPC_VSX_WWW_Intrinsic<"xvrlw">; def int_ppc_altivec_vrld : PowerPC_Vec_DDD_Intrinsic<"vrld">; let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 81fbfbf0bb1b4..1dd23f60c7e1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5505,46 +5505,6 @@ let TargetPrefix = "x86" in { [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; - // AMX-TRANSPOSE - def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">, - Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], - [ImmArg<ArgIndex<0>>]>; - def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; - def int_x86_ttdpbf16ps : ClangBuiltin<"__builtin_ia32_ttdpbf16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttdpfp16ps : ClangBuiltin<"__builtin_ia32_ttdpfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttcmmimfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_ttcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_tconjtcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; - def int_x86_tconjtfp16 : ClangBuiltin<"__builtin_ia32_tconjtfp16">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; - // AMX-MORVS, AMX-TRANSPOSE def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], @@ -5685,61 +5645,6 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, 
llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly]>; - def int_x86_t2rpntlvwz0_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz0t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_t2rpntlvwz1t1_internal : - Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], - []>; - def int_x86_ttransposed_internal : - ClangBuiltin<"__builtin_ia32_ttransposed_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_ttdpbf16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpbf16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttdpfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttdpfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_ttcmmrlfp16ps_internal : - ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtcmmimfp16ps_internal : - ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, - llvm_x86amx_ty, llvm_x86amx_ty, - llvm_x86amx_ty], []>; - def int_x86_tconjtfp16_internal : - ClangBuiltin<"__builtin_ia32_tconjtfp16_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; - def int_x86_tcvtrowd2ps_internal : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">, Intrinsic<[llvm_v16f32_ty], @@ -5775,20 +5680,11 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; - def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">, - Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], - [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, - ImmArg<ArgIndex<2>>]>; def int_x86_tmmultf32ps_internal : ClangBuiltin<"__builtin_ia32_tmmultf32ps_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; - def int_x86_ttmmultf32ps_internal : - ClangBuiltin<"__builtin_ia32_ttmmultf32ps_internal">, - Intrinsic<[llvm_x86amx_ty], - [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, - llvm_x86amx_ty, llvm_x86amx_ty], []>; def int_x86_tdpbf8ps_internal : ClangBuiltin<"__builtin_ia32_tdpbf8ps_internal">, diff --git a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h index 535635a9ad9b0..fcfb2db85a880 100644 --- a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h +++ b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h @@ -21,7 +21,8 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include 
"llvm/Support/Compiler.h" -#include <tuple> // for std::pair + +#include <utility> namespace llvm { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index 3381e1777217a..ccb77e75492af 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -79,7 +79,7 @@ struct CustomMappingTraits< } Args.push_back(Arg); } - io.mapRequired(Key.str().c_str(), V[Args]); + io.mapRequired(Key, V[Args]); } static void output( IO &io, @@ -91,7 +91,7 @@ struct CustomMappingTraits< Key += ','; Key += llvm::utostr(Arg); } - io.mapRequired(Key.c_str(), P.second); + io.mapRequired(Key, P.second); } } }; @@ -122,11 +122,11 @@ struct CustomMappingTraits<std::map<uint64_t, WholeProgramDevirtResolution>> { io.setError("key not an integer"); return; } - io.mapRequired(Key.str().c_str(), V[KeyInt]); + io.mapRequired(Key, V[KeyInt]); } static void output(IO &io, std::map<uint64_t, WholeProgramDevirtResolution> &V) { for (auto &P : V) - io.mapRequired(llvm::utostr(P.first).c_str(), P.second); + io.mapRequired(llvm::utostr(P.first), P.second); } }; @@ -215,7 +215,7 @@ namespace yaml { template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) { std::vector<GlobalValueSummaryYaml> GVSums; - io.mapRequired(Key.str().c_str(), GVSums); + io.mapRequired(Key, GVSums); uint64_t KeyInt; if (Key.getAsInteger(0, KeyInt)) { io.setError("key not an integer"); @@ -290,7 +290,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { } } if (!GVSums.empty()) - io.mapRequired(llvm::utostr(P.first).c_str(), GVSums); + io.mapRequired(llvm::utostr(P.first), GVSums); } } static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) { @@ -313,12 +313,12 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> { template <> struct CustomMappingTraits<TypeIdSummaryMapTy> { static void inputOne(IO &io, StringRef Key, TypeIdSummaryMapTy &V) { TypeIdSummary TId; - io.mapRequired(Key.str().c_str(), TId); + io.mapRequired(Key, TId); V.insert({GlobalValue::getGUIDAssumingExternalLinkage(Key), {Key, TId}}); } static void output(IO &io, TypeIdSummaryMapTy &V) { for (auto &TidIter : V) - io.mapRequired(TidIter.second.first.str().c_str(), TidIter.second.second); + io.mapRequired(TidIter.second.first, TidIter.second.second); } }; diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h index 33eda5a4222f1..c25e2891d33d5 100644 --- a/llvm/include/llvm/IR/PassInstrumentation.h +++ b/llvm/include/llvm/IR/PassInstrumentation.h @@ -55,7 +55,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include <type_traits> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index acb17a8090c51..4354551a2405b 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -47,7 +47,6 @@ #include "llvm/Support/TypeName.h" #include <cassert> #include <cstring> -#include <iterator> #include <list> #include <memory> #include <tuple> diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index e3ec7e1764da7..88aef4a368f29 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -872,6 +872,9 @@ inline bind_and_match_ty<const Value, MatchTy> m_Value(const Value *&V, /// Match an instruction, capturing it if we 
match. inline bind_ty<Instruction> m_Instruction(Instruction *&I) { return I; } +inline bind_ty<const Instruction> m_Instruction(const Instruction *&I) { + return I; +} /// Match against the nested pattern, and capture the instruction if we match. template <typename MatchTy> @@ -879,11 +882,22 @@ inline bind_and_match_ty<Instruction, MatchTy> m_Instruction(Instruction *&I, const MatchTy &Match) { return {I, Match}; } +template <typename MatchTy> +inline bind_and_match_ty<const Instruction, MatchTy> +m_Instruction(const Instruction *&I, const MatchTy &Match) { + return {I, Match}; +} /// Match a unary operator, capturing it if we match. inline bind_ty<UnaryOperator> m_UnOp(UnaryOperator *&I) { return I; } +inline bind_ty<const UnaryOperator> m_UnOp(const UnaryOperator *&I) { + return I; +} /// Match a binary operator, capturing it if we match. inline bind_ty<BinaryOperator> m_BinOp(BinaryOperator *&I) { return I; } +inline bind_ty<const BinaryOperator> m_BinOp(const BinaryOperator *&I) { + return I; +} /// Match a with overflow intrinsic, capturing it if we match. inline bind_ty<WithOverflowInst> m_WithOverflowInst(WithOverflowInst *&I) { return I; @@ -3069,12 +3083,26 @@ m_c_MaxOrMin(const LHS &L, const RHS &R) { m_CombineOr(m_c_UMax(L, R), m_c_UMin(L, R))); } +template <Intrinsic::ID IntrID, typename LHS, typename RHS> +struct CommutativeBinaryIntrinsic_match { + LHS L; + RHS R; + + CommutativeBinaryIntrinsic_match(const LHS &L, const RHS &R) : L(L), R(R) {} + + template <typename OpTy> bool match(OpTy *V) const { + const auto *II = dyn_cast<IntrinsicInst>(V); + if (!II || II->getIntrinsicID() != IntrID) + return false; + return (L.match(II->getArgOperand(0)) && R.match(II->getArgOperand(1))) || + (L.match(II->getArgOperand(1)) && R.match(II->getArgOperand(0))); + } +}; + template <Intrinsic::ID IntrID, typename T0, typename T1> -inline match_combine_or<typename m_Intrinsic_Ty<T0, T1>::Ty, - typename m_Intrinsic_Ty<T1, T0>::Ty> +inline CommutativeBinaryIntrinsic_match<IntrID, T0, T1> m_c_Intrinsic(const T0 &Op0, const T1 &Op1) { - return m_CombineOr(m_Intrinsic<IntrID>(Op0, Op1), - m_Intrinsic<IntrID>(Op1, Op0)); + return CommutativeBinaryIntrinsic_match<IntrID, T0, T1>(Op0, Op1); } /// Matches FAdd with LHS and RHS in either order. diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index a0876b169e0b8..f1c2f38c74afd 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -18,6 +18,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Compiler.h" +#include <cstddef> +#include <type_traits> namespace llvm { struct MDProfLabels { @@ -194,10 +196,11 @@ LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, /// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch /// weights in the new instruction if the parent function of the original /// instruction has an entry count. This is to not confuse users by injecting -/// profile data into non-profiled functions. -LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, - Function &F, - StringRef PassName); +/// profile data into non-profiled functions. If \p F is nullptr, we will fetch +/// the function from \p I. +LLVM_ABI void +setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, + const Function *F = nullptr); /// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their /// entry counts. 
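// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Using the commutative intrinsic
// matcher added to PatternMatch.h above: one CommutativeBinaryIntrinsic_match
// tries both operand orders itself, where the old m_c_Intrinsic expanded into
// two full m_Intrinsic patterns under m_CombineOr. Hypothetical helper:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isSMaxOf(Value *V, Value *Bound, Value *&Other) {
  // Matches both smax(Bound, X) and smax(X, Bound), capturing X in Other.
  return match(V, m_c_Intrinsic<Intrinsic::smax>(m_Specific(Bound),
                                                 m_Value(Other)));
}
// ---------------------------------------------------------------------------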
@@ -215,9 +218,13 @@ LLVM_ABI void scaleProfData(Instruction &I, uint64_t S, uint64_t T); /// branch weights B1 and B2, respectively. In both B1 and B2, the first /// position (index 0) is for the 'true' branch, and the second position (index /// 1) is for the 'false' branch. +template <typename T1, typename T2, + typename = typename std::enable_if< + std::is_arithmetic_v<T1> && std::is_arithmetic_v<T2> && + sizeof(T1) <= sizeof(uint64_t) && sizeof(T2) <= sizeof(uint64_t)>> inline SmallVector<uint64_t, 2> -getDisjunctionWeights(const SmallVector<uint32_t, 2> &B1, - const SmallVector<uint32_t, 2> &B2) { +getDisjunctionWeights(const SmallVector<T1, 2> &B1, + const SmallVector<T2, 2> &B2) { // For the first conditional branch, the probability the "true" case is taken // is p(b1) = B1[0] / (B1[0] + B1[1]). The "false" case's probability is // p(not b1) = B1[1] / (B1[0] + B1[1]). @@ -234,8 +241,8 @@ getDisjunctionWeights(const SmallVector<uint32_t, 2> &B1, // the product of sums, the subtracted one cancels out). assert(B1.size() == 2); assert(B2.size() == 2); - auto FalseWeight = B1[1] * B2[1]; - auto TrueWeight = B1[0] * B2[0] + B1[0] * B2[1] + B1[1] * B2[0]; + uint64_t FalseWeight = B1[1] * B2[1]; + uint64_t TrueWeight = B1[0] * (B2[0] + B2[1]) + B1[1] * B2[0]; return {TrueWeight, FalseWeight}; } } // namespace llvm diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h index 6c087ea02b3c3..34012151f729f 100644 --- a/llvm/include/llvm/IR/ProfileSummary.h +++ b/llvm/include/llvm/IR/ProfileSummary.h @@ -14,7 +14,6 @@ #define LLVM_IR_PROFILESUMMARY_H #include "llvm/Support/Compiler.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <vector> diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 01359894b0421..0afe32a4ecc3c 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -9,6 +9,8 @@ // This file implements a common interface to work with library calls into a // runtime that may be emitted by a given backend. // +// FIXME: This should probably move to Analysis +// //===----------------------------------------------------------------------===// #ifndef LLVM_IR_RUNTIME_LIBCALLS_H @@ -20,6 +22,7 @@ #include "llvm/ADT/StringTable.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" @@ -42,6 +45,8 @@ template <> struct enum_iteration_traits<RTLIB::LibcallImpl> { static constexpr bool is_iterable = true; }; +class LibcallLoweringInfo; + namespace RTLIB { // Return an iterator over all Libcall values. @@ -70,31 +75,20 @@ struct RuntimeLibcallsInfo { LibcallImplBitset AvailableLibcallImpls; public: + friend class llvm::LibcallLoweringInfo; + + RuntimeLibcallsInfo() = default; + explicit RuntimeLibcallsInfo( const Triple &TT, ExceptionHandling ExceptionModel = ExceptionHandling::None, FloatABI::ABIType FloatABI = FloatABI::Default, - EABI EABIVersion = EABI::Default, StringRef ABIName = "") { - // FIXME: The ExceptionModel parameter is to handle the field in - // TargetOptions. This interface fails to distinguish the forced disable - // case for targets which support exceptions by default. This should - // probably be a module flag and removed from TargetOptions. 
- if (ExceptionModel == ExceptionHandling::None) - ExceptionModel = TT.getDefaultExceptionHandling(); - - initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); - } + EABI EABIVersion = EABI::Default, StringRef ABIName = ""); - /// Rename the default libcall routine name for the specified libcall. - void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) { - LibcallImpls[Call] = Impl; - } + explicit RuntimeLibcallsInfo(const Module &M); - /// Get the libcall routine name for the specified libcall. - // FIXME: This should be removed. Only LibcallImpl should have a name. - StringRef getLibcallName(RTLIB::Libcall Call) const { - return getLibcallImplName(LibcallImpls[Call]); - } + bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &); /// Get the libcall routine name for the specified libcall implementation. static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl) { @@ -105,42 +99,24 @@ struct RuntimeLibcallsInfo { RuntimeLibcallNameSizeTable[CallImpl]); } - /// Return the lowering's selection of implementation call for \p Call - RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const { - return LibcallImpls[Call]; - } - /// Set the CallingConv that should be used for the specified libcall /// implementation void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) { LibcallImplCallingConvs[Call] = CC; } - // FIXME: Remove this wrapper in favor of directly using - // getLibcallImplCallingConv - CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { - return LibcallImplCallingConvs[LibcallImpls[Call]]; - } - /// Get the CallingConv that should be used for the specified libcall. CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const { return LibcallImplCallingConvs[Call]; } - ArrayRef<RTLIB::LibcallImpl> getLibcallImpls() const { - // Trim UNKNOWN_LIBCALL from the back - return ArrayRef(LibcallImpls).drop_back(); + /// Return the libcall provided by \p Impl + static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { + return ImplToLibcall[Impl]; } - /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully - /// unsupported. - StringRef getMemcpyName() const { - RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY); - if (Memcpy != RTLIB::Unsupported) - return getLibcallImplName(Memcpy); - - // Fallback to memmove if memcpy isn't available. - return getLibcallName(RTLIB::MEMMOVE); + unsigned getNumAvailableLibcallImpls() const { + return AvailableLibcallImpls.count(); } bool isAvailable(RTLIB::LibcallImpl Impl) const { @@ -151,11 +127,6 @@ struct RuntimeLibcallsInfo { AvailableLibcallImpls.set(Impl); } - /// Return the libcall provided by \p Impl - static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) { - return ImplToLibcall[Impl]; - } - /// Check if a function name is a recognized runtime call of any kind. This /// does not consider if this call is available for any current compilation, /// just that it is a known call somewhere. This returns the set of all @@ -176,24 +147,28 @@ struct RuntimeLibcallsInfo { LLVM_ABI RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const { for (RTLIB::LibcallImpl Impl : lookupLibcallImplName(FuncName)) { - // FIXME: This should not depend on looking up ImplToLibcall, only the - // list of libcalls for the module. 
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; - if (Recognized != RTLIB::Unsupported) - return Recognized; + if (isAvailable(Impl)) + return Impl; } return RTLIB::Unsupported; } + /// \returns the function type and attributes for the \p LibcallImpl, + /// depending on the target \p TT. If the function has incomplete type + /// information, return nullptr for the function type. + std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + + /// Returns true if the function has a vector mask argument, which is assumed + /// to be the last argument. + static bool hasVectorMaskArgument(RTLIB::LibcallImpl Impl); + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); - /// Stores the implementation choice for each each libcall. - RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = { - RTLIB::Unsupported}; - static_assert(static_cast<int>(CallingConv::C) == 0, "default calling conv should be encoded as 0"); @@ -267,6 +242,7 @@ struct RuntimeLibcallsInfo { }; } // namespace RTLIB + } // namespace llvm #endif // LLVM_IR_RUNTIME_LIBCALLS_H diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index 7be1b654ca727..ce7e836f66446 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -182,6 +182,12 @@ foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in { def MODF_#FPTy : RuntimeLibcall; } +foreach VecTy = ["V4F32", "V2F64", "NXV4F32", "NXV2F64"] in { + def MODF_#VecTy : RuntimeLibcall; + def SINCOS_#VecTy : RuntimeLibcall; + def SINCOSPI_#VecTy : RuntimeLibcall; +} + def FEGETENV : RuntimeLibcall; def FESETENV : RuntimeLibcall; @@ -971,10 +977,6 @@ def frexpf : RuntimeLibcallImpl<FREXP_F32>; def frexp : RuntimeLibcallImpl<FREXP_F64>; defm frexpl : LibmLongDoubleLibCall; -def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; -def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; -defm sincospil : LibmLongDoubleLibCall; - def modff : RuntimeLibcallImpl<MODF_F32>; def modf : RuntimeLibcallImpl<MODF_F64>; defm modfl : LibmLongDoubleLibCall; @@ -1051,6 +1053,15 @@ def sincosf : RuntimeLibcallImpl<SINCOS_F32>; def sincos : RuntimeLibcallImpl<SINCOS_F64>; defm sincosl : LibmLongDoubleLibCall; +// Exists in sun math library +def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; +def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; +defm sincospil : LibmLongDoubleLibCall; + +// Exists on macOS +def __sincospif : RuntimeLibcallImpl<SINCOSPI_F32>; +def __sincospi : RuntimeLibcallImpl<SINCOSPI_F64>; + def bzero : RuntimeLibcallImpl<BZERO>; def __bzero : RuntimeLibcallImpl<BZERO>; @@ -1078,6 +1089,50 @@ def __security_check_cookie : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE>; def __security_check_cookie_arm64ec : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE, "#__security_check_cookie_arm64ec">; +//===----------------------------------------------------------------------===// +// sleef calls +//===----------------------------------------------------------------------===// + +defset list<RuntimeLibcallImpl> SleefLibcalls = { + def _ZGVnN2vl8_modf : RuntimeLibcallImpl<MODF_V2F64>; + def _ZGVnN4vl4_modff : RuntimeLibcallImpl<MODF_V4F32>; + def _ZGVsNxvl8_modf : RuntimeLibcallImpl<MODF_NXV2F64>; + def _ZGVsNxvl4_modff : RuntimeLibcallImpl<MODF_NXV4F32>; + + def _ZGVnN2vl8l8_sincos : RuntimeLibcallImpl<SINCOS_V2F64>; + def _ZGVnN4vl4l4_sincosf : 
RuntimeLibcallImpl<SINCOS_V4F32>; + def _ZGVsNxvl8l8_sincos : RuntimeLibcallImpl<SINCOS_NXV2F64>; + def _ZGVsNxvl4l4_sincosf : RuntimeLibcallImpl<SINCOS_NXV4F32>; + + def _ZGVnN4vl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_V4F32>; + def _ZGVnN2vl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_V2F64>; + def _ZGVsNxvl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_NXV4F32>; + def _ZGVsNxvl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_NXV2F64>; +} + +//===----------------------------------------------------------------------===// +// ARMPL calls +//===----------------------------------------------------------------------===// + +defset list<RuntimeLibcallImpl> ARMPLLibcalls = { + def armpl_vmodfq_f64 : RuntimeLibcallImpl<MODF_V2F64>; // CallingConv::AArch64_VectorCall + def armpl_vmodfq_f32 : RuntimeLibcallImpl<MODF_V4F32>; // CallingConv::AArch64_VectorCall + def armpl_svmodf_f64_x : RuntimeLibcallImpl<MODF_NXV2F64>; + def armpl_svmodf_f32_x : RuntimeLibcallImpl<MODF_NXV4F32>; + + def armpl_vsincosq_f64 + : RuntimeLibcallImpl<SINCOS_V2F64>; // CallingConv::AArch64_VectorCall + def armpl_vsincosq_f32 + : RuntimeLibcallImpl<SINCOS_V4F32>; // CallingConv::AArch64_VectorCall + def armpl_svsincos_f64_x : RuntimeLibcallImpl<SINCOS_NXV2F64>; + def armpl_svsincos_f32_x : RuntimeLibcallImpl<SINCOS_NXV4F32>; + + def armpl_vsincospiq_f32 : RuntimeLibcallImpl<SINCOSPI_V4F32>; + def armpl_vsincospiq_f64 : RuntimeLibcallImpl<SINCOSPI_V2F64>; + def armpl_svsincospi_f32_x : RuntimeLibcallImpl<SINCOSPI_NXV4F32>; + def armpl_svsincospi_f64_x : RuntimeLibcallImpl<SINCOSPI_NXV2F64>; +} + //===----------------------------------------------------------------------===// // F128 libm Runtime Libcalls //===----------------------------------------------------------------------===// @@ -1206,7 +1261,9 @@ defvar DefaultLibcallImpls32 = (add DefaultRuntimeLibcallImpls); defvar DefaultLibcallImpls64 = (add DefaultRuntimeLibcallImpls, Int128RTLibcalls); -defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret), +// TODO: Guessing sincospi added at same time as sincos_stret +defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret, + __sincospif, __sincospi), darwinHasSinCosStret>; defvar DarwinExp10 = LibcallImpls<(add __exp10f, __exp10), darwinHasExp10>; @@ -1585,7 +1642,7 @@ def __aeabi_f2ulz : RuntimeLibcallImpl<FPTOUINT_F32_I64>; // CallingConv::ARM_AA // RTABI chapter 4.1.2, Table 7 def __aeabi_d2f : RuntimeLibcallImpl<FPROUND_F64_F32>; // CallingConv::ARM_AAPCS def __aeabi_d2h : RuntimeLibcallImpl<FPROUND_F64_F16>; // CallingConv::ARM_AAPCS -def __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS +def __aeabi_f2d : RuntimeLibcallImpl<FPEXT_F32_F64>; // CallingConv::ARM_AAPCS // Integer to floating-point conversions. 
// RTABI chapter 4.1.2, Table 8 @@ -2333,7 +2390,7 @@ defset list<RuntimeLibcallImpl> PPCRuntimeLibcalls = { defset list<RuntimeLibcallImpl> PPC64AIXCallList = { def ___memcmp64 : RuntimeLibcallImpl<MEMCMP>; - def ___memmove64 : RuntimeLibcallImpl<MEMCPY>; + def ___memmove64 : RuntimeLibcallImpl<MEMMOVE>; def ___memset64 : RuntimeLibcallImpl<MEMSET>; def ___bzero64 : RuntimeLibcallImpl<BZERO>; def ___strlen64 : RuntimeLibcallImpl<STRLEN>; diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td index b5752c1b69ad8..92853125379f5 100644 --- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td +++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td @@ -61,7 +61,6 @@ class RuntimeLibcall { class RuntimeLibcallImpl<RuntimeLibcall P, string Name = NAME> { RuntimeLibcall Provides = P; string LibCallFuncName = Name; - list<LibcallLoweringPredicate> LoweringPredicates; bit IsDefault = false; } diff --git a/llvm/include/llvm/IR/SystemLibraries.h b/llvm/include/llvm/IR/SystemLibraries.h new file mode 100644 index 0000000000000..1713b07c1c86f --- /dev/null +++ b/llvm/include/llvm/IR/SystemLibraries.h @@ -0,0 +1,39 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_SYSTEMLIBRARIES_H +#define LLVM_IR_SYSTEMLIBRARIES_H + +namespace llvm { +/// List of known vector-functions libraries. +/// +/// The vector-functions library defines which functions are vectorizable +/// and with which factor. The library can be specified by the frontend +/// or by a command-line option, and is then used by +/// addVectorizableFunctionsFromVecLib to fill in the tables of +/// vectorizable functions. +enum class VectorLibrary { + NoLibrary, // Don't use any vector library. + Accelerate, // Use Accelerate framework. + DarwinLibSystemM, // Use Darwin's libsystem_m. + LIBMVEC, // GLIBC Vector Math library. + MASSV, // IBM MASS vector library. + SVML, // Intel short vector math library. + SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. + ArmPL, // Arm Performance Libraries. + AMDLIBM // AMD Math Vector library.
+}; + +/// Command line flag value for the vector math library to use +/// +/// FIXME: This should come from a module flag, and not be mutually exclusive +extern VectorLibrary ClVectorLibrary; + +} // namespace llvm + +#endif // LLVM_IR_SYSTEMLIBRARIES_H diff --git a/llvm/include/llvm/IR/TrackingMDRef.h b/llvm/include/llvm/IR/TrackingMDRef.h index d7377398b91b3..7ad7225d076fc 100644 --- a/llvm/include/llvm/IR/TrackingMDRef.h +++ b/llvm/include/llvm/IR/TrackingMDRef.h @@ -111,17 +111,14 @@ template <class T> class TypedTrackingMDRef { explicit TypedTrackingMDRef(T *MD) : Ref(static_cast<Metadata *>(MD)) {} TypedTrackingMDRef(TypedTrackingMDRef &&X) : Ref(std::move(X.Ref)) {} - TypedTrackingMDRef(const TypedTrackingMDRef &X) : Ref(X.Ref) {} + TypedTrackingMDRef(const TypedTrackingMDRef &X) = default; TypedTrackingMDRef &operator=(TypedTrackingMDRef &&X) { Ref = std::move(X.Ref); return *this; } - TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) { - Ref = X.Ref; - return *this; - } + TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) = default; T *get() const { return (T *)Ref.get(); } operator T *() const { return get(); } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 581b4ad161daa..10a4d8525a9e8 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -90,7 +90,6 @@ LLVM_ABI void initializeDSELegacyPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); -LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &); LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &); LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &); @@ -291,6 +290,7 @@ LLVM_ABI void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &); LLVM_ABI void initializeRenameIndependentSubregsLegacyPass(PassRegistry &); LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &); LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &); +LLVM_ABI void initializeRuntimeLibraryInfoWrapperPass(PassRegistry &); LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &); LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &); LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/MC/DXContainerPSVInfo.h b/llvm/include/llvm/MC/DXContainerPSVInfo.h index 3a2d2949d0223..eb6d9e14d92c3 100644 --- a/llvm/include/llvm/MC/DXContainerPSVInfo.h +++ b/llvm/include/llvm/MC/DXContainerPSVInfo.h @@ -17,7 +17,6 @@ #include "llvm/TargetParser/Triple.h" #include <array> -#include <numeric> #include <stdint.h> namespace llvm { diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 7a2e9ad154f01..ea8ac6dbe6e34 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -401,7 +401,7 @@ class LLVM_ABI MCAsmInfo { // Generated object files can use all ELF features supported by GNU ld of // this binutils version and later. INT_MAX means all features can be used, // regardless of GNU ld support. The default value is referenced by - // clang/Driver/Options.td. + // clang/Options/Options.td. std::pair<int, int> BinutilsVersion = {2, 26}; /// Should we use the integrated assembler? 
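// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] A hypothetical consumer of the new
// SystemLibraries.h interface above; only VectorLibrary and ClVectorLibrary
// come from the patch, the helper itself is illustrative.
#include "llvm/IR/SystemLibraries.h"
using namespace llvm;

static bool vecLibHasScalableVariants() {
  switch (ClVectorLibrary) {
  case VectorLibrary::SLEEFGNUABI:
  case VectorLibrary::ArmPL:
    // Both provide scalable (SVE) entry points, e.g. the _ZGVsNxv* and
    // armpl_sv*_x symbols registered in RuntimeLibcalls.td above.
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------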
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 6e1d6421b8d33..dbae271a1c198 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -19,15 +19,12 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <memory> #include <string> -#include <tuple> #include <utility> -#include <vector> namespace llvm { @@ -198,8 +195,8 @@ class MCAssembler { const_iterator end() const { return Sections.end(); } SmallVectorImpl<const MCSymbol *> &getSymbols() { return Symbols; } - iterator_range<pointee_iterator< - typename SmallVector<const MCSymbol *, 0>::const_iterator>> + iterator_range< + pointee_iterator<SmallVector<const MCSymbol *, 0>::const_iterator>> symbols() const { return make_pointee_range(Symbols); } diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index 11d32fbb64702..c514b768637d1 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -21,7 +21,6 @@ #include <cassert> #include <cstddef> #include <string> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index e3f44a08db641..5d74b76592df9 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -209,28 +209,25 @@ class LLVM_ABI MCAsmParser { MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0; /// Emit a note at the location \p L, with the message \p Msg. - virtual void Note(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Emit a warning at the location \p L, with the message \p Msg. /// /// \return The return value is true, if warnings are fatal. - virtual bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; /// Return an error at the location \p L, with the message \p Msg. This /// may be modified before being emitted. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt); + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}); /// Emit an error at the location \p L, with the message \p Msg. /// /// \return The return value is always true, as an idiomatic convenience to /// clients. - virtual bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) = 0; + virtual bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) = 0; bool hasPendingError() { return !PendingErrors.empty(); } @@ -255,7 +252,7 @@ class LLVM_ABI MCAsmParser { const AsmToken &getTok() const; /// Report an error at the current lexer location. 
- bool TokError(const Twine &Msg, SMRange Range = std::nullopt); + bool TokError(const Twine &Msg, SMRange Range = {}); bool parseTokenLoc(SMLoc &Loc); bool parseToken(AsmToken::TokenKind T, const Twine &Msg = "unexpected token"); diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index e6fc7077a2dc3..e6dbb38dfee67 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -272,7 +272,7 @@ class LLVM_ABI MCRegisterInfo { friend class MCRegUnitRootIterator; friend class MCRegAliasIterator; - virtual ~MCRegisterInfo() {} + virtual ~MCRegisterInfo() = default; /// Initialize MCRegisterInfo, called by TableGen /// auto-generated routines. *DO NOT USE*. @@ -687,7 +687,7 @@ class MCRegUnitMaskIterator { } /// Returns a (RegUnit, LaneMask) pair. - std::pair<unsigned,LaneBitmask> operator*() const { + std::pair<MCRegUnit, LaneBitmask> operator*() const { return std::make_pair(*RUIter, *MaskListIter); } @@ -719,7 +719,7 @@ class MCRegUnitRootIterator { public: MCRegUnitRootIterator() = default; - MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) { + MCRegUnitRootIterator(MCRegUnit RegUnit, const MCRegisterInfo *MCRI) { assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit"); Reg0 = MCRI->RegUnitRoots[RegUnit][0]; Reg1 = MCRI->RegUnitRoots[RegUnit][1]; diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h index 3cdbf84748c79..b6b5b5979dec9 100644 --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -26,8 +26,6 @@ #include "llvm/Support/raw_ostream.h" #endif -#include <memory> - namespace llvm { namespace mca { diff --git a/llvm/include/llvm/MCA/SourceMgr.h b/llvm/include/llvm/MCA/SourceMgr.h index 16a60d1116ad6..300961cbfcd69 100644 --- a/llvm/include/llvm/MCA/SourceMgr.h +++ b/llvm/include/llvm/MCA/SourceMgr.h @@ -50,7 +50,7 @@ struct SourceMgr { /// Advance to the next \a SourceRef. virtual void updateNext() = 0; - virtual ~SourceMgr() {} + virtual ~SourceMgr() = default; }; /// The default implementation of \a SourceMgr. 
It always takes a fixed number diff --git a/llvm/include/llvm/ObjCopy/ConfigManager.h b/llvm/include/llvm/ObjCopy/ConfigManager.h index 15687998820c5..45f847ff7c434 100644 --- a/llvm/include/llvm/ObjCopy/ConfigManager.h +++ b/llvm/include/llvm/ObjCopy/ConfigManager.h @@ -23,7 +23,7 @@ namespace llvm { namespace objcopy { struct LLVM_ABI ConfigManager : public MultiFormatConfig { - ~ConfigManager() override {} + ~ConfigManager() override = default; const CommonConfig &getCommonConfig() const override { return Common; } diff --git a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h index bb93f64aa2788..91baf9b286c58 100644 --- a/llvm/include/llvm/ObjCopy/MultiFormatConfig.h +++ b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h @@ -24,7 +24,7 @@ struct DXContainerConfig; class MultiFormatConfig { public: - virtual ~MultiFormatConfig() {} + virtual ~MultiFormatConfig() = default; virtual const CommonConfig &getCommonConfig() const = 0; virtual Expected<const ELFConfig &> getELFConfig() const = 0; diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index 59f63eb6b5bb6..cc1e5f9dcb9da 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -261,6 +261,8 @@ class ELFFile { ELFFile(const ELFFile &) = default; ELFFile &operator=(const ELFFile &) = default; + ELFFile(ELFFile &&) = default; + // This is a callback that can be passed to a number of functions. // It can be used to ignore non-critical errors (warnings), which is // useful for dumpers, like llvm-readobj. @@ -278,9 +280,46 @@ class ELFFile { std::vector<Elf_Shdr> FakeSections; SmallString<0> FakeSectionStrings; + // When the number of program headers is >= PN_XNUM, the actual number is + // contained in the sh_info field of the section header at index 0. + std::optional<uint32_t> RealPhNum; + // When the number of section headers is >= SHN_LORESERVE, the actual number + // is contained in the sh_size field of the section header at index 0. + std::optional<uint64_t> RealShNum; + // When the section index of the section name table is >= SHN_LORESERVE, the + // actual number is contained in the sh_link field of the section header at + // index 0. + std::optional<uint32_t> RealShStrNdx; + ELFFile(StringRef Object); + Error readShdrZero(); + public: + Expected<uint32_t> getPhNum() const { + if (!RealPhNum) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealPhNum; + } + + Expected<uint64_t> getShNum() const { + if (!RealShNum) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShNum; + } + + Expected<uint32_t> getShStrNdx() const { + if (!RealShStrNdx) { + if (Error E = const_cast<ELFFile<ELFT> *>(this)->readShdrZero()) + return std::move(E); + } + return *RealShStrNdx; + } + const Elf_Ehdr &getHeader() const { return *reinterpret_cast<const Elf_Ehdr *>(base()); } @@ -379,22 +418,26 @@ class ELFFile { /// Iterate over program header table. 
Expected<Elf_Phdr_Range> program_headers() const { - if (getHeader().e_phnum && getHeader().e_phentsize != sizeof(Elf_Phdr)) + uint32_t NumPh; + if (Expected<uint32_t> PhNumOrErr = getPhNum()) + NumPh = *PhNumOrErr; + else + return PhNumOrErr.takeError(); + if (NumPh && getHeader().e_phentsize != sizeof(Elf_Phdr)) return createError("invalid e_phentsize: " + Twine(getHeader().e_phentsize)); - uint64_t HeadersSize = - (uint64_t)getHeader().e_phnum * getHeader().e_phentsize; + uint64_t HeadersSize = (uint64_t)NumPh * getHeader().e_phentsize; uint64_t PhOff = getHeader().e_phoff; if (PhOff + HeadersSize < PhOff || PhOff + HeadersSize > getBufSize()) return createError("program headers are longer than binary of size " + Twine(getBufSize()) + ": e_phoff = 0x" + Twine::utohexstr(getHeader().e_phoff) + - ", e_phnum = " + Twine(getHeader().e_phnum) + + ", e_phnum = " + Twine(NumPh) + ", e_phentsize = " + Twine(getHeader().e_phentsize)); auto *Begin = reinterpret_cast<const Elf_Phdr *>(base() + PhOff); - return ArrayRef(Begin, Begin + getHeader().e_phnum); + return ArrayRef(Begin, Begin + NumPh); } /// Get an iterator over notes in a program header. @@ -772,19 +815,15 @@ template <class ELFT> Expected<StringRef> ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections, WarningHandler WarnHandler) const { - uint32_t Index = getHeader().e_shstrndx; - if (Index == ELF::SHN_XINDEX) { - // If the section name string table section index is greater than - // or equal to SHN_LORESERVE, then the actual index of the section name - // string table section is contained in the sh_link field of the section - // header at index 0. - if (Sections.empty()) - return createError( - "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + Expected<uint32_t> ShStrNdxOrErr = getShStrNdx(); + if (!ShStrNdxOrErr) + return ShStrNdxOrErr.takeError(); - Index = Sections[0].sh_link; - } + if (*ShStrNdxOrErr == ELF::SHN_XINDEX && Sections.empty()) + return createError( + "e_shstrndx == SHN_XINDEX, but the section header table is empty"); + uint32_t Index = *ShStrNdxOrErr; // There is no section name string table. Return FakeSectionStrings which // is non-empty if we have created fake sections. if (!Index) @@ -891,6 +930,35 @@ Expected<uint64_t> ELFFile<ELFT>::getDynSymtabSize() const { template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {} +template <class ELFT> Error ELFFile<ELFT>::readShdrZero() { + const Elf_Ehdr &Header = getHeader(); + + if ((Header.e_phnum == ELF::PN_XNUM || Header.e_shnum == 0 || + Header.e_shstrndx == ELF::SHN_XINDEX) && + Header.e_shoff != 0) { + // Pretend we have section 0 or sections() would call getShNum and thus + // become an infinite recursion. + RealShNum = 1; + auto SecOrErr = getSection(0); + if (!SecOrErr) { + RealShNum = std::nullopt; + return SecOrErr.takeError(); + } + + RealPhNum = + Header.e_phnum == ELF::PN_XNUM ? (*SecOrErr)->sh_info : Header.e_phnum; + RealShNum = Header.e_shnum == 0 ? (*SecOrErr)->sh_size : Header.e_shnum; + RealShStrNdx = Header.e_shstrndx == ELF::SHN_XINDEX ? 
(*SecOrErr)->sh_link + : Header.e_shstrndx; + } else { + RealPhNum = Header.e_phnum; + RealShNum = Header.e_shnum; + RealShStrNdx = Header.e_shstrndx; + } + + return Error::success(); +} + template <class ELFT> Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) { if (sizeof(Elf_Ehdr) > Object.size()) @@ -956,9 +1024,11 @@ Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const { const Elf_Shdr *First = reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset); - uintX_t NumSections = getHeader().e_shnum; - if (NumSections == 0) - NumSections = First->sh_size; + uintX_t NumSections = 0; + if (Expected<uint64_t> ShNumOrErr = getShNum()) + NumSections = *ShNumOrErr; + else + return ShNumOrErr.takeError(); if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) return createError("invalid number of sections specified in the NULL " diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index ced1afdd4cc6a..ca4135742bf6b 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -1218,12 +1218,12 @@ ELFObjectFile<ELFT>::ELFObjectFile(MemoryBufferRef Object, ELFFile<ELFT> EF, : ELFObjectFileBase(getELFType(ELFT::Endianness == llvm::endianness::little, ELFT::Is64Bits), Object), - EF(EF), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec), + EF(std::move(EF)), DotDynSymSec(DotDynSymSec), DotSymtabSec(DotSymtabSec), DotSymtabShndxSec(DotSymtabShndx) {} template <class ELFT> ELFObjectFile<ELFT>::ELFObjectFile(ELFObjectFile<ELFT> &&Other) - : ELFObjectFile(Other.Data, Other.EF, Other.DotDynSymSec, + : ELFObjectFile(Other.Data, std::move(Other.EF), Other.DotDynSymSec, Other.DotSymtabSec, Other.DotSymtabShndxSec) {} template <class ELFT> diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index e9a417d3d4fb3..467ab6fd3c1e9 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ b/llvm/include/llvm/Object/ELFTypes.h @@ -834,30 +834,32 @@ struct BBAddrMap { bool OmitBBEntries : 1; bool CallsiteEndOffsets : 1; bool BBHash : 1; + bool PostLinkCfg : 1; bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; } bool hasPGOAnalysisBBData() const { return BBFreq || BrProb; } // Encodes to minimum bit width representation. - uint8_t encode() const { - return (static_cast<uint8_t>(FuncEntryCount) << 0) | - (static_cast<uint8_t>(BBFreq) << 1) | - (static_cast<uint8_t>(BrProb) << 2) | - (static_cast<uint8_t>(MultiBBRange) << 3) | - (static_cast<uint8_t>(OmitBBEntries) << 4) | - (static_cast<uint8_t>(CallsiteEndOffsets) << 5) | - (static_cast<uint8_t>(BBHash) << 6); + uint16_t encode() const { + return (static_cast<uint16_t>(FuncEntryCount) << 0) | + (static_cast<uint16_t>(BBFreq) << 1) | + (static_cast<uint16_t>(BrProb) << 2) | + (static_cast<uint16_t>(MultiBBRange) << 3) | + (static_cast<uint16_t>(OmitBBEntries) << 4) | + (static_cast<uint16_t>(CallsiteEndOffsets) << 5) | + (static_cast<uint16_t>(BBHash) << 6) | + (static_cast<uint16_t>(PostLinkCfg) << 7); } // Decodes from minimum bit width representation and validates no // unnecessary bits are used. 
- static Expected<Features> decode(uint8_t Val) { + static Expected<Features> decode(uint16_t Val) { Features Feat{ static_cast<bool>(Val & (1 << 0)), static_cast<bool>(Val & (1 << 1)), static_cast<bool>(Val & (1 << 2)), static_cast<bool>(Val & (1 << 3)), static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5)), - static_cast<bool>(Val & (1 << 6))}; + static_cast<bool>(Val & (1 << 6)), static_cast<bool>(Val & (1 << 7))}; if (Feat.encode() != Val) return createStringError( std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x", @@ -867,10 +869,11 @@ struct BBAddrMap { bool operator==(const Features &Other) const { return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange, - OmitBBEntries, CallsiteEndOffsets, BBHash) == + OmitBBEntries, CallsiteEndOffsets, BBHash, PostLinkCfg) == std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb, Other.MultiBBRange, Other.OmitBBEntries, - Other.CallsiteEndOffsets, Other.BBHash); + Other.CallsiteEndOffsets, Other.BBHash, + Other.PostLinkCfg); } }; @@ -1010,23 +1013,30 @@ struct PGOAnalysisMap { /// probability associated with it. struct SuccessorEntry { /// Unique ID of this successor basic block. - uint32_t ID; + uint32_t ID = 0; /// Branch Probability of the edge to this successor taken from MBPI. BranchProbability Prob; + /// Raw edge count from the post link profile (e.g., from bolt or + /// propeller). + uint64_t PostLinkFreq = 0; bool operator==(const SuccessorEntry &Other) const { - return std::tie(ID, Prob) == std::tie(Other.ID, Other.Prob); + return std::tie(ID, Prob, PostLinkFreq) == + std::tie(Other.ID, Other.Prob, Other.PostLinkFreq); } }; /// Block frequency taken from MBFI BlockFrequency BlockFreq; + /// Raw block count taken from the post link profile (e.g., from bolt or + /// propeller). 
+ uint64_t PostLinkBlockFreq = 0; /// List of successors of the current block llvm::SmallVector<SuccessorEntry, 2> Successors; bool operator==(const PGOBBEntry &Other) const { - return std::tie(BlockFreq, Successors) == - std::tie(Other.BlockFreq, Other.Successors); + return std::tie(BlockFreq, PostLinkBlockFreq, Successors) == + std::tie(Other.BlockFreq, Other.PostLinkBlockFreq, Other.Successors); } }; diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/include/llvm/Object/SFrameParser.h b/llvm/include/llvm/Object/SFrameParser.h index 3ce5d70142a9f..23298357191b3 100644 --- a/llvm/include/llvm/Object/SFrameParser.h +++ b/llvm/include/llvm/Object/SFrameParser.h @@ -90,7 +90,7 @@ template <endianness E> class SFrameParser<E>::FallibleFREIterator { uint32_t Idx, uint32_t Size, uint64_t Offset) : Data(Data), FREType(FREType), Idx(Idx), Size(Size), Offset(Offset) {} - Error inc(); + LLVM_ABI Error inc(); const FrameRowEntry &operator*() const { return FRE; } friend bool operator==(const FallibleFREIterator &LHS, diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h index 25ba27c7c7a22..a70c2388c5168 100644 --- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h +++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypeHashing.h @@ -21,7 +21,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include <cstdint> -#include <memory> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index b5b110d0f59a1..fbfe3069566d3 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -115,7 +115,7 @@ struct RootParameterHeaderYaml { dxbc::ShaderVisibility Visibility; uint32_t Offset; - RootParameterHeaderYaml(){}; + RootParameterHeaderYaml() = default; RootParameterHeaderYaml(dxbc::RootParameterType T) : Type(T) {} }; @@ -123,7 +123,7 @@ struct RootParameterLocationYaml { RootParameterHeaderYaml Header; std::optional<size_t> IndexInSignature; - RootParameterLocationYaml(){}; + RootParameterLocationYaml() = default; explicit RootParameterLocationYaml(RootParameterHeaderYaml Header) : Header(Header) {} }; diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index a7c7c7c436dc2..a8236ca37b5ed 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -166,7 +166,7 @@ struct BBAddrMapEntry { std::optional<llvm::yaml::Hex64> Hash; }; uint8_t Version; - llvm::yaml::Hex8 Feature; + llvm::yaml::Hex16 Feature; struct BBRangeEntry { llvm::yaml::Hex64 BaseAddress; @@ -203,8 +203,10 @@ struct PGOAnalysisMapEntry { struct SuccessorEntry { uint32_t ID; llvm::yaml::Hex32 BrProb; + std::optional<uint32_t>
PostLinkBrFreq; }; std::optional<uint64_t> BBFreq; + std::optional<uint32_t> PostLinkBBFreq; std::optional<std::vector<SuccessorEntry>> Successors; }; std::optional<uint64_t> FuncEntryCount; diff --git a/llvm/include/llvm/ObjectYAML/GOFFYAML.h b/llvm/include/llvm/ObjectYAML/GOFFYAML.h index f9bf45e95bd3a..74aeade54b8f9 100644 --- a/llvm/include/llvm/ObjectYAML/GOFFYAML.h +++ b/llvm/include/llvm/ObjectYAML/GOFFYAML.h @@ -17,7 +17,6 @@ #include "llvm/BinaryFormat/GOFF.h" #include "llvm/ObjectYAML/YAML.h" #include <cstdint> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h index b1e56b58da684..496373d28600f 100644 --- a/llvm/include/llvm/Option/Arg.h +++ b/llvm/include/llvm/Option/Arg.h @@ -51,7 +51,7 @@ class Arg { /// Was this argument used to affect compilation? /// /// This is used to generate an "argument unused" warning (without - /// clang::driver::options::TargetSpecific) or "unsupported option" error + /// clang::options::TargetSpecific) or "unsupported option" error /// (with TargetSpecific). mutable unsigned Claimed : 1; diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h index f641ca4ac08d3..45083b31c11f4 100644 --- a/llvm/include/llvm/Option/OptTable.h +++ b/llvm/include/llvm/Option/OptTable.h @@ -148,15 +148,13 @@ class LLVM_ABI OptTable { StringRef SubCommand) const { assert(!SubCommand.empty() && "This helper is only for valid registered subcommands."); - auto SCIT = - std::find_if(SubCommands.begin(), SubCommands.end(), - [&](const auto &C) { return SubCommand == C.Name; }); + auto SCIT = llvm::find_if( + SubCommands, [&](const auto &C) { return SubCommand == C.Name; }); assert(SCIT != SubCommands.end() && "This helper is only for valid registered subcommands."); auto SubCommandIDs = CandidateInfo->getSubCommandIDs(SubCommandIDsTable); unsigned CurrentSubCommandID = SCIT - &SubCommands[0]; - return std::find(SubCommandIDs.begin(), SubCommandIDs.end(), - CurrentSubCommandID) != SubCommandIDs.end(); + return llvm::is_contained(SubCommandIDs, CurrentSubCommandID); } private: diff --git a/llvm/include/llvm/PassInfo.h b/llvm/include/llvm/PassInfo.h index 380d6698d0c80..5734eb8bfb47e 100644 --- a/llvm/include/llvm/PassInfo.h +++ b/llvm/include/llvm/PassInfo.h @@ -15,7 +15,6 @@ #include "llvm/ADT/StringRef.h" #include <cassert> -#include <vector> namespace llvm { diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 2f20792568e63..03777c7fcb45f 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -20,6 +20,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/RuntimeLibcallInfo.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" @@ -127,7 +128,6 @@ #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include <cassert> -#include <type_traits> #include <utility> namespace llvm { @@ -638,6 +638,8 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::buildPipeline( /*Force=*/true); addIRPass(RequireAnalysisPass<CollectorMetadataAnalysis, Module>(), /*Force=*/true); + addIRPass(RequireAnalysisPass<RuntimeLibraryAnalysis, Module>(), + /*Force=*/true); addISelPasses(addIRPass); } diff --git 
a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 8538a8b2afe14..8fa21f2cb2dd6 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -742,7 +742,7 @@ class PassBuilder { void addRequiredLTOPreLinkPasses(ModulePassManager &MPM); void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, - bool IsFullLTO); + ThinOrFullLTOPhase LTOPhase); static std::optional<std::vector<PipelineElement>> parsePipelineText(StringRef Text); diff --git a/llvm/include/llvm/ProfileData/DataAccessProf.h b/llvm/include/llvm/ProfileData/DataAccessProf.h index 608306f02be66..ea256ef7b170b 100644 --- a/llvm/include/llvm/ProfileData/DataAccessProf.h +++ b/llvm/include/llvm/ProfileData/DataAccessProf.h @@ -42,7 +42,7 @@ struct SourceLocation { : FileName(FileNameRef.str()), Line(Line) {} // Empty constructor is used in yaml conversion. - SourceLocation() {} + SourceLocation() = default; /// The filename where the data is located. std::string FileName; /// The line number in the source code. diff --git a/llvm/include/llvm/ProfileData/HashKeyMap.h b/llvm/include/llvm/ProfileData/HashKeyMap.h index b2f1bf222157b..fceb95143340f 100644 --- a/llvm/include/llvm/ProfileData/HashKeyMap.h +++ b/llvm/include/llvm/ProfileData/HashKeyMap.h @@ -16,7 +16,6 @@ #define LLVM_PROFILEDATA_HASHKEYMAP_H #include "llvm/ADT/Hashing.h" -#include <iterator> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 85a9efe73855b..f59ddc3e59324 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -41,7 +41,6 @@ #include <cstddef> #include <cstdint> #include <cstring> -#include <list> #include <memory> #include <string> #include <system_error> @@ -1058,8 +1057,10 @@ struct NamedInstrProfRecord : InstrProfRecord { StringRef Name; uint64_t Hash; - // We reserve this bit as the flag for context sensitive profile record. - static const int CS_FLAG_IN_FUNC_HASH = 60; + // We reserve the highest 4 bits as flags. + static constexpr uint64_t FUNC_HASH_MASK = 0x0FFF'FFFF'FFFF'FFFF; + // The 60th bit is for context sensitive profile record. + static constexpr unsigned CS_FLAG_IN_FUNC_HASH = 60; NamedInstrProfRecord() = default; NamedInstrProfRecord(StringRef Name, uint64_t Hash, @@ -1174,7 +1175,9 @@ enum ProfVersion { Version11 = 11, // VTable profiling, decision record and bitmap are modified for mcdc. Version12 = 12, - // The current version is 12. + // In this version, the frontend PGO stable hash algorithm defaults to V4. + Version13 = 13, + // The current version is 13. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 0496f240dc823..46d6bb5bd8896 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -722,7 +722,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 13 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 diff --git a/llvm/include/llvm/ProfileData/MemProfYAML.h b/llvm/include/llvm/ProfileData/MemProfYAML.h index d66e16dda51d6..c55f7806d73a6 100644 --- a/llvm/include/llvm/ProfileData/MemProfYAML.h +++ b/llvm/include/llvm/ProfileData/MemProfYAML.h @@ -141,7 +141,7 @@ template <> struct CustomMappingTraits<memprof::PortableMemInfoBlock> { #define MIBEntryDef(NameTag, Name, Type) \ if (KeyStr == #Name) { \ uint64_t Value; \ - Io.mapRequired(KeyStr.str().c_str(), Value); \ + Io.mapRequired(KeyStr, Value); \ MIB.Name = static_cast<Type>(Value); \ MIB.Schema.set(llvm::to_underlying(memprof::Meta::Name)); \ return; \ diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 799938ab901c1..67834f72c2400 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -244,7 +244,6 @@ #include <optional> #include <string> #include <system_error> -#include <unordered_set> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h index 69b8f9f000e1d..af9d809833023 100644 --- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h +++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h @@ -16,7 +16,6 @@ #include "llvm/Remarks/RemarkSerializer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" -#include <optional> namespace llvm { namespace remarks { diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index 7d8b2c86e94a7..a8966db29ab26 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -51,7 +51,7 @@ class Context { // Uses a 64-bit integer so we don't have to worry about the unlikely case // of overflowing a 32-bit counter. using ValTy = uint64_t; - static constexpr const ValTy InvalidVal = 0; + static constexpr ValTy InvalidVal = 0; private: // Default initialization results in an invalid ID. diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index e1c1ca039a8a0..d928068f0bf27 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -1866,7 +1866,7 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { friend class Context; // For accessing the constructor in create*() public: - static constexpr const unsigned DefaultPseudoIndex = + static constexpr unsigned DefaultPseudoIndex = llvm::SwitchInst::DefaultPseudoIndex; LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest, @@ -1884,22 +1884,96 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { return cast<llvm::SwitchInst>(Val)->getNumCases(); } + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseItImpl; + + // The template helps avoid code duplication for const and non-const + // CaseHandle variants. + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseHandleImpl { + Context &Ctx; + // NOTE: We are not wrapping an LLVM CaseHandle here because it is not + // default-constructible. Instead we are wrapping the LLVM CaseIt + // iterator, as we can always get an LLVM CaseHandle by de-referencing it.
+ LLVMCaseItT LLVMCaseIt; + template <typename T1, typename T2, typename T3> friend class CaseItImpl; + + public: + CaseHandleImpl(Context &Ctx, LLVMCaseItT LLVMCaseIt) + : Ctx(Ctx), LLVMCaseIt(LLVMCaseIt) {} + ConstT *getCaseValue() const; + BlockT *getCaseSuccessor() const; + unsigned getCaseIndex() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + return LLVMCaseHandle.getCaseIndex(); + } + unsigned getSuccessorIndex() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + return LLVMCaseHandle.getSuccessorIndex(); + } + }; + + // The template helps avoid code duplication for const and non-const CaseIt + // variants. + template <typename LLVMCaseItT, typename BlockT, typename ConstT> + class CaseItImpl : public iterator_facade_base< + CaseItImpl<LLVMCaseItT, BlockT, ConstT>, + std::random_access_iterator_tag, + const CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>> { + CaseHandleImpl<LLVMCaseItT, BlockT, ConstT> CH; + + public: + CaseItImpl(Context &Ctx, LLVMCaseItT It) : CH(Ctx, It) {} + CaseItImpl(SwitchInst *SI, ptrdiff_t CaseNum) + : CH(SI->getContext(), llvm::SwitchInst::CaseIt( + cast<llvm::SwitchInst>(SI->Val), CaseNum)) {} + CaseItImpl &operator+=(ptrdiff_t N) { + CH.LLVMCaseIt += N; + return *this; + } + CaseItImpl &operator-=(ptrdiff_t N) { + CH.LLVMCaseIt -= N; + return *this; + } + ptrdiff_t operator-(const CaseItImpl &Other) const { + return CH.LLVMCaseIt - Other.CH.LLVMCaseIt; + } + bool operator==(const CaseItImpl &Other) const { + return CH.LLVMCaseIt == Other.CH.LLVMCaseIt; + } + bool operator<(const CaseItImpl &Other) const { + return CH.LLVMCaseIt < Other.CH.LLVMCaseIt; + } + const CaseHandleImpl<LLVMCaseItT, BlockT, ConstT> &operator*() const { + return CH; + } + }; + using CaseHandle = - llvm::SwitchInst::CaseHandleImpl<SwitchInst, ConstantInt, BasicBlock>; - using ConstCaseHandle = - llvm::SwitchInst::CaseHandleImpl<const SwitchInst, const ConstantInt, - const BasicBlock>; - using CaseIt = llvm::SwitchInst::CaseIteratorImpl<CaseHandle>; - using ConstCaseIt = llvm::SwitchInst::CaseIteratorImpl<ConstCaseHandle>; + CaseHandleImpl<llvm::SwitchInst::CaseIt, BasicBlock, ConstantInt>; + using CaseIt = CaseItImpl<llvm::SwitchInst::CaseIt, BasicBlock, ConstantInt>; + + using ConstCaseHandle = CaseHandleImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; + using ConstCaseIt = CaseItImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; /// Returns a read/write iterator that points to the first case in the /// SwitchInst. - CaseIt case_begin() { return CaseIt(this, 0); } - ConstCaseIt case_begin() const { return ConstCaseIt(this, 0); } + CaseIt case_begin() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_begin()); + } + ConstCaseIt case_begin() const { + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_begin()); + } /// Returns a read/write iterator that points one past the last in the /// SwitchInst. - CaseIt case_end() { return CaseIt(this, getNumCases()); } - ConstCaseIt case_end() const { return ConstCaseIt(this, getNumCases()); } + CaseIt case_end() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_end()); + } + ConstCaseIt case_end() const { + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_end()); + } /// Iteration adapter for range-for loops. 
iterator_range<CaseIt> cases() { return make_range(case_begin(), case_end()); @@ -1907,22 +1981,19 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> { iterator_range<ConstCaseIt> cases() const { return make_range(case_begin(), case_end()); } - CaseIt case_default() { return CaseIt(this, DefaultPseudoIndex); } + CaseIt case_default() { + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_default()); + } ConstCaseIt case_default() const { - return ConstCaseIt(this, DefaultPseudoIndex); + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->case_default()); } CaseIt findCaseValue(const ConstantInt *C) { - return CaseIt( - this, - const_cast<const SwitchInst *>(this)->findCaseValue(C)->getCaseIndex()); + const llvm::ConstantInt *LLVMC = cast<llvm::ConstantInt>(C->Val); + return CaseIt(Ctx, cast<llvm::SwitchInst>(Val)->findCaseValue(LLVMC)); } ConstCaseIt findCaseValue(const ConstantInt *C) const { - ConstCaseIt I = llvm::find_if(cases(), [C](const ConstCaseHandle &Case) { - return Case.getCaseValue() == C; - }); - if (I != case_end()) - return I; - return case_default(); + const llvm::ConstantInt *LLVMC = cast<llvm::ConstantInt>(C->Val); + return ConstCaseIt(Ctx, cast<llvm::SwitchInst>(Val)->findCaseValue(LLVMC)); } LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB); diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index 267389a8a87a2..eb84f21483f8e 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -56,7 +56,7 @@ class Pass { "A pass name should not contain whitespaces!"); assert(!Name.starts_with('-') && "A pass name should not start with '-'!"); } - virtual ~Pass() {} + virtual ~Pass() = default; /// \Returns the name of the pass. StringRef getName() const { return Name; } #ifndef NDEBUG diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 93ca710805dd4..a8117aa3b9fa8 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -59,10 +59,10 @@ class PassManager : public ParentPass { Passes.push_back(std::move(Pass)); } - static constexpr const char EndToken = '\0'; - static constexpr const char BeginArgsToken = '<'; - static constexpr const char EndArgsToken = '>'; - static constexpr const char PassDelimToken = ','; + static constexpr char EndToken = '\0'; + static constexpr char BeginArgsToken = '<'; + static constexpr char EndArgsToken = '>'; + static constexpr char PassDelimToken = ','; /// Parses \p Pipeline as a comma-separated sequence of pass names and sets /// the pass pipeline, using \p CreatePass to instantiate passes by name. diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
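One note on the SandboxIR SwitchInst change above: CaseHandleImpl and CaseItImpl are each written once and instantiated twice, with the template parameters selecting either the mutable (BasicBlock, ConstantInt) or the const (const BasicBlock, const ConstantInt) flavor, while iterator_facade_base derives the remaining random-access operators (++, --, +, != and so on) from the few defined explicitly. A standalone sketch of the same const/non-const deduplication pattern over plain pointers (illustrative only, no LLVM dependencies):

    #include <cstddef>

    // One template, two instantiations: ElemT carries the const-ness, so the
    // operator logic is written exactly once.
    template <typename ElemT> class CaseIterSketch {
      ElemT *Pos;

    public:
      explicit CaseIterSketch(ElemT *P) : Pos(P) {}
      CaseIterSketch &operator+=(std::ptrdiff_t N) {
        Pos += N;
        return *this;
      }
      std::ptrdiff_t operator-(const CaseIterSketch &O) const {
        return Pos - O.Pos;
      }
      bool operator==(const CaseIterSketch &O) const { return Pos == O.Pos; }
      bool operator<(const CaseIterSketch &O) const { return Pos < O.Pos; }
      ElemT &operator*() const { return *Pos; }
    };

    using IterSketch = CaseIterSketch<int>;            // mutable variant
    using ConstIterSketch = CaseIterSketch<const int>; // const variant

In the patch the analogous pair of aliases is CaseIt/ConstCaseIt, built from llvm::SwitchInst::CaseIt and ConstCaseIt respectively.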
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h index 42fe225709ef8..b15d6e1707afa 100644 --- a/llvm/include/llvm/Support/BranchProbability.h +++ b/llvm/include/llvm/Support/BranchProbability.h @@ -97,6 +97,9 @@ class BranchProbability { /// \return \c Num divided by \c this. LLVM_ABI uint64_t scaleByInverse(uint64_t Num) const; + /// Compute pow(Probability, N). + BranchProbability pow(unsigned N) const; + BranchProbability &operator+=(BranchProbability RHS) { assert(N != UnknownN && RHS.N != UnknownN && "Unknown probability cannot participate in arithmetics."); diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 41004d755a124..88f4fe52d2019 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -21,7 +21,6 @@ #include "llvm/Support/type_traits.h" #include <cassert> #include <cstddef> -#include <iterator> // Two booleans are used to define orders in graphs: // InverseGraph defines when we need to reverse the whole graph and is as such diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..af283e2c8ada3 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... 
Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above. +template <typename To> +inline constexpr detail::StaticCastFunc<To> StaticCastTo{}; + +template <typename To> inline constexpr detail::CastFunc<To> CastTo{}; + +template <typename To> +inline constexpr detail::CastIfPresentFunc<To> CastIfPresentTo{}; + +template <typename To> +inline constexpr detail::DynCastIfPresentFunc<To> DynCastIfPresentTo{}; + +template <typename To> inline constexpr detail::DynCastFunc<To> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h index cd1f9167b996d..15df265556339 100644 --- a/llvm/include/llvm/Support/CodeGen.h +++ b/llvm/include/llvm/Support/CodeGen.h @@ -115,7 +115,13 @@ namespace llvm { }; // Specify what functions should keep the frame pointer. - enum class FramePointerKind { None, NonLeaf, All, Reserved }; + enum class FramePointerKind { + None, + NonLeaf, + All, + Reserved, + NonLeafNoReserve + }; // Specify what type of zeroing callee-used registers. 
namespace ZeroCallUsedRegs { diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 5a5f00e844705..d737fbcf891b3 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -2099,7 +2099,7 @@ getRegisteredOptions(SubCommand &Sub = SubCommand::getTopLevel()); /// /// This interface is useful for defining subcommands in libraries and /// the dispatch from a single point (like in the main function). -LLVM_ABI iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> +LLVM_ABI iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> getRegisteredSubcommands(); //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. 
room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ELFAttributeParser.h b/llvm/include/llvm/Support/ELFAttributeParser.h index 97350edb793c9..c2ad812b5d632 100644 --- a/llvm/include/llvm/Support/ELFAttributeParser.h +++ b/llvm/include/llvm/Support/ELFAttributeParser.h @@ -17,7 +17,7 @@ namespace llvm { class ELFAttributeParser { public: - virtual ~ELFAttributeParser() {} + virtual ~ELFAttributeParser() = default; virtual Error parse(ArrayRef<uint8_t> Section, llvm::endianness Endian) { return llvm::Error::success(); diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h index 8eaa5e382c73e..3377781873b8c 100644 --- a/llvm/include/llvm/Support/FormatProviders.h +++ b/llvm/include/llvm/Support/FormatProviders.h @@ -261,7 +261,7 @@ template <> struct format_provider<bool> { .Case("y", B ? "yes" : "no") .CaseLower("D", B ? "1" : "0") .Case("T", B ? "TRUE" : "FALSE") - .Cases("t", "", B ? "true" : "false") + .Cases({"t", ""}, B ? "true" : "false") .Default(B ? 
"1" : "0"); } }; diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 85652924491ba..fdd448f7b5a3a 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include <array> #include <cstddef> -#include <optional> #include <string> #include <tuple> #include <utility> diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index af542bae9f8c6..b6aae9f7928e3 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -35,7 +35,6 @@ #include <algorithm> #include <cassert> #include <cstddef> -#include <iterator> #include <memory> #include <type_traits> #include <utility> diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. 
/// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h index 3bef75cc7e508..43d9b0cfddef7 100644 --- a/llvm/include/llvm/Support/GraphWriter.h +++ b/llvm/include/llvm/Support/GraphWriter.h @@ -128,7 +128,7 @@ template <typename GraphType, typename Derived> class GraphWriterBase { DTraits = DOTTraits(SN); RenderUsingHTML = DTraits.renderNodesUsingHTML(); } - virtual ~GraphWriterBase() {} + virtual ~GraphWriterBase() = default; void writeGraph(const std::string &Title = "") { // Output the header for the graph... 
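Back in the FormatVariadicDetails hunk, the has_FormatProvider trait detects, at compile time, a static format member with an exact function-pointer signature; the typedef-to-using rewrite above does not change that mechanism. A rough standalone equivalent of the detection idiom, written with std::void_t instead of LLVM's SameType helper and a deliberately simplified one-argument signature (a sketch, not the header's actual machinery):

    #include <type_traits>

    // True only when T exposes a static format() whose address converts to
    // exactly void (*)(const T &).
    template <typename T, typename = void>
    struct HasFormatSketch : std::false_type {};

    template <typename T>
    struct HasFormatSketch<
        T, std::void_t<decltype(static_cast<void (*)(const T &)>(&T::format))>>
        : std::true_type {};

    struct WithFormat { static void format(const WithFormat &); };
    struct WrongFormat { static void format(int); };

    static_assert(HasFormatSketch<WithFormat>::value, "exact match detected");
    static_assert(!HasFormatSketch<WrongFormat>::value, "mismatch rejected");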
@@ -369,7 +369,7 @@ class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> { public: GraphWriter(raw_ostream &o, const GraphType &g, bool SN) : GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {} - ~GraphWriter() override {} + ~GraphWriter() override = default; }; template <typename GraphType> diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h index d8c6de49b4bc6..37baa7b45e4eb 100644 --- a/llvm/include/llvm/Support/JSON.h +++ b/llvm/include/llvm/Support/JSON.h @@ -154,7 +154,7 @@ class Object { LLVM_ABI const json::Array *getArray(StringRef K) const; LLVM_ABI json::Array *getArray(StringRef K); - friend bool operator==(const Object &LHS, const Object &RHS); + friend LLVM_ABI bool operator==(const Object &LHS, const Object &RHS); }; LLVM_ABI bool operator==(const Object &LHS, const Object &RHS); inline bool operator!=(const Object &LHS, const Object &RHS) { @@ -318,7 +318,7 @@ class Value { Value(std::string V) : Type(T_String) { if (LLVM_UNLIKELY(!isUTF8(V))) { assert(false && "Invalid UTF-8 in value used as JSON"); - V = fixUTF8(std::move(V)); + V = fixUTF8(V); } create<std::string>(std::move(V)); } @@ -549,10 +549,10 @@ inline const Value &Array::back() const { return V.back(); } inline Value *Array::data() { return V.data(); } inline const Value *Array::data() const { return V.data(); } -inline typename Array::iterator Array::begin() { return V.begin(); } -inline typename Array::const_iterator Array::begin() const { return V.begin(); } -inline typename Array::iterator Array::end() { return V.end(); } -inline typename Array::const_iterator Array::end() const { return V.end(); } +inline Array::iterator Array::begin() { return V.begin(); } +inline Array::const_iterator Array::begin() const { return V.begin(); } +inline Array::iterator Array::end() { return V.end(); } +inline Array::const_iterator Array::end() const { return V.end(); } inline bool Array::empty() const { return V.empty(); } inline size_t Array::size() const { return V.size(); } @@ -565,18 +565,18 @@ template <typename... Args> inline void Array::emplace_back(Args &&...A) { V.emplace_back(std::forward<Args>(A)...); } inline void Array::pop_back() { V.pop_back(); } -inline typename Array::iterator Array::insert(const_iterator P, const Value &E) { +inline Array::iterator Array::insert(const_iterator P, const Value &E) { return V.insert(P, E); } -inline typename Array::iterator Array::insert(const_iterator P, Value &&E) { +inline Array::iterator Array::insert(const_iterator P, Value &&E) { return V.insert(P, std::move(E)); } template <typename It> -inline typename Array::iterator Array::insert(const_iterator P, It A, It Z) { +inline Array::iterator Array::insert(const_iterator P, It A, It Z) { return V.insert(P, A, Z); } template <typename... 
Args> -inline typename Array::iterator Array::emplace(const_iterator P, Args &&...A) { +inline Array::iterator Array::emplace(const_iterator P, Args &&...A) { return V.emplace(P, std::forward<Args>(A)...); } inline bool operator==(const Array &L, const Array &R) { return L.V == R.V; } @@ -591,7 +591,7 @@ class ObjectKey { ObjectKey(std::string S) : Owned(new std::string(std::move(S))) { if (LLVM_UNLIKELY(!isUTF8(*Owned))) { assert(false && "Invalid UTF-8 in value used as JSON"); - *Owned = fixUTF8(std::move(*Owned)); + *Owned = fixUTF8(*Owned); } Data = *Owned; } diff --git a/llvm/include/llvm/Support/Jobserver.h b/llvm/include/llvm/Support/Jobserver.h index 6bee3b5671d55..1fd4f7ed007af 100644 --- a/llvm/include/llvm/Support/Jobserver.h +++ b/llvm/include/llvm/Support/Jobserver.h @@ -67,8 +67,6 @@ #define LLVM_SUPPORT_JOBSERVER_H #include "llvm/ADT/StringRef.h" -#include <memory> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/Support/LSP/Logging.h b/llvm/include/llvm/Support/LSP/Logging.h index fe65899b1d4ce..f19cc49dbb606 100644 --- a/llvm/include/llvm/Support/LSP/Logging.h +++ b/llvm/include/llvm/Support/LSP/Logging.h @@ -11,7 +11,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" -#include <memory> #include <mutex> namespace llvm { diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 9bbb8a2a30541..0a253efc2abcb 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -225,7 +225,7 @@ inline constexpr int64_t minIntN(int64_t N) { if (N == 0) return 0; - return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); + return UINT64_MAX << (N - 1); } /// Gets the maximum value for a N-bit signed integer. @@ -241,7 +241,7 @@ inline constexpr int64_t maxIntN(int64_t N) { /// Checks if an unsigned integer fits into the given (dynamic) bit width. inline constexpr bool isUIntN(unsigned N, uint64_t x) { - return N >= 64 || x <= maxUIntN(N); + return N >= 64 || (x >> N) == 0; } /// Checks if an signed integer fits into the given (dynamic) bit width. diff --git a/llvm/include/llvm/Support/Mustache.h b/llvm/include/llvm/Support/Mustache.h index 83047f2aafff6..76a83f26deab6 100644 --- a/llvm/include/llvm/Support/Mustache.h +++ b/llvm/include/llvm/Support/Mustache.h @@ -78,7 +78,6 @@ #include "llvm/Support/JSON.h" #include "llvm/Support/StringSaver.h" #include <functional> -#include <vector> namespace llvm::mustache { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
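A note on the MathExtras hunk above: both rewrites are equivalent bit tricks. UINT64_MAX << (N - 1) clears the low N - 1 bits and leaves every higher bit set, which reinterpreted as a two's-complement int64_t is exactly -2^(N-1), the same value the old UINT64_C(1) + ~(UINT64_C(1) << (N - 1)) form computed; and an unsigned x fits in N bits precisely when x >> N is zero. A small self-checking restatement (standalone copies for illustration, not the header itself):

    #include <cassert>
    #include <cstdint>

    int64_t minIntNSketch(int64_t N) {
      if (N == 0)
        return 0;
      // All bits set from position N-1 upward: the two's-complement minimum.
      return static_cast<int64_t>(UINT64_MAX << (N - 1));
    }

    bool isUIntNSketch(unsigned N, uint64_t X) {
      // X fits in N unsigned bits iff no bit at position N or above is set.
      return N >= 64 || (X >> N) == 0;
    }

    int main() {
      assert(minIntNSketch(8) == -128);       // 0xFF...80 read as signed
      assert(minIntNSketch(64) == INT64_MIN); // only the top bit set
      assert(isUIntNSketch(8, 255) && !isUIntNSketch(8, 256));
      return 0;
    }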
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/SMLoc.h b/llvm/include/llvm/Support/SMLoc.h index c80969b1d83dc..b7ae6e488cde9 100644 --- a/llvm/include/llvm/Support/SMLoc.h +++ b/llvm/include/llvm/Support/SMLoc.h @@ -15,7 +15,6 @@ #define LLVM_SUPPORT_SMLOC_H #include <cassert> -#include <optional> namespace llvm { @@ -50,7 +49,6 @@ class SMRange { SMLoc Start, End; SMRange() = default; - SMRange(std::nullopt_t) {} SMRange(SMLoc St, SMLoc En) : Start(St), End(En) { assert(Start.isValid() == End.isValid() && "Start and End should either both be valid or both be invalid!"); diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h index 8320006ff5f6e..43f7e27c26ba1 100644 --- a/llvm/include/llvm/Support/SourceMgr.h +++ b/llvm/include/llvm/Support/SourceMgr.h @@ -103,7 
+103,7 @@ class SourceMgr { public: /// Create new source manager without support for include files. - SourceMgr(); + LLVM_ABI SourceMgr(); /// Create new source manager with the capability of finding include files /// via the provided file system. explicit SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS); @@ -111,10 +111,10 @@ class SourceMgr { SourceMgr &operator=(const SourceMgr &) = delete; SourceMgr(SourceMgr &&); SourceMgr &operator=(SourceMgr &&); - ~SourceMgr(); + LLVM_ABI ~SourceMgr(); IntrusiveRefCntPtr<vfs::FileSystem> getVirtualFileSystem() const; - void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS); + LLVM_ABI void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS); /// Return the include directories of this source manager. ArrayRef<std::string> getIncludeDirs() const { return IncludeDirectories; } diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index cb8e568de02e0..5a012cf0c0264 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -12,19 +12,11 @@ #ifndef LLVM_SUPPORT_SPECIALCASELIST_H #define LLVM_SUPPORT_SPECIALCASELIST_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/RadixTree.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/GlobPattern.h" -#include "llvm/Support/Regex.h" +#include "llvm/Support/Error.h" #include <memory> #include <string> #include <utility> -#include <variant> #include <vector> namespace llvm { @@ -125,93 +117,20 @@ class SpecialCaseList { SpecialCaseList(SpecialCaseList const &) = delete; SpecialCaseList &operator=(SpecialCaseList const &) = delete; -private: - // Lagacy v1 matcher. 
- class RegexMatcher { + class Section { public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; - - struct Reg { - Reg(StringRef Name, unsigned LineNo, Regex &&Rg) - : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} - StringRef Name; - unsigned LineNo; - Regex Rg; - }; - - std::vector<Reg> RegExes; - }; - - class GlobMatcher { - public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; - - struct Glob { - Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) - : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} - StringRef Name; - unsigned LineNo; - GlobPattern Pattern; - }; - - std::vector<GlobMatcher::Glob> Globs; - - RadixTree<iterator_range<StringRef::const_iterator>, - RadixTree<iterator_range<StringRef::const_reverse_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>>> - PrefixSuffixToGlob; - - RadixTree<iterator_range<StringRef::const_iterator>, - SmallVector<const GlobMatcher::Glob *, 1>> - SubstrToGlob; - }; - - /// Represents a set of patterns and their line numbers - class Matcher { - public: - LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash); - - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const; + LLVM_ABI Section(StringRef Name, unsigned FileIdx, bool UseGlobs); + LLVM_ABI Section(Section &&); + LLVM_ABI ~Section(); - LLVM_ABI bool matchAny(StringRef Query) const { - bool R = false; - match(Query, [&](StringRef, unsigned) { R = true; }); - return R; - } + // Returns the name of the section, i.e., its entire string inside the []. + StringRef name() const { return Name; } - std::variant<RegexMatcher, GlobMatcher> M; - bool RemoveDotSlash; - }; - - using SectionEntries = StringMap<StringMap<Matcher>>; + // Returns true if the string 'Name' matches the section name, which is interpreted as a glob. + LLVM_ABI bool matchName(StringRef Name) const; -protected: - struct Section { - Section(StringRef Str, unsigned FileIdx, bool UseGlobs) - : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str), - FileIdx(FileIdx) {} - - Section(Section &&) = default; - - Matcher SectionMatcher; - SectionEntries Entries; - std::string SectionStr; - unsigned FileIdx; + // Returns the sequence number of the file where this section is defined. + unsigned fileIndex() const { return FileIdx; } // Helper method to search by Prefix, Query, and Category. Returns // 1-based line number on which rule is defined, or 0 if there is no match. @@ -223,11 +142,16 @@ class SpecialCaseList { LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const; + /// Returns true if the section has any entries for the given prefix.
+ LLVM_ABI bool hasPrefix(StringRef Prefix) const; + private: friend class SpecialCaseList; - LLVM_ABI void preprocess(bool OrderBySize); - LLVM_ABI const SpecialCaseList::Matcher * - findMatcher(StringRef Prefix, StringRef Category) const; + class SectionImpl; + + StringRef Name; + unsigned FileIdx; + std::unique_ptr<SectionImpl> Impl; }; ArrayRef<const Section> sections() const { return Sections; } diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index e55314568d683..fb20da336dda0 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -233,6 +233,9 @@ HANDLE_TARGET_OPCODE(MEMBARRIER) // using. HANDLE_TARGET_OPCODE(JUMP_TABLE_DEBUG_INFO) +// Issue a no-op relocation against a given symbol at the current location. +HANDLE_TARGET_OPCODE(RELOC_NONE) + HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ENTRY) HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ANCHOR) HANDLE_TARGET_OPCODE(CONVERGENCECTRL_LOOP) diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index c20efc7396b79..1be7779f2c72c 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -14,6 +14,7 @@ #define LLVM_SUPPORT_THREADPOOL_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FunctionExtras.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Jobserver.h" @@ -26,7 +27,6 @@ #include <condition_variable> #include <deque> #include <functional> -#include <memory> #include <mutex> #include <utility> @@ -51,7 +51,7 @@ class ThreadPoolTaskGroup; class LLVM_ABI ThreadPoolInterface { /// The actual method to enqueue a task to be defined by the concrete /// implementation. - virtual void asyncEnqueue(std::function<void()> Task, + virtual void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) = 0; public: @@ -95,22 +95,22 @@ class LLVM_ABI ThreadPoolInterface { /// used to wait for the task to finish and is *non-blocking* on destruction. template <typename Func> auto async(Func &&F) -> std::shared_future<decltype(F())> { - return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)), - nullptr); + return asyncImpl( + llvm::unique_function<decltype(F())()>(std::forward<Func>(F)), nullptr); } template <typename Func> auto async(ThreadPoolTaskGroup &Group, Func &&F) -> std::shared_future<decltype(F())> { - return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)), - &Group); + return asyncImpl( + llvm::unique_function<decltype(F())()>(std::forward<Func>(F)), &Group); } private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. 
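// Editor's sketch (not part of the patch; it uses only the ThreadPool API
// declared above, and enqueueMoveOnlyTask is an invented example): switching
// the task queue from std::function to llvm::unique_function lets submitted
// tasks own move-only state. std::function requires a copy-constructible
// callable, so a lambda capturing a std::unique_ptr could not be enqueued
// before this change.
#include "llvm/Support/ThreadPool.h"
#include <memory>

inline void enqueueMoveOnlyTask(llvm::StdThreadPool &Pool) {
  auto State = std::make_unique<int>(42);
  // Move-only capture: fine for unique_function, ill-formed for std::function.
  Pool.async([State = std::move(State)] { ++*State; });
}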
template <typename ResTy> - std::shared_future<ResTy> asyncImpl(std::function<ResTy()> Task, + std::shared_future<ResTy> asyncImpl(llvm::unique_function<ResTy()> Task, ThreadPoolTaskGroup *Group) { auto Future = std::async(std::launch::deferred, std::move(Task)).share(); asyncEnqueue([Future]() { Future.wait(); }, Group); @@ -160,7 +160,7 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface { /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - void asyncEnqueue(std::function<void()> Task, + void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) override { int requestedThreads; { @@ -189,7 +189,8 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface { mutable llvm::sys::RWMutex ThreadsLock; /// Tasks waiting for execution in the pool. - std::deque<std::pair<std::function<void()>, ThreadPoolTaskGroup *>> Tasks; + std::deque<std::pair<llvm::unique_function<void()>, ThreadPoolTaskGroup *>> + Tasks; /// Locking and signaling for accessing the Tasks queue. std::mutex QueueLock; @@ -239,13 +240,14 @@ class LLVM_ABI SingleThreadExecutor : public ThreadPoolInterface { private: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - void asyncEnqueue(std::function<void()> Task, + void asyncEnqueue(llvm::unique_function<void()> Task, ThreadPoolTaskGroup *Group) override { Tasks.emplace_back(std::make_pair(std::move(Task), Group)); } /// Tasks waiting for execution in the pool. - std::deque<std::pair<std::function<void()>, ThreadPoolTaskGroup *>> Tasks; + std::deque<std::pair<llvm::unique_function<void()>, ThreadPoolTaskGroup *>> + Tasks; }; #if LLVM_ENABLE_THREADS diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h index 527d67f3b360c..097eaf3422ca3 100644 --- a/llvm/include/llvm/Support/Timer.h +++ b/llvm/include/llvm/Support/Timer.h @@ -15,7 +15,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Mutex.h" #include <cassert> -#include <memory> #include <string> #include <vector> diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. 
e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 0a7ae15edbb33..421d6613bfafc 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -20,7 +20,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <type_traits> diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..03515cd61515f 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -12,7 +12,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #define DEBUG_TYPE "unicode" @@ -37,7 +36,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index c8911a0225f86..dbd5a5c137fd1 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -1116,8 +1116,9 @@ class LLVM_ABI RedirectingFileSystem /// Collect all pairs of <virtual path, real path> entries from the /// \p VFS. This is used by the module dependency collector to forward /// the entries into the reproducer output VFS YAML file. -void collectVFSEntries(RedirectingFileSystem &VFS, - SmallVectorImpl<YAMLVFSEntry> &CollectedEntries); +LLVM_ABI void +collectVFSEntries(RedirectingFileSystem &VFS, + SmallVectorImpl<YAMLVFSEntry> &CollectedEntries); class YAMLVFSWriter { std::vector<YAMLVFSEntry> Mappings; diff --git a/llvm/include/llvm/Support/VirtualOutputBackend.h b/llvm/include/llvm/Support/VirtualOutputBackend.h index 85caa021c2aae..78ed4b9b66607 100644 --- a/llvm/include/llvm/Support/VirtualOutputBackend.h +++ b/llvm/include/llvm/Support/VirtualOutputBackend.h @@ -32,7 +32,7 @@ namespace llvm::vfs { /// If virtual functions are added here, also add them to \a /// ProxyOutputBackend. class OutputBackend : public RefCountedBase<OutputBackend> { - virtual void anchor(); + LLVM_ABI virtual void anchor(); public: /// Get a backend that points to the same destination as this one but that @@ -47,7 +47,7 @@ class OutputBackend : public RefCountedBase<OutputBackend> { /// have been customized). /// /// Thread-safe. 
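// Editor's note (a conceptual sketch; LLVM_ABI is really defined in
// llvm/Support/Compiler.h, and the configuration macros below are invented
// placeholders): the LLVM_ABI annotations added throughout this patch mark
// symbols that must stay visible across the libLLVM shared-library boundary.
// Conceptually the macro behaves along these lines:
#if defined(_WIN32) && defined(BUILDING_LLVM_DYLIB_SKETCH)
#define LLVM_ABI_SKETCH __declspec(dllexport) // building the LLVM DLL
#elif defined(_WIN32) && defined(LINKING_LLVM_DYLIB_SKETCH)
#define LLVM_ABI_SKETCH __declspec(dllimport) // consuming the LLVM DLL
#else
#define LLVM_ABI_SKETCH __attribute__((visibility("default")))
#endif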
- Expected<OutputFile> + LLVM_ABI Expected<OutputFile> createFile(const Twine &Path, std::optional<OutputConfig> Config = std::nullopt); diff --git a/llvm/include/llvm/Support/VirtualOutputBackends.h b/llvm/include/llvm/Support/VirtualOutputBackends.h index 219bc30cfa6db..13a9611f7613a 100644 --- a/llvm/include/llvm/Support/VirtualOutputBackends.h +++ b/llvm/include/llvm/Support/VirtualOutputBackends.h @@ -77,14 +77,14 @@ class ProxyOutputBackend : public OutputBackend { /// An output backend that creates files on disk, wrapping APIs in sys::fs. class OnDiskOutputBackend : public OutputBackend { - void anchor() override; + LLVM_ABI void anchor() override; protected: IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override { return clone(); } - Expected<std::unique_ptr<OutputFileImpl>> + LLVM_ABI Expected<std::unique_ptr<OutputFileImpl>> createFileImpl(StringRef Path, std::optional<OutputConfig> Config) override; public: diff --git a/llvm/include/llvm/Support/VirtualOutputError.h b/llvm/include/llvm/Support/VirtualOutputError.h index 2293ff982a6b4..44590a1fb5ed0 100644 --- a/llvm/include/llvm/Support/VirtualOutputError.h +++ b/llvm/include/llvm/Support/VirtualOutputError.h @@ -43,7 +43,7 @@ class OutputError : public ErrorInfo<OutputError, ECError> { void log(raw_ostream &OS) const override; // Used by ErrorInfo::classID. - static char ID; + LLVM_ABI static char ID; OutputError(const Twine &OutputPath, std::error_code EC) : ErrorInfo<OutputError, ECError>(EC), OutputPath(OutputPath.str()) { @@ -99,7 +99,7 @@ class TempFileOutputError : public ErrorInfo<TempFileOutputError, OutputError> { void log(raw_ostream &OS) const override; // Used by ErrorInfo::classID. - static char ID; + LLVM_ABI static char ID; TempFileOutputError(const Twine &TempPath, const Twine &OutputPath, std::error_code EC) diff --git a/llvm/include/llvm/Support/VirtualOutputFile.h b/llvm/include/llvm/Support/VirtualOutputFile.h index dd50437605deb..d53701c130479 100644 --- a/llvm/include/llvm/Support/VirtualOutputFile.h +++ b/llvm/include/llvm/Support/VirtualOutputFile.h @@ -80,13 +80,13 @@ class OutputFile { /// /// If there's an open proxy from \a createProxy(), calls \a discard() to /// clean up temporaries followed by \a report_fatal_error(). - Error keep(); + LLVM_ABI Error keep(); /// Discard an output, cleaning up any temporary state. Errors if clean-up /// fails. /// /// If it has already been closed, calls \a report_fatal_error(). - Error discard(); + LLVM_ABI Error discard(); /// Discard the output when destroying it if it's still open, sending the /// result to \a Handler. @@ -98,7 +98,7 @@ class OutputFile { /// producer. Errors if there's already a proxy. The proxy must be deleted /// before calling \a keep(). The proxy will crash if it's written to after /// calling \a discard(). - Expected<std::unique_ptr<raw_pwrite_stream>> createProxy(); + LLVM_ABI Expected<std::unique_ptr<raw_pwrite_stream>> createProxy(); bool hasOpenProxy() const { return OpenProxy; } @@ -132,7 +132,7 @@ class OutputFile { private: /// Destroy \a Impl. Reports fatal error if the file is open and there's no /// handler from \a discardOnDestroy(). 
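// Editor's sketch (not part of the patch; writeGreeting and "hello.txt" are
// invented, but every call below is declared in the headers shown above): a
// minimal end-to-end use of the OutputFile API, respecting the documented
// rules that the proxy must be destroyed before keep() and that an open file
// must be kept or discarded before destruction.
#include "llvm/Support/Error.h"
#include "llvm/Support/VirtualOutputBackend.h"
#include "llvm/Support/VirtualOutputFile.h"
#include <memory>

inline llvm::Error writeGreeting(llvm::vfs::OutputBackend &Backend) {
  llvm::Expected<llvm::vfs::OutputFile> O = Backend.createFile("hello.txt");
  if (!O)
    return O.takeError();
  llvm::Expected<std::unique_ptr<llvm::raw_pwrite_stream>> OS =
      O->createProxy();
  if (!OS) {
    // Destroying an open OutputFile without keep()/discard() is fatal, so
    // discard explicitly before propagating the error.
    llvm::consumeError(O->discard());
    return OS.takeError();
  }
  **OS << "hello\n";
  OS->reset(); // delete the proxy before keep(), as the docs require
  return O->keep();
}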
- void destroy(); + LLVM_ABI void destroy(); OutputFile &moveFrom(OutputFile &O) { Path = std::move(O.Path); Impl = std::move(O.Impl); diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index 4aa6c01d29cc2..6f6f65dc075f3 100644 --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -511,7 +511,6 @@ enum OperandEncoding { ENCODINGS ENCODING_max }; ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_TMM, "tile") \ - ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 3d36f41ca1a04..b53b28dd00fd1 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -1921,12 +1921,12 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl { using map_type = std::map<std::string, T>; static void inputOne(IO &io, StringRef key, map_type &v) { - io.mapRequired(key.str().c_str(), v[std::string(key)]); + io.mapRequired(key, v[std::string(key)]); } static void output(IO &io, map_type &v) { for (auto &p : v) - io.mapRequired(p.first.c_str(), p.second); + io.mapRequired(p.first, p.second); } }; diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..51873e7d529bf 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -34,7 +34,7 @@ typedef PVOID HANDLE; namespace llvm { -#if LLVM_ON_UNIX || _WIN32 +#if defined(LLVM_ON_UNIX) || defined(_WIN32) /// LLVM thread following std::thread interface with added constructor to /// specify stack size. @@ -49,7 +49,7 @@ class thread { } public: -#if LLVM_ON_UNIX +#ifdef LLVM_ON_UNIX using native_handle_type = pthread_t; using id = pthread_t; using start_routine_type = void *(*)(void *); @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... 
Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/include/llvm/Support/type_traits.h b/llvm/include/llvm/Support/type_traits.h index a96125c16f11b..d037132fa5bad 100644 --- a/llvm/include/llvm/Support/type_traits.h +++ b/llvm/include/llvm/Support/type_traits.h @@ -15,7 +15,6 @@ #include "llvm/Support/Compiler.h" #include <type_traits> -#include <utility> namespace llvm { diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h index e22c6d4f6d390..95866e306b5ff 100644 --- a/llvm/include/llvm/TableGen/CodeGenHelpers.h +++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h @@ -20,6 +20,7 @@ #include <string> namespace llvm { + // Simple RAII helper for emitting ifdef-undef-endif scope. class IfDefEmitter { public: @@ -57,7 +58,7 @@ class NamespaceEmitter { NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed) : Name(trim(NameUntrimmed).str()), OS(OS) { if (!Name.empty()) - OS << "namespace " << Name << " {\n"; + OS << "namespace " << Name << " {\n\n"; } ~NamespaceEmitter() { close(); } @@ -65,7 +66,7 @@ class NamespaceEmitter { // Explicit function to close the namespace scopes. void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index ce3e87e470b9d..2080f75eb8cfc 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -20,7 +20,6 @@ #include "llvm/Frontend/Directive/Spelling.h" #include "llvm/Support/MathExtras.h" #include "llvm/TableGen/Record.h" -#include <algorithm> #include <string> #include <vector> diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 13175177edd3e..db99885121ec1 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1554,6 +1554,11 @@ def JUMP_TABLE_DEBUG_INFO : StandardPseudoInstruction { let Size = 0; let isMeta = true; } +def RELOC_NONE : StandardPseudoInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$symbol); + let hasSideEffects = true; +} let hasSideEffects = false, isMeta = true, isConvergent = true in { def CONVERGENCECTRL_ANCHOR : StandardPseudoInstruction { diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 7e68ad20e7583..7da529e2e8a87 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ 
b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -23,7 +23,6 @@ #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/SubtargetFeature.h" -#include <array> #include <set> #include <vector> diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h index 0c308cadba790..20dbb60c96ab7 100644 --- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h +++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h @@ -15,7 +15,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/RISCVISAUtils.h" -#include <map> #include <set> #include <string> #include <vector> diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index aad9859263480..9dfa50c1ad1ba 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -161,6 +161,9 @@ enum ArchFeatureKind : uint32_t { // WGP mode is supported. FEATURE_WGP = 1 << 9, + + // Xnack is available by default + FEATURE_XNACK_ALWAYS = 1 << 10 }; enum FeatureError : uint32_t { diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 0e82dd212f34d..11b76cd183108 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -554,15 +554,29 @@ class Triple { return getOSVersion() < VersionTuple(Major, Minor, Micro); } + bool isOSVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isOSVersionLT(Major, Minor, Micro); + } + bool isOSVersionLT(const Triple &Other) const { return getOSVersion() < Other.getOSVersion(); } + bool isOSVersionGE(const Triple &Other) const { + return getOSVersion() >= Other.getOSVersion(); + } + /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, unsigned Micro = 0) const; + bool isMacOSXVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isMacOSXVersionLT(Major, Minor, Micro); + } + /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. 
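// Editor's sketch (not part of the patch; targetsAtLeastMacOS1015 is an
// invented helper): the new isOSVersionGE / isMacOSXVersionGE methods above
// are defined as the negation of the existing *LT checks, so callers can
// phrase minimum-version requirements positively:
#include "llvm/TargetParser/Triple.h"

inline bool targetsAtLeastMacOS1015(const llvm::Triple &T) {
  // Equivalent to: T.isMacOSX() && !T.isMacOSXVersionLT(10, 15).
  return T.isMacOSX() && T.isMacOSXVersionGE(10, 15);
}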
bool isMacOSX() const { diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index a94eab1d7ae34..78cf46406192e 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -268,7 +268,6 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") -X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") X86_FEATURE (AMX_MOVRS, "amx-movrs") X86_FEATURE (AMX_AVX512, "amx-avx512") X86_FEATURE (AMX_TF32, "amx-tf32") diff --git a/llvm/include/llvm/TargetParser/XtensaTargetParser.h b/llvm/include/llvm/TargetParser/XtensaTargetParser.h index 828b4079ef328..41369b1d64499 100644 --- a/llvm/include/llvm/TargetParser/XtensaTargetParser.h +++ b/llvm/include/llvm/TargetParser/XtensaTargetParser.h @@ -15,7 +15,6 @@ #define LLVM_TARGETPARSER_XTENSATARGETPARSER_H #include "llvm/TargetParser/Triple.h" -#include <vector> namespace llvm { class StringRef; diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index 708ec439ed40f..9b607f1a3a8fc 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -19,11 +19,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" -#include <map> #include <memory> #include <optional> #include <string> -#include <type_traits> #include <vector> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h index 352c9e1452669..2061098b6ea6a 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroAnnotationElide.h @@ -24,7 +24,7 @@ namespace llvm { struct CoroAnnotationElidePass : PassInfoMixin<CoroAnnotationElidePass> { - CoroAnnotationElidePass() {} + CoroAnnotationElidePass() = default; PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,19 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum trailing zeros of potential constants + unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h index 17eab8568f4dc..6fc1b2623e163 100644 --- a/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h +++ b/llvm/include/llvm/Transforms/IPO/FatLTOCleanup.h @@ -26,7 +26,7 @@ class ModuleSummaryIndex; class FatLtoCleanup : public PassInfoMixin<FatLtoCleanup> { public: - FatLtoCleanup() {} + FatLtoCleanup() = default; PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool 
isRequired() { return true; } }; diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h index 28970f7dcdf10..e8275b2d20ade 100644 --- a/llvm/include/llvm/Transforms/IPO/IROutliner.h +++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h @@ -204,10 +204,10 @@ class IROutliner { : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) { // Check that the DenseMap implementation has not changed. - assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); } bool run(Module &M); diff --git a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h index 8addf49fc0d81..272b96037c753 100644 --- a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h +++ b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h @@ -23,7 +23,7 @@ class Module; /// A pass which infers function attributes from the names and signatures of /// function declarations in a module. struct InferFunctionAttrsPass : PassInfoMixin<InferFunctionAttrsPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; } diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index 3bdcf9a18fe40..c695784641b4e 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -17,6 +17,8 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" +#include <unordered_set> + namespace llvm { using AnchorList = std::vector<std::pair<LineLocation, FunctionId>>; diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h index a8a09fb95c4bd..346e7f06eaa43 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h +++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h @@ -33,7 +33,7 @@ class FileSystem; /// appends globals to llvm.compiler.used. 
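// Editor's note (worked example, not part of the patch): the
// getAssumedMinTrailingZeros() helper added to AAPotentialConstantValues
// above returns the number of low zero bits common to every potential
// constant, i.e. log2 of the largest power of two known to divide all of
// them; an empty or invalid state conservatively yields 0. For the assumed
// set {8, 12}:
//   8  = 0b1000 -> 3 trailing zeros
//   12 = 0b1100 -> 2 trailing zeros
// so the minimum is 2 and every potential value is a multiple of 4.
#include <algorithm>
#include <cstdint>
constexpr unsigned ctz8(uint8_t V) { // minimal countTrailingZeros for the demo
  unsigned N = 0;
  while (V && !(V & 1)) {
    V >>= 1;
    ++N;
  }
  return V ? N : 8;
}
static_assert(std::min(ctz8(8), ctz8(12)) == 2,
              "min trailing zeros over {8, 12}");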
class SanitizerCoveragePass : public PassInfoMixin<SanitizerCoveragePass> { public: - explicit SanitizerCoveragePass( + LLVM_ABI explicit SanitizerCoveragePass( SanitizerCoverageOptions Options = SanitizerCoverageOptions(), IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr, const std::vector<std::string> &AllowlistFiles = {}, diff --git a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h index 4ff442ff80c76..54ddcc09f7204 100644 --- a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h +++ b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h @@ -19,7 +19,13 @@ namespace llvm { struct DropUnnecessaryAssumesPass : public PassInfoMixin<DropUnnecessaryAssumesPass> { + DropUnnecessaryAssumesPass(bool DropDereferenceable = false) + : DropDereferenceable(DropDereferenceable) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + bool DropDereferenceable; }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index a03a38466b27b..1a19eb94e60ea 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -24,7 +24,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Compiler.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include <optional> #include <utility> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h index 12513c2a704f2..35c9adbe17677 100644 --- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h +++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h @@ -20,7 +20,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include <optional> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 2d2355d6be68a..86eb21389756c 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -365,6 +365,40 @@ LLVM_ABI bool setLoopEstimatedTripCount( Loop *L, unsigned EstimatedTripCount, std::optional<unsigned> EstimatedLoopInvocationWeight = std::nullopt); +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation is unable to handle the loop +/// form of \p L (e.g., \p L must have a latch block that controls the loop +/// exit). +/// - The probability \c P that, at the end of any iteration, the latch of \p L +/// will start another iteration such that `1 - P` is the probability of +/// exiting the loop. +BranchProbability getLoopProbability(Loop *L); + +/// Set branch weight metadata for the latch of \p L to indicate that, at the +/// end of any iteration, \p P and `1 - P` are the probabilities of starting +/// another iteration and exiting the loop, respectively. Return false if the +/// implementation is unable to handle the loop form of \p L (e.g., \p L must +/// have a latch block that controls the loop exit). Otherwise, return true. +bool setLoopProbability(Loop *L, BranchProbability P); + +/// Based on branch weight metadata, return either: +/// - An unknown probability if the implementation cannot extract the +/// probability (e.g., \p B must have exactly two target labels, so it must be +/// a conditional branch). 
+/// - The probability \c P that control flows from \p B to its first target +/// label such that `1 - P` is the probability of control flowing to its +/// second target label, or vice-versa if \p ForFirstTarget is false. +BranchProbability getBranchProbability(BranchInst *B, bool ForFirstTarget); + +/// Set branch weight metadata for \p B to indicate that \p P and `1 - P` are +/// the probabilities of control flowing to its first and second target labels, +/// respectively, or vice-versa if \p ForFirstTarget is false. Return false if +/// the implementation cannot set the probability (e.g., \p B must have exactly +/// two target labels, so it must be a conditional branch). Otherwise, return +/// true. +bool setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget); + /// Check inner loop (L) backedge count is known to be invariant on all /// iterations of its outer loop. If the loop has no parent, this is trivially /// true. diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h index cb48bb01e178a..19b573d6546a0 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h @@ -14,7 +14,6 @@ #define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H #include <cstdint> -#include <optional> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h index cfcd1611e27fe..47aa2ff5930b0 100644 --- a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h +++ b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h @@ -16,7 +16,6 @@ #include <memory> #include <optional> -#include <string> namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 871c13d972470..a3efc43c62dc3 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -97,7 +97,9 @@ LLVM_ABI bool UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop = nullptr); + Loop **ResultLoop = nullptr, + std::optional<unsigned> OriginalTripCount = std::nullopt, + BranchProbability OriginalLoopProb = BranchProbability::getUnknown()); LLVM_ABI LoopUnrollResult UnrollAndJamLoop( Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h index 17b5d4b891230..28c4ae840b29f 100644 --- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h +++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h @@ -204,7 +204,7 @@ class ValueMapper { LLVM_ABI void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MappingContextID = 0); LLVM_ABI void scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MappingContextID = 0); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h index 4385df518a111..050396674e159 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h +++ 
b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -19,7 +19,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" -#include <algorithm> namespace llvm::sandboxir { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index 96a2348403932..3d76cdaad6240 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -167,7 +167,7 @@ class LegalityResult { LegalityResult &operator=(const LegalityResult &) = delete; public: - virtual ~LegalityResult() {} + virtual ~LegalityResult() = default; LegalityResultID getSubclassID() const { return ID; } #ifndef NDEBUG virtual void print(raw_ostream &OS) const { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index b289520fa83af..821382b0b12d0 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -36,7 +36,7 @@ class SeedBundle { /// No need to allow copies. SeedBundle(const SeedBundle &) = delete; SeedBundle &operator=(const SeedBundle &) = delete; - virtual ~SeedBundle() {} + virtual ~SeedBundle() = default; using iterator = SmallVector<Instruction *>::iterator; using const_iterator = SmallVector<Instruction *>::const_iterator; diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h index 13bb711328fdc..4ff65f043fe17 100644 --- a/llvm/include/llvm/XRay/FDRRecordConsumer.h +++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h @@ -11,7 +11,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/XRay/FDRRecords.h" -#include <algorithm> #include <memory> #include <vector> diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h index 6bf4f1d1ae082..8de569827586c 100644 --- a/llvm/include/llvm/XRay/YAMLXRayRecord.h +++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h @@ -12,8 +12,6 @@ #ifndef LLVM_XRAY_YAMLXRAYRECORD_H #define LLVM_XRAY_YAMLXRAYRECORD_H -#include <type_traits> - #include "llvm/Support/YAMLTraits.h" #include "llvm/XRay/XRayRecord.h" diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index f2dc25fa5dbf5..26a560252d9aa 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -75,7 +75,7 @@ AAResults::AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {} AAResults::AAResults(AAResults &&Arg) : TLI(Arg.TLI), AAs(std::move(Arg.AAs)), AADeps(std::move(Arg.AADeps)) {} -AAResults::~AAResults() {} +AAResults::~AAResults() = default; bool AAResults::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index 9f5daf32be9a0..aaac2cf187281 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -63,6 +63,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeRegionPrinterPass(Registry); initializeRegionOnlyViewerPass(Registry); initializeRegionOnlyPrinterPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeSCEVAAWrapperPassPass(Registry); 
initializeScalarEvolutionWrapperPassPass(Registry); initializeStackSafetyGlobalInfoWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 61b7b3fa9e2c4..7fe00c6e22c51 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <utility> using namespace llvm; using namespace llvm::PatternMatch; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 16dd6f8b86006..bff9b62d98e06 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -89,7 +89,6 @@ add_llvm_component_library(LLVMAnalysis InlineCost.cpp InlineAdvisor.cpp InlineOrder.cpp - InlineSizeEstimatorAnalysis.cpp InstCount.cpp InstructionPrecedenceTracking.cpp InstructionSimplify.cpp @@ -137,6 +136,7 @@ add_llvm_component_library(LLVMAnalysis RegionPass.cpp RegionPrinter.cpp ReplayInlineAdvisor.cpp + RuntimeLibcallInfo.cpp ScalarEvolution.cpp ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionDivision.cpp diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index e9e2e7d0316c7..da32542cf7870 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2163,18 +2163,42 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), } Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) { - FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType()); - if (!VT) - return nullptr; - - // This isn't strictly necessary, but handle the special/common case of zero: - // all integer reductions of a zero input produce zero. - if (isa<ConstantAggregateZero>(Op)) - return ConstantInt::get(VT->getElementType(), 0); + auto *OpVT = cast<VectorType>(Op->getType()); // This is the same as the underlying binops - poison propagates. - if (isa<PoisonValue>(Op) || Op->containsPoisonElement()) - return PoisonValue::get(VT->getElementType()); + if (Op->containsPoisonElement()) + return PoisonValue::get(OpVT->getElementType()); + + // Shortcut non-accumulating reductions. + if (Constant *SplatVal = Op->getSplatValue()) { + switch (IID) { + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + return SplatVal; + case Intrinsic::vector_reduce_add: + if (SplatVal->isNullValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_mul: + if (SplatVal->isNullValue() || SplatVal->isOneValue()) + return SplatVal; + break; + case Intrinsic::vector_reduce_xor: + if (SplatVal->isNullValue()) + return SplatVal; + if (OpVT->getElementCount().isKnownMultipleOf(2)) + return Constant::getNullValue(OpVT->getElementType()); + break; + } + } + + FixedVectorType *VT = dyn_cast<FixedVectorType>(OpVT); + if (!VT) + return nullptr; // TODO: Handle undef. 
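// Editor's note (worked identities, not part of the patch): the splat
// shortcuts added to constantFoldVectorReduce above follow from per-element
// algebra on a splat <v, v, ..., v> of length n:
//   and/or/smin/smax/umin/umax: v op v == v, so the reduction is v for any n,
//     including unknown scalable-vector lengths.
//   add: only v == 0 is safe without knowing n (0 + 0 + ... + 0 == 0).
//   mul: likewise only v == 0 or v == 1 (0 * ... == 0, 1 * ... == 1).
//   xor: v ^ v == 0, so elements cancel in pairs and the reduction is 0
//     whenever n is known to be a multiple of 2 (the isKnownMultipleOf(2)
//     check), e.g. reduce_xor(<4 x i8> splat(0x5A))
//       == (0x5A ^ 0x5A) ^ (0x5A ^ 0x5A) == 0.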
auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U)); diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp index 23f1aa82ae8a3..bd77cba385667 100644 --- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp +++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp @@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) { Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10); assert(Success && "Failed to parse Z component of numthreads"); } + // Get wavesize attribute value, if one exists + StringRef WaveSizeStr = + F.getFnAttribute("hlsl.wavesize").getValueAsString(); + if (!WaveSizeStr.empty()) { + SmallVector<StringRef> WaveSizeVec; + WaveSizeStr.split(WaveSizeVec, ','); + assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified"); + // Read in the three component values of wavesize + [[maybe_unused]] bool Success = + llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10); + assert(Success && "Failed to parse Min component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10); + assert(Success && "Failed to parse Max component of wavesize"); + Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10); + assert(Success && "Failed to parse Preferred component of wavesize"); + } MMDAI.EntryPropertyVec.push_back(EFP); } return MMDAI; } diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 27114e0705a1d..033f516abe017 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/DXILABI.h" #include "llvm/Support/FormatVariadic.h" #include <cstdint> -#include <optional> #define DEBUG_TYPE "dxil-resource" diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 11d829492a10e..b3b62cfe8b459 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -407,9 +407,10 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, continue; Value *Ptr = getLoadStorePointerOperand(&Inst); const Loop *L = LI.getLoopFor(Inst.getParent()); + const Loop *OutermostLoop = L ? L->getOutermostLoop() : nullptr; const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); - SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, OutermostLoop); OS.indent(2) << "Inst: " << Inst << "\n"; OS.indent(4) << "Expr: " << *AccessFn << "\n"; Mon.print(OS, 4); @@ -945,6 +946,8 @@ SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { SCEVMonotonicity SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, const Loop *OutermostLoop) { + assert((!OutermostLoop || OutermostLoop->isOutermost()) && + "OutermostLoop must be outermost"); assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); this->OutermostLoop = OutermostLoop; return visit(Expr); @@ -1587,6 +1590,15 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, return nullptr; } +/// Returns \p A * \p B if it is guaranteed not to signed wrap. Otherwise +/// returns nullptr. \p A and \p B must have the same integer type. +static const SCEV *mulSCEVNoSignedOverflow(const SCEV *A, const SCEV *B, + ScalarEvolution &SE) { + if (SE.willNotOverflow(Instruction::Mul, /*Signed=*/true, A, B)) + return SE.getMulExpr(A, B); + return nullptr; +} + /// Returns the absolute value of \p A.
In the context of dependence analysis, /// we need an absolute value in a mathematical sense. If \p A is the signed /// minimum value, we cannot represent it unless extending the original type. @@ -1686,7 +1698,11 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, assert(0 < Level && Level <= CommonLevels && "level out of range"); Level--; - const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); + const SCEV *Delta = minusSCEVNoSignedOverflow(SrcConst, DstConst, *SE); + if (!Delta) { + Result.Consistent = false; + return false; + } LLVM_DEBUG(dbgs() << "\t Delta = " << *Delta); LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n"); @@ -1702,7 +1718,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst, const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); if (!AbsDelta || !AbsCoeff) return false; - const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); + const SCEV *Product = mulSCEVNoSignedOverflow(UpperBound, AbsCoeff, *SE); + if (!Product) + return false; return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); }(); if (IsDeltaLarge) { diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index 67e38ab8b35aa..d2be805a6f7a5 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/MLInlineAdvisor.h" #include "llvm/Analysis/ModelUnderTrainingRunner.h" #include "llvm/Analysis/NoInferenceModelRunner.h" @@ -89,9 +88,6 @@ struct InlineEvent { /// error, even if AdvisedDecision were true, otherwise it agrees with /// AdvisedDecision. bool Effect = false; - - /// What the change in size was: size_after - size_before - int64_t Reward = 0; }; /// Collect data we may use for training a model. 
@@ -150,31 +146,15 @@ class DevelopmentModeMLInlineAdvisor : public MLInlineAdvisor { GetModelRunner, std::function<bool(CallBase &)> GetDefaultAdvice); - size_t getTotalSizeEstimate(); - - void updateNativeSizeEstimate(int64_t Change) { - *CurrentNativeSize += Change; - } - void resetNativeSize(Function *F) { - PreservedAnalyses PA = PreservedAnalyses::all(); - PA.abandon<InlineSizeEstimatorAnalysis>(); - FAM.invalidate(*F, PA); - } - std::unique_ptr<MLInlineAdvice> getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE) override; - std::optional<size_t> getNativeSizeEstimate(const Function &F) const; - private: bool isLogging() const { return !!Logger; } std::unique_ptr<MLInlineAdvice> getMandatoryAdviceImpl(CallBase &CB) override; const bool IsDoingInference; std::unique_ptr<TrainingLogger> Logger; - - const std::optional<int32_t> InitialNativeSize; - std::optional<int32_t> CurrentNativeSize; }; /// A variant of MLInlineAdvice that tracks all non-trivial inlining @@ -183,13 +163,9 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { public: LoggingMLInlineAdvice(DevelopmentModeMLInlineAdvisor *Advisor, CallBase &CB, OptimizationRemarkEmitter &ORE, bool Recommendation, - TrainingLogger &Logger, - std::optional<size_t> CallerSizeEstimateBefore, - std::optional<size_t> CalleeSizeEstimateBefore, - bool DefaultDecision, bool Mandatory = false) + TrainingLogger &Logger, bool DefaultDecision, + bool Mandatory = false) : MLInlineAdvice(Advisor, CB, ORE, Recommendation), Logger(Logger), - CallerSizeEstimateBefore(CallerSizeEstimateBefore), - CalleeSizeEstimateBefore(CalleeSizeEstimateBefore), DefaultDecision(DefaultDecision), Mandatory(Mandatory) {} virtual ~LoggingMLInlineAdvice() = default; @@ -200,59 +176,35 @@ class LoggingMLInlineAdvice : public MLInlineAdvice { } void recordInliningImpl() override { MLInlineAdvice::recordInliningImpl(); - getAdvisor()->resetNativeSize(Caller); - int Reward = std::numeric_limits<int>::max(); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && - !getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller) + - *CalleeSizeEstimateBefore; - Reward = NativeSizeAfter - - (*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore); - getAdvisor()->updateNativeSizeEstimate(Reward); - } - log(Reward, /*Success=*/true); + log(/*Success=*/true); } void recordInliningWithCalleeDeletedImpl() override { MLInlineAdvice::recordInliningWithCalleeDeletedImpl(); - getAdvisor()->resetNativeSize(Caller); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested() && - !getAdvisor()->isForcedToStop()) { - int NativeSizeAfter = *getAdvisor()->getNativeSizeEstimate(*Caller); - int Reward = NativeSizeAfter - - (*CallerSizeEstimateBefore + *CalleeSizeEstimateBefore); - getAdvisor()->updateNativeSizeEstimate(Reward); - log(Reward, /*Success=*/true); - } else { - log(NoReward, /*Success=*/true); - } + log(/*Success=*/true); } void recordUnsuccessfulInliningImpl(const InlineResult &Result) override { MLInlineAdvice::recordUnsuccessfulInliningImpl(Result); - log(NoReward, /*Success=*/false); + log(/*Success=*/false); } void recordUnattemptedInliningImpl() override { MLInlineAdvice::recordUnattemptedInliningImpl(); - log(NoReward, /*Success=*/false); + log(/*Success=*/false); } - void log(int64_t Reward, bool Success) { + void log(bool Success) { if (Mandatory) return; InlineEvent Event; Event.AdvisedDecision = isInliningRecommended(); Event.DefaultDecision = DefaultDecision; Event.Effect = Success; - Event.Reward = Reward; 
Logger.logInlineEvent(Event, getAdvisor()->getModelRunner()); } - static const int64_t NoReward = 0; TrainingLogger &Logger; - const std::optional<size_t> CallerSizeEstimateBefore; - const std::optional<size_t> CalleeSizeEstimateBefore; const int64_t DefaultDecision; const int64_t Mandatory; }; @@ -296,9 +248,9 @@ TrainingLogger::TrainingLogger(StringRef LogFileName, if (EC) dbgs() << (EC.message() + ":" + TrainingLog); - L = std::make_unique<Logger>( - std::move(OS), FT, TensorSpec::createSpec<int64_t>(RewardName, {1}), - InlineSizeEstimatorAnalysis::isEvaluatorRequested()); + L = std::make_unique<Logger>(std::move(OS), FT, + TensorSpec::createSpec<int64_t>(RewardName, {1}), + false); L->switchContext(""); } @@ -326,8 +278,6 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, L->logTensorValue(DecisionPos, reinterpret_cast<const char *>(&Event.AdvisedDecision)); L->endObservation(); - if (InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - L->logReward(Event.Reward); // For debugging / later use Effects.push_back(Event.Effect); @@ -340,9 +290,7 @@ DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor( GetModelRunner, std::function<bool(CallBase &)> GetDefaultAdvice) : MLInlineAdvisor(M, MAM, GetModelRunner, GetDefaultAdvice), - IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())), - InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0), - CurrentNativeSize(InitialNativeSize) { + IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())) { // We cannot have the case of neither inference nor logging. if (!TrainingLog.empty()) Logger = std::make_unique<TrainingLogger>( @@ -351,29 +299,12 @@ DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor( assert(IsDoingInference || isLogging()); } -std::optional<size_t> -DevelopmentModeMLInlineAdvisor::getNativeSizeEstimate(const Function &F) const { - if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - return std::nullopt; - auto &R = - FAM.getResult<InlineSizeEstimatorAnalysis>(const_cast<Function &>(F)); - if (!R) { - F.getParent()->getContext().emitError( - "Native size estimator is not present."); - return 0; - } - return *R; -} - std::unique_ptr<MLInlineAdvice> DevelopmentModeMLInlineAdvisor::getMandatoryAdviceImpl(CallBase &CB) { return std::make_unique<LoggingMLInlineAdvice>( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/getCallerORE(CB), /*Recommendation=*/true, /*Logger=*/*Logger, - /*CallerSizeEstimateBefore=*/getNativeSizeEstimate(*CB.getCaller()), - /*CalleeSizeEstimateBefore=*/ - getNativeSizeEstimate(*CB.getCalledFunction()), /*DefaultDecision=*/true, /*Mandatory*/ true); } @@ -391,24 +322,9 @@ DevelopmentModeMLInlineAdvisor::getAdviceFromModel( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/ORE, /*Recommendation=*/Recommendation, /*Logger=*/*Logger, - /*CallerSizeEstimateBefore=*/getNativeSizeEstimate(*CB.getCaller()), - /*CalleeSizeEstimateBefore=*/ - getNativeSizeEstimate(*CB.getCalledFunction()), /*DefaultDecision=*/DefaultAdvice); } -size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { - if (!InlineSizeEstimatorAnalysis::isEvaluatorRequested()) - return 0; - size_t Ret = 0; - for (auto &F : M) { - if (F.isDeclaration()) - continue; - Ret += *getNativeSizeEstimate(F); - } - return Ret; -} - std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function<bool(CallBase &)> GetDefaultAdvice) { diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp deleted file mode 
100644 index fc635726a6aa4..0000000000000 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ /dev/null @@ -1,281 +0,0 @@ -//===- InlineSizeEstimatorAnalysis.cpp - IR to native size from ML model --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This implements feature and label extraction for offline supervised learning -// of a IR to native size model. -// -//===----------------------------------------------------------------------===// -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" - -#ifdef LLVM_HAVE_TFLITE -#include "llvm/Analysis/Utils/TFUtils.h" -#endif -#include "llvm/IR/Function.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -AnalysisKey InlineSizeEstimatorAnalysis::Key; - -#ifdef LLVM_HAVE_TFLITE -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include <algorithm> -#include <deque> -#include <optional> - -static cl::opt<std::string> TFIR2NativeModelPath( - "ml-inliner-ir2native-model", cl::Hidden, - cl::desc("Path to saved model evaluating native size from IR.")); - -#define DEBUG_TYPE "inline-size-estimator" -namespace { -unsigned getMaxInstructionID() { -#define LAST_OTHER_INST(NR) return NR; -#include "llvm/IR/Instruction.def" -} - -class IRToNativeSizeLearning { -public: - enum class NamedFeatureIndex : size_t { - InitialSize, - Blocks, - Calls, - IsLocal, - IsLinkOnceODR, - IsLinkOnce, - Loops, - MaxLoopDepth, - MaxDomTreeLevel, - - NumNamedFeatures - }; - static const size_t NumNamedFeatures = - static_cast<size_t>(NamedFeatureIndex::NumNamedFeatures); - struct FunctionFeatures { - static const size_t FeatureCount; - - std::array<int32_t, NumNamedFeatures> NamedFeatures = {0}; - std::vector<int32_t> InstructionHistogram; - std::vector<int32_t> InstructionPairHistogram; - - void fillTensor(int32_t *Ptr) const; - int32_t &operator[](NamedFeatureIndex Pos) { - return NamedFeatures[static_cast<size_t>(Pos)]; - } - }; - IRToNativeSizeLearning() = default; - - static FunctionFeatures getFunctionFeatures(Function &F, - FunctionAnalysisManager &FAM); -}; - -// This is a point in time - we determined including these pairs of -// consecutive instructions (in the IR layout available at inline time) as -// features improves the model performance. We want to move away from manual -// feature selection. -// The array is given in opcode pairs rather than labels because 1) labels -// weren't readily available, and 2) the successions were hand - extracted. -// -// This array must be sorted. 
-static const std::array<std::pair<size_t, size_t>, 137> - ImportantInstructionSuccessions{ - {{1, 1}, {1, 4}, {1, 5}, {1, 7}, {1, 8}, {1, 9}, {1, 11}, - {1, 12}, {1, 13}, {1, 14}, {1, 18}, {1, 20}, {1, 22}, {1, 24}, - {1, 25}, {1, 26}, {1, 27}, {1, 28}, {1, 29}, {1, 30}, {1, 31}, - {1, 32}, {1, 33}, {1, 34}, {1, 39}, {1, 40}, {1, 42}, {1, 45}, - {2, 1}, {2, 2}, {2, 13}, {2, 28}, {2, 29}, {2, 32}, {2, 33}, - {2, 34}, {2, 38}, {2, 48}, {2, 49}, {2, 53}, {2, 55}, {2, 56}, - {13, 2}, {13, 13}, {13, 26}, {13, 33}, {13, 34}, {13, 56}, {15, 27}, - {28, 2}, {28, 48}, {28, 53}, {29, 2}, {29, 33}, {29, 56}, {31, 31}, - {31, 33}, {31, 34}, {31, 49}, {32, 1}, {32, 2}, {32, 13}, {32, 15}, - {32, 28}, {32, 29}, {32, 32}, {32, 33}, {32, 34}, {32, 39}, {32, 40}, - {32, 48}, {32, 49}, {32, 53}, {32, 56}, {33, 1}, {33, 2}, {33, 32}, - {33, 33}, {33, 34}, {33, 49}, {33, 53}, {33, 56}, {34, 1}, {34, 2}, - {34, 32}, {34, 33}, {34, 34}, {34, 49}, {34, 53}, {34, 56}, {38, 34}, - {39, 57}, {40, 34}, {47, 15}, {47, 49}, {48, 2}, {48, 34}, {48, 56}, - {49, 1}, {49, 2}, {49, 28}, {49, 32}, {49, 33}, {49, 34}, {49, 39}, - {49, 49}, {49, 56}, {53, 1}, {53, 2}, {53, 28}, {53, 34}, {53, 53}, - {53, 57}, {55, 1}, {55, 28}, {55, 34}, {55, 53}, {55, 55}, {55, 56}, - {56, 1}, {56, 2}, {56, 7}, {56, 13}, {56, 32}, {56, 33}, {56, 34}, - {56, 49}, {56, 53}, {56, 56}, {56, 64}, {57, 34}, {57, 56}, {57, 57}, - {64, 1}, {64, 64}, {65, 1}, {65, 65}}}; - -// We have: 9 calculated features (the features here); 1 feature for each -// instruction opcode; and 1 feature for each manually-identified sequence. -// For the latter 2, we build a histogram: we count the number of -// occurrences of each instruction opcode or succession of instructions, -// respectively. -// Note that instruction opcodes start from 1. For convenience, we also have an -// always 0 feature for the '0' opcode, hence the extra 1. 
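For reference, the feature-vector width this deleted analysis computed decomposes as pair features + opcode features + named features; a worked instance with an assumed opcode count (the real number comes from Instruction.def at build time):

// FeatureCount = |ImportantInstructionSuccessions| + (MaxInstructionID + 1)
//              + NumNamedFeatures
// e.g. with a hypothetical MaxInstructionID of 66:
//   137 + (66 + 1) + 9 = 213 features per function.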
-const size_t IRToNativeSizeLearning::FunctionFeatures::FeatureCount = - ImportantInstructionSuccessions.size() + getMaxInstructionID() + 1 + - IRToNativeSizeLearning::NumNamedFeatures; - -size_t getSize(Function &F, TargetTransformInfo &TTI) { - size_t Ret = 0; - for (const auto &BB : F) - for (const auto &I : BB) - Ret += TTI.getInstructionCost( - &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize) - .getValue(); - return Ret; -} - -size_t getSize(Function &F, FunctionAnalysisManager &FAM) { - auto &TTI = FAM.getResult<TargetIRAnalysis>(F); - return getSize(F, TTI); -} - -unsigned getMaxDominatorTreeDepth(const Function &F, - const DominatorTree &Tree) { - unsigned Ret = 0; - for (const auto &BB : F) - if (const auto *TN = Tree.getNode(&BB)) - Ret = std::max(Ret, TN->getLevel()); - return Ret; -} -} // namespace - -IRToNativeSizeLearning::FunctionFeatures -IRToNativeSizeLearning::getFunctionFeatures(Function &F, - FunctionAnalysisManager &FAM) { - assert(llvm::is_sorted(ImportantInstructionSuccessions) && - "expected function features are sorted"); - - auto &DomTree = FAM.getResult<DominatorTreeAnalysis>(F); - FunctionFeatures FF; - size_t InstrCount = getMaxInstructionID() + 1; - FF.InstructionHistogram.resize(InstrCount); - - FF.InstructionPairHistogram.resize(ImportantInstructionSuccessions.size()); - - int StartID = 0; - int LastID = StartID; - auto getPairIndex = [](size_t a, size_t b) { - auto I = llvm::find(ImportantInstructionSuccessions, std::make_pair(a, b)); - if (I == ImportantInstructionSuccessions.end()) - return -1; - return static_cast<int>( - std::distance(ImportantInstructionSuccessions.begin(), I)); - }; - - // We don't want debug calls, because they'd just add noise. - for (const auto &BB : F) { - for (const auto &I : BB.instructionsWithoutDebug()) { - auto ID = I.getOpcode(); - - ++FF.InstructionHistogram[ID]; - int PairIndex = getPairIndex(LastID, ID); - if (PairIndex >= 0) - ++FF.InstructionPairHistogram[PairIndex]; - LastID = ID; - if (isa<CallBase>(I)) - ++FF[NamedFeatureIndex::Calls]; - } - } - - FF[NamedFeatureIndex::InitialSize] = getSize(F, FAM); - FF[NamedFeatureIndex::IsLocal] = F.hasLocalLinkage(); - FF[NamedFeatureIndex::IsLinkOnceODR] = F.hasLinkOnceODRLinkage(); - FF[NamedFeatureIndex::IsLinkOnce] = F.hasLinkOnceLinkage(); - FF[NamedFeatureIndex::Blocks] = F.size(); - auto &LI = FAM.getResult<LoopAnalysis>(F); - FF[NamedFeatureIndex::Loops] = std::distance(LI.begin(), LI.end()); - for (auto &L : LI) - FF[NamedFeatureIndex::MaxLoopDepth] = - std::max(FF[NamedFeatureIndex::MaxLoopDepth], - static_cast<int32_t>(L->getLoopDepth())); - FF[NamedFeatureIndex::MaxDomTreeLevel] = getMaxDominatorTreeDepth(F, DomTree); - return FF; -} - -void IRToNativeSizeLearning::FunctionFeatures::fillTensor(int32_t *Ptr) const { - std::copy(NamedFeatures.begin(), NamedFeatures.end(), Ptr); - Ptr += NamedFeatures.size(); - std::copy(InstructionHistogram.begin(), InstructionHistogram.end(), Ptr); - Ptr += InstructionHistogram.size(); - std::copy(InstructionPairHistogram.begin(), InstructionPairHistogram.end(), - Ptr); -} - -bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() { - return !TFIR2NativeModelPath.empty(); -} - -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() { - if (!isEvaluatorRequested()) { - return; - } - std::vector<TensorSpec> InputSpecs{TensorSpec::createSpec<int32_t>( - "serving_default_input_1", - {1, static_cast<int64_t>( - IRToNativeSizeLearning::FunctionFeatures::FeatureCount)})}; - std::vector<TensorSpec> OutputSpecs{ - 
TensorSpec::createSpec<float>("StatefulPartitionedCall", {1})}; - Evaluator = std::make_unique<TFModelEvaluator>( - TFIR2NativeModelPath.getValue().c_str(), InputSpecs, OutputSpecs); - if (!Evaluator || !Evaluator->isValid()) { - Evaluator.reset(); - return; - } -} - -InlineSizeEstimatorAnalysis::Result -InlineSizeEstimatorAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - if (!Evaluator) - return std::nullopt; - auto Features = IRToNativeSizeLearning::getFunctionFeatures( - const_cast<Function &>(F), FAM); - int32_t *V = Evaluator->getInput<int32_t>(0); - Features.fillTensor(V); - auto ER = Evaluator->evaluate(); - if (!ER) - return std::nullopt; - float Ret = *ER->getTensorValue<float>(0); - if (Ret < 0.0) - Ret = 0.0; - return static_cast<size_t>(Ret); -} - -InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() {} -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis( - InlineSizeEstimatorAnalysis &&Other) - : Evaluator(std::move(Other.Evaluator)) {} - -#else -namespace llvm { -class TFModelEvaluator {}; -} // namespace llvm -InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() = default; -InlineSizeEstimatorAnalysis ::InlineSizeEstimatorAnalysis( - InlineSizeEstimatorAnalysis &&) {} -InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() = default; -InlineSizeEstimatorAnalysis::Result -InlineSizeEstimatorAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - return std::nullopt; -} -bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() { return false; } -#endif - -PreservedAnalyses -InlineSizeEstimatorAnalysisPrinterPass::run(Function &F, - FunctionAnalysisManager &AM) { - OS << "[InlineSizeEstimatorAnalysis] size estimate for " << F.getName() - << ": " << AM.getResult<InlineSizeEstimatorAnalysis>(F) << "\n"; - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1c44014..5d88e5f54e3d6 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ class AccessAnalysis { typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ class AccessAnalysis { /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identfies which sets really need a /// dependence check. @@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. 
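// Why the new DT parameter matters here (illustrative IR, assumed for this
// note rather than taken from the patch): given
//
//   loop:
//     %gep = getelementptr nusw i8, ptr %p, i64 %iv
//     br i1 %cond, label %if.then, label %latch
//   if.then:
//     %v = load i8, ptr %gep
//     br label %latch
//
// a wrapping %gep only produces poison, and that poison is consumed solely
// in iterations where %cond holds, so wrapping is not immediate UB. The
// hunk below therefore accepts the GEP-based argument only when the access
// executes on every iteration (a single-block loop, or a user block that
// needs no predication per LoopAccessInfo::blockNeedsPredication).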
if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp index f31d625eca14c..9d53c37461ba8 100644 --- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -43,13 +43,19 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M, // filenames), so just print a few useful things. for (DICompileUnit *CU : Finder.compile_units()) { O << "Compile unit: "; - auto Lang = - dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName()); - if (!Lang.empty()) - O << Lang; + + DISourceLanguageName Lang = CU->getSourceLanguage(); + auto LangStr = + Lang.hasVersionedName() + ? dwarf::SourceLanguageNameString( + static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName())) + : dwarf::LanguageString(Lang.getName()); + + if (!LangStr.empty()) + O << LangStr; else - O << "unknown-language(" << CU->getSourceLanguage().getUnversionedName() - << ")"; + O << "unknown-language(" << CU->getSourceLanguage().getName() << ")"; + printFile(O, CU->getFilename(), CU->getDirectory()); O << '\n'; } diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp index a83af4ebb430e..33e073b55d59c 100644 --- a/llvm/lib/Analysis/RegionPrinter.cpp +++ b/llvm/lib/Analysis/RegionPrinter.cpp @@ -29,10 +29,9 @@ onlySimpleRegions("only-simple-regions", cl::Hidden, cl::init(false)); -namespace llvm { - -std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, - RegionNode *Graph) { +std::string +llvm::DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, + RegionNode *Graph) { if (!Node->isSubRegion()) { BasicBlock *BB = Node->getNodeAs<BasicBlock>(); @@ -46,7 +45,8 @@ std::string DOTGraphTraits<RegionNode *>::getNodeLabel(RegionNode *Node, } template <> -struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { +struct llvm::DOTGraphTraits<RegionInfo *> + : public llvm::DOTGraphTraits<RegionNode *> { DOTGraphTraits (bool isSimple = false) : DOTGraphTraits<RegionNode*>(isSimple) {} @@ -125,7 +125,6 @@ struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; -} // end namespace llvm namespace { diff --git a/llvm/lib/Analysis/RuntimeLibcallInfo.cpp b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp new file mode 100644 index 0000000000000..6fb4119aa73f2 --- /dev/null +++ b/llvm/lib/Analysis/RuntimeLibcallInfo.cpp @@ -0,0 +1,43 @@ +//===- RuntimeLibcallInfo.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/RuntimeLibcallInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +AnalysisKey RuntimeLibraryAnalysis::Key; + +RTLIB::RuntimeLibcallsInfo +RuntimeLibraryAnalysis::run(const Module &M, ModuleAnalysisManager &) { + return RTLIB::RuntimeLibcallsInfo(M); +} + +INITIALIZE_PASS(RuntimeLibraryInfoWrapper, "runtime-library-info", + "Runtime Library Function Analysis", false, true) + +RuntimeLibraryInfoWrapper::RuntimeLibraryInfoWrapper() + : ImmutablePass(ID), RTLA(RTLIB::RuntimeLibcallsInfo(Triple())) {} + +char RuntimeLibraryInfoWrapper::ID = 0; + +ModulePass *llvm::createRuntimeLibraryInfoWrapperPass() { + return new RuntimeLibraryInfoWrapper(); +} + +void RuntimeLibraryInfoWrapper::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +// Assume this is stable unless explicitly invalidated. +bool RTLIB::RuntimeLibcallsInfo::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<RuntimeLibraryAnalysis>(); + return !PAC.preservedWhenStateless(); +} diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7597f3ad685a0..a31f17b1936d6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2424,10 +2424,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. -static SCEV::NoWrapFlags -StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, - const ArrayRef<const SCEV *> Ops, - SCEV::NoWrapFlags Flags) { +static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, + SCEVTypes Type, + ArrayRef<const SCEV *> Ops, + SCEV::NoWrapFlags Flags) { using namespace std::placeholders; using OBO = OverflowingBinaryOperator; @@ -2540,7 +2540,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, unsigned Idx = isa<SCEVConstant>(Ops[0]) ? 1 : 0; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scAddExpr, Ops, OrigFlags); }; @@ -3125,7 +3125,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, return Folded; // Delay expensive flag strengthening until necessary. - auto ComputeFlags = [this, OrigFlags](const ArrayRef<const SCEV *> Ops) { + auto ComputeFlags = [this, OrigFlags](ArrayRef<const SCEV *> Ops) { return StrengthenNoWrapFlags(this, scMulExpr, Ops, OrigFlags); }; @@ -15510,6 +15510,78 @@ static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr, return SE.getConstant(*ExprVal + DivisorVal - Rem); } +static bool collectDivisibilityInformation( + ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, + DenseMap<const SCEV *, const SCEV *> &DivInfo, + DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) { + // If we have LHS == 0, check if LHS is computing a property of some unknown + // SCEV %v which we can rewrite %v to express explicitly. + if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero())) + return false; + // If LHS is A % B, i.e. 
A % B == 0, rewrite A to (A /u B) * B to + // explicitly express that. + const SCEVUnknown *URemLHS = nullptr; + const SCEV *URemRHS = nullptr; + if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) + return false; + + const SCEV *Multiple = + SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS); + DivInfo[URemLHS] = Multiple; + if (auto *C = dyn_cast<SCEVConstant>(URemRHS)) + Multiples[URemLHS] = C->getAPInt(); + return true; +} + +// Check if the condition is a divisibility guard (A % B == 0). +static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS, + ScalarEvolution &SE) { + const SCEV *X, *Y; + return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero(); +} + +// Apply divisibility by \p Divisor on MinMaxExpr with constant values, +// recursively. This is done by aligning up/down the constant value to the +// Divisor. +static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr, + APInt Divisor, + ScalarEvolution &SE) { + // Return true if \p Expr is a MinMax SCEV expression with a non-negative + // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS + // the non-constant operand and in \p LHS the constant operand. + auto IsMinMaxSCEVWithNonNegativeConstant = + [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, + const SCEV *&RHS) { + if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) { + if (MinMax->getNumOperands() != 2) + return false; + if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) { + if (C->getAPInt().isNegative()) + return false; + SCTy = MinMax->getSCEVType(); + LHS = MinMax->getOperand(0); + RHS = MinMax->getOperand(1); + return true; + } + } + return false; + }; + + const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; + SCEVTypes SCTy; + if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, + MinMaxRHS)) + return MinMaxExpr; + auto IsMin = isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); + assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); + auto *DivisibleExpr = + IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE) + : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE); + SmallVector<const SCEV *> Ops = { + applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr}; + return SE.getMinMaxExpr(SCTy, Ops); +} + void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, @@ -15520,19 +15592,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock( SmallVector<const SCEV *> ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, - DenseMap<const SCEV *, const SCEV *> - &RewriteMap) { + DenseMap<const SCEV *, const SCEV *> &RewriteMap, + const LoopGuards &DivGuards) { // WARNING: It is generally unsound to apply any wrap flags to the proposed // replacement SCEV which isn't directly implied by the structure of that // SCEV. In particular, using contextual facts to imply flags is *NOT* // legal. See the scoping rules for flags in the header to understand why. - // If LHS is a constant, apply information to the other expression. - if (isa<SCEVConstant>(LHS)) { - std::swap(LHS, RHS); - Predicate = CmpInst::getSwappedPredicate(Predicate); - } - // Check for a condition of the form (-C1 + X < C2). InstCombine will // create this form when combining two checks of the form (X u< C2 + C1) and // (X >=u C1). 
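To make the divisibility rewrites above concrete: a guard like (%n urem 8) == 0 makes collectDivisibilityInformation record %n -> (%n /u 8) * 8, while applyDivisibilityOnMinMaxExpr aligns the constant operand of a min/max toward the divisor. A self-contained sketch of the alignment arithmetic, with plain integers standing in for the SCEV constants (values assumed for illustration):

#include <cassert>
#include <cstdint>

// Mirrors getPreviousSCEVDivisibleByDivisor: largest multiple of D <= X.
static uint64_t alignDown(uint64_t X, uint64_t D) { return X - X % D; }

// Mirrors getNextSCEVDivisibleByDivisor: smallest multiple of D >= X.
static uint64_t alignUp(uint64_t X, uint64_t D) {
  uint64_t Rem = X % D;
  return Rem ? X + (D - Rem) : X;
}

int main() {
  // Under a guard "%n % 8 == 0", umin(13, %n) can round its constant
  // operand down (a min may only move down)...
  assert(alignDown(13, 8) == 8);
  // ...while umax(13, %n) would round it up.
  assert(alignUp(13, 8) == 16);
  return 0;
}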
@@ -15565,67 +15631,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock( if (MatchRangeCheckIdiom()) return; - // Return true if \p Expr is a MinMax SCEV expression with a non-negative - // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS - // the non-constant operand and in \p LHS the constant operand. - auto IsMinMaxSCEVWithNonNegativeConstant = - [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, - const SCEV *&RHS) { - const APInt *C; - SCTy = Expr->getSCEVType(); - return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) && - match(LHS, m_scev_APInt(C)) && C->isNonNegative(); - }; - - // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, - // recursively. This is done by aligning up/down the constant value to the - // Divisor. - std::function<const SCEV *(const SCEV *, const SCEV *)> - ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr, - const SCEV *Divisor) { - auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor); - if (!ConstDivisor) - return MinMaxExpr; - const APInt &DivisorVal = ConstDivisor->getAPInt(); - - const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; - SCEVTypes SCTy; - if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, - MinMaxRHS)) - return MinMaxExpr; - auto IsMin = - isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr); - assert(SE.isKnownNonNegative(MinMaxLHS) && - "Expected non-negative operand!"); - auto *DivisibleExpr = - IsMin - ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE) - : getNextSCEVDivisibleByDivisor(MinMaxLHS, DivisorVal, SE); - SmallVector<const SCEV *> Ops = { - ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; - return SE.getMinMaxExpr(SCTy, Ops); - }; - - // If we have LHS == 0, check if LHS is computing a property of some unknown - // SCEV %v which we can rewrite %v to express explicitly. - if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) { - // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to - // explicitly express that. - const SCEVUnknown *URemLHS = nullptr; - const SCEV *URemRHS = nullptr; - if (match(LHS, - m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) { - auto I = RewriteMap.find(URemLHS); - const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS; - RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS); - const auto *Multiple = - SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS); - RewriteMap[URemLHS] = Multiple; - ExprsToRewrite.push_back(URemLHS); - return; - } - } - // Do not apply information for constants or if RHS contains an AddRec. if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS)) return; @@ -15655,7 +15660,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( }; const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); - const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS); + // Apply divisibility information when computing the constant multiple. + const APInt &DividesBy = + SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS)); // Collect rewrites for LHS and its transitive operands based on the // condition. @@ -15670,31 +15677,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // predicate. 
const SCEV *One = SE.getOne(RHS->getType()); switch (Predicate) { - case CmpInst::ICMP_ULT: - if (RHS->getType()->isPointerTy()) - return; - RHS = SE.getUMaxExpr(RHS, One); - [[fallthrough]]; - case CmpInst::ICMP_SLT: { - RHS = SE.getMinusSCEV(RHS, One); - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - } - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_SGT: - RHS = SE.getAddExpr(RHS, One); - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); - break; - default: - break; + case CmpInst::ICMP_ULT: + if (RHS->getType()->isPointerTy()) + return; + RHS = SE.getUMaxExpr(RHS, One); + [[fallthrough]]; + case CmpInst::ICMP_SLT: { + RHS = SE.getMinusSCEV(RHS, One); + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + } + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + RHS = SE.getAddExpr(RHS, One); + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE); + break; + default: + break; } SmallVector<const SCEV *, 16> Worklist(1, LHS); @@ -15840,8 +15847,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // Now apply the information from the collected conditions to // Guards.RewriteMap. Conditions are processed in reverse order, so the - // earliest conditions is processed first. This ensures the SCEVs with the + // earliest condition is processed first, except guards with divisibility + // information, which are moved to the back. This ensures the SCEVs with the // shortest dependency chains are constructed first. + SmallVector<std::tuple<CmpInst::Predicate, const SCEV *, const SCEV *>> + GuardsToProcess; for (auto [Term, EnterIfTrue] : reverse(Terms)) { SmallVector<Value *, 8> Worklist; SmallPtrSet<Value *, 8> Visited; @@ -15856,7 +15866,14 @@ void ScalarEvolution::LoopGuards::collectFromBlock( EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate(); const auto *LHS = SE.getSCEV(Cmp->getOperand(0)); const auto *RHS = SE.getSCEV(Cmp->getOperand(1)); - CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap); + // If LHS is a constant, apply information to the other expression. + // TODO: If LHS is not a constant, check if using CompareSCEVComplexity + // can improve results. + if (isa<SCEVConstant>(LHS)) { + std::swap(LHS, RHS); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + GuardsToProcess.emplace_back(Predicate, LHS, RHS); continue; } @@ -15869,6 +15886,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock( } } + // Process divisibility guards in reverse order to populate DivGuards early. + DenseMap<const SCEV *, APInt> Multiples; + LoopGuards DivGuards(SE); + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) { + if (!isDivisibilityGuard(LHS, RHS, SE)) + continue; + collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap, + Multiples, SE); + } + + for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) + CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards); + + // Apply divisibility information last.
This ensures it is applied to the + // outermost expression after other rewrites for the given value. + for (const auto &[K, Divisor] : Multiples) { + const SCEV *DivisorSCEV = SE.getConstant(Divisor); + Guards.RewriteMap[K] = + SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr( + Guards.rewrite(K), Divisor, SE), + DivisorSCEV), + DivisorSCEV); + ExprsToRewrite.push_back(K); + } + // Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of // the replacement expressions are contained in the ranges of the replaced // expressions. diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 5e92ca1d38e70..fbe74d21c7199 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -30,7 +30,6 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> -#include <memory> #include <tuple> using namespace llvm; diff --git a/llvm/lib/Analysis/TFLiteUtils.cpp b/llvm/lib/Analysis/TFLiteUtils.cpp index 2762e22f28cef..fcef1c8aa7380 100644 --- a/llvm/lib/Analysis/TFLiteUtils.cpp +++ b/llvm/lib/Analysis/TFLiteUtils.cpp @@ -30,7 +30,6 @@ #include "tensorflow/lite/logger.h" #include <cassert> -#include <numeric> #include <optional> using namespace llvm; diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 813632c375308..f97abc9a32707 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -15,33 +15,11 @@ #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; -static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary( - "vector-library", cl::Hidden, cl::desc("Vector functions library"), - cl::init(TargetLibraryInfoImpl::NoLibrary), - cl::values(clEnumValN(TargetLibraryInfoImpl::NoLibrary, "none", - "No vector functions library"), - clEnumValN(TargetLibraryInfoImpl::Accelerate, "Accelerate", - "Accelerate framework"), - clEnumValN(TargetLibraryInfoImpl::DarwinLibSystemM, - "Darwin_libsystem_m", "Darwin libsystem_m"), - clEnumValN(TargetLibraryInfoImpl::LIBMVEC, "LIBMVEC", - "GLIBC Vector Math library"), - clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV", - "IBM MASS vector library"), - clEnumValN(TargetLibraryInfoImpl::SVML, "SVML", - "Intel SVML library"), - clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi", - "SIMD Library for Evaluating Elementary Functions"), - clEnumValN(TargetLibraryInfoImpl::ArmPL, "ArmPL", - "Arm Performance Libraries"), - clEnumValN(TargetLibraryInfoImpl::AMDLIBM, "AMDLIBM", - "AMD vector math library"))); - StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = { #define TLI_DEFINE_STRING @@ -388,6 +366,10 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailableWithName(LibFunc_logbf, "_logbf"); else TLI.setUnavailable(LibFunc_logbf); + TLI.setUnavailable(LibFunc_nextafter); + TLI.setUnavailable(LibFunc_nextafterf); + TLI.setUnavailable(LibFunc_nexttoward); + TLI.setUnavailable(LibFunc_nexttowardf); TLI.setUnavailable(LibFunc_rint); TLI.setUnavailable(LibFunc_rintf); TLI.setUnavailable(LibFunc_round); @@ -418,6 +400,8 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_logbl); 
TLI.setUnavailable(LibFunc_ilogbl); TLI.setUnavailable(LibFunc_nearbyintl); + TLI.setUnavailable(LibFunc_nextafterl); + TLI.setUnavailable(LibFunc_nexttowardl); TLI.setUnavailable(LibFunc_rintl); TLI.setUnavailable(LibFunc_roundl); TLI.setUnavailable(LibFunc_scalblnl); @@ -1386,15 +1370,15 @@ const VecDesc VecFuncs_AMDLIBM[] = { void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( enum VectorLibrary VecLib, const llvm::Triple &TargetTriple) { switch (VecLib) { - case Accelerate: { + case VectorLibrary::Accelerate: { addVectorizableFunctions(VecFuncs_Accelerate); break; } - case DarwinLibSystemM: { + case VectorLibrary::DarwinLibSystemM: { addVectorizableFunctions(VecFuncs_DarwinLibSystemM); break; } - case LIBMVEC: { + case VectorLibrary::LIBMVEC: { switch (TargetTriple.getArch()) { default: break; @@ -1409,15 +1393,15 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case MASSV: { + case VectorLibrary::MASSV: { addVectorizableFunctions(VecFuncs_MASSV); break; } - case SVML: { + case VectorLibrary::SVML: { addVectorizableFunctions(VecFuncs_SVML); break; } - case SLEEFGNUABI: { + case VectorLibrary::SLEEFGNUABI: { switch (TargetTriple.getArch()) { default: break; @@ -1433,7 +1417,7 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case ArmPL: { + case VectorLibrary::ArmPL: { switch (TargetTriple.getArch()) { default: break; @@ -1444,11 +1428,11 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } break; } - case AMDLIBM: { + case VectorLibrary::AMDLIBM: { addVectorizableFunctions(VecFuncs_AMDLIBM); break; } - case NoLibrary: + case VectorLibrary::NoLibrary: break; } } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c47a1c1b23a37..0426ac7e62fab 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1353,9 +1353,9 @@ TargetTransformInfo::getInlineCallPenalty(const Function *F, return TTIImpl->getInlineCallPenalty(F, Call, DefaultCallPenalty); } -bool TargetTransformInfo::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool TargetTransformInfo::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { return TTIImpl->areTypesABICompatible(Caller, Callee, Types); } diff --git a/llvm/lib/Analysis/TrainingLogger.cpp b/llvm/lib/Analysis/TrainingLogger.cpp index 344ca92e18b51..39f79cffdcd88 100644 --- a/llvm/lib/Analysis/TrainingLogger.cpp +++ b/llvm/lib/Analysis/TrainingLogger.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <numeric> using namespace llvm; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..41ff816a33262 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -350,6 +350,139 @@ unsigned llvm::ComputeMaxSignificantBits(const Value *V, const DataLayout &DL, return V->getType()->getScalarSizeInBits() - SignBits + 1; } +/// Try to detect the lerp pattern: a * (b - c) + c * d +/// where a >= 0, b >= 0, c >= 0, d >= 0, and b >= c. 
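+/// (A worked instance, assuming 8-bit blending inputs: with a, b, c and d
+/// all at most 255, the bound U derived below is max(255, 255) * 255 =
+/// 65025 < 2^16, so in an i32 the whole expression is known to have at
+/// least 16 leading zero bits.)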
+/// +/// In that particular case, we can use the following chain of reasoning: +/// +/// a * (b - c) + c * d <= a' * (b - c) + a' * c = a' * b where a' = max(a, d) +/// +/// Since that is true for arbitrary a, b, c and d within our constraints, we +/// can conclude that: +/// +/// max(a * (b - c) + c * d) <= max(max(a), max(d)) * max(b) = U +/// +/// Considering that any result of the lerp would be less than or equal to U, +/// it has at least as many leading 0s as U. +/// +/// While being quite a specific situation, it is fairly common in computer +/// graphics in the shape of alpha blending. +/// +/// Modifies the given KnownOut in place with the inferred information. +static void computeKnownBitsFromLerpPattern(const Value *Op0, const Value *Op1, + const APInt &DemandedElts, + KnownBits &KnownOut, + const SimplifyQuery &Q, + unsigned Depth) { + + Type *Ty = Op0->getType(); + const unsigned BitWidth = Ty->getScalarSizeInBits(); + + // Only handle scalar types for now + if (Ty->isVectorTy()) + return; + + // Try to match: a * (b - c) + c * d. + // When a == 1 => A == nullptr, the same applies to d/D as well. + const Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; + const Instruction *SubBC = nullptr; + + const auto MatchSubBC = [&]() { + // (b - c) can have two forms that interest us: + // + // 1. sub nuw %b, %c + // 2. xor %c, %b + // + // For the first case, the nuw flag guarantees our requirement b >= c. + // + // The second case might happen when the analysis can infer that b is a mask + // for c and we can transform the sub operation into xor (that is usually true + // for constant b's). Even though xor is commutative, canonicalization + // ensures that the constant will be the RHS. We have additional checks + // later on to ensure that this xor operation is equivalent to subtraction. + return m_Instruction(SubBC, m_CombineOr(m_NUWSub(m_Value(B), m_Value(C)), + m_Xor(m_Value(C), m_Value(B)))); + }; + + const auto MatchASubBC = [&]() { + // Cases: + // - a * (b - c) + // - (b - c) * a + // - (b - c) <- a implicitly equals 1 + return m_CombineOr(m_c_Mul(m_Value(A), MatchSubBC()), MatchSubBC()); + }; + + const auto MatchCD = [&]() { + // Cases: + // - d * c + // - c * d + // - c <- d implicitly equals 1 + return m_CombineOr(m_c_Mul(m_Value(D), m_Specific(C)), m_Specific(C)); + }; + + const auto Match = [&](const Value *LHS, const Value *RHS) { + // We use m_Specific(C) in MatchCD, so we have to make sure that C is + // actually bound; match(LHS, MatchASubBC()) absolutely has to evaluate + // first and return true. + // + // If Match returns true, it is guaranteed that B != nullptr, C != nullptr. + return match(LHS, MatchASubBC()) && match(RHS, MatchCD()); + }; + + if (!Match(Op0, Op1) && !Match(Op1, Op0)) + return; + + const auto ComputeKnownBitsOrOne = [&](const Value *V) { + // For some of the values we use the convention of leaving + // it nullptr to signify an implicit constant 1. + return V ?
computeKnownBits(V, DemandedElts, Q, Depth + 1) + : KnownBits::makeConstant(APInt(BitWidth, 1)); + }; + + // Check that all operands are non-negative + const KnownBits KnownA = ComputeKnownBitsOrOne(A); + if (!KnownA.isNonNegative()) + return; + + const KnownBits KnownD = ComputeKnownBitsOrOne(D); + if (!KnownD.isNonNegative()) + return; + + const KnownBits KnownB = computeKnownBits(B, DemandedElts, Q, Depth + 1); + if (!KnownB.isNonNegative()) + return; + + const KnownBits KnownC = computeKnownBits(C, DemandedElts, Q, Depth + 1); + if (!KnownC.isNonNegative()) + return; + + // If we matched subtraction as xor, we need to actually check that xor + // is semantically equivalent to subtraction. + // + // For that to be true, b has to be a mask for c, i.e., b's known + // ones have to cover all known and possible ones of c. + if (SubBC->getOpcode() == Instruction::Xor && + !KnownC.getMaxValue().isSubsetOf(KnownB.getMinValue())) + return; + + const APInt MaxA = KnownA.getMaxValue(); + const APInt MaxD = KnownD.getMaxValue(); + const APInt MaxAD = APIntOps::umax(MaxA, MaxD); + const APInt MaxB = KnownB.getMaxValue(); + + // We can't infer leading zeros info if the upper-bound estimate wraps. + bool Overflow; + const APInt UpperBound = MaxAD.umul_ov(MaxB, Overflow); + + if (Overflow) + return; + + // If we know that x <= y and both are positive, then x has at least the same + // number of leading zeros as y. + const unsigned MinimumNumberOfLeadingZeros = UpperBound.countl_zero(); + KnownOut.Zero.setHighBits(MinimumNumberOfLeadingZeros); +} + static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, bool NSW, bool NUW, const APInt &DemandedElts, @@ -369,6 +502,10 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, isImpliedByDomCondition(ICmpInst::ICMP_SLE, Op1, Op0, Q.CxtI, Q.DL) .value_or(false)) KnownOut.makeNonNegative(); + + if (Add) + // Try to match lerp pattern and combine results + computeKnownBitsFromLerpPattern(Op0, Op1, DemandedElts, KnownOut, Q, Depth); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, @@ -7419,84 +7556,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case
Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). - return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10478,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. + if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. 
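// Note on the new DominatorTree argument threaded through below: the DT is
// what lets LoopAccessAnalysis prove a GEP is dereferenced on every
// iteration, but with /*ShouldCheckWrap=*/false at this call site the
// no-wrap reasoning (and hence the DT) is not consulted here.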
- int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index f2ada27cac01d..a3cd157e6aa61 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -17,8 +17,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/MsgPackDocument.h" -#include <utility> - namespace llvm { namespace AMDGPU { namespace HSAMD { diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb7ae236..0c8af1e7a4565 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index 55fa2df632bfa..a6c7e6afdbe7a 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -1076,10 +1076,3 @@ StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) = LNStandardString; StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) = IndexString; - -constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Form>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Index>::Type[]; -constexpr char llvm::dwarf::EnumTraits<Tag>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[]; -constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[]; diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp index 3de3dccce0c6c..80b421d5f752e 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp @@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> { static void inputOne(IO &IO, StringRef Key, MapDocNode &M) { ScalarDocNode KeyObj = M.getDocument()->getNode(); KeyObj.fromString(Key, ""); - IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]); + IO.mapRequired(Key, M.getMap()[KeyObj]); } static void output(IO &IO, MapDocNode &M) { for (auto I : M.getMap()) { - IO.mapRequired(I.first.toString().c_str(), I.second); + IO.mapRequired(I.first.toString(), I.second); } } }; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 
+2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo &LTOInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. @@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3ca5b4b..003c850275ff4 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ 
b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache { Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. +class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return 
Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. + return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { 
return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..b03895cfc77d7 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -1,22 +1,30 @@ +if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX") + set(additional_libs bsd) +endif() + add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS LINK_LIBS ${LLVM_PTHREAD_LIB} + ${additional_libs} LINK_COMPONENTS Support diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..c3f7a0c4c67ac 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,95 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream in depth-first order to ensure + // all the child nodes are available in the downstream CAS before inserting + // the current object. This uses a similar algorithm to + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema, + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the visitation state for the current node and all of its + /// parents. An UpstreamCursor holds information only from the upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, for nodes either + /// just stored in the CAS or nodes that already exist in the current CAS. + SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The top \p Cur.RefsCount entries of \p PrimaryRefStack are the refs + // for the current node's children. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Pop the children's refs off the stack. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + + // Push the new node's ref and record it in the created objects. + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + + // Pop the cursor at the end, after all uses. + CursorStack.pop_back(); + continue; + } + + // Check if the node exists already. 
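+    // (Each upstream ref is translated at most once: refs already imported
+    // on an earlier branch of the walk are served from \p CreatedObjects
+    // instead of being reloaded from upstream.)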
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If it exists already, just push the already-imported primary ref. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load the child node from the upstream CAS. + auto UpstreamChild = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!UpstreamChild)) + return UpstreamChild.takeError(); + + enqueueNode(CurrentID, *UpstreamChild); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get())
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
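+  // (Until then, resolving the path eagerly against the current working
+  // directory keeps later file accesses independent of cwd changes; this is
+  // what sys::fs::make_absolute below does.)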
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..e6b676accb0fe --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,611 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit, a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock" which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock it gives +/// up and leaves it to the next \p UnifiedOnDiskCache instance that closes to +/// attempt again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use. 
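+///
+/// A minimal usage sketch (illustrative only; the path is hypothetical and
+/// the 32-byte hash size matches the builtin BLAKE3 context used elsewhere
+/// in this patch):
+/// \code
+///   std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB;
+///   if (Error E = ondisk::UnifiedOnDiskCache::open(
+///                     "/path/to/cas", /*SizeLimit=*/std::nullopt,
+///                     /*HashName=*/"BLAKE3", /*HashByteSize=*/32)
+///                     .moveInto(UniDB))
+///     return E;
+///   OnDiskGraphDB &Graph = UniDB->getGraphDB();    // CAS object graph.
+///   OnDiskKeyValueDB &KV = UniDB->getKeyValueDB(); // Action-cache entries.
+///   // Destruction calls close(), which may chain to a fresh primary
+///   // directory if the size limit was exceeded.
+/// \endcode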
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // Little-endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // Little-endian encoded. + UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted in +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
+static Expected<SmallVector<std::string, 4>> +getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { + struct DBDir { + uint64_t Order; + std::string Name; + }; + SmallVector<DBDir> FoundDBDirs; + + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() != sys::fs::file_type::directory_file) + continue; + StringRef SubDir = sys::path::filename(DirI->path()); + if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) { + FoundDBDirs.push_back({0, std::string(SubDir)}); + continue; + } + if (!SubDir.starts_with(DBDirPrefix)) + continue; + uint64_t Order; + if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order)) + return createStringError(inconvertibleErrorCode(), + "unexpected directory " + DirI->path()); + FoundDBDirs.push_back({Order, std::string(SubDir)}); + } + if (EC) + return createFileError(Path, EC); + + llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { + return LHS.Order < RHS.Order; + }); + + SmallVector<std::string, 4> DBDirs; + for (DBDir &Dir : FoundDBDirs) + DBDirs.push_back(std::move(Dir.Name)); + return DBDirs; +} + +static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) { + auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true); + if (!DBDirs) + return DBDirs.takeError(); + + // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure + // out how to handle the leftover sub-directories of the previous version. + + for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) { + StringRef Back(DBDirs->back()); + if (Back.starts_with(CorruptPrefix)) + break; + DBDirs->pop_back(); + } + return *DBDirs; +} + +/// Given a sub-directory named 'v<version>.<x>', writes the next name, +/// 'v<version>.<x+1>', to \p OS (e.g. 'v1.8' becomes 'v1.9'). 
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
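+  // (Only equality with the value recorded in the validation file matters,
+  // so the timestamp's units and epoch are irrelevant here.)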
+ sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status("/proc", Status)) + return createFileError("/proc", EC); + return Status.getLastModificationTime().time_since_epoch().count(); +#else + llvm::report_fatal_error("getBootTime unimplemented"); +#endif +} + +Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded( + StringRef RootPath, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, ValidationFilename); + int FD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(FD != -1); + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); }); + + if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive)) + return createFileError(PathBuf, EC); + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); }); + + SmallString<8> Bytes; + if (Error E = sys::fs::readNativeFileToEOF(File, Bytes)) + return createFileError(PathBuf, std::move(E)); + + uint64_t ValidationBootTime = 0; + if (!Bytes.empty() && + StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime)) + return createFileError(PathBuf, errc::illegal_byte_sequence, + "expected integer"); + + static uint64_t BootTime = 0; + if (BootTime == 0) + if (Error E = getBootTime().moveInto(BootTime)) + return std::move(E); + + if (ValidationBootTime == BootTime && !ForceValidation) + return ValidationResult::Skipped; + + // Validate! + bool NeedsRecovery = false; + if (Error E = + LLVMCasBinaryPath + ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash) + : validateInProcess(RootPath, HashName, HashByteSize, + CheckHash)) { + if (AllowRecovery) { + consumeError(std::move(E)); + NeedsRecovery = true; + } else { + return std::move(E); + } + } + + if (NeedsRecovery) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, "lock"); + + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); }); + if (std::error_code EC = tryLockFileThreadSafe(LockFD)) { + if (EC == std::errc::no_lock_available) + return createFileError( + PathBuf, EC, + "CAS validation requires exclusive access but CAS was in use"); + return createFileError(PathBuf, EC); + } + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + + for (StringRef DBDir : *DBDirs) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, DBDir); + std::error_code EC; + int Attempt = 0, MaxAttempts = 100; + SmallString<128> GCPath; + for (; Attempt < MaxAttempts; ++Attempt) { + GCPath.assign(RootPath); + sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) + + "." + DBDir); + EC = sys::fs::rename(PathBuf, GCPath); + // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST. 
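+        // Keep trying the next index while the destination name is already
+        // taken (another process may be quarantining the same directory);
+        // any other outcome, success included, ends the retry loop.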
+ if (EC != errc::directory_not_empty && EC != errc::file_exists) + break; + } + if (Attempt == MaxAttempts) + return createStringError( + EC, "rename " + PathBuf + + " failed: too many CAS directories awaiting pruning"); + if (EC) + return createStringError(EC, "rename " + PathBuf + " to " + GCPath + + " failed: " + EC.message()); + } + } + + if (ValidationBootTime != BootTime) { + // Fix the filename in case we have an error to report. + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, ValidationFilename); + if (std::error_code EC = sys::fs::resize_file(FD, 0)) + return createFileError(PathBuf, EC); + raw_fd_ostream OS(FD, /*shouldClose=*/false); + OS.seek(0); // resize does not reset position + OS << BootTime << '\n'; + if (OS.has_error()) + return createFileError(PathBuf, OS.error()); + } + + return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid; +} + +Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, + StringRef HashName, unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, "lock"); + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(LockFD != -1); + // Lock the directory with a shared lock, which prevents other processes + // from creating a new chain (essentially, while a \p UnifiedOnDiskCache + // instance holds a shared lock the storage for the primary directory will + // grow unrestricted). + if (std::error_code EC = + lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) + return createFileError(PathBuf, EC); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + if (DBDirs->empty()) + DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + + assert(!DBDirs->empty()); + + /// If there is only one directory, open the databases on it. If there are 2 + /// or more directories, get the most recent directories and chain them, with + /// the most recent being the primary one. The remaining directories are + /// unused data that can be garbage-collected. + auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + if (DBDirs->size() > 1) { + StringRef UpstreamDir = *(DBDirs->end() - 2); + PathBuf = RootPath; + sys::path::append(PathBuf, UpstreamDir); + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + /*UpstreamDB=*/nullptr, FaultInPolicy) + .moveInto(UpstreamGraphDB)) + return std::move(E); + if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t)) + .moveInto(UpstreamKVDB)) + return std::move(E); + } + + StringRef PrimaryDir = *(DBDirs->end() - 1); + PathBuf = RootPath; + sys::path::append(PathBuf, PrimaryDir); + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + UpstreamGraphDB.get(), FaultInPolicy) + .moveInto(PrimaryGraphDB)) + return std::move(E); + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; + // \p UnifiedOnDiskCache does manual chaining for key-value requests, + // including an extra translation step of the value during fault-in. 
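+  // (KV values are ObjectIDs that are only meaningful relative to their own
+  // graph DB, so faultInFromUpstreamKV re-maps an upstream ID onto the
+  // primary graph before re-caching it.)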
+ if (Error E = + OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t), UniDB.get()) + .moveInto(PrimaryKVDB)) + return std::move(E); + + UniDB->RootPath = RootPath; + UniDB->SizeLimit = SizeLimit.value_or(0); + UniDB->LockFD = LockFD; + UniDB->NeedsGarbageCollection = DBDirs->size() > 2; + UniDB->PrimaryDBDir = PrimaryDir; + UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); + UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); + UniDB->UpstreamKVDB = std::move(UpstreamKVDB); + UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + + return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { + this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { + uint64_t TotalSize = getPrimaryStorageSize(); + if (UpstreamGraphDB) + TotalSize += UpstreamGraphDB->getStorageSize(); + if (UpstreamKVDB) + TotalSize += UpstreamKVDB->getStorageSize(); + return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { + return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { + uint64_t CurSizeLimit = SizeLimit; + if (!CurSizeLimit) + return false; + + // If hard-limit utilization is beyond 85%, declare the limit exceeded and + // request cleanup. + unsigned CurrentPercent = + std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), + PrimaryKVDB->getHardStorageLimitUtilization()); + if (CurrentPercent > 85) + return true; + + // We allow each of the directories in the chain to reach up to half the + // intended size limit. Check whether the primary directory has exceeded half + // the limit or not, in order to decide whether we need to start a new chain. + // + // We could check the size limit against the sum of sizes of both the primary + // and upstream directories but then if the upstream is significantly larger + // than the intended limit, it would trigger a new chain to be created before + // the primary has reached its own limit. Essentially, in such a situation we + // prefer reclaiming the storage later in order to have more consistent + // cache-hit behavior. + return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { + if (LockFD == -1) + return Error::success(); // already closed. + auto CloseLock = make_scope_exit([&]() { + assert(LockFD >= 0); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + sys::fs::closeFile(LockFile); + LockFD = -1; + }); + + bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; + UpstreamKVDB.reset(); + PrimaryKVDB.reset(); + UpstreamGraphDB.reset(); + PrimaryGraphDB.reset(); + if (std::error_code EC = unlockFileThreadSafe(LockFD)) + return createFileError(RootPath, EC); + + if (!ExceededSizeLimit) + return Error::success(); + + // The primary directory exceeded its intended size limit. Try to get an + // exclusive lock in order to create a new primary directory for the next + // time this \p UnifiedOnDiskCache path is opened. + + if (std::error_code EC = tryLockFileThreadSafe( + LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { + if (EC == errc::no_lock_available) + return Error::success(); // couldn't get exclusive lock, give up. 
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp index cc760634d7fae..2b6e2f0537524 100644 --- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp +++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp @@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> { template <> struct CustomMappingTraits<IdHashNodeStableMapTy> { static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) { HashNodeStable NodeStable; - io.mapRequired(Key.str().c_str(), NodeStable); + io.mapRequired(Key, NodeStable); unsigned Id; if (Key.getAsInteger(0, Id)) { io.setError("Id not an integer"); @@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> { static void output(IO &io, IdHashNodeStableMapTy &V) { for (auto Iter = V.begin(); Iter != V.end(); ++Iter) - io.mapRequired(utostr(Iter->first).c_str(), Iter->second); + io.mapRequired(utostr(Iter->first), Iter->second); } }; diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 6567bd403c857..060582cec74d8 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <utility> using namespace llvm; @@ -395,7 +394,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Note register reference... const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } @@ -479,7 +478,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, // Note register reference... 
const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index e5c85d588b45e..1ea30d8ab3c2b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -745,11 +745,6 @@ void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const { Asm->emitInt32(QualifiedNameHash); } -constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[]; -constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[]; - #ifndef NDEBUG void AppleAccelTableWriter::Header::print(raw_ostream &OS) const { OS << "Magic: " << format("0x%x", Magic) << "\n" diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 11ca48d9fe05c..bb55fc77fca0f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -12,7 +12,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 8aa488f0efd8f..3aa245b7f3f1e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges, MF.hasBBSections() && NumMBBSectionRanges > 1, // Use static_cast to avoid breakage of tests on windows. static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, - static_cast<bool>(EmitBBHash)}; + static_cast<bool>(EmitBBHash), false}; } void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { @@ -1708,7 +1708,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, OutStreamer->pushSection(); OutStreamer->switchSection(FuncCGSection); - const MCSymbol *FunctionSymbol = getFunctionBegin(); const Function &F = MF.getFunction(); // If this function has external linkage or has its address taken and // it is not a callback, then anything could call it. @@ -1747,7 +1746,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, // 8) Each unique indirect target type id. OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0); OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags)); - OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize()); const auto *TypeId = extractNumericCGTypeId(F); if (IsIndirectTarget && TypeId) OutStreamer->emitInt64(TypeId->getZExtValue()); @@ -2088,6 +2087,17 @@ void AsmPrinter::emitFunctionBody() { // This is only used to influence register allocation behavior, no // actual initialization is needed. break; + case TargetOpcode::RELOC_NONE: { + // Generate a temporary label for the current PC. 
+ MCSymbol *Sym = OutContext.createTempSymbol("reloc_none"); + OutStreamer->emitLabel(Sym); + const MCExpr *Dot = MCSymbolRefExpr::create(Sym, OutContext); + const MCExpr *Value = MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(MI.getOperand(0).getSymbolName()), + OutContext); + OutStreamer->emitRelocDirective(*Dot, "BFD_RELOC_NONE", Value, SMLoc()); + break; + } default: emitInstruction(&MI); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index e57ed24a45065..2ebccee6aa68c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -628,10 +628,15 @@ void CodeViewDebug::beginModule(Module *M) { // When emitting only compiler information, we may have only NoDebug CUs, // which would be skipped by debug_compile_units_begin. NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); + if (CUs->operands().empty()) { + Asm = nullptr; + return; + } Node = *CUs->operands().begin(); } - const auto *CU = cast<DICompileUnit>(Node); - DISourceLanguageName Lang = CU->getSourceLanguage(); + + TheCU = cast<DICompileUnit>(Node); + DISourceLanguageName Lang = TheCU->getSourceLanguage(); CurrentSourceLanguage = Lang.hasVersionedName() ? MapDWARFLanguageToCVLang( @@ -639,7 +644,7 @@ void CodeViewDebug::beginModule(Module *M) { : MapDWARFLanguageToCVLang( static_cast<dwarf::SourceLanguage>(Lang.getName())); if (!M->getCodeViewFlag() || - CU->getEmissionKind() == DICompileUnit::NoDebug) { + TheCU->getEmissionKind() == DICompileUnit::NoDebug) { Asm = nullptr; return; } @@ -900,11 +905,10 @@ void CodeViewDebug::emitCompilerInformation() { OS.AddComment("CPUType"); OS.emitInt16(static_cast<uint64_t>(TheCPU)); - NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); - const MDNode *Node = *CUs->operands().begin(); - const auto *CU = cast<DICompileUnit>(Node); + StringRef CompilerVersion = "0"; + if (TheCU) + CompilerVersion = TheCU->getProducer(); - StringRef CompilerVersion = CU->getProducer(); Version FrontVer = parseVersion(CompilerVersion); OS.AddComment("Frontend version"); for (int N : FrontVer.Part) { diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index c2b878e52e1c3..7fd2cec8c74f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -98,6 +98,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// The codeview CPU type used by the translation unit. codeview::CPUType TheCPU; + const DICompileUnit *TheCU = nullptr; + /// The AsmPrinter used for emitting compiler metadata. When only compiler /// info is being emitted, DebugHandlerBase::Asm may be null. AsmPrinter *CompilerInfoAsm = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index 171fb8394990d..aff6a76879062 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -26,7 +26,6 @@ #include <cassert> #include <map> #include <optional> -#include <utility> using namespace llvm; @@ -112,8 +111,7 @@ void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) { /// to the first intersecting scope range if one exists. 
static std::optional<ArrayRef<InsnRange>::iterator> intersects(const MachineInstr *StartMI, const MachineInstr *EndMI, - const ArrayRef<InsnRange> &Ranges, - const InstructionOrdering &Ordering) { + ArrayRef<InsnRange> Ranges, const InstructionOrdering &Ordering) { for (auto RangesI = Ranges.begin(), RangesE = Ranges.end(); RangesI != RangesE; ++RangesI) { if (EndMI && Ordering.isBefore(EndMI, RangesI->first)) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 567acf75d1b8d..30db817ba3144 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -589,9 +589,9 @@ struct FwdRegParamInfo { /// Register worklist for finding call site values. using FwdRegWorklist = MapVector<uint64_t, SmallVector<FwdRegParamInfo, 2>>; -/// Container for the set of registers known to be clobbered on the path to a -/// call site. -using ClobberedRegSet = SmallSet<Register, 16>; +/// Container for the set of register units known to be clobbered on the path +/// to a call site. +using ClobberedRegUnitSet = SmallSet<MCRegUnit, 16>; /// Append the expression \p Addition to \p Original and return the result. static const DIExpression *combineDIExpressions(const DIExpression *Original, @@ -663,7 +663,7 @@ static void addToFwdRegWorklist(FwdRegWorklist &Worklist, unsigned Reg, static void interpretValues(const MachineInstr *CurMI, FwdRegWorklist &ForwardedRegWorklist, ParamSet &Params, - ClobberedRegSet &ClobberedRegUnits) { + ClobberedRegUnitSet &ClobberedRegUnits) { const MachineFunction *MF = CurMI->getMF(); const DIExpression *EmptyExpr = @@ -695,7 +695,7 @@ static void interpretValues(const MachineInstr *CurMI, // If the MI is an instruction defining one or more parameters' forwarding // registers, add those defines. - ClobberedRegSet NewClobberedRegUnits; + ClobberedRegUnitSet NewClobberedRegUnits; auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI, SmallSetVector<unsigned, 4> &Defs) { if (MI.isDebugInstr()) @@ -778,7 +778,7 @@ static void interpretValues(const MachineInstr *CurMI, static bool interpretNextInstr(const MachineInstr *CurMI, FwdRegWorklist &ForwardedRegWorklist, ParamSet &Params, - ClobberedRegSet &ClobberedRegUnits) { + ClobberedRegUnitSet &ClobberedRegUnits) { // Skip bundle headers. if (CurMI->isBundle()) return true; @@ -848,7 +848,7 @@ static void collectCallSiteParameters(const MachineInstr *CallMI, bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin(); // Search for a loading value in forwarding registers inside call delay slot. - ClobberedRegSet ClobberedRegUnits; + ClobberedRegUnitSet ClobberedRegUnits; if (CallMI->hasDelaySlot()) { auto Suc = std::next(CallMI->getIterator()); // Only one-instruction delay slot is supported. @@ -1544,18 +1544,8 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, } static const DILocalScope *getRetainedNodeScope(const MDNode *N) { - const DIScope *S; - if (const auto *LV = dyn_cast<DILocalVariable>(N)) - S = LV->getScope(); - else if (const auto *L = dyn_cast<DILabel>(N)) - S = L->getScope(); - else if (const auto *IE = dyn_cast<DIImportedEntity>(N)) - S = IE->getScope(); - else - llvm_unreachable("Unexpected retained node!"); - // Ensure the scope is not a DILexicalBlockFile. - return cast<DILocalScope>(S)->getNonLexicalBlockFileScope(); + return DISubprogram::getRetainedNodeScope(N)->getNonLexicalBlockFileScope(); } // Collect variable information from side table maintained by MF. 
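The DwarfDebug change above moves call-site clobber tracking from whole registers to register units, so aliasing registers (for example, a subregister and its super-register) automatically share clobber state. A minimal sketch of the pattern, assuming the usual MachineInstr/TargetRegisterInfo/ADT headers are available; the clobbersReg helper below is illustrative only and not part of this patch:

// Collect the register units clobbered by MI's physical-register defs, then
// test whether any unit of Reg overlaps them. Tracking MCRegUnits instead of
// Registers makes sub-/super-register aliasing fall out naturally.
static bool clobbersReg(const MachineInstr &MI, MCRegister Reg,
                        const TargetRegisterInfo &TRI) {
  SmallSet<MCRegUnit, 16> ClobberedUnits;
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical())
      for (MCRegUnit Unit : TRI.regunits(MO.getReg().asMCReg()))
        ClobberedUnits.insert(Unit);
  return llvm::any_of(TRI.regunits(Reg), [&](MCRegUnit U) {
    return ClobberedUnits.contains(U);
  });
}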
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 700e0ec5813ee..c4929aed1c197 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -19,7 +19,6 @@ #include "llvm/IR/DebugInfoMetadata.h" #include <cassert> #include <cstdint> -#include <iterator> #include <optional> namespace llvm {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index d5dac417756f0..d304c7efe2a75 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -13,7 +13,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include <cassert> -#include <utility> using namespace llvm;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 555c56fd322bb..1666a0e36b39a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -32,7 +32,6 @@ #include <cstdint> #include <limits> #include <string> -#include <utility> using namespace llvm;
@@ -1120,7 +1119,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { constructMemberDIE(Buffer, DDTy); } } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) { - DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer); + DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer, Property); StringRef PropertyName = Property->getName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); if (Property->getType())
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 53f1cfe24a68d..d9bc042d6807e 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - assert(AddrAlign >= - F->getDataLayout().getTypeStoreSize(ResultTy) && + assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) && "Expected at least natural alignment at this point."); // Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + // Atomic RMW expands to a Load-linked / Store-Conditional loop; because it is + // hard to predict precise branch weights, we mark the branch as "unknown" + // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded;
@@ -1680,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop( Loaded->addIncoming(NewLoaded, LoopBB); - Builder.CreateCondBr(Success, ExitBB, LoopBB); + Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB); + + // Atomic RMW expands to a cmpxchg loop. Since precise branch weights + // cannot be easily determined here, we mark the branch as "unknown" (50/50) + // to prevent misleading optimizations. + setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return NewLoaded;
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e317e1c06741f..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF, // clusters are ordered in increasing order of their IDs, with the "Exception" // and "Cold" succeeding all other clusters. // FuncClusterInfo represents the cluster information for basic blocks. It -// maps from BBID of basic blocks to their cluster information. If this is -// empty, it means unique sections for all basic blocks in the function. +// maps from BBID of basic blocks to their cluster information. static void assignSections(MachineFunction &MF, const DenseMap<UniqueBBID, BBClusterInfo> &FuncClusterInfo) {
@@ -197,10 +196,8 @@ assignSections(MachineFunction &MF, for (auto &MBB : MF) { // With the 'all' option, every basic block is placed in a unique section. // With the 'list' option, every basic block is placed in a section - // associated with its cluster, unless we want individual unique sections - // for every basic block in this function (if FuncClusterInfo is empty). - if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All || - FuncClusterInfo.empty()) { + // associated with its cluster. + if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) { // If unique sections are desired for all basic blocks of the function, we // set every basic block's section ID equal to its original position in // the layout (which is equal to its number). This ensures that basic
@@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF)) return false; - // Renumber blocks before sorting them. This is useful for accessing the - // original layout positions and finding the original fallthroughs. - MF.RenumberBlocks(); DenseMap<UniqueBBID, BBClusterInfo> FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto [HasProfile, ClusterInfo] = - getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>() - .getClusterInfoForFunction(MF.getName()); - if (!HasProfile) + auto ClusterInfo = getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>() + .getClusterInfoForFunction(MF.getName()); + if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo); } } + // Renumber blocks before sorting them. This is useful for accessing the + // original layout positions and finding the original fallthroughs.
+ MF.RenumberBlocks(); + MF.setBBSectionsType(BBSectionsType); assignSections(MF, FuncClusterInfo);
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index fbcd614b85d18..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const { } bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { - return getClusterInfoForFunction(FuncName).first; + return !getClusterInfoForFunction(FuncName).empty(); } -std::pair<bool, SmallVector<BBClusterInfo>> +SmallVector<BBClusterInfo> BasicBlockSectionsProfileReader::getClusterInfoForFunction( StringRef FuncName) const { auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second.ClusterInfo) - : std::pair(false, SmallVector<BBClusterInfo>()); + return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo : SmallVector<BBClusterInfo>(); } SmallVector<SmallVector<unsigned>> BasicBlockSectionsProfileReader::getClonePathsForFunction( StringRef FuncName) const { - return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? R->second.ClonePaths + : SmallVector<SmallVector<unsigned>>(); } uint64_t BasicBlockSectionsProfileReader::getEdgeCount(
@@ -287,6 +289,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() { } continue; } + case 'h': { // Basic block hash specifier. + // Skip the profile when the profile iterator (FI) refers to the + // past-the-end element. + if (FI == ProgramPathAndClusterInfo.end()) + continue; + for (auto BBIDHashStr : Values) { + auto [BBIDStr, HashStr] = BBIDHashStr.split(':'); + unsigned long long BBID = 0, Hash = 0; + if (getAsUnsignedInteger(BBIDStr, 10, BBID)) + return createProfileParseError(Twine("unsigned integer expected: '") + + BBIDStr + "'"); + if (getAsUnsignedInteger(HashStr, 16, Hash)) + return createProfileParseError( + Twine("unsigned integer expected in hex format: '") + HashStr + + "'"); + FI->second.BBHashes[BBID] = Hash; + } + continue; + } default: return createProfileParseError(Twine("invalid specifier: '") + Twine(Specifier) + "'");
@@ -475,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot( return BBSPR.isFunctionHot(FuncName); } -std::pair<bool, SmallVector<BBClusterInfo>> +SmallVector<BBClusterInfo> BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction( StringRef FuncName) const { return BBSPR.getClusterInfoForFunction(FuncName);
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 7292bc2be0df2..0b212fb0beb20 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1979,6 +1979,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock::iterator FIB = FBB->begin(); MachineBasicBlock::iterator TIE = TBB->end(); MachineBasicBlock::iterator FIE = FBB->end(); + MachineFunction &MF = *TBB->getParent(); while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count.
TIB = skipDebugInstructionsForward(TIB, TIE, false);
@@ -1993,6 +1994,10 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { // Hard to reason about register liveness with predicated instruction. break; + if (!TII->isSafeToMove(*TIB, TBB, MF)) + // Don't hoist the instruction if it isn't safe to move. + break; + bool IsSafe = true; for (MachineOperand &MO : TIB->operands()) { // Don't attempt to hoist instructions with register masks.
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 1846880b0c181..fead3ee250841 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -133,7 +133,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, } // Get the undef operand's register class - const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx); assert(OpRC && "Not a valid register class"); // If the instruction has a true dependency, we can hide the false depdency
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 4373c5397a3c6..1cf0b4964760b 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen LatencyPriorityQueue.cpp LazyMachineBlockFrequencyInfo.cpp LexicalScopes.cpp + LibcallLoweringInfo.cpp LiveDebugVariables.cpp LiveIntervals.cpp LiveInterval.cpp
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 8ea132626a5af..b6dd174f9be80 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,7 +368,7 @@ class CodeGenPrepare { std::unique_ptr<DominatorTree> DT; public: - CodeGenPrepare(){}; + CodeGenPrepare() = default; CodeGenPrepare(const TargetMachine *TM) : TM(TM){}; /// If encounter huge function, we need to limit the build time. bool IsHugeFunc = false;
@@ -1839,7 +1839,8 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) { /// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. -static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { +static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()))) return false;
@@ -1847,6 +1848,18 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp)) return false; + bool UsedInPhiOrCurrentBlock = any_of(Cmp->users(), [Cmp](User *U) { + return isa<PHINode>(U) || + cast<Instruction>(U)->getParent() == Cmp->getParent(); + }); + + // Avoid sinking larger than legal integer comparisons unless it is only used + // in another BB. + if (UsedInPhiOrCurrentBlock && Cmp->getOperand(0)->getType()->isIntegerTy() && + Cmp->getOperand(0)->getType()->getScalarSizeInBits() > + DL.getLargestLegalIntTypeSizeInBits()) + return false; + + // Only insert a cmp in each block once.
DenseMap<BasicBlock *, CmpInst *> InsertedCmps; @@ -2224,7 +2237,7 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { } bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { - if (sinkCmpExpression(Cmp, *TLI)) + if (sinkCmpExpression(Cmp, *TLI, *DL)) return true; if (combineToUAddWithOverflow(Cmp, ModifiedDT)) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index c1365f499dcf5..02ae722b5a56e 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -210,6 +210,9 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { clEnumValN(FramePointerKind::All, "all", "Disable frame pointer elimination"), clEnumValN(FramePointerKind::NonLeaf, "non-leaf", + "Disable frame pointer elimination for non-leaf frame but " + "reserve the register in leaf functions"), + clEnumValN(FramePointerKind::NonLeafNoReserve, "non-leaf-no-reserve", "Disable frame pointer elimination for non-leaf frame"), clEnumValN(FramePointerKind::Reserved, "reserved", "Enable frame pointer elimination, but reserve the frame " @@ -687,6 +690,8 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, NewAttrs.addAttribute("frame-pointer", "all"); else if (getFramePointerUsage() == FramePointerKind::NonLeaf) NewAttrs.addAttribute("frame-pointer", "non-leaf"); + else if (getFramePointerUsage() == FramePointerKind::NonLeafNoReserve) + NewAttrs.addAttribute("frame-pointer", "non-leaf-no-reserve"); else if (getFramePointerUsage() == FramePointerKind::Reserved) NewAttrs.addAttribute("frame-pointer", "reserved"); else if (getFramePointerUsage() == FramePointerKind::None) diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 86377cff2d29d..3259a3e83c541 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -316,7 +316,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp index c16166a1d5e1c..edbd380eb827b 100644 --- a/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -39,7 +39,6 @@ #include <cassert> #include <iterator> #include <memory> -#include <vector> using namespace llvm; diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index da0987c3b50bb..55caa6e8a8f95 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -134,7 +134,7 @@ class SSAIfConv { BitVector ClobberedRegUnits; // Scratch pad for findInsertionPoint. - SparseSet<unsigned> LiveRegUnits; + SparseSet<MCRegUnit> LiveRegUnits; /// Insertion point in Head for speculatively executed instructions form TBB /// and FBB. 
@@ -421,7 +421,7 @@ bool SSAIfConv::findInsertionPoint() { if (!LiveRegUnits.empty()) { LLVM_DEBUG({ dbgs() << "Would clobber"; - for (unsigned LRU : LiveRegUnits) + for (MCRegUnit LRU : LiveRegUnits) dbgs() << ' ' << printRegUnit(LRU, TRI); dbgs() << " live before " << *I; }); diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 8b74dcebd00ac..c23cac7974d51 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -420,7 +420,7 @@ class StatepointState { LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI, - RC, &TRI, Register()); + RC, Register()); } } @@ -429,7 +429,7 @@ class StatepointState { const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); int FI = RegToSlotIdx[Reg]; if (It != MBB->end()) { - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); return; } @@ -437,7 +437,7 @@ class StatepointState { // and then swap them. assert(!MBB->empty() && "Empty block"); --It; - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); MachineInstr *Reload = It->getPrevNode(); int Dummy = 0; (void)Dummy; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index b3c312569736f..7be7468300569 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -292,7 +292,8 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg, LLVMContext &Ctx = OrigArg.Ty->getContext(); SmallVector<EVT, 4> SplitVTs; - ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0); + ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, /*MemVTs=*/nullptr, Offsets, + 0); if (SplitVTs.size() == 0) return; @@ -996,7 +997,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); @@ -1028,7 +1029,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0); + ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(VRegs.size() == SplitVTs.size()); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9ace7d65413ad..ec4d13f1cd1b3 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -589,8 +589,8 @@ bool CombinerHelper::matchCombineShuffleVector( return true; } -void CombinerHelper::applyCombineShuffleVector( - MachineInstr &MI, const ArrayRef<Register> Ops) const { +void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, + ArrayRef<Register> Ops) const { Register DstReg = MI.getOperand(0).getReg(); Builder.setInsertPt(*MI.getParent(), MI); Register NewDstReg = MRI.cloneVirtualRegister(DstReg); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index d6f23b62519fe..c1fb8b6d78ff8 100644 --- 
a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -643,6 +643,38 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known.Zero.setBitsFrom(LowBits); break; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: { + GExtractVectorElement &Extract = cast<GExtractVectorElement>(MI); + Register InVec = Extract.getVectorReg(); + Register EltNo = Extract.getIndexReg(); + + auto ConstEltNo = getIConstantVRegVal(EltNo, MRI); + + LLT VecVT = MRI.getType(InVec); + // computeKnownBits not yet implemented for scalable vectors. + if (VecVT.isScalableVector()) + break; + + const unsigned EltBitWidth = VecVT.getScalarSizeInBits(); + const unsigned NumSrcElts = VecVT.getNumElements(); + // A return type different from the vector's element type may lead to + // issues with pattern selection. Bail out to avoid that. + if (BitWidth > EltBitWidth) + break; + + Known.Zero.setAllBits(); + Known.One.setAllBits(); + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts); + if (ConstEltNo && ConstEltNo->ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + computeKnownBitsImpl(InVec, Known, DemandedSrcElts, Depth + 1); + break; + } case TargetOpcode::G_SHUFFLE_VECTOR: { APInt DemandedLHS, DemandedRHS; // Collect the known bits that are shared by every vector element referenced diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..53c831b203cae 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
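The G_EXTRACT_VECTOR_ELT known-bits case above demands only the extracted lane when the index is a known in-range constant, and all lanes otherwise. A minimal sketch of that demanded-elements computation, assuming llvm/ADT/APInt.h and <optional> are included; the helper name is hypothetical and only for illustration:

// Demanded-elements mask for extracting one lane out of NumSrcElts: a single
// set bit for a known in-range index, all bits set when the index is unknown
// or out of range.
static APInt demandedEltsForExtract(unsigned NumSrcElts,
                                    std::optional<uint64_t> Idx) {
  if (Idx && *Idx < NumSrcElts)
    return APInt::getOneBitSet(NumSrcElts, *Idx);
  return APInt::getAllOnes(NumSrcElts);
}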
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0));
@@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U);
@@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0;
@@ -2682,13 +2686,20 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::experimental_convergence_entry: case Intrinsic::experimental_convergence_loop: return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder); + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + MIRBuilder.buildInstr(TargetOpcode::RELOC_NONE) + .addExternalSymbol(SymbolName.data()); + return true; + } } return false; } bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
@@ -2779,7 +2790,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U);
@@ -2817,20 +2828,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { if (translateKnownIntrinsic(CI, ID, MIRBuilder)) return true; + TargetLowering::IntrinsicInfo Info; + bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID); + + return translateIntrinsic(CI, ID, MIRBuilder, + IsTgtMemIntrinsic ? &Info : nullptr); +} + +/// Translate a call to an intrinsic. +/// If TLI->getTgtMemIntrinsic() returned true for this call, TgtMemIntrinsicInfo +/// points to the correspondingly populated IntrinsicInfo object; otherwise, +/// this pointer is null. +bool IRTranslator::translateIntrinsic( + const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder, + const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { ArrayRef<Register> ResultRegs; - if (!CI.getType()->isVoidTy()) - ResultRegs = getOrCreateVRegs(CI); + if (!CB.getType()->isVoidTy()) + ResultRegs = getOrCreateVRegs(CB); // Ignore the callsite attributes. Backend code is most likely not expecting // an intrinsic to sometimes have side effects and sometimes not.
MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs); - if (isa<FPMathOperator>(CI)) - MIB->copyIRFlags(CI); + if (isa<FPMathOperator>(CB)) + MIB->copyIRFlags(CB); - for (const auto &Arg : enumerate(CI.args())) { + for (const auto &Arg : enumerate(CB.args())) { // If this is required to be an immediate, don't materialize it in a // register. - if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { + if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) { // imm arguments are more convenient than cimm (and realistically // probably sufficient), so use them. @@ -2859,29 +2884,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. - TargetLowering::IntrinsicInfo Info; - // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { - Align Alignment = Info.align.value_or( - DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); - LLT MemTy = Info.memVT.isSimple() - ? getLLTForMVT(Info.memVT.getSimpleVT()) - : LLT::scalar(Info.memVT.getStoreSizeInBits()); + if (TgtMemIntrinsicInfo) { + const Function *F = CB.getCalledFunction(); + + Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign( + TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext()))); + LLT MemTy = + TgtMemIntrinsicInfo->memVT.isSimple() + ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT()) + : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits()); // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic // didn't yield anything useful. MachinePointerInfo MPI; - if (Info.ptrVal) - MPI = MachinePointerInfo(Info.ptrVal, Info.offset); - else if (Info.fallbackAddressSpace) - MPI = MachinePointerInfo(*Info.fallbackAddressSpace); + if (TgtMemIntrinsicInfo->ptrVal) { + MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal, + TgtMemIntrinsicInfo->offset); + } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) { + MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace); + } MIB.addMemOperand(MF->getMachineMemOperand( - MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(), - /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder)); + MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(), + /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid, + TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder)); } - if (CI.isConvergent()) { - if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) { + if (CB.isConvergent()) { + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { auto *Token = Bundle->Inputs[0].get(); Register TokenReg = getOrCreateVReg(*Token); MIB.addUse(TokenReg, RegState::Implicit); @@ -3453,7 +3482,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, bool IRTranslator::translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const AtomicRMWInst &I = cast<AtomicRMWInst>(U); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 52c43a4ac4a04..d02f097fef829 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -776,7 +776,7 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, break; case TargetOpcode::G_MEMCPY: RTLibcall = 
RTLIB::MEMCPY; - Name = TLI.getMemcpyName(); + Name = TLI.getLibcallImplName(TLI.getMemcpyImpl()).data(); Args[0].Flags[0].setReturned(); break; case TargetOpcode::G_MEMMOVE: diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index ca82857319abc..e8954a3d9899b 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -114,7 +114,7 @@ Register llvm::constrainOperandRegClass( // Assume physical registers are properly constrained. assert(Reg.isVirtual() && "PhysReg not implemented"); - const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI); + const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx); // Some of the target independent instructions, like COPY, may not impose any // register class constraints on some of their operands: If it's a use, we can // skip constraining as the instruction defining the register would constrain @@ -1893,6 +1893,8 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_UADDSAT: case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: return false; case TargetOpcode::G_SSHLSAT: case TargetOpcode::G_USHLSAT: diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index e07e598019709..12b36f56d4d9a 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -232,7 +232,7 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineOperand &UseMO = MI.getOperand(UseOpIdx); if (UseMO.getReg() == MCRegister::NoRegister) { const TargetRegisterClass *RC = - TII->getRegClass(MI.getDesc(), UseOpIdx, TRI); + TII->getRegClass(MI.getDesc(), UseOpIdx); Register NewDest = MRI->createVirtualRegister(RC); // We don't have a way to update dead lanes, so keep track of the // new register so that we avoid querying it later. diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c3e0964594bd5..68370303a3aef 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -473,7 +473,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstrSpan MIS(MII, MBB); // Insert spill without kill flag immediately after def. 
TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot, - MRI.getRegClass(SrcReg), &TRI, Register()); + MRI.getRegClass(SrcReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); @@ -1119,7 +1119,7 @@ void InlineSpiller::insertReload(Register NewVReg, MachineInstrSpan MIS(MI, &MBB); TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI); @@ -1155,7 +1155,7 @@ void InlineSpiller::insertSpill(Register NewVReg, bool isKill, if (IsRealSpill) TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); else // Don't spill undef value. // Anything works for undef, in particular keeping the memory @@ -1729,7 +1729,7 @@ void HoistSpillHelper::hoistAllSpills() { MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB); MachineInstrSpan MIS(MII, BB); TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot, - MRI.getRegClass(LiveReg), &TRI, Register()); + MRI.getRegClass(LiveReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp new file mode 100644 index 0000000000000..5c1698cb6060e --- /dev/null +++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp @@ -0,0 +1,26 @@ +//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LibcallLoweringInfo.h" + +using namespace llvm; + +LibcallLoweringInfo::LibcallLoweringInfo( + const RTLIB::RuntimeLibcallsInfo &RTLCI) + : RTLCI(RTLCI) { + // TODO: This should be generated with lowering predicates, and assert the + // call is available. + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (RTLCI.isAvailable(Impl)) { + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + // FIXME: Hack, assume the first available libcall wins. + if (LibcallImpls[LC] == RTLIB::Unsupported) + LibcallImpls[LC] = Impl; + } + } +} diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index d2f2c3ef33c9c..27c5addffa4ab 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -305,7 +305,7 @@ void LiveIntervals::computeRegMasks() { /// Compute the live range of a register unit, based on the uses and defs of /// aliasing registers. The range should be empty, or contain only dead /// phi-defs from ABI blocks. -void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { +void LiveIntervals::computeRegUnitRange(LiveRange &LR, MCRegUnit Unit) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -354,7 +354,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); // Keep track of the live range sets allocated. 
- SmallVector<unsigned, 8> NewRanges; + SmallVector<MCRegUnit, 8> NewRanges; // Check all basic blocks for live-ins. for (const MachineBasicBlock &MBB : *MF) { @@ -383,7 +383,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. - for (unsigned Unit : NewRanges) + for (MCRegUnit Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); } @@ -1042,7 +1042,7 @@ class LiveIntervals::HMEditor { // physregs, even those that aren't needed for regalloc, in order to update // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill // flags, and postRA passes will use a live register utility instead. - LiveRange *getRegUnitLI(unsigned Unit) { + LiveRange *getRegUnitLI(MCRegUnit Unit) { if (UpdateFlags && !MRI.isReservedRegUnit(Unit)) return &LIS.getRegUnit(Unit); return LIS.getCachedRegUnit(Unit); diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp index 149f93fa69ccb..0260ee2e75aa5 100644 --- a/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -28,7 +28,6 @@ #include <cassert> #include <iterator> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 5b0365da4e8c6..6fe11704a9137 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -88,7 +88,7 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, bool Late, unsigned SubIdx, MachineInstr *ReplaceIndexMI) { assert(RM.OrigMI && "Invalid remat"); - TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri); + TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI); // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg // to false anyway in case the isDead flag of RM.OrigMI's dest register // is true. 
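The LibcallLoweringInfo file added above resolves each RTLIB::Libcall to the first available RTLIB::LibcallImpl. A minimal usage sketch, assuming the class also exposes a trivial getLibcallImpl accessor over the LibcallImpls table (that accessor is not shown in this diff), that TT is the module's target triple, and that the relevant RuntimeLibcalls and raw_ostream headers are included:

// Build the lowering table from the target's runtime-libcall info, then
// resolve memcpy to the implementation chosen for this target.
RTLIB::RuntimeLibcallsInfo RTLCI(TT);
LibcallLoweringInfo Lowering(RTLCI);
RTLIB::LibcallImpl Impl = Lowering.getLibcallImpl(RTLIB::MEMCPY); // assumed accessor
if (Impl != RTLIB::Unsupported)
  errs() << RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl) << "\n";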
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index cfda262aac82d..e3ee8dc325933 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -89,7 +89,7 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, Callable Func) { if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; for (const LiveInterval::SubRange &S : VRegInterval.subranges()) { if ((S.LaneMask & Mask).any()) { @@ -115,7 +115,7 @@ void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) { VRM->assignVirt2Phys(VirtReg.reg(), PhysReg); foreachUnit( - TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range); Matrix[Unit].unify(VirtReg, Range); return false; @@ -132,7 +132,7 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { VRM->clearVirt(VirtReg.reg()); foreachUnit(TRI, VirtReg, PhysReg, - [&](unsigned Unit, const LiveRange &Range) { + [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI)); Matrix[Unit].extract(VirtReg, Range); return false; @@ -175,11 +175,11 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, return false; CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI); - bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, - const LiveRange &Range) { - const LiveRange &UnitRange = LIS->getRegUnit(Unit); - return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); - }); + bool Result = foreachUnit( + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); + }); return Result; } diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 4795d81e3f348..434a579c3be3f 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; + if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) + break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index a72c2c41acc46..32b6c46303828 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -83,13 +83,6 @@ static cl::opt<std::string> ModelUnderTraining( "regalloc-model", cl::Hidden, cl::desc("The model being trained for register allocation eviction")); -static cl::opt<bool> EnableDevelopmentFeatures( - "regalloc-enable-development-features", cl::Hidden, - cl::desc("Whether or not to enable features under development for the ML " - "regalloc advisor")); - -#else -static const bool EnableDevelopmentFeatures = false; #endif // #ifdef LLVM_HAVE_TFLITE /// The score injection pass. 
@@ -212,23 +205,6 @@ static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences}; "lowest stage of an interval in this LR") \ M(float, progress, {1}, "ratio of current queue size to initial size") -#ifdef LLVM_HAVE_TFLITE -#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) \ - M(int64_t, instructions, InstructionsShape, \ - "Opcodes of the instructions covered by the eviction problem") - -#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) \ - M(int64_t, instructions_mapping, InstructionsMappingShape, \ - "A binary matrix mapping LRs to instruction opcodes") \ - M(float, mbb_frequencies, MBBFrequencyShape, \ - "A vector of machine basic block frequencies") \ - M(int64_t, mbb_mapping, InstructionsShape, \ - "A vector of indices mapping instructions to MBBs") -#else -#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) -#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) -#endif - // The model learns to pick one of the mask == 1 interferences. This is the // name of the output tensor. The contract with the model is that the output // will be guaranteed to be to a mask == 1 position. Using a macro here to @@ -242,12 +218,6 @@ enum FeatureIDs { #define _FEATURE_IDX_SIMPLE(_, name, __, ___) name #define _FEATURE_IDX(A, B, C, D) _FEATURE_IDX_SIMPLE(A, B, C, D), RA_EVICT_FEATURES_LIST(_FEATURE_IDX) FeatureCount, -#ifdef LLVM_HAVE_TFLITE - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX_SIMPLE) = FeatureCount, -#else - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX) -#endif // #ifdef LLVM_HAVE_TFLITE - RA_EVICT_REST_DEVELOPMENT_FEATURES(_FEATURE_IDX) FeaturesWithDevelopmentCount #undef _FEATURE_IDX #undef _FEATURE_IDX_SIMPLE }; @@ -268,11 +238,7 @@ void resetInputs(MLModelRunner &Runner) { std::memset(Runner.getTensorUntyped(FeatureIDs::NAME), 0, \ getTotalSize<TYPE>(SHAPE)); RA_EVICT_FEATURES_LIST(_RESET) - if (EnableDevelopmentFeatures) { - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_RESET) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_RESET) #undef _RESET - } } // Per-live interval components that get aggregated into the feature values @@ -398,13 +364,7 @@ class ReleaseModeEvictionAdvisorProvider final public: ReleaseModeEvictionAdvisorProvider(LLVMContext &Ctx) : RegAllocEvictionAdvisorProvider(AdvisorMode::Release, Ctx) { - if (EnableDevelopmentFeatures) { - InputFeatures = {RA_EVICT_FEATURES_LIST( - _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)}; - } else { - InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; - } + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; } // support for isa<> and dyn_cast. 
static bool classof(const RegAllocEvictionAdvisorProvider *R) { @@ -500,25 +460,12 @@ class DevelopmentModeEvictionAdvisorProvider final public: DevelopmentModeEvictionAdvisorProvider(LLVMContext &Ctx) : RegAllocEvictionAdvisorProvider(AdvisorMode::Development, Ctx) { - if (EnableDevelopmentFeatures) { - InputFeatures = {RA_EVICT_FEATURES_LIST( - _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)}; - TrainingInputFeatures = { - RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_TRAIN_FEATURES) - RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}; - } else { - InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; - TrainingInputFeatures = { - RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}; - } + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + TrainingInputFeatures = { + RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) + TensorSpec::createSpec<float>("action_discount", {1}), + TensorSpec::createSpec<int32_t>("action_step_type", {1}), + TensorSpec::createSpec<float>("action_reward", {1})}; if (ModelUnderTraining.empty() && TrainingLog.empty()) { Ctx.emitError("Regalloc development mode should be requested with at " "least logging enabled and/or a training model"); @@ -814,34 +761,6 @@ MCRegister MLEvictAdvisor::tryFindEvictionCandidate( /*NumUrgent*/ 0.0, LRPosInfo); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); -#ifdef LLVM_HAVE_TFLITE - if (EnableDevelopmentFeatures) { - extractInstructionFeatures( - LRPosInfo, Runner, - [this](SlotIndex InputIndex) -> int { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - if (!CurrentMachineInstruction) { - return -1; - } - return CurrentMachineInstruction->getOpcode(); - }, - [this](SlotIndex InputIndex) -> float { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - return MBFI.getBlockFreqRelativeToEntryBlock( - CurrentMachineInstruction->getParent()); - }, - [this](SlotIndex InputIndex) -> MachineBasicBlock * { - auto *CurrentMachineInstruction = - LIS->getInstructionFromIndex(InputIndex); - return CurrentMachineInstruction->getParent(); - }, - FeatureIDs::instructions, FeatureIDs::instructions_mapping, - FeatureIDs::mbb_frequencies, FeatureIDs::mbb_mapping, - LIS->getSlotIndexes()->getLastIndex()); - } -#endif // #ifdef LLVM_HAVE_TFLITE // Normalize the features. for (auto &V : Largest) V = V ? V : 1.0; @@ -987,13 +906,6 @@ void MLEvictAdvisor::extractFeatures( HintWeights += LIFC.HintWeights; NumRematerializable += LIFC.IsRemat; - - if (EnableDevelopmentFeatures) { - for (auto CurrentSegment : LI) { - LRPosInfo.push_back( - LRStartEndInfo{CurrentSegment.start, CurrentSegment.end, Pos}); - } - } } size_t Size = 0; if (!Intervals.empty()) { @@ -1209,9 +1121,7 @@ int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( Log->startObservation(); size_t CurrentFeature = 0; - size_t FeatureCount = EnableDevelopmentFeatures - ? 
FeatureIDs::FeaturesWithDevelopmentCount - : FeatureIDs::FeatureCount; + size_t FeatureCount = FeatureIDs::FeatureCount; for (; CurrentFeature < FeatureCount; ++CurrentFeature) { Log->logTensorValue(CurrentFeature, reinterpret_cast<const char *>( diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index ea08365810a29..5ec7c48d7ee64 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -137,7 +137,7 @@ class CopyTracker { PreservedRegUnits.resize(TRI.getNumRegUnits()); for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) if (!RegMaskOp.clobbersPhysReg(SafeReg)) - for (auto SafeUnit : TRI.regunits(SafeReg)) + for (MCRegUnit SafeUnit : TRI.regunits(SafeReg)) PreservedRegUnits.set(SafeUnit); return PreservedRegUnits; @@ -937,16 +937,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { if (CopyOperands) { Register RegSrc = CopyOperands->Source->getReg(); Register RegDef = CopyOperands->Destination->getReg(); - // It's possible that the previous transformations have resulted in a - // no-op register move (i.e. one where source and destination registers - // are the same and are not referring to a reserved register). If so, - // delete it. - if (RegSrc == RegDef && !MRI->isReserved(RegSrc)) { - MI.eraseFromParent(); - NumDeletes++; - Changed = true; - continue; - } if (!TRI->regsOverlap(RegDef, RegSrc)) { // Copy is now a candidate for deletion. @@ -1005,7 +995,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Invalidate all entries in the copy map which are not preserved by // this register mask. bool MIRefedinCopyInfo = false; - for (unsigned RegUnit : TRI->regunits(Reg)) { + for (MCRegUnit RegUnit : TRI->regunits(Reg)) { if (!PreservedRegUnits.test(RegUnit)) Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr); else { diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 8ad9245a47684..eb46124d9eb5f 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -978,7 +978,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, assert(getMF() && "Can't have an MF reference here!"); // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI); + return TII->getRegClass(getDesc(), OpIdx); if (!getOperand(OpIdx).isReg()) return nullptr; @@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const MachineInstr &Other, // Check each pair of memory operands from both instructions, which can't // alias only if all pairs won't alias. 
- for (auto *MMOa : memoperands()) - for (auto *MMOb : Other.memoperands()) + for (auto *MMOa : memoperands()) { + for (auto *MMOb : Other.memoperands()) { + if (!MMOa->isStore() && !MMOb->isStore()) + continue; if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb)) return true; + } + } return false; } diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index da29ffc9d2fed..fa654f266c89a 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -83,15 +83,21 @@ llvm::createUnpackMachineBundles( return new UnpackMachineBundles(std::move(Ftor)); } -/// Return the first found DebugLoc that has a DILocation, given a range of -/// instructions. The search range is from FirstMI to LastMI (exclusive). If no -/// DILocation is found, then an empty location is returned. +/// Return the first DebugLoc that has line number information, given a +/// range of instructions. The search range is from FirstMI to LastMI +/// (exclusive). If no DebugLoc with line information is found, return the +/// first DILocation found, or an empty location if there are none. static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI) { - for (auto MII = FirstMI; MII != LastMI; ++MII) - if (MII->getDebugLoc()) - return MII->getDebugLoc(); - return DebugLoc(); + DebugLoc DL; + for (auto MII = FirstMI; MII != LastMI; ++MII) { + if (DebugLoc MIIDL = MII->getDebugLoc()) { + if (MIIDL.getLine() != 0) + return MIIDL; + DL = MIIDL.get(); + } + } + return DL; } /// Check if target reg is contained in given lists, which are: @@ -136,6 +142,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; + SmallVector<std::pair<Register, Register>> TiedOperands; + SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) continue; @@ -161,6 +169,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // External def is now killed. KilledUseSet.insert(Reg); } + if (MO.isTied() && Reg.isVirtual()) { + // Record tied operand constraints that involve virtual registers so + // that bundles that are formed pre-register allocation reflect the + // relevant constraints.
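+ // The tie cannot be placed on the BUNDLE instruction at this point + // because the bundle's final operand list is not known yet; it is + // re-established with tieOperands below, once all local defs and + // external uses have been collected.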
+ unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo()); + MachineOperand &TiedMO = MII->getOperand(TiedIdx); + Register DefReg = TiedMO.getReg(); + TiedOperands.emplace_back(DefReg, Reg); + } } } @@ -190,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + + if (MII->mayLoadOrStore()) + MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -203,8 +223,20 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, bool isKill = KilledUseSet.contains(Reg); bool isUndef = UndefUseSet.contains(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | - getImplRegState(true)); + getImplRegState(true)); + } + + for (auto [DefReg, UseReg] : TiedOperands) { + unsigned DefIdx = + std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg)); + unsigned UseIdx = + std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg)); + assert(DefIdx < LocalDefs.size()); + assert(UseIdx < ExternUses.size()); + MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } + + MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 729e73c8c312c..c169467384f8b 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1399,7 +1399,7 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI, if (NewOpc == 0) return nullptr; const MCInstrDesc &MID = TII->get(NewOpc); MachineFunction &MF = *MI->getMF(); - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex); // Ok, we're unfolding. Create a temporary register and do the unfold. Register Reg = MRI->createVirtualRegister(RC); diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index bb9c76ff0c729..8c6d2194433d0 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -363,8 +363,9 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: { // Shallow compare of the two RegMasks - const uint32_t *RegMask = getRegMask(); - const uint32_t *OtherRegMask = Other.getRegMask(); + const uint32_t *RegMask = isRegMask() ? getRegMask() : getRegLiveOut(); + const uint32_t *OtherRegMask = + isRegMask() ? Other.getRegMask() : Other.getRegLiveOut(); if (RegMask == OtherRegMask) return true; @@ -434,7 +435,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { if (const MachineFunction *MF = getMFIfAvailable(MO)) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - const uint32_t *RegMask = MO.getRegMask(); + const uint32_t *RegMask = + MO.isRegMask() ? 
MO.getRegMask() : MO.getRegLiveOut(); std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize); return hash_combine(MO.getType(), MO.getTargetFlags(), stable_hash_combine(RegMaskHashes)); diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 9feb9740de126..9f95c5ee9cbc6 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -420,10 +420,10 @@ struct InstructionMapper { InstructionMapper(const MachineModuleInfo &MMI_) : MMI(MMI_) { // Make sure that the implementation of DenseMapInfo<unsigned> hasn't // changed. - assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && - "DenseMapInfo<unsigned>'s empty key isn't -1!"); - assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && - "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + static_assert(DenseMapInfo<unsigned>::getEmptyKey() == + static_cast<unsigned>(-1)); + static_assert(DenseMapInfo<unsigned>::getTombstoneKey() == + static_cast<unsigned>(-2)); } }; diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index ae284f3ae2929..094315b3903ea 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -665,7 +665,7 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { IsUpdatedCSRsInitialized = true; } -bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { +bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { if (all_of(TRI->superregs_inclusive(*Root), diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 3ed10454f76c5..73993705c4a7b 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -334,7 +334,7 @@ class MachineSchedulerImpl : public MachineSchedulerBase { LiveIntervals &LIS; }; - MachineSchedulerImpl() {} + MachineSchedulerImpl() = default; // Migration only void setLegacyPass(MachineFunctionPass *P) { this->P = P; } void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } @@ -358,7 +358,7 @@ class PostMachineSchedulerImpl : public MachineSchedulerBase { MachineLoopInfo &MLI; AAResults &AA; }; - PostMachineSchedulerImpl() {} + PostMachineSchedulerImpl() = default; // Migration only void setLegacyPass(MachineFunctionPass *P) { this->P = P; } void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } @@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { for (unsigned i = 0; i < ResourceCount; ++i) { ReservedCyclesIndex[i] = NumUnits; NumUnits += SchedModel->getProcResource(i)->NumUnits; - if (isUnbufferedGroup(i)) { + if (isReservedGroup(i)) { auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; U != UE; ++U) @@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, assert(NumberOfInstances > 0 && "Cannot have zero instances of a ProcResource"); - if (isUnbufferedGroup(PIdx)) { + if (isReservedGroup(PIdx)) { // If any subunits are used by the instruction, report that the // subunits of the resource group are available at the first cycle // in which the unit is available, effectively removing the group diff --git a/llvm/lib/CodeGen/MachineSink.cpp 
b/llvm/lib/CodeGen/MachineSink.cpp index cdcb29d92bfe6..0ceeda4eb16d2 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -569,7 +569,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI, // Sink a copy of the instruction, replacing a COPY instruction. MachineBasicBlock::iterator InsertPt = SinkDst->getIterator(); Register DstReg = SinkDst->getOperand(0).getReg(); - TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI, *TRI); + TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI); New = &*std::prev(InsertPt); if (!New->getDebugLoc()) New->setDebugLoc(SinkDst->getDebugLoc()); @@ -2287,6 +2287,10 @@ bool PostRAMachineSinkingImpl::tryToSinkCopy(MachineBasicBlock &CurBB, continue; } + // Don't postRASink instructions that the target prefers not to sink. + if (!TII->shouldPostRASink(MI)) + continue; + if (MI.isDebugOrPseudoInstr()) continue; diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 9d56696079478..6da708d51b95f 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -136,7 +136,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - const uint32_t *RegMask = MO.getRegMask(); + const uint32_t *RegMask = + MO.isRegMask() ? MO.getRegMask() : MO.getRegLiveOut(); std::vector<llvm::stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize); return stable_hash_combine(MO.getType(), MO.getTargetFlags(), diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 9ac3f7411af35..c40bd1c83f34a 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -31,7 +31,6 @@ #include <algorithm> #include <cassert> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..013f52938b65c 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 @@ -2649,8 +2657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { return; } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); OS << printReg(Reg, TRI) << " is not a " @@ -2734,12 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // has register class constraint, the virtual register must // comply to it. 
if (!isPreISelGenericOpcode(MCID.getOpcode()) && - MONum < MCID.getNumOperands() && - TII->getRegClass(MCID, MONum, TRI)) { + MONum < MCID.getNumOperands() && TII->getRegClass(MCID, MONum)) { report("Virtual register does not match instruction constraint", MO, MONum); OS << "Expect register class " - << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI)) + << TRI->getRegClassName(TII->getRegClass(MCID, MONum)) << " but got nothing\n"; return; } @@ -2765,8 +2771,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC, *MF); diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 620d3d3d02daa..d738dc4eea36d 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -244,7 +244,7 @@ static bool canEmitMemcpy(const TargetMachine *TM, Function *F) { if (!TM) return true; const TargetLowering *TLI = TM->getSubtargetImpl(*F)->getTargetLowering(); - return TLI->getMemcpyName() != nullptr; + return TLI->getMemcpyImpl() != RTLIB::Unsupported; } // Return a value appropriate for use with the memset_pattern16 libcall, if diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index b8d54cadc07f6..1400699a607ff 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -58,7 +58,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, UnitInfos[U].Reg = F; } else { for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) { - std::pair<uint32_t, LaneBitmask> P = *I; + std::pair<MCRegUnit, LaneBitmask> P = *I; UnitInfo &UI = UnitInfos[P.first]; UI.Reg = F; UI.Mask = P.second; @@ -281,9 +281,9 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - if (Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (Units.test(Unit)) return true; } return false; @@ -296,9 +296,9 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - if (!Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (!Units.test(Unit)) return false; } return true; @@ -311,9 +311,9 @@ RegisterAggr &RegisterAggr::insert(RegisterRef RR) { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t, LaneBitmask> P = *U; - if ((P.second & RR.Mask).any()) - Units.set(P.first); + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + Units.set(Unit); } return *this; } @@ -384,9 +384,9 @@ RegisterRef RegisterAggr::makeRegRef() const { LaneBitmask M; for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) { - std::pair<uint32_t, LaneBitmask> P = *I; - if (Units.test(P.first)) - M |= P.second; + auto [Unit, LaneMask] = *I; + if (Units.test(Unit)) + M |= LaneMask; } return RegisterRef(F, M); } diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp 
b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 40a89078bcf59..61706e13b8e91 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -193,7 +193,6 @@ void ReachingDefInfo::processDefs(MachineInstr *MI) { for (auto &MO : MI->operands()) { if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && "Can't handle negative frame indicies yet!"); if (!isFIDef(*MI, FrameIndex, TII)) continue; MBBFrameObjsReachingDefs[{MBBNumber, FrameIndex}].push_back(CurInstr); @@ -302,8 +301,6 @@ void ReachingDefInfo::print(raw_ostream &OS) { Register Reg; if (MO.isFI()) { int FrameIndex = MO.getIndex(); - assert(FrameIndex >= 0 && - "Can't handle negative frame indicies yet!"); Reg = Register::index2StackSlot(FrameIndex); } else if (MO.isReg()) { if (MO.isDef()) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 697b779e10106..9097728c84e7e 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -206,7 +206,7 @@ class RegAllocFastImpl { bool Error = false; ///< Could not allocate. explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {} - explicit LiveReg() {} + explicit LiveReg() = default; unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); } }; @@ -594,8 +594,7 @@ void RegAllocFastImpl::spill(MachineBasicBlock::iterator Before, LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI, - VirtReg); + TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, VirtReg); ++NumStores; MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator(); @@ -652,7 +651,7 @@ void RegAllocFastImpl::reload(MachineBasicBlock::iterator Before, << printReg(PhysReg, TRI) << '\n'); int FI = getStackSpaceFor(VirtReg); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg); + TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, VirtReg); ++NumLoads; } @@ -1123,7 +1122,7 @@ bool RegAllocFastImpl::defineVirtReg(MachineInstr &MI, unsigned OpNum, if (MO.isMBB()) { MachineBasicBlock *Succ = MO.getMBB(); TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill, FI, - &RC, TRI, VirtReg); + &RC, VirtReg); ++NumStores; Succ->addLiveIn(PhysReg); } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index 7f013d1f1f726..4affa275cbf8b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -33,7 +33,6 @@ #include "llvm/CodeGen/SpillPlacement.h" #include "llvm/CodeGen/Spiller.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include <algorithm> #include <cstdint> #include <memory> #include <queue> diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index e17a214b9a27d..25c4375a73ce0 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -81,7 +81,7 @@ static cl::opt<bool> EnableJoining("join-liveintervals", static cl::opt<bool> UseTerminalRule("terminal-rule", cl::desc("Apply the terminal rule"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); /// Temporary flag to test critical edge unsplitting. static cl::opt<bool> EnableJoinSplits( @@ -378,7 +378,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { public: // For legacy pass only. 
- RegisterCoalescer() {} + RegisterCoalescer() = default; RegisterCoalescer &operator=(RegisterCoalescer &&Other) = default; RegisterCoalescer(LiveIntervals *LIS, SlotIndexes *SI, @@ -1373,7 +1373,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, } const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); - const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; @@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, SlotIndex DefIndex = CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + + // Refine the subranges that are now defined by the remat. + // This will split existing subranges if necessary. + DstInt.refineSubRanges( + Alloc, DstMask, + [&DefIndex, &Alloc](LiveInterval::SubRange &SR) { + // We know that this lane is defined by this instruction, + // but at this point it might not be live because it was not defined + // by the original instruction. This happens when the + // rematerialization widens the defined register. Assign that lane a + // dead def so that the interferences are properly modeled. + if (!SR.liveAt(DefIndex)) + SR.createDeadDef(DefIndex, Alloc); + }, + *LIS->getSlotIndexes(), *TRI); + for (LiveInterval::SubRange &SR : DstInt.subranges()) { if ((SR.LaneMask & DstMask).none()) { LLVM_DEBUG(dbgs() @@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // updateRegDefUses. The original subrange def may have only undefed // some lanes. UpdatedSubRanges = true; - } else { - // We know that this lane is defined by this instruction, - // but at this point it might not be live because it was not defined - // by the original instruction. This happens when the - // rematerialization widens the defined register. Assign that lane a - // dead def so that the interferences are properly modeled. - if (!SR.liveAt(DefIndex)) - SR.createDeadDef(DefIndex, Alloc); } } if (UpdatedSubRanges) diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 7e26c2ed59949..d8861672a348f 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -276,14 +276,14 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, ": Cannot scavenge register without an emergency " "spill slot!"); } - TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register()); + TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, Register()); MachineBasicBlock::iterator II = std::prev(Before); unsigned FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). 
- TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register()); + TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, Register()); II = std::prev(UseMI); FIOperandNum = getFrameIndexOperandNum(*II); diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e9ffa85c10859..782898f430c19 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -69,7 +69,6 @@ #include <cstdint> #include <optional> #include <string> -#include <utility> using namespace llvm; using namespace llvm::safestack; @@ -196,8 +195,6 @@ class SafeStack { bool run(); }; -constexpr Align SafeStack::StackAlignment; - uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ef5dc2863eb6..d9d3a3ec01757 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -2715,6 +2716,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); AddToWorklist(Add.getNode()); + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z) + // The outer inbounds ptradd might therefore rely on a provenance that x + // does not have. return DAG.getMemBasePlusOffset(X, Add, DL, Flags); } } @@ -2740,6 +2747,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { // that. SDNodeFlags Flags = (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c) + // The outer inbounds ptradd might therefore rely on a provenance that + // GA does not have. SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); AddToWorklist(Inner.getNode()); return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); @@ -2763,8 +2776,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); // If both additions in the original were NUW, reassociation preserves that. - SDNodeFlags ReassocFlags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags(); + SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap; + if (CommonFlags.hasNoUnsignedWrap()) { + // If both operations are NUW and the PTRADD is inbounds, the offsets are + // both non-negative, so the reassociated PTRADDs are also inbounds.
+ ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds; + } if (ZIsConstant != YIsConstant) { if (YIsConstant) @@ -4029,6 +4047,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) { m_ConstInt(AndMask)))) { // Type Legalisation Pattern: // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff) + if (BitWidthDiff.getZExtValue() >= BitWidth) + return SDValue(); unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue(); if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth)) return SDValue(); @@ -6199,6 +6219,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDLoc(N), VT, N0, N1)) return SD; + if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT) && + !TLI.isOperationLegalOrCustom(ISD::UMIN, VT)) { + SDValue B; + + // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0)); + } + + // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0) + if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) { + SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT)); + SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B); + return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0)); + } + } + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -9357,7 +9396,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) { // Check if the bytes offsets we are looking at match with either big or // little endian value loaded. Return true for big endian, false for little // endian, and std::nullopt if match failed. -static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, +static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset) { // The endian can be decided only when it is 2 bytes at least. unsigned Width = ByteOffsets.size(); @@ -10968,6 +11007,22 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { } } + // fold (sra (xor (sra x, c1), -1), c2) -> (xor (sra x, c3), -1) + // This allows merging two arithmetic shifts even when there's a NOT in + // between. + SDValue X; + APInt C1; + if (N1C && sd_match(N0, m_OneUse(m_Not( + m_OneUse(m_Sra(m_Value(X), m_ConstInt(C1))))))) { + APInt C2 = N1C->getAPIntValue(); + zeroExtendToMatch(C1, C2, 1 /* Overflow Bit */); + APInt Sum = C1 + C2; + unsigned ShiftSum = Sum.getLimitedValue(OpSizeInBits - 1); + SDValue NewShift = DAG.getNode( + ISD::SRA, DL, VT, X, DAG.getShiftAmountConstant(ShiftSum, VT, DL)); + return DAG.getNOT(DL, NewShift, VT); + } + // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. 
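A quick standalone check of the identity behind the new sra-through-NOT fold above; this is an illustrative sketch, not part of the patch. The fold relies on ~(x >> c1) >> c2 == ~(x >> (c1 + c2)) holding for arithmetic shifts, with the combined amount clamped exactly as Sum.getLimitedValue(OpSizeInBits - 1) clamps it. The sketch assumes >> on a negative int32_t shifts arithmetically, which mainstream compilers provide and C++20 guarantees.

  #include <cassert>
  #include <cstdint>

  // (sra (xor (sra x, c1), -1), c2), the pattern matched in the input DAG.
  static int32_t beforeFold(int32_t x, unsigned c1, unsigned c2) {
    return ~(x >> c1) >> c2;
  }

  // (xor (sra x, c3), -1) with c3 = min(c1 + c2, 31), the folded form.
  static int32_t afterFold(int32_t x, unsigned c1, unsigned c2) {
    unsigned c3 = c1 + c2 < 31 ? c1 + c2 : 31;
    return ~(x >> c3);
  }

  int main() {
    for (int32_t x : {0, 1, -1, 123456, -654321, INT32_MIN, INT32_MAX})
      for (unsigned c1 = 0; c1 < 32; ++c1)
        for (unsigned c2 = 0; c2 < 32; ++c2)
          assert(beforeFold(x, c1, c2) == afterFold(x, c1, c2));
    return 0;
  }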
@@ -12987,6 +13042,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -12995,7 +13053,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13014,13 +13072,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13028,6 +13089,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13052,7 +13114,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13069,6 +13131,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13096,30 +13160,33 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && 
Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); @@ -13129,8 +13196,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { @@ -16717,38 +16788,51 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } // fold (conv (load x)) -> (load (conv*)x) + // fold (conv (freeze (load x))) -> (freeze (load (conv*)x)) // If the resultant load doesn't need a higher alignment than the original! - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - // Do not remove the cast if the types differ in endian layout. - TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) == - TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) && - // If the load is volatile, we only want to change the load type if the - // resulting load is legal. Otherwise we might increase the number of - // memory accesses. We don't care if the original type was legal or not - // as we assume software couldn't rely on the number of accesses of an - // illegal type. - ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) || - TLI.isOperationLegal(ISD::LOAD, VT))) { - LoadSDNode *LN0 = cast<LoadSDNode>(N0); + auto CastLoad = [this, &VT](SDValue N0, const SDLoc &DL) { + if (!ISD::isNormalLoad(N0.getNode()) || !N0.hasOneUse()) + return SDValue(); - if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG, - *LN0->getMemOperand())) { - // If the range metadata type does not match the new memory - // operation type, remove the range metadata. - if (const MDNode *MD = LN0->getRanges()) { - ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0)); - if (Lower->getBitWidth() != VT.getScalarSizeInBits() || - !VT.isInteger()) { - LN0->getMemOperand()->clearRanges(); - } + // Do not remove the cast if the types differ in endian layout. + if (TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) != + TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout())) + return SDValue(); + + // If the load is volatile, we only want to change the load type if the + // resulting load is legal. Otherwise we might increase the number of + // memory accesses. We don't care if the original type was legal or not + // as we assume software couldn't rely on the number of accesses of an + // illegal type. + auto *LN0 = cast<LoadSDNode>(N0); + if ((LegalOperations || !LN0->isSimple()) && + !TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + if (!TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG, + *LN0->getMemOperand())) + return SDValue(); + + // If the range metadata type does not match the new memory + // operation type, remove the range metadata. 
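+ // A range computed for the old value type would be meaningless, or + // outright wrong, when reinterpreted at the new type, so it is dropped + // rather than carried over.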
+ if (const MDNode *MD = LN0->getRanges()) { + ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0)); + if (Lower->getBitWidth() != VT.getScalarSizeInBits() || !VT.isInteger()) { + LN0->getMemOperand()->clearRanges(); } - SDValue Load = - DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); - return Load; } - } + SDValue Load = DAG.getLoad(VT, DL, LN0->getChain(), LN0->getBasePtr(), + LN0->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); + return Load; + }; + + if (SDValue NewLd = CastLoad(N0, SDLoc(N))) + return NewLd; + + if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) + if (SDValue NewLd = CastLoad(N0.getOperand(0), SDLoc(N))) + return DAG.getFreeze(NewLd); if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI)) return V; @@ -18814,6 +18898,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + if (VT != N1.getValueType()) + return SDValue(); + + // If this is equivalent to a disjoint or, replace it with one. This can + // happen if the sign operand is a sign mask (i.e., x << sign_bit_position). + if (DAG.SignBitIsZeroFP(N0) && + DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) { + // TODO: Just directly match the shift pattern. computeKnownBits is heavy + // for such a narrowly targeted case. + EVT IntVT = VT.changeTypeToInteger(); + // TODO: It appears to be profitable in some situations to unconditionally + // emit a fabs(n0) to perform this combine. + SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0); + SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1); + + SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1, + SDNodeFlags::Disjoint); + return DAG.getNode(ISD::BITCAST, DL, VT, SignOr); + } + return SDValue(); } @@ -22743,7 +22847,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL); PointerInfo = ST->getPointerInfo().getWithOffset(COffset); } else { - NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(), + Idx); } return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(), diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 507b2d61a534c..5c84059da273b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1965,7 +1965,7 @@ Register FastISel::createResultReg(const TargetRegisterClass *RC) { Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op, unsigned OpNum) { if (Op.isVirtual()) { - const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum); if (!MRI.constrainRegClass(Op, RegClass)) { // If it's not legal to COPY between the register classes, something // has gone very wrong before we got here.
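A bit-level illustration of the FCOPYSIGN to disjoint-OR fold above; a sketch under the fold's preconditions, not part of the patch, and assuming C++20 for std::bit_cast. When the sign bit of the magnitude operand is known zero and every bit of the sign operand except the sign bit is known zero, the two bit patterns cannot overlap, so the copysign is exactly an integer OR of the operands (hence the Disjoint flag on the emitted node).

  #include <bit>
  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Precondition: mag's sign bit is zero and only bit 63 of signMask may be
  // set, so the two bit patterns are disjoint.
  static double copysignAsDisjointOr(double mag, double signMask) {
    uint64_t m = std::bit_cast<uint64_t>(mag);
    uint64_t s = std::bit_cast<uint64_t>(signMask);
    assert((m & s) == 0 && "operands must not share set bits");
    return std::bit_cast<double>(m | s);
  }

  int main() {
    double mag = std::fabs(1.5); // sign bit known zero
    assert(copysignAsDisjointOr(mag, -0.0) == std::copysign(mag, -0.0));
    assert(copysignAsDisjointOr(mag, 0.0) == std::copysign(mag, 0.0));
    return 0;
  }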
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..72d0c44889048 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -125,7 +125,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, const TargetRegisterClass *RC = nullptr; if (i + II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( - TII->getRegClass(II, i + II.getNumDefs(), TRI)); + TII->getRegClass(II, i + II.getNumDefs())); } if (!UseRC) UseRC = RC; @@ -197,7 +197,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // register instead of creating a new vreg. Register VRBase; const TargetRegisterClass *RC = - TRI->getAllocatableClass(TII->getRegClass(II, i, TRI)); + TRI->getAllocatableClass(TII->getRegClass(II, i)); // Always let the value type influence the used register class. The // constraints on the instruction may be too lax to represent the value // type correctly. For example, a 64-bit float (X86::FR64) can't live in @@ -330,7 +330,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, if (II) { const TargetRegisterClass *OpRC = nullptr; if (IIOpNum < II->getNumOperands()) - OpRC = TII->getRegClass(*II, IIOpNum, TRI); + OpRC = TII->getRegClass(*II, IIOpNum); if (OpRC) { unsigned MinNumRegs = MinRCSize; @@ -409,8 +409,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op, Register VReg = R->getReg(); MVT OpVT = Op.getSimpleValueType(); const TargetRegisterClass *IIRC = - II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI)) - : nullptr; + II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum)) : nullptr; const TargetRegisterClass *OpRC = TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT, @@ -733,6 +732,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..99d14a60c6ed1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. 
+ SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. 
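+ // The sret temporary is laid out as { sin, cos }, so the cos value sits + // StoreSize bytes past the base pointer.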
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,12 +4827,30 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); - bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results); - if (!Expanded) - llvm_unreachable("Expected scalar FSINCOS[PI] to expand to libcall!"); + bool Expanded = TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results); + if (!Expanded) { + DAG.getContext()->emitError(Twine("no libcall available for ") + + Node->getOperationName(&DAG)); + SDValue Poison = DAG.getPOISON(VT); + Results.push_back(Poison); + Results.push_back(Poison); + } + break; } case ISD::FLOG: @@ -4825,7 +4940,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { EVT VT = Node->getValueType(0); RTLIB::Libcall LC = Node->getOpcode() == ISD::FMODF ? RTLIB::getMODF(VT) : RTLIB::getFREXP(VT); - bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results, + bool Expanded = TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results, /*CallRetResNo=*/0); if (!Expanded) llvm_unreachable("Expected scalar FFREXP/FMODF to expand to libcall!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index bf1abfe50327e..383a025a4d916 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FAKE_USE: Res = SoftenFloatOp_FAKE_USE(N); break; + case ISD::STACKMAP: + Res = SoftenFloatOp_STACKMAP(N, OpNo); + break; + case ISD::PATCHPOINT: + Res = SoftenFloatOp_PATCHPOINT(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) { N->getOperand(0), Op1); } +SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); // Because the first two arguments are guaranteed legal. 
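+ // Replace only the softened operand; every other stackmap operand is + // forwarded to the updated node unchanged.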
+ SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) { + assert(OpNo >= 7); + SmallVector<SDValue> NewOps(N->ops()); + NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Float Result Expansion //===----------------------------------------------------------------------===// @@ -1706,7 +1726,7 @@ void DAGTypeLegalizer::ExpandFloatRes_UnaryWithTwoFPResults( SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo) { assert(!N->isStrictFPOpcode() && "strictfp not implemented"); SmallVector<SDValue> Results; - DAG.expandMultipleResultFPLibCall(LC, N, Results, CallRetResNo); + TLI.expandMultipleResultFPLibCall(DAG, LC, N, Results, CallRetResNo); for (auto [ResNo, Res] : enumerate(Results)) { SDValue Lo, Hi; GetPairElements(Res, Lo, Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b1776eaae6e86..44e5a187c4281 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); // Because the first two arguments are guaranteed legal. SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { assert(OpNo >= 7); SmallVector<SDValue> NewOps(N->ops()); - SDValue Operand = N->getOperand(OpNo); - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); - NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9656a30321efa..ede522eff6df3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -658,6 +658,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); SDValue SoftenFloatOp_FAKE_USE(SDNode *N); + SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..7d979caa8bf82 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case 
ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: @@ -1268,18 +1270,23 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FSINCOS: case ISD::FSINCOSPI: { - EVT VT = Node->getValueType(0).getVectorElementType(); + EVT VT = Node->getValueType(0); RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results)) + if (LC != RTLIB::UNKNOWN_LIBCALL && + TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results)) return; + + // TODO: Try to see if there's a narrower call available to use before + // scalarizing. break; } case ISD::FMODF: { - RTLIB::Libcall LC = - RTLIB::getMODF(Node->getValueType(0).getVectorElementType()); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, + EVT VT = Node->getValueType(0); + RTLIB::Libcall LC = RTLIB::getMODF(VT); + if (LC != RTLIB::UNKNOWN_LIBCALL && + TLI.expandMultipleResultFPLibCall(DAG, LC, Node, Results, /*CallRetResNo=*/0)) return; break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bb4a8d9967f94..6284ded3be922 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1474,6 +1474,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3689,6 +3690,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } @@ -6055,11 +6057,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); - // Build a vector with undefined for the new nodes. + // Build a vector with poison for the new nodes. EVT VT = N->getValueType(0); // Integer BUILD_VECTOR operands may be larger than the node's vector element - // type. The UNDEFs need to have the same type as the existing operands. + // type. The POISONs need to have the same type as the existing operands. 
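+ // The padding lanes are never meaningfully read after widening, so + // poison is the least-constraining filler.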
EVT EltVT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -6068,7 +6070,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SmallVector<SDValue, 16> NewOps(N->ops()); assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); - NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); + NewOps.append(WidenNumElts - NumElts, DAG.getPOISON(EltVT)); return DAG.getBuildVector(WidenVT, dl, NewOps); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index f70b6cddcc099..12fc26d949581 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -340,7 +340,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx); assert(RC && "Not a valid register class"); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 379242ec5a157..c2b4c19846316 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -76,7 +76,6 @@ #include <cstdlib> #include <limits> #include <optional> -#include <set> #include <string> #include <utility> #include <vector> @@ -2468,180 +2467,6 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } -/// Given a store node \p StoreNode, return true if it is safe to fold that node -/// into \p FPNode, which expands to a library call with output pointers. -static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, - SDNode *FPNode) { - SmallVector<const SDNode *, 8> Worklist; - SmallVector<const SDNode *, 8> DeferredNodes; - SmallPtrSet<const SDNode *, 16> Visited; - - // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). - for (SDValue Op : StoreNode->ops()) - if (Op.getNode() != FPNode) - Worklist.push_back(Op.getNode()); - - unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); - while (!Worklist.empty()) { - const SDNode *Node = Worklist.pop_back_val(); - auto [_, Inserted] = Visited.insert(Node); - if (!Inserted) - continue; - - if (MaxSteps > 0 && Visited.size() >= MaxSteps) - return false; - - // Reached the FPNode (would result in a cycle). - // OR Reached CALLSEQ_START (would result in nested call sequences). - if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START) - return false; - - if (Node->getOpcode() == ISD::CALLSEQ_END) { - // Defer looking into call sequences (so we can check we're outside one). - // We still need to look through these for the predecessor check. - DeferredNodes.push_back(Node); - continue; - } - - for (SDValue Op : Node->ops()) - Worklist.push_back(Op.getNode()); - } - - // True if we're outside a call sequence and don't have the FPNode as a - // predecessor. No cycles or nested call sequences possible. 
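The comment above states the safety condition that `canFoldStoreIntoLibCallOutputPointers` (removed here and re-added to TargetLowering.cpp later in this patch) enforces, and the `return` that follows completes it via `hasPredecessorHelper`. The core of the check is a bounded worklist walk from the store's other operands that must never reach the FP node itself. A self-contained sketch of that traversal over a toy node graph, omitting the CALLSEQ_END deferral of the real code:

```cpp
#include <unordered_set>
#include <vector>

// Toy DAG node: just operand edges. Illustrative only; the real check
// walks SDNode operands and additionally defers CALLSEQ_END nodes so it
// can prove the store is not inside another call sequence.
struct Node {
  std::vector<const Node *> Operands;
};

// True if Target is reachable from Start's operands, i.e. folding
// Start's stored value into Target would create a cycle.
static bool reachesTarget(const Node *Start, const Node *Target,
                          unsigned MaxSteps) {
  std::vector<const Node *> Worklist(Start->Operands.begin(),
                                     Start->Operands.end());
  std::unordered_set<const Node *> Visited;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue; // Already explored.
    if (MaxSteps && Visited.size() >= MaxSteps)
      return true; // Give up conservatively, like the bounded LLVM walk.
    if (N == Target)
      return true;
    for (const Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return false;
}
```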
- return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes, - MaxSteps); -} - -bool SelectionDAG::expandMultipleResultFPLibCall( - RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results, - std::optional<unsigned> CallRetResNo) { - LLVMContext &Ctx = *getContext(); - EVT VT = Node->getValueType(0); - unsigned NumResults = Node->getNumValues(); - - if (LC == RTLIB::UNKNOWN_LIBCALL) - return false; - - const char *LCName = TLI->getLibcallName(LC); - if (!LCName) - return false; - - auto getVecDesc = [&]() -> VecDesc const * { - for (bool Masked : {false, true}) { - if (VecDesc const *VD = getLibInfo().getVectorMappingInfo( - LCName, VT.getVectorElementCount(), Masked)) { - return VD; - } - } - return nullptr; - }; - - // For vector types, we must find a vector mapping for the libcall. - VecDesc const *VD = nullptr; - if (VT.isVector() && !(VD = getVecDesc())) - return false; - - // Find users of the node that store the results (and share input chains). The - // destination pointers can be used instead of creating stack allocations. - SDValue StoresInChain; - SmallVector<StoreSDNode *, 2> ResultStores(NumResults); - for (SDNode *User : Node->users()) { - if (!ISD::isNormalStore(User)) - continue; - auto *ST = cast<StoreSDNode>(User); - SDValue StoreValue = ST->getValue(); - unsigned ResNo = StoreValue.getResNo(); - // Ensure the store corresponds to an output pointer. - if (CallRetResNo == ResNo) - continue; - // Ensure the store to the default address space and not atomic or volatile. - if (!ST->isSimple() || ST->getAddressSpace() != 0) - continue; - // Ensure all store chains are the same (so they don't alias). - if (StoresInChain && ST->getChain() != StoresInChain) - continue; - // Ensure the store is properly aligned. - Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx); - if (ST->getAlign() < - getDataLayout().getABITypeAlign(StoreType->getScalarType())) - continue; - // Avoid: - // 1. Creating cyclic dependencies. - // 2. Expanding the node to a call within a call sequence. - if (!canFoldStoreIntoLibCallOutputPointers(ST, Node)) - continue; - ResultStores[ResNo] = ST; - StoresInChain = ST->getChain(); - } - - TargetLowering::ArgListTy Args; - - // Pass the arguments. - for (const SDValue &Op : Node->op_values()) { - EVT ArgVT = Op.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(Ctx); - Args.emplace_back(Op, ArgTy); - } - - // Pass the output pointers. - SmallVector<SDValue, 2> ResultPtrs(NumResults); - Type *PointerTy = PointerType::getUnqual(Ctx); - for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) { - if (ResNo == CallRetResNo) - continue; - EVT ResVT = Node->getValueType(ResNo); - SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(ResVT); - ResultPtrs[ResNo] = ResultPtr; - Args.emplace_back(ResultPtr, PointerTy); - } - - SDLoc DL(Node); - - // Pass the vector mask (if required). - if (VD && VD->isMasked()) { - EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT); - SDValue Mask = getBoolConstant(true, DL, MaskVT, VT); - Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); - } - - Type *RetType = CallRetResNo.has_value() - ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx) - : Type::getVoidTy(Ctx); - SDValue InChain = StoresInChain ? StoresInChain : getEntryNode(); - SDValue Callee = getExternalSymbol(VD ? 
VD->getVectorFnName().data() : LCName, - TLI->getPointerTy(getDataLayout())); - TargetLowering::CallLoweringInfo CLI(*this); - CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( - TLI->getLibcallCallingConv(LC), RetType, Callee, std::move(Args)); - - auto [Call, CallChain] = TLI->LowerCallTo(CLI); - - for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { - if (ResNo == CallRetResNo) { - Results.push_back(Call); - continue; - } - MachinePointerInfo PtrInfo; - SDValue LoadResult = - getLoad(Node->getValueType(ResNo), DL, CallChain, ResultPtr, PtrInfo); - SDValue OutChain = LoadResult.getValue(1); - - if (StoreSDNode *ST = ResultStores[ResNo]) { - // Replace store with the library call. - ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); - PtrInfo = ST->getPointerInfo(); - } else { - PtrInfo = MachinePointerInfo::getFixedStack( - getMachineFunction(), cast<FrameIndexSDNode>(ResultPtr)->getIndex()); - } - - Results.push_back(LoadResult); - } - - return true; -} - SDValue SelectionDAG::expandVAArg(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); @@ -2921,6 +2746,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } +bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const { + if (Depth >= MaxRecursionDepth) + return false; // Limit search depth. + + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FABS: + return true; + case ISD::AssertNoFPClass: { + FPClassTest NoFPClass = + static_cast<FPClassTest>(Op.getConstantOperandVal(1)); + + const FPClassTest TestMask = fcNan | fcNegative; + return (NoFPClass & TestMask) == TestMask; + } + case ISD::ARITH_FENCE: + return SignBitIsZeroFP(Op, Depth + 1); + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FEXP10: + return Op->getFlags().hasNoNaNs(); + default: + return false; + } + + llvm_unreachable("covered opcode switch"); +} + /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be zero /// for bits that V cannot have. @@ -4122,6 +3975,25 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.One.clearLowBits(LogOfAlign); break; } + case ISD::AssertNoFPClass: { + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + FPClassTest NoFPClass = + static_cast<FPClassTest>(Op.getConstantOperandVal(1)); + const FPClassTest NegativeTestMask = fcNan | fcNegative; + if ((NoFPClass & NegativeTestMask) == NegativeTestMask) { + // Cannot be negative. + Known.makeNonNegative(); + } + + const FPClassTest PositiveTestMask = fcNan | fcPositive; + if ((NoFPClass & PositiveTestMask) == PositiveTestMask) { + // Cannot be positive. + Known.makeNegative(); + } + + break; + } case ISD::FGETSIGN: // All bits are zero except the low bit. 
Known.Zero.setBitsFrom(1); @@ -6233,7 +6105,17 @@ bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const { if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(Op, true)) return !C1->isNegative(); - return Op.getOpcode() == ISD::FABS; + switch (Op.getOpcode()) { + case ISD::FABS: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FEXP10: + return true; + default: + return false; + } + + llvm_unreachable("covered opcode switch"); } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { @@ -8404,7 +8286,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); @@ -9257,21 +9140,22 @@ SDValue SelectionDAG::getMemcpy( // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); bool IsTailCall = false; - const char *MemCpyName = TLI->getMemcpyName(); + RTLIB::LibcallImpl MemCpyImpl = TLI->getMemcpyImpl(); if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy"); + bool LowersToMemcpy = MemCpyImpl == RTLIB::impl_memcpy; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemcpy); } CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee( - TLI->getLibcallCallingConv(RTLIB::MEMCPY), + TLI->getLibcallImplCallingConv(MemCpyImpl), Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(MemCpyName, TLI->getPointerTy(getDataLayout())), + getExternalSymbol(TLI->getLibcallImplName(MemCpyImpl).data(), + TLI->getPointerTy(getDataLayout())), std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9361,22 +9245,24 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, // FIXME: pass in SDLoc TargetLowering::CallLoweringInfo CLI(*this); + RTLIB::LibcallImpl MemmoveImpl = TLI->getLibcallImpl(RTLIB::MEMMOVE); + bool IsTailCall = false; if (OverrideTailCall.has_value()) { IsTailCall = *OverrideTailCall; } else { - bool LowersToMemmove = - TLI->getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove"); + bool LowersToMemmove = MemmoveImpl == RTLIB::impl_memmove; IsTailCall = isInTailCallPositionWrapper(CI, this, LowersToMemmove); } CLI.setDebugLoc(dl) .setChain(Chain) - .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee( + TLI->getLibcallImplCallingConv(MemmoveImpl), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallImplName(MemmoveImpl).data(), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(IsTailCall); @@ -9492,8 +9378,10 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TLI->getPointerTy(DL)), std::move(Args)); } - bool LowersToMemset = - TLI->getLibcallName(RTLIB::MEMSET) == StringRef("memset"); + + RTLIB::LibcallImpl MemsetImpl = TLI->getLibcallImpl(RTLIB::MEMSET); + bool LowersToMemset = MemsetImpl == RTLIB::impl_memset; + // If we're going to use bzero, make sure not to tail call unless the // subsequent return doesn't need a value, as bzero doesn't return the first // arg unlike memset. 
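Two of the SelectionDAG.cpp hunks above, the new `SignBitIsZeroFP` and the `computeKnownBits` case for `ISD::AssertNoFPClass`, reduce to one mask check: the sign bit is known clear only when the `nofpclass` set excludes every negative class *and* NaN, since a NaN's sign bit is unconstrained (and symmetrically for known-negative). A self-contained model of that test; the enumerators mirror `llvm::FPClassTest` from llvm/ADT/FloatingPointMode.h but are re-declared here for illustration:

```cpp
#include <cstdio>

// Simplified stand-in for llvm::FPClassTest: one bit per IEEE-754 class.
enum FPClassTest : unsigned {
  fcSNan = 1u << 0, fcQNan = 1u << 1,
  fcNegInf = 1u << 2, fcNegNormal = 1u << 3,
  fcNegSubnormal = 1u << 4, fcNegZero = 1u << 5,
  fcPosZero = 1u << 6, fcPosSubnormal = 1u << 7,
  fcPosNormal = 1u << 8, fcPosInf = 1u << 9,
  fcNan = fcSNan | fcQNan,
  fcNegative = fcNegInf | fcNegNormal | fcNegSubnormal | fcNegZero,
};

// Sign bit known zero only if *every* class that could set it -- all the
// negative classes plus NaN, whose sign is unspecified -- is excluded.
static bool signBitKnownZero(unsigned NoFPClass) {
  const unsigned TestMask = fcNan | fcNegative;
  return (NoFPClass & TestMask) == TestMask;
}

int main() {
  printf("%d\n", signBitKnownZero(fcNan | fcNegative)); // 1
  printf("%d\n", signBitKnownZero(fcNegative));         // 0: NaN sign unknown
}
```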
@@ -12741,6 +12629,10 @@ void SelectionDAG::getTopologicallyOrderedNodes( for (unsigned i = 0U; i < SortedNodes.size(); ++i) { const SDNode *N = SortedNodes[i]; for (const SDNode *U : N->users()) { + // HandleSDNode is never part of a DAG and therefore has no entry in + // RemainingOperands. + if (U->getOpcode() == ISD::HANDLENODE) + continue; unsigned &NumRemOperands = RemainingOperands[U]; assert(NumRemOperands && "Invalid number of remaining operands"); --NumRemOperands; @@ -12754,8 +12646,6 @@ void SelectionDAG::getTopologicallyOrderedNodes( "First node in topological sort is not the entry token"); assert(SortedNodes.front()->getNumOperands() == 0 && "First node in topological sort has operands"); - assert(SortedNodes.back()->use_empty() && - "Last node in topologic sort has users"); } /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the @@ -13057,6 +12947,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..7b3a0881feb10 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) { // Update successor info. addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne()); - for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) { - BasicBlock *Dest = I.getIndirectDest(i); + for (BasicBlock *Dest : I.getIndirectDests()) { MachineBasicBlock *Target = FuncInfo.getMBB(Dest); Target->setIsInlineAsmBrIndirectTarget(); // If we introduce a type of asm goto statement that is permitted to use an @@ -4639,6 +4638,12 @@ static std::optional<ConstantRange> getRange(const Instruction &I) { return std::nullopt; } +static FPClassTest getNoFPClass(const Instruction &I) { + if (const auto *CB = dyn_cast<CallBase>(&I)) + return CB->getRetNoFPClass(); + return fcNone; +} + void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (I.isAtomic()) return visitAtomicLoad(I); @@ -4759,7 +4764,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SmallVector<uint64_t, 4> Offsets; const Value *SrcV = I.getOperand(0); ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &Offsets, 0); + SrcV->getType(), ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -4795,7 +4800,7 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty, - ValueVTs, &Offsets, 0); + ValueVTs, /*MemVTs=*/nullptr, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -5313,18 +5318,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { DAG.setRoot(OutChain); } -/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC -/// node. 
-void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, - unsigned Intrinsic) { - // Ignore the callsite's attributes. A specific call site may be marked with - // readnone, but the lowering code will expect the chain based on the - // definition. +/// Check if this intrinsic call depends on the chain (1st return value) +/// and if it only *loads* memory. +/// Ignore the callsite's attributes. A specific call site may be marked with +/// readnone, but the lowering code will expect the chain based on the +/// definition. +std::pair<bool, bool> +SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) { const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow(); + return {HasChain, OnlyLoad}; +} + +SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Build the operand list. SmallVector<SDValue, 8> Ops; if (HasChain) { // If this intrinsic has side-effects, chainify it. @@ -5336,17 +5349,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - // Info is set by getTgtMemIntrinsic - TargetLowering::IntrinsicInfo Info; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, - DAG.getMachineFunction(), - Intrinsic); - // Add the intrinsic ID as an integer operand if it's not a target intrinsic. - if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || - Info.opc == ISD::INTRINSIC_W_CHAIN) - Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(), + if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID || + TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN) + Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout()))); // Add all operands of the call to the operand list. @@ -5369,13 +5375,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } + if (std::optional<OperandBundleUse> Bundle = + I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + Value *Token = Bundle->Inputs[0].get(); + SDValue ConvControlToken = getValue(Token); + assert(Ops.back().getValueType() != MVT::Glue && + "Did not expect another glue node here."); + ConvControlToken = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); + Ops.push_back(ConvControlToken); + } + + return Ops; +} + +SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I, + bool HasChain) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); - SDVTList VTs = DAG.getVTList(ValueVTs); + return DAG.getVTList(ValueVTs); +} + +/// Get an INTRINSIC node for a target intrinsic which does not touch memory. 
+SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode( + const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops, + const SDVTList &VTs) { + if (!HasChain) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); + if (!IntrinsicVT.isVoidTy()) + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); + return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); +} + +/// Set root, convert return type if necessary and check alignment. +SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I, + bool HasChain, + bool OnlyLoad, + SDValue Result) { + if (HasChain) { + SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1); + if (OnlyLoad) + PendingLoads.push_back(Chain); + else + DAG.setRoot(Chain); + } + + if (I.getType()->isVoidTy()) + return Result; + + if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) { + // Insert `assertalign` node if there's an alignment. + Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); + } else if (!isa<VectorType>(I.getType())) { + Result = lowerRangeToAssertZExt(DAG, I, Result); + } + + return Result; +} + +/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC +/// node. +void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, + unsigned Intrinsic) { + auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I); + + // Info is set by getTgtMemIntrinsic + TargetLowering::IntrinsicInfo Info; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool IsTgtMemIntrinsic = + TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic); + + SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands( + I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr); + SDVTList VTs = getTargetIntrinsicVTList(I, HasChain); // Propagate fast-math-flags from IR to node(s). SDNodeFlags Flags; @@ -5386,19 +5464,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. SDValue Result; - if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - SDValue ConvControlToken = getValue(Token); - assert(Ops.back().getValueType() != MVT::Glue && - "Did not expected another glue node here."); - ConvControlToken = - DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); - Ops.push_back(ConvControlToken); - } - // In some cases, custom collection of operands from CallInst I may be needed. 
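The `TLI.CollectTargetIntrinsicOperands(I, Ops, DAG)` call just below is unchanged; what surrounded it has been factored into the helpers above. The property the refactor must preserve is how the chain is threaded: an intrinsic that touches memory needs a chain at all, and one that only loads may have its chain parked on `PendingLoads` instead of becoming the DAG root, so it does not serialize against later stores. A minimal model of that classification, with the attribute queries reduced to booleans:

```cpp
// Sketch of getTargetIntrinsicCallProperties (simplified; the real code
// queries the called Function's attributes, deliberately ignoring the
// call site's, since lowering expects the chain per the definition).
struct CallProps {
  bool HasChain; // must be ordered on the chain at all
  bool OnlyLoad; // chain goes to PendingLoads, not the DAG root
};

static CallProps classifyIntrinsic(bool DoesNotAccessMemory,
                                   bool OnlyReadsMemory, bool WillReturn,
                                   bool DoesNotThrow) {
  CallProps P;
  P.HasChain = !DoesNotAccessMemory;
  P.OnlyLoad =
      P.HasChain && OnlyReadsMemory && WillReturn && DoesNotThrow;
  return P;
}
```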
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); - if (IsTgtIntrinsic) { + if (IsTgtMemIntrinsic) { // This is target intrinsic that touches memory // // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic @@ -5418,34 +5486,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Info.ssid, Info.order, Info.failureOrder); Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO); - } else if (!HasChain) { - Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); - } else if (!I.getType()->isVoidTy()) { - Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { - Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); - } - - if (HasChain) { - SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1); - if (OnlyLoad) - PendingLoads.push_back(Chain); - else - DAG.setRoot(Chain); + Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs); } - if (!I.getType()->isVoidTy()) { - if (!isa<VectorType>(I.getType())) - Result = lowerRangeToAssertZExt(DAG, I, Result); - - MaybeAlign Alignment = I.getRetAlign(); - - // Insert `assertalign` node if there's an alignment. - if (InsertAssertAlign && Alignment) { - Result = - DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne()); - } - } + Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result); setValue(&I, Result); } @@ -7772,6 +7817,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } + case Intrinsic::reloc_none: { + Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata(); + StringRef SymbolName = cast<MDString>(MD)->getString(); + SDValue Ops[2] = { + getRoot(), + DAG.getTargetExternalSymbol( + SymbolName.data(), TLI.getProgramPointerTy(DAG.getDataLayout()))}; + DAG.setRoot(DAG.getNode(ISD::RELOC_NONE, sdl, MVT::Other, Ops)); + return; + } + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { // Get the exception pointer vreg, copy from it, and resize it to fit. @@ -8137,6 +8193,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); @@ -8958,9 +9022,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. 
Lowering @@ -9075,6 +9138,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, if (Result.first.getNode()) { Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first); + Result.first = lowerNoFPClassToAssertNoFPClass(DAG, CB, Result.first); setValue(&CB, Result.first); } @@ -9392,7 +9456,9 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9412,7 +9478,9 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9445,11 +9513,10 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for - // some reason or the call site requires strict floating point semantics. + // some reason. LibFunc Func; - if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && - F->hasName() && LibInfo->getLibFunc(*F, Func) && - LibInfo->hasOptimizedCodeGen(Func)) { + if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc_bcmp: @@ -10661,6 +10728,30 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, return DAG.getMergeValues(Ops, SL); } +SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass( + SelectionDAG &DAG, const Instruction &I, SDValue Op) { + FPClassTest Classes = getNoFPClass(I); + if (Classes == fcNone) + return Op; + + SDLoc SL = getCurSDLoc(); + SDValue TestConst = DAG.getTargetConstant(Classes, SDLoc(), MVT::i32); + + if (Op.getOpcode() != ISD::MERGE_VALUES) { + return DAG.getNode(ISD::AssertNoFPClass, SL, Op.getValueType(), Op, + TestConst); + } + + SmallVector<SDValue, 8> Ops(Op.getNumOperands()); + for (unsigned I = 0, E = Ops.size(); I != E; ++I) { + SDValue MergeOp = Op.getOperand(I); + Ops[I] = DAG.getNode(ISD::AssertNoFPClass, SL, MergeOp.getValueType(), + MergeOp, TestConst); + } + + return DAG.getMergeValues(Ops, SL); +} + /// Populate a CallLowerinInfo (into \p CLI) based on the properties of /// the call being lowered. 
/// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 47e19f77a15e7..13e2daa783147 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -429,6 +429,10 @@ class SelectionDAGBuilder { SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I, SDValue Op); + // Lower nofpclass attributes to AssertNoFPClass + SDValue lowerNoFPClassToAssertNoFPClass(SelectionDAG &DAG, + const Instruction &I, SDValue Op); + void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI, const CallBase *Call, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, @@ -727,6 +731,17 @@ class SelectionDAGBuilder { MCSymbol *&BeginLabel); SDValue lowerEndEH(SDValue Chain, const InvokeInst *II, const BasicBlock *EHPadBB, MCSymbol *BeginLabel); + + std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I); + SmallVector<SDValue, 8> getTargetIntrinsicOperands( + const CallBase &I, bool HasChain, bool OnlyLoad, + TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr); + SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain); + SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain, + ArrayRef<SDValue> Ops, + const SDVTList &VTs); + SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain, + bool OnlyLoad, SDValue Result); }; /// This struct represents the registers (physical or virtual) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 77377d348b836..ec5edd5f13978 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -472,6 +472,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::FAKE_USE: return "fake_use"; + case ISD::RELOC_NONE: + return "reloc_none"; case ISD::PSEUDO_PROBE: return "pseudoprobe"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; @@ -588,6 +590,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 6c11c5b815b6b..e78dfb12505c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2448,7 +2448,7 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, // a cycle in the scheduling graph. // If the node has glue, walk down the graph to the "lowest" node in the - // glueged set. + // glued set. EVT VT = Root->getValueType(Root->getNumValues()-1); while (VT == MVT::Glue) { SDNode *GU = Root->getGluedUser(); @@ -2550,6 +2550,11 @@ void SelectionDAGISel::Select_FAKE_USE(SDNode *N) { N->getOperand(1), N->getOperand(0)); } +void SelectionDAGISel::Select_RELOC_NONE(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::RELOC_NONE, N->getValueType(0), + N->getOperand(1), N->getOperand(0)); +} + void SelectionDAGISel::Select_FREEZE(SDNode *N) { // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. 
// If FREEZE instruction is added later, the code below must be changed as @@ -3325,6 +3330,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::FAKE_USE: Select_FAKE_USE(NodeToMatch); return; + case ISD::RELOC_NONE: + Select_RELOC_NONE(NodeToMatch); + return; case ISD::FREEZE: Select_FREEZE(NodeToMatch); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e40953b39a..bb64f4ee70280 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, DAG.getConstant(MaxIndex, dl, IdxVT)); } -SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { return getVectorSubVecPointer( DAG, VecPtr, VecVT, EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1), - Index); + Index, PtrArithFlags); } -SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - EVT SubVecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { SDLoc dl(Index); // Make sure the index type is big enough to compute in. Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType()); @@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getMemBasePlusOffset(VecPtr, Index, dl); + return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags); } //===----------------------------------------------------------------------===// @@ -12073,22 +12074,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + if (!llvm::isOneOrOneSplatFP(MulRHS)) + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; @@ -12098,10 +12109,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } @@ -12112,6 +12126,167 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, return Subvectors[0]; } +/// Given a store node \p StoreNode, return true if it is safe to fold that node +/// into \p FPNode, which expands to a library call with output pointers. +static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, + SDNode *FPNode) { + SmallVector<const SDNode *, 8> Worklist; + SmallVector<const SDNode *, 8> DeferredNodes; + SmallPtrSet<const SDNode *, 16> Visited; + + // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode). + for (SDValue Op : StoreNode->ops()) + if (Op.getNode() != FPNode) + Worklist.push_back(Op.getNode()); + + unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); + while (!Worklist.empty()) { + const SDNode *Node = Worklist.pop_back_val(); + auto [_, Inserted] = Visited.insert(Node); + if (!Inserted) + continue; + + if (MaxSteps > 0 && Visited.size() >= MaxSteps) + return false; + + // Reached the FPNode (would result in a cycle). + // OR Reached CALLSEQ_START (would result in nested call sequences). + if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START) + return false; + + if (Node->getOpcode() == ISD::CALLSEQ_END) { + // Defer looking into call sequences (so we can check we're outside one). + // We still need to look through these for the predecessor check. + DeferredNodes.push_back(Node); + continue; + } + + for (SDValue Op : Node->ops()) + Worklist.push_back(Op.getNode()); + } + + // True if we're outside a call sequence and don't have the FPNode as a + // predecessor. No cycles or nested call sequences possible. 
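The `return` just below completes the relocated store-folding guard via `hasPredecessorHelper`, exactly as in the copy removed from SelectionDAG.cpp. Earlier in this hunk, `expandPartialReduceMLA` gains the PARTIAL_REDUCE_FMLA path: FP-extend both inputs, form the product unless the RHS is a splat of 1.0, then slice the wide product into accumulator-shaped subvectors and fold them together with FADD instead of ADD. A standalone scalar model of the resulting semantics (toy code, not the LLVM API; the tree reduction above may associate the FP additions in a different order than this left-to-right loop):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of PARTIAL_REDUCE_FMLA: Acc has Stride lanes, the inputs
// have Stride * K lanes; input lane I accumulates into Acc[I % Stride],
// mirroring the subvector-extract-and-FADD flattening loop. The real
// expansion skips the FMUL when MulRHS is a splat of 1.0.
static std::vector<float> partialReduceFMLA(std::vector<float> Acc,
                                            const std::vector<float> &MulLHS,
                                            const std::vector<float> &MulRHS) {
  assert(MulLHS.size() == MulRHS.size());
  assert(!Acc.empty() && MulLHS.size() % Acc.size() == 0);
  const std::size_t Stride = Acc.size();
  for (std::size_t I = 0; I < MulLHS.size(); ++I)
    Acc[I % Stride] += MulLHS[I] * MulRHS[I];
  return Acc;
}
```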
+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes, + MaxSteps); +} + +bool TargetLowering::expandMultipleResultFPLibCall( + SelectionDAG &DAG, RTLIB::Libcall LC, SDNode *Node, + SmallVectorImpl<SDValue> &Results, + std::optional<unsigned> CallRetResNo) const { + if (LC == RTLIB::UNKNOWN_LIBCALL) + return false; + + RTLIB::LibcallImpl LibcallImpl = getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) + return false; + + LLVMContext &Ctx = *DAG.getContext(); + EVT VT = Node->getValueType(0); + unsigned NumResults = Node->getNumValues(); + + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. + SDValue StoresInChain; + SmallVector<StoreSDNode *, 2> ResultStores(NumResults); + for (SDNode *User : Node->users()) { + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast<StoreSDNode>(User); + SDValue StoreValue = ST->getValue(); + unsigned ResNo = StoreValue.getResNo(); + // Ensure the store corresponds to an output pointer. + if (CallRetResNo == ResNo) + continue; + // Ensure the store is to the default address space and is not atomic or volatile. + if (!ST->isSimple() || ST->getAddressSpace() != 0) + continue; + // Ensure all store chains are the same (so they don't alias). + if (StoresInChain && ST->getChain() != StoresInChain) + continue; + // Ensure the store is properly aligned. + Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx); + if (ST->getAlign() < + DAG.getDataLayout().getABITypeAlign(StoreType->getScalarType())) + continue; + // Avoid: + // 1. Creating cyclic dependencies. + // 2. Expanding the node to a call within a call sequence. + if (!canFoldStoreIntoLibCallOutputPointers(ST, Node)) + continue; + ResultStores[ResNo] = ST; + StoresInChain = ST->getChain(); + } + + ArgListTy Args; + + // Pass the arguments. + for (const SDValue &Op : Node->op_values()) { + EVT ArgVT = Op.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(Ctx); + Args.emplace_back(Op, ArgTy); + } + + // Pass the output pointers. + SmallVector<SDValue, 2> ResultPtrs(NumResults); + Type *PointerTy = PointerType::getUnqual(Ctx); + for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) { + if (ResNo == CallRetResNo) + continue; + EVT ResVT = Node->getValueType(ResNo); + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(ResVT); + ResultPtrs[ResNo] = ResultPtr; + Args.emplace_back(ResultPtr, PointerTy); + } + + SDLoc DL(Node); + + if (RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(LibcallImpl)) { + // Pass the vector mask (if required). + EVT MaskVT = getSetCCResultType(DAG.getDataLayout(), Ctx, VT); + SDValue Mask = DAG.getBoolConstant(true, DL, MaskVT, VT); + Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx)); + } + + Type *RetType = CallRetResNo.has_value() + ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx) + : Type::getVoidTy(Ctx); + SDValue InChain = StoresInChain ?
StoresInChain : DAG.getEntryNode(); + SDValue Callee = DAG.getExternalSymbol(getLibcallImplName(LibcallImpl).data(), + getPointerTy(DAG.getDataLayout())); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( + getLibcallImplCallingConv(LibcallImpl), RetType, Callee, std::move(Args)); + + auto [Call, CallChain] = LowerCallTo(CLI); + + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + if (ResNo == CallRetResNo) { + Results.push_back(Call); + continue; + } + MachinePointerInfo PtrInfo; + SDValue LoadResult = DAG.getLoad(Node->getValueType(ResNo), DL, CallChain, + ResultPtr, PtrInfo); + SDValue OutChain = LoadResult.getValue(1); + + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. + DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast<FrameIndexSDNode>(ResultPtr)->getIndex()); + } + + Results.push_back(LoadResult); + } + + return true; +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, @@ -12382,8 +12557,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, !IsFast) return SDValue(); - SDValue NewPtr = - getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + SDValue NewPtr = getInboundsVectorElementPointer( + DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); // We are replacing a vector load with a scalar load. The new load must have // identical memory op ordering to the original. diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index f9ecb2c97b2e0..8ec4bfbb5a330 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -1509,10 +1509,9 @@ void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) { } // Trace value through phis. - SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist. - SmallVector<const VNInfo *, 4> WorkList; - Visited.insert(&ParentVNI); - WorkList.push_back(&ParentVNI); + ///< whether VNI was/is in worklist. 
+ SmallPtrSet<const VNInfo *, 8> Visited = {&ParentVNI}; + SmallVector<const VNInfo *, 4> WorkList = {&ParentVNI}; const LiveInterval &ParentLI = Edit->getParent(); const SlotIndexes &Indexes = *LIS.getSlotIndexes(); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 2a8234a37a167..5fd5d6cce23df 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -49,7 +49,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <optional> -#include <utility> using namespace llvm; diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index 70c3b2cbae9a6..ebf6d1a52448e 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -198,7 +198,7 @@ void TargetFrameLowering::spillCalleeSavedRegister( } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII->storeRegToStackSlot(SaveBlock, MI, Reg, true, CS.getFrameIdx(), RC, - TRI, Register()); + Register()); } } @@ -212,8 +212,7 @@ void TargetFrameLowering::restoreCalleeSavedRegister( .addReg(CS.getDstReg(), getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3c41bbeb4b327..d503d7a2345fd 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -58,9 +58,8 @@ static cl::opt<unsigned int> MaxAccumulatorWidth( TargetInstrInfo::~TargetInstrInfo() = default; -const TargetRegisterClass * -TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { +const TargetRegisterClass *TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { if (OpNum >= MCID.getNumOperands()) return nullptr; @@ -69,14 +68,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode if (OpInfo.isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI.getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) return nullptr; // Otherwise just look it up normally. 
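The `return` below swaps `TRI->getRegClass(RegClass)` for `TRI.getRegClass(RegClass)`, the recurring pattern throughout these TargetInstrInfo.cpp hunks: TargetRegisterInfo is now reachable as a class member (`TRI`), so `getRegClass`, `reMaterialize`, `getMemOperandWithOffset`, and friends stop taking a `const TargetRegisterInfo *` parameter. A sketch of the before/after shape (types abridged and hypothetical, not the real LLVM declarations):

```cpp
// Stand-in for TargetRegisterInfo; the tables it owns are elided.
struct TargetRegisterInfoStub { /* register class tables */ };

// Before: every helper threads TRI through explicitly, and callers can
// accidentally pass a TRI that does not match this TII's subtarget.
struct OldTII {
  const void *getRegClass(int RC, const TargetRegisterInfoStub *TRI) const;
};

// After: TRI is bound once at construction, so call sites shrink and the
// pairing between TII and TRI is fixed by design.
struct NewTII {
  const TargetRegisterInfoStub &TRI; // initialized at construction
  const void *getRegClass(int RC) const; // uses the member internally
};
```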
- return TRI->getRegClass(RegClass); + return TRI.getRegClass(RegClass); } /// insertNoop - Insert a noop into the instruction stream at the specified @@ -223,13 +222,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 SmallVector<unsigned> UpdateImplicitDefIdx; if (HasDef && MI.hasImplicitDef()) { - const TargetRegisterInfo *TRI = - MI.getMF()->getSubtarget().getRegisterInfo(); for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { Register ImplReg = MO.getReg(); if ((ImplReg.isVirtual() && ImplReg == Reg0) || (ImplReg.isPhysical() && Reg0.isPhysical() && - TRI->isSubRegisterEq(ImplReg, Reg0))) + TRI.isSubRegisterEq(ImplReg, Reg0))) UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); } } @@ -425,28 +422,27 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = TRI->getSpillSize(*RC); + Size = TRI.getSpillSize(*RC); Offset = 0; return true; } - unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); + unsigned BitSize = TRI.getSubRegIdxSize(SubIdx); // Convert bit size to byte size. if (BitSize % 8) return false; - int BitOffset = TRI->getSubRegIdxOffset(SubIdx); + int BitOffset = TRI.getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; - assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); + assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = TRI->getSpillSize(*RC) - (Offset + Size); + Offset = TRI.getSpillSize(*RC) - (Offset + Size); } return true; } @@ -454,8 +450,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); @@ -726,7 +721,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // actual load size is. int64_t MemSize = 0; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags & MachineMemOperand::MOStore) { MemSize = MFI.getObjectSize(FI); @@ -735,7 +729,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, int64_t OpSize = MFI.getObjectSize(FI); if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) { - unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg); + unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg); if (SubRegSize > 0 && !(SubRegSize % 8)) OpSize = SubRegSize / 8; } @@ -800,11 +794,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // code. 
BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); } else { - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, Register()); } } else - loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, Register()); return &*--Pos; } @@ -880,8 +874,8 @@ static void transferImplicitOperands(MachineInstr *MI, } } -void TargetInstrInfo::lowerCopy(MachineInstr *MI, - const TargetRegisterInfo *TRI) const { +void TargetInstrInfo::lowerCopy( + MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const { if (MI->allDefsAreDead()) { MI->setDesc(get(TargetOpcode::KILL)); return; @@ -911,7 +905,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) - transferImplicitOperands(MI, TRI); + transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } @@ -1327,8 +1321,7 @@ void TargetInstrInfo::reassociateOps( MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI); MachineOperand &OpA = Prev.getOperand(OperandIndices[1]); MachineOperand &OpB = Root.getOperand(OperandIndices[2]); @@ -1337,9 +1330,12 @@ void TargetInstrInfo::reassociateOps( MachineOperand &OpC = Root.getOperand(0); Register RegA = OpA.getReg(); + unsigned SubRegA = OpA.getSubReg(); Register RegB = OpB.getReg(); Register RegX = OpX.getReg(); + unsigned SubRegX = OpX.getSubReg(); Register RegY = OpY.getReg(); + unsigned SubRegY = OpY.getSubReg(); Register RegC = OpC.getReg(); if (RegA.isVirtual()) @@ -1357,6 +1353,7 @@ void TargetInstrInfo::reassociateOps( // recycling RegB because the MachineCombiner's computation of the critical // path requires a new register definition rather than an existing one. 
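The fresh `NewVR` created just below is paired with a `SubRegNewVR` of zero; more broadly, this hunk threads subregister indices (`SubRegX`, `SubRegY`, `SubRegA`) together with their registers, so operand swaps keep each (register, subregister) pair intact and the rebuilt instructions pass the index through `addReg`. A tiny model of why the pair must move together:

```cpp
#include <utility>

// Model of a machine operand reference: a register plus an optional
// subregister index (0 = whole register). Illustrative, not MachineOperand.
struct OperandRef {
  unsigned Reg;
  unsigned SubReg; // must accompany Reg through any operand shuffle
};

// Swapping only Reg while leaving SubReg behind would make the rebuilt
// instruction read the wrong lanes of the swapped register -- hence the
// paired std::swap(RegX, RegY) / std::swap(SubRegX, SubRegY) in the hunk.
static void swapOperands(OperandRef &X, OperandRef &Y) {
  std::swap(X.Reg, Y.Reg);
  std::swap(X.SubReg, Y.SubReg);
}
```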
Register NewVR = MRI.createVirtualRegister(RC); + unsigned SubRegNewVR = 0; InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev); @@ -1369,6 +1366,7 @@ void TargetInstrInfo::reassociateOps( if (SwapPrevOperands) { std::swap(RegX, RegY); + std::swap(SubRegX, SubRegY); std::swap(KillX, KillY); } @@ -1421,9 +1419,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == PrevFirstOpIdx) - MIB1.addReg(RegX, getKillRegState(KillX)); + MIB1.addReg(RegX, getKillRegState(KillX), SubRegX); else if (Idx == PrevSecondOpIdx) - MIB1.addReg(RegY, getKillRegState(KillY)); + MIB1.addReg(RegY, getKillRegState(KillY), SubRegY); else MIB1.add(MO); } @@ -1431,6 +1429,7 @@ void TargetInstrInfo::reassociateOps( if (SwapRootOperands) { std::swap(RegA, NewVR); + std::swap(SubRegA, SubRegNewVR); std::swap(KillA, KillNewVR); } @@ -1442,9 +1441,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == RootFirstOpIdx) - MIB2 = MIB2.addReg(RegA, getKillRegState(KillA)); + MIB2 = MIB2.addReg(RegA, getKillRegState(KillA), SubRegA); else if (Idx == RootSecondOpIdx) - MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR)); + MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR), SubRegNewVR); else MIB2 = MIB2.add(MO); } @@ -1532,6 +1531,7 @@ void TargetInstrInfo::genAlternativeCodeSequence( if (IndexedReg.index() == 0) continue; + // FIXME: Losing subregisters MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value()); MachineInstrBuilder MIB; Register AccReg; @@ -1704,8 +1704,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // stack slot reference to depend on the instruction that does the // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); + return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets @@ -1738,11 +1737,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, // Default implementation of getMemOperandWithOffset. bool TargetInstrInfo::getMemOperandWithOffset( const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, - bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const { SmallVector<const MachineOperand *, 4> BaseOps; LocationSize Width = LocationSize::precise(0); if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable, - Width, TRI) || + Width, &TRI) || BaseOps.size() != 1) return false; BaseOp = BaseOps.front(); @@ -1863,7 +1862,6 @@ std::optional<ParamLoadedValue> TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineFunction *MF = MI.getMF(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {}); int64_t Offset; bool OffsetIsScalable; @@ -1894,7 +1892,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, // Only describe memory which provably does not escape the function. As // described in llvm.org/PR43343, escaped memory may be clobbered by the // callee (or by another thread). 
- const auto &TII = MF->getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const MachineMemOperand *MMO = MI.memoperands()[0]; const PseudoSourceValue *PSV = MMO->getPseudoValue(); @@ -1905,8 +1902,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, return std::nullopt; const MachineOperand *BaseOp; - if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, - TRI)) + if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI)) return std::nullopt; // FIXME: Scalable offsets are not yet handled in the offset code below. @@ -2045,7 +2041,7 @@ bool TargetInstrInfo::getInsertSubregInputs( // Returns a MIRPrinter comment for this machine operand. std::string TargetInstrInfo::createMIROperandComment( const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (!MI.isInlineAsm()) return ""; @@ -2078,12 +2074,8 @@ std::string TargetInstrInfo::createMIROperandComment( OS << F.getKindName(); unsigned RCID; - if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) { - if (TRI) { - OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID)); - } else - OS << ":RC" << RCID; - } + if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) + OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID)); if (F.isMemKind()) { InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID(); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index b3535eaca5e9d..77d9b156e2672 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -59,7 +59,6 @@ #include <cassert> #include <cstdint> #include <cstring> -#include <iterator> #include <string> #include <tuple> #include <utility> @@ -425,11 +424,47 @@ RTLIB::Libcall RTLIB::getCOS(EVT RetVT) { } RTLIB::Libcall RTLIB::getSINCOS(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOS_V4F32; + case MVT::v2f64: + return RTLIB::SINCOS_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOS_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOS_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, SINCOS_PPCF128); } RTLIB::Libcall RTLIB::getSINCOSPI(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOSPI_V4F32; + case MVT::v2f64: + return RTLIB::SINCOSPI_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOSPI_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOSPI_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOSPI_F32, SINCOSPI_F64, SINCOSPI_F80, SINCOSPI_F128, SINCOSPI_PPCF128); } @@ -440,6 +475,24 @@ RTLIB::Libcall RTLIB::getSINCOS_STRET(EVT RetVT) { } RTLIB::Libcall RTLIB::getMODF(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::MODF_V4F32; + case MVT::v2f64: + return RTLIB::MODF_V2F64; + case MVT::nxv4f32: + return RTLIB::MODF_NXV4F32; + case MVT::nxv2f64: + 
return RTLIB::MODF_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, MODF_F32, MODF_F64, MODF_F80, MODF_F128, MODF_PPCF128); } @@ -697,9 +750,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate( /// NOTE: The TargetMachine owns TLOF. TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) - : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel, - TM.Options.FloatABIType, TM.Options.EABIVersion, - TM.Options.MCOptions.getABIName()) { + : TM(tm), + RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel, + TM.Options.FloatABIType, TM.Options.EABIVersion, + TM.Options.MCOptions.getABIName()), + Libcalls(RuntimeLibcallInfo) { initActions(); // Perform these initializations only once. diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index c33bf8b014b55..16d86b42db4a3 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -30,7 +30,7 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { StringRef FP = FPAttr.getValueAsString(); if (FP == "all") return true; - if (FP == "non-leaf") + if (FP == "non-leaf" || FP == "non-leaf-no-reserve") return MF.getFrameInfo().hasCalls(); if (FP == "none" || FP == "reserved") return false; @@ -45,6 +45,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { return StringSwitch<bool>(FPAttr.getValueAsString()) .Cases({"all", "non-leaf", "reserved"}, true) + .Case(("non-leaf-no-reserve"), MF.getFrameInfo().hasCalls()) .Case("none", false); } diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 971f822fa6c53..a5c81afc57a80 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -133,7 +133,7 @@ Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI, }); } -Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { +Printable llvm::printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { // Generic printout when TRI is missing. if (!TRI) { diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index 7ae9e0e37bbab..cd951a1a4f53e 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> -#include <numeric> using namespace llvm; diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 414e414738b71..c306fe6012c11 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -794,29 +794,36 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr( if (!NewMI) return false; - LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi); - LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); - - // If the old instruction is debug value tracked, an update is required. - if (auto OldInstrNum = mi->peekDebugInstrNum()) { - assert(mi->getNumExplicitDefs() == 1); - assert(NewMI->getNumExplicitDefs() == 1); - - // Find the old and new def location. - unsigned OldIdx = mi->defs().begin()->getOperandNo(); - unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); - - // Record that one def has been replaced by the other. 
- unsigned NewInstrNum = NewMI->getDebugInstrNum(); - MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), - std::make_pair(NewInstrNum, NewIdx)); - } - - MBB->erase(mi); // Nuke the old inst. - for (MachineInstr &MI : MIS) DistanceMap.insert(std::make_pair(&MI, Dist++)); - Dist--; + + if (&*mi == NewMI) { + LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi); + } else { + LLVM_DEBUG({ + dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi; + dbgs() << "2addr: TO 3-ADDR: " << *NewMI; + }); + + // If the old instruction is debug value tracked, an update is required. + if (auto OldInstrNum = mi->peekDebugInstrNum()) { + assert(mi->getNumExplicitDefs() == 1); + assert(NewMI->getNumExplicitDefs() == 1); + + // Find the old and new def location. + unsigned OldIdx = mi->defs().begin()->getOperandNo(); + unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); + + // Record that one def has been replaced by the other. + unsigned NewInstrNum = NewMI->getDebugInstrNum(); + MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx), + std::make_pair(NewInstrNum, NewIdx)); + } + + MBB->erase(mi); // Nuke the old inst. + Dist--; + } + mi = NewMI; nmi = std::next(mi); @@ -1329,6 +1336,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); + // Give targets a chance to convert bundled instructions. + bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle); + // If the instruction is convertible to 3 Addr, instead // of returning try 3 Addr transformation aggressively and // use this variable to check later. Because it might be better. @@ -1337,7 +1347,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // addl %esi, %edi // movl %edi, %eax // ret - if (Commuted && !MI.isConvertibleTo3Addr()) + if (Commuted && !ConvertibleTo3Addr) return false; if (shouldOnlyCommute) @@ -1357,7 +1367,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( regBKilled = isKilled(MI, regB, true); } - if (MI.isConvertibleTo3Addr()) { + if (ConvertibleTo3Addr) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. if (!regBKilled || isProfitableToConv3Addr(regA, regB)) { @@ -1402,7 +1412,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // Unfold the load. LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI)); + TII->getRegClass(UnfoldMCID, LoadRegIndex)); Register Reg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 2> NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, @@ -1665,6 +1675,17 @@ void TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI, // by SubRegB is compatible with RegA with no subregister. So regardless of // whether the dest oper writes a subreg, the source oper should not. MO.setSubReg(0); + + // Update uses of RegB to uses of RegA inside the bundle. 
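+    // After the tied operand pair itself has been rewritten to RegA above,
+    // any remaining reads of RegB among the bundled instructions would refer
+    // to a stale register, so rewrite those to RegA as well.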
+ if (MI->isBundle()) { + for (MachineOperand &MO : mi_bundle_ops(*MI)) { + if (MO.isReg() && MO.getReg() == RegB) { + assert(MO.getSubReg() == 0 && SubRegB == 0 && + "tied subregister uses in bundled instructions not supported"); + MO.setReg(RegA); + } + } + } } if (AllUsesCopied) { diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp index 2fd1dd5f84a91..53d166d277cb8 100644 --- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -34,7 +34,6 @@ #include <cassert> #include <iomanip> #include <limits> -#include <memory> #include <sstream> using namespace llvm; diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 1ea3e6bcb15ce..2f54578da5113 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -85,6 +85,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -273,8 +274,13 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { // instruction selection. CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); + // FIXME: Verify this is really supported for current module. + StringRef UnwindCallPersonalityName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl__Unwind_CallPersonality); + // _Unwind_CallPersonality() wrapper function, which calls the personality - CallPersonalityF = M.getOrInsertFunction("_Unwind_CallPersonality", + CallPersonalityF = M.getOrInsertFunction(UnwindCallPersonalityName, IRB.getInt32Ty(), IRB.getPtrTy()); if (Function *F = dyn_cast<Function>(CallPersonalityF.getCallee())) F->setDoesNotThrow(); diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index fd54190b04468..dab1416d254a2 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -461,8 +461,6 @@ static bool searchConstantExprForGlobalVariables( Value *V, SmallDenseMap<GlobalVariable *, Value *> &GVLoadMap, SmallVector<GlobalVariableUse> &GVUses) { - SmallVector<Value *, 8> ReplacedOperands; - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { if (globalVariableNeedsRedirect(GV)) { GVLoadMap[GV] = nullptr; diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp index 4785f2652b00e..92b7fad3a0e24 100644 --- a/llvm/lib/CodeGenTypes/LowLevelType.cpp +++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp @@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const { dbgs() << '\n'; } #endif - -const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo; -const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo; -const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo; diff --git a/llvm/lib/DWARFCFIChecker/Registers.h b/llvm/lib/DWARFCFIChecker/Registers.h index a372c4c4345bd..915250de5aeae 100644 --- a/llvm/lib/DWARFCFIChecker/Registers.h +++ b/llvm/lib/DWARFCFIChecker/Registers.h @@ -17,7 +17,6 @@ #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" -#include <iterator> namespace llvm { diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h index 84757aea7045d..970abdc38f417 100644 --- 
a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerUnit.h @@ -28,7 +28,7 @@ using MacroOffset2UnitMapTy = DenseMap<uint64_t, DwarfUnit *>; /// Base class for all Dwarf units(Compile unit/Type table unit). class DwarfUnit : public OutputSections { public: - virtual ~DwarfUnit() {} + virtual ~DwarfUnit() = default; DwarfUnit(LinkingGlobalData &GlobalData, unsigned ID, StringRef ClangModuleName) : OutputSections(GlobalData), ID(ID), ClangModuleName(ClangModuleName), diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h index f67536ef7a1a8..8ccb4a502aaba 100644 --- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h +++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h @@ -22,7 +22,7 @@ class StringEntryToDwarfStringPoolEntryMap { public: StringEntryToDwarfStringPoolEntryMap(LinkingGlobalData &GlobalData) : GlobalData(GlobalData) {} - ~StringEntryToDwarfStringPoolEntryMap() {} + ~StringEntryToDwarfStringPoolEntryMap() = default; /// Create DwarfStringPoolEntry for specified StringEntry if necessary. /// Initialize DwarfStringPoolEntry with initial values. diff --git a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp index 34174f98b7e37..ca918f6e17b38 100644 --- a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp +++ b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp @@ -377,8 +377,10 @@ Error SyntheticTypeNameBuilder::addTypeName(UnitEntryPairTy InputUnitEntryPair, } break; } - // If name for the DIE is not determined yet add referenced types to the name. - if (!HasLinkageName && !HasShortName && !HasDeclFileName) { + // If name for the DIE is not determined yet or if the DIE is a typedef, add + // referenced types to the name. 
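+  // (For instance, two units may define a typedef with the same name but
+  // different underlying types; folding the referenced type into the
+  // synthetic name keeps those typedefs distinct.)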
+ if ((!HasLinkageName && !HasShortName && !HasDeclFileName) || + InputUnitEntryPair.DieEntry->getTag() == dwarf::DW_TAG_typedef) { if (InputUnitEntryPair.CU->find(InputUnitEntryPair.DieEntry, getODRAttributes())) if (Error Err = addReferencedODRDies(InputUnitEntryPair, AddParentNames, diff --git a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp index 1898fba004e88..c437c53b0481a 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include <cstdint> -#include <utility> #include <vector> using namespace llvm; diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 6c23ba8f3c466..23ab5344df1ed 100644 --- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) { return std::nullopt; } - assert(contains(Index)); + if (!contains(Index)) + return std::nullopt; return Records[Index.toArrayIndex()].Type; } diff --git a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp index 0bc65f8d0359a..49b7df98957af 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp @@ -28,7 +28,6 @@ #include <cstddef> #include <cstdint> #include <string> -#include <vector> using namespace llvm; using namespace llvm::codeview; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index db5cc37c93f90..deafee80f559f 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -31,7 +31,6 @@ #include <cinttypes> #include <cstdint> #include <string> -#include <utility> using namespace llvm; using namespace dwarf; @@ -129,6 +128,25 @@ prettyLanguageVersionString(const DWARFAttribute &AttrValue, static_cast<SourceLanguageName>(*LName), *LVersion); } +static llvm::Expected<llvm::StringRef> +getApplePropertyName(const DWARFDie &PropDIE) { + if (!PropDIE) + return llvm::createStringError("invalid DIE"); + + if (PropDIE.getTag() != DW_TAG_APPLE_property) + return llvm::createStringError("not referencing a DW_TAG_APPLE_property"); + + auto PropNameForm = PropDIE.find(DW_AT_APPLE_property_name); + if (!PropNameForm) + return ""; + + auto NameOrErr = PropNameForm->getAsCString(); + if (!NameOrErr) + return NameOrErr.takeError(); + + return *NameOrErr; +} + static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, const DWARFAttribute &AttrValue, unsigned Indent, DIDumpOptions DumpOpts) { @@ -233,6 +251,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, Die.getAttributeValueAsReferencedDie(FormValue).getName( DINameKind::LinkageName)) OS << Space << "\"" << Name << '\"'; + } else if (Attr == DW_AT_APPLE_property) { + auto PropDIE = Die.getAttributeValueAsReferencedDie(FormValue); + if (auto PropNameOrErr = getApplePropertyName(PropDIE)) + OS << Space << "\"" << *PropNameOrErr << '\"'; + else + DumpOpts.RecoverableErrorHandler(createStringError( + errc::invalid_argument, + llvm::formatv("decoding DW_AT_APPLE_property_name: {}", + toString(PropNameOrErr.takeError())))); } else if (Attr == DW_AT_type || Attr == DW_AT_containing_type) { DWARFDie D = 
resolveReferencedType(Die, FormValue); if (D && !D.isNULL()) { @@ -676,7 +703,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index a201fae84838c..db6170c784f80 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -17,7 +17,6 @@ #include <cinttypes> #include <cstdint> #include <set> -#include <utility> using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp index a88f4a554bcf0..a4bdd1f0a867c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp @@ -15,7 +15,6 @@ #include <cassert> #include <cinttypes> #include <cstdint> -#include <optional> using namespace llvm; using namespace dwarf; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 5ab80e339a1ad..693454e249945 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -917,11 +917,10 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, } // Check if the offset matches any of the sequence offset. - auto It = - std::find_if(LineTable->Sequences.begin(), LineTable->Sequences.end(), - [SectionOffset](const auto &Sequence) { - return Sequence.StmtSeqOffset == *SectionOffset; - }); + auto It = llvm::find_if(LineTable->Sequences, + [SectionOffset](const auto &Sequence) { + return Sequence.StmtSeqOffset == *SectionOffset; + }); if (It == LineTable->Sequences.end()) ReportError( diff --git a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp index bb3411bb9568e..7890bcce6c7ca 100644 --- a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp @@ -21,7 +21,6 @@ #include <cassert> #include <cstdint> #include <cstring> -#include <memory> #include <utility> #include <vector> diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp index 91b3dd5c32b9f..c82edd9c330d2 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp @@ -15,8 +15,6 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/TimeProfiler.h" -#include <map> - using namespace llvm; using namespace llvm::msf; using namespace llvm::support; diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 1009cc91ca12a..8e476cdafdb71 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -25,10 +25,6 @@ using namespace llvm; using namespace llvm::itanium_demangle; -constexpr const char *itanium_demangle::FloatData<float>::spec; -constexpr const char *itanium_demangle::FloatData<double>::spec; -constexpr const char *itanium_demangle::FloatData<long double>::spec; - // <discriminator> := _ <non-negative number> # when number < 10 // := __ <non-negative number> _ # when number >= 10 // extension := decimal-digit+ # at the end of string diff --git 
a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index b22928be3be50..769dbd4f2eb6e 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -21,7 +21,6 @@ #include "llvm/Demangle/StringViewExtras.h" #include "llvm/Demangle/Utility.h" -#include <array> #include <cctype> #include <cstdio> #include <optional> @@ -277,6 +276,18 @@ demanglePointerCVQualifiers(std::string_view &MangledName) { DEMANGLE_UNREACHABLE; } +static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, + size_t Count) { + NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); + N->Count = Count; + N->Nodes = Arena.allocArray<Node *>(Count); + for (size_t I = 0; I < Count; ++I) { + N->Nodes[I] = Head->N; + Head = Head->Next; + } + return N; +} + std::string_view Demangler::copyString(std::string_view Borrowed) { char *Stable = Arena.allocUnalignedBuffer(Borrowed.size()); // This is not a micro-optimization, it avoids UB, should Borrowed be an null @@ -323,8 +334,30 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName, } std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName); - if (!consumeFront(MangledName, '@')) - STSN->TargetName = demangleFullyQualifiedTypeName(MangledName); + + NodeList *TargetCurrent = nullptr; + NodeList *TargetHead = nullptr; + size_t Count = 0; + while (!consumeFront(MangledName, '@')) { + ++Count; + + NodeList *Next = Arena.alloc<NodeList>(); + if (TargetCurrent) + TargetCurrent->Next = Next; + else + TargetHead = Next; + + TargetCurrent = Next; + QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName); + if (Error) + return nullptr; + assert(QN); + TargetCurrent->N = QN; + } + + if (Count > 0) + STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count); + return STSN; } @@ -1605,18 +1638,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) { return demangleSimpleName(MangledName, /*Memorize=*/true); } -static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head, - size_t Count) { - NodeArrayNode *N = Arena.alloc<NodeArrayNode>(); - N->Count = Count; - N->Nodes = Arena.allocArray<Node *>(Count); - for (size_t I = 0; I < Count; ++I) { - N->Nodes[I] = Head->N; - Head = Head->Next; - } - return N; -} - QualifiedNameNode * Demangler::demangleNameScopeChain(std::string_view &MangledName, IdentifierNode *UnqualifiedName) { diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp index 61e4961c714bc..17c6aab500049 100644 --- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp @@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB, void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const { outputQualifiers(OB, Quals, false, true); Name->output(OB, Flags); - if (TargetName) { + if (TargetNames) { OB << "{for `"; - TargetName->output(OB, Flags); + TargetNames->output(OB, Flags, "'s `"); OB << "'}"; } } diff --git a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 87675be1fc8e1..9fe74898170a5 100644 --- a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -41,7 +41,6 @@ #include <map> #include <mutex> #include <string> -#include <utility> #include <vector> #ifdef HAVE_FFI_CALL diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt 
b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 4669124ebe578..0b530fb1bc478 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_component_library(LLVMJITLink ELF_loongarch.cpp ELF_ppc64.cpp ELF_riscv.cpp + ELF_systemz.cpp ELF_x86.cpp ELF_x86_64.cpp @@ -46,6 +47,7 @@ add_llvm_component_library(LLVMJITLink loongarch.cpp ppc64.cpp riscv.cpp + systemz.cpp x86.cpp x86_64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h index 55442e0cee557..50ba2f822d832 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h @@ -23,8 +23,6 @@ #define DEBUG_TYPE "jitlink" -#include <list> - namespace llvm { namespace jitlink { diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp index 87e451715811f..42f42eef00e5b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/JITLink/ELF_loongarch.h" #include "llvm/ExecutionEngine/JITLink/ELF_ppc64.h" #include "llvm/ExecutionEngine/JITLink/ELF_riscv.h" +#include "llvm/ExecutionEngine/JITLink/ELF_systemz.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" #include "llvm/Object/ELF.h" @@ -98,6 +99,8 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer, return createLinkGraphFromELFObject_loongarch(ObjectBuffer, std::move(SSP)); case ELF::EM_RISCV: return createLinkGraphFromELFObject_riscv(ObjectBuffer, std::move(SSP)); + case ELF::EM_S390: + return createLinkGraphFromELFObject_systemz(ObjectBuffer, std::move(SSP)); case ELF::EM_X86_64: return createLinkGraphFromELFObject_x86_64(ObjectBuffer, std::move(SSP)); case ELF::EM_386: @@ -135,6 +138,9 @@ void link_ELF(std::unique_ptr<LinkGraph> G, case Triple::riscv64: link_ELF_riscv(std::move(G), std::move(Ctx)); return; + case Triple::systemz: + link_ELF_systemz(std::move(G), std::move(Ctx)); + return; case Triple::x86_64: link_ELF_x86_64(std::move(G), std::move(Ctx)); return; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp new file mode 100644 index 0000000000000..29eeecceea766 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp @@ -0,0 +1,424 @@ +//===----- ELF_systemz.cpp - JIT linker implementation for ELF/systemz ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/systemz jit-link implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "DefineExternalSectionStartAndEndSymbols.h" +#include "EHFrameSupportImpl.h" +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace { + +constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_"; + +Error buildTables_ELF_systemz(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + systemz::GOTTableManager GOT; + systemz::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} + +} // namespace + +namespace llvm { +namespace jitlink { +class ELFJITLinker_systemz : public JITLinker<ELFJITLinker_systemz> { + friend class JITLinker<ELFJITLinker_systemz>; + +public: + ELFJITLinker_systemz(std::unique_ptr<JITLinkContext> Ctx, + std::unique_ptr<LinkGraph> G, + PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) { + if (shouldAddDefaultTargetPasses(getGraph().getTargetTriple())) + getPassConfig().PostAllocationPasses.push_back( + [this](LinkGraph &G) { return getOrCreateGOTSymbol(G); }); + } + +private: + Symbol *GOTSymbol = nullptr; + + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + return systemz::applyFixup(G, B, E, GOTSymbol); + } + + Error getOrCreateGOTSymbol(LinkGraph &G) { + auto DefineExternalGOTSymbolIfPresent = + createDefineExternalSectionStartAndEndSymbolsPass( + [&](LinkGraph &LG, Symbol &Sym) -> SectionRangeSymbolDesc { + if (Sym.getName() != nullptr && + *Sym.getName() == ELFGOTSymbolName) + if (auto *GOTSection = G.findSectionByName( + systemz::GOTTableManager::getSectionName())) { + GOTSymbol = &Sym; + return {*GOTSection, true}; + } + return {}; + }); + + // Try to attach _GLOBAL_OFFSET_TABLE_ to the GOT if it's defined as an + // external. + if (auto Err = DefineExternalGOTSymbolIfPresent(G)) + return Err; + + // If we succeeded then we're done. + if (GOTSymbol) + return Error::success(); + + // Otherwise look for a GOT section: If it already has a start symbol we'll + // record it, otherwise we'll create our own. + // If there's a GOT section but we didn't find an external GOT symbol... + if (auto *GOTSection = + G.findSectionByName(systemz::GOTTableManager::getSectionName())) { + + // Check for an existing defined symbol. + for (auto *Sym : GOTSection->symbols()) + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + GOTSymbol = Sym; + return Error::success(); + } + + // If there's no defined symbol then create one. + SectionRange SR(*GOTSection); + if (SR.empty()) + GOTSymbol = + &G.addAbsoluteSymbol(ELFGOTSymbolName, orc::ExecutorAddr(), 0, + Linkage::Strong, Scope::Local, true); + else + GOTSymbol = + &G.addDefinedSymbol(*SR.getFirstBlock(), 0, ELFGOTSymbolName, 0, + Linkage::Strong, Scope::Local, false, true); + } + + // If we still haven't found a GOT symbol then double check the externals. + // We may have a GOT-relative reference but no GOT section, in which case + // we just need to point the GOT symbol at some address in this graph. 
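+    // (For instance, an object may reference the GOT base only through
+    // GOT-relative relocations such as R_390_GOTOFF64, which measure a
+    // symbol's distance from the GOT base without ever requesting a GOT
+    // slot, so no GOT section gets created.)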
+ if (!GOTSymbol) { + for (auto *Sym : G.external_symbols()) { + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + auto Blocks = G.blocks(); + if (!Blocks.empty()) { + G.makeAbsolute(*Sym, (*Blocks.begin())->getAddress()); + GOTSymbol = Sym; + break; + } + } + } + } + + return Error::success(); + } +}; + +class ELFLinkGraphBuilder_systemz + : public ELFLinkGraphBuilder<object::ELF64BE> { +private: + using ELFT = object::ELF64BE; + using Base = ELFLinkGraphBuilder<ELFT>; + using Base::G; // Use LinkGraph pointer from base class. + + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Processing relocations:\n"); + + using Base = ELFLinkGraphBuilder<ELFT>; + using Self = ELFLinkGraphBuilder_systemz; + for (const auto &RelSect : Base::Sections) { + if (RelSect.sh_type == ELF::SHT_REL) + // Validate the section to read relocation entries from. + return make_error<StringError>("No SHT_REL in valid " + + G->getTargetTriple().getArchName() + + " ELF object files", + inconvertibleErrorCode()); + + if (Error Err = Base::forEachRelaRelocation(RelSect, this, + &Self::addSingleRelocation)) + return Err; + } + + return Error::success(); + } + + Error addSingleRelocation(const typename ELFT::Rela &Rel, + const typename ELFT::Shdr &FixupSect, + Block &BlockToFix) { + using support::big32_t; + using Base = ELFLinkGraphBuilder<ELFT>; + auto ELFReloc = Rel.getType(false); + + // No reloc. + if (LLVM_UNLIKELY(ELFReloc == ELF::R_390_NONE)) + return Error::success(); + + uint32_t SymbolIndex = Rel.getSymbol(false); + auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); + if (!ObjSymbol) + return ObjSymbol.takeError(); + + Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex); + if (!GraphSymbol) + return make_error<StringError>( + formatv("Could not find symbol at given index, did you add it to " + "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}", + SymbolIndex, (*ObjSymbol)->st_shndx, + Base::GraphSymbols.size()), + inconvertibleErrorCode()); + + // Validate the relocation kind. + int64_t Addend = Rel.r_addend; + Edge::Kind Kind = Edge::Invalid; + + switch (ELFReloc) { + case ELF::R_390_PC64: { + Kind = systemz::Delta64; + break; + } + case ELF::R_390_PC32: { + Kind = systemz::Delta32; + break; + } + case ELF::R_390_PC16: { + Kind = systemz::Delta16; + break; + } + case ELF::R_390_PC32DBL: { + Kind = systemz::Delta32dbl; + break; + } + case ELF::R_390_PC24DBL: { + Kind = systemz::Delta24dbl; + break; + } + case ELF::R_390_PC16DBL: { + Kind = systemz::Delta16dbl; + break; + } + case ELF::R_390_PC12DBL: { + Kind = systemz::Delta12dbl; + break; + } + case ELF::R_390_64: { + Kind = systemz::Pointer64; + break; + } + case ELF::R_390_32: { + Kind = systemz::Pointer32; + break; + } + case ELF::R_390_20: { + Kind = systemz::Pointer20; + break; + } + case ELF::R_390_16: { + Kind = systemz::Pointer16; + break; + } + case ELF::R_390_12: { + Kind = systemz::Pointer12; + break; + } + case ELF::R_390_8: { + Kind = systemz::Pointer8; + break; + } + // Relocations targeting the PLT associated with the symbol. 
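+    // These are typically produced by calls to external functions, e.g. a
+    // 'brasl %r14, foo@plt' call site is relocated with R_390_PLT32DBL.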
+    case ELF::R_390_PLT64: {
+      Kind = systemz::DeltaPLT64;
+      break;
+    }
+    case ELF::R_390_PLT32: {
+      Kind = systemz::DeltaPLT32;
+      break;
+    }
+    case ELF::R_390_PLT32DBL: {
+      Kind = systemz::DeltaPLT32dbl;
+      break;
+    }
+    case ELF::R_390_PLT24DBL: {
+      Kind = systemz::DeltaPLT24dbl;
+      break;
+    }
+    case ELF::R_390_PLT16DBL: {
+      Kind = systemz::DeltaPLT16dbl;
+      break;
+    }
+    case ELF::R_390_PLT12DBL: {
+      Kind = systemz::DeltaPLT12dbl;
+      break;
+    }
+    case ELF::R_390_PLTOFF64: {
+      Kind = systemz::Delta64PLTFromGOT;
+      break;
+    }
+    case ELF::R_390_PLTOFF32: {
+      Kind = systemz::Delta32PLTFromGOT;
+      break;
+    }
+    case ELF::R_390_PLTOFF16: {
+      Kind = systemz::Delta16PLTFromGOT;
+      break;
+    }
+    // Relocations targeting the actual symbol (just relative to the GOT).
+    case ELF::R_390_GOTOFF64: {
+      Kind = systemz::Delta64FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTOFF: {
+      Kind = systemz::Delta32FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTOFF16: {
+      Kind = systemz::Delta16FromGOT;
+      break;
+    }
+    // Relocations targeting the GOT entry associated with the symbol.
+    case ELF::R_390_GOT64:
+    case ELF::R_390_GOTPLT64: {
+      Kind = systemz::RequestGOTAndTransformToDelta64FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT32:
+    case ELF::R_390_GOTPLT32: {
+      Kind = systemz::RequestGOTAndTransformToDelta32FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT20:
+    case ELF::R_390_GOTPLT20: {
+      Kind = systemz::RequestGOTAndTransformToDelta20FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT16:
+    case ELF::R_390_GOTPLT16: {
+      Kind = systemz::RequestGOTAndTransformToDelta16FromGOT;
+      break;
+    }
+    case ELF::R_390_GOT12:
+    case ELF::R_390_GOTPLT12: {
+      Kind = systemz::RequestGOTAndTransformToDelta12FromGOT;
+      break;
+    }
+    case ELF::R_390_GOTENT:
+    case ELF::R_390_GOTPLTENT: {
+      Kind = systemz::RequestGOTAndTransformToDelta32dbl;
+      break;
+    }
+    // R_390_GOTPC and R_390_GOTPCDBL don't create a GOT entry; they don't
+    // even have an associated symbol.
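+    // They appear when materializing the GOT base itself (e.g. via 'larl')
+    // and resolve to the PC-relative distance to the GOT base rather than
+    // to any particular GOT slot.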
+ case ELF::R_390_GOTPC: { + Kind = systemz::Delta32GOTBase; + break; + } + case ELF::R_390_GOTPCDBL: { + Kind = systemz::Delta32dblGOTBase; + break; + } + default: + return make_error<JITLinkError>( + "In " + G->getName() + ": Unsupported systemz relocation type " + + object::getELFRelocationTypeName(ELF::EM_S390, ELFReloc)); + } + auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); + Edge GE(Kind, Offset, *GraphSymbol, Addend); + LLVM_DEBUG({ + dbgs() << " "; + printEdge(dbgs(), BlockToFix, GE, systemz::getEdgeKindName(Kind)); + dbgs() << "\n"; + }); + + BlockToFix.addEdge(std::move(GE)); + + return Error::success(); + } + +public: + ELFLinkGraphBuilder_systemz(StringRef FileName, + const object::ELFFile<ELFT> &Obj, + std::shared_ptr<orc::SymbolStringPool> SSP, + Triple TT, SubtargetFeatures Features) + : ELFLinkGraphBuilder<ELFT>(Obj, std::move(SSP), std::move(TT), + std::move(Features), FileName, + systemz::getEdgeKindName) {} +}; + +Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr<orc::SymbolStringPool> SSP) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + auto Features = (*ELFObj)->getFeatures(); + if (!Features) + return Features.takeError(); + + assert((*ELFObj)->getArch() == Triple::systemz && + "Only SystemZ is supported"); + + auto &ELFObjFile = cast<object::ELFObjectFile<object::ELF64BE>>(**ELFObj); + return ELFLinkGraphBuilder_systemz( + (*ELFObj)->getFileName(), ELFObjFile.getELFFile(), std::move(SSP), + (*ELFObj)->makeTriple(), std::move(*Features)) + .buildGraph(); +} + +void link_ELF_systemz(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add eh-frame passes. + Config.PrePrunePasses.push_back(DWARFRecordSectionSplitter(".eh_frame")); + Config.PrePrunePasses.push_back( + EHFrameEdgeFixer(".eh_frame", G->getPointerSize(), systemz::Pointer32, + systemz::Pointer64, systemz::Delta32, systemz::Delta64, + systemz::NegDelta32)); + Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame")); + + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs build pass. + Config.PostPrunePasses.push_back(buildTables_ELF_systemz); + + // Resolve any external section start / end symbols. + Config.PostAllocationPasses.push_back( + createDefineExternalSectionStartAndEndSymbolsPass( + identifyELFSectionStartAndEndSymbols)); + + // TODO: Add GOT/Stubs optimizer pass. 
+ // Config.PreFixupPasses.push_back(systemz::optimizeGOTAndStubAccesses); + } + + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_systemz::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 6e316f105715d..d98ded1ee4c32 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -16,6 +16,7 @@ #include "llvm/ExecutionEngine/JITLink/XCOFF.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/loongarch.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" #include "llvm/ExecutionEngine/JITLink/x86.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/Support/raw_ostream.h" @@ -479,6 +480,8 @@ AnonymousPointerCreator getAnonymousPointerCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointer; + case Triple::systemz: + return systemz::createAnonymousPointer; default: return nullptr; } @@ -495,6 +498,8 @@ PointerJumpStubCreator getPointerJumpStubCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointerJumpStub; + case Triple::systemz: + return systemz::createAnonymousPointerJumpStub; default: return nullptr; } diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h index 343218ec9ad18..91021e457532e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -21,8 +21,6 @@ #include "EHFrameSupportImpl.h" #include "JITLinkGeneric.h" -#include <list> - namespace llvm { namespace jitlink { diff --git a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp new file mode 100644 index 0000000000000..f6cc29fa6e6a1 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp @@ -0,0 +1,114 @@ +//===---- systemz.cpp - Generic JITLink systemz edge kinds, utilities -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/systemz.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace systemz { + +const char NullPointerContent[8] = {0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + +const char Pointer64JumpStubContent[8] = { + static_cast<char>(0xC4u), + 0x18, + 0x00, + 0x00, + 0x00, + 0x00, // lgrl r1 + static_cast<char>(0x07u), + static_cast<char>(0xF1u), // BCR 15, 1 +}; + +const char *getEdgeKindName(Edge::Kind R) { + switch (R) { + case Pointer64: + return "Pointer64"; + case Pointer32: + return "Pointer32"; + case Pointer20: + return "Pointer20"; + case Pointer16: + return "Pointer16"; + case Pointer12: + return "Pointer12"; + case Pointer8: + return "Pointer8"; + case Delta64: + return "Delta64"; + case Delta32: + return "Delta32"; + case Delta16: + return "Delta16"; + case Delta32dbl: + return "Delta32dbl"; + case Delta24dbl: + return "Delta24dbl"; + case Delta16dbl: + return "Delta16dbl"; + case Delta12dbl: + return "Delta12dbl"; + case NegDelta64: + return "NegDelta64"; + case NegDelta32: + return "NegDelta32"; + case DeltaPLT32dbl: + return "DeltaPLT32dbl"; + case DeltaPLT24dbl: + return "DeltaPLT24dbl"; + case DeltaPLT16dbl: + return "DeltaPLT16dbl"; + case DeltaPLT12dbl: + return "DeltaPLT12dbl"; + case DeltaPLT64: + return "DeltaPLT64"; + case DeltaPLT32: + return "DeltaPLT32"; + case Delta64FromGOT: + return "Delta64FromGOT"; + case Delta32FromGOT: + return "Delta32FromGOT"; + case Delta16FromGOT: + return "Delta16FromGOT"; + case Delta64PLTFromGOT: + return "Delta64PLTFromGOT"; + case Delta32PLTFromGOT: + return "Delta32PLTFromGOT"; + case Delta16PLTFromGOT: + return "Delta16PLTFromGOT"; + case Delta32GOTBase: + return "Delta32GOTBase"; + case Delta32dblGOTBase: + return "Delta32dblGOTBase"; + case RequestGOTAndTransformToDelta64FromGOT: + return "RequestGOTAndTransformToDelta64FromGOT"; + case RequestGOTAndTransformToDelta32FromGOT: + return "RequestGOTAndTransformToDelta32FromGOT"; + case RequestGOTAndTransformToDelta20FromGOT: + return "RequestGOTAndTransformToDelta20FromGOT"; + case RequestGOTAndTransformToDelta16FromGOT: + return "RequestGOTAndTransformToDelta16FromGOT"; + case RequestGOTAndTransformToDelta12FromGOT: + return "RequestGOTAndTransformToDelta12FromGOT"; + case RequestGOTAndTransformToDelta32dbl: + return "RequestGOTAndTransformToDelta32dbl"; + default: + return getGenericEdgeKindName(static_cast<Edge::Kind>(R)); + } +} + +} // namespace systemz +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index 7e606c6a473b6..4e7db822776cc 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -27,7 +27,7 @@ namespace llvm { namespace orc { -MemoryMapper::~MemoryMapper() {} +MemoryMapper::~MemoryMapper() = default; InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize) : PageSize(PageSize) {} diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt index 927558649eb4d..ca8192bb99492 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt @@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess ExecutorSharedMemoryMapperService.cpp DefaultHostBootstrapValues.cpp ExecutorResolver.cpp + 
LibraryResolver.cpp JITLoaderGDB.cpp JITLoaderPerf.cpp JITLoaderVTune.cpp + LibraryScanner.cpp OrcRTBootstrap.cpp RegisterEHFrames.cpp SimpleExecutorDylibManager.cpp @@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess LINK_COMPONENTS ${intel_jit_profiling} + BinaryFormat + Object OrcShared Support TargetParser diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp new file mode 100644 index 0000000000000..7e1d5285463c7 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp @@ -0,0 +1,367 @@ +//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Library resolution impl for unresolved symbols +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" + +#include "llvm/ADT/StringSet.h" + +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" + +#include <mutex> +#include <thread> + +#define DEBUG_TYPE "orc-resolver" + +namespace llvm::orc { + +LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S) + : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()), + LibPathResolver(S.PResolver + ? S.PResolver + : std::make_shared<PathResolver>(LibPathCache)), + ScanHelper(S.BasePaths, LibPathCache, LibPathResolver), + FB(S.FilterBuilder), LibMgr(), + ShouldScanCall(S.ShouldScanCall ? 
S.ShouldScanCall + : [](StringRef) -> bool { return true; }), + scanBatchSize(S.ScanBatchSize) { + + if (ScanHelper.getAllUnits().empty()) { + LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n"); + } +} + +std::unique_ptr<LibraryResolutionDriver> +LibraryResolutionDriver::create(const LibraryResolver::Setup &S) { + auto LR = std::make_unique<LibraryResolver>(S); + return std::unique_ptr<LibraryResolutionDriver>( + new LibraryResolutionDriver(std::move(LR))); +} + +void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) { + LR->ScanHelper.addBasePath(Path, K); +} + +bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Loaded); + + return true; +} + +bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) { + auto Lib = LR->LibMgr.getLibrary(Path); + if (!Lib) + return false; + + Lib->setState(LibraryManager::LibState::Unloaded); + + return true; +} + +void LibraryResolutionDriver::resolveSymbols( + std::vector<std::string> Syms, + LibraryResolver::OnSearchComplete OnCompletion, + const SearchConfig &Config) { + LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config); +} + +static bool shouldIgnoreSymbol(const object::SymbolRef &Sym, + uint32_t IgnoreFlags) { + Expected<uint32_t> FlagsOrErr = Sym.getFlags(); + if (!FlagsOrErr) { + consumeError(FlagsOrErr.takeError()); + return true; + } + + uint32_t Flags = *FlagsOrErr; + + using Filter = SymbolEnumeratorOptions; + if ((IgnoreFlags & Filter::IgnoreUndefined) && + (Flags & object::SymbolRef::SF_Undefined)) + return true; + if ((IgnoreFlags & Filter::IgnoreIndirect) && + (Flags & object::SymbolRef::SF_Indirect)) + return true; + if ((IgnoreFlags & Filter::IgnoreWeak) && + (Flags & object::SymbolRef::SF_Weak)) + return true; + + return false; +} + +bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach, + const SymbolEnumeratorOptions &Opts) { + if (Path.empty()) + return false; + + ObjectFileLoader ObjLoader(Path); + + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + std::string ErrMsg; + handleAllErrors(ObjOrErr.takeError(), + [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); }); + LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path + << "\nError: " << ErrMsg << "\n"); + return false; + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + auto processSymbolRange = + [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult { + for (const auto &Sym : Range) { + if (shouldIgnoreSymbol(Sym, Opts.FilterFlags)) + continue; + + auto NameOrErr = Sym.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + continue; + } + + StringRef Name = *NameOrErr; + if (Name.empty()) + continue; + + EnumerateResult Res = OnEach(Name); + if (Res != EnumerateResult::Continue) + return Res; + } + return EnumerateResult::Continue; + }; + + EnumerateResult Res = processSymbolRange(Obj->symbols()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + + if (Obj->isELF()) { + const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj); + Res = processSymbolRange(ElfObj->getDynamicSymbolIterators()); + if (Res != EnumerateResult::Continue) + return Res == EnumerateResult::Stop; + } else if (Obj->isCOFF()) { + const auto *CoffObj = cast<object::COFFObjectFile>(Obj); + for (auto I = CoffObj->export_directory_begin(), + E = CoffObj->export_directory_end(); + I != E; ++I) { + 
StringRef Name;
+      if (I->getSymbolName(Name))
+        continue;
+      if (Name.empty())
+        continue;
+
+      EnumerateResult Res = OnEach(Name);
+      if (Res != EnumerateResult::Continue)
+        return Res == EnumerateResult::Stop;
+    }
+  } else if (Obj->isMachO()) {
+  }
+
+  return true;
+}
+
+class SymbolSearchContext {
+public:
+  SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+  bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+  void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); }
+
+  inline bool allResolved() const { return Q.allResolved(); }
+
+  SymbolQuery &query() { return Q; }
+
+private:
+  SymbolQuery &Q;
+  DenseSet<const LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+    LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+    const SymbolEnumeratorOptions &Opts) {
+  LLVM_DEBUG(dbgs() << "Checking unresolved symbols in library: "
+                    << Lib.getFileName() << "\n";);
+  StringSet<> DiscoveredSymbols;
+
+  if (!UnresolvedSymbols.hasUnresolved()) {
+    LLVM_DEBUG(dbgs() << "Skipping library " << Lib.getFullPath()
+                      << ": no unresolved symbols remain.\n";);
+    return;
+  }
+
+  bool HasEnumerated = false;
+  auto enumerateSymbolsIfNeeded = [&]() {
+    if (HasEnumerated)
+      return;
+
+    HasEnumerated = true;
+
+    LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+                      << "\n";);
+    SymbolEnumerator::enumerateSymbols(
+        Lib.getFullPath(),
+        [&](StringRef sym) {
+          DiscoveredSymbols.insert(sym);
+          return EnumerateResult::Continue;
+        },
+        Opts);
+  };
+
+  if (!Lib.hasFilter()) {
+    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+                      << "\n";);
+    enumerateSymbolsIfNeeded();
+    if (DiscoveredSymbols.empty()) {
+      LLVM_DEBUG(dbgs() << "No symbols found; removing library: "
+                        << Lib.getFullPath() << "\n";);
+      LibMgr.removeLibrary(Lib.getFullPath());
+      return;
+    }
+    SmallVector<StringRef> SymbolVec;
+    SymbolVec.reserve(DiscoveredSymbols.size());
+    for (const auto &KV : DiscoveredSymbols)
+      SymbolVec.push_back(KV.first());
+
+    Lib.ensureFilterBuilt(FB, SymbolVec);
+    LLVM_DEBUG({
+      dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+      for (const auto &KV : DiscoveredSymbols)
+        dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+    });
+  }
+
+  const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+  bool HadAnySym = false;
+  LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+                    << "\n";);
+  for (const auto &Sym : Unresolved) {
+    if (Lib.mayContain(Sym)) {
+      LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+                        << "' in library: " << Lib.getFullPath() << "\n";);
+      enumerateSymbolsIfNeeded();
+      if (DiscoveredSymbols.count(Sym) > 0) {
+        LLVM_DEBUG(dbgs() << "Resolved symbol: " << Sym
+                          << " in library: " << Lib.getFullPath() << "\n";);
+        UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+        HadAnySym = true;
+      }
+    }
+  }
+
+  using LibraryState = LibraryManager::LibState;
+  if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+    Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+    std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+    const SearchConfig &Config) {
+  SymbolQuery Q(SymbolList);
+
+  using LibraryState = LibraryManager::LibState;
+  using LibraryType = PathType;
+  auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+    LLVM_DEBUG(dbgs() << "Trying to resolve from state=" << static_cast<int>(S)
+                      << " type=" << static_cast<int>(K) << "\n";);
+
+    SymbolSearchContext Ctx(Q);
+    while (!Ctx.allResolved()) {
+      std::vector<std::shared_ptr<LibraryInfo>> Libs;
+      LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) {
+        return !Ctx.hasSearched(&Lib);
+      });
+
+      if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize))
+        break; // No more new libraries to scan.
+
+      for (auto &Lib : Libs) {
+        // TODO: Consider resolving these libraries asynchronously.
+        resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+        Ctx.markSearched(Lib.get());
+
+        if (Ctx.allResolved())
+          return;
+      }
+    }
+  };
+
+  for (const auto &[St, Ty] : Config.Policy.Plan) {
+    tryResolveFrom(St, Ty);
+    if (Q.allResolved())
+      break;
+  }
+
+  // Report the final resolution results.
+  LLVM_DEBUG({
+    dbgs() << "Search complete.\n";
+    for (const auto &r : Q.getAllResults())
+      dbgs() << "Resolved symbol: " << r->Name << " -> " << r->ResolvedLibPath
+             << "\n";
+  });
+
+  OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+  LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+                    << (PK == PathType::User ? "User" : "System")
+                    << " libraries\n";);
+  if (!ScanHelper.leftToScan(PK))
+    return false;
+
+  LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+  Scanner.scanNext(PK, BatchSize);
+  return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+                                            StringRef SymName,
+                                            std::vector<std::string> *AllSyms) {
+  SymbolEnumeratorOptions Opts;
+  return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+    const LibraryInfo &Lib, StringRef SymName,
+    std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+  bool Found = false;
+
+  SymbolEnumerator::enumerateSymbols(
+      Lib.getFullPath(),
+      [&](StringRef Sym) {
+        if (AllSyms)
+          AllSyms->emplace_back(Sym.str());
+
+        if (Sym == SymName)
+          Found = true;
+
+        return EnumerateResult::Continue;
+      },
+      Opts);
+
+  return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000000000..96f0b66adec92
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" + +#ifdef LLVM_ON_UNIX +#include <sys/stat.h> +#include <unistd.h> +#endif // LLVM_ON_UNIX + +#ifdef __APPLE__ +#include <sys/stat.h> +#undef LC_LOAD_DYLIB +#undef LC_RPATH +#endif // __APPLE__ + +#define DEBUG_TYPE "orc-scanner" + +namespace llvm::orc { + +void handleError(Error Err, StringRef context = "") { + consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + dbgs() << "LLVM Error"; + if (!context.empty()) + dbgs() << " [" << context << "]"; + dbgs() << ": " << EIB.message() << "\n"; + })); +} + +bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) { + Triple HostTriple(sys::getProcessTriple()); + Triple ObjTriple = Obj.makeTriple(); + + LLVM_DEBUG({ + dbgs() << "Host triple: " << HostTriple.str() + << ", Object triple: " << ObjTriple.str() << "\n"; + }); + + if (ObjTriple.getArch() != Triple::UnknownArch && + HostTriple.getArch() != ObjTriple.getArch()) + return false; + + if (ObjTriple.getOS() != Triple::UnknownOS && + HostTriple.getOS() != ObjTriple.getOS()) + return false; + + if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != Triple::UnknownEnvironment && + HostTriple.getEnvironment() != ObjTriple.getEnvironment()) + return false; + + return true; +} + +Expected<object::OwningBinary<object::ObjectFile>> +ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath + << "\n";); + auto BinOrErr = object::createBinary(FilePath); + if (!BinOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath + << "\n";); + return BinOrErr.takeError(); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath + << "\n";); + + auto OwningBin = BinOrErr->takeBinary(); + object::Binary *Bin = OwningBin.first.get(); + + if (Bin->isArchive()) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: " + << FilePath << "\n";); + return createStringError(std::errc::invalid_argument, + "Archive files are not supported: %s", + FilePath.str().c_str()); + } + +#if defined(__APPLE__) + if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: " + << FilePath << "\n";); + for (auto ObjForArch : UB->objects()) { + auto ObjOrErr = ObjForArch.getAsObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG( + dbgs() + << "ObjectFileLoader: Skipping invalid architecture slice\n";); + + consumeError(ObjOrErr.takeError()); + continue; + } + + std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get()); + if (isArchitectureCompatible(*Obj)) { + LLVM_DEBUG( + dbgs() << 
"ObjectFileLoader: Found compatible object slice\n";); + + return object::OwningBinary<object::ObjectFile>( + std::move(Obj), std::move(OwningBin.second)); + + } else { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture " + "slice skipped\n";); + } + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in " + "universal binary\n";); + return createStringError(inconvertibleErrorCode(), + "No compatible object found in fat binary: %s", + FilePath.str().c_str()); + } +#endif + + auto ObjOrErr = + object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef()); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";); + return ObjOrErr.takeError(); + } + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";); + + std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr); + if (!isArchitectureCompatible(*Obj)) { + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Incompatible object file: %s", + FilePath.str().c_str()); + } + + LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";); + + return object::OwningBinary<object::ObjectFile>(std::move(Obj), + std::move(OwningBin.second)); +} + +template <class ELFT> +bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) { + if (ELFObj.getHeader().e_type != ELF::ET_DYN) + return false; + + auto PHOrErr = ELFObj.program_headers(); + if (!PHOrErr) { + consumeError(PHOrErr.takeError()); + return true; + } + + for (auto Phdr : *PHOrErr) { + if (Phdr.p_type == ELF::PT_INTERP) + return false; + } + + return true; +} + +bool isSharedLibraryObject(object::ObjectFile &Obj) { + if (Obj.isELF()) { + if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32LE->getELFFile()); + if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF64LE->getELFFile()); + if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF32BE->getELFFile()); + if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj)) + return isELFSharedLibrary(ELF64BE->getELFFile()); + } else if (Obj.isMachO()) { + const object::MachOObjectFile *MachO = + dyn_cast<object::MachOObjectFile>(&Obj); + if (!MachO) { + LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";); + return false; + } + LLVM_DEBUG({ + bool Result = + MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype + << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB + << "), shared: " << Result << "\n"; + }); + + return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB; + } else if (Obj.isCOFF()) { + const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj); + if (!coff) + return false; + return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL; + } else { + LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";); + } + + return false; +} + +bool DylibPathValidator::isSharedLibrary(StringRef Path) { + LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path + << "\n";); + + auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true); + if (FileType != sys::fs::file_type::regular_file) { + LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path + << "\n";); + return false; + } + + file_magic MagicCode; + identify_magic(Path, MagicCode); + + // Skip archives. 
+ if (MagicCode == file_magic::archive) + return false; + + // Universal binary handling. +#if defined(__APPLE__) + if (MagicCode == file_magic::macho_universal_binary) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } +#endif + + // Object file inspection for PE/COFF, ELF, and Mach-O + bool NeedsObjectInspection = +#if defined(_WIN32) + (MagicCode == file_magic::pecoff_executable); +#elif defined(__APPLE__) + (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib || + MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub); +#elif defined(LLVM_ON_UNIX) +#ifdef __CYGWIN__ + (MagicCode == file_magic::pecoff_executable); +#else + (MagicCode == file_magic::elf_shared_object); +#endif +#else +#error "Unsupported platform." +#endif + + if (NeedsObjectInspection) { + ObjectFileLoader ObjLoader(Path); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + consumeError(ObjOrErr.takeError()); + return false; + } + return isSharedLibraryObject(ObjOrErr.get()); + } + + LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path + << "\n";); + return false; +} + +void DylibSubstitutor::configure(StringRef LoaderPath) { + SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr)); + sys::path::remove_filename(ExecPath); + + SmallString<512> LoaderDir; + if (LoaderPath.empty()) { + LoaderDir = ExecPath; + } else { + LoaderDir = LoaderPath.str(); + if (!sys::fs::is_directory(LoaderPath)) + sys::path::remove_filename(LoaderDir); + } + +#ifdef __APPLE__ + Placeholders["@loader_path"] = std::string(LoaderDir); + Placeholders["@executable_path"] = std::string(ExecPath); +#else + Placeholders["$origin"] = std::string(LoaderDir); +#endif +} + +std::optional<std::string> +SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst, + DylibPathValidator &Validator) const { + for (const auto &SP : Paths) { + std::string Base = Subst.substitute(SP); + + SmallString<512> FullPath(Base); + if (!PlaceholderPrefix.empty() && + Stem.starts_with_insensitive(PlaceholderPrefix)) + FullPath.append(Stem.drop_front(PlaceholderPrefix.size())); + else + sys::path::append(FullPath, Stem); + + LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath + << "\n";); + + if (auto Valid = Validator.validate(FullPath.str())) + return Valid; + } + + return std::nullopt; +} + +std::optional<std::string> +DylibResolverImpl::tryWithExtensions(StringRef LibStem) const { + LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";); + SmallVector<SmallString<256>, 8> Candidates; + + // Add extensions by platform +#if defined(__APPLE__) + Candidates.emplace_back(LibStem); + Candidates.back() += ".dylib"; +#elif defined(_WIN32) + Candidates.emplace_back(LibStem); + Candidates.back() += ".dll"; +#else + Candidates.emplace_back(LibStem); + Candidates.back() += ".so"; +#endif + + // Optionally try "lib" prefix if not already there + StringRef FileName = sys::path::filename(LibStem); + StringRef Base = sys::path::parent_path(LibStem); + if (!FileName.starts_with("lib")) { + SmallString<256> WithPrefix(Base); + if (!WithPrefix.empty()) + sys::path::append(WithPrefix, ""); // ensure separator if needed + WithPrefix += "lib"; + WithPrefix += FileName; + +#if defined(__APPLE__) + WithPrefix += ".dylib"; +#elif 
defined(_WIN32) + WithPrefix += ".dll"; +#else + WithPrefix += ".so"; +#endif + + Candidates.push_back(std::move(WithPrefix)); + } + + LLVM_DEBUG({ + dbgs() << " Candidates to try:\n"; + for (const auto &C : Candidates) + dbgs() << " " << C << "\n"; + }); + + // Try each candidate against every resolver. + for (const auto &Name : Candidates) { + + LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";); + + for (const auto &R : Resolvers) { + if (auto Res = R.resolve(Name, Substitutor, Validator)) + return Res; + } + } + + LLVM_DEBUG(dbgs() << " -> No candidate resolved.\n";); + + return std::nullopt; +} + +std::optional<std::string> +DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const { + LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";); + + // If it is an absolute path, don't iterate over the search paths. + if (sys::path::is_absolute(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";); + return Validator.validate(LibStem); + } + + if (!LibStem.starts_with_insensitive("@rpath")) { + if (auto Norm = Validator.validate(Substitutor.substitute(LibStem))) { + LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *Norm + << "\n";); + + return Norm; + } + } + + for (const auto &R : Resolvers) { + LLVM_DEBUG(dbgs() << " -> Resolving via search path ... \n";); + if (auto Result = R.resolve(LibStem, Substitutor, Validator)) { + LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result + << "\n";); + + return Result; + } + } + + // Expand LibStem with "lib" prefixes and platform extensions. + if (VariateLibStem) { + LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";); + + if (auto Norm = tryWithExtensions(LibStem)) { + LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm + << "\n";); + + return Norm; + } + } + + LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";); + + return std::nullopt; +} + +#ifndef _WIN32 +mode_t PathResolver::lstatCached(StringRef Path) { + // If already cached, return the cached result. + if (auto Cache = LibPathCache->read_lstat(Path)) + return *Cache; + + // Not cached: perform lstat and store the result. + struct stat buf{}; + mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ?
0 : buf.st_mode; + + LibPathCache->insert_lstat(Path, st_mode); + + return st_mode; +} + +std::optional<std::string> PathResolver::readlinkCached(StringRef Path) { + // If already cached, return the cached result. + if (auto Cache = LibPathCache->read_link(Path)) + return Cache; + + // Not cached: call readlink and cache the result. Reserve one byte of the + // buffer for the NUL terminator. + char buf[PATH_MAX]; + ssize_t len; + if ((len = readlink(Path.str().c_str(), buf, sizeof(buf) - 1)) != -1) { + buf[len] = '\0'; + std::string s(buf); + LibPathCache->insert_link(Path, s); + return s; + } + return std::nullopt; +} + +// Split Path into components, prepending the components of the home +// directory, the current directory, or BasePath when the base has not been +// resolved yet. +void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved, + SmallVector<StringRef, 16> &Component) { + StringRef Separator = sys::path::get_separator(); + if (!BaseIsResolved) { + if (Path[0] == '~' && + (Path.size() == 1 || sys::path::is_separator(Path[1]))) { + static SmallString<128> HomeP; + if (HomeP.str().empty()) + sys::path::home_directory(HomeP); + StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } else if (BasePath.empty()) { + static SmallString<256> CurrentPath; + if (CurrentPath.str().empty()) + sys::fs::current_path(CurrentPath); + StringRef(CurrentPath) + .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } else { + BasePath.split(Component, Separator, /*MaxSplit*/ -1, + /*KeepEmpty*/ false); + } + } + + Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); +} + +// Collapse "." and ".." segments lexically, without touching the filesystem. +void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) { + SmallVector<StringRef, 16> NormalizedPath; + for (auto &Part : PathParts) { + if (Part == ".") { + continue; + } else if (Part == "..") { + if (!NormalizedPath.empty() && NormalizedPath.back() != "..") { + NormalizedPath.pop_back(); + } else { + NormalizedPath.push_back(".."); + } + } else { + NormalizedPath.push_back(Part); + } + } + PathParts.swap(NormalizedPath); +} +#endif + +std::optional<std::string> PathResolver::realpathCached(StringRef Path, + std::error_code &EC, + StringRef Base, + bool BaseIsResolved, + long SymLoopLevel) { + EC.clear(); + + if (Path.empty()) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";); + + return std::nullopt; + } + + if (SymLoopLevel <= 0) { + EC = std::make_error_code(std::errc::too_many_symbolic_link_levels); + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Too many symlink levels: " + << Path << "\n";); + + return std::nullopt; + } + + // Only absolute paths are cached; return the cached result if present. + bool isRelative = sys::path::is_relative(Path); + if (!isRelative) { + if (auto Cached = LibPathCache->read_realpath(Path)) { + EC = Cached->ErrnoCode; + if (EC) { + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for " + << Path << "\n";); + } else { + LLVM_DEBUG( + dbgs() << "PathResolver::realpathCached: Cached (success) for " + << Path << " => " << Cached->canonicalPath << "\n";); + } + return Cached->canonicalPath.empty() + ?
std::nullopt + : std::make_optional(Cached->canonicalPath); + } + } + + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path + << "\n";); + + // Not cached: resolve the path component by component and cache the result. + + StringRef Separator(sys::path::get_separator()); + SmallString<256> Resolved(Separator); +#ifndef _WIN32 + SmallVector<StringRef, 16> Components; + + if (isRelative) { + if (BaseIsResolved) { + Resolved.assign(Base); + LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";); + } + createComponent(Path, Base, BaseIsResolved, Components); + } else { + Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false); + } + + normalizePathSegments(Components); + LLVM_DEBUG({ + for (auto &C : Components) + dbgs() << " " << C << " "; + + dbgs() << "\n"; + }); + + // Walk the components, resolving symlinks as they are encountered. + for (const auto &Component : Components) { + if (Component == ".") + continue; + if (Component == "..") { + // collapse "a/b/../c" to "a/c" + size_t S = Resolved.rfind(Separator); + if (S != llvm::StringRef::npos) + Resolved.resize(S); + if (Resolved.empty()) + Resolved = Separator; + continue; + } + + size_t oldSize = Resolved.size(); + sys::path::append(Resolved, Component); + const char *ResolvedPath = Resolved.c_str(); + LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => " + << ResolvedPath << "\n";); + mode_t st_mode = lstatCached(ResolvedPath); + + if (S_ISLNK(st_mode)) { + LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";); + + auto SymlinkOpt = readlinkCached(ResolvedPath); + if (!SymlinkOpt) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + + StringRef Symlink = *SymlinkOpt; + LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";); + + std::string resolvedBase = ""; + if (sys::path::is_relative(Symlink)) { + Resolved.resize(oldSize); + resolvedBase = Resolved.str().str(); + } + + auto RealSymlink = + realpathCached(Symlink, EC, resolvedBase, + /*BaseIsResolved=*/true, SymLoopLevel - 1); + if (!RealSymlink) { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink + << "\n";); + + return std::nullopt; + } + + Resolved.assign(*RealSymlink); + LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";); + + } else if (st_mode == 0) { + EC = std::make_error_code(std::errc::no_such_file_or_directory); + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath + << "\n";); + + return std::nullopt; + } + } +#else + EC = sys::fs::real_path(Path, Resolved); // Windows fallback +#endif + + // Do not cache a success entry if the fallback failed. + if (EC) { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC}); + return std::nullopt; + } + + std::string Canonical = Resolved.str().str(); + { + LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{ + Canonical, + std::error_code() // success + }); + } + LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path + << " => " << Canonical << "\n";); + return Canonical; +} + +void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) { + LLVM_DEBUG( + dbgs() + << "LibraryScanHelper::addBasePath: Failed to canonicalize path: " + << Path << "\n";); + return; + } + std::unique_lock<std::shared_mutex> Lock(Mtx); + if
(LibSearchPaths.count(Canon)) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: " + << Canon << "\n";); + return; + } + K = K == PathType::Unknown ? classifyKind(Canon) : K; + auto SP = std::make_shared<LibrarySearchPath>(Canon, K); + LibSearchPaths[Canon] = SP; + + if (K == PathType::User) { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: " + << Canon << "\n";); + UnscannedUsr.push_back(StringRef(SP->BasePath)); + } else { + LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: " + << Canon << "\n";); + UnscannedSys.push_back(StringRef(SP->BasePath)); + } +} + +std::vector<std::shared_ptr<LibrarySearchPath>> +LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) { + std::vector<std::shared_ptr<LibrarySearchPath>> Result; + auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys; + + std::unique_lock<std::shared_mutex> Lock(Mtx); + + while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) { + StringRef Base = Queue.front(); + auto It = LibSearchPaths.find(Base); + if (It != LibSearchPaths.end()) { + auto &SP = It->second; + ScanState Expected = ScanState::NotScanned; + if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) { + Result.push_back(SP); + } + } + Queue.pop_front(); + } + + return Result; +} + +bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const { + std::error_code EC; + std::string Canon = resolveCanonical(Path, EC); + if (EC) + return false; + + std::shared_lock<std::shared_mutex> Lock(Mtx); + return LibSearchPaths.count(Canon) > 0; +} + +bool LibraryScanHelper::leftToScan(PathType K) const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + for (const auto &KV : LibSearchPaths) { + const auto &SP = KV.second; + if (SP->Kind == K && SP->State == ScanState::NotScanned) + return true; + } + return false; +} + +void LibraryScanHelper::resetToScan() { + // Mutates the unscanned queues, so an exclusive lock is required. + std::unique_lock<std::shared_mutex> Lock(Mtx); + + for (auto &[_, SP] : LibSearchPaths) { + ScanState Expected = ScanState::Scanned; + + if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned)) + continue; + + auto &TargetList = + (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys; + TargetList.emplace_back(SP->BasePath); + } +} + +std::vector<std::shared_ptr<LibrarySearchPath>> +LibraryScanHelper::getAllUnits() const { + std::shared_lock<std::shared_mutex> Lock(Mtx); + std::vector<std::shared_ptr<LibrarySearchPath>> Result; + Result.reserve(LibSearchPaths.size()); + for (const auto &[_, SP] : LibSearchPaths) { + Result.push_back(SP); + } + return Result; +} + +std::string LibraryScanHelper::resolveCanonical(StringRef Path, + std::error_code &EC) const { + auto Canon = LibPathResolver->resolve(Path, EC); + return EC ?
Path.str() : *Canon; +} + +PathType LibraryScanHelper::classifyKind(StringRef Path) const { + // Detect home directory + const char *Home = getenv("HOME"); + if (Home && Path.starts_with(Home)) + return PathType::User; + + static const std::array<std::string, 5> UserPrefixes = { + "/usr/local", // often used by users for manual installs + "/opt/homebrew", // common on macOS + "/opt/local", // MacPorts + "/home", // Linux home dirs + "/Users", // macOS user dirs + }; + + for (const auto &Prefix : UserPrefixes) { + if (Path.starts_with(Prefix)) + return PathType::User; + } + + return PathType::System; +} + +Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) { + LibraryDepsInfo Libdeps; + LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";); + for (const auto &Command : Obj.load_commands()) { + switch (Command.C.cmd) { + case MachO::LC_LOAD_DYLIB: { + MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command); + const char *name = Command.Ptr + dylibCmd.dylib.name; + Libdeps.addDep(name); + LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";); + } break; + case MachO::LC_LOAD_WEAK_DYLIB: + case MachO::LC_REEXPORT_DYLIB: + case MachO::LC_LOAD_UPWARD_DYLIB: + case MachO::LC_LAZY_LOAD_DYLIB: + break; + case MachO::LC_RPATH: { + // Extract RPATH + MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command); + const char *rpath = Command.Ptr + rpathCmd.path; + LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";); + + SmallVector<StringRef, 4> RawPaths; + SplitString(StringRef(rpath), RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + + for (const auto &raw : RawPaths) { + Libdeps.addRPath(raw.str()); // Convert to std::string + LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";); + } + break; + } + } + } + + return Expected<LibraryDepsInfo>(std::move(Libdeps)); +} + +template <class ELFT> +static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) { + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) + return DynamicEntriesOrError.takeError(); + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + if (Dyn.d_tag == ELF::DT_STRTAB) { + auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr()); + if (!MappedAddrOrError) + return MappedAddrOrError.takeError(); + return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError)); + } + } + + // If the dynamic segment is not present, we fall back on the sections. + auto SectionsOrError = Elf.sections(); + if (!SectionsOrError) + return SectionsOrError.takeError(); + + for (const typename ELFT::Shdr &Sec : *SectionsOrError) { + if (Sec.sh_type == ELF::SHT_DYNSYM) + return Elf.getStringTableForSymtab(Sec); + } + + return make_error<StringError>("dynamic string table not found", + inconvertibleErrorCode()); +} + +template <typename ELFT> +Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) { + LibraryDepsInfo Deps; + Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf); + if (!StrTabOrErr) + return StrTabOrErr.takeError(); + + const char *Data = StrTabOrErr->data(); + + auto DynamicEntriesOrError = Elf.dynamicEntries(); + if (!DynamicEntriesOrError) { + return DynamicEntriesOrError.takeError(); + } + + for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) { + switch (Dyn.d_tag) { + case ELF::DT_NEEDED: + Deps.addDep(Data + Dyn.d_un.d_val); + break; + case ELF::DT_RPATH: { + SmallVector<StringRef, 4> RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? 
":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRPath(raw.str()); + break; + } + case ELF::DT_RUNPATH: { + SmallVector<StringRef, 4> RawPaths; + SplitString(Data + Dyn.d_un.d_val, RawPaths, + sys::EnvPathSeparator == ':' ? ":" : ";"); + for (const auto &raw : RawPaths) + Deps.addRunPath(raw.str()); + break; + } + case ELF::DT_FLAGS_1: + // Check if this is not a pie executable. + if (Dyn.d_un.d_val & ELF::DF_1_PIE) + Deps.isPIE = true; + break; + // (Dyn.d_tag == ELF::DT_NULL) continue; + // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER) + default: + break; + } + } + + return Expected<LibraryDepsInfo>(std::move(Deps)); +} + +Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) { + using namespace object; + LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";); + if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj)) + return parseELF(ELF->getELFFile()); + + LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";); + return createStringError(std::errc::not_supported, "Unknown ELF format"); +} + +Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) { + LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath + << "\n";); + + ObjectFileLoader ObjLoader(FilePath); + auto ObjOrErr = ObjLoader.getObjectFile(); + if (!ObjOrErr) { + LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";); + return ObjOrErr.takeError(); + } + + object::ObjectFile *Obj = &ObjOrErr.get(); + + if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is an ELF object\n";); + + return parseELFDeps(*elfObj); + } + + if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) { + LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath + << " is a Mach-O object\n";); + return parseMachODeps(*macho); + } + + if (Obj->isCOFF()) { + // TODO: COFF support + return LibraryDepsInfo(); + } + + LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file " + << FilePath << "\n";); + return createStringError(inconvertibleErrorCode(), + "Unsupported binary format: %s", + FilePath.str().c_str()); +} + +std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) { + std::error_code EC; + + LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";); + + // [1] Check file existence early + if (!sys::fs::exists(FilePath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";); + + return std::nullopt; + } + + // [2] Resolve to canonical path + auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC); + if (EC || !CanonicalPathOpt) { + LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC=" + << EC.message() << ").\n";); + + return std::nullopt; + } + + const std::string &CanonicalPath = *CanonicalPathOpt; + LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n"); + + // [3] Check if it's a directory — skip directories + if (sys::fs::is_directory(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";); + + return std::nullopt; + } + + // [4] Skip if it's not a shared library. 
+ if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";); + return std::nullopt; + } + + // [5] Skip if we've already seen this path (via cache) + if (ScanHelper.hasSeenOrMark(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";); + + return std::nullopt; + } + + // [6] Already tracked in LibraryManager? + if (LibMgr.hasLibrary(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";); + + return std::nullopt; + } + + // [7] Run user-defined hook (default: always true) + if (!ShouldScanCall(CanonicalPath)) { + LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";); + + return std::nullopt; + } + + LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath + << "\n";); + return CanonicalPath; +} + +void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) { + LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath + << ", level=" << level << "\n";); + auto CanonPathOpt = shouldScan(FilePath); + if (!CanonPathOpt) { + LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath + << "\n";); + + return; + } + const std::string CanonicalPath = *CanonPathOpt; + + auto DepsOrErr = extractDeps(CanonicalPath); + if (!DepsOrErr) { + LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath + << "\n";); + handleError(DepsOrErr.takeError()); + return; + } + + LibraryDepsInfo &Deps = *DepsOrErr; + + LLVM_DEBUG({ + dbgs() << " Found deps : \n"; + for (const auto &dep : Deps.deps) + dbgs() << " : " << dep << "\n"; + dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n"; + for (const auto &r : Deps.rpath) + dbgs() << " : " << r << "\n"; + dbgs() << " Found @runpath : \n"; + for (const auto &r : Deps.runPath) + dbgs() << " : " << r << "\n"; + }); + + if (Deps.isPIE && level == 0) { + LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: " + << CanonicalPath << "\n";); + + return; + } + + bool Added = LibMgr.addLibrary(CanonicalPath, K); + if (!Added) { + LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";); + return; + } + + // Heuristic 1: No RPATH/RUNPATH, skip deps + if (Deps.rpath.empty() && Deps.runPath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): " + << CanonicalPath << "\n";); + return; + } + + // Heuristic 2: All RPATH and RUNPATH already tracked + auto allTracked = [&](const auto &Paths) { + LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";); + return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) { + LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";); + return ScanHelper.isTrackedBasePath( + DylibResolver::resolvelinkerFlag(P, CanonicalPath)); + }); + }; + + if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): " + << CanonicalPath << "\n";); + return; + } + + DylibPathValidator Validator(ScanHelper.getPathResolver()); + DylibResolver Resolver(Validator); + Resolver.configure(CanonicalPath, + {{Deps.rpath, SearchPathType::RPath}, + {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys}, + {Deps.runPath, SearchPathType::RunPath}}); + for (StringRef Dep : Deps.deps) { + LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";); + auto DepFullOpt = Resolver.resolve(Dep); + if (!DepFullOpt) { + LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";); + + continue; + } + 
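+ // Recurse into the resolved dependency so transitive libraries are + // discovered as well; level only marks the top-level binary for the + // PIE check above.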
LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";); + + handleLibrary(*DepFullOpt, K, level + 1); + } +} + +void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) { + if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) { + LLVM_DEBUG( + dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: " + << SP->BasePath << "\n";); + return; + } + + LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: " + << SP->BasePath << "\n";); + std::error_code EC; + + SP->State.store(ScanState::Scanning); + + for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC; + It.increment(EC)) { + auto Entry = *It; + if (!Entry.status()) + continue; + + auto Status = *Entry.status(); + if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) { + LLVM_DEBUG(dbgs() << " Found file: " << Entry.path() << "\n";); + // async support ? + handleLibrary(Entry.path(), SP->Kind); + } + } + + SP->State.store(ScanState::Scanned); +} + +void LibraryScanner::scanNext(PathType K, size_t BatchSize) { + LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size " + << BatchSize << " for kind " + << (K == PathType::User ? "User" : "System") << "\n";); + + auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize); + for (auto &SP : SearchPaths) { + LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath + << "\n";); + + scanBaseDir(SP); + } +} + +} // end namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp index fe881a1eef773..af24540ceebcb 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp @@ -36,8 +36,6 @@ Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) { Expected<ExecutorAddr> SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) { - std::vector<shared::WrapperFunctionCall> DeallocationActions; - if (FR.Segments.empty()) { if (FR.Actions.empty()) return make_error<StringError>("Finalization request is empty", diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index a08502e4497e3..5851cfc4b5d5c 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -528,7 +528,7 @@ class ErrorDiagnostic : public ErrorInfo<ErrorDiagnostic> { SMRange getRange() const { return Range; } static Error get(const SourceMgr &SM, SMLoc Loc, const Twine &ErrMsg, - SMRange Range = std::nullopt) { + SMRange Range = {}) { return make_error<ErrorDiagnostic>( SM.GetMessage(Loc, SourceMgr::DK_Error, ErrMsg), Range); } diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index b546e816419e3..4e1602703fb35 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -8,6 +8,7 @@ #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/TargetParser/Triple.h" @@ -25,35 +26,35 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, using VectorLibrary = llvm::driver::VectorLibrary; switch (Veclib) { case VectorLibrary::Accelerate: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate, + 
TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::Accelerate, TargetTriple); break; case VectorLibrary::LIBMVEC: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::LIBMVEC, TargetTriple); break; case VectorLibrary::MASSV: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::MASSV, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::MASSV, TargetTriple); break; case VectorLibrary::SVML: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::SVML, TargetTriple); break; case VectorLibrary::SLEEF: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SLEEFGNUABI, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::SLEEFGNUABI, TargetTriple); break; case VectorLibrary::Darwin_libsystem_m: TLII->addVectorizableFunctionsFromVecLib( - TargetLibraryInfoImpl::DarwinLibSystemM, TargetTriple); + llvm::VectorLibrary::DarwinLibSystemM, TargetTriple); break; case VectorLibrary::ArmPL: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::ArmPL, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::ArmPL, TargetTriple); break; case VectorLibrary::AMDLIBM: - TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::AMDLIBM, + TLII->addVectorizableFunctionsFromVecLib(llvm::VectorLibrary::AMDLIBM, TargetTriple); break; default: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index c4aa2c7638450..86060d1d2b0b3 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -29,7 +29,6 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include <memory> -#include <string> #include <utility> using namespace llvm; @@ -148,21 +147,27 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs, Image->setAlignment(Align(object::OffloadBinary::getAlignment())); StringRef Binary(Buf.data(), Buf.size()); - assert(identify_magic(Binary) == file_magic::offload_binary && - "Invalid binary format"); + uint64_t BeginOffset = 0; + uint64_t EndOffset = Binary.size(); + + // Optionally use an offload binary for its offload dumping support. // The device image struct contains the pointer to the beginning and end of // the image stored inside of the offload binary. There should only be one // of these for each buffer so we parse it out manually. 
- const auto *Header = - reinterpret_cast<const object::OffloadBinary::Header *>( - Binary.bytes_begin()); - const auto *Entry = reinterpret_cast<const object::OffloadBinary::Entry *>( - Binary.bytes_begin() + Header->EntryOffset); - - auto *Begin = ConstantInt::get(getSizeTTy(M), Entry->ImageOffset); - auto *Size = - ConstantInt::get(getSizeTTy(M), Entry->ImageOffset + Entry->ImageSize); + if (identify_magic(Binary) == file_magic::offload_binary) { + const auto *Header = + reinterpret_cast<const object::OffloadBinary::Header *>( + Binary.bytes_begin()); + const auto *Entry = + reinterpret_cast<const object::OffloadBinary::Entry *>( + Binary.bytes_begin() + Header->EntryOffset); + BeginOffset = Entry->ImageOffset; + EndOffset = Entry->ImageOffset + Entry->ImageSize; + } + + auto *Begin = ConstantInt::get(getSizeTTy(M), BeginOffset); + auto *Size = ConstantInt::get(getSizeTTy(M), EndOffset); Constant *ZeroBegin[] = {Zero, Begin}; Constant *ZeroSize[] = {Zero, Size}; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0e5926ff0fb18..ac86fa859967e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -528,9 +528,15 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION); Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems); auto Int32Ty = Type::getInt32Ty(Builder.getContext()); - constexpr const size_t MaxDim = 3; + constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); - Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + + Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait); + + Value *DynCGroupMemFallbackFlag = + Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback)); + DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2); + Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag); assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty()); @@ -559,7 +565,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGroupMem}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { @@ -8224,7 +8230,8 @@ static void emitTargetCall( OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies, - bool HasNoWait) { + bool HasNoWait, Value *DynCGroupMem, + OMPDynGroupprivateFallbackType DynCGroupMemFallback) { // Generate a function call to the host fallback implementation of the target // region. This is called by the host when no offload entry was generated for // the target region and when the offloading call fails at runtime. @@ -8360,12 +8367,13 @@ static void emitTargetCall( /*isSigned=*/false) : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // Request zero groupprivate bytes by default. 
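+ // Callers that need dynamic groupprivate memory pass an explicit + // DynCGroupMem value together with a fallback strategy.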
+ if (!DynCGroupMem) + DynCGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs( + NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem, + HasNoWait, DynCGroupMemFallback); // Assume no error was returned because TaskBodyCB and // EmitTargetCallFallbackCB don't produce any. @@ -8414,7 +8422,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector<DependData> &Dependencies, bool HasNowait) { + const SmallVector<DependData> &Dependencies, bool HasNowait, + Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -8437,7 +8446,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( if (!Config.isTargetDevice()) emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + CustomMapperCB, Dependencies, HasNowait, DynCGroupMem, + DynCGroupMemFallback); return Builder.saveIP(); } @@ -8460,9 +8470,8 @@ OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const { Config.separator()); } -GlobalVariable * -OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace) { +GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable( + Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) { auto &Elem = *InternalVars.try_emplace(Name, nullptr).first; if (Elem.second) { assert(Elem.second->getValueType() == Ty && @@ -8472,16 +8481,25 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, // variable for possibly changing that to internal or private, or maybe // create different versions of the function for different OMP internal // variables. + const DataLayout &DL = M.getDataLayout(); + // TODO: Investigate why AMDGPU expects AS 0 for globals even though the + // default global AS is 1. + // See double-target-call-with-declare-target.f90 and + // declare-target-vars-in-target-region.f90 libomptarget + // tests. + unsigned AddressSpaceVal = AddressSpace ? *AddressSpace + : M.getTargetTriple().isAMDGPU() + ? 0 + : DL.getDefaultGlobalsAddressSpace(); auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32 ? 
GlobalValue::InternalLinkage : GlobalValue::CommonLinkage; auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, Constant::getNullValue(Ty), Elem.first(), /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AddressSpace); - const DataLayout &DL = M.getDataLayout(); + GlobalValue::NotThreadLocal, AddressSpaceVal); const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); - const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); + const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal); GV->setAlignment(std::max(TypeAlign, PtrAlign)); Elem.second = GV; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 95d954f6b8174..4d4ffe93a8067 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) { AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default; -namespace llvm { - //===----------------------------------------------------------------------===// // SlotTracker Class: Enumerate slot numbers for unnamed values //===----------------------------------------------------------------------===// /// This class provides computation of slot numbers for LLVM Assembly writing. /// -class SlotTracker : public AbstractSlotTrackerStorage { +class llvm::SlotTracker : public AbstractSlotTrackerStorage { public: /// ValueMap - A mapping of Values to slot numbers. using ValueMap = DenseMap<const Value *, unsigned>; @@ -943,8 +941,6 @@ class SlotTracker : public AbstractSlotTrackerStorage { void processDbgRecordMetadata(const DbgRecord &DVR); }; -} // end namespace llvm - ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M, const Function *F) : M(M), F(F), Machine(&Machine) {} @@ -2935,7 +2931,7 @@ class AssemblyWriter { // printInfoComment - Print a little comment after the instruction indicating // which slot it occupies. - void printInfoComment(const Value &V); + void printInfoComment(const Value &V, bool isMaterializable = false); // printGCRelocateComment - print comment after call to the gc.relocate // intrinsic indicating base and derived pointer names. @@ -3967,7 +3963,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { if (Attrs.hasAttributes()) Out << " #" << Machine.getAttributeGroupSlot(Attrs); - printInfoComment(*GV); + printInfoComment(*GV, GV->isMaterializable()); } void AssemblyWriter::printAlias(const GlobalAlias *GA) { @@ -4005,7 +4001,7 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { Out << '"'; } - printInfoComment(*GA); + printInfoComment(*GA, GA->isMaterializable()); Out << '\n'; } @@ -4044,7 +4040,7 @@ void AssemblyWriter::printIFunc(const GlobalIFunc *GI) { printMetadataAttachments(MDs, ", "); } - printInfoComment(*GI); + printInfoComment(*GI, GI->isMaterializable()); Out << '\n'; } @@ -4323,13 +4319,12 @@ void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) { /// printInfoComment - Print a little comment after the instruction indicating /// which slot it occupies. 
-void AssemblyWriter::printInfoComment(const Value &V) { +void AssemblyWriter::printInfoComment(const Value &V, bool isMaterializable) { if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V)) printGCRelocateComment(*Relocate); - if (AnnotationWriter) { + if (AnnotationWriter && !isMaterializable) AnnotationWriter->printInfoComment(V, Out); - } if (PrintInstDebugLocs) { if (auto *I = dyn_cast<Instruction>(&V)) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b838e36c8824f..58b7ddd0381e5 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -730,7 +730,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // (arm|aarch64).neon.bfdot.*'. Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name) - .Cases("v2f32.v8i8", "v4f32.v16i8", + .Cases({"v2f32.v8i8", "v4f32.v16i8"}, IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot) .Default(Intrinsic::not_intrinsic); @@ -1456,7 +1456,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (F->arg_size() == 1) { Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name) - .Cases("brev32", "brev64", Intrinsic::bitreverse) + .Cases({"brev32", "brev64"}, Intrinsic::bitreverse) .Case("clz.i", Intrinsic::ctlz) .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); @@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("fabs.")) // nvvm.fabs.{f,ftz.f,d} Expand = Name == "f" || Name == "ftz.f" || Name == "d"; + else if (Name.consume_front("ex2.approx.")) + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Expand = + Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2"; else if (Name.consume_front("max.") || Name.consume_front("min.")) // nvvm.{min,max}.{i,ii,ui,ull} Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" || @@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz : Intrinsic::nvvm_fabs; Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); + } else if (Name.consume_front("ex2.approx.")) { + // nvvm.ex2.approx.{f,ftz.f,d,f16x2} + Intrinsic::ID IID = Name.starts_with("ftz") ? 
Intrinsic::nvvm_ex2_approx_ftz + : Intrinsic::nvvm_ex2_approx; + Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0)); } else if (Name.starts_with("atomic.load.add.f32.p") || Name.starts_with("atomic.load.add.f64.p")) { Value *Ptr = CI->getArgOperand(0); diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index 10572ff708bd3..ebdc2ca08d102 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -67,6 +67,7 @@ add_llvm_component_library(LLVMCore ReplaceConstant.cpp Statepoint.cpp StructuralHash.cpp + SystemLibraries.cpp Type.cpp TypedPointerType.cpp TypeFinder.cpp diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 51fb40bad201d..e3e8d895a63f4 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -535,7 +535,7 @@ struct ConstantPtrAuthKeyType { unsigned getHash() const { return hash_combine_range(Operands); } - using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass; + using TypeClass = ConstantInfo<ConstantPtrAuth>::TypeClass; ConstantPtrAuth *create(TypeClass *Ty) const { return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]), diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 27d8294b01264..604730e0d3004 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3036,6 +3036,37 @@ LLVMDbgRecordRef LLVMGetPreviousDbgRecord(LLVMDbgRecordRef Rec) { return wrap(&*--I); } +LLVMMetadataRef LLVMDbgRecordGetDebugLoc(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgRecord>(Rec)->getDebugLoc().getAsMDNode()); +} + +LLVMDbgRecordKind LLVMDbgRecordGetKind(LLVMDbgRecordRef Rec) { + DbgRecord *Record = unwrap<DbgRecord>(Rec); + if (isa<DbgLabelRecord>(Record)) + return LLVMDbgRecordLabel; + DbgVariableRecord *VariableRecord = dyn_cast<DbgVariableRecord>(Record); + assert(VariableRecord && "unexpected record"); + if (VariableRecord->isDbgDeclare()) + return LLVMDbgRecordDeclare; + if (VariableRecord->isDbgValue()) + return LLVMDbgRecordValue; + assert(VariableRecord->isDbgAssign() && "unexpected record"); + return LLVMDbgRecordAssign; +} + +LLVMValueRef LLVMDbgVariableRecordGetValue(LLVMDbgRecordRef Rec, + unsigned OpIdx) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getValue(OpIdx)); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetVariable(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getRawVariable()); +} + +LLVMMetadataRef LLVMDbgVariableRecordGetExpression(LLVMDbgRecordRef Rec) { + return wrap(unwrap<DbgVariableRecord>(Rec)->getRawExpression()); +} + unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) { if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) { return FPI->arg_size(); diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 58836068a4929..0f9d064857dc4 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -40,7 +40,6 @@ #include <algorithm> #include <cassert> #include <optional> -#include <utility> using namespace llvm; using namespace llvm::at; @@ -247,7 +246,7 @@ void DebugInfoFinder::processType(DIType *DT) { } } -void DebugInfoFinder::processImportedEntity(DIImportedEntity *Import) { +void DebugInfoFinder::processImportedEntity(const DIImportedEntity *Import) { auto *Entity = Import->getEntity(); if (auto *T = dyn_cast<DIType>(Entity)) processType(T); @@ -307,15 +306,13 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) { } } - for (auto *N : SP->getRetainedNodes()) { - if (auto *Var = dyn_cast_or_null<DILocalVariable>(N)) - processVariable(Var); - 
else if (auto *Import = dyn_cast_or_null<DIImportedEntity>(N)) - processImportedEntity(Import); - } + SP->forEachRetainedNode( + [this](const DILocalVariable *LV) { processVariable(LV); }, + [](const DILabel *L) {}, + [this](const DIImportedEntity *IE) { processImportedEntity(IE); }); } -void DebugInfoFinder::processVariable(DILocalVariable *DV) { +void DebugInfoFinder::processVariable(const DILocalVariable *DV) { if (!NodesSeen.insert(DV).second) return; processScope(DV->getScope()); diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index fafc3254120de..1a6a25e161803 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -962,16 +962,29 @@ DIType *DIDerivedType::getClassType() const { assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); return cast_or_null<DIType>(getExtraData()); } + +// Helper function to extract ConstantAsMetadata from ExtraData, +// handling extra data MDTuple unwrapping if needed. +static ConstantAsMetadata *extractConstantMetadata(Metadata *ExtraData) { + Metadata *ED = ExtraData; + if (auto *Tuple = dyn_cast_or_null<MDTuple>(ED)) { + if (Tuple->getNumOperands() != 1) + return nullptr; + ED = Tuple->getOperand(0); + } + return cast_or_null<ConstantAsMetadata>(ED); +} + uint32_t DIDerivedType::getVBPtrOffset() const { assert(getTag() == dwarf::DW_TAG_inheritance); - if (auto *CM = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *CM = extractConstantMetadata(getExtraData())) if (auto *CI = dyn_cast_or_null<ConstantInt>(CM->getValue())) return static_cast<uint32_t>(CI->getZExtValue()); return 0; } Constant *DIDerivedType::getStorageOffsetInBits() const { assert(getTag() == dwarf::DW_TAG_member && isBitField()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } @@ -980,13 +993,13 @@ Constant *DIDerivedType::getConstant() const { assert((getTag() == dwarf::DW_TAG_member || getTag() == dwarf::DW_TAG_variable) && isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } Constant *DIDerivedType::getDiscriminantValue() const { assert(getTag() == dwarf::DW_TAG_member && !isStaticMember()); - if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData())) + if (auto *C = extractConstantMetadata(getExtraData())) return C->getValue(); return nullptr; } @@ -1428,6 +1441,19 @@ bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); return F->getSubprogram() == this; } + +const DIScope *DISubprogram::getRawRetainedNodeScope(const MDNode *N) { + return visitRetainedNode<DIScope *>( + N, [](const DILocalVariable *LV) { return LV->getScope(); }, + [](const DILabel *L) { return L->getScope(); }, + [](const DIImportedEntity *IE) { return IE->getScope(); }, + [](const Metadata *N) { return nullptr; }); +} + +const DILocalScope *DISubprogram::getRetainedNodeScope(const MDNode *N) { + return cast<DILocalScope>(getRawRetainedNodeScope(N)); +} + DILexicalBlockBase::DILexicalBlockBase(LLVMContext &C, unsigned ID, StorageType Storage, ArrayRef<Metadata *> Ops) diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp index 01dafcab94ce9..bfba6e0cab6bf 100644 --- a/llvm/lib/IR/DebugLoc.cpp +++ b/llvm/lib/IR/DebugLoc.cpp @@ -10,10 +10,11 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfo.h" +using namespace llvm; + #if 
LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN #include "llvm/Support/Signals.h" -namespace llvm { DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) { if (!ShouldCollectTrace) return; @@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() { auto &[Depth, StackTrace] = StackTraces.emplace_back(); Depth = sys::getStackTrace(StackTrace); } -} // namespace llvm #endif -using namespace llvm; - #if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L) : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L), diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index d9357bba75510..6b1fd3907dc41 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -12,8 +12,9 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Compiler.h" -namespace llvm { +using namespace llvm; +namespace llvm { template <typename T> DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param) : Ref(const_cast<T *>(Param)) {} @@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const { template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>; template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILabel>; template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>; +} // namespace llvm DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI) : DbgRecord(ValueKind, DVI->getDebugLoc()), @@ -755,5 +757,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom( // We inserted a block at the end, return that range. return {First->getIterator(), StoredDbgRecords.end()}; } - -} // end namespace llvm diff --git a/llvm/lib/IR/EHPersonalities.cpp b/llvm/lib/IR/EHPersonalities.cpp index 9297a82e7d2b0..12ae4748e1f4a 100644 --- a/llvm/lib/IR/EHPersonalities.cpp +++ b/llvm/lib/IR/EHPersonalities.cpp @@ -47,7 +47,8 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) { .Case("__C_specific_handler", EHPersonality::MSVC_TableSEH) .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) .Case("ProcessCLRException", EHPersonality::CoreCLR) - .Case("rust_eh_personality", EHPersonality::Rust) + // Rust mangles its personality function, so we can't test exact equality. + .EndsWith("rust_eh_personality", EHPersonality::Rust) .Case("__gxx_wasm_personality_v0", EHPersonality::Wasm_CXX) .Case("__xlcxx_personality_v1", EHPersonality::XL_CXX) .Case("__zos_cxx_personality_v2", EHPersonality::ZOS_CXX) @@ -77,7 +78,8 @@ StringRef llvm::getEHPersonalityName(EHPersonality Pers) { case EHPersonality::CoreCLR: return "ProcessCLRException"; case EHPersonality::Rust: - return "rust_eh_personality"; + llvm_unreachable( + "Cannot get personality name of Rust personality, since it is mangled"); case EHPersonality::Wasm_CXX: return "__gxx_wasm_personality_v0"; case EHPersonality::XL_CXX: diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp index 67f21d3756e93..c41d7b3181a37 100644 --- a/llvm/lib/IR/FPEnv.cpp +++ b/llvm/lib/IR/FPEnv.cpp @@ -19,9 +19,10 @@ #include "llvm/IR/Intrinsics.h" #include <optional> -namespace llvm { +using namespace llvm; -std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { +std::optional<RoundingMode> +llvm::convertStrToRoundingMode(StringRef RoundingArg) { // For dynamic rounding mode, we use round to nearest but we will set the // 'exact' SDNodeFlag so that the value will not be rounded. 
return StringSwitch<std::optional<RoundingMode>>(RoundingArg) @@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) { .Default(std::nullopt); } -std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { +std::optional<StringRef> +llvm::convertRoundingModeToStr(RoundingMode UseRounding) { std::optional<StringRef> RoundingStr; switch (UseRounding) { case RoundingMode::Dynamic: @@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) { } std::optional<fp::ExceptionBehavior> -convertStrToExceptionBehavior(StringRef ExceptionArg) { +llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) { return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg) .Case("fpexcept.ignore", fp::ebIgnore) .Case("fpexcept.maytrap", fp::ebMayTrap) @@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) { } std::optional<StringRef> -convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { +llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { std::optional<StringRef> ExceptStr; switch (UseExcept) { case fp::ebStrict: @@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) { return ExceptStr; } -Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { +Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) { Intrinsic::ID IID = Intrinsic::not_intrinsic; switch (Instr.getOpcode()) { case Instruction::FCmp: @@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) { return IID; } - -} // namespace llvm diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index fc067459dcba3..31a294447152e 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -396,6 +396,9 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, case FramePointerKind::NonLeaf: B.addAttribute("frame-pointer", "non-leaf"); break; + case FramePointerKind::NonLeafNoReserve: + B.addAttribute("frame-pointer", "non-leaf-no-reserve"); + break; case FramePointerKind::All: B.addAttribute("frame-pointer", "all"); break; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 88dbd176e0d3f..95edb2e8e56d8 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True, const Twine &Name) { Value *Ret = CreateSelectFMF(C, True, False, {}, Name); if (auto *SI = dyn_cast<SelectInst>(Ret)) { - setExplicitlyUnknownBranchWeightsIfProfiled( - *SI, *SI->getParent()->getParent(), PassName); + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName); } return Ret; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 87037c3a45140..ca7605ae53453 100644 --- 
a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 62fd62caad3d6..33947542bcf16 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -34,8 +34,6 @@ static cl::opt<bool> ImportConstantsWithRefs( "import-constants-with-refs", cl::init(true), cl::Hidden, cl::desc("Import constant global variables with references")); -constexpr uint32_t FunctionSummary::ParamAccess::RangeWidth; - FunctionSummary FunctionSummary::ExternalNode = FunctionSummary::makeDummyFunctionSummary( SmallVector<FunctionSummary::EdgeTy, 0>()); @@ -88,8 +86,6 @@ std::pair<unsigned, unsigned> FunctionSummary::specialRefCounts() const { return {RORefCnt, WORefCnt}; } -constexpr uint64_t ModuleSummaryIndex::BitcodeSummaryVersion; - uint64_t ModuleSummaryIndex::getFlags() const { uint64_t Flags = 0; // Flags & 0x4 is reserved. DO NOT REUSE. diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index 39e5463cb6fc3..c3e54a0fc0c7e 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -17,7 +17,8 @@ #include "ConstantsContext.h" -namespace llvm { +using namespace llvm; + bool Operator::hasPoisonGeneratingFlags() const { switch (getOpcode()) { case Instruction::Add: @@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const { O << " afn"; } } -} // namespace llvm diff --git a/llvm/lib/IR/PassRegistry.cpp b/llvm/lib/IR/PassRegistry.cpp index 94afbb52d70e3..a91bb563af4bb 100644 --- a/llvm/lib/IR/PassRegistry.cpp +++ b/llvm/lib/IR/PassRegistry.cpp @@ -17,7 +17,6 @@ #include "llvm/PassInfo.h" #include <cassert> #include <memory> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp index 4e27086e97ac5..cb1b91a98b036 100644 --- a/llvm/lib/IR/PassTimingInfo.cpp +++ b/llvm/lib/IR/PassTimingInfo.cpp @@ -32,10 +32,10 @@ using namespace llvm; #define DEBUG_TYPE "time-passes" -namespace llvm { +using namespace llvm; -bool TimePassesIsEnabled = false; -bool TimePassesPerRun = false; +bool llvm::TimePassesIsEnabled = false; +bool llvm::TimePassesPerRun = false; static cl::opt<bool, true> EnableTiming( "time-passes", cl::location(TimePassesIsEnabled), cl::Hidden, @@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo; } // namespace legacy } // namespace -Timer *getPassTimer(Pass *P) { +Timer *llvm::getPassTimer(Pass *P) { legacy::PassTimingInfo::init(); if (legacy::PassTimingInfo::TheTimeInfo) return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P); @@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) { /// If timing is enabled, report the times collected up to now and then reset /// them. 
-void reportAndResetTimings(raw_ostream *OutStream) { +void llvm::reportAndResetTimings(raw_ostream *OutStream) { if (legacy::PassTimingInfo::TheTimeInfo) legacy::PassTimingInfo::TheTimeInfo->print(OutStream); } @@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) { PIC.registerAfterAnalysisCallback( [this](StringRef P, Any) { this->stopAnalysisTimer(P); }); } - -} // namespace llvm diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index fc2be5188f456..94dbe1f3988b8 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I, } void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, - Function &F, - StringRef PassName) { + StringRef PassName, + const Function *F) { + F = F ? F : I.getFunction(); + assert(F && "Either pass an instruction attached to a Function, or explicitly " + "pass the Function that it will be attached to"); + if (std::optional<Function::ProfileCount> EC = F->getEntryCount(); EC && EC->getCount() > 0) setExplicitlyUnknownBranchWeights(I, PassName); } diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp index 59f218cc3683b..3c05f4b1f86a2 100644 --- a/llvm/lib/IR/PseudoProbe.cpp +++ b/llvm/lib/IR/PseudoProbe.cpp @@ -19,9 +19,7 @@ using namespace llvm; -namespace llvm { - -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const DILocation *DIL) { if (DIL) { auto Discriminator = DIL->getDiscriminator(); @@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) { return std::nullopt; } -std::optional<PseudoProbe> +static std::optional<PseudoProbe> extractProbeFromDiscriminator(const Instruction &Inst) { assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) && "Only call instructions should have pseudo probe encodes as their " @@ -53,7 +51,7 @@ -std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { +std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) { if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { PseudoProbe Probe; Probe.Id = II->getIndex()->getZExtValue(); @@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) { return std::nullopt; } -void setProbeDistributionFactor(Instruction &Inst, float Factor) { +void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) { assert(Factor >= 0 && Factor <= 1 && "Distribution factor must be in [0, 1.0]"); if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) { @@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) { } } } - -} // namespace llvm diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp index 962368f061851..b3586b45a23f2 100644 --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -16,7 +16,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" -namespace llvm { +using namespace llvm; static bool isExpandableUser(User *U) { return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U); } @@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt, return NewInsts; } -bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, - Function *RestrictToFunc, - bool RemoveDeadConstants, - bool IncludeSelf) { +bool 
llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, + Function *RestrictToFunc, + bool RemoveDeadConstants, + bool IncludeSelf) { // Find all expandable direct users of Consts. SmallVector<Constant *> Stack; for (Constant *C : Consts) { @@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts, return Changed; } - -} // namespace llvm diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..ee23b58742b64 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/SystemLibraries.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -17,11 +20,64 @@ using namespace llvm; using namespace RTLIB; +#define GET_RUNTIME_LIBCALLS_INFO #define GET_INIT_RUNTIME_LIBCALL_NAMES #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME #include "llvm/IR/RuntimeLibcalls.inc" +RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Triple &TT, + ExceptionHandling ExceptionModel, + FloatABI::ABIType FloatABI, + EABI EABIVersion, StringRef ABIName) { + // FIXME: The ExceptionModel parameter is to handle the field in + // TargetOptions. This interface fails to distinguish the forced disable + // case for targets which support exceptions by default. This should + // probably be a module flag and removed from TargetOptions. + if (ExceptionModel == ExceptionHandling::None) + ExceptionModel = TT.getDefaultExceptionHandling(); + + initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName); + + // TODO: Tablegen should generate these sets + switch (ClVectorLibrary) { + case VectorLibrary::SLEEFGNUABI: + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl__ZGVnN2vl8_modf, RTLIB::impl__ZGVnN4vl4_modff, + RTLIB::impl__ZGVsNxvl8_modf, RTLIB::impl__ZGVsNxvl4_modff, + RTLIB::impl__ZGVnN2vl8l8_sincos, RTLIB::impl__ZGVnN4vl4l4_sincosf, + RTLIB::impl__ZGVsNxvl8l8_sincos, RTLIB::impl__ZGVsNxvl4l4_sincosf, + RTLIB::impl__ZGVnN4vl4l4_sincospif, RTLIB::impl__ZGVnN2vl8l8_sincospi, + RTLIB::impl__ZGVsNxvl4l4_sincospif, + RTLIB::impl__ZGVsNxvl8l8_sincospi}) + setAvailable(Impl); + break; + case VectorLibrary::ArmPL: + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl_armpl_vmodfq_f64, RTLIB::impl_armpl_vmodfq_f32, + RTLIB::impl_armpl_svmodf_f64_x, RTLIB::impl_armpl_svmodf_f32_x, + RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32, + RTLIB::impl_armpl_svsincos_f64_x, RTLIB::impl_armpl_svsincos_f32_x, + RTLIB::impl_armpl_vsincospiq_f32, RTLIB::impl_armpl_vsincospiq_f64, + RTLIB::impl_armpl_svsincospi_f32_x, + RTLIB::impl_armpl_svsincospi_f64_x}) + setAvailable(Impl); + + for (RTLIB::LibcallImpl Impl : + {RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32}) + setLibcallImplCallingConv(Impl, CallingConv::AArch64_VectorCall); + + break; + default: + break; + } +} + +RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Module &M) + : RuntimeLibcallsInfo(M.getTargetTriple()) { + // TODO: Consider module flags +} + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. 
void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, @@ -72,3 +128,207 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + static constexpr Attribute::AttrKind CommonPtrArgAttrs[] = { + Attribute::NoAlias, Attribute::WriteOnly, Attribute::NonNull}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + case RTLIB::impl__ZGVnN2vl8_modf: + case RTLIB::impl__ZGVnN4vl4_modff: + case RTLIB::impl__ZGVsNxvl8_modf: + case RTLIB::impl__ZGVsNxvl4_modff: + case RTLIB::impl_armpl_vmodfq_f64: + case RTLIB::impl_armpl_vmodfq_f32: + case RTLIB::impl_armpl_svmodf_f64_x: + case RTLIB::impl_armpl_svmodf_f32_x: { + AttrBuilder FuncAttrBuilder(Ctx); + + bool IsF32 = LibcallImpl == RTLIB::impl__ZGVnN4vl4_modff || + LibcallImpl == RTLIB::impl__ZGVsNxvl4_modff || + LibcallImpl == RTLIB::impl_armpl_vmodfq_f32 || + LibcallImpl == RTLIB::impl_armpl_svmodf_f32_x; + + bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl8_modf || + LibcallImpl == RTLIB::impl__ZGVsNxvl4_modff || + LibcallImpl == RTLIB::impl_armpl_svmodf_f64_x || + LibcallImpl == RTLIB::impl_armpl_svmodf_f32_x; + + Type *ScalarTy = IsF32 ? 
Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); + unsigned EC = IsF32 ? 4 : 2; + VectorType *VecTy = VectorType::get(ScalarTy, EC, IsScalable); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + { + AttrBuilder ArgAttrBuilder(Ctx); + for (Attribute::AttrKind AK : CommonPtrArgAttrs) + ArgAttrBuilder.addAttribute(AK); + ArgAttrBuilder.addAlignmentAttr(DL.getABITypeAlign(VecTy)); + Attrs = Attrs.addParamAttributes(Ctx, 1, ArgAttrBuilder); + } + + PointerType *PtrTy = PointerType::get(Ctx, 0); + SmallVector<Type *, 4> ArgTys = {VecTy, PtrTy}; + if (hasVectorMaskArgument(LibcallImpl)) + ArgTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), EC, IsScalable)); + + return {FunctionType::get(VecTy, ArgTys, false), Attrs}; + } + case RTLIB::impl__ZGVnN2vl8l8_sincos: + case RTLIB::impl__ZGVnN4vl4l4_sincosf: + case RTLIB::impl__ZGVsNxvl8l8_sincos: + case RTLIB::impl__ZGVsNxvl4l4_sincosf: + case RTLIB::impl_armpl_vsincosq_f64: + case RTLIB::impl_armpl_vsincosq_f32: + case RTLIB::impl_armpl_svsincos_f64_x: + case RTLIB::impl_armpl_svsincos_f32_x: + case RTLIB::impl__ZGVnN4vl4l4_sincospif: + case RTLIB::impl__ZGVnN2vl8l8_sincospi: + case RTLIB::impl__ZGVsNxvl4l4_sincospif: + case RTLIB::impl__ZGVsNxvl8l8_sincospi: + case RTLIB::impl_armpl_vsincospiq_f32: + case RTLIB::impl_armpl_vsincospiq_f64: + case RTLIB::impl_armpl_svsincospi_f32_x: + case RTLIB::impl_armpl_svsincospi_f64_x: { + AttrBuilder FuncAttrBuilder(Ctx); + + bool IsF32 = LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincospif || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif || + LibcallImpl == RTLIB::impl_armpl_vsincospiq_f32 || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x || + LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincosf || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf || + LibcallImpl == RTLIB::impl_armpl_vsincosq_f32 || + LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x; + + Type *ScalarTy = IsF32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); + unsigned EC = IsF32 ? 
4 : 2; + + bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincos || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf || + LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x || + LibcallImpl == RTLIB::impl_armpl_svsincos_f64_x || + LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif || + LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincospi || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x || + LibcallImpl == RTLIB::impl_armpl_svsincospi_f64_x; + VectorType *VecTy = VectorType::get(ScalarTy, EC, IsScalable); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + { + AttrBuilder ArgAttrBuilder(Ctx); + for (Attribute::AttrKind AK : CommonPtrArgAttrs) + ArgAttrBuilder.addAttribute(AK); + ArgAttrBuilder.addAlignmentAttr(DL.getABITypeAlign(VecTy)); + Attrs = Attrs.addParamAttributes(Ctx, 1, ArgAttrBuilder); + Attrs = Attrs.addParamAttributes(Ctx, 2, ArgAttrBuilder); + } + + PointerType *PtrTy = PointerType::get(Ctx, 0); + SmallVector<Type *, 4> ArgTys = {VecTy, PtrTy, PtrTy}; + if (hasVectorMaskArgument(LibcallImpl)) + ArgTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), EC, IsScalable)); + + return {FunctionType::get(Type::getVoidTy(Ctx), ArgTys, false), Attrs}; + } + default: + return {}; + } + + return {}; +} + +bool RuntimeLibcallsInfo::hasVectorMaskArgument(RTLIB::LibcallImpl Impl) { + /// FIXME: This should be generated by tablegen and support the argument at an + /// arbitrary position + switch (Impl) { + case RTLIB::impl_armpl_svmodf_f64_x: + case RTLIB::impl_armpl_svmodf_f32_x: + case RTLIB::impl_armpl_svsincos_f32_x: + case RTLIB::impl_armpl_svsincos_f64_x: + case RTLIB::impl_armpl_svsincospi_f32_x: + case RTLIB::impl_armpl_svsincospi_f64_x: + return true; + default: + return false; + } +} diff --git a/llvm/lib/IR/SystemLibraries.cpp b/llvm/lib/IR/SystemLibraries.cpp new file mode 100644 index 0000000000000..fa4ac2adb7296 --- /dev/null +++ b/llvm/lib/IR/SystemLibraries.cpp @@ -0,0 +1,34 @@ +//===-----------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/SystemLibraries.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +VectorLibrary llvm::ClVectorLibrary; + +static cl::opt<VectorLibrary, true> ClVectorLibraryOpt( + "vector-library", cl::Hidden, cl::desc("Vector functions library"), + cl::location(llvm::ClVectorLibrary), cl::init(VectorLibrary::NoLibrary), + cl::values( + clEnumValN(VectorLibrary::NoLibrary, "none", + "No vector functions library"), + clEnumValN(VectorLibrary::Accelerate, "Accelerate", + "Accelerate framework"), + clEnumValN(VectorLibrary::DarwinLibSystemM, "Darwin_libsystem_m", + "Darwin libsystem_m"), + clEnumValN(VectorLibrary::LIBMVEC, "LIBMVEC", + "GLIBC Vector Math library"), + clEnumValN(VectorLibrary::MASSV, "MASSV", "IBM MASS vector library"), + clEnumValN(VectorLibrary::SVML, "SVML", "Intel SVML library"), + clEnumValN(VectorLibrary::SLEEFGNUABI, "sleefgnuabi", + "SIMD Library for Evaluating Elementary Functions"), + clEnumValN(VectorLibrary::ArmPL, "ArmPL", "Arm Performance Libraries"), + clEnumValN(VectorLibrary::AMDLIBM, "AMDLIBM", + "AMD vector math library"))); diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 0e9535d24a4cc..682448fe07352 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp index 67882ba0144b4..504233575594d 100644 --- a/llvm/lib/IR/Use.cpp +++ b/llvm/lib/IR/Use.cpp @@ -9,7 +9,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" -namespace llvm { +using namespace llvm; void Use::swap(Use &RHS) { if (Val == RHS.Val) @@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) { if (del) ::operator delete(Start); } - -} // namespace llvm diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index ab44cb4b8a3f7..9bb7c1298593a 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -11,8 +11,11 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicInst.h" +using namespace llvm; + namespace llvm { class BasicBlock; +} //===----------------------------------------------------------------------===// // User Class @@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) { ::operator delete(Storage); } } - -} // namespace llvm diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp index cd1cee16e7473..3bf52f6ef024e 100644 --- a/llvm/lib/IR/ValueSymbolTable.cpp +++ b/llvm/lib/IR/ValueSymbolTable.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" #include <cassert> -#include <utility> using namespace llvm; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7917712846990..fa18c3cd0f404 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination( cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical " "scopes are not dominating")); -namespace llvm { - -struct VerifierSupport { +struct llvm::VerifierSupport { raw_ostream *OS; const Module &M; ModuleSlotTracker MST; @@ -318,8 +316,6 @@ struct VerifierSupport { } }; -} // namespace llvm - namespace { class Verifier : public InstVisitor<Verifier>, VerifierSupport { @@ -1563,11 
+1559,27 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { auto *Node = dyn_cast<MDTuple>(RawNode); CheckDI(Node, "invalid retained nodes list", &N, RawNode); for (Metadata *Op : Node->operands()) { - CheckDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op) || - isa<DIImportedEntity>(Op)), + CheckDI(Op, "nullptr in retained nodes", &N, Node); + + auto True = [](const Metadata *) { return true; }; + auto False = [](const Metadata *) { return false; }; + bool IsTypeCorrect = + DISubprogram::visitRetainedNode<bool>(Op, True, True, True, False); + CheckDI(IsTypeCorrect, "invalid retained nodes, expected DILocalVariable, DILabel or " "DIImportedEntity", &N, Node, Op); + + auto *RetainedNode = cast<DINode>(Op); + auto *RetainedNodeScope = dyn_cast_or_null<DILocalScope>( + DISubprogram::getRawRetainedNodeScope(RetainedNode)); + CheckDI(RetainedNodeScope, + "invalid retained nodes, retained node is not local", &N, Node, + RetainedNode); + CheckDI( + RetainedNodeScope->getSubprogram() == &N, + "invalid retained nodes, retained node does not belong to subprogram", + &N, Node, RetainedNode, RetainedNodeScope); } } CheckDI(!hasConflictingReferenceFlags(N.getFlags()), @@ -2484,7 +2496,8 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, if (Attribute FPAttr = Attrs.getFnAttr("frame-pointer"); FPAttr.isValid()) { StringRef FP = FPAttr.getValueAsString(); - if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved") + if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved" && + FP != "non-leaf-no-reserve") CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } @@ -2554,6 +2567,20 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S, V); } + + if (auto A = Attrs.getFnAttr("modular-format"); A.isValid()) { + StringRef S = A.getValueAsString(); + SmallVector<StringRef> Args; + S.split(Args, ','); + Check(Args.size() >= 5, + "modular-format attribute requires at least 5 arguments", V); + unsigned FirstArgIdx; + Check(!Args[2].getAsInteger(10, FirstArgIdx), + "modular-format attribute first arg index is not an integer", V); + unsigned UpperBound = FT->getNumParams() + (FT->isVarArg() ? 
1 : 0); + Check(FirstArgIdx > 0 && FirstArgIdx <= UpperBound, + "modular-format attribute first arg index is out of bounds", V); + } } void Verifier::verifyUnknownProfileMetadata(MDNode *MD) { Check(MD->getNumOperands() == 2, @@ -6017,6 +6044,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2, "cache type argument to llvm.prefetch must be 0-1", Call); break; + case Intrinsic::reloc_none: { + Check(isa<MDString>( + cast<MetadataAsValue>(Call.getArgOperand(0))->getMetadata()), + "llvm.reloc.none argument must be a metadata string", &Call); + break; + } case Intrinsic::stackprotector: Check(isa<AllocaInst>(Call.getArgOperand(1)->stripPointerCasts()), "llvm.stackprotector parameter #2 must resolve to an alloca.", Call); @@ -6581,6 +6614,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b6182222f6f80..fefc733fa7697 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1076,63 +1076,59 @@ Expected<ArrayRef<SymbolResolution>> LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, ArrayRef<SymbolResolution> Res) { llvm::TimeTraceScope timeScope("LTO add thin LTO"); + const auto BMID = BM.getModuleIdentifier(); ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { assert(!ResTmp.empty()); const SymbolResolution &R = ResTmp.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && R.Prevailing) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (R.Prevailing) - ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()); + ThinLTO.setPrevailingModuleForGUID(GUID, BMID); } } - if (Error Err = - BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(), - [&](GlobalValue::GUID GUID) { - return ThinLTO.isPrevailingModuleForGUID( - GUID, BM.getModuleIdentifier()); - })) + if (Error Err = BM.readSummary( + ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) { + return ThinLTO.isPrevailingModuleForGUID(GUID, BMID); + })) return Err; - LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); + LLVM_DEBUG(dbgs() << "Module " << BMID << "\n"); for (const InputFile::Symbol &Sym : Syms) { assert(!Res.empty()); const SymbolResolution &R = Res.consume_front(); - if (!Sym.getIRName().empty()) { + if (!Sym.getIRName().empty() && + (R.Prevailing || R.FinalDefinitionInLinkageUnit)) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); if (R.Prevailing) { - assert( - ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier())); + assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID)); // For linker redefined symbols (via --wrap or --defsym) we want to // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. 
if (R.LinkerRedefined) - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) S->setLinkage(GlobalValue::WeakAnyLinkage); } // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. if (R.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) { S->setDSOLocal(true); } } } } - if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second) + if (!ThinLTO.ModuleMap.insert({BMID, BM}).second) return make_error<StringError>( "Expected at most one ThinLTO module per bitcode file", inconvertibleErrorCode()); @@ -1143,10 +1139,10 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // This is a fuzzy name matching where only modules with name containing the // specified switch values are going to be compiled. for (const std::string &Name : Conf.ThinLTOModulesToCompile) { - if (BM.getModuleIdentifier().contains(Name)) { - ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM}); - LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier() - << " to compile\n"); + if (BMID.contains(Name)) { + ThinLTO.ModulesToCompile->insert({BMID, BM}); + LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BMID << " to compile\n"); + break; } } } @@ -1400,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { RTLIB::RuntimeLibcallsInfo Libcalls(TT); SmallVector<const char *> LibcallSymbols; - ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls(); - LibcallSymbols.reserve(LibcallImpls.size()); + LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls()); - for (RTLIB::LibcallImpl Impl : LibcallImpls) { - if (Impl != RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (Libcalls.isAvailable(Impl)) LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data()); } diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index f78d9b016d8c9..f215f39f41bfb 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -882,10 +882,7 @@ IRLinker::linkAppendingVarProto(GlobalVariable *DstGV, NG->copyAttributesFrom(SrcGV); forceRenaming(NG, SrcGV->getName()); - Mapper.scheduleMapAppendingVariable( - *NG, - (DstGV && !DstGV->isDeclaration()) ? DstGV->getInitializer() : nullptr, - IsOldStructor, SrcElements); + Mapper.scheduleMapAppendingVariable(*NG, DstGV, IsOldStructor, SrcElements); // Replace any uses of the two global variables with uses of the new // global. 
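// Editorial note (hedged sketch, not part of the patch): the IRMover hunk
// above stops pre-computing the destination initializer at the call site.
// Previously the caller resolved the operand itself:
//
//   Mapper.scheduleMapAppendingVariable(
//       *NG,
//       (DstGV && !DstGV->isDeclaration()) ? DstGV->getInitializer() : nullptr,
//       IsOldStructor, SrcElements);
//
// It now passes DstGV directly, which presumably lets the mapper decide when
// (and whether) to read the initializer; the callee-side change is not shown
// in this diff, so the exact handling there is an assumption.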
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 71bd39763956e..a3eaaa743039d 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -520,7 +520,7 @@ GOFFObjectWriter::GOFFObjectWriter( std::unique_ptr<MCGOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : TargetObjectWriter(std::move(MOTW)), OS(OS) {} -GOFFObjectWriter::~GOFFObjectWriter() {} +GOFFObjectWriter::~GOFFObjectWriter() = default; uint64_t GOFFObjectWriter::writeObject() { uint64_t Size = GOFFWriter(OS, *Asm).writeObject(); diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp index 5eda039853ca8..ebed411454087 100644 --- a/llvm/lib/MC/MCDXContainerWriter.cpp +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -16,7 +16,7 @@ using namespace llvm; -MCDXContainerTargetWriter::~MCDXContainerTargetWriter() {} +MCDXContainerTargetWriter::~MCDXContainerTargetWriter() = default; uint64_t DXContainerObjectWriter::writeObject() { auto &Asm = *this->Asm; diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index 8b228db0e8b30..ad6397bce70f0 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; -MCGOFFStreamer::~MCGOFFStreamer() {} +MCGOFFStreamer::~MCGOFFStreamer() = default; GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index dd1bc2be5feb4..3c9ab8e108ddd 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -228,11 +228,9 @@ class AsmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; const AsmToken &Lex() override; @@ -312,7 +310,7 @@ class AsmParser : public MCAsmParser { void printMacroInstantiations(); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index 5dd79946d8779..2a796fb1cfe11 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -21,7 +21,6 @@ #include <cassert> #include <cstdint> #include <limits> -#include <utility> using namespace llvm; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 04e12e56c4262..6e685c60a406e 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/SectionKind.h" #include "llvm/Support/SMLoc.h" #include <cstdint> -#include <utility> using namespace llvm; diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 1a3752f71f065..c3faab89bb258 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/SMLoc.h" #include 
<cassert> #include <cstdint> -#include <utility> using namespace llvm; @@ -695,15 +694,15 @@ bool ELFAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) { static MCSymbolAttr MCAttrForString(StringRef Type) { return StringSwitch<MCSymbolAttr>(Type) - .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) - .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) - .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) - .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) - .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) - .Cases("STT_GNU_IFUNC", "gnu_indirect_function", - MCSA_ELF_TypeIndFunction) - .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) - .Default(MCSA_Invalid); + .Cases({"STT_FUNC", "function"}, MCSA_ELF_TypeFunction) + .Cases({"STT_OBJECT", "object"}, MCSA_ELF_TypeObject) + .Cases({"STT_TLS", "tls_object"}, MCSA_ELF_TypeTLS) + .Cases({"STT_COMMON", "common"}, MCSA_ELF_TypeCommon) + .Cases({"STT_NOTYPE", "notype"}, MCSA_ELF_TypeNoType) + .Cases({"STT_GNU_IFUNC", "gnu_indirect_function"}, + MCSA_ELF_TypeIndFunction) + .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) + .Default(MCSA_Invalid); } /// parseDirectiveELFType diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 8a8f11122673f..3a85770a2783d 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -483,11 +483,9 @@ class MasmParser : public MCAsmParser { AssemblerDialect = i; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) override; - bool Warning(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; - bool printError(SMLoc L, const Twine &Msg, - SMRange Range = std::nullopt) override; + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) override; + bool printError(SMLoc L, const Twine &Msg, SMRange Range = {}) override; enum ExpandKind { ExpandMacros, DoNotExpandMacros }; const AsmToken &Lex(ExpandKind ExpandNextToken); @@ -592,7 +590,7 @@ class MasmParser : public MCAsmParser { bool expandStatement(SMLoc Loc); void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - SMRange Range = std::nullopt) const { + SMRange Range = {}) const { ArrayRef<SMRange> Ranges(Range); SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges); } @@ -5325,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() { bool MasmParser::isMacroLikeDirective() { if (getLexer().is(AsmToken::Identifier)) { bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier()) - .CasesLower("repeat", "rept", true) + .CasesLower({"repeat", "rept"}, true) .CaseLower("while", true) - .CasesLower("for", "irp", true) - .CasesLower("forc", "irpc", true) + .CasesLower({"for", "irp"}, true) + .CasesLower({"forc", "irpc"}, true) .Default(false); if (IsMacroLike) return true; diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index b493337b39317..11e42118a29ef 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -24,7 +24,6 @@ #include <algorithm> #include <cassert> #include <limits> -#include <memory> #include <sstream> #include <vector> diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index ba9ef00f9f0d8..7fd92bf974b95 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -221,7 +221,7 @@ bool MCRegisterInfo::regsOverlap(MCRegister RegA, MCRegister RegB) const { return false; } -bool MCRegisterInfo::isArtificialRegUnit(unsigned Unit) const { 
+bool MCRegisterInfo::isArtificialRegUnit(MCRegUnit Unit) const { for (MCRegUnitRootIterator Root(Unit, this); Root.isValid(); ++Root) if (isArtificial(*Root)) return true; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 39542bfbdd8e3..a8535dfa8a5d3 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -28,12 +28,12 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> #include <cstdint> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -585,7 +585,12 @@ void MachObjectWriter::computeSymbolTable( unsigned Index = 1; for (MCSection &Sec : Asm) SectionIndexMap[&Sec] = Index++; - assert(Index <= 256 && "Too many sections!"); + + // Section indices begin from 1 in MachO. Only sections 1-255 can be indexed + // into section symbols. Referencing a section with index larger than 255 will + // not set n_sect for these symbols. + if (Index > 255) + getContext().reportError(SMLoc(), "Too many sections!"); // Build the string table. for (const MCSymbol &Symbol : Asm.symbols()) { @@ -622,7 +627,8 @@ void MachObjectWriter::computeSymbolTable( ExternalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); ExternalSymbolData.push_back(MSD); } } @@ -646,7 +652,8 @@ void MachObjectWriter::computeSymbolTable( LocalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); LocalSymbolData.push_back(MSD); } } diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? 
UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/ObjCopy/COFF/COFFWriter.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h index 66d7f01c87f18..3ee0e06b92ae4 100644 --- a/llvm/lib/ObjCopy/COFF/COFFWriter.h +++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h @@ -50,7 +50,7 @@ class COFFWriter { Expected<uint32_t> virtualAddressToFileAddress(uint32_t RVA); public: - virtual ~COFFWriter() {} + virtual ~COFFWriter() = default; Error write(); COFFWriter(Object &Obj, raw_ostream &Out) diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h b/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h index cbb09f5ec8e0d..710ae95e57495 100644 --- a/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h +++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObject.h @@ -12,7 +12,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/DXContainer.h" -#include <vector> namespace llvm { namespace objcopy { diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 4f6473f515ddd..2783ef27ac9de 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -134,7 +134,7 @@ template <class ELFT> class ELFSectionWriter : public SectionWriter { using Elf_Sym = typename ELFT::Sym; public: - ~ELFSectionWriter() override {} + ~ELFSectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; Error visit(const GnuDebugLinkSection &Sec) override; @@ -180,7 +180,7 @@ template <class ELFT> class ELFSectionSizer : public MutableSectionVisitor { class BinarySectionWriter : public SectionWriter { public: - ~BinarySectionWriter() override {} + ~BinarySectionWriter() override = default; Error visit(const SymbolTableSection &Sec) override; Error visit(const RelocationSection &Sec) override; @@ -346,7 +346,7 @@ template <class ELFT> class ELFWriter : public Writer { size_t totalSize() const; public: - ~ELFWriter() override {} + ~ELFWriter() override = default; bool WriteSectionHeaders; // For --only-keep-debug, select an alternative section/segment layout @@ -367,7 +367,7 @@ class BinaryWriter : public Writer { uint64_t TotalSize = 0; public: - ~BinaryWriter() override {} + ~BinaryWriter() override = default; Error finalize() override; Error write() override; BinaryWriter(Object &Obj, raw_ostream &Out, const CommonConfig &Config) @@ -784,7 +784,7 @@ class SectionIndexSection : public SectionBase { SymbolTableSection *Symbols = nullptr; public: - ~SectionIndexSection() override {} + ~SectionIndexSection() override = default; void addIndex(uint32_t Index) { assert(Size > 0); Indexes.push_back(Index); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index 8d2c02dc37c99..e45cc547ee446 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -9,7 +9,6 @@ #include "MachOObject.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/SystemZ/zOSSupport.h" -#include <unordered_set> using namespace llvm; using namespace llvm::objcopy::macho; diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h index e315e6fd9b117..940ba4c2d879e 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.h +++ b/llvm/lib/ObjCopy/MachO/MachOReader.h @@ -23,7 +23,7 @@ namespace macho { // raw binaries and regular MachO object files. 
class Reader { public: - virtual ~Reader(){}; + virtual ~Reader() = default; virtual Expected<std::unique_ptr<Object>> create() const = 0; }; diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h index 8620548ed5991..47639ad82fa75 100644 --- a/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h +++ b/llvm/lib/ObjCopy/XCOFF/XCOFFWriter.h @@ -20,7 +20,7 @@ namespace xcoff { class XCOFFWriter { public: - virtual ~XCOFFWriter() {} + virtual ~XCOFFWriter() = default; XCOFFWriter(Object &Obj, raw_ostream &Out) : Obj(Obj), Out(Out) {} Error write(); diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6da97f9b3755d..354c51d66419c 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, }; uint8_t Version = 0; - uint8_t Feature = 0; + uint16_t Feature = 0; BBAddrMap::Features FeatEnable{}; while (!ULEBSizeErr && !MetadataDecodeErr && Cur && Cur.tell() < Content.size()) { Version = Data.getU8(Cur); if (!Cur) break; - if (Version < 2 || Version > 4) + if (Version < 2 || Version > 5) return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " + Twine(static_cast<int>(Version))); - Feature = Data.getU8(Cur); // Feature byte + Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur); if (!Cur) break; auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature); @@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, "basic block hash feature is enabled: version = " + Twine(static_cast<int>(Version)) + " feature = " + Twine(static_cast<int>(Feature))); + if (FeatEnable.PostLinkCfg && Version < 5) + return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when " + "post link cfg feature is enabled: version = " + + Twine(static_cast<int>(Version)) + + " feature = " + Twine(static_cast<int>(Feature))); uint32_t NumBlocksInBBRange = 0; uint32_t NumBBRanges = 1; typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0; @@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, uint64_t BBF = FeatEnable.BBFreq ? readULEB128As<uint64_t>(Data, Cur, ULEBSizeErr) : 0; + uint32_t PostLinkBBFreq = + FeatEnable.PostLinkCfg + ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; // Branch probability llvm::SmallVector<PGOAnalysisMap::PGOBBEntry::SuccessorEntry, 2> @@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF, for (uint64_t I = 0; I < SuccCount; ++I) { uint32_t BBID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); uint32_t BrProb = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); + uint32_t PostLinkFreq = + FeatEnable.PostLinkCfg + ? 
readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) + : 0; + if (PGOAnalyses) - Successors.push_back({BBID, BranchProbability::getRaw(BrProb)}); + Successors.push_back( + {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq}); } } if (PGOAnalyses) - PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)}); + PGOBBEntries.push_back( + {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)}); } if (PGOAnalyses) diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index f9fda23469ee5..3f0ecbe5e439b 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -311,6 +311,10 @@ static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) { return "v73"; case 75: return "v75"; + case 79: + return "v79"; + case 81: + return "v81"; default: return {}; } diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned. + uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp index caf357e8c136f..14c14f693ca96 100644 --- a/llvm/lib/Object/WindowsMachineFlag.cpp +++ b/llvm/lib/Object/WindowsMachineFlag.cpp @@ -23,8 +23,8 @@ using namespace llvm; COFF::MachineTypes llvm::getMachineType(StringRef S) { // Flags must be a superset of Microsoft lib.exe /machine flags. 
return StringSwitch<COFF::MachineTypes>(S.lower()) - .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64) - .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386) + .Cases({"x64", "amd64"}, COFF::IMAGE_FILE_MACHINE_AMD64) + .Cases({"x86", "i386"}, COFF::IMAGE_FILE_MACHINE_I386) .Case("arm", COFF::IMAGE_FILE_MACHINE_ARMNT) .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64) .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC) diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 3056251809308..5dd10f402f2ca 100644 --- a/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -80,15 +80,14 @@ void ScalarEnumerationTraits<SymbolKind>::enumeration(IO &io, SymbolKind &Value) { auto SymbolNames = getSymbolTypeNames(); for (const auto &E : SymbolNames) - io.enumCase(Value, E.Name.str().c_str(), E.Value); + io.enumCase(Value, E.Name, E.Value); } void ScalarBitSetTraits<CompileSym2Flags>::bitset(IO &io, CompileSym2Flags &Flags) { auto FlagNames = getCompileSym2FlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym2Flags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<CompileSym2Flags>(E.Value)); } } @@ -96,40 +95,35 @@ void ScalarBitSetTraits<CompileSym3Flags>::bitset(IO &io, CompileSym3Flags &Flags) { auto FlagNames = getCompileSym3FlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<CompileSym3Flags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<CompileSym3Flags>(E.Value)); } } void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) { auto FlagNames = getExportSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ExportFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<ExportFlags>(E.Value)); } } void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) { auto FlagNames = getPublicSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<PublicSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<PublicSymFlags>(E.Value)); } } void ScalarBitSetTraits<LocalSymFlags>::bitset(IO &io, LocalSymFlags &Flags) { auto FlagNames = getLocalFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<LocalSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<LocalSymFlags>(E.Value)); } } void ScalarBitSetTraits<ProcSymFlags>::bitset(IO &io, ProcSymFlags &Flags) { auto FlagNames = getProcSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<ProcSymFlags>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<ProcSymFlags>(E.Value)); } } @@ -137,15 +131,14 @@ void ScalarBitSetTraits<FrameProcedureOptions>::bitset( IO &io, FrameProcedureOptions &Flags) { auto FlagNames = getFrameProcSymFlagNames(); for (const auto &E : FlagNames) { - io.bitSetCase(Flags, E.Name.str().c_str(), - static_cast<FrameProcedureOptions>(E.Value)); + io.bitSetCase(Flags, E.Name, static_cast<FrameProcedureOptions>(E.Value)); } } void ScalarEnumerationTraits<CPUType>::enumeration(IO &io, CPUType &Cpu) { auto CpuNames = getCPUTypeNames(); for (const auto &E : CpuNames) { - io.enumCase(Cpu, E.Name.str().c_str(), static_cast<CPUType>(E.Value)); + io.enumCase(Cpu, E.Name, static_cast<CPUType>(E.Value)); } } @@ -177,7 +170,7 @@ void 
ScalarEnumerationTraits<RegisterId>::enumeration(IO &io, RegisterId &Reg) { RegNames = getRegisterNames(*CpuType); for (const auto &E : RegNames) { - io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value)); + io.enumCase(Reg, E.Name, static_cast<RegisterId>(E.Value)); } io.enumFallback<Hex16>(Reg); } @@ -186,8 +179,7 @@ void ScalarEnumerationTraits<TrampolineType>::enumeration( IO &io, TrampolineType &Tramp) { auto TrampNames = getTrampolineNames(); for (const auto &E : TrampNames) { - io.enumCase(Tramp, E.Name.str().c_str(), - static_cast<TrampolineType>(E.Value)); + io.enumCase(Tramp, E.Name, static_cast<TrampolineType>(E.Value)); } } @@ -195,7 +187,7 @@ void ScalarEnumerationTraits<ThunkOrdinal>::enumeration(IO &io, ThunkOrdinal &Ord) { auto ThunkNames = getThunkOrdinalNames(); for (const auto &E : ThunkNames) { - io.enumCase(Ord, E.Name.str().c_str(), static_cast<ThunkOrdinal>(E.Value)); + io.enumCase(Ord, E.Name, static_cast<ThunkOrdinal>(E.Value)); } } @@ -203,8 +195,7 @@ void ScalarEnumerationTraits<FrameCookieKind>::enumeration( IO &io, FrameCookieKind &FC) { auto ThunkNames = getFrameCookieKindNames(); for (const auto &E : ThunkNames) { - io.enumCase(FC, E.Name.str().c_str(), - static_cast<FrameCookieKind>(E.Value)); + io.enumCase(FC, E.Name, static_cast<FrameCookieKind>(E.Value)); } } @@ -212,8 +203,7 @@ void ScalarEnumerationTraits<JumpTableEntrySize>::enumeration( IO &io, JumpTableEntrySize &FC) { auto ThunkNames = getJumpTableEntrySizeNames(); for (const auto &E : ThunkNames) { - io.enumCase(FC, E.Name.str().c_str(), - static_cast<JumpTableEntrySize>(E.Value)); + io.enumCase(FC, E.Name, static_cast<JumpTableEntrySize>(E.Value)); } } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 5dff9bad12b52..5019298baebbd 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -589,55 +589,55 @@ void MappingTraits<DXContainerYAML::SignatureElement>::mapping( void ScalarEnumerationTraits<dxbc::PSV::SemanticKind>::enumeration( IO &IO, dxbc::PSV::SemanticKind &Value) { for (const auto &E : dxbc::PSV::getSemanticKinds()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ComponentType>::enumeration( IO &IO, dxbc::PSV::ComponentType &Value) { for (const auto &E : dxbc::PSV::getComponentTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::InterpolationMode>::enumeration( IO &IO, dxbc::PSV::InterpolationMode &Value) { for (const auto &E : dxbc::PSV::getInterpolationModes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ResourceType>::enumeration( IO &IO, dxbc::PSV::ResourceType &Value) { for (const auto &E : dxbc::PSV::getResourceTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::PSV::ResourceKind>::enumeration( IO &IO, dxbc::PSV::ResourceKind &Value) { for (const auto &E : dxbc::PSV::getResourceKinds()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::D3DSystemValue>::enumeration( IO &IO, dxbc::D3DSystemValue &Value) { for (const auto &E : dxbc::getD3DSystemValues()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } 
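// Editorial note (hedged sketch, not part of the patch): the recurring change
// in these YAML trait hunks replaces io.enumCase(Value, E.Name.str().c_str(),
// ...) with io.enumCase(Value, E.Name, ...), and likewise for bitSetCase.
// Assuming the YAML IO entry points accept StringRef keys (implied by this
// diff, not shown in it), the old form materialized a temporary std::string
// per table entry just to borrow its C-string pointer, e.g. the new form is:
//
//   for (const auto &E : getCPUTypeNames())
//     io.enumCase(Cpu, E.Name, static_cast<CPUType>(E.Value));
//
// which avoids one allocation per entry and a pointer into a temporary.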
void ScalarEnumerationTraits<dxbc::SigMinPrecision>::enumeration( IO &IO, dxbc::SigMinPrecision &Value) { for (const auto &E : dxbc::getSigMinPrecisions()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::SigComponentType>::enumeration( IO &IO, dxbc::SigComponentType &Value) { for (const auto &E : dxbc::getSigComponentTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::RootParameterType>::enumeration( IO &IO, dxbc::RootParameterType &Value) { for (const auto &E : dxbc::getRootParameterTypes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxil::ResourceClass>::enumeration( @@ -650,37 +650,37 @@ void ScalarEnumerationTraits<dxil::ResourceClass>::enumeration( }; for (const auto &E : ResourceClasses) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::SamplerFilter>::enumeration( IO &IO, dxbc::SamplerFilter &Value) { for (const auto &E : dxbc::getSamplerFilters()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::StaticBorderColor>::enumeration( IO &IO, dxbc::StaticBorderColor &Value) { for (const auto &E : dxbc::getStaticBorderColors()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::TextureAddressMode>::enumeration( IO &IO, dxbc::TextureAddressMode &Value) { for (const auto &E : dxbc::getTextureAddressModes()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::ShaderVisibility>::enumeration( IO &IO, dxbc::ShaderVisibility &Value) { for (const auto &E : dxbc::getShaderVisibility()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } void ScalarEnumerationTraits<dxbc::ComparisonFunc>::enumeration( IO &IO, dxbc::ComparisonFunc &Value) { for (const auto &E : dxbc::getComparisonFuncs()) - IO.enumCase(Value, E.Name.str().c_str(), E.Value); + IO.enumCase(Value, E.Name, E.Value); } } // namespace yaml diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8b75fbe8291f0..8530785d07c93 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,13 +1465,19 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) { // Write version and feature values. 
if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { - if (E.Version > 4) + if (E.Version > 5) WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: " << static_cast<int>(E.Version) << "; encoding using the most recent version"; CBA.write(E.Version); - CBA.write(E.Feature); - SHeader.sh_size += 2; + SHeader.sh_size += 1; + if (E.Version < 5) { + CBA.write(static_cast<uint8_t>(E.Feature)); + SHeader.sh_size += 1; + } else { + CBA.write<uint16_t>(E.Feature, ELFT::Endianness); + SHeader.sh_size += 2; + } } auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature); bool MultiBBRangeFeatureEnabled = false; @@ -1556,11 +1562,15 @@ void ELFState<ELFT>::writeSectionContent( for (const auto &PGOBBE : PGOBBEntries) { if (PGOBBE.BBFreq) SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq); + if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0)); if (PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size()); - for (const auto &[ID, BrProb] : *PGOBBE.Successors) { + for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) { SHeader.sh_size += CBA.writeULEB128(ID); SHeader.sh_size += CBA.writeULEB128(BrProb); + if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value()) + SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0)); } } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f8a84b075b779..d07c37edad241 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -37,8 +37,6 @@ unsigned Object::getMachine() const { return *Header.Machine; return llvm::ELF::EM_NONE; } - -constexpr StringRef SectionHeaderTable::TypeStr; } // namespace ELFYAML namespace yaml { @@ -672,7 +670,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, for (unsigned K = ELF::EF_AMDGPU_GENERIC_VERSION_MIN; K <= ELF::EF_AMDGPU_GENERIC_VERSION_MAX; ++K) { std::string Key = "EF_AMDGPU_GENERIC_VERSION_V" + std::to_string(K); - IO.maskedBitSetCase(Value, Key.c_str(), + IO.maskedBitSetCase(Value, Key, K << ELF::EF_AMDGPU_GENERIC_VERSION_OFFSET, ELF::EF_AMDGPU_GENERIC_VERSION); } @@ -1886,7 +1884,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping( IO &IO, ELFYAML::BBAddrMapEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("Version", E.Version); - IO.mapOptional("Feature", E.Feature, Hex8(0)); + IO.mapOptional("Feature", E.Feature, Hex16(0)); IO.mapOptional("NumBBRanges", E.NumBBRanges); IO.mapOptional("BBRanges", E.BBRanges); } @@ -1920,6 +1918,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry>::mapping( IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) { assert(IO.getContext() && "The IO context is not initialized"); IO.mapOptional("BBFreq", E.BBFreq); + IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq); IO.mapOptional("Successors", E.Successors); } @@ -1929,6 +1928,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>:: assert(IO.getContext() && "The IO context is not initialized"); IO.mapRequired("ID", E.ID); IO.mapRequired("BrProb", E.BrProb); + IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq); } void MappingTraits<ELFYAML::GnuHashHeader>::mapping(IO &IO, diff --git a/llvm/lib/ObjectYAML/GOFFYAML.cpp b/llvm/lib/ObjectYAML/GOFFYAML.cpp index 60bc1f70274b2..ecd7fb646ea36 100644 --- a/llvm/lib/ObjectYAML/GOFFYAML.cpp +++ b/llvm/lib/ObjectYAML/GOFFYAML.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace GOFFYAML 
{ -Object::Object() {} +Object::Object() = default; } // namespace GOFFYAML diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index 2f4e21257af09..e2fc32d90f15e 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -24,7 +24,6 @@ #include <cstddef> #include <memory> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -230,10 +229,8 @@ StringRef ArgList::getSubCommand( HandleMultipleSubcommands(SubCommands); return {}; } - if (!OtherPositionals.empty()) { + if (!OtherPositionals.empty()) HandleOtherPositionals(OtherPositionals); - return {}; - } if (SubCommands.size() == 1) return SubCommands.front(); diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 14e3b0d60886d..20398b5f582f4 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -25,7 +25,6 @@ #include <map> #include <set> #include <string> -#include <utility> #include <vector> using namespace llvm; @@ -756,9 +755,8 @@ void OptTable::internalPrintHelp( // pairs. std::map<std::string, std::vector<OptionInfo>> GroupedOptionHelp; - auto ActiveSubCommand = - std::find_if(SubCommands.begin(), SubCommands.end(), - [&](const auto &C) { return SubCommand == C.Name; }); + auto ActiveSubCommand = llvm::find_if( + SubCommands, [&](const auto &C) { return SubCommand == C.Name; }); if (!SubCommand.empty()) { assert(ActiveSubCommand != SubCommands.end() && "Not a valid registered subcommand."); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 3c9a27ac24015..0d190ea448931 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -45,7 +45,6 @@ #include "llvm/Analysis/IR2Vec.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/InlineAdvisor.h" -#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" #include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LastRunTrackingAnalysis.h" @@ -67,6 +66,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RuntimeLibcallInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionDivision.h" @@ -899,6 +899,11 @@ Expected<bool> parseEntryExitInstrumenterPassOptions(StringRef Params) { "EntryExitInstrumenter"); } +Expected<bool> parseDropUnnecessaryAssumesPassOptions(StringRef Params) { + return PassBuilder::parseSinglePassOption(Params, "drop-deref", + "DropUnnecessaryAssumes"); +} + Expected<bool> parseLoopExtractorPassOptions(StringRef Params) { return PassBuilder::parseSinglePassOption(Params, "single", "LoopExtractor"); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..dd73c04959732 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1298,10 +1298,18 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, /// TODO: Should LTO cause any differences to this set of passes? 
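With the parseDropUnnecessaryAssumesPassOptions hook above (and the FUNCTION_PASS_WITH_PARAMS registration further down in PassRegistry.def), the pass is reachable from a textual pipeline. A minimal sketch using the standard PassBuilder API; buildPipeline is a hypothetical wrapper:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Support/Error.h"
    using namespace llvm;

    // "<drop-deref>" flips the single boolean parsed by
    // parseDropUnnecessaryAssumesPassOptions.
    Error buildPipeline(FunctionPassManager &FPM) {
      PassBuilder PB;
      return PB.parsePassPipeline(FPM, "drop-unnecessary-assumes<drop-deref>");
    }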
void PassBuilder::addVectorPasses(OptimizationLevel Level, - FunctionPassManager &FPM, bool IsFullLTO) { + FunctionPassManager &FPM, + ThinOrFullLTOPhase LTOPhase) { + const bool IsFullLTO = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink; + FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + // Drop dereferenceable assumes after vectorization, as they are no longer + // needed and can inhibit further optimization. + if (!isLTOPreLink(LTOPhase)) + FPM.addPass(DropUnnecessaryAssumesPass(/*DropDereferenceable=*/true)); + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have significantly shortened a loop body; unroll @@ -1572,7 +1580,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // from the TargetLibraryInfo. OptimizePM.addPass(InjectTLIMappings()); - addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); + addVectorPasses(Level, OptimizePM, LTOPhase); invokeVectorizerEndEPCallbacks(OptimizePM, Level); @@ -2162,7 +2170,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MainFPM.addPass(LoopDistributePass()); - addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); + addVectorPasses(Level, MainFPM, ThinOrFullLTOPhase::FullLTOPostLink); invokeVectorizerEndEPCallbacks(MainFPM, Level); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 1853cdd45d0ee..074c328ef0931 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -35,6 +35,7 @@ MODULE_ANALYSIS("no-op-module", NoOpModuleAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("reg-usage", PhysicalRegisterUsageAnalysis()) +MODULE_ANALYSIS("runtime-libcall-info", RuntimeLibraryAnalysis()) MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) @@ -358,7 +359,6 @@ FUNCTION_ANALYSIS("ephemerals", EphemeralValuesAnalysis()) FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("machine-function-info", MachineFunctionAnalysis(*TM)) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) -FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) FUNCTION_ANALYSIS("last-run-tracking", LastRunTrackingAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) @@ -431,7 +431,6 @@ FUNCTION_PASS("dot-post-dom", PostDomPrinter()) FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(*TM)) -FUNCTION_PASS("drop-unnecessary-assumes", DropUnnecessaryAssumesPass()) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(*TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(*TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) @@ -515,8 +514,6 @@ FUNCTION_PASS("print<domfrontier>", DominanceFrontierPrinterPass(errs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(errs())) FUNCTION_PASS("print<func-properties>", FunctionPropertiesPrinterPass(errs())) FUNCTION_PASS("print<inline-cost>", InlineCostAnnotationPrinterPass(errs())) -FUNCTION_PASS("print<inliner-size-estimator>", - InlineSizeEstimatorAnalysisPrinterPass(errs())) FUNCTION_PASS("print<lazy-value-info>", LazyValueInfoPrinterPass(errs())) FUNCTION_PASS("print<loops>", LoopPrinterPass(errs())) FUNCTION_PASS("print<memoryssa-walker>", 
MemorySSAWalkerPrinterPass(errs())) @@ -583,6 +580,10 @@ FUNCTION_PASS_WITH_PARAMS( "early-cse", "EarlyCSEPass", [](bool UseMemorySSA) { return EarlyCSEPass(UseMemorySSA); }, parseEarlyCSEPassOptions, "memssa") +FUNCTION_PASS_WITH_PARAMS( + "drop-unnecessary-assumes", "DropUnnecessaryAssumesPass", + [](bool DropDereferenceable) { return DropUnnecessaryAssumesPass(DropDereferenceable); }, + parseDropUnnecessaryAssumesPassOptions, "drop-deref") FUNCTION_PASS_WITH_PARAMS( "ee-instrument", "EntryExitInstrumenterPass", [](bool PostInlining) { return EntryExitInstrumenterPass(PostInlining); }, diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 7290a86503120..6b7e980d048a4 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -537,7 +537,7 @@ void IRChangedPrinter::handleAfter(StringRef PassID, std::string &Name, Out << "*** IR Dump After " << PassID << " on " << Name << " ***\n" << After; } -IRChangedTester::~IRChangedTester() {} +IRChangedTester::~IRChangedTester() = default; void IRChangedTester::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (TestChanged != "") @@ -1566,7 +1566,7 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC); } -TimeProfilingPassesHandler::TimeProfilingPassesHandler() {} +TimeProfilingPassesHandler::TimeProfilingPassesHandler() = default; void TimeProfilingPassesHandler::registerCallbacks( PassInstrumentationCallbacks &PIC) { diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 02087355ab318..54987872f2d8b 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1690,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) { IndexedInstrProf::ProfVersion::CurrentVersion) return make_error<InstrProfError>(instrprof_error::unsupported_version); - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the reader as needed when a new field is added " "or when indexed profile version gets bumped."); @@ -1723,10 +1723,11 @@ size_t Header::size() const { // of the header, and byte offset of existing fields shouldn't change when // indexed profile version gets incremented. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + IndexedInstrProf::ProfVersion::CurrentVersion == Version13, "Please update the size computation below if a new field has " "been added to the header; for a version bump without new " "fields, add a case statement to fall through to the latest version."); + case 13ull: case 12ull: return 72; case 11ull: diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index a3473514d4637..0f15ca8ff6df7 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -542,7 +542,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // The WritePrevVersion handling will either need to be removed or updated // if the version is advanced beyond 12. 
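The paired static_asserts and the new case 13ull fallthrough encode one invariant: a version bump that adds no header fields reuses the previous version's size. Reduced to a standalone sketch (headerSizeFor is hypothetical; only the v12/v13 sizes are taken from the change above):

    #include <cstddef>
    #include <cstdint>

    size_t headerSizeFor(uint64_t Version) {
      switch (Version) {
      case 13: // v13 added no header fields, so it falls through.
      case 12:
        return 72;
      default:
        return 0; // earlier versions elided in this sketch
      }
    }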
static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == - IndexedInstrProf::ProfVersion::Version12); + IndexedInstrProf::ProfVersion::Version13); if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation)) Header.Version |= VARIANT_MASK_IR_PROF; if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive)) diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp index 1c52e352f9392..f9fd4af20e047 100644 --- a/llvm/lib/Remarks/RemarkFormat.cpp +++ b/llvm/lib/Remarks/RemarkFormat.cpp @@ -19,7 +19,7 @@ using namespace llvm::remarks; Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) { auto Result = StringSwitch<Format>(FormatStr) - .Cases("", "yaml", Format::YAML) + .Cases({"", "yaml"}, Format::YAML) .Case("bitstream", Format::Bitstream) .Default(Format::Unknown); diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp index fb6ff6203567a..6f5d072fb6913 100644 --- a/llvm/lib/SandboxIR/Context.cpp +++ b/llvm/lib/SandboxIR/Context.cpp @@ -637,7 +637,7 @@ Context::Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this), LLVMIRBuilder(LLVMCtx, ConstantFolder()) {} -Context::~Context() {} +Context::~Context() = default; void Context::clear() { // TODO: Ideally we should clear only function-scope objects, and keep global diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 1a81d185acf76..9ae4c98723fba 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -1125,6 +1125,33 @@ void SwitchInst::setDefaultDest(BasicBlock *DefaultCase) { cast<llvm::SwitchInst>(Val)->setDefaultDest( cast<llvm::BasicBlock>(DefaultCase->Val)); } + +template <typename LLVMCaseItT, typename BlockT, typename ConstT> +ConstT * +SwitchInst::CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>::getCaseValue() const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + auto *LLVMC = Ctx.getValue(LLVMCaseHandle.getCaseValue()); + return cast<ConstT>(LLVMC); +} + +template <typename LLVMCaseItT, typename BlockT, typename ConstT> +BlockT * +SwitchInst::CaseHandleImpl<LLVMCaseItT, BlockT, ConstT>::getCaseSuccessor() + const { + const auto &LLVMCaseHandle = *LLVMCaseIt; + auto *LLVMBB = LLVMCaseHandle.getCaseSuccessor(); + return cast<BlockT>(Ctx.getValue(LLVMBB)); +} + +template class SwitchInst::CaseHandleImpl<llvm::SwitchInst::CaseIt, BasicBlock, + ConstantInt>; +template class SwitchInst::CaseItImpl<llvm::SwitchInst::CaseIt, BasicBlock, + ConstantInt>; +template class SwitchInst::CaseHandleImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; +template class SwitchInst::CaseItImpl<llvm::SwitchInst::ConstCaseIt, + const BasicBlock, const ConstantInt>; + ConstantInt *SwitchInst::findCaseDest(BasicBlock *BB) { auto *LLVMC = cast<llvm::SwitchInst>(Val)->findCaseDest( cast<llvm::BasicBlock>(BB->Val)); diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp index 4a6b2fd538803..be4d1f1a8914e 100644 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -67,8 +67,8 @@ StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) { } SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) { return StringSwitch<SubsectionType>(Type) - .Cases("uleb128", "ULEB128", ULEB128) - .Cases("ntbs", "NTBS", NTBS) + .Cases({"uleb128", "ULEB128"}, ULEB128) + .Cases({"ntbs", "NTBS"}, NTBS) .Default(TYPE_NOT_FOUND); } StringRef 
AArch64BuildAttributes::getSubsectionTypeUnknownError() { diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index e21cf8e13d4dc..e2645fa46bbcd 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A, A.precision <= B.precision; } -constexpr RoundingMode APFloatBase::rmNearestTiesToEven; -constexpr RoundingMode APFloatBase::rmTowardPositive; -constexpr RoundingMode APFloatBase::rmTowardNegative; -constexpr RoundingMode APFloatBase::rmTowardZero; -constexpr RoundingMode APFloatBase::rmNearestTiesToAway; - /* A tight upper bound on number of parts required to hold the value pow(5, power) is diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp index e3763449d16cb..143e58a05d3b7 100644 --- a/llvm/lib/Support/BranchProbability.cpp +++ b/llvm/lib/Support/BranchProbability.cpp @@ -20,8 +20,6 @@ using namespace llvm; -constexpr uint32_t BranchProbability::D; - raw_ostream &BranchProbability::print(raw_ostream &OS) const { if (isUnknown()) return OS << "?%"; @@ -111,3 +109,10 @@ uint64_t BranchProbability::scale(uint64_t Num) const { uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { return ::scale<0>(Num, D, N); } + +BranchProbability BranchProbability::pow(unsigned N) const { + BranchProbability Res = BranchProbability::getOne(); + for (unsigned I = 0; I < N; ++I) + Res *= *this; + return Res; +} diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 9491ec049f79d..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -382,7 +382,7 @@ class CommandLineParser { RegisteredSubCommands.erase(sub); } - iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> + iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> getRegisteredSubcommands() { return make_range(RegisteredSubCommands.begin(), RegisteredSubCommands.end()); @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. 
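A usage sketch for the BranchProbability::pow() added above: the probability of taking the same even-odds branch three times in a row. The equality holds exactly for these operands; repeated fixed-point scaling can round in general.

    #include "llvm/Support/BranchProbability.h"
    #include <cassert>
    using namespace llvm;

    void powExample() {
      BranchProbability Half(1, 2);
      BranchProbability P = Half.pow(3); // 0.5^3 == 12.5%
      assert(P == BranchProbability(1, 8));
    }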
virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) @@ -2830,7 +2830,7 @@ StringMap<Option *> &cl::getRegisteredOptions(SubCommand &Sub) { return Sub.OptionsMap; } -iterator_range<typename SmallPtrSet<SubCommand *, 4>::iterator> +iterator_range<SmallPtrSet<SubCommand *, 4>::iterator> cl::getRegisteredSubcommands() { return GlobalParser->getRegisteredSubcommands(); } diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp index d763cded6e7ea..e91ee91a862f7 100644 --- a/llvm/lib/Support/DeltaAlgorithm.cpp +++ b/llvm/lib/Support/DeltaAlgorithm.cpp @@ -8,7 +8,6 @@ #include "llvm/ADT/DeltaAlgorithm.h" #include <algorithm> #include <iterator> -#include <set> using namespace llvm; DeltaAlgorithm::~DeltaAlgorithm() = default; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. 
class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp index 3bff4e177f781..32e2a2ed2f32f 100644 --- a/llvm/lib/Support/MD5.cpp +++ b/llvm/lib/Support/MD5.cpp @@ -43,7 +43,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Endian.h" -#include <array> #include <cstdint> #include <cstring> diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..8b95049eb9648 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -10,7 +10,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cctype> -#include <optional> #include <sstream> #define DEBUG_TYPE "mustache" @@ -34,6 +33,31 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } +static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) { + size_t CurrentPos = 0; + while (CurrentPos < Str.size()) { + // Find the next delimiter. + size_t DelimiterPos = Str.find('.', CurrentPos); + + // If no delimiter is found, process the rest of the string. + if (DelimiterPos == StringRef::npos) + DelimiterPos = Str.size(); + + // Get the current part, which may have whitespace. + StringRef Part = Str.slice(CurrentPos, DelimiterPos); + + // Manually trim the part without creating a new string object. + size_t Start = Part.find_first_not_of(" \t\r\n"); + if (Start != StringRef::npos) { + size_t End = Part.find_last_not_of(" \t\r\n"); + Tokens.push_back(Part.slice(Start, End + 1)); + } + + // Move past the delimiter for the next iteration. + CurrentPos = DelimiterPos + 1; + } +} + static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: @@ -46,13 +70,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // It's a literal, so it doesn't need to be saved. Tokens.push_back("."); } else { - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split('.'); - // Each part of the accessor needs to be saved to the arena - // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); - } + splitAndTrim(Str, Tokens); } // Now, allocate memory for the array of StringRefs in the arena. StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); @@ -368,141 +386,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. 
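A hypothetical driver for the file-local splitAndTrim helper above (visible only inside Mustache.cpp), showing the in-place trimming and the empty-segment behavior:

    SmallVector<StringRef> Parts;
    splitAndTrim(" a. b .c..d ", Parts);
    // Parts == {"a", "b", "c", "d"}: each dot-separated part is trimmed,
    // and the empty segment produced by ".." is skipped entirely.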
- - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. 
- if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. 
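Tracing the inlined Set Delimiter branch above for the tag {{=<% %>=}}, with intermediate values shown as comments (a sketch of the data flow, not new code):

    StringRef Interpolated = "=<% %>=";              // tag Content
    StringRef DelimSpec = Interpolated.trim();       // "=<% %>="
    DelimSpec = DelimSpec.drop_front(1);             // "<% %>="
    DelimSpec = DelimSpec.take_until(
        [](char C) { return C == '='; });            // "<% %>"
    DelimSpec = DelimSpec.trim();                    // "<% %>"
    auto [NewOpen, NewClose] = DelimSpec.split(' '); // "<%" and "%>"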
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); @@ -622,9 +598,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 246d90cce3a43..91f98cf7fac6c 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -14,24 +14,94 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/SpecialCaseList.h" +#include "llvm/ADT/RadixTree.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/VirtualFileSystem.h" -#include <algorithm> -#include <limits> +#include "llvm/Support/raw_ostream.h" #include <memory> #include <stdio.h> #include <string> #include <system_error> #include <utility> +#include <variant> +#include <vector> namespace llvm { -Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +namespace { + +using Match = std::pair<StringRef, unsigned>; +static constexpr Match NotMatched = {"", 0}; + +// Lagacy v1 matcher. 
+class RegexMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Reg { + Reg(StringRef Name, unsigned LineNo, Regex &&Rg) + : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} + StringRef Name; + unsigned LineNo; + Regex Rg; + }; + + std::vector<Reg> RegExes; +}; + +class GlobMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Glob { + Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) + : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} + StringRef Name; + unsigned LineNo; + GlobPattern Pattern; + }; + + std::vector<GlobMatcher::Glob> Globs; + + RadixTree<iterator_range<StringRef::const_iterator>, + RadixTree<iterator_range<StringRef::const_reverse_iterator>, + SmallVector<int, 1>>> + PrefixSuffixToGlob; + + RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>> + SubstrToGlob; +}; + +/// Represents a set of patterns and their line numbers +class Matcher { +public: + Matcher(bool UseGlobs, bool RemoveDotSlash); + + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + Match match(StringRef Query) const; + + bool matchAny(StringRef Query) const { return match(Query).second > 0; } + + std::variant<RegexMatcher, GlobMatcher> M; + bool RemoveDotSlash; +}; + +Error RegexMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied regex was blank"); @@ -55,7 +125,7 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { +void RegexMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) { return A.Name.size() < B.Name.size(); @@ -63,16 +133,14 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { } } -void SpecialCaseList::RegexMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - return Cb(R.Name, R.LineNo); + return {R.Name, R.LineNo}; + return NotMatched; } -Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +Error GlobMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied glob was blank"); @@ -83,14 +151,14 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { +void GlobMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) { return A.Name.size() < B.Name.size(); }); } - for (const auto &G : reverse(Globs)) { + for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -102,26 +170,28 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { // But only if substring is not empty. Searching this tree is more // expensive. 
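Matcher above dispatches through std::variant and std::visit rather than a virtual hierarchy: both alternatives are stored inline, and the closed set of matcher kinds is checked at compile time. The pattern, reduced to a standalone sketch with hypothetical types:

    #include <variant>

    struct RegexLike { int match() const { return 1; } };
    struct GlobLike  { int match() const { return 2; } };

    struct Dispatcher {
      std::variant<RegexLike, GlobLike> M; // no vtable, no heap allocation
      int match() const {
        return std::visit([](const auto &V) { return V.match(); }, M);
      }
    };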
auto &V = SubstrToGlob.emplace(Substr).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); continue; } } auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); } } -void SpecialCaseList::GlobMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match GlobMatcher::match(StringRef Query) const { + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -138,9 +208,12 @@ void SpecialCaseList::GlobMatcher::match( // possibilities. In most cases search will fail on first characters. for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -151,9 +224,12 @@ void SpecialCaseList::GlobMatcher::match( } } } + if (Best < 0) + return NotMatched; + return {Globs[Best].Name, Globs[Best].LineNo}; } -SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) +Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) : RemoveDotSlash(RemoveDotSlash) { if (UseGlobs) M.emplace<GlobMatcher>(); @@ -161,21 +237,34 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) M.emplace<RegexMatcher>(); } -Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { +Error Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -void SpecialCaseList::Matcher::preprocess(bool BySize) { +void Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } -void SpecialCaseList::Matcher::match( - StringRef Query, - llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const { +Match Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); - return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); + return std::visit([&](auto &V) -> Match { return V.match(Query); }, M); } +} // namespace + +class SpecialCaseList::Section::SectionImpl { +public: + void preprocess(bool OrderBySize); + const Matcher *findMatcher(StringRef Prefix, StringRef Category) const; + + using SectionEntries = StringMap<StringMap<Matcher>>; + + explicit SectionImpl(bool UseGlobs) + : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false) {} + + Matcher SectionMatcher; + SectionEntries Entries; +}; // TODO: Refactor this to return Expected<...> std::unique_ptr<SpecialCaseList> @@ 
-233,11 +322,11 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error, Expected<SpecialCaseList::Section *> SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo, unsigned LineNo, bool UseGlobs) { + SectionStr = SectionStr.copy(StrAlloc); Sections.emplace_back(SectionStr, FileNo, UseGlobs); auto &Section = Sections.back(); - SectionStr = SectionStr.copy(StrAlloc); - if (auto Err = Section.SectionMatcher.insert(SectionStr, LineNo)) { + if (auto Err = Section.Impl->SectionMatcher.insert(SectionStr, LineNo)) { return createStringError(errc::invalid_argument, "malformed section at line " + Twine(LineNo) + ": '" + SectionStr + @@ -264,11 +353,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, bool RemoveDotSlash = Version > 2; - Section *CurrentSection; - if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) { + auto ErrOrSection = addSection("*", FileIdx, 1, true); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + Section::SectionImpl *CurrentImpl = ErrOrSection.get()->Impl.get(); // This is the current list of prefixes for all existing users matching file // path. We may need parametrization in constructor in future. @@ -290,12 +380,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } - if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo, - UseGlobs) - .moveInto(CurrentSection)) { + auto ErrOrSection = + addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + CurrentImpl = ErrOrSection.get()->Impl.get(); continue; } @@ -308,7 +399,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } auto [Pattern, Category] = Postfix.split("="); - auto [It, _] = CurrentSection->Entries[Prefix].try_emplace( + auto [It, _] = CurrentImpl->Entries[Prefix].try_emplace( Category, UseGlobs, RemoveDotSlash && llvm::is_contained(PathPrefixes, Prefix)); Pattern = Pattern.copy(StrAlloc); @@ -322,7 +413,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } for (Section &S : Sections) - S.preprocess(OrderBySize); + S.Impl->preprocess(OrderBySize); return true; } @@ -339,7 +430,7 @@ std::pair<unsigned, unsigned> SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, StringRef Query, StringRef Category) const { for (const auto &S : reverse(Sections)) { - if (S.SectionMatcher.matchAny(Section)) { + if (S.Impl->SectionMatcher.matchAny(Section)) { unsigned Blame = S.getLastMatch(Prefix, Query, Category); if (Blame) return {S.FileIdx, Blame}; @@ -348,9 +439,22 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, return NotFound; } -const SpecialCaseList::Matcher * -SpecialCaseList::Section::findMatcher(StringRef Prefix, - StringRef Category) const { +SpecialCaseList::Section::Section(StringRef Str, unsigned FileIdx, + bool UseGlobs) + : Name(Str), FileIdx(FileIdx), + Impl(std::make_unique<SectionImpl>(UseGlobs)) {} + +SpecialCaseList::Section::Section(Section &&) = default; + +SpecialCaseList::Section::~Section() = default; + +bool SpecialCaseList::Section::matchName(StringRef Name) const { + return Impl->SectionMatcher.matchAny(Name); +} + +const Matcher * +SpecialCaseList::Section::SectionImpl::findMatcher(StringRef Prefix, + StringRef Category) const { SectionEntries::const_iterator I = Entries.find(Prefix); if (I == Entries.end()) return nullptr; 
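Section now owns its state through std::unique_ptr<SectionImpl>, where SectionImpl is complete only inside SpecialCaseList.cpp; that is why the move constructor and destructor above are declared in the header but defaulted out of line. The same pattern as a minimal sketch (Widget is hypothetical):

    // Widget.h
    #include <memory>
    class Widget {
    public:
      Widget();
      Widget(Widget &&); // declared here...
      ~Widget();         // ...defined where Impl is complete
    private:
      struct Impl;       // incomplete in the header
      std::unique_ptr<Impl> Pimpl;
    };

    // Widget.cpp
    struct Widget::Impl { int State = 0; };
    Widget::Widget() : Pimpl(std::make_unique<Impl>()) {}
    Widget::Widget(Widget &&) = default;
    Widget::~Widget() = default;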
@@ -361,7 +465,7 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix, return &II->second; } -LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { +void SpecialCaseList::Section::SectionImpl::preprocess(bool OrderBySize) { SectionMatcher.preprocess(false); for (auto &[K1, E] : Entries) for (auto &[K2, M] : E) @@ -371,26 +475,21 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - unsigned LastLine = 0; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef, unsigned LineNo) { - LastLine = std::max(LastLine, LineNo); - }); - } - return LastLine; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).second; + return 0; } StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - StringRef LongestRule; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef Rule, unsigned) { - if (LongestRule.size() < Rule.size()) - LongestRule = Rule; - }); - } - return LongestRule; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).first; + return {}; +} + +bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { + return Impl->Entries.find(Prefix) != Impl->Entries.end(); } } // namespace llvm diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index b6a2f8aeadccf..2e8fba8cbfa37 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -17,11 +17,6 @@ using namespace llvm; -// MSVC emits references to this into the translation units which reference it. -#ifndef _MSC_VER -constexpr size_t StringRef::npos; -#endif - // strncasecmp() is not available on non-POSIX systems, so define an // alternative function here. static int ascii_strncasecmp(StringRef LHS, StringRef RHS) { diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index 69602688cf3fd..4779e673cc055 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -73,7 +73,7 @@ static LLVM_THREAD_LOCAL std::vector<ThreadPoolTaskGroup *> // WaitingForGroup == nullptr means all tasks regardless of their group. void StdThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) { while (true) { - std::function<void()> Task; + llvm::unique_function<void()> Task; ThreadPoolTaskGroup *GroupOfTask; { std::unique_lock<std::mutex> LockGuard(QueueLock); @@ -189,7 +189,7 @@ void StdThreadPool::processTasksWithJobserver() { // While we hold a job slot, process tasks from the internal queue. 
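The ThreadPool queue element changes from std::function to llvm::unique_function here and in the hunk continuing below; unique_function accepts move-only callables, which std::function cannot hold because it requires copyability. A sketch of what that enables, assuming llvm/ADT/FunctionExtras.h:

    #include "llvm/ADT/FunctionExtras.h"
    #include <memory>

    llvm::unique_function<void()> makeTask() {
      auto Buffer = std::make_unique<int[]>(1024); // move-only capture
      return [Buf = std::move(Buffer)]() { Buf[0] = 42; };
    }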
while (true) { - std::function<void()> Task; + llvm::unique_function<void()> Task; ThreadPoolTaskGroup *GroupOfTask = nullptr; { diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp index 6f8e0915ab632..8f0d24ea1c1c6 100644 --- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp +++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp @@ -251,10 +251,10 @@ constexpr const char *const HangulSyllables[][3] = { // Unicode 15.0 // 3.12 Conjoining Jamo Behavior Common constants -constexpr const char32_t SBase = 0xAC00; -constexpr const uint32_t LCount = 19; -constexpr const uint32_t VCount = 21; -constexpr const uint32_t TCount = 28; +constexpr char32_t SBase = 0xAC00; +constexpr uint32_t LCount = 19; +constexpr uint32_t VCount = 21; +constexpr uint32_t TCount = 28; static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column) { diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h index a1d44c69ab1ab..f24d524982b23 100644 --- a/llvm/lib/Support/Unix/Unix.h +++ b/llvm/lib/Support/Unix/Unix.h @@ -22,7 +22,6 @@ #include "llvm/Support/Chrono.h" #include "llvm/Support/Errno.h" #include "llvm/Support/ErrorHandling.h" -#include <algorithm> #include <assert.h> #include <cerrno> #include <cstdio> diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index ec785e407cc57..5dcd2c945bc85 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -23,7 +23,6 @@ #include <fcntl.h> #include <io.h> #include <malloc.h> -#include <numeric> #include <psapi.h> //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 648d6a50287ec..bacbb76e09e6c 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -16,7 +16,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/WindowsError.h" -#include <algorithm> #include <io.h> #include <signal.h> #include <stdio.h> @@ -421,8 +420,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) { return true; } - if (FilesToRemove == NULL) + if (FilesToRemove == NULL) { FilesToRemove = new std::vector<std::string>; + std::atexit([]() { + delete FilesToRemove; + FilesToRemove = NULL; + }); + } FilesToRemove->push_back(std::string(Filename)); diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 07b99896543bd..d6f27fb7e7b63 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -61,17 +61,6 @@ using namespace llvm; -constexpr raw_ostream::Colors raw_ostream::BLACK; -constexpr raw_ostream::Colors raw_ostream::RED; -constexpr raw_ostream::Colors raw_ostream::GREEN; -constexpr raw_ostream::Colors raw_ostream::YELLOW; -constexpr raw_ostream::Colors raw_ostream::BLUE; -constexpr raw_ostream::Colors raw_ostream::MAGENTA; -constexpr raw_ostream::Colors raw_ostream::CYAN; -constexpr raw_ostream::Colors 
raw_ostream::WHITE; -constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR; -constexpr raw_ostream::Colors raw_ostream::RESET; - raw_ostream::~raw_ostream() { // raw_ostream's subclasses should take care to flush the buffer // in their destructors. diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index 3b510d357fd5d..f71631730d072 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() { raw_socket_stream::raw_socket_stream(int SocketFD) : raw_fd_stream(SocketFD, true) {} -raw_socket_stream::~raw_socket_stream() {} +raw_socket_stream::~raw_socket_stream() = default; Expected<std::unique_ptr<raw_socket_stream>> raw_socket_stream::createConnectedUnix(StringRef SocketPath) { diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 1ed64356b7c62..b1152bf680c69 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -22,7 +22,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include <string> -#include <utility> using namespace llvm; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803f3f568..8ad20b45f5e16 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. -struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 30eae6e7837cb..e8e64695e1ac4 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -682,8 +682,10 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("instances", tgtok::XInstances) .Case("substr", tgtok::XSubstr) .Case("find", tgtok::XFind) - .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. - .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. + .Cases({"setdagop", "setop"}, + tgtok::XSetDagOp) // !setop is deprecated. + .Cases({"getdagop", "getop"}, + tgtok::XGetDagOp) // !getop is deprecated. 
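The TGLexer hunk above is the same Cases migration applied throughout this patch (RemarkFormat, AArch64BuildAttributes, the COFF machine parsing earlier): a braced initializer list of case strings mapped to a single result value. The call shape as a self-contained sketch with a hypothetical enum:

    #include "llvm/ADT/StringSwitch.h"
    using namespace llvm;

    enum class Fmt { YAML, Bitstream, Unknown }; // hypothetical

    Fmt parseFmt(StringRef S) {
      return StringSwitch<Fmt>(S)
          .Cases({"", "yaml"}, Fmt::YAML) // one list, one result value
          .Case("bitstream", Fmt::Bitstream)
          .Default(Fmt::Unknown);
    }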
.Case("setdagopname", tgtok::XSetDagOpName) .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 753470dfb5374..a0ade6412024e 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -19,7 +19,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/SMLoc.h" #include <cassert> -#include <memory> #include <set> #include <string> diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26a2ae37..97298f9d74171 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. + LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index c31a090bba77f..e8766bc1b8c62 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -3364,6 +3364,22 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFIPACSignLR(); return; + case AArch64::SEH_SaveAnyRegI: + assert(MI->getOperand(1).getImm() <= 1008 && + "SaveAnyRegQP SEH opcode offset must fit into 6 bits"); + TS->emitARM64WinCFISaveAnyRegI(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); + return; + + case AArch64::SEH_SaveAnyRegIP: + assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 && + "Non-consecutive registers not allowed for save_any_reg"); + assert(MI->getOperand(2).getImm() <= 1008 && + "SaveAnyRegQP SEH opcode offset must fit into 6 bits"); + TS->emitARM64WinCFISaveAnyRegIP(MI->getOperand(0).getImm(), + MI->getOperand(2).getImm()); + return; + case AArch64::SEH_SaveAnyRegQP: assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 && "Non-consecutive registers not allowed for save_any_reg"); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add 
CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index cb831963759b5..7712d2a1d88d8 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -629,8 +629,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - Register DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI)); + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) .addReg(DestReg, RegState::Define | RegState::Dead) @@ -638,8 +637,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { .addImm(0) .addImm(0); // SUBS uses the GPR*sp register classes. - MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI)); + MRI->constrainRegClass(HeadCond[2].getReg(), TII->getRegClass(MCID, 1)); } Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); @@ -686,10 +684,10 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); const MCInstrDesc &MCID = TII->get(Opc); MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI)); + TII->getRegClass(MCID, 0)); if (CmpMI->getOperand(FirstOp + 1).isReg()) MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI)); + TII->getRegClass(MCID, 1)); MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) .add(CmpMI->getOperand(FirstOp)); // Register Rn if (isZBranch) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 75361f5d313c6..4ff49a627c794 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -156,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); continue; } - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); unsigned NewReg; if (RC == nullptr) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 0e94b78d11d83..7fd5254dfa536 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -625,6 +625,13 @@ def 
FeatureF16F32DOT : ExtensionWithMArch<"f16f32dot", "F16F32DOT", "FEAT_F16F32 def FeatureF16F32MM : ExtensionWithMArch<"f16f32mm", "F16F32MM", "FEAT_F16F32MM", "Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; +//===----------------------------------------------------------------------===// +// Future Architecture Technologies +//===----------------------------------------------------------------------===// + +def FeatureMOPS_GO: ExtensionWithMArch<"mops-go", "MOPS_GO", "FEAT_MOPS_GO", + "Enable memset acceleration granule only">; + //===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c36055f..c934d9269ea1e 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1082,14 +1082,24 @@ AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI, case AArch64::LDPXi: { Register Reg0 = MBBI->getOperand(0).getReg(); Register Reg1 = MBBI->getOperand(1).getReg(); + + int SEHReg0 = RegInfo->getSEHRegNum(Reg0); + int SEHReg1 = RegInfo->getSEHRegNum(Reg1); + if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) .setMIFlag(Flag); - else + else if (SEHReg0 >= 19 && SEHReg1 >= 19) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP)) - .addImm(RegInfo->getSEHRegNum(Reg0)) - .addImm(RegInfo->getSEHRegNum(Reg1)) + .addImm(SEHReg0) + .addImm(SEHReg1) + .addImm(Imm * 8) + .setMIFlag(Flag); + else + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegIP)) + .addImm(SEHReg0) + .addImm(SEHReg1) .addImm(Imm * 8) .setMIFlag(Flag); break; @@ -1097,10 +1107,16 @@ AArch64FrameLowering::insertSEH(MachineBasicBlock::iterator MBBI, case AArch64::STRXui: case AArch64::LDRXui: { int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); - MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) - .addImm(Reg) - .addImm(Imm * 8) - .setMIFlag(Flag); + if (Reg >= 19) + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) + .addImm(Reg) + .addImm(Imm * 8) + .setMIFlag(Flag); + else + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegI)) + .addImm(Reg) + .addImm(Imm * 8) + .setMIFlag(Flag); break; } case AArch64::STRDui: @@ -1538,8 +1554,10 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); } -static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool IsFirst, +static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool NeedsWinCFI, + bool IsFirst, const TargetRegisterInfo *TRI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind @@ -1552,8 +1570,18 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; if (!NeedsWinCFI) return false; + + // ARM64EC introduced `save_any_regp`, which expects 16-byte alignment. + // This is handled by only allowing paired spills for registers spilled at + // even positions (which should be 16-byte aligned, as other GPRs/FPRs are + // 8-bytes). 
We carve out an exception for {FP,LR}, which does not require + // 16-byte alignment in the uop representation. if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1) - return false; + return SpillExtendedVolatile + ? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) || + (SpillCount % 2) == 0) + : false; + // If pairing a GPR with LR, the pair can be described by the save_lrpair // opcode. If this is the first register pair, it would end up with a // predecrement, but there's no save_lrpair_x opcode, so we can only do this @@ -1569,12 +1597,15 @@ /// WindowsCFI requires that only consecutive registers can be paired. /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid. -static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool UsesWinAAPCS, bool NeedsWinCFI, - bool NeedsFrameRecord, bool IsFirst, +static bool invalidateRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool UsesWinAAPCS, + bool NeedsWinCFI, bool NeedsFrameRecord, + bool IsFirst, const TargetRegisterInfo *TRI) { if (UsesWinAAPCS) - return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst, + return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + Reg1, Reg2, NeedsWinCFI, IsFirst, TRI); // If we need to store the frame record, don't pair any register @@ -1672,6 +1703,21 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); + // Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedure-call + // scratch, x18 as platform reserved. However, clang has extended calling + // conventions such as preserve_most and preserve_all which treat these as + // CSRs. As such, the ARM64 unwind uOPs bias registers by 19. We use ARM64EC + // uOPs which have separate restrictions. We need to check for that. + // + // NOTE: we currently do not account for the D registers as LLVM does not + // support non-ABI compliant D register spills. + bool SpillExtendedVolatile = + IsWindows && std::any_of(std::begin(CSI), std::end(CSI), + [](const CalleeSavedInfo &CSI) { + const auto &Reg = CSI.getReg(); + return Reg >= AArch64::X0 && + Reg <= AArch64::X18; + }); int ZPRByteOffset = 0; int PPRByteOffset = 0; @@ -1733,17 +1779,19 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; + unsigned SpillCount = NeedsWinCFI ? 
FirstReg - i : i; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, - TRI)) + !invalidateRegisterPairing( + SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + !invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, NeedsWinCFI, IsFirst, TRI)) RPI.Reg2 = NextReg; break; @@ -2364,36 +2412,41 @@ void AArch64FrameLowering::determineStackHazardSlot( AFI->setStackHazardSlotIndex(ID); } - // Determine if we should use SplitSVEObjects. This should only be used if - // there's a possibility of a stack hazard between PPRs and ZPRs or FPRs. + if (!AFI->hasStackHazardSlotIndex()) + return; + if (SplitSVEObjects) { - if (!HasPPRCSRs && !HasPPRStackObjects) { - LLVM_DEBUG( - dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); + CallingConv::ID CC = MF.getFunction().getCallingConv(); + if (AFI->isSVECC() || CC == CallingConv::AArch64_SVE_VectorCall) { + AFI->setSplitSVEObjects(true); + LLVM_DEBUG(dbgs() << "Using SplitSVEObjects for SVE CC function\n"); return; } - if (!HasFPRCSRs && !HasFPRStackObjects) { + // We only use SplitSVEObjects in non-SVE CC functions if there's a + // possibility of a stack hazard between PPRs and ZPRs/FPRs. + LLVM_DEBUG(dbgs() << "Determining if SplitSVEObjects should be used in " + "non-SVE CC function...\n"); + + // If another calling convention is explicitly set, FPRs can't be promoted to + // ZPR callee-saves. + if (!is_contained({CallingConv::C, CallingConv::Fast}, CC)) { LLVM_DEBUG( dbgs() - << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); + << "Calling convention is not supported with SplitSVEObjects\n"); return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); + if (!HasPPRCSRs && !HasPPRStackObjects) { + LLVM_DEBUG( + dbgs() << "Not using SplitSVEObjects as no PPRs are on the stack\n"); return; } - // If another calling convention is explicitly set FPRs can't be promoted to - // ZPR callee-saves. - if (!is_contained({CallingConv::C, CallingConv::Fast, - CallingConv::AArch64_SVE_VectorCall}, - MF.getFunction().getCallingConv())) { + if (!HasFPRCSRs && !HasFPRStackObjects) { LLVM_DEBUG( - dbgs() << "Calling convention is not supported with SplitSVEObjects"); + dbgs() + << "Not using SplitSVEObjects as no FPRs or ZPRs are on the stack\n"); return; } @@ -2402,6 +2455,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..eaa10ef031989 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -104,7 +105,6 @@ #include <vector> using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -1180,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); + setTargetDAGCombine(ISD::CTPOP); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -1921,6 +1916,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } + + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask @@ -2288,6 +2289,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); + } + // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -5346,35 +5352,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -5578,9 +5555,10 @@ SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, SDLoc DL(Op); SDValue Chain = Op.getOperand(0); - SDValue FPCR_64 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, - {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}); + SDValue FPCR_64 = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, + {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, + MVT::i64)}); Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32, @@ -5666,7 +5644,8 @@ SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op, // Set new value of FPCR. SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR}; + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPCR}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -5689,9 +5668,9 @@ SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op, DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64)); // Set new value of FPCR. - SDValue Ops2[] = {Chain, - DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), - FPSCRMasked}; + SDValue Ops2[] = { + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPSCRMasked}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -7323,17 +7302,19 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, SDValue Compressed = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), - DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); + DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, + Vec); // compact fills with 0s, so if our passthru is all 0s, do nothing here. 
if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { SDValue Offset = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask); + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, + Mask); SDValue IndexMask = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MaskVT, - DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), Offset); Compressed = @@ -7462,10 +7443,10 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { DAG.getUNDEF(ExpVT), Exp, Zero); SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1), AArch64SVEPredPattern::all); - SDValue FScale = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT, - DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), - VPg, VX, VExp); + SDValue FScale = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, XVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg, + VX, VExp); SDValue Final = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero); if (X.getValueType() != XScalarTy) @@ -7723,8 +7704,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: @@ -7911,6 +7890,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } @@ -8130,7 +8110,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -8145,7 +8125,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, // Finally reset the TPIDR2_EL0 register to 0. 
Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; return Chain; @@ -8740,7 +8720,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Attrs.isNewZT0()) Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), DAG.getTargetConstant(0, DL, MVT::i32)); } @@ -9553,7 +9533,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { @@ -11367,9 +11347,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, break; } + // Note: This lowering only overrides NEON for v1i64 and v2i64, where we + // prefer using SVE if available. if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT( - VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); @@ -13444,8 +13425,8 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec, - MaskSourceVec); + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + SourceVec, MaskSourceVec); } // Gather data to see if the operation can be modelled as a @@ -14301,14 +14282,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we @@ -14319,8 +14302,8 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, - V2Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } } @@ -16473,10 +16456,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getTargetConstant(Cnt, DL, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - 
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, - MVT::i32), - Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: if (VT.isScalableVector() && @@ -17591,6 +17574,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // udot instruction. if (SrcWidth * 4 <= DstWidth) { if (all_of(I->users(), [&](auto *U) { + using namespace llvm::PatternMatch; auto *SingleUser = cast<Instruction>(&*U); if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value())))) return true; @@ -17862,6 +17846,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // into shift / and masks. For the moment we do this just for uitofp (not // zext) to avoid issues with widening instructions. if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) { + using namespace llvm::PatternMatch; return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) && SI->getType()->getScalarSizeInBits() * 4 == SI->user_back()->getType()->getScalarSizeInBits(); @@ -18570,7 +18555,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( case MVT::f64: return true; case MVT::bf16: - return VT.isScalableVector() && Subtarget->hasSVEB16B16() && + return VT.isScalableVector() && Subtarget->hasBF16() && Subtarget->isNonStreamingSVEorSME2Available(); default: break; @@ -20070,7 +20055,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, - DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) @@ -22329,6 +22314,37 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } +// Attempt to combine the following patterns: +// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b) +// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b) +// The CSET may be preceded by a ZEXT. +static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() != ISD::SUB) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) + N1 = N1.getOperand(0); + if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO) + return SDValue(); + + SDValue Flags = N1.getOperand(3); + if (Flags.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + if (N0->getOpcode() == ISD::SUB) + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0), + N0.getOperand(1), Flags); + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT), + Flags); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. 
@@ -22350,6 +22366,8 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG)) + return Val; if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; @@ -26071,7 +26089,7 @@ static SDValue performCSELCombine(SDNode *N, // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 if (SDValue Folded = foldCSELofCTTZ(N, DAG)) - return Folded; + return Folded; // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x) // if SUB(y, x) already exists and we can produce a swapped predicate for cc. @@ -27326,8 +27344,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to // `aarch64_sve_prfb_gather_uxtw_index`. SDLoc DL(N); - Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, - MVT::i64); + Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, + DL, MVT::i64); return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } @@ -27878,6 +27896,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } +static SDValue performCTPOPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + using namespace llvm::SDPatternMatch; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask)) + SDValue Mask; + if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask))))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT MaskVT = Mask.getValueType(); + + if (VT.isVector() || !MaskVT.isFixedLengthVector() || + MaskVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ReduceInVT = + EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount()); + + SDLoc DL(N); + // Sign extend to best fit ZeroOrNegativeOneBooleanContent. 
+ SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask); + SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask); + return DAG.getNegative(NegPopCount, DL, VT); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -28223,6 +28270,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performScalarToVectorCombine(N, DCI, DAG); case ISD::SHL: return performSHLCombine(N, DCI, DAG); + case ISD::CTPOP: + return performCTPOPCombine(N, DCI, DAG); } return SDValue(); } @@ -31161,10 +31210,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SDValue Shuffle; if (IsSingleOp) - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), - Op1, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1, + SVEMask); else if (Subtarget.hasSVE2()) { if (!MinMaxEqual) { unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; @@ -31183,10 +31232,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SVEMask = convertToScalableVector( DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask); } - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), - Op1, Op2, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1, + Op2, SVEMask); } Shuffle = convertFromScalableVector(DAG, VT, Shuffle); return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -31346,8 +31395,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned SegmentElts = VT.getVectorNumElements() / Segments; if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, Segments, SegmentElts)) { - SDValue IID = - DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); + SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq, + DL, MVT::i64); return convertFromScalableVector( DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 58a53af76e1b5..2bce5c89f8ba6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12588,12 +12588,10 @@ class MOPSMemoryCopy<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> class MOPSMemoryMove<bits<2> opcode, bits<2> op1, bits<2> op2, string asm> : MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>; -class 
MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2, - string asm> - : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), - (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), - asm, "\t[$Rd]!, $Rn!, $Rm", - "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, +class MOPSMemorySetBase<dag ins, string operands, bit isTagging, bits<2> opcode, + bit op1, bit op2, bit op3, string asm> + : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), ins, + asm, operands, "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, Sched<[]> { bits<5> Rd; bits<5> Rn; @@ -12605,20 +12603,34 @@ class MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2, let Inst{15-14} = opcode; let Inst{13} = op2; let Inst{12} = op1; - let Inst{11-10} = 0b01; + let Inst{11} = 0b0; + let Inst{10} = op3; let Inst{9-5} = Rn; let Inst{4-0} = Rd; - let DecoderMethod = "DecodeSETMemOpInstruction"; let mayLoad = 0; let mayStore = 1; } -class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, string asm> - : MOPSMemorySetBase<0, opcode, op1, op2, asm>; +class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + "\t[$Rd]!, $Rn!, $Rm", 0, opcode, op1, op2, op3, asm> { + let DecoderMethod = "DecodeSETMemOpInstruction"; +} + +class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + "\t[$Rd]!, $Rn!, $Rm", 1, opcode, op1, op2, op3, asm> { + let DecoderMethod = "DecodeSETMemOpInstruction"; +} -class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, string asm> - : MOPSMemorySetBase<1, opcode, op1, op2, asm>; +class MOPSGoMemorySetTagging<bits<2> opcode, bit op1, bit op2, bit op3, string asm> + : MOPSMemorySetBase<(ins GPR64common:$Rd, GPR64:$Rn), + "\t[$Rd]!, $Rn!", 1, opcode, op1, op2, op3, asm> { + // No `Rm` operand, as all bits must be set to 1 + let Inst{20-16} = 0b11111; + let DecoderMethod = "DecodeSETMemGoOpInstruction"; +} multiclass MOPSMemoryCopyInsns<bits<2> opcode, string asm> { def "" : MOPSMemoryCopy<opcode, 0b00, 0b00, asm>; @@ -12659,17 +12671,27 @@ multiclass MOPSMemoryMoveInsns<bits<2> opcode, string asm> { } multiclass MOPSMemorySetInsns<bits<2> opcode, string asm> { - def "" : MOPSMemorySet<opcode, 0, 0, asm>; - def T : MOPSMemorySet<opcode, 1, 0, asm # "t">; - def N : MOPSMemorySet<opcode, 0, 1, asm # "n">; - def TN : MOPSMemorySet<opcode, 1, 1, asm # "tn">; + def "" : MOPSMemorySet<opcode, 0, 0, 1, asm>; + def T : MOPSMemorySet<opcode, 1, 0, 1, asm # "t">; + def N : MOPSMemorySet<opcode, 0, 1, 1, asm # "n">; + def TN : MOPSMemorySet<opcode, 1, 1, 1, asm # "tn">; } multiclass MOPSMemorySetTaggingInsns<bits<2> opcode, string asm> { - def "" : MOPSMemorySetTagging<opcode, 0, 0, asm>; - def T : MOPSMemorySetTagging<opcode, 1, 0, asm # "t">; - def N : MOPSMemorySetTagging<opcode, 0, 1, asm # "n">; - def TN : MOPSMemorySetTagging<opcode, 1, 1, asm # "tn">; + def "" : MOPSMemorySetTagging<opcode, 0, 0, 1, asm>; + def T : MOPSMemorySetTagging<opcode, 1, 0, 1, asm # "t">; + def N : MOPSMemorySetTagging<opcode, 0, 1, 1, asm # "n">; + def TN : MOPSMemorySetTagging<opcode, 1, 1, 1, asm # "tn">; +} + +//---------------------------------------------------------------------------- +// MOPS Granule Only - FEAT_MOPS_GO +//---------------------------------------------------------------------------- +multiclass MOPSGoMemorySetTaggingInsns<bits<2> opcode, string asm> { + def "" : MOPSGoMemorySetTagging<opcode, 0, 0, 0, asm>; + def T : MOPSGoMemorySetTagging<opcode, 1, 0, 0, asm # "t">; + def N : 
MOPSGoMemorySetTagging<opcode, 0, 1, 0, asm # "n">; + def TN : MOPSGoMemorySetTagging<opcode, 1, 1, 0, asm # "tn">; } //---------------------------------------------------------------------------- @@ -13292,18 +13314,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> { def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>; } -class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind> +class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern> : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101, - V128, asm, ".16b", []> { + V128, asm, ".16b", pattern> { let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b", "|", kind, "\t$Rd, $Rn, $Rm}"); } -multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ - def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{ +multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{ + def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h", + [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F16MM]; } - def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{ + def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F32MM]; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 30b7b03f7a69a..52b216c7fe0f0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_PMULL : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, type1:$src2); + let hasSideEffects = 0; +} + def G_UADDLP : AArch64GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); @@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>; def : GINodeEquiv<G_BSP, AArch64bsp>; +def : GINodeEquiv<G_PMULL, AArch64pmull>; def : GINodeEquiv<G_UMULL, AArch64umull>; def : GINodeEquiv<G_SMULL, AArch64smull>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index ccc8eb8a9706d..b93e562f4cee5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,7 +91,7 @@ static cl::opt<unsigned> GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} @@ -1217,6 +1217,8 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { case AArch64::SEH_EpilogStart: case AArch64::SEH_EpilogEnd: case AArch64::SEH_PACSignLR: + case AArch64::SEH_SaveAnyRegI: + case AArch64::SEH_SaveAnyRegIP: case AArch64::SEH_SaveAnyRegQP: case AArch64::SEH_SaveAnyRegQPX: case AArch64::SEH_AllocZ: @@ -1778,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case 
AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: return Instr.getOpcode(); case AArch64::ADDWrr: @@ -1808,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) { return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ANDWrr: + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + return AArch64::ANDSWrs; + case AArch64::ANDXrr: + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + return AArch64::ANDSXrs; + case AArch64::BICWrr: + return AArch64::BICSWrr; + case AArch64::BICXrr: + return AArch64::BICSXrr; + case AArch64::BICWrs: + return AArch64::BICSWrs; + case AArch64::BICXrs: + return AArch64::BICSXrs; } } @@ -1945,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } +static bool isANDOpcode(MachineInstr &MI) { + unsigned Opc = sForm(MI); + switch (Opc) { + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + return true; + default: + return false; + } +} + /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: @@ -1982,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, // 1) MI and CmpInstr set N and V to the same value. // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when // signed overflow occurs, so CmpInstr could still be simplified away. - if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) + // Note that Ands and Bics instructions always clear the V flag. + if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI)) return false; AccessKind AccessToCheck = AK_Write; @@ -5616,7 +5664,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -5630,7 +5677,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, bool Offset = true; MCRegister PNRReg = MCRegister::NoRegister; unsigned StackID = TargetStackID::Default; - switch (TRI->getSpillSize(*RC)) { + switch (RI.getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -5793,10 +5840,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, .addMemOperand(MMO); } -void AArch64InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -5808,7 +5857,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( bool Offset = true; unsigned StackID = TargetStackID::Default; Register PNRReg = MCRegister::NoRegister; - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 1: if 
(AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -6444,10 +6493,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - getRegClass(SrcReg), &TRI, Register()); + getRegClass(SrcReg), Register()); else loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, - getRegClass(DstReg), &TRI, Register()); + getRegClass(DstReg), Register()); return &*--InsertPt; } @@ -6465,8 +6514,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), - FrameIndex, &AArch64::GPR64RegClass, &TRI, - Register()); + FrameIndex, &AArch64::GPR64RegClass, Register()); return &*--InsertPt; } @@ -6500,7 +6548,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, Register()); MachineInstr &LoadMI = *--InsertPt; MachineOperand &LoadDst = LoadMI.getOperand(0); @@ -9588,6 +9636,27 @@ AArch64InstrInfo::getOutliningCandidateInfo( unsigned NumBytesToCreateFrame = 0; + // Avoid splitting an ADRP ADD/LDR pair into outlined functions. + // These instructions are fused together by the scheduler. + // Any candidate where ADRP is the last instruction should be rejected + // as that will lead to splitting the ADRP pair. + MachineInstr &LastMI = RepeatedSequenceLocs[0].back(); + MachineInstr &FirstMI = RepeatedSequenceLocs[0].front(); + if (LastMI.getOpcode() == AArch64::ADRP && + (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 && + (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) { + return std::nullopt; + } + + // Similarly, any candidate where the first instruction is ADD/LDR with a + // page offset should be rejected to avoid ADRP splitting. 
+ if ((FirstMI.getOpcode() == AArch64::ADDXri || + FirstMI.getOpcode() == AArch64::LDRXui) && + (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 && + (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) { + return std::nullopt; + } + // We only allow outlining for functions having exactly matching return // address signing attributes, i.e., all share the same value for the // attribute "sign-return-address" and all share the same type of key they @@ -10994,8 +11063,6 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MachineBasicBlock::iterator InsertTo) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); Register Result = 0; for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { @@ -11004,8 +11071,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MRI.getRegClass(NewMI->getOperand(0).getReg())); NewMI->getOperand(I).setReg(Result); } else if (I == ReplaceOprNum) { - MRI.constrainRegClass(ReplaceReg, - TII->getRegClass(NewMI->getDesc(), I, TRI)); + MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I)); NewMI->getOperand(I).setReg(ReplaceReg); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 179574a73aa01..979c9acbd48e1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -353,14 +353,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // This tells target independent code that it is okay to pass instructions diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2871a20e28b65..34a20f09d2806 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -405,6 +405,8 @@ def HasMTETC : Predicate<"Subtarget->hasMTETC()">, AssemblerPredicateWithAll<(all_of FeatureMTETC), "mtetc">; def HasGCIE : Predicate<"Subtarget->hasGCIE()">, AssemblerPredicateWithAll<(all_of FeatureGCIE), "gcie">; +def HasMOPS_GO : Predicate<"Subtarget->hasMOPS_GO()">, + AssemblerPredicateWithAll<(all_of FeatureMOPS_GO), "mops-go">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -4444,6 +4446,11 @@ defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", [(AArch64Prefetch timm:$Rt, (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). 
+def : InstAlias<"prfm $Rt, [$Rn, $offset]", + (PRFUMi prfop:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; + //--- // (unscaled immediate, unprivileged) defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; @@ -5666,6 +5673,8 @@ let isPseudo = 1 in { def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_PACSignLR : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_SaveAnyRegI : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveAnyRegIP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_SaveAnyRegQP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_SaveAnyRegQPX : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; def SEH_AllocZ : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>; @@ -10860,6 +10869,15 @@ let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, maySt [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; } +//----------------------------------------------------------------------------- +// MOPS Granule Only Protection (FEAT_MOPS_GO) + +let Predicates = [HasMOPS_GO, HasMTE] in { + defm SETGOP : MOPSGoMemorySetTaggingInsns<0b00, "setgop">; + defm SETGOM : MOPSGoMemorySetTaggingInsns<0b01, "setgom">; + defm SETGOE : MOPSGoMemorySetTaggingInsns<0b10, "setgoe">; +} + //----------------------------------------------------------------------------- // v8.3 Pointer Authentication late patterns @@ -11415,7 +11433,7 @@ let Predicates = [HasF16F32MM] in defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">; let Uses = [FPMR, FPCR] in - defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>; //===----------------------------------------------------------------------===// // Contention Management Hints (FEAT_CMH) diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 04e76c7abd202..d25db89cca358 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -595,17 +595,17 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // Determine register classes for destinations and register operands const TargetRegisterClass *FirstInstrDstRC = - TII->getRegClass(TII->get(Opcode.first), 0, TRI); + TII->getRegClass(TII->get(Opcode.first), 0); const TargetRegisterClass *FirstInstrOperandRC = - TII->getRegClass(TII->get(Opcode.first), 1, TRI); + TII->getRegClass(TII->get(Opcode.first), 1); const TargetRegisterClass *SecondInstrDstRC = (Opcode.first == Opcode.second) ? FirstInstrDstRC - : TII->getRegClass(TII->get(Opcode.second), 0, TRI); + : TII->getRegClass(TII->get(Opcode.second), 0); const TargetRegisterClass *SecondInstrOperandRC = (Opcode.first == Opcode.second) ? FirstInstrOperandRC - : TII->getRegClass(TII->get(Opcode.second), 1, TRI); + : TII->getRegClass(TII->get(Opcode.second), 1); // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); @@ -784,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) { } const TargetRegisterClass *DstRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 0); const TargetRegisterClass *DstRC32 = TRI->getSubRegisterClass(DstRC64, AArch64::sub_32); assert(DstRC32 && "Destination register class of UBFMXri doesn't have a " "sub_32 subregister class"); const TargetRegisterClass *SrcRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 1); const TargetRegisterClass *SrcRC32 = TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32); assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 " diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7e03b97584fe1..965585f40571b 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -253,6 +253,8 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, case AArch64::SEH_SaveReg: case AArch64::SEH_SaveFRegP: case AArch64::SEH_SaveFReg: + case AArch64::SEH_SaveAnyRegI: + case AArch64::SEH_SaveAnyRegIP: case AArch64::SEH_SaveAnyRegQP: case AArch64::SEH_SaveAnyRegQPX: ImmOpnd = &MBBI->getOperand(ImmIdx); @@ -370,6 +372,22 @@ SVEFrameSizes AArch64PrologueEpilogueCommon::getSVEStackFrameSizes() const { {ZPRCalleeSavesSize, PPRLocalsSize + ZPRLocalsSize}}; } +SVEStackAllocations AArch64PrologueEpilogueCommon::getSVEStackAllocations( + SVEFrameSizes const &SVE) { + StackOffset AfterZPRs = SVE.ZPR.LocalsSize; + StackOffset BeforePPRs = SVE.ZPR.CalleeSavesSize + SVE.PPR.CalleeSavesSize; + StackOffset AfterPPRs = {}; + if (SVELayout == SVEStackLayout::Split) { + BeforePPRs = SVE.PPR.CalleeSavesSize; + // If there are no ZPR CSRs, place all local allocations after the ZPRs. + if (SVE.ZPR.CalleeSavesSize) + AfterPPRs += SVE.PPR.LocalsSize + SVE.ZPR.CalleeSavesSize; + else + AfterZPRs += SVE.PPR.LocalsSize; // Group allocation of locals. 
+ } + return {BeforePPRs, AfterPPRs, AfterZPRs}; +} + struct SVEPartitions { struct { MachineBasicBlock::iterator Begin, End; @@ -687,16 +705,19 @@ void AArch64PrologueEmitter::emitPrologue() { // All of the remaining stack allocations are for locals. determineLocalsStackSize(NumBytes, PrologueSaveSize); + auto [PPR, ZPR] = getSVEStackFrameSizes(); + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); + MachineBasicBlock::iterator FirstGPRSaveI = PrologueBeginI; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If we're doing SVE saves first, we need to immediately allocate space // for fixed objects, then space for the SVE callee saves. // // Windows unwind requires that the scalable size is a multiple of 16; // that's handled when the callee-saved size is computed. - auto SaveSize = - StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) + - StackOffset::getFixed(FixedObject); + auto SaveSize = SVEAllocs.BeforePPRs + StackOffset::getFixed(FixedObject); allocateStackSpace(PrologueBeginI, 0, SaveSize, false, StackOffset{}, /*FollowupAllocs=*/true); NumBytes -= FixedObject; @@ -764,12 +785,11 @@ void AArch64PrologueEmitter::emitPrologue() { if (AFL.windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) emitWindowsStackProbe(AfterGPRSavesI, DL, NumBytes, RealignmentPadding); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset NonSVELocalsSize = StackOffset::getFixed(NumBytes); + SVEAllocs.AfterZPRs += NonSVELocalsSize; + StackOffset CFAOffset = StackOffset::getFixed(MFI.getStackSize()) - NonSVELocalsSize; - MachineBasicBlock::iterator AfterSVESavesI = AfterGPRSavesI; // Allocate space for the callee saves and PPR locals (if any). if (SVELayout != SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -780,31 +800,23 @@ void AArch64PrologueEmitter::emitPrologue() { if (EmitAsyncCFI) emitCalleeSavedSVELocations(AfterSVESavesI); - StackOffset AllocateBeforePPRs = SVECalleeSavesSize; - StackOffset AllocateAfterPPRs = PPR.LocalsSize; - if (SVELayout == SVEStackLayout::Split) { - AllocateBeforePPRs = PPR.CalleeSavesSize; - AllocateAfterPPRs = PPR.LocalsSize + ZPR.CalleeSavesSize; - } - allocateStackSpace(PPRRange.Begin, 0, AllocateBeforePPRs, + allocateStackSpace(PPRRange.Begin, 0, SVEAllocs.BeforePPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || AllocateAfterPPRs || - ZPR.LocalsSize || NonSVELocalsSize); - CFAOffset += AllocateBeforePPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterPPRs || + SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs, + allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, - MFI.hasVarSizedObjects() || ZPR.LocalsSize || - NonSVELocalsSize); - CFAOffset += AllocateAfterPPRs; + MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); + CFAOffset += SVEAllocs.AfterPPRs; } else { assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord); - // Note: With CalleeSavesAboveFrameRecord, the SVE CS have already been - // allocated (and separate PPR locals are not supported, all SVE locals, - // both PPR and ZPR, are within the ZPR locals area). 
- assert(!PPR.LocalsSize && "Unexpected PPR locals!"); - CFAOffset += SVECalleeSavesSize; + // Note: With CalleeSavesAboveFrameRecord, the SVE CS (BeforePPRs) have + // already been allocated. PPR locals (included in AfterPPRs) are not + // supported (note: this is asserted above). + CFAOffset += SVEAllocs.BeforePPRs; } // Allocate space for the rest of the frame including ZPR locals. Align the @@ -815,9 +827,9 @@ void AArch64PrologueEmitter::emitPrologue() { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have the // correct value here, as NumBytes also includes padding bytes, which // shouldn't be counted here. - allocateStackSpace( - AfterSVESavesI, RealignmentPadding, ZPR.LocalsSize + NonSVELocalsSize, - EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects()); + allocateStackSpace(AfterSVESavesI, RealignmentPadding, SVEAllocs.AfterZPRs, + EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the @@ -1308,6 +1320,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. + assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to subtract it in a + // temporary register first (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1408,6 +1440,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1472,27 +1505,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { assert(NumBytes >= 0 && "Negative stack allocation size!?"); StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; - StackOffset SVEStackSize = - SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; - MachineBasicBlock::iterator RestoreEnd = PPRRange.End; + SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { - StackOffset SVELocalsSize = ZPR.LocalsSize + PPR.LocalsSize; + assert(!SVEAllocs.AfterPPRs && + "unexpected SVE allocs after PPRs with CalleeSavesAboveFrameRecord"); // If the callee-save area is before FP, restoring the FP implicitly - // deallocates non-callee-save SVE allocations. Otherwise, deallocate them + // deallocates non-callee-save SVE allocations. Otherwise, deallocate them // explicitly.
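+  // (Aside, an illustrative sketch of moveSPBelowFP, defined above: for an
+  // assumed Offset of fixed -16 bytes plus a scalable -2 vector registers
+  // (made-up values), it expands to roughly:
+  //   sub   xTmp, x29, #16   // Base = FP + fixed part; SP is untouched, so
+  //                          // no stack is transiently deallocated.
+  //   addvl sp, xTmp, #-2    // SP = Base + scalable part.
+  // Applying the fixed part to SP first would briefly raise SP above the
+  // scalable saves, leaving them unprotected from signal handlers.)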
if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) { emitFrameOffset(MBB, FirstGPRRestoreI, DL, AArch64::SP, AArch64::SP, - SVELocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } // Deallocate callee-save SVE registers. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI); } else if (AFI->hasSVEStackSize()) { // If we have stack realignment or variable-sized objects we must use the FP // to restore SVE callee saves (as there is an unknown amount of @@ -1501,69 +1532,53 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); - } else if (BaseForSVEDealloc == AArch64::SP) { - auto CFAOffset = - SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize); - - if (SVECalleeSavesSize) { - // Deallocate the non-SVE locals first before we can deallocate (and - // restore callee saves) from the SVE area. - auto NonSVELocals = StackOffset::getFixed(NumBytes); - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - NonSVELocals, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= NonSVELocals; - NumBytes = 0; + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area.
+ assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - - if (ZPR.LocalsSize) { - emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, - ZPR.LocalsSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= ZPR.LocalsSize; - } - - StackOffset SVECalleeSavesToDealloc = SVECalleeSavesSize; - if (SVELayout == SVEStackLayout::Split && - (PPR.LocalsSize || ZPR.CalleeSavesSize)) { - assert(PPRRange.Begin == ZPRRange.End && - "Expected PPR restores after ZPR"); - emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, - PPR.LocalsSize + ZPR.CalleeSavesSize, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); - CFAOffset -= PPR.LocalsSize + ZPR.CalleeSavesSize; - SVECalleeSavesToDealloc -= ZPR.CalleeSavesSize; + } else if (BaseForSVEDealloc == AArch64::SP) { + auto NonSVELocals = StackOffset::getFixed(NumBytes); + auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + + SVEAllocs.totalSize(); + + if (SVECalleeSavesSize || SVELayout == SVEStackLayout::Split) { + // Deallocate non-SVE locals now. This is needed to reach the SVE callee + // saves, but may also allow combining stack hazard bumps for split SVE. + SVEAllocs.AfterZPRs += NonSVELocals; + NumBytes -= NonSVELocals.getFixed(); } - - // If split SVE is on, this dealloc PPRs, otherwise, deallocs ZPRs + PPRs: - if (SVECalleeSavesToDealloc) - emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, - SVECalleeSavesToDealloc, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI, EmitCFI && !HasFP, CFAOffset); + // To deallocate the SVE stack adjust by the allocations in reverse. 
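+    // Concretely (an illustrative walk-through assuming the split layout
+    // with every region non-empty): the prologue allocated BeforePPRs, then
+    // AfterPPRs, then AfterZPRs, so the epilogue frees AfterZPRs first (the
+    // ZPR locals plus any folded-in non-SVE locals), then AfterPPRs (PPR
+    // locals and ZPR callee saves), and finally BeforePPRs (the PPR callee
+    // saves), lowering CFAOffset after each bump so asynchronous CFI stays
+    // accurate when no frame pointer is available.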
+ emitFrameOffset(MBB, ZPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterZPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterZPRs; + assert(PPRRange.Begin == ZPRRange.End && + "Expected PPR restores after ZPR"); + emitFrameOffset(MBB, PPRRange.Begin, DL, AArch64::SP, AArch64::SP, + SVEAllocs.AfterPPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); + CFAOffset -= SVEAllocs.AfterPPRs; + emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP, + SVEAllocs.BeforePPRs, TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + CFAOffset); } if (EmitCFI) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index bccaddaad9eec..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -33,6 +33,11 @@ struct SVEFrameSizes { } PPR, ZPR; }; +struct SVEStackAllocations { + StackOffset BeforePPRs, AfterPPRs, AfterZPRs; + StackOffset totalSize() const { return BeforePPRs + AfterPPRs + AfterZPRs; } +}; + class AArch64PrologueEpilogueCommon { public: AArch64PrologueEpilogueCommon(MachineFunction &MF, MachineBasicBlock &MBB, @@ -66,6 +71,7 @@ class AArch64PrologueEpilogueCommon { bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; SVEFrameSizes getSVEStackFrameSizes() const; + SVEStackAllocations getSVEStackAllocations(SVEFrameSizes const &); MachineFunction &MF; MachineBasicBlock &MBB; @@ -174,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..ef5941c42f687 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? 
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above. @@ -891,7 +897,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); BuildMI(*MBB, Ins, DL, MCID, BaseReg) @@ -1117,24 +1123,89 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation -// where a consecutive multi-vector tuple is constructed from the same indices -// of multiple strided loads. This may still result in unnecessary copies -// between the loads and the tuple. Here we try to return a hint to assign the -// contiguous ZPRMulReg starting at the same register as the first operand of -// the pseudo, which should be a subregister of the first strided load. +// We add regalloc hints for different cases: +// * Choosing a better destination operand for predicated SVE instructions +// where the inactive lanes are undef, by choosing a register that is not +// unique to the other operands of the instruction. // -// For example, if the first strided load has been assigned $z16_z20_z24_z28 -// and the operands of the pseudo are each accessing subregister zsub2, we -// should look through through Order to find a contiguous register which -// begins with $z24 (i.e. $z24_z25_z26_z27). +// * Improve register allocation for SME multi-vector instructions where we can +// benefit from the strided- and contiguous register multi-vector tuples. // +// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register +// allocation where a consecutive multi-vector tuple is constructed from the +// same indices of multiple strided loads. This may still result in +// unnecessary copies between the loads and the tuple. Here we try to return a +// hint to assign the contiguous ZPRMulReg starting at the same register as +// the first operand of the pseudo, which should be a subregister of the first +// strided load. +// +// For example, if the first strided load has been assigned $z16_z20_z24_z28 +// and the operands of the pseudo are each accessing subregister zsub2, we +// should look through Order to find a contiguous register which +// begins with $z24 (i.e. $z24_z25_z26_z27).
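+//
+// As a sketch of the first case (illustrative, with assumed pseudo names):
+// for a destructive predicated operation whose inactive lanes are undef,
+// such as
+//   %dst = FADD_ZPZZ_UNDEF %pg, %a, %b
+// hinting %dst towards the register already assigned to %a (or %b, for the
+// commutative forms) allows the pseudo to expand to
+//   fadd z0.s, p0/m, z0.s, z1.s
+// rather than
+//   movprfx z2, z0
+//   fadd z2.s, p0/m, z2.s, z1.s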
bool AArch64RegisterInfo::getRegAllocationHints( Register VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { - auto &ST = MF.getSubtarget<AArch64Subtarget>(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // For predicated SVE instructions where the inactive lanes are undef, + // pick a destination register that is not unique to avoid introducing + // a movprfx. + const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg); + if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) { + bool ConsiderOnlyHints = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM); + + for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) { + const MachineInstr &Def = *DefOp.getParent(); + if (DefOp.isImplicit() || + (TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) != + AArch64::FalseLanesUndef) + continue; + + unsigned InstFlags = + TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags; + + for (MCPhysReg R : Order) { + auto AddHintIfSuitable = [&](MCPhysReg R, + const MachineOperand &MO) -> bool { + // R is a suitable register hint if R can reuse one of the other + // source operands. + if (VRM->getPhys(MO.getReg()) != R) + return false; + Hints.push_back(R); + return true; + }; + + switch (InstFlags & AArch64::DestructiveInstTypeMask) { + default: + break; + case AArch64::DestructiveTernaryCommWithRev: + AddHintIfSuitable(R, Def.getOperand(2)) || + AddHintIfSuitable(R, Def.getOperand(3)) || + AddHintIfSuitable(R, Def.getOperand(4)); + break; + case AArch64::DestructiveBinaryComm: + case AArch64::DestructiveBinaryCommWithRev: + AddHintIfSuitable(R, Def.getOperand(2)) || + AddHintIfSuitable(R, Def.getOperand(3)); + break; + case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: + AddHintIfSuitable(R, Def.getOperand(2)); + break; + } + } + } + + if (Hints.size()) + return ConsiderOnlyHints; + } + if (!ST.hasSME() || !ST.isStreaming()) return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); @@ -1147,8 +1218,7 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. 
- const MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegID = MRI.getRegClass(VirtReg)->getID(); + unsigned RegID = RegRC->getID(); if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..c8c21c4822ffe 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -2578,6 +2583,11 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; + def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), + (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>; + defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>; } // End HasBF16, HasSVE_or_SME @@ -3592,6 +3602,18 @@ let Predicates = [HasSVE_or_SME] in { def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))), (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; + + // Extracts of ``unsigned'' i8 or i16 elements lead to the zero-extend being + // transformed to an AND mask. The mask is redundant since UMOV already zeroes + // the high bits of the destination register. + def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)), + (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>; + def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)), + (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)))), (i64 0xff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)), sub_32)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)))), (i64 0xffff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)), sub_32)>; } // End HasNEON // Extract first element from vector. 
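// A sketch of the UMOV patterns above (illustrative, not from the patch):
// IR along the lines of
//   %e = extractelement <vscale x 16 x i8> %v, i64 3
//   %z = zext i8 %e to i32   ; legalized to (and (vector_extract ...), 0xff)
// selects to the single instruction
//   umov w0, v0.b[3]
// on the fixed-width subregister, since UMOV already clears bits 8..31 and
// the AND mask adds nothing.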
@@ -4251,7 +4273,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index a67bd42aa16e0..d87bb522c99e8 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include <cassert> #include <memory> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 068954f1764fb..0bf2b31b10846 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -54,7 +54,6 @@ #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include <memory> #include <optional> -#include <string> using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index fede586cf35bc..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -308,9 +308,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits; } -bool AArch64TTIImpl::areTypesABICompatible( - const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { +bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + ArrayRef<Type *> Types) const { if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; @@ -1032,6 +1032,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_vector_extract_last_active: + if (ST->isSVEorStreamingSVEAvailable()) { + auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); + // This should turn into chained clastb instructions. + return LegalCost; + } + break; default: break; } @@ -2220,7 +2227,7 @@ static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, return std::nullopt; } -template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> +template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc> static std::optional<Instruction *> instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp) { @@ -3000,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. 
auto toVectorTy = [&](Type *ArgTy) { @@ -3020,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commuted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3092,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext.
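+  // For example (illustrative): a v8i32 mul where both operands are sext
+  // from v8i8 only needs the multiply performed at v8i16 (double the widest
+  // source element), i.e. an smull to v8i16 plus one further extend to
+  // v8i32, so v8i16 would be returned as the minimal compute type.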
+ unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3152,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3161,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. 
+ } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4141,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4374,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. + if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6122,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6131,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6177,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; @@ -6650,10 +6737,15 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( Ops.push_back(&Ext->getOperandUse(0)); Ops.push_back(&Op); - if (isa<SExtInst>(Ext)) + if (isa<SExtInst>(Ext)) { NumSExts++; - else + } else { NumZExts++; + // A zext(a) is also a sext(zext(a)), if we take more than 2 steps. 
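+        // For example (illustrative): zext i8 %a to i32 can be viewed as
+        // sext (zext i8 %a to i16) to i32, because the top bit of the i16
+        // intermediate is known zero; the final doubling step is then both
+        // a sign- and a zero-extend, so the cast can pair with signed as
+        // well as unsigned widening users.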
+ if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 < + I->getType()->getScalarSizeInBits()) + NumSExts++; + } continue; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index fe2e849258e3f..e62fdb6786843 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. // @@ -84,7 +92,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override; @@ -304,7 +312,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { } bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const { - if (!ST->hasSVE()) + if (!ST->isSVEorStreamingSVEAvailable()) return false; // For fixed vectors, avoid scalarization if using SVE for them. diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6273cfc1005d6..7293b7fdb0d20 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -88,7 +88,7 @@ class AArch64AsmParser : public MCTargetAsmParser { StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive.
- StringMap<std::pair<RegKind, unsigned>> RegisterReqs; + StringMap<std::pair<RegKind, MCRegister>> RegisterReqs; class PrefixInfo { public: @@ -165,7 +165,7 @@ class AArch64AsmParser : public MCTargetAsmParser { AArch64CC::CondCode parseCondCodeString(StringRef Cond, std::string &Suggestion); bool parseCondCode(OperandVector &Operands, bool invertCondCode); - unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); + MCRegister matchRegisterNameAlias(StringRef Name, RegKind Kind); bool parseRegister(OperandVector &Operands); bool parseSymbolicImmVal(const MCExpr *&ImmVal); bool parseNeonVectorList(OperandVector &Operands); @@ -391,7 +391,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegKind Kind; int ElementWidth; @@ -417,7 +417,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct MatrixRegOp { - unsigned RegNum; + MCRegister Reg; unsigned ElementWidth; MatrixKind Kind; }; @@ -427,7 +427,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct VectorListOp { - unsigned RegNum; + MCRegister Reg; unsigned Count; unsigned Stride; unsigned NumElements; @@ -688,12 +688,12 @@ class AArch64Operand : public MCParsedAsmOperand { MCRegister getReg() const override { assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } - unsigned getMatrixReg() const { + MCRegister getMatrixReg() const { assert(Kind == k_MatrixRegister && "Invalid access!"); - return MatrixReg.RegNum; + return MatrixReg.Reg; } unsigned getMatrixElementWidth() const { @@ -716,9 +716,9 @@ class AArch64Operand : public MCParsedAsmOperand { return Reg.EqualityTy; } - unsigned getVectorListStart() const { + MCRegister getVectorListStart() const { assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; + return VectorList.Reg; } unsigned getVectorListCount() const { @@ -1264,15 +1264,15 @@ class AArch64Operand : public MCParsedAsmOperand { bool isNeonVectorRegLo() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( - Reg.RegNum) || + Reg.Reg) || AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isNeonVectorReg0to7() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_0to7RegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isMatrix() const { return Kind == k_MatrixRegister; } @@ -1401,34 +1401,34 @@ class AArch64Operand : public MCParsedAsmOperand { bool isGPR32as64() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.Reg); } bool isGPR64as32() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.Reg); } bool isGPR64x8() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isWSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isXSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && 
AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isSyspXzrPair() const { - return isGPR64<AArch64::GPR64RegClassID>() && Reg.RegNum == AArch64::XZR; + return isGPR64<AArch64::GPR64RegClassID>() && Reg.Reg == AArch64::XZR; } template<int64_t Angle, int64_t Remainder> @@ -1495,7 +1495,7 @@ class AArch64Operand : public MCParsedAsmOperand { isTypedVectorList<VectorKind, NumRegs, NumElements, ElementWidth>(); if (!Res) return DiagnosticPredicate::NoMatch; - if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.RegNum)) + if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.Reg)) return DiagnosticPredicate::NearMatch; return DiagnosticPredicate::Match; } @@ -1507,9 +1507,9 @@ class AArch64Operand : public MCParsedAsmOperand { ElementWidth, Stride>(); if (!Res) return DiagnosticPredicate::NoMatch; - if ((VectorList.RegNum < (AArch64::Z0 + Stride)) || - ((VectorList.RegNum >= AArch64::Z16) && - (VectorList.RegNum < (AArch64::Z16 + Stride)))) + if ((VectorList.Reg < (AArch64::Z0 + Stride)) || + ((VectorList.Reg >= AArch64::Z16) && + (VectorList.Reg < (AArch64::Z16 + Stride)))) return DiagnosticPredicate::Match; return DiagnosticPredicate::NoMatch; } @@ -1841,7 +1841,7 @@ class AArch64Operand : public MCParsedAsmOperand { void addPPRorPNRRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned Reg = getReg(); + MCRegister Reg = getReg(); // Normalise to PPR if (Reg >= AArch64::PN0 && Reg <= AArch64::PN15) Reg = Reg - AArch64::PN0 + AArch64::P0; @@ -2336,13 +2336,12 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr<AArch64Operand> - CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, + CreateReg(MCRegister Reg, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; Op->Reg.EqualityTy = EqTy; @@ -2354,28 +2353,26 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<AArch64Operand> - CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth, - SMLoc S, SMLoc E, MCContext &Ctx, - AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + static std::unique_ptr<AArch64Operand> CreateVectorReg( + MCRegister Reg, RegKind Kind, unsigned ElementWidth, SMLoc S, SMLoc E, + MCContext &Ctx, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector || Kind == RegKind::SVEPredicateVector || Kind == RegKind::SVEPredicateAsCounter) && "Invalid vector kind"); - auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, + auto Op = CreateReg(Reg, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, HasExplicitAmount); Op->Reg.ElementWidth = ElementWidth; return Op; } static std::unique_ptr<AArch64Operand> - CreateVectorList(unsigned RegNum, unsigned Count, unsigned Stride, + CreateVectorList(MCRegister Reg, unsigned Count, unsigned Stride, unsigned NumElements, unsigned ElementWidth, RegKind 
RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; + Op->VectorList.Reg = Reg; Op->VectorList.Count = Count; Op->VectorList.Stride = Stride; Op->VectorList.NumElements = NumElements; @@ -2586,10 +2583,10 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr<AArch64Operand> - CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind, + CreateMatrixRegister(MCRegister Reg, unsigned ElementWidth, MatrixKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx); - Op->MatrixReg.RegNum = RegNum; + Op->MatrixReg.Reg = Reg; Op->MatrixReg.ElementWidth = ElementWidth; Op->MatrixReg.Kind = Kind; Op->StartLoc = S; @@ -2660,9 +2657,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case k_VectorList: { OS << "<vectorlist "; - unsigned Reg = getVectorListStart(); + MCRegister Reg = getVectorListStart(); for (unsigned i = 0, e = getVectorListCount(); i != e; ++i) - OS << Reg + i * getVectorListStride() << " "; + OS << Reg.id() + i * getVectorListStride() << " "; OS << ">"; break; } @@ -2699,7 +2696,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { OS << getCMHPriorityHintName(); break; case k_MatrixRegister: - OS << "<matrix " << getMatrixReg() << ">"; + OS << "<matrix " << getMatrixReg().id() << ">"; break; case k_MatrixTileList: { OS << "<matrixlist "; @@ -2715,7 +2712,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; } case k_Register: - OS << "<register " << getReg() << ">"; + OS << "<register " << getReg().id() << ">"; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) break; [[fallthrough]]; @@ -3048,53 +3045,53 @@ ParseStatus AArch64AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, } // Matches a register name or register alias previously defined by '.req' -unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, - RegKind Kind) { - unsigned RegNum = 0; - if ((RegNum = matchSVEDataVectorRegName(Name))) - return Kind == RegKind::SVEDataVector ? RegNum : 0; +MCRegister AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + RegKind Kind) { + MCRegister Reg = MCRegister(); + if ((Reg = matchSVEDataVectorRegName(Name))) + return Kind == RegKind::SVEDataVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateVectorRegName(Name))) - return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + if ((Reg = matchSVEPredicateVectorRegName(Name))) + return Kind == RegKind::SVEPredicateVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateAsCounterRegName(Name))) - return Kind == RegKind::SVEPredicateAsCounter ? RegNum : 0; + if ((Reg = matchSVEPredicateAsCounterRegName(Name))) + return Kind == RegKind::SVEPredicateAsCounter ? Reg : MCRegister(); - if ((RegNum = MatchNeonVectorRegName(Name))) - return Kind == RegKind::NeonVector ? RegNum : 0; + if ((Reg = MatchNeonVectorRegName(Name))) + return Kind == RegKind::NeonVector ? Reg : MCRegister(); - if ((RegNum = matchMatrixRegName(Name))) - return Kind == RegKind::Matrix ? RegNum : 0; + if ((Reg = matchMatrixRegName(Name))) + return Kind == RegKind::Matrix ? Reg : MCRegister(); - if (Name.equals_insensitive("zt0")) + if (Name.equals_insensitive("zt0")) return Kind == RegKind::LookupTable ? 
unsigned(AArch64::ZT0) : 0; // The parsed register must be of RegKind Scalar - if ((RegNum = MatchRegisterName(Name))) - return (Kind == RegKind::Scalar) ? RegNum : 0; + if ((Reg = MatchRegisterName(Name))) + return (Kind == RegKind::Scalar) ? Reg : MCRegister(); - if (!RegNum) { + if (!Reg) { // Handle a few common aliases of registers. - if (auto RegNum = StringSwitch<unsigned>(Name.lower()) - .Case("fp", AArch64::FP) - .Case("lr", AArch64::LR) - .Case("x31", AArch64::XZR) - .Case("w31", AArch64::WZR) - .Default(0)) - return Kind == RegKind::Scalar ? RegNum : 0; + if (MCRegister Reg = StringSwitch<unsigned>(Name.lower()) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0)) + return Kind == RegKind::Scalar ? Reg : MCRegister(); // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. auto Entry = RegisterReqs.find(Name.lower()); if (Entry == RegisterReqs.end()) - return 0; + return MCRegister(); - // set RegNum if the match is the right kind of register + // set Reg if the match is the right kind of register if (Kind == Entry->getValue().first) - RegNum = Entry->getValue().second; + Reg = Entry->getValue().second; } - return RegNum; + return Reg; } unsigned AArch64AsmParser::getNumRegsForRegKind(RegKind K) { @@ -3122,8 +3119,8 @@ ParseStatus AArch64AsmParser::tryParseScalarRegister(MCRegister &RegNum) { return ParseStatus::NoMatch; std::string lowerCase = Tok.getString().lower(); - unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); - if (Reg == 0) + MCRegister Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); + if (!Reg) return ParseStatus::NoMatch; RegNum = Reg; @@ -3667,7 +3664,7 @@ ParseStatus AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) { } // Try to parse matrix register. - unsigned Reg = matchRegisterNameAlias(Name, RegKind::Matrix); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::Matrix); if (!Reg) return ParseStatus::NoMatch; @@ -3896,6 +3893,7 @@ static const struct Extension { {"f16mm", {AArch64::FeatureF16MM}}, {"f16f32dot", {AArch64::FeatureF16F32DOT}}, {"f16f32mm", {AArch64::FeatureF16F32MM}}, + {"mops-go", {AArch64::FeatureMOPS_GO}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { @@ -4130,12 +4128,12 @@ bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc, SMLoc startLoc = getLoc(); const AsmToken ®Tok = getTok(); StringRef reg = regTok.getString(); - unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); - if (!RegNum) + MCRegister Reg = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); + if (!Reg) return TokError("expected register operand"); Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); + Reg, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); Lex(); // Eat token if (parseToken(AsmToken::Comma)) @@ -4453,7 +4451,7 @@ ParseStatus AArch64AsmParser::tryParseVectorRegister(MCRegister &Reg, // a '.'. 
size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchRegisterNameAlias(Head, MatchKind); + MCRegister RegNum = matchRegisterNameAlias(Head, MatchKind); if (RegNum) { if (Next != StringRef::npos) { @@ -4937,13 +4935,13 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { const AsmToken &Tok = getTok(); std::string Name = Tok.getString().lower(); - unsigned RegNum = matchRegisterNameAlias(Name, RegKind::LookupTable); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::LookupTable); - if (RegNum == 0) + if (!Reg) return ParseStatus::NoMatch; Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::LookupTable, StartLoc, getLoc(), getContext())); + Reg, RegKind::LookupTable, StartLoc, getLoc(), getContext())); Lex(); // Eat register. // Check if register is followed by an index @@ -5997,6 +5995,33 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, " registers are the same"); break; } + case AArch64::SETGOP: + case AArch64::SETGOPT: + case AArch64::SETGOPN: + case AArch64::SETGOPTN: + case AArch64::SETGOM: + case AArch64::SETGOMT: + case AArch64::SETGOMN: + case AArch64::SETGOMTN: + case AArch64::SETGOE: + case AArch64::SETGOET: + case AArch64::SETGOEN: + case AArch64::SETGOETN: { + MCRegister Xd_wb = Inst.getOperand(0).getReg(); + MCRegister Xn_wb = Inst.getOperand(1).getReg(); + MCRegister Xd = Inst.getOperand(2).getReg(); + MCRegister Xn = Inst.getOperand(3).getReg(); + if (Xd_wb != Xd) + return Error(Loc[0], + "invalid SET instruction, Xd_wb and Xd do not match"); + if (Xn_wb != Xn) + return Error(Loc[0], + "invalid SET instruction, Xn_wb and Xn do not match"); + if (Xd == Xn) + return Error(Loc[0], "invalid SET instruction, destination and size" + " registers are the same"); + break; + } } // Now check immediate ranges. Separate from the above as there is overlap @@ -7651,7 +7676,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { if (parseEOL()) return true; - auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); + auto pair = std::make_pair(RegisterKind, RegNum); if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index dc2feba42c871..4eb762a00d477 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1532,6 +1532,32 @@ static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, return MCDisassembler::Success; } +static DecodeStatus DecodeSETMemGoOpInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, + const MCDisassembler *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + + // None of the registers may alias: if they do, then the instruction is not + // merely unpredictable but actually entirely unallocated. + if (Rd == Rn) + return MCDisassembler::Fail; + + // Rd and Rn register operands are written back, so they appear + // twice in the operand list, once as outputs and once as inputs. 
+ if (!DecodeSimpleRegisterClass<AArch64::GPR64commonRegClassID, 0, 31>( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>( + Inst, Rn, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64commonRegClassID, 0, 31>( + Inst, Rd, Addr, Decoder) || + !DecodeSimpleRegisterClass<AArch64::GPR64RegClassID, 0, 32>( + Inst, Rn, Addr, Decoder)) + return MCDisassembler::Fail; + + return MCDisassembler::Success; +} + static DecodeStatus DecodePRFMRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 14b0f9a564e01..394024693194c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5666,6 +5666,9 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, MachineRegisterInfo &MRI) { LLT DstTy = MRI.getType(Dst); unsigned DstSize = DstTy.getSizeInBits(); + assert((DstSize == 64 || DstSize == 128) && + "Unexpected vector constant size"); + if (CV->isNullValue()) { if (DstSize == 128) { auto Mov = @@ -5735,17 +5738,24 @@ AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV, // Try to create the new constants with MOVI, and if so generate a fneg // for it. if (auto *NewOp = TryMOVIWithBits(NegBits)) { - Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass); + Register NewDst = MRI.createVirtualRegister( + DstSize == 64 ? &AArch64::FPR64RegClass : &AArch64::FPR128RegClass); NewOp->getOperand(0).setReg(NewDst); return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst}); } return nullptr; }; MachineInstr *R; - if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) || - (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) || + if ((R = TryWithFNeg(DefBits, 32, + DstSize == 64 ? AArch64::FNEGv2f32 + : AArch64::FNEGv4f32)) || + (R = TryWithFNeg(DefBits, 64, + DstSize == 64 ? AArch64::FNEGDr + : AArch64::FNEGv2f64)) || (STI.hasFullFP16() && - (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16)))) + (R = TryWithFNeg(DefBits, 16, + DstSize == 64 ? 
AArch64::FNEGv4f16 + : AArch64::FNEGv8f16)))) return R; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 5f93847bc680e..038ad77ae69b2 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_FMAXNUM); case Intrinsic::aarch64_neon_fminnm: return LowerBinOp(TargetOpcode::G_FMINNUM); + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_pmull64: + return LowerBinOp(AArch64::G_PMULL); case Intrinsic::aarch64_neon_smull: return LowerBinOp(AArch64::G_SMULL); case Intrinsic::aarch64_neon_umull: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 6d2d70511e894..6b920f05227ad 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, case TargetOpcode::G_FCMP: case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: + case AArch64::G_PMULL: return true; case TargetOpcode::G_INTRINSIC: switch (cast<GIntrinsic>(MI).getIntrinsicID()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index cd8b2495a4250..67042b700c047 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -371,13 +371,13 @@ class AMDGPUPreloadKernelArgumentsPass class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..b008354cfd462 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2069,6 +2069,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr ]>; def FeatureISAVersion12_50 : FeatureSet< @@ -2143,6 +2144,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, ]>; def FeatureISAVersion12_51 : FeatureSet< @@ -2974,15 +2976,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; + +def isWave32 : Predicate<"Subtarget->isWave32()">, + 
AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; + +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57b9da9e..8838a94a639eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -52,7 +52,7 @@ struct ArgDescriptor { } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + return ArgDescriptor(Arg.Reg.id(), Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { @@ -96,7 +96,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f4dfb8..56ab040706a13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public 
AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. + return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes @@ -1555,7 +1574,7 @@ struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims { AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096f26d67..0688f07873493 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,7 +11,6 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a628be08..f5081a9d2dd56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1248,7 +1248,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index aed325cf627bc..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -224,13 +230,12 @@ bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { Register VCCReg = I.getOperand(1).getReg(); MachineInstr *Cmp; - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Set SCC as a side effect 
with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { unsigned CmpOpc = STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); } else { - // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 - // which sets SCC as a side effect. Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) .addReg(VCCReg) @@ -603,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3788,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -6754,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6807,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a5913293238a..fdff21b6ef8df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. 
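+          // (e.g. when folding has already made Rsrc identical to *MaybeRsrc,
+          // replacing the value with itself is invalid and would also queue a
+          // still-live instruction for deletion.)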
+ if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589eb42c15..d7d0292083e1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -58,6 +58,8 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned BitsPerField = 2; static constexpr unsigned NumFields = 4; static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; using ModeType = PackedVector<unsigned, BitsPerField, std::bitset<BitsPerField * NumFields>>; @@ -82,12 +84,12 @@ class AMDGPULowerVGPREncoding { const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; @@ -108,10 +110,13 @@ class AMDGPULowerVGPREncoding { MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, ModeTy Mask, + MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + setMode(ModeTy(), ModeTy::fullMask(), I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -130,38 +135,43 @@ class AMDGPULowerVGPREncoding { /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); }; bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { + MachineBasicBlock::instr_iterator I) { assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; - } + if ((Delta & Mask.raw_bits()).none()) { + CurrentMask |= Mask; + return false; + } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; + if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { + CurrentMode |= NewMode; + CurrentMask |= Mask; - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + MachineOperand &Op = MostRecentModeSet->getOperand(0); + + // Carry old mode bits from the existing instruction. + int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + + Op.setImm(CurrentMode | OldModeBits); + return true; } + // Record previous mode into high 8 bits of the immediate. 
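+  // As an illustrative sketch with arbitrary values: BitsPerField = 2 and
+  // NumFields = 4 give ModeWidth = 8 and ModeMask = 0xff, so for
+  // CurrentMode = 0x2b and NewMode = 0x01 the new instruction receives the
+  // immediate (0x01 | (0x2b << 8)) == 0x2b01, i.e. the new mode in the low
+  // byte and the previous mode in the high byte. Consumers that only want
+  // the current mode therefore mask the immediate with 0xff.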
+ int64_t OldModeBits = CurrentMode << ModeWidth; + I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode | OldModeBits); CurrentMode = NewMode; CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -233,21 +243,22 @@ bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { if (Ops.first) { ModeTy NewMode, Mask; computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + return setMode(NewMode, Mask, MI.getIterator()); } assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -284,9 +295,9 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { ClauseLen = ClauseRemaining = 0; CurrentMode.reset(); CurrentMask.reset(); - CurrentModeKnown = true; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +305,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -323,14 +333,8 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. 
+ resetMode(MBB.instr_end()); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb3de6be..844649ebb9ae6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -412,7 +412,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab82537800..a3be0f51c2c2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ class AMDGPUPerfHintAnalysis { FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e1879598f098a..907f8300de6d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b84c30ecaac0b..1765d054a3c0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op1Lo32, Op1Hi32] = 
unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -849,10 +868,12 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -867,6 +888,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: @@ -952,10 +975,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -972,6 +997,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1034,10 +1060,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1104,6 +1132,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); @@ -1176,10 +1205,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index ad3ff1d374ec1..e7598f888e4b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,6 +122,7 @@ class RegBankLegalizeHelper { void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void 
lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 01abd358ff595..90114e44f1a48 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: @@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -202,7 +206,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -913,14 +917,39 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); + + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + + addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 030bd75f8cd10..7e4ce7b43dc3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID { P3, P4, P5, + P8, Ptr32, 
Ptr64, Ptr128, @@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID { UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, @@ -92,8 +94,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -134,10 +138,12 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, @@ -178,7 +184,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, @@ -217,6 +225,7 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 54ba2f8c0d519..90d319f578f44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { @@ -5081,17 +5078,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6dac75a17..c37d3096afd3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. 
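// A wave32 condition occupies a single 32-bit SGPR while a wave64 condition
// occupies an SGPR pair, so the bank covers both SReg_32 and SReg_64.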
-def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add891111..b03d50f2d451d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b28c50e3f5b6d..b87b54ffc4f12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -816,7 +816,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 65e6ed9d1d428..c52eb4e477685 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. 
switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return Changed; } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } @@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { auto *II = dyn_cast<IntrinsicInst>(&I); if (!II) continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: - continue; - } IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
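+  // For example (illustrative CFG), a callbr in %bb with two indirect
+  // targets is rewritten as:
+  //   %bb:              br i1 true, label %TransitionBlock, label %DummyReturnBlock
+  //   %TransitionBlock: the original callbr, its successors unchanged
+  // so every divergent exit funnels into the unified return block.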
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd5ebada..ded2f5ae1f8af 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -54,7 +54,7 @@ class AMDGPUWaitSGPRHazards { bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. static std::optional<unsigned> sgprNumber(Register Reg, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 09338c533fdf2..5e0486aa1dd49 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,7 +1865,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -3665,7 +3665,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3679,7 +3680,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3854,9 +3855,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet<unsigned> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet<MCRegister> SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e3f3abae01648..dd3120f05ce26 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1199,8 +1199,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. 
-static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1214,7 +1214,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1456,9 +1456,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79fdabb9..ab130dbb08ff9 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ class AMDGPUDisassembler : public MCDisassembler { const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); 
// Immediates are not an issue and can be propagated in // postrapseudos pass. Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 9fbf9e5fe8eeb..ba458ea4b519d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1228,18 +1228,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. " - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1320,9 +1322,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); @@ -1396,13 +1405,27 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) { + DAG.MinOccupancy = TempTargetOccupancy; + MFI.increaseOccupancy(MF, TempTargetOccupancy); + } + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { @@ -2011,7 +2034,7 @@ void PreRARematStage::rematerialize() { // Rematerialize DefMI to its use block. 
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); + AMDGPU::NoSubRegister, *DefMI); Remat.RematMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); @@ -2163,8 +2186,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI); MachineInstr *NewMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*NewMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781fea9452..95a931b9beb2a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -183,7 +183,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -217,7 +217,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps @@ -417,6 +417,10 @@ class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a4befa5..8ef5874d7baf9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -336,7 +336,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +355,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +403,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } @@ -795,14 +795,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. + if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb364048..28b4da8ab9ebb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d95013123aced..65dce74a1e894 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. 
- let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c920a046..31eca049fd149 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cce97afb..01040854e1577 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30410590..2c00e23d113cb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? @@ -709,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -1129,40 +1133,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. 
This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } @@ -1309,10 +1284,11 @@ void SIFoldOperandsImpl::foldOperand( continue; const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1; - const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); - if (MovSrcRC) { + int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]); + if (RegClassID != -1) { + const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID); + if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); @@ -2419,7 +2395,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8bb28084159e8..768c0abd2e3f1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17367,12 +17367,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
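A tiny standalone illustration of the size check described by the comment above, under the assumption (from the surrounding code) that sizes are counted in 32-bit lanes and the init index must fit inside the destination register; the helper name is made up for the sketch:

// The destination covers InitIdx 32-bit lanes only if its bit width,
// divided by 32, is at least InitIdx; otherwise the init is abandoned.
static bool dstCoversInit(unsigned RegSizeInBits, unsigned InitIdx) {
  return RegSizeInBits / 32 >= InitIdx;
}

int main() {
  // A 128-bit vdst covers a TFE init index of 4; a 96-bit one does not.
  return (dstCoversInit(128, 4) && !dstCoversInit(96, 4)) ? 0 : 1;
}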
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced010a5a..306d59d0867cd 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1288,18 +1288,32 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + // On entry to a block with multiple predecessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, 0); + else + PendingEvents &= ~(1 << SMEM_GROUP); + return; + } // If we have pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads return in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + !hasPendingEvent(STORE_CNT)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + else if (Wait.LoadCnt == 0) + PendingEvents &= ~(1 << VMEM_GROUP); + return; + } applyWaitcnt(X_CNT, Wait.XCnt); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d930a21c2d7f5..00a5a27dc7c93 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -1667,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -1680,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC);
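A minimal sketch of the mixed-event bookkeeping the applyXcnt() hunk above introduces, with illustrative names rather than the pass's real types: when more than one event group is pending for XCNT, a kmcnt==0 wait only retires the SMEM group instead of clearing the whole counter state.

#include <cstdint>

enum XcntEvent { SMEM_GROUP = 0, VMEM_GROUP = 1 };

struct XcntState {
  uint32_t PendingEvents = 0;
  // More than one event group pending at once (the mixed case above).
  bool hasMixed() const { return (PendingEvents & (PendingEvents - 1)) != 0; }
  // Apply a kmcnt==0 wait: SMEM transfers are known complete, but a pending
  // VMEM group must stay tracked, so only the SMEM bit is cleared.
  void applyKmcntZeroWait() {
    if (hasMixed())
      PendingEvents &= ~(1u << SMEM_GROUP);
    else
      PendingEvents = 0; // single group: XCNT is fully resolved
  }
};

int main() {
  XcntState S;
  S.PendingEvents = (1u << SMEM_GROUP) | (1u << VMEM_GROUP);
  S.applyKmcntZeroWait();
  return S.PendingEvents == (1u << VMEM_GROUP) ? 0 : 1;
}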
MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1862,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -2518,8 +2517,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2569,7 +2568,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2599,7 +2598,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair<MachineInstr*, MachineInstr*> @@ -3612,7 +3611,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -3917,6 +3916,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -4044,10 +4046,29 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); + MachineInstr *CandidateMI = &MI; + + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. 
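The placeholder policy described by the comment above is sketched below in standalone form (a vector stands in for the bundle; convertImpl is a made-up stand-in for convertToThreeAddressImpl): only single-instruction bundles are considered, and conversion is attempted on the inner instruction, not on the BUNDLE marker.

#include <vector>

struct Instr { bool Convertible; };

// Hypothetical stand-in for convertToThreeAddressImpl.
static const Instr *convertImpl(const Instr &I) {
  return I.Convertible ? &I : nullptr;
}

static const Instr *convertBundle(const std::vector<Instr> &Bundle) {
  if (Bundle.size() != 1)
    return nullptr; // placeholder: multi-instruction bundles unsupported
  // The real code reaches the bundled instruction via MI.getNextNode().
  return convertImpl(Bundle.front());
}

int main() {
  std::vector<Instr> One{{true}}, Two{{true}, {true}};
  return (convertBundle(One) && !convertBundle(Two)) ? 0 : 1;
}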
+ if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + ThreeAddressUpdates U; - MachineInstr *NewMI = convertToThreeAddressImpl(MI, U); + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; - if (NewMI) { + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); + + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { updateLiveVariables(LV, MI, *NewMI); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -4088,7 +4109,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LV->getVarInfo(DefReg).AliveBlocks.clear(); } - if (LIS) { + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { LiveInterval &DefLI = LIS->getInterval(DefReg); // We cannot delete the original instruction here, so hack out the use @@ -4103,11 +4139,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false, + false, /*isUndef=*/true)); + } + LIS->shrinkToUses(&DefLI); } } - return NewMI; + return MI.isBundle() ? &MI : NewMI; } MachineInstr * @@ -6021,19 +6072,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -// FIXME: This should not be an overridable function. All subtarget dependent -// operand modifications should go through isLookupRegClassByHwMode in the -// generic handling. -const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum >= TID.getNumOperands()) - return nullptr; - const MCOperandInfo &OpInfo = TID.operands()[OpNum]; - int16_t RegClass = getOpRegClassID(OpInfo); - return RI.getRegClass(RegClass); -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MCInstrDesc &Desc = get(MI.getOpcode()); @@ -6049,7 +6087,8 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, return RI.getPhysRegBaseClass(Reg); } - return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); + int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]); + return RegClass < 0 ? nullptr : RI.getRegClass(RegClass); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6153,7 +6192,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. 
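On the getOpRegClass() change above: a register-class ID of -1 is a "no constraint" sentinel, which is now surfaced as a null class instead of being handed to the table lookup. A trivial hedged sketch of that pattern (the two-entry name table is invented; the real lookup is MRI.getRegClass(RCID)):

#include <array>
#include <cstdint>

static const char *lookupRegClassName(int16_t RCID) {
  static const std::array<const char *, 2> Names = {"SReg_32", "VGPR_32"};
  return RCID < 0 ? nullptr : Names.at(RCID); // guard before indexing
}

int main() {
  return (lookupRegClassName(-1) == nullptr && lookupRegClassName(1)) ? 0 : 1;
}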
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6215,8 +6254,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -6801,7 +6840,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -7632,6 +7671,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7869,7 +7910,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7889,12 +7929,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 
3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7945,7 +8010,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,13 +8050,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -8021,7 +8085,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8039,7 +8102,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8063,7 +8125,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8183,7 +8244,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9260,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { @@ -10160,7 +10221,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10618,6 +10679,44 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. +static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI) { + MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10633,23 +10732,10 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (CmpValue != 0) return false; - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; - const auto foldableSelect = [](MachineInstr *Def) -> bool { - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - return true; - } - return false; - }; - // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) 
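A standalone sketch of why the folds above (and the 64-bit variant in the hunk below) are sound: comparing a (scc ? nonZeroImm : 0) select against zero reproduces the original SCC value, so the compare, and in the 64-bit case the s_or_b32 of the two halves, can be deleted. Plain C++ with illustrative names, not the pass's code:

#include <cstdint>

// 32-bit case: select then compare-against-zero reproduces the selector.
static bool selectThenCompare(bool SCC, uint32_t NonZeroImm) {
  uint32_t Selected = SCC ? NonZeroImm : 0; // s_cselect_b32 imm, 0
  return Selected != 0;                     // s_cmp_lg_u32 selected, 0
}

// 64-bit case: OR-ing the sub0/sub1 halves of the select result is
// non-zero exactly when SCC was set, so s_or_b32 + s_cmp_lg fold away too.
static bool orHalvesThenCompare(bool SCC, uint64_t NonZeroImm) {
  uint64_t Sel = SCC ? NonZeroImm : 0; // s_cselect_b64 imm, 0
  uint32_t Lo = uint32_t(Sel);         // copy sX.sub0
  uint32_t Hi = uint32_t(Sel >> 32);   // copy sX.sub1
  return (Lo | Hi) != 0;               // s_or_b32 sLo, sHi
}

int main() {
  for (int SCC = 0; SCC <= 1; ++SCC) {
    if (selectThenCompare(SCC, 42) != bool(SCC))
      return 1;
    if (orHalvesThenCompare(SCC, 0x100000000ull) != bool(SCC))
      return 1;
  }
  return 0;
}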
@@ -10660,24 +10746,38 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero // imm), 0) - if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def)) + if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, RI); + } + } + } return true; }; @@ -10707,8 +10807,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && @@ -10755,21 +10855,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, RI)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..2ecd94186e1e0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
@@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; @@ -307,22 +307,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -1622,10 +1619,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; - void fixImplicitOperands(MachineInstr &MI) const; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, @@ -1687,7 +1680,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. /// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. 
Currently does not diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63eceb5d5c..42e73ec070c15 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias<string From, string To, string VariantName = ""> : MnemonicAlias<From, To, VariantName>, PredicateControl; @@ -776,11 +769,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6f1feb1dc2996..6dd4b1d7bd000 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -791,6 +791,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement fshr using scalar instructions by constructing a +// 64-bit value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc.
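A minimal sketch of what the uniform fshr patterns above compute, as plain C++ rather than TableGen: the two sources are concatenated with $src0 in the high half (sub1) and $src1 in the low half (sub0), shifted right once, and the low 32 bits are kept.

#include <cstdint>

static uint32_t fshr32(uint32_t A, uint32_t B, uint32_t Shift) {
  // REG_SEQUENCE: B into sub0 (low half), A into sub1 (high half).
  uint64_t Wide = (uint64_t(A) << 32) | B;
  // S_LSHR_B64 with the amount masked to 5 bits, then EXTRACT_SUBREG sub0.
  return uint32_t(Wide >> (Shift & 31));
}

int main() {
  // fshr(1, 0, 4): bit 0 of the high word lands in bit 28 of the result.
  return fshr32(1, 0, 4) == 0x10000000u ? 0 : 1;
}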
def SI_CALL : SPseudoInstSI < diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f0d1117664983..fcf91e0cf0a7c 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -233,10 +233,11 @@ class SILoadStoreOptimizer { void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -1336,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1367,10 +1366,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1398,9 +1396,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1456,7 +1454,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1484,7 +1483,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1541,7 +1540,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned 
BaseSubReg = AddrReg->getSubReg(); @@ -1582,7 +1582,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1607,7 +1609,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1618,7 +1620,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1639,7 +1643,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1650,7 +1654,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1680,7 +1686,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1691,7 +1697,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1731,7 +1739,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1742,12 +1750,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL 
= CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1789,7 +1798,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1807,7 +1818,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1818,12 +1829,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2094,12 +2107,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd591a17..9b710013a09ce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 40eeeb8a8630d..cbd08f0fb5dff 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineFunction &MF = *SaveBlock.getParent(); const TargetInstrInfo 
&TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); - if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) { for (const CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + const TargetRegisterClass *RC = RI->getMinimalPhysRegClass( Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens // since we pass some special inputs (workgroup IDs) in the callee saved // range. - const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI); + const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI); TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), - RC, TRI, Register()); + RC, Register()); if (Indexes) { assert(std::distance(MIS.begin(), I) == 1); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639b6de09..86ca22cfeffd8 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != 
std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a6c1af24e13e9..ecf3aee6048cd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3046,7 +3046,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. - bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC @@ -3741,18 +3741,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); + // TODO: This should be more aggressive, but be more cautious with very wide + // tuples. unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. - if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; + return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || + NewSize <= getRegSizeInBits(*DstRC); } unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, @@ -3788,7 +3781,7 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, llvm_unreachable("Unexpected register pressure set!"); } -const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { +const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const { static const int Empty[] = { -1 }; if (RegPressureIgnoredUnits[RegUnit]) @@ -3915,20 +3908,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } -const TargetRegisterClass * -SIRegisterInfo::getRegClass(unsigned RCID) const { - switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); - case -1: - return nullptr; - default: - return AMDGPUGenRegisterInfo::getRegClass(RCID); - } -} - // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7bc581f..cd4dc9bc4d037 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -357,7 +357,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const int *getRegUnitPressureSets(unsigned RegUnit) const override; + const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override; MCRegister getReturnAddressReg(const MachineFunction &MF) const; @@ -391,8 +391,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { MCRegister getExec() const; - const TargetRegisterClass *getRegClass(unsigned RCID) const; - // Find reaching register definition MachineInstr *findReachingDef(Register Reg, unsigned SubReg, 
MachineInstr &Use, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a0d2b93..abe12c17ae76c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artificial classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations.
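To make the cross-product idea above concrete, here is a hedged C++ sketch (illustrative enums and names, not the TableGen machinery): each RegClassByHwMode row is conceptually a function from (wave size, VGPR mode) to a register class, and for SReg_1 that function depends only on wave size, which is why the alignment modes repeat the same entry.

#include <cstring>

enum class Wave { W32, W64 };
enum class VgprMode { Default, Av2LoadStore, AlignedNoAGPR };

// Illustrative stand-in for the SReg_1 rows above.
static const char *sreg1ClassFor(Wave W, VgprMode /*Mode*/) {
  return W == Wave::W64 ? "SReg_64" : "SReg_32";
}

int main() {
  return (std::strcmp(sreg1ClassFor(Wave::W64, VgprMode::Av2LoadStore),
                      "SReg_64") == 0 &&
          std::strcmp(sreg1ClassFor(Wave::W32, VgprMode::Default),
                      "SReg_32") == 0)
             ? 0 : 1;
}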
+ def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast<RegisterClass>("AReg_"#RegSize), + /*unused combination*/ !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + 
[DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,8 +1323,8 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058726dbb..37bf2d2463ae2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -897,7 +897,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +914,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +969,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +984,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? 
GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -2697,8 +2698,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -3361,7 +3362,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3383,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5e3195b36fe4c..9f65f9326a73e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -909,7 +909,7 @@ class InstInfo { const ComponentInfo CompInfo[COMPONENTS_NUM]; public: - using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>; + using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>; InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY) : CompInfo{OpX, OpY} {} @@ -932,9 +932,10 @@ class InstInfo { // even though it violates requirement to be from different banks. // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. - bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx, - const MCRegisterInfo &MRI, bool SkipSrc = false, - bool AllowSameVGPR = false, bool VOPD3 = false) const { + bool + hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, VOPD3) .has_value(); @@ -949,14 +950,14 @@ class InstInfo { // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. 
std::optional<unsigned> getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc = false, bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const; }; @@ -1599,7 +1600,7 @@ LLVM_READNONE MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE -bool isInlineValue(unsigned Reg); +bool isInlineValue(MCRegister Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). @@ -1798,16 +1799,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID); /// \returns a register class for the physical register \p Reg if it is a VGPR /// or nullptr otherwise. -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI); /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the /// physical register \p Reg. -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI); +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI); // Returns a table for the opcode with a given \p Desc to map the VGPR MSB // set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2 diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e02ed47e..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + 
V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..786e75f081e44 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -2216,7 +2218,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{23-16} = LdScaleOp; let Inst{40-32} = scale_src0; let Inst{49-41} = scale_src1; - let Inst{58-50} = 0; // scale src2 + let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy) let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) let Inst{60} = 0; // scale_op_sel_hi(1) let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) @@ -2431,6 +2433,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> : VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>; +multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> { + defvar ps = !cast<VOP3P_Pseudo>(NAME); + def _gfx1250 : + VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>, + VOP3Pe_gfx11_gfx12<op, ps.Pfl> { + let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy) + } +} + defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; @@ -2460,8 +2471,8 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_LD_SCALE_gfx1250<0x35>; +defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_LD_SCALE_gfx1250<0x3a>; let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec52294e9..a829b807f33e8 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8325c628d68d6..ea3edb8ca6662 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1357,8 +1357,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> : class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), @@ -2204,12 +2208,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..e17ecbf87faae 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,8 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), + RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; @@ -293,8 +294,7 @@ void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void ARCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); @@ -306,11 +306,11 @@ void ARCInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte stores to stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI) + LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, &TRI) << " to FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::ST_rs9)) .addReg(SrcReg, getKillRegState(IsKill)) @@ -323,7 +323,6 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator 
I, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -335,11 +334,11 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte loads from stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI) + LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, &TRI) << " from FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::LD_rs9)) .addReg(DestReg, RegState::Define) diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h index 2cf05ba57bd4b..ebeaf877f8436 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.h +++ b/llvm/lib/Target/ARC/ARCInstrInfo.h @@ -70,14 +70,12 @@ class ARCInstrInfo : public ARCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool diff --git a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h index cace92a2b8fb6..9c21121b382b4 100644 --- a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h +++ b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h @@ -14,7 +14,6 @@ #define LLVM_LIB_TARGET_ARC_ARCMACHINEFUNCTIONINFO_H #include "llvm/CodeGen/MachineFunction.h" -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 36b99087e0a32..2d2e62c80c702 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -97,7 +97,8 @@ void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) { const MCExpr *E = MCSymbolRefExpr::create( GetARMGVSymbol(GV, ARMII::MO_NO_FLAG), - (Subtarget->isTargetELF() ? ARM::S_TARGET1 : ARM::S_None), OutContext); + (TM.getTargetTriple().isOSBinFormatELF() ? 
ARM::S_TARGET1 : ARM::S_None), + OutContext); OutStreamer->emitValue(E, Size); } @@ -595,8 +596,7 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) { ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); if (OptimizationGoals > 0 && - (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || - Subtarget->isTargetMuslAEABI())) + (TT.isTargetAEABI() || TT.isTargetGNUAEABI() || TT.isTargetMuslAEABI())) ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); OptimizationGoals = -1; @@ -884,9 +884,10 @@ static uint8_t getModifierSpecifier(ARMCP::ARMCPModifier Modifier) { MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags) { - if (Subtarget->isTargetMachO()) { + const Triple &TT = TM.getTargetTriple(); + if (TT.isOSBinFormatMachO()) { bool IsIndirect = - (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV); + (TargetFlags & ARMII::MO_NONLAZY) && getTM().isGVIndirectSymbol(GV); if (!IsIndirect) return getSymbol(GV); @@ -903,9 +904,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); return MCSym; - } else if (Subtarget->isTargetCOFF()) { - assert(Subtarget->isTargetWindows() && - "Windows is the only supported COFF target"); + } else if (TT.isOSBinFormatCOFF()) { + assert(TT.isOSWindows() && "Windows is the only supported COFF target"); bool IsIndirect = (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)); @@ -932,7 +932,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, } return MCSym; - } else if (Subtarget->isTargetELF()) { + } else if (TT.isOSBinFormatELF()) { return getSymbolPreferLocal(*GV); } llvm_unreachable("unexpected target"); @@ -978,7 +978,8 @@ void ARMAsmPrinter::emitMachineConstantPoolValue( // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so // flag the global as MO_NONLAZY. - unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0; + unsigned char TF = + TM.getTargetTriple().isOSBinFormatMachO() ? 
ARMII::MO_NONLAZY : 0; MCSym = GetARMGVSymbol(GV, TF); } else if (ACPV->isMachineBasicBlock()) { const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB(); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..6077c18463240 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI) + : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) @@ -928,15 +929,15 @@ ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI, return TargetInstrInfo::describeLoadedValue(MI, Reg); } -const MachineInstrBuilder & -ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const { +const MachineInstrBuilder &ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, + unsigned Reg, + unsigned SubIdx, + unsigned State) const { if (!SubIdx) return MIB.addReg(Reg, State); if (Register::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(getRegisterInfo().getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -944,18 +945,18 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); Align Alignment = MFI.getObjectAlign(FI); + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH)) @@ -1010,8 +1011,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { if (Subtarget.hasV5TEOps()) { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1021,8 +1022,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1072,9 +1073,9 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) 
.add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + AddDReg(MIB, SrcReg, ARM::dsub_2, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1104,10 +1105,10 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1124,14 +1125,14 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0); } else llvm_unreachable("Unknown reg class!"); break; @@ -1207,10 +1208,12 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, return false; } -void ARMBaseInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); @@ -1220,7 +1223,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg) @@ -1271,8 +1275,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( if (Subtarget.hasV5TEOps()) { MIB = BuildMI(MBB, I, DL, get(ARM::LDRD)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); 
MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1282,8 +1286,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); } if (DestReg.isPhysical()) @@ -1329,9 +1333,9 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1358,10 +1362,10 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1379,14 +1383,14 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } else @@ -1652,8 +1656,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo 
&TRI) const { + const MachineInstr &Orig) const { unsigned Opcode = Orig.getOpcode(); switch (Opcode) { default: { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..04e2ab055cf1a 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { protected: // Can be only subclassed. - explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + explicit ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI); void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, unsigned LoadOpc) const; @@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMBaseRegisterInfo &getRegisterInfo() const { + return static_cast<const ARMBaseRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } + const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * @@ -211,14 +216,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -227,16 +231,14 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; MachineInstr & duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const override; const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const; + unsigned SubIdx, unsigned State) const; bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const override; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index ce1cdb35116cc..80921ce4fb4dd 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -708,7 +708,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -881,8 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register PredReg = (PIdx == 
-1) ? Register() : MI.getOperand(PIdx+1).getReg(); const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = - TII.getRegClass(MCID, FIOperandNum, this); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, FIOperandNum); if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg))) // Must be addrmode4/6. diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f43ec73db7e1f..80494d993f425 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -51,7 +51,6 @@ #include <cassert> #include <cstdint> #include <iterator> -#include <utility> #include <vector> using namespace llvm; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 138981ad92a87..21a113572ce93 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2342,7 +2342,6 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { @@ -2364,7 +2363,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, break; const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i); if (RegClass && !RegClass->contains(ARM::SP)) HasNonSPFrameIndex = true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..f28640ce7b107 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -2510,9 +2510,44 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) { Chain = DAG.getStackArgumentTokenFactor(Chain); - if (ByValTempChain) + if (ByValTempChain) { + // In case of large byval copies, re-using the stackframe for tail-calls + // can lead to overwriting incoming arguments on the stack. Force + // loading these stack arguments before the copy to avoid that. 
+ SmallVector<SDValue, 8> IncomingLoad; + for (unsigned I = 0; I < OutVals.size(); ++I) { + if (Outs[I].Flags.isByVal()) + continue; + + SDValue OutVal = OutVals[I]; + LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal); + if (!OutLN) + continue; + + FrameIndexSDNode *FIN = + dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr()); + if (!FIN) + continue; + + if (!MFI.isFixedObjectIndex(FIN->getIndex())) + continue; + + for (const CCValAssign &VA : ArgLocs) { + if (VA.isMemLoc()) + IncomingLoad.push_back(OutVal.getValue(1)); + } + } + + // Update the chain to force loads of potentially clobbered incoming + // arguments to happen before the byval copy. + if (!IncomingLoad.empty()) { + IncomingLoad.push_back(Chain); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad); + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain); + } AfterFormalArgLoads = true; } @@ -9855,76 +9890,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field.
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10691,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7252e5d..f37054736b730 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2c630e4..9feaf1440f2b2 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index cd4299b7a1a53..db37b769efcad 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps( Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI->constrainRegClass(FirstReg, TRC); MRI->constrainRegClass(SecondReg, TRC); @@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, MachineFunction *MF = MI->getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MCInstrDesc &MCID = TII->get(MI->getOpcode()); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp); MRI.constrainRegClass(NewBaseReg, TRC); int OldOffset = MI->getOperand(BaseOp + 1).getImm(); @@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, const MCInstrDesc &MCID = TII->get(NewOpcode); // Constrain the def register class - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI.constrainRegClass(NewReg, TRC); // And do the same for the base operand - TRC = TII->getRegClass(MCID, 2, TRI); + TRC = TII->getRegClass(MCID, 2); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 597d311989b2f..1719165fb6717 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1051,7 +1051,6 @@ bool LowOverheadLoop::ValidateLiveOuts() { // check where it gets its false lanes from, if any. int InactiveIdx = findVPTInactiveOperandIdx(*MI); if (InactiveIdx != -1) { - SmallPtrSet<MachineInstr *, 2> Defs; MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef( MI, MI->getOperand(InactiveIdx).getReg()); if (FalseSrc) { diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 72eb3d0f8b7f4..b6897608a952c 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" -#include <utility> namespace llvm { diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index ebfa593fbe9e6..bf7c962f02efc 100644 --- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -47,9 +47,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( // Only use a specialized AEABI function if the default version of this // Libcall is an AEABI function. - if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) - return SDValue(); - + // // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be // able to translate memset to memclr and use the value to index the function // name array. 
@@ -61,12 +59,21 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( } AEABILibcall; switch (LC) { case RTLIB::MEMCPY: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memcpy) + return SDValue(); + AEABILibcall = AEABI_MEMCPY; break; case RTLIB::MEMMOVE: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memmove) + return SDValue(); + AEABILibcall = AEABI_MEMMOVE; break; case RTLIB::MEMSET: + if (TLI->getLibcallImpl(LC) != RTLIB::impl___aeabi_memset) + return SDValue(); + AEABILibcall = AEABI_MEMSET; if (isNullConstant(Src)) AEABILibcall = AEABI_MEMCLR; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 58bc338b25856..7ec232ae9bac5 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -318,17 +318,7 @@ bool ARMSubtarget::isRWPI() const { } bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const { - if (!TM.shouldAssumeDSOLocal(GV)) - return true; - - // 32 bit macho has no relocation for a-b if a is undefined, even if b is in - // the section that is being relocated. This means we have to use o load even - // for GVs that are known to be local to the dso. - if (isTargetMachO() && TM.isPositionIndependent() && - (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) - return true; - - return false; + return TM.isGVIndirectSymbol(GV); } bool ARMSubtarget::isGVInGOT(const GlobalValue *GV) const { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h index c417c4c8bae65..1f74e9fdd1dc9 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -98,6 +98,20 @@ class ARMBaseTargetMachine : public CodeGenTargetMachineImpl { return true; } + bool isGVIndirectSymbol(const GlobalValue *GV) const { + if (!shouldAssumeDSOLocal(GV)) + return true; + + // 32 bit macho has no relocation for a-b if a is undefined, even if b is in + // the section that is being relocated. This means we have to use a load + // even for GVs that are known to be local to the dso. + if (getTargetTriple().isOSBinFormatMachO() && isPositionIndependent() && + (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) + return true; + + return false; + } + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1.
We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f60660b12baca..1bb670d195a98 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -426,15 +426,15 @@ class ARMAsmParser : public MCTargetAsmParser { VPTState.CurPosition = ~0U; } - void Note(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + void Note(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Note(L, Msg, Range); } - bool Warning(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Warning(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Warning(L, Msg, Range); } - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt) { + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}) { return getParser().Error(L, Msg, Range); } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index e67db8e3159c0..b119146576569 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1402,7 +1402,7 @@ static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::createImm(U | (imm << 4) | Rm)); } else { if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(U)); } @@ -1922,7 +1922,7 @@ static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, imm |= fieldFromInstruction(Insn, 24, 1) << 1; if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, true, 4, Inst, Decoder)) - Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); + Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); return S; } @@ -3703,17 +3703,17 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, Rdm |= fieldFromInstruction(Insn, 7, 1) << 3; if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; } else if (Inst.getOpcode() == ARM::tADDspr) { unsigned Rm = fieldFromInstruction(Insn, 3, 4); Inst.addOperand(MCOperand::createReg(ARM::SP)); Inst.addOperand(MCOperand::createReg(ARM::SP)); if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) - return MCDisassembler::Fail; + return MCDisassembler::Fail; } return S; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 01fe13b343926..f8196e460ae9c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1238,7 +1238,7 @@ uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( // Verify standard frame (lr/r7) was used. 
if (CFARegister != ARM::R7) { DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " - << CFARegister + << CFARegister.id() << " instead of r7\n"); return CU::UNWIND_ARM_MODE_DWARF; } diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 8e1bf1d957400..eb237b4275cc9 100644 --- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -283,7 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..01f588f0cdc38 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { @@ -116,7 +116,6 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { assert((RC == &ARM::tGPRRegClass || @@ -142,10 +141,12 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void Thumb1InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || (DestReg.isPhysical() && isARMLowRegister(DestReg))) && "Unknown regclass!"); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..289a30a4ca1e4 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, @@ -43,14 +43,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..efb92c9bcac18 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { @@ -165,7 +165,6 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -197,20 +196,22 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); return; } - ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI, + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, Register()); } -void Thumb2InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb2InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -238,8 +239,8 @@ void Thumb2InstrInfo::loadRegFromStackSlot( } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); 
MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); if (DestReg.isPhysical()) @@ -247,8 +248,7 @@ void Thumb2InstrInfo::loadRegFromStackSlot( return; } - ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI, - Register()); + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, Register()); } void Thumb2InstrInfo::expandLoadStackGuard( @@ -564,7 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx); // Memory operands in inline assembly always use AddrModeT2_i12. if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..1e11cb37efc05 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -44,21 +44,20 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } MachineInstr *optimizeSelect(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &SeenMIs, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..6c37ba1411dde 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -30,8 +30,8 @@ namespace llvm { AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) - : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), - STI(STI) {} + : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), + RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -126,8 +126,7 @@ Register AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI, void AVRInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>(); @@ -142,9 +141,9 @@ void AVRInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (RI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (RI.isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); @@ -161,7 +160,6 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -173,9 +171,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (TRI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (TRI.isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //: FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 759aea2010962..4db535a990451 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -79,13 +79,11 @@ class AVRInstrInfo : public AVRGenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const 
override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h index 0daeeb8f11cfe..338a7c8082ca3 100644 --- a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" -#include <optional> namespace llvm { diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index fc794c4968b8c..48452f6d9391c 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -252,7 +252,7 @@ class AVROperand : public MCParsedAsmOperand { O << "Token: \"" << getToken() << "\""; break; case k_Register: - O << "Register: " << getReg(); + O << "Register: " << getReg().id(); break; case k_Immediate: O << "Immediate: \""; @@ -262,7 +262,7 @@ case k_Memri: { // only manually print the size for non-negative values, // as the sign is inserted automatically. - O << "Memri: \"" << getReg() << '+'; + O << "Memri: \"" << getReg().id() << '+'; MAI.printExpr(O, *getImm()); O << "\""; break; diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 5548ad1ebff5e..84a64ba0aa4ff 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -82,7 +82,7 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Register = GPRDecoderTable[RegNo]; + MCRegister Register = GPRDecoderTable[RegNo]; Inst.addOperand(MCOperand::createReg(Register)); return MCDisassembler::Success; } @@ -174,7 +174,7 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { // Get the register that will be loaded or stored. - unsigned RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; + MCRegister RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; // Decode LDD/STD with offset less than 8. if ((Insn & 0xf000) == 0x8000) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 4bb16e237db48..fbb130ccde681 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -96,7 +96,7 @@ AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue, EncodedValue |= (1 << 12); // Encode the pointer register. 
- switch (MI.getOperand(Idx).getReg()) { + switch (MI.getOperand(Idx).getReg().id()) { case AVR::R27R26: EncodedValue |= 0xc; break; diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index d96f403d2f814..9f86322a81b3e 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -172,7 +172,7 @@ struct BPFOperand : public MCParsedAsmOperand { break; case Register: OS << "<register x"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 8c7bc2f0f6716..81303faf77b58 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -97,7 +97,6 @@ #define DEBUG_TYPE "bpf-abstract-member-access" namespace llvm { -constexpr StringRef BPFCoreSharedInfo::AmaAttr; uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB, diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,16 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -195,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -207,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git 
a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 6e5520c3dbb18..3c61216cd9327 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -803,26 +803,6 @@ SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return getAddr(N, DAG); } -const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((BPFISD::NodeType)Opcode) { - case BPFISD::FIRST_NUMBER: - break; - case BPFISD::RET_GLUE: - return "BPFISD::RET_GLUE"; - case BPFISD::CALL: - return "BPFISD::CALL"; - case BPFISD::SELECT_CC: - return "BPFISD::SELECT_CC"; - case BPFISD::BR_CC: - return "BPFISD::BR_CC"; - case BPFISD::Wrapper: - return "BPFISD::Wrapper"; - case BPFISD::MEMCPY: - return "BPFISD::MEMCPY"; - } - return nullptr; -} - static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(), diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 5243d4944667d..3d6e7c70df28b 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -20,17 +20,6 @@ namespace llvm { class BPFSubtarget; -namespace BPFISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - RET_GLUE, - CALL, - SELECT_CC, - BR_CC, - Wrapper, - MEMCPY -}; -} class BPFTargetLowering : public TargetLowering { public: @@ -39,9 +28,6 @@ class BPFTargetLowering : public TargetLowering { // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - // This method returns the name of a target specific DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - // This method decides whether folding a constant offset // with the given GlobalAddress is legal. 
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..095e2497eec17 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) - : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} + : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -127,7 +127,6 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -148,10 +147,12 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void BPFInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 911e880166d29..d3ef9bc164f4a 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -39,14 +39,13 @@ class BPFInstrInfo : public BPFGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 51c32b22510f0..bdacf9cc3a6ab 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -41,14 +41,12 @@ def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def BPFcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_BPFCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>; +def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC, [SDNPHasChain]>; def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC>; def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>; def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY, - [SDNPHasChain, SDNPInGlue, SDNPOutGlue, - SDNPMayStore, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def BPFIsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; def 
BPFIsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">; diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index d3b0c0246581b..6a11ea6bf99d9 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -27,10 +27,6 @@ #define DEBUG_TYPE "bpf-preserve-di-type" -namespace llvm { -constexpr StringRef BPFCoreSharedInfo::TypeIdAttr; -} // namespace llvm - using namespace llvm; namespace { diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp index 3e29e6c7ed386..0e6d35dd3781f 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp @@ -10,12 +10,20 @@ // //===----------------------------------------------------------------------===// +#include "BPFSelectionDAGInfo.h" #include "BPFTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "BPFGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "bpf-selectiondag-info" +BPFSelectionDAGInfo::BPFSelectionDAGInfo() + : SelectionDAGGenTargetInfo(BPFGenSDNodeInfo) {} + SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, @@ -31,11 +39,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy( if (StoresNumEstimate > getCommonMaxStoresPerMemFunc()) return SDValue(); - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - - Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src, - DAG.getConstant(CopyLen, dl, MVT::i64), - DAG.getConstant(Alignment.value(), dl, MVT::i64)); - - return Dst.getValue(0); + return DAG.getNode(BPFISD::MEMCPY, dl, MVT::Other, Chain, Dst, Src, + DAG.getConstant(CopyLen, dl, MVT::i64), + DAG.getConstant(Alignment.value(), dl, MVT::i64)); } diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h index 79f05e57bb5cd..7345d2d7e4738 100644 --- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h +++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h @@ -15,10 +15,15 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "BPFGenSDNodeInfo.inc" + namespace llvm { -class BPFSelectionDAGInfo : public SelectionDAGTargetInfo { +class BPFSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + BPFSelectionDAGInfo(); + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, @@ -27,9 +32,8 @@ class BPFSelectionDAGInfo : public SelectionDAGTargetInfo { MachinePointerInfo SrcPtrInfo) const override; unsigned getCommonMaxStoresPerMemFunc() const { return 128; } - }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 3678f1335ca36..fa539a0a7b806 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM BPFGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM BPFGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM BPFGenGlobalISel.inc -gen-global-isel) tablegen(LLVM BPFGenRegisterBank.inc -gen-register-bank) diff 
--git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 7885d93cbad98..a2cf0a57675c7 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -48,7 +48,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <cstdint> #include <iterator> diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index e81bb4745faff..98798275e7979 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -476,7 +476,7 @@ bool CSKYFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, Register()); } @@ -498,8 +498,7 @@ bool CSKYFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : reverse(CSI)) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index e5b4f6eeb7b73..08f196b248029 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -884,13 +884,13 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t4}", CSKY::R20) .Case("{t5}", CSKY::R21) .Case("{t6}", CSKY::R22) - .Cases("{t7}", "{fp}", CSKY::R23) - .Cases("{t8}", "{top}", CSKY::R24) - .Cases("{t9}", "{bsp}", CSKY::R25) + .Cases({"{t7}", "{fp}"}, CSKY::R23) + .Cases({"{t8}", "{top}"}, CSKY::R24) + .Cases({"{t9}", "{bsp}"}, CSKY::R25) .Case("{r26}", CSKY::R26) .Case("{r27}", CSKY::R27) - .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28) - .Cases("{tb}", "{rtb}", CSKY::R29) + .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28) + .Cases({"{tb}", "{rtb}"}, CSKY::R29) .Case("{svbr}", CSKY::R30) .Case("{tls}", CSKY::R31) .Default(CSKY::NoRegister); @@ -907,38 +907,38 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. 
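// A minimal sketch of the initializer-list Cases() overload used below; each
// braced list groups the constraint aliases that resolve to one register:
//   StringSwitch<unsigned>(Constraint.lower())
//       .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32)
//       .Default(CSKY::NoRegister);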
if (Subtarget.useHardFloat()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{fr0}", "{vr0}", CSKY::F0_32) - .Cases("{fr1}", "{vr1}", CSKY::F1_32) - .Cases("{fr2}", "{vr2}", CSKY::F2_32) - .Cases("{fr3}", "{vr3}", CSKY::F3_32) - .Cases("{fr4}", "{vr4}", CSKY::F4_32) - .Cases("{fr5}", "{vr5}", CSKY::F5_32) - .Cases("{fr6}", "{vr6}", CSKY::F6_32) - .Cases("{fr7}", "{vr7}", CSKY::F7_32) - .Cases("{fr8}", "{vr8}", CSKY::F8_32) - .Cases("{fr9}", "{vr9}", CSKY::F9_32) - .Cases("{fr10}", "{vr10}", CSKY::F10_32) - .Cases("{fr11}", "{vr11}", CSKY::F11_32) - .Cases("{fr12}", "{vr12}", CSKY::F12_32) - .Cases("{fr13}", "{vr13}", CSKY::F13_32) - .Cases("{fr14}", "{vr14}", CSKY::F14_32) - .Cases("{fr15}", "{vr15}", CSKY::F15_32) - .Cases("{fr16}", "{vr16}", CSKY::F16_32) - .Cases("{fr17}", "{vr17}", CSKY::F17_32) - .Cases("{fr18}", "{vr18}", CSKY::F18_32) - .Cases("{fr19}", "{vr19}", CSKY::F19_32) - .Cases("{fr20}", "{vr20}", CSKY::F20_32) - .Cases("{fr21}", "{vr21}", CSKY::F21_32) - .Cases("{fr22}", "{vr22}", CSKY::F22_32) - .Cases("{fr23}", "{vr23}", CSKY::F23_32) - .Cases("{fr24}", "{vr24}", CSKY::F24_32) - .Cases("{fr25}", "{vr25}", CSKY::F25_32) - .Cases("{fr26}", "{vr26}", CSKY::F26_32) - .Cases("{fr27}", "{vr27}", CSKY::F27_32) - .Cases("{fr28}", "{vr28}", CSKY::F28_32) - .Cases("{fr29}", "{vr29}", CSKY::F29_32) - .Cases("{fr30}", "{vr30}", CSKY::F30_32) - .Cases("{fr31}", "{vr31}", CSKY::F31_32) + .Cases({"{fr0}", "{vr0}"}, CSKY::F0_32) + .Cases({"{fr1}", "{vr1}"}, CSKY::F1_32) + .Cases({"{fr2}", "{vr2}"}, CSKY::F2_32) + .Cases({"{fr3}", "{vr3}"}, CSKY::F3_32) + .Cases({"{fr4}", "{vr4}"}, CSKY::F4_32) + .Cases({"{fr5}", "{vr5}"}, CSKY::F5_32) + .Cases({"{fr6}", "{vr6}"}, CSKY::F6_32) + .Cases({"{fr7}", "{vr7}"}, CSKY::F7_32) + .Cases({"{fr8}", "{vr8}"}, CSKY::F8_32) + .Cases({"{fr9}", "{vr9}"}, CSKY::F9_32) + .Cases({"{fr10}", "{vr10}"}, CSKY::F10_32) + .Cases({"{fr11}", "{vr11}"}, CSKY::F11_32) + .Cases({"{fr12}", "{vr12}"}, CSKY::F12_32) + .Cases({"{fr13}", "{vr13}"}, CSKY::F13_32) + .Cases({"{fr14}", "{vr14}"}, CSKY::F14_32) + .Cases({"{fr15}", "{vr15}"}, CSKY::F15_32) + .Cases({"{fr16}", "{vr16}"}, CSKY::F16_32) + .Cases({"{fr17}", "{vr17}"}, CSKY::F17_32) + .Cases({"{fr18}", "{vr18}"}, CSKY::F18_32) + .Cases({"{fr19}", "{vr19}"}, CSKY::F19_32) + .Cases({"{fr20}", "{vr20}"}, CSKY::F20_32) + .Cases({"{fr21}", "{vr21}"}, CSKY::F21_32) + .Cases({"{fr22}", "{vr22}"}, CSKY::F22_32) + .Cases({"{fr23}", "{vr23}"}, CSKY::F23_32) + .Cases({"{fr24}", "{vr24}"}, CSKY::F24_32) + .Cases({"{fr25}", "{vr25}"}, CSKY::F25_32) + .Cases({"{fr26}", "{vr26}"}, CSKY::F26_32) + .Cases({"{fr27}", "{vr27}"}, CSKY::F27_32) + .Cases({"{fr28}", "{vr28}"}, CSKY::F28_32) + .Cases({"{fr29}", "{vr29}"}, CSKY::F29_32) + .Cases({"{fr30}", "{vr30}"}, CSKY::F30_32) + .Cases({"{fr31}", "{vr31}"}, CSKY::F31_32) .Default(CSKY::NoRegister); if (FReg != CSKY::NoRegister) { assert(CSKY::F0_32 <= FReg && FReg <= CSKY::F31_32 && "Unknown fp-reg"); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..3ab09902be3aa 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -24,8 +24,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "CSKYGenInstrInfo.inc" -CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), +CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI, + const CSKYRegisterInfo &TRI) + : CSKYGenInstrInfo(STI, 
TRI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); @@ -393,7 +394,6 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -434,10 +434,12 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void CSKYInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 6451c0af14fc0..d1cd0395f3b95 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -21,6 +21,7 @@ namespace llvm { +class CSKYRegisterInfo; class CSKYSubtarget; class CSKYInstrInfo : public CSKYGenInstrInfo { @@ -33,7 +34,7 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { const CSKYSubtarget &STI; public: - explicit CSKYInstrInfo(const CSKYSubtarget &STI); + CSKYInstrInfo(const CSKYSubtarget &STI, const CSKYRegisterInfo &RI); Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -42,14 +43,12 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp index a554d1c0e739b..94e412ec81725 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp @@ -92,7 +92,7 @@ CSKYSubtarget::CSKYSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const TargetMachine &TM) : CSKYGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering(initializeSubtargetDependencies(TT, CPU, TuneCPU, FS)), - InstrInfo(*this), RegInfo(), TLInfo(TM, *this) { + InstrInfo(*this, RegInfo), TLInfo(TM, *this) { TSInfo = std::make_unique<CSKYSelectionDAGInfo>(); } diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.h b/llvm/lib/Target/CSKY/CSKYSubtarget.h index a3f2ddcb7165b..f5ad26a20d8a5 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.h +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.h @@ -30,8 +30,8 @@ class CSKYSubtarget : public CSKYGenSubtargetInfo { virtual void anchor(); CSKYFrameLowering FrameLowering; - CSKYInstrInfo InstrInfo; CSKYRegisterInfo RegInfo; + 
CSKYInstrInfo InstrInfo; CSKYTargetLowering TLInfo; std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..95577dd668e1e 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,7 +29,6 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <cstdint> -#include <optional> using namespace llvm; using namespace llvm::dxil; @@ -194,9 +193,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; @@ -284,6 +284,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo( PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX; PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY; PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ; + if (MMI.EntryPropertyVec[0].WaveSizeMin) { + PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin; + PSV.BaseData.MaximumWaveLaneCount = + MMI.EntryPropertyVec[0].WaveSizeMax + ? MMI.EntryPropertyVec[0].WaveSizeMax + : MMI.EntryPropertyVec[0].WaveSizeMin; + } break; default: break; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index d507d71b99fc9..9f1616f6960fe 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -304,40 +304,76 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { GEPOperator *GOp = cast<GEPOperator>(&GEPI); Value *PtrOperand = GOp->getPointerOperand(); Type *NewGEPType = GOp->getSourceElementType(); - bool NeedsTransform = false; // Unwrap GEP ConstantExprs to find the base operand and element type - while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) { - if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) { - GOp = GEPCE; - PtrOperand = GEPCE->getPointerOperand(); - NewGEPType = GEPCE->getSourceElementType(); - } else - break; + while (auto *GEPCE = dyn_cast_or_null<GEPOperator>( + dyn_cast<ConstantExpr>(PtrOperand))) { + GOp = GEPCE; + PtrOperand = GEPCE->getPointerOperand(); + NewGEPType = 
GEPCE->getSourceElementType(); } + Type *const OrigGEPType = NewGEPType; + Value *const OrigOperand = PtrOperand; + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { NewGEPType = NewGlobal->getValueType(); PtrOperand = NewGlobal; - NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); if (isa<ArrayType>(AllocatedType) && - AllocatedType != GOp->getResultElementType()) { + AllocatedType != GOp->getResultElementType()) NewGEPType = AllocatedType; - NeedsTransform = true; + } else + return false; // Only GEPs into an alloca or global variable are considered + + // Defer changing i8 GEP types until dxil-flatten-arrays + if (OrigGEPType->isIntegerTy(8)) + NewGEPType = OrigGEPType; + + // If the original type is a "sub-type" of the new type, then ensure the gep + // correctly zero-indexes the extra dimensions to keep the offset calculation + // correct. + // Eg: + // i32, [4 x i32] and [8 x [4 x i32]] are sub-types of [8 x [4 x i32]], etc. + // + // So then: + // gep [4 x i32] %idx + // -> gep [8 x [4 x i32]], i32 0, i32 %idx + // gep i32 %idx + // -> gep [8 x [4 x i32]], i32 0, i32 0, i32 %idx + uint32_t MissingDims = 0; + Type *SubType = NewGEPType; + + // The new type will be in its array version, so match accordingly. + Type *const GEPArrType = equivalentArrayTypeFromVector(OrigGEPType); + + while (SubType != GEPArrType) { + MissingDims++; + + ArrayType *ArrType = dyn_cast<ArrayType>(SubType); + if (!ArrType) { + assert(SubType == GEPArrType && + "GEP uses an invalid DXIL sub-type of alloca/global variable"); + break; } + + SubType = ArrType->getElementType(); } + bool NeedsTransform = OrigOperand != PtrOperand || + OrigGEPType != NewGEPType || MissingDims != 0; + if (!NeedsTransform) return false; - // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later - if (!isa<ArrayType>(GOp->getSourceElementType())) - NewGEPType = GOp->getSourceElementType(); - IRBuilder<> Builder(&GEPI); - SmallVector<Value *, MaxVecSize> Indices(GOp->indices()); + SmallVector<Value *, MaxVecSize> Indices; + + for (uint32_t I = 0; I < MissingDims; I++) + Indices.push_back(Builder.getInt32(0)); + llvm::append_range(Indices, GOp->indices()); + Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, GOp->getName(), GOp->getNoWrapFlags()); diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index ebb7c2607c0c8..e0d2dbde92150 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -197,6 +197,7 @@ static Value *expand16BitIsNormal(CallInst *Orig) { static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { + case Intrinsic::assume: case Intrinsic::abs: case Intrinsic::atan2: case Intrinsic::exp: @@ -988,6 +989,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::abs: Result = expandAbs(Orig); break; + case Intrinsic::assume: + Orig->eraseFromParent(); + return true; case Intrinsic::atan2: Result = expandAtan2Intrinsic(Orig); break; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 8720460cceb20..e46a393e50906 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -904,8 +904,6 @@ class OpLowerer { case Intrinsic::dx_resource_casthandle: // NOTE: llvm.dbg.value is supported as is in DXIL. 
case Intrinsic::dbg_value: - // NOTE: llvm.assume is supported as is in DXIL. - case Intrinsic::assume: case Intrinsic::not_intrinsic: if (F.use_empty()) F.eraseFromParent(); diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 6579d3405cf39..057d87bc3c6a9 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -10,6 +10,7 @@ #include "DirectX.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset, APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (GEP->accumulateConstantOffset(DL, ConstantOffset)) { APInt Scaled = ConstantOffset.udiv(ScalarSize); - return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled); + return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled); } - auto IndexIt = GEP->idx_begin(); - assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && - "GEP is not indexing through pointer"); - ++IndexIt; - Value *Offset = *IndexIt; - assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); - return Offset; + unsigned NumIndices = GEP->getNumIndices(); + + // If we have a single index we're indexing into a top level array. This + // generally only happens with cbuffers. + if (NumIndices == 1) + return *GEP->idx_begin(); + + // If we have two indices, this should be a simple access through a pointer. + if (NumIndices == 2) { + auto IndexIt = GEP->idx_begin(); + assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && + "GEP is not indexing through pointer"); + ++IndexIt; + Value *Offset = *IndexIt; + assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); + return Offset; + } + + llvm_unreachable("Unhandled GEP structure for resource access"); } static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI, @@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) { LI->replaceAllUsesWith(V); } +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type. 
+struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset, + dxil::ResourceTypeInfo &RTI) { + const DataLayout &DL = LI->getDataLayout(); + + Type *Ty = LI->getType(); + assert(!isa<StructType>(Ty) && "Structs not handled yet"); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + + StringRef Name = LI->getName(); + Value *Handle = II->getOperand(0); + + IRBuilder<> Builder(LI); + + ConstantInt *GlobalOffset = dyn_cast<ConstantInt>(II->getOperand(1)); + assert(GlobalOffset && "CBuffer getpointer index must be constant"); + + unsigned int FixedOffset = GlobalOffset->getZExtValue(); + // If we have a further constant offset we can just fold it into the fixed + // offset. + if (auto *ConstOffset = dyn_cast_if_present<ConstantInt>(Offset)) { + FixedOffset += ConstOffset->getZExtValue(); + Offset = nullptr; + } + + Value *CurrentRow = ConstantInt::get( + Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes); + unsigned int CurrentIndex = + (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + assert(!(CurrentIndex && Offset) && + "Dynamic indexing into elements of cbuffer rows is not supported"); + // At this point if we have a non-constant offset it has to be an array + // offset, so we can assume that it's a multiple of the row size. + if (Offset) + CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load"); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract"); + + // At this point we've loaded the first scalar of our result, but our original + // type may have been a vector. + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Value *Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type. + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + LI->replaceAllUsesWith(Result); + return; + } + + // Walk each element and extract it, wrapping to new rows as needed. 
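+  // For example, a <4 x double> load maps to the two-element row intrinsic:
+  // elements 0 and 1 are extracted from row N, then the loop below wraps,
+  // issues a second cbufferrow load for row N+1, and extracts its elements
+  // 0 and 1.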
+ SmallVector<Value *> Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) { + CurrentRow = Builder.CreateAdd(CurrentRow, + ConstantInt::get(Builder.getInt32Ty(), 1)); + CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID, + {Handle, CurrentRow}, nullptr, + Name + ".load"); + } + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract")); + } + + // Finally, we build up the original loaded value. + Value *Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = Builder.CreateInsertElement( + Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I)); + LI->replaceAllUsesWith(Result); +} + static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI) { switch (RTI.getResourceKind()) { @@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::RawBuffer: case dxil::ResourceKind::StructuredBuffer: return createRawLoad(II, LI, Offset); + case dxil::ResourceKind::CBuffer: + return createCBufferLoad(II, LI, Offset, RTI); case dxil::ResourceKind::Texture1D: case dxil::ResourceKind::Texture2D: case dxil::ResourceKind::Texture2DMS: @@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::TextureCubeArray: case dxil::ResourceKind::FeedbackTexture2D: case dxil::ResourceKind::FeedbackTexture2DArray: - case dxil::ResourceKind::CBuffer: case dxil::ResourceKind::TBuffer: - // TODO: handle these + reportFatalUsageError("Load not yet implemented for resource type"); return; case dxil::ResourceKind::Sampler: case dxil::ResourceKind::RTAccelerationStructure: diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index b990b6c7410ac..ec82aa93dd07c 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -21,7 +21,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" -#include <optional> namespace llvm { namespace dxil { diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index f94f7997436ac..a0820572e5fed 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -22,7 +22,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> -#include <memory> namespace llvm { class Module; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index cf8b833b3e42e..e1a472fe57642 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -82,6 +82,7 @@ enum class EntryPropsTag { ASStateTag, WaveSize, EntryRootSig, + WaveRange = 23, }; } // namespace @@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) { case EntryPropsTag::ASStateTag: case EntryPropsTag::WaveSize: case EntryPropsTag::EntryRootSig: + case EntryPropsTag::WaveRange: llvm_unreachable("NYI: Unhandled entry property tag"); } return MDVals; } -static MDTuple * -getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { +static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP, + uint64_t EntryShaderFlags, + const 
ModuleMetadataInfo &MMDI) { SmallVector<Metadata *> MDVals; LLVMContext &Ctx = EP.Entry->getContext(); if (EntryShaderFlags != 0) @@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, // FIXME: support more props. // See https://github.com/llvm/llvm-project/issues/57948. // Add shader kind for lib entries. - if (ShaderProfile == Triple::EnvironmentType::Library && + if (MMDI.ShaderProfile == Triple::EnvironmentType::Library && EP.ShaderStage != Triple::EnvironmentType::Library) MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind, getShaderStage(EP.ShaderStage), Ctx)); if (EP.ShaderStage == Triple::EnvironmentType::Compute) { + // Handle mandatory "hlsl.numthreads" MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads)))); Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get( @@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags, ConstantAsMetadata::get(ConstantInt::get( Type::getInt32Ty(Ctx), EP.NumThreadsZ))}; MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals)); + + // Handle optional "hlsl.wavesize". The fields are optionally represented + // if they are non-zero. + if (EP.WaveSizeMin != 0) { + bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion; + bool IsWaveSize = + !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion; + + if (!IsWaveRange && !IsWaveSize) { + reportError(M, "Shader model 6.6 or greater is required to specify " + "the \"hlsl.wavesize\" function attribute"); + return nullptr; + } + + // A range is being specified if EP.WaveSizeMax != 0 + if (EP.WaveSizeMax && !IsWaveRange) { + reportError( + M, "Shader model 6.8 or greater is required to specify " + "wave size range values of the \"hlsl.wavesize\" function " + "attribute"); + return nullptr; + } + + EntryPropsTag Tag = + IsWaveSize ? 
EntryPropsTag::WaveSize : EntryPropsTag::WaveRange; + MDVals.emplace_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag)))); + + SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))}; + if (IsWaveRange) { + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax))); + WaveSizeVals.push_back(ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref))); + } + + MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals)); + } } } + if (MDVals.empty()) return nullptr; return MDNode::get(Ctx, MDVals); @@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn, return MDNode::get(Ctx, MDVals); } -static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures, - MDNode *MDResources, +static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP, + MDTuple *Signatures, MDNode *MDResources, const uint64_t EntryShaderFlags, - const Triple::EnvironmentType ShaderProfile) { - MDTuple *Properties = - getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile); + const ModuleMetadataInfo &MMDI) { + MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI); return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties, EP.Entry->getContext()); } @@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) + "'")); } - - EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD, - EntryShaderFlags, - MMDI.ShaderProfile)); + EntryFnMDNodes.emplace_back(emitEntryMD( + M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI)); } NamedMDNode *EntryPointsNamedMD = diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 26a8728e1f37c..48a9085820471 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1169,8 +1169,8 @@ void DXILBitcodeWriter::writeModuleInfo() { // We need to hardcode a triple and datalayout that's compatible with the // historical DXIL triple and datalayout from DXC. 
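// Here i1:32 pins i1 to a 32-bit ABI alignment, while i16:16 and f16:16 give
// the 16-bit types their natural alignment, as DXC's layout does.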
StringRef Triple = "dxil-ms-dx"; - StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-" - "f16:32-f32:32-f64:64-n8:16:32:64"; + StringRef DL = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-" + "f16:16-f32:32-f64:64-n8:16:32:64"; writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/); writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h index 8707b084f5465..7cbc092ea3525 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h @@ -18,9 +18,7 @@ #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/MemoryBufferRef.h" -#include <map> #include <memory> -#include <string> #include <vector> namespace llvm { diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp index 15def3637c5a7..b6bbb201f5c5d 100644 --- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp +++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp @@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { emitGlobalConstant(GV->getDataLayout(), GV->getInitializer()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXAsmPrinter() { RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget()); } diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index bb2efa43d818c..401881d6d0f67 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -19,6 +19,6 @@ using namespace llvm; DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) - : DirectXGenInstrInfo(STI) {} + : DirectXGenInstrInfo(STI, RI) {} DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index bcf84403b2c0d..84b1a313df2ea 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -53,7 +53,8 @@ using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTarget() { RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget()); auto *PR = PassRegistry::getPassRegistry(); initializeDXILIntrinsicExpansionLegacyPass(*PR); diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case 
Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp index 9a14c01f62ae7..62ad014f3739f 100644 --- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp +++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp @@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) { static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetMC() { Target &T = getTheDirectXTarget(); RegisterMCAsmInfo<DirectXMCAsmInfo> X(T); TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo); diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp index ae01626e5229d..934bd1b0e8adb 100644 --- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp +++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp @@ -24,7 +24,8 @@ Target &getTheDirectXTarget() { using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeDirectXTargetInfo() { RegisterTarget<Triple::dxil, /*HasJIT=*/false> X( getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL"); } diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b94b1484205ae..c18db982bfd97 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -463,7 +463,7 @@ void HexagonOperand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case Register: OS << "<register R"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 68f53124f9db8..557a0a3f27819 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1796,7 +1796,7 @@ namespace { const MachineDominatorTree &MDT; const HexagonInstrInfo &HII; - const HexagonRegisterInfo &HRI; + [[maybe_unused]] const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1886,7 +1886,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum) { - auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI); + auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum); auto *RRC = HBS::getFinalVRegClass(R, MRI); return OpRC->hasSubClassEq(RRC); } diff --git a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp index eca5ac140f3c3..bae3484eee1cb 100644 --- a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -24,7 +24,6 @@ #include <cstdint> #include <iterator> #include <map> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index f4e36fa7dc767..e661c94690729 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ 
b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -26,6 +26,7 @@ def tc_20a4bbec : InstrItinClass; def tc_227864f7 : InstrItinClass; def tc_257f6f7c : InstrItinClass; def tc_26a377fe : InstrItinClass; +def tc_2a698a03 : InstrItinClass; def tc_2b4c548e : InstrItinClass; def tc_2c745bb8 : InstrItinClass; def tc_2d4051cd : InstrItinClass; @@ -52,6 +53,7 @@ def tc_561aaa58 : InstrItinClass; def tc_56c4f9fe : InstrItinClass; def tc_56e64202 : InstrItinClass; def tc_58d21193 : InstrItinClass; +def tc_57a4709c : InstrItinClass; def tc_5bf8afbb : InstrItinClass; def tc_5cdf8c84 : InstrItinClass; def tc_61bf7c03 : InstrItinClass; @@ -220,6 +222,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -356,6 +363,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -812,6 +824,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -948,6 +965,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1404,6 +1426,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -1540,6 +1567,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -1996,6 +2028,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData 
<tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2132,6 +2169,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -2588,6 +2630,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -2724,6 +2771,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3180,6 +3232,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3316,6 +3373,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -3772,6 +3834,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -3908,6 +3975,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 
2], @@ -4364,6 +4436,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -4500,6 +4577,11 @@ class DepHVXItinV69 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -4956,6 +5038,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5092,6 +5179,11 @@ class DepHVXItinV71 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -5548,6 +5640,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -5684,6 +5781,11 @@ class DepHVXItinV73 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6140,6 +6242,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6276,6 +6383,11 @@ class DepHVXItinV75 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, 
HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -6732,6 +6844,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -6868,6 +6985,11 @@ class DepHVXItinV79 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], @@ -7324,6 +7446,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_2a698a03, /*SLOT0123,VSorVP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], @@ -7460,6 +7587,11 @@ class DepHVXItinV81 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_57a4709c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_XLANE]>], [9, 2], diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index f8f1c2ad07b75..b188134d60d39 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -29939,6 +29939,58 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vabs_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vabs($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vabs($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 
= vabs($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vabs_sf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -31302,6 +31354,21 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_valign4 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), +"$Vd32 = valign4($Vu32,$Vv32,$Rt8)", +tc_57a4709c, TypeCVI_VA>, Enc_a30110, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b0; +let Inst{31-24} = 0b00011000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_valignb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8), @@ -32583,6 +32650,32 @@ let isCVI = 1; let hasHvxTmp = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_bf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32), +"$Vd32.bf = $Vuu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_f8_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.f8 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_h_hf : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32596,6 +32689,19 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_h_hf_rnd : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.h = $Vu32.hf:rnd", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_hf_h : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -32635,6 +32741,71 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vconv_qf16_f8 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.qf16 = $Vu32.f8", +tc_04da405a, TypeCVI_VP_VS>, Enc_dd766a, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.hf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = $Vu32.qf16", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def 
V6_vconv_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.qf32", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = $Vu32.sf", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vconv_sf_qf32 : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -33720,6 +33891,122 @@ let isHVXALU2SRC = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_veqhf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqhf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqhf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_veqsf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b000011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, 
HvxVR:$Vv32), +"$Qx4 |= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b010011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_veqsf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.eq($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-2} = 0b100011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let isHVXALU = 1; +let isHVXALU2SRC = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} def V6_veqw : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -34538,6 +34825,58 @@ let Inst{31-24} = 0b00011110; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vilog2_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vilog2_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vilog2($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vinsertwr : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, IntRegs:$Rt32), @@ -37170,6 +37509,58 @@ let isCVI = 1; let isHVXALU = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vneg_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.hf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf16_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf16 = vneg($Vu32.qf16)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.qf32)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b001; 
+let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vneg_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.qf32 = vneg($Vu32.sf)", +tc_2a698a03, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000001110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vnormamth : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 23f4b3aef7d10..c11483b961cc3 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3830,6 +3830,122 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2), // V81 HVX Instructions. +def: Pat<(int_hexagon_V6_vabs_qf16_hf HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_hf_128B HvxVR:$src1), + (V6_vabs_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16 HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf16_qf16_128B HvxVR:$src1), + (V6_vabs_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32 HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_qf32_128B HvxVR:$src1), + (V6_vabs_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vabs_qf32_sf_128B HvxVR:$src1), + (V6_vabs_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_valign4_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3), + (V6_valign4 HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32 HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_bf_qf32_128B HvxWR:$src1), + (V6_vconv_bf_qf32 HvxWR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16 HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_f8_qf16_128B HvxVR:$src1), + (V6_vconv_f8_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vconv_h_hf_rnd_128B HvxVR:$src1), + (V6_vconv_h_hf_rnd HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8 HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_f8_128B HvxVR:$src1), + (V6_vconv_qf16_f8 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; 
+def: Pat<(int_hexagon_V6_vconv_qf16_hf HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_hf_128B HvxVR:$src1), + (V6_vconv_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16 HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf16_qf16_128B HvxVR:$src1), + (V6_vconv_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32 HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_qf32_128B HvxVR:$src1), + (V6_vconv_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_qf32_sf_128B HvxVR:$src1), + (V6_vconv_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqhf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqhf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqhf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_128B HvxVR:$src1, HvxVR:$src2), + (V6_veqsf HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, 
UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_veqsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_veqsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_hf HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_hf_128B HvxVR:$src1), + (V6_vilog2_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16 HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf16_128B HvxVR:$src1), + (V6_vilog2_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32 HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_qf32_128B HvxVR:$src1), + (V6_vilog2_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vilog2_sf_128B HvxVR:$src1), + (V6_vilog2_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_hf_128B HvxVR:$src1), + (V6_vneg_qf16_hf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16 HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf16_qf16_128B HvxVR:$src1), + (V6_vneg_qf16_qf16 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32 HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_qf32_128B HvxVR:$src1), + (V6_vneg_qf32_qf32 HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vneg_qf32_sf_128B HvxVR:$src1), + (V6_vneg_qf32_sf HvxVR:$src1)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index dd343d9fbe79f..df612262def5e 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1405,7 +1405,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register()); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register()); if (IsKill) MBB.addLiveIn(Reg); } @@ -1470,7 +1470,7 @@ bool 
HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MCRegister Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = I.getFrameIdx(); - HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register()); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, Register()); } return true; @@ -1814,8 +1814,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, .addReg(SrcR, getKillRegState(IsKill)) .addReg(TmpR0, RegState::Kill); - auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI, Register()); + HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, Register()); expandStoreVec(B, std::prev(It), MRI, HII, NewRegs); NewRegs.push_back(TmpR0); @@ -1844,9 +1843,7 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); - MachineFunction &MF = *B.getParent(); - auto *HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI, Register()); + HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, Register()); expandLoadVec(B, std::prev(It), MRI, HII, NewRegs); BuildMI(B, It, DL, HII.get(Hexagon::V6_vandvrt), DstR) @@ -2225,7 +2222,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, if (!Bad) { // If the addressing mode is ok, check the register class. unsigned OpNum = Load ? 0 : 2; - auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI); + auto *RC = HII.getRegClass(In.getDesc(), OpNum); RC = getCommonRC(SI.RC, RC); if (RC == nullptr) Bad = true; @@ -2395,7 +2392,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(), SrcOp.getSubReg() }; - auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI); + auto *RC = HII.getRegClass(SI.getDesc(), 2); // The this-> is needed to unconfuse MSVC. 
Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC); LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index 74e5abe2599c7..c6fffde84af58 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -43,7 +43,6 @@ #include <cassert> #include <iterator> #include <limits> -#include <utility> #define DEBUG_TYPE "hexmux"
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 9c81e9638f8e2..5344ed8446efc 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -30,7 +30,6 @@ #include <cassert> #include <iterator> #include <queue> -#include <utility> #define DEBUG_TYPE "gen-pred"
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 7ee280d8fc8b0..eadf02043841e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1815,7 +1815,7 @@ struct WeightedLeaf { int Weight; int InsertionOrder; - WeightedLeaf() {} + WeightedLeaf() = default; WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) : Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 526b4de975915..04a97606cb7f8 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3948,3 +3948,13 @@ HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { return AtomicExpansionKind::LLSC; }
+
+bool HexagonTargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink 'and' mask to cmp use block if it is masking a single bit since
+  // this will fold the and/cmp/br into a single tstbit instruction.
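+  // For example, with a power-of-two mask the sequence (illustrative IR)
+  //   %a = and i32 %x, 64
+  //   %c = icmp eq i32 %a, 0
+  //   br i1 %c, label %t, label %f
+  // can be selected as a single predicate, p0 = tstbit(r0,#6), feeding the
+  // conditional branch.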
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getValue().isPowerOf2();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 8d04edbea5b43..4ac3e7671592a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -160,6 +160,8 @@ class HexagonTargetLowering : public TargetLowering { bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + /// Return true if an FMA operation is faster than a pair of mul and add /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true (and FMAs are legal), otherwise fmuladd is
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 47726d6447ad8..7682af4543b7c 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { @@ -964,7 +964,6 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -1009,10 +1008,12 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void HexagonInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4753,6 +4754,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const { return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0); }
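+// Returns true when MI is one of the HVX QFP vector multiply opcodes; the
+// QFP optimizer uses this to leave multiplies alone while the
+// -disable-qfp-opt-mul option (default true) is in effect.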
+bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const {
+  return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32);
+}
+
// Addressing mode relations. short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const { return Opc >= 0 ? Hexagon::changeAddrMode_abs_io(Opc) : Opc;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index c17e5277ae2e7..796b978a2c3f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include <cstdint> #include <vector> +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides. /// If the specified machine instruction is a direct @@ -183,8 +188,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// Load the specified register of the given register class from the specified @@ -193,7 +197,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// This function is called for all pseudo instructions @@ -532,6 +536,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { } MCInst getNop() const override; + bool isQFPMul(const MachineInstr *MI) const; }; /// \brief Create RegSubRegPair from a register MachineOperand
diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp index 7cbd81ff227e1..54969b2317ef4 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp @@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, MachineInstr *CombI; if (Acc != 0) { const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc); NG.push_back(TfrI); @@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, } else { // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc)); NG.push_back(TfrI);
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 6dd83c1d820f4..2ee3b9d3b1e27 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -198,7 +198,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, // Reaching Def to an offset register can't be a phi.
if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) && MI.getParent() != UseMI.getParent()) - return false; + return false; const MCInstrDesc &UseMID = UseMI.getDesc(); if ((!UseMID.mayLoad() && !UseMID.mayStore()) || diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 85ce9447c2028..e40dbd251b5b7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3434,6 +3434,19 @@ let AddedComplexity = 100 in { (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>; } +multiclass FloatClass<SDPatternOperator IntOp, InstHexagon MI, + PatFrag RegPred> { + let AddedComplexity = 100 in { + def: Pat<(i1 (seteq (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (C2_not (MI RegPred:$Rs, u5_0ImmPred_timm:$u5))>; + def: Pat<(i1 (setne (IntOp RegPred:$Rs, u5_0ImmPred_timm:$u5), 0)), + (MI RegPred:$Rs, u5_0ImmPred_timm:$u5)>; + } +} + +defm : FloatClass<int_hexagon_F2_sfclass, F2_sfclass, F32>; +defm : FloatClass<int_hexagon_F2_dfclass, F2_dfclass, F64>; + def: Pat<(int_hexagon_instrprof_custom (HexagonAtPcrel tglobaladdr:$addr), u32_0ImmPred:$I), (PS_call_instrprof_custom tglobaladdr:$addr, imm:$I)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 1637b91f1fa12..d19920cfc9ea0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -612,6 +612,9 @@ let Predicates = [UseHVX] in { (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ32 (trunc HVI32:$Vs)), (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; + def: Pat<(VecQ16 (trunc HWI32:$Vss)), + (Combineq(VecQ32(V6_vandvrt (HiVec $Vss), (ToI32 0x01010101))), + (VecQ32 (V6_vandvrt (LoVec $Vss), (ToI32 0x01010101))))>; } let Predicates = [UseHVX] in { diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp index 479ac90b7d526..6d66237730ded 100644 --- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp @@ -58,7 +58,7 @@ // are PHI inst. 
// //===----------------------------------------------------------------------===// -#include <unordered_set> + #define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass" #include "Hexagon.h" @@ -77,7 +77,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <map> -#include <vector> #define DEBUG_TYPE "hexagon-qfp-optimizer" @@ -86,6 +85,9 @@ using namespace llvm; cl::opt<bool> DisableQFOptimizer("disable-qfp-opt", cl::init(false), cl::desc("Disable optimization of Qfloat operations.")); +cl::opt<bool> DisableQFOptForMul( + "disable-qfp-opt-mul", cl::init(true), + cl::desc("Disable optimization of Qfloat operations for multiply.")); namespace { const std::map<unsigned short, unsigned short> QFPInstMap{ @@ -101,18 +103,21 @@ const std::map<unsigned short, unsigned short> QFPInstMap{ {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16}, {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf}, {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16}, - {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}}; + {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}, + {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32}, + {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16}, + {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32}, + {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16}, + {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32}, + {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}}; } // namespace namespace llvm { - FunctionPass *createHexagonQFPOptimizer(); void initializeHexagonQFPOptimizerPass(PassRegistry &); - } // namespace llvm namespace { - struct HexagonQFPOptimizer : public MachineFunctionPass { public: static char ID; @@ -123,6 +128,10 @@ struct HexagonQFPOptimizer : public MachineFunctionPass { bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB); + + bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB); + StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -149,19 +158,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() { bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB) { - // Early exit: - // - if instruction is invalid or has too few operands (QFP ops need 2 sources - // + 1 dest), - // - or does not have a transformation mapping. 
- if (MI->getNumOperands() < 3) + if (MI->getNumOperands() == 2) + return optimizeQfpOneOp(MI, MBB); + else if (MI->getNumOperands() == 3) + return optimizeQfpTwoOp(MI, MBB); + else return false; +} + +bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI, + MachineBasicBlock *MBB) { + + unsigned Op0F = 0; auto It = QFPInstMap.find(MI->getOpcode()); if (It == QFPInstMap.end()) return false; + unsigned short InstTy = It->second; + // Get the reaching defs of MI + MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump()); + MachineInstr *ReachDefDef = nullptr; + + // Get the reaching def of the reaching def to check for W reg def + if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() && + DefMI->getOperand(1).getReg().isVirtual()) + ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + unsigned ReachDefOp = DefMI->getOpcode(); + MachineInstrBuilder MIB; + + // Check if the reaching def is a conversion + if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 || + ReachDefOp == Hexagon::V6_vconv_hf_qf16) { + + // Return if the reaching def of reaching def is W type + if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &SrcOp = DefMI->getOperand(1); + Op0F = getKillRegState(SrcOp.isKill()); + SrcOp.setIsKill(false); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + } + return false; +} +
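+// Fold a qf->sf/hf conversion feeding a two-source QFP op into the op
+// itself, selecting the direct qf form (or a mix form when only one of the
+// two inputs comes from a conversion).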
+bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI, + MachineBasicBlock *MBB) { unsigned Op0F = 0; unsigned Op1F = 0; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; // Get the reaching defs of MI, DefMI1 and DefMI2 MachineInstr *DefMI1 = nullptr; MachineInstr *DefMI2 = nullptr; @@ -174,6 +233,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, return false; MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + MachineInstr *Inst1 = nullptr; MachineInstr *Inst2 = nullptr; LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); @@ -192,7 +254,8 @@ unsigned Def2OP = DefMI2->getOpcode(); MachineInstrBuilder MIB; - // Case 1: Both reaching defs of MI are qf to sf/hf conversions + + // Check if both the reaching defs of MI are qf to sf/hf conversions if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -233,7 +296,7 @@ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if left operand's reaching def is a conversion to sf/hf } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP != Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -257,7 +320,7 @@ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if right operand's reaching def is a conversion to sf/hf } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP != Hexagon::V6_vconv_hf_qf16 && @@ -265,13 +328,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, !DefMI1->isPHI() && (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { // The second operand of original instruction is converted. - // In "mix" instructions, "qf" operand is always the first operand. - - // Caveat: vsub is not commutative w.r.t operands. - if (InstTy == Hexagon::V6_vsub_qf16_mix || - InstTy == Hexagon::V6_vsub_qf32_mix) - return false; - if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) == &Hexagon::HvxWRRegClass) return false; @@ -282,10 +338,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, Src2.setIsKill(false); Op0F = getKillRegState(Src1.isKill()); - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) - .addReg(Src2.getReg(), Op1F, - Src2.getSubReg()) // Notice the operands are flipped. - .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + if (InstTy == Hexagon::V6_vsub_qf16_mix || + InstTy == Hexagon::V6_vsub_qf32_mix) { + if (!HST->useHVXV81Ops()) + // vsub_(hf|sf)_mix instructions are only available on HVX v81+ + return false; + // vsub is not commutative w.r.t. operands -> treat it as a special case + // to choose the correct mix instruction. + if (Def2OP == Hexagon::V6_vconv_sf_qf32) + InstTy = Hexagon::V6_vsub_sf_mix; + else if (Def2OP == Hexagon::V6_vconv_hf_qf16) + InstTy = Hexagon::V6_vsub_hf_mix; + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + } else { + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src2.getReg(), Op1F, + Src2.getSubReg()) // Notice the operands are flipped. + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + } LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; } @@ -316,15 +388,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) { while (MII != MBBI->instr_end()) { MachineInstr *MI = &*MII; ++MII; // As MI might be removed.
- - if (QFPInstMap.count(MI->getOpcode()) && - MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && - MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { - LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); - if (optimizeQfp(MI, MBB)) { - MI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "\t....Removing...."); - Changed = true; + if (QFPInstMap.count(MI->getOpcode())) { + auto OpC = MI->getOpcode(); + if (DisableQFOptForMul && HII->isQFPMul(MI)) + continue; + if (OpC != Hexagon::V6_vconv_sf_qf32 && + OpC != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } } } } diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 54f5608d460af..f375b25e4ceb8 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <limits> -#include <utility> using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ce2de752f3b3a..66c8b0a67169d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -28,7 +28,6 @@ #include "llvm/Target/TargetMachine.h" #include <algorithm> #include <cassert> -#include <map> #include <optional> using namespace llvm; @@ -77,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d0551b4..30794f61218a1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { // The following objects can use the TargetTriple, so they must be // declared after it. HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cb88d1ac4af9f..d39b79a86753a 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, const MCInstrDesc& MCID = PacketMI.getDesc(); // First operand is always the result. 
- const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0); // Double regs can not feed into new value store: PRM section: 5.4.2.2. if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; @@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, return false; const MCInstrDesc& MCID = PI.getDesc(); - const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0); if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass) return false; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 9b6bc5ade379d..0b2279bb2cfe6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -385,7 +385,7 @@ bool HexagonMCChecker::checkSlots() { bool HexagonMCChecker::checkPredicates() { // Check for proper use of new predicate registers. for (const auto &I : NewPreds) { - unsigned P = I; + MCRegister P = I; if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) { // Error out if the new predicate register is not defined, @@ -398,7 +398,7 @@ bool HexagonMCChecker::checkPredicates() { // Check for proper use of auto-anded of predicate registers. for (const auto &I : LatePreds) { - unsigned P = I; + MCRegister P = I; if (LatePreds.count(P) > 1 || Defs.count(P)) { // Error out if predicate register defined "late" multiple times or @@ -607,7 +607,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { bool HexagonMCChecker::checkRegisters() { // Check for proper register definitions. for (const auto &I : Defs) { - unsigned R = I.first; + MCRegister R = I.first; if (isLoopRegister(R) && Defs.count(R) > 1 && (HexagonMCInstrInfo::isInnerLoop(MCB) || @@ -620,8 +620,8 @@ bool HexagonMCChecker::checkRegisters() { if (SoftDefs.count(R)) { // Error out for explicit changes to registers also weakly defined // (e.g., "{ usr = r0; r0 = sfadd(...) }"). - unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -633,8 +633,8 @@ bool HexagonMCChecker::checkRegisters() { if (PM.count(Unconditional)) { // Error out on an unconditional change when there are any other // changes, conditional or not. - unsigned UsrR = Hexagon::USR; - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -664,7 +664,7 @@ bool HexagonMCChecker::checkRegisters() { // Check for use of temporary definitions. 
for (const auto &I : TmpDefs) { - unsigned R = I; + MCRegister R = I; if (!Uses.count(R)) { // special case for vhist @@ -765,12 +765,12 @@ void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { } } -void HexagonMCChecker::reportErrorRegisters(unsigned Register) { +void HexagonMCChecker::reportErrorRegisters(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' modified more than once"); } -void HexagonMCChecker::reportErrorNewValue(unsigned Register) { +void HexagonMCChecker::reportErrorNewValue(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' used with `.new' " "but not validly modified in the same packet"); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index e9b87c5315fe4..8beee8d7ec8eb 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -39,41 +39,41 @@ class HexagonMCChecker { bool ReportErrors; /// Set of definitions: register #, if predicated, if predicated true. - using PredSense = std::pair<unsigned, bool>; + using PredSense = std::pair<MCRegister, bool>; static const PredSense Unconditional; using PredSet = std::multiset<PredSense>; using PredSetIterator = std::multiset<PredSense>::iterator; - using DefsIterator = DenseMap<unsigned, PredSet>::iterator; - DenseMap<unsigned, PredSet> Defs; + using DefsIterator = DenseMap<MCRegister, PredSet>::iterator; + DenseMap<MCRegister, PredSet> Defs; /// Set of weak definitions whose clashes should be enforced selectively. - using SoftDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> SoftDefs; + using SoftDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> SoftDefs; /// Set of temporary definitions not committed to the register file. - using TmpDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> TmpDefs; + using TmpDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> TmpDefs; /// Set of new predicates used. - using NewPredsIterator = std::set<unsigned>::iterator; - std::set<unsigned> NewPreds; + using NewPredsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> NewPreds; /// Set of predicates defined late. - using LatePredsIterator = std::multiset<unsigned>::iterator; - std::multiset<unsigned> LatePreds; + using LatePredsIterator = std::multiset<MCRegister>::iterator; + std::multiset<MCRegister> LatePreds; /// Set of uses. - using UsesIterator = std::set<unsigned>::iterator; - std::set<unsigned> Uses; + using UsesIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> Uses; /// Pre-defined set of read-only registers. - using ReadOnlyIterator = std::set<unsigned>::iterator; - std::set<unsigned> ReadOnly; + using ReadOnlyIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> ReadOnly; // Contains the vector-pair-registers with the even number // first ("v0:1", e.g.) used/def'd in this packet. 
- std::set<unsigned> ReversePairs; + std::set<MCRegister> ReversePairs; void init(); void init(MCInst const &); @@ -107,7 +107,7 @@ class HexagonMCChecker { static void compoundRegisterMap(unsigned &); - bool isLoopRegister(unsigned R) const { + bool isLoopRegister(MCRegister R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); } @@ -120,8 +120,8 @@ class HexagonMCChecker { MCSubtargetInfo const &STI, bool CopyReportErrors); bool check(bool FullCheck = true); - void reportErrorRegisters(unsigned Register); - void reportErrorNewValue(unsigned Register); + void reportErrorRegisters(MCRegister Register); + void reportErrorNewValue(MCRegister Register); void reportError(SMLoc Loc, Twine const &Msg); void reportNote(SMLoc Loc, Twine const &Msg); void reportError(Twine const &Msg); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index c5e57d0df22a7..712bdbe2af187 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -21,7 +21,6 @@ #include "llvm/TargetParser/SubtargetFeature.h" #include <cstddef> #include <cstdint> -#include <memory> namespace llvm { diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index 3b1d3bd89680b..4cab5da7b1caf 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -26,7 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> -#include <utility> using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index cef77f1c512f6..0444c865f6866 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -559,7 +559,7 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case REGISTER: - OS << "Reg: %r" << getReg() << "\n"; + OS << "Reg: %r" << getReg().id() << "\n"; break; case MEMORY_IMM: OS << "MemImm: "; @@ -567,14 +567,14 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << '\n'; break; case MEMORY_REG_IMM: - OS << "MemRegImm: " << getMemBaseReg() << "+"; + OS << "MemRegImm: " << getMemBaseReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << '\n'; break; case MEMORY_REG_REG: assert(getMemOffset() == nullptr); - OS << "MemRegReg: " << getMemBaseReg() << "+" - << "%r" << getMemOffsetReg() << "\n"; + OS << "MemRegReg: " << getMemBaseReg().id() << "+" + << "%r" << getMemOffsetReg().id() << "\n"; break; } } diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..14b7557e7f94a 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -27,7 +27,8 @@ using namespace llvm; #include "LanaiGenInstrInfo.inc" LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) - : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), + : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN, + Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -48,8 +49,7 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LanaiInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const 
TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { @@ -69,8 +69,7 @@ void LanaiInstrInfo::storeRegToStackSlot( void LanaiInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index d98276243dc31..155e2f03be630 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -58,15 +58,13 @@ class LanaiInstrInfo : public LanaiGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp index 0ccebeb393267..6358e348fe424 100644 --- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp @@ -60,7 +60,6 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** LoongArchDeadRegisterDefinitions *****\n"); @@ -86,7 +85,7 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( continue; LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (!(RC && RC->contains(LoongArch::R0))) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); continue; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 690dd73014e57..e86b21cf849cb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -365,6 +365,7 @@ def : Pat<(f32 (uint_to_fp (i64 (sexti32 (i64 GPR:$src))))), // FP Rounding let Predicates = [HasBasicF, IsLA64] in { def : PatFpr<frint, FRINT_S, FPR32>; +def : PatFpr<flog2, FLOGB_S, FPR32>; } // Predicates = [HasBasicF, IsLA64] let Predicates = [HasBasicF, IsLA32] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td 
b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index daefbaa52d42a..2e88254aab4d5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -348,6 +348,7 @@ def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; // FP Rounding let Predicates = [HasBasicD, IsLA64] in { def : PatFpr<frint, FRINT_D, FPR64>; +def : PatFpr<flog2, FLOGB_D, FPR64>; } // Predicates = [HasBasicD, IsLA64] /// Pseudo-instructions needed for the soft-float ABI with LA32D diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 1493bf4cba695..690b0639484d0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -449,7 +449,7 @@ bool LoongArchFrameLowering::spillCalleeSavedRegisters( bool IsKill = !(Reg == LoongArch::R1 && MF->getFrameInfo().isReturnAddressTaken()); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6dc8eb6..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -244,8 +244,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f32, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + } if (!Subtarget.hasBasicD()) { setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); @@ -291,8 +293,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_BF16, MVT::f64, Subtarget.isSoftFPABI() ? LibCall : Custom); - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FLOG2, MVT::f64, Legal); + } } // Set operations for 'LSX' feature. 
@@ -362,10 +366,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } setOperationAction(ISD::CTPOP, GRLenVT, Legal); setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal); @@ -443,10 +454,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FNEG, VT, Legal); + setOperationAction(ISD::FLOG2, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); } } @@ -6612,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index c89212dae72d9..9fc862af7ea24 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -26,9 +26,9 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, + : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), - STI(STI) {} + RegInfo(STI.getHwMode()), STI(STI) {} MCInst LoongArchInstrInfo::getNop() const { return MCInstBuilder(LoongArch::ANDI) @@ -113,14 +113,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LoongArchInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = TRI.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? 
LoongArch::ST_W : LoongArch::ST_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -149,8 +149,8 @@ void LoongArchInstrInfo::storeRegToStackSlot( void LoongArchInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + int FI, const TargetRegisterClass *RC, Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL; @@ -159,7 +159,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = RegInfo.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? LoongArch::LD_W : LoongArch::LD_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -378,12 +378,9 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp, } } -bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) - return true; - +bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { auto MII = MI.getIterator(); auto MIE = MBB->end(); @@ -429,25 +426,25 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO2 = Lu32I->getOperand(2).getTargetFlags(); if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO && MO2 == LoongArchII::MO_PCREL64_LO) - return true; + return false; if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO) - return true; + return false; if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO && MO2 == LoongArchII::MO_IE_PC64_LO) - return true; + return false; if (MO0 == LoongArchII::MO_DESC_PC_HI && MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC64_PC_LO) - return true; + return false; break; } case LoongArch::LU52I_D: { auto MO = MI.getOperand(2).getTargetFlags(); if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI || MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI) - return true; + return false; break; } default: @@ -487,7 +484,7 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); auto MO2 = LoongArchII::getDirectFlags(Ld->getOperand(2)); if (MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC_LD) - return true; + return false; break; } if (SecondOp == MIE || @@ -496,34 +493,34 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); if (MO0 == LoongArchII::MO_PCREL_HI && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_PCREL_LO) - return true; + return false; if (MO0 == LoongArchII::MO_GOT_PC_HI && SecondOp->getOpcode() == LdOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return true; + return false; if ((MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::ADDI_W: case LoongArch::ADDI_D: { auto MO = 
LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_PCREL_LO || MO == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::LD_W: case LoongArch::LD_D: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_GOT_PC_LO) - return true; + return false; break; } case LoongArch::PseudoDESC_CALL: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_DESC_CALL) - return true; + return false; break; } default: @@ -531,6 +528,18 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, } } + return true; +} + +bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) + return true; + + if (!isSafeToMove(MI, MBB, MF)) + return true; + return false; } @@ -656,13 +665,13 @@ void LoongArchInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, if (FrameIndex == -1) report_fatal_error("The function size is incorrectly estimated."); storeRegToStackSlot(MBB, PCALAU12I, Scav, /*IsKill=*/true, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(PCALAU12I.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); PCALAU12I.getOperand(1).setMBB(&RestoreBB); ADDI.getOperand(2).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), Scav, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -756,6 +765,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return ArrayRef(TargetFlags); } +bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, + Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + enum MemIOffsetType { + Imm14Shift2, + Imm12, + Imm11Shift1, + Imm10Shift2, + Imm9Shift3, + Imm8, + Imm8Shift1, + Imm8Shift2, + Imm8Shift3 + }; + + MemIOffsetType OT; + switch (MemI.getOpcode()) { + default: + return false; + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + OT = Imm14Shift2; + break; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::FST_S: + case LoongArch::FST_D: + case LoongArch::VLD: + case LoongArch::VST: + case LoongArch::XVLD: + case LoongArch::XVST: + case LoongArch::VLDREPL_B: + case LoongArch::XVLDREPL_B: + OT = Imm12; + break; + case LoongArch::VLDREPL_H: + case LoongArch::XVLDREPL_H: + OT = Imm11Shift1; + break; + case LoongArch::VLDREPL_W: + case LoongArch::XVLDREPL_W: + OT = Imm10Shift2; + break; + case LoongArch::VLDREPL_D: + case LoongArch::XVLDREPL_D: + OT = Imm9Shift3; + break; + case LoongArch::VSTELM_B: + case LoongArch::XVSTELM_B: + OT = Imm8; + break; + case LoongArch::VSTELM_H: + case LoongArch::XVSTELM_H: + OT = Imm8Shift1; + break; + case LoongArch::VSTELM_W: + case LoongArch::XVSTELM_W: + OT = Imm8Shift2; + break; + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_D: + OT = Imm8Shift3; + break; + } + + if (MemI.getOperand(0).getReg() == Reg) + return false; + + if ((AddrI.getOpcode() != LoongArch::ADDI_W && + 
AddrI.getOpcode() != LoongArch::ADDI_D) || + !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm()) + return false; + + int64_t OldOffset = MemI.getOperand(2).getImm(); + int64_t Disp = AddrI.getOperand(2).getImm(); + int64_t NewOffset = OldOffset + Disp; + if (!STI.is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + + if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) && + !(OT == Imm12 && isInt<12>(NewOffset)) && + !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) && + !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) && + !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) && + !(OT == Imm8 && isInt<8>(NewOffset)) && + !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) && + !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) && + !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset))) + return false; + + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = 0; + AM.Scale = 0; + AM.Displacement = NewOffset; + AM.Form = ExtAddrMode::Formula::Basic; + return true; +} + +MachineInstr * +LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + const DebugLoc &DL = MemI.getDebugLoc(); + MachineBasicBlock &MBB = *MemI.getParent(); + + assert(AM.ScaledReg == 0 && AM.Scale == 0 && + "Addressing mode not supported for folding"); + + unsigned MemIOp = MemI.getOpcode(); + switch (MemIOp) { + default: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + case LoongArch::VSTELM_B: + case LoongArch::VSTELM_H: + case LoongArch::VSTELM_W: + case LoongArch::VSTELM_D: + case LoongArch::XVSTELM_B: + case LoongArch::XVSTELM_H: + case LoongArch::XVSTELM_W: + case LoongArch::XVSTELM_D: + return BuildMI(MBB, MemI, DL, get(MemIOp)) + .addReg(MemI.getOperand(0).getReg(), 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement) + .addImm(MemI.getOperand(3).getImm()) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + } +} + // Returns true if this is the sext.w pattern, addi.w rd, rs, 0. 
bool LoongArch::isSEXT_W(const MachineInstr &MI) { return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f25958a32bec4..9f7a0a2239a87 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -24,9 +24,13 @@ namespace llvm { class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { + const LoongArchRegisterInfo RegInfo; + public: explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); + const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -36,13 +40,11 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Materializes the given integer Val into DstReg. @@ -64,6 +66,9 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; + bool isSafeToMove(const MachineInstr &MI, const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override; @@ -93,6 +98,12 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const override; + MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const override; + protected: const LoongArchSubtarget &STI; }; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6093f5f..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1558,6 +1558,10 @@ defm : PatXrXrF<fmul, "XVFMUL">; // XVFDIV_{S/D} defm : PatXrXrF<fdiv, "XVFDIV">; +// XVFMAX_{S/D}, XVFMIN_{S/D} +defm : PatXrXrF<fmaxnum, "XVFMAX">; +defm : PatXrXrF<fminnum, "XVFMIN">; + // XVFMADD_{S/D} def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; @@ -1593,6 +1597,9 @@ def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa), // XVFSQRT_{S/D} defm : PatXrF<fsqrt, "XVFSQRT">; +// XVFLOGB_{S/D} +defm : PatXrF<flog2, "XVFLOGB">; + // XVRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), (XVFRECIP_S v8f32:$xj)>; @@ -2024,6 +2031,24 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// XVAVG_{B/H/W/D/BU/HU/WU/DU}, XVAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "XVAVG_B", 
v32i8>; +defm : VAvgPat<sra, "XVAVG_H", v16i16>; +defm : VAvgPat<sra, "XVAVG_W", v8i32>; +defm : VAvgPat<sra, "XVAVG_D", v4i64>; +defm : VAvgPat<srl, "XVAVG_BU", v32i8>; +defm : VAvgPat<srl, "XVAVG_HU", v16i16>; +defm : VAvgPat<srl, "XVAVG_WU", v8i32>; +defm : VAvgPat<srl, "XVAVG_DU", v4i64>; +defm : VAvgrPat<sra, "XVAVGR_B", v32i8>; +defm : VAvgrPat<sra, "XVAVGR_H", v16i16>; +defm : VAvgrPat<sra, "XVAVGR_W", v8i32>; +defm : VAvgrPat<sra, "XVAVGR_D", v4i64>; +defm : VAvgrPat<srl, "XVAVGR_BU", v32i8>; +defm : VAvgrPat<srl, "XVAVGR_HU", v16i16>; +defm : VAvgrPat<srl, "XVAVGR_WU", v8i32>; +defm : VAvgrPat<srl, "XVAVGR_DU", v4i64>; + // abs def : Pat<(abs v32i8:$xj), (XVSIGNCOV_B v32i8:$xj, v32i8:$xj)>; def : Pat<(abs v16i16:$xj), (XVSIGNCOV_H v16i16:$xj, v16i16:$xj)>; @@ -2088,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern @@ -2403,6 +2459,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>; +// Vector floating-point conversion +defm : 
PatXrF<fceil, "XVFRINTRP">; +defm : PatXrF<ffloor, "XVFRINTRM">; +defm : PatXrF<ftrunc, "XVFRINTRZ">; +defm : PatXrF<froundeven, "XVFRINTRNE">; + // load def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), (XVLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 4619c6bd248a6..6b74a4b5e5f6f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1518,6 +1518,18 @@ multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { } } +multiclass VAvgPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add vt:$vj, vt:$vk)), (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + +multiclass VAvgrPat<SDPatternOperator OpNode, string Inst, ValueType vt> { + def : Pat<(OpNode (vt (add (vt (add vt:$vj, vt:$vk)), + (vt (vsplat_imm_eq_1)))), + (vt (vsplat_imm_eq_1))), + (!cast<LAInst>(Inst) vt:$vj, vt:$vk)>; +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1748,6 +1760,10 @@ defm : PatVrVrF<fmul, "VFMUL">; // VFDIV_{S/D} defm : PatVrVrF<fdiv, "VFDIV">; +// VFMAX_{S/D}, VFMIN_{S/D} +defm : PatVrVrF<fmaxnum, "VFMAX">; +defm : PatVrVrF<fminnum, "VFMIN">; + // VFMADD_{S/D} def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; @@ -1783,6 +1799,9 @@ def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va), // VFSQRT_{S/D} defm : PatVrF<fsqrt, "VFSQRT">; +// VFLOGB_{S/D} +defm : PatVrF<flog2, "VFLOGB">; + // VFRECIP_{S/D} def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), (VFRECIP_S v4f32:$vj)>; @@ -2154,6 +2173,24 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; +// VAVG_{B/H/W/D/BU/HU/WU/DU}, VAVGR_{B/H/W/D/BU/HU/WU/DU} +defm : VAvgPat<sra, "VAVG_B", v16i8>; +defm : VAvgPat<sra, "VAVG_H", v8i16>; +defm : VAvgPat<sra, "VAVG_W", v4i32>; +defm : VAvgPat<sra, "VAVG_D", v2i64>; +defm : VAvgPat<srl, "VAVG_BU", v16i8>; +defm : VAvgPat<srl, "VAVG_HU", v8i16>; +defm : VAvgPat<srl, "VAVG_WU", v4i32>; +defm : VAvgPat<srl, "VAVG_DU", v2i64>; +defm : VAvgrPat<sra, "VAVGR_B", v16i8>; +defm : VAvgrPat<sra, "VAVGR_H", v8i16>; +defm : VAvgrPat<sra, "VAVGR_W", v4i32>; +defm : VAvgrPat<sra, "VAVGR_D", v2i64>; +defm : VAvgrPat<srl, "VAVGR_BU", v16i8>; +defm : VAvgrPat<srl, "VAVGR_HU", v8i16>; +defm : VAvgrPat<srl, "VAVGR_WU", v4i32>; +defm : VAvgrPat<srl, "VAVGR_DU", v2i64>; + // abs def : Pat<(abs v16i8:$vj), (VSIGNCOV_B v16i8:$vj, v16i8:$vj)>; def : Pat<(abs v8i16:$vj), (VSIGNCOV_H v8i16:$vj, v8i16:$vj)>; @@ -2519,6 +2556,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)), (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>; +defm : PatVrF<fceil, "VFRINTRP">; +defm : PatVrF<ffloor, "VFRINTRM">; +defm : PatVrF<ftrunc, "VFRINTRZ">; +defm : PatVrF<froundeven, "VFRINTRNE">; + // load def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), (VLD GPR:$rj, (to_valid_timm timm:$imm))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp index 3acbe4992273a..76a8ba1c90e50 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering( 
initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5e12bafebb0d5..2beff07949daf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; LoongArchInstrInfo InstrInfo; - LoongArchRegisterInfo RegInfo; LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; @@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { } const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } const LoongArchRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const LoongArchTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 9de4c9d83792b..92a9388e5cb7b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -62,6 +62,11 @@ static cl::opt<bool> cl::desc("Enable the merge base offset pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableSinkFold("loongarch-enable-sink-fold", + cl::desc("Enable sinking and folding of instruction copies"), + cl::init(true), cl::Hidden); + static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { return RM.value_or(Reloc::Static); } @@ -146,7 +151,9 @@ namespace { class LoongArchPassConfig : public TargetPassConfig { public: LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + setEnableSinkAndFold(EnableSinkFold); + } LoongArchTargetMachine &getLoongArchTargetMachine() const { return getTM<LoongArchTargetMachine>(); diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index f548a8dd0532b..5107c8def3799 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -111,4 +111,25 @@ bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { } } -// TODO: Implement more hooks to provide TTI machinery for LoongArch. +LoongArchTTIImpl::TTI::MemCmpExpansionOptions +LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + + if (!ST->hasUAL()) + return Options; + + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + Options.AllowOverlappingLoads = true; + + // TODO: Support for vectors. 
+ if (ST->is64Bit()) { + Options.LoadSizes = {8, 4, 2, 1}; + Options.AllowedTailExpansions = {3, 5, 6}; + } else { + Options.LoadSizes = {4, 2, 1}; + Options.AllowedTailExpansions = {3}; + } + + return Options; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index e3f16c7804994..9b479f9dc0dc5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -55,7 +55,8 @@ class LoongArchTTIImpl : public BasicTTIImplBase<LoongArchTTIImpl> { bool shouldExpandReduction(const IntrinsicInst *II) const override; - // TODO: Implement more hooks to provide TTI machinery for LoongArch. + TTI::MemCmpExpansionOptions + enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7d5456555045b..6d69af5938e79 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -39,7 +39,7 @@ LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, /*HasRelocationAddend=*/true) {} -LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} +LoongArchELFObjectWriter::~LoongArchELFObjectWriter() = default; unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index f0e2bc4855187..08fa51d333346 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -38,7 +38,7 @@ class LoongArchMCCodeEmitter : public MCCodeEmitter { LoongArchMCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII) : Ctx(ctx), MCII(MCII) {} - ~LoongArchMCCodeEmitter() override {} + ~LoongArchMCCodeEmitter() override = default; void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index e37f3a66fe11f..fb5cd5c29d7dc 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -690,9 +690,9 @@ bool M68kAsmParser::parseRegisterName(MCRegister &RegNo, SMLoc Loc, } else { // Floating point control register. 
RegNo = StringSwitch<unsigned>(RegisterNameLower) - .Cases("fpc", "fpcr", M68k::FPC) - .Cases("fps", "fpsr", M68k::FPS) - .Cases("fpi", "fpiar", M68k::FPIAR) + .Cases({"fpc", "fpcr"}, M68k::FPC) + .Cases({"fps", "fpsr"}, M68k::FPS) + .Cases({"fpi", "fpiar"}, M68k::FPIAR) .Default(M68k::NoRegister); assert(RegNo != M68k::NoRegister && "Unrecognized FP control register name"); diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index c6be190bd1245..91077ff5961a4 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -43,7 +43,7 @@ using namespace llvm; void M68kInstrInfo::anchor() {} M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI) - : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, + : M68kGenInstrInfo(STI, RI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, M68k::RET), Subtarget(STI), RI(STI) {} @@ -838,15 +838,14 @@ bool M68kInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void M68kInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to store"); (void)MFI; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, TRI, Subtarget); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); // (0,FrameIndex) <- $reg M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIndex) @@ -857,15 +856,14 @@ void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to load"); (void)MFI; - unsigned Opc = getLoadRegOpcode(DstReg, RC, TRI, Subtarget); + unsigned Opc = getLoadRegOpcode(DstReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DstReg), FrameIndex); } diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index 97615d60caa0b..2b3789d768602 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -280,14 +280,12 @@ class M68kInstrInfo : public M68kGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = 
MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index 16ca7d2e6d0fd..4f9685814d9a9 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -27,8 +27,6 @@ #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/Alignment.h" -#include <string> - #define GET_SUBTARGETINFO_HEADER #include "M68kGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index fe83dc6e1abfb..51bafe4a4c56c 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -49,7 +49,7 @@ class M68kAsmBackend : public MCAsmBackend { M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI) : MCAsmBackend(llvm::endianness::big), Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU()) - .CasesLower("m68020", "m68030", "m68040", true) + .CasesLower({"m68020", "m68030", "m68040"}, true) .Default(false)) {} void applyFixup(const MCFragment &, const MCFixup &, const MCValue &, diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index a31c8ec1b2bb5..a8891d686abe8 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -230,7 +230,7 @@ class MSP430Operand : public MCParsedAsmOperand { O << "Token " << Tok; break; case k_Reg: - O << "Register " << Reg; + O << "Register " << Reg.id(); break; case k_Imm: O << "Immediate "; @@ -241,10 +241,10 @@ class MSP430Operand : public MCParsedAsmOperand { MAI.printExpr(O, *Mem.Offset); break; case k_IndReg: - O << "RegInd " << Reg; + O << "RegInd " << Reg.id(); break; case k_PostIndReg: - O << "PostInc " << Reg; + O << "PostInc " << Reg.id(); break; } } diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..0fb4e9d9fcb62 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -26,13 +26,13 @@ using namespace llvm; void MSP430InstrInfo::anchor() {} MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) - : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN, + MSP430::ADJCALLSTACKUP), RI() {} void MSP430InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -56,10 +56,12 @@ void MSP430InstrInfo::storeRegToStackSlot( llvm_unreachable("Cannot store this register to stack slot!"); } -void MSP430InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = 
MI->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index 316c136890bf8..c0a398452ef6d 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -42,13 +42,11 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 97379d78ae4ae..6b28531764db9 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -151,7 +151,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsCpRestoreSet; bool CurForbiddenSlotAttr; int CpRestoreOffset; - unsigned GPReg; + MCRegister GPReg; unsigned CpSaveLocation; /// If true, then CpSaveLocation is a register, otherwise it's an offset. bool CpSaveLocationIsRegister; @@ -823,7 +823,7 @@ class MipsOperand : public MCParsedAsmOperand { }; struct RegListOp { - SmallVector<unsigned, 10> *List; + SmallVector<MCRegister, 10> *List; }; union { @@ -1377,15 +1377,15 @@ class MipsOperand : public MCParsedAsmOperand { if (Size < 2 || Size > 5) return false; - unsigned R0 = RegList.List->front(); - unsigned R1 = RegList.List->back(); + MCRegister R0 = RegList.List->front(); + MCRegister R1 = RegList.List->back(); if (!((R0 == Mips::S0 && R1 == Mips::RA) || (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; - int PrevReg = *RegList.List->begin(); + MCRegister PrevReg = RegList.List->front(); for (int i = 1; i < Size - 1; i++) { - int Reg = (*(RegList.List))[i]; + MCRegister Reg = (*(RegList.List))[i]; if ( Reg != PrevReg + 1) return false; PrevReg = Reg; @@ -1447,7 +1447,7 @@ class MipsOperand : public MCParsedAsmOperand { return static_cast<const MCConstantExpr *>(getMemOff())->getValue(); } - const SmallVectorImpl<unsigned> &getRegList() const { + const SmallVectorImpl<MCRegister> &getRegList() const { assert((Kind == k_RegList) && "Invalid access!"); return *(RegList.List); } @@ -1548,12 +1548,13 @@ class MipsOperand : public MCParsedAsmOperand { } static std::unique_ptr<MipsOperand> - CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc, + CreateRegList(SmallVectorImpl<MCRegister> &Regs, SMLoc StartLoc, SMLoc EndLoc, MipsAsmParser &Parser) { - assert(Regs.size() > 0 && "Empty list not allowed"); + assert(!Regs.empty() && "Empty list not allowed"); auto Op = std::make_unique<MipsOperand>(k_RegList, Parser); - Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end()); + Op->RegList.List = + new SmallVector<MCRegister, 10>(Regs.begin(), Regs.end()); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; return Op; @@ -1684,7 +1685,7 @@ class MipsOperand : public 
MCParsedAsmOperand { case k_RegList: OS << "RegList< "; for (auto Reg : (*RegList.List)) - OS << Reg << " "; + OS << Reg.id() << " "; OS << ">"; break; } @@ -6176,7 +6177,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Cases("at", "AT", 1) + .Cases({"at", "AT"}, 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) @@ -6848,9 +6849,9 @@ ParseStatus MipsAsmParser::parseInvNum(OperandVector &Operands) { ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { MCAsmParser &Parser = getParser(); - SmallVector<unsigned, 10> Regs; - unsigned RegNo; - unsigned PrevReg = Mips::NoRegister; + SmallVector<MCRegister, 10> Regs; + MCRegister Reg; + MCRegister PrevReg; bool RegRange = false; SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands; @@ -6860,46 +6861,47 @@ ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); while (parseAnyRegister(TmpOperands).isSuccess()) { SMLoc E = getLexer().getLoc(); - MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back()); - RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg(); + MipsOperand &RegOpnd = static_cast<MipsOperand &>(*TmpOperands.back()); + Reg = isGP64bit() ? RegOpnd.getGPR64Reg() : RegOpnd.getGPR32Reg(); if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. - if ((isGP64bit() && RegNo == Mips::RA_64) || - (!isGP64bit() && RegNo == Mips::RA)) { - Regs.push_back(RegNo); + if ((isGP64bit() && Reg == Mips::RA_64) || + (!isGP64bit() && Reg == Mips::RA)) { + Regs.push_back(Reg); } else { - unsigned TmpReg = PrevReg + 1; - while (TmpReg <= RegNo) { + MCRegister TmpReg = PrevReg + 1; + while (TmpReg <= Reg) { if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && isGP64bit())) return Error(E, "invalid register operand"); PrevReg = TmpReg; - Regs.push_back(TmpReg++); + Regs.push_back(TmpReg); + TmpReg = TmpReg.id() + 1; } } RegRange = false; } else { - if ((PrevReg == Mips::NoRegister) && - ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || - (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) + if (!PrevReg.isValid() && + ((isGP64bit() && (Reg != Mips::S0_64) && (Reg != Mips::RA_64)) || + (!isGP64bit() && (Reg != Mips::S0) && (Reg != Mips::RA)))) return Error(E, "$16 or $31 expected"); - if (!(((RegNo == Mips::FP || RegNo == Mips::RA || - (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && + if (!(((Reg == Mips::FP || Reg == Mips::RA || + (Reg >= Mips::S0 && Reg <= Mips::S7)) && !isGP64bit()) || - ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || - (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && + ((Reg == Mips::FP_64 || Reg == Mips::RA_64 || + (Reg >= Mips::S0_64 && Reg <= Mips::S7_64)) && isGP64bit()))) return Error(E, "invalid register operand"); - if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && - ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || - (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && isGP64bit()))) + if (PrevReg.isValid() && (Reg != PrevReg + 1) && + ((Reg != Mips::FP && Reg != Mips::RA && !isGP64bit()) || + (Reg != Mips::FP_64 && Reg != Mips::RA_64 && isGP64bit()))) return Error(E, "consecutive register numbers expected"); - Regs.push_back(RegNo); + Regs.push_back(Reg); } if (Parser.getTok().is(AsmToken::Minus)) @@ -6913,7 +6915,7 @@ ParseStatus MipsAsmParser::parseRegisterList(OperandVector &Operands) { 
if (Parser.getTok().isNot(AsmToken::Dollar)) break; - PrevReg = RegNo; + PrevReg = Reg; } SMLoc E = Parser.getTok().getLoc(); @@ -7780,7 +7782,7 @@ bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) { } getParser().Lex(); // Consume the EndOfStatement. - unsigned NewReg = RegOpnd.getGPR32Reg(); + MCRegister NewReg = RegOpnd.getGPR32Reg(); if (IsPicEnabled) GPReg = NewReg; @@ -7835,7 +7837,6 @@ bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); - unsigned FuncReg; unsigned Save; bool SaveIsReg = true; @@ -7852,7 +7853,7 @@ bool MipsAsmParser::parseDirectiveCPSetup() { return false; } - FuncReg = FuncRegOpnd.getGPR32Reg(); + MCRegister FuncReg = FuncRegOpnd.getGPR32Reg(); TmpReg.clear(); if (!eatComma("unexpected token, expected comma")) @@ -7878,7 +7879,7 @@ bool MipsAsmParser::parseDirectiveCPSetup() { reportParseError(SaveOpnd.getStartLoc(), "invalid register"); return false; } - Save = SaveOpnd.getGPR32Reg(); + Save = SaveOpnd.getGPR32Reg().id(); } if (!eatComma("unexpected token, expected comma")) @@ -8696,7 +8697,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { "expected general purpose register"); return false; } - unsigned StackReg = StackRegOpnd.getGPR32Reg(); + MCRegister StackReg = StackRegOpnd.getGPR32Reg(); if (Parser.getTok().is(AsmToken::Comma)) Parser.Lex(); diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 12e31c07aa15a..fd9eb9b8fe9a3 100644 --- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -103,7 +103,7 @@ LLVMInitializeMipsDisassembler() { createMipselDisassembler); } -static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { +static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); return RegInfo->getRegClass(RC).getRegister(RegNo); } @@ -123,7 +123,7 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 30 || RegNo % 2) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); + MCRegister Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo / 2); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -134,7 +134,7 @@ static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -145,7 +145,7 @@ static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -156,7 +156,7 @@ static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo >= 4) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -167,7 +167,7 @@ static DecodeStatus 
DecodeMSA128BRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -178,7 +178,7 @@ static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -189,7 +189,7 @@ static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -200,7 +200,7 @@ static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -211,7 +211,7 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -222,7 +222,7 @@ static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -233,7 +233,7 @@ static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -881,7 +881,7 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -891,7 +891,7 @@ static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -901,7 +901,7 @@ DecodeGPRMM16ZeroRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ 
-911,7 +911,7 @@ DecodeGPRMM16MovePRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -948,7 +948,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -974,7 +974,7 @@ static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -985,7 +985,7 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -995,7 +995,7 @@ static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1005,7 +1005,7 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1016,7 +1016,7 @@ static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -1024,11 +1024,11 @@ static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if (Inst.getOpcode() == Mips::SC || Inst.getOpcode() == Mips::SC64 || Inst.getOpcode() == Mips::SCD) @@ -1044,14 +1044,14 @@ static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeMemEVA(MCInst &Inst, unsigned Insn, 
uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); - if (Inst.getOpcode() == Mips::SCE) - Inst.addOperand(MCOperand::createReg(Reg)); + if (Inst.getOpcode() == Mips::SCE) + Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1064,11 +1064,11 @@ static DecodeStatus DecodeLoadByte15(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1081,9 +1081,9 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); unsigned Hint = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1096,10 +1096,10 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0xfff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1112,10 +1112,10 @@ static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); unsigned Hint = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1129,9 +1129,9 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = 
getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1143,9 +1143,9 @@ static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1157,9 +1157,9 @@ static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Offset)); @@ -1170,9 +1170,9 @@ static DecodeStatus DecodeSyncI_MM(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeSynciR6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Immediate = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Base)); Inst.addOperand(MCOperand::createImm(Immediate)); @@ -1184,11 +1184,11 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); - unsigned Reg = fieldFromInstruction(Insn, 6, 5); - unsigned Base = fieldFromInstruction(Insn, 11, 5); + unsigned RegNo = fieldFromInstruction(Insn, 6, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 11, 5); - Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1288,9 +1288,9 @@ static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x1F; - unsigned Reg = fieldFromInstruction(Insn, 5, 5); + unsigned RegNo = fieldFromInstruction(Insn, 5, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Mips::SP)); @@ -1303,9 +1303,9 @@ static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { unsigned Offset = Insn & 0x7F; - unsigned Reg = fieldFromInstruction(Insn, 7, 3); + unsigned RegNo = fieldFromInstruction(Insn, 7, 3); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); 
Inst.addOperand(MCOperand::createReg(Mips::GP)); @@ -1342,11 +1342,11 @@ static DecodeStatus DecodeMemMMImm9(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<9>(Insn & 0x1ff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if (Inst.getOpcode() == Mips::SCE_MM || Inst.getOpcode() == Mips::SC_MMR6) Inst.addOperand(MCOperand::createReg(Reg)); @@ -1362,11 +1362,11 @@ static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<12>(Insn & 0x0fff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); switch (Inst.getOpcode()) { case Mips::SWM32_MM: @@ -1396,11 +1396,11 @@ static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1412,11 +1412,11 @@ static DecodeStatus DecodeMemMMImm16(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1431,11 +1431,11 @@ static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, // This function is the same as DecodeFMem but with the Reg and Base fields // swapped according to microMIPS spec. 
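// (Concretely: here the base field is read from bits 16-20 and the register from bits 21-25, the reverse of DecodeFMem above.)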
int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Base = fieldFromInstruction(Insn, 16, 5); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1447,11 +1447,11 @@ static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1463,11 +1463,11 @@ static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<16>(Insn & 0xffff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Reg = getReg(Decoder, Mips::COP3RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP3RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1480,11 +1480,11 @@ static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); - unsigned Reg = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 11, 5); + unsigned RegNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 11, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1497,11 +1497,11 @@ static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int Offset = SignExtend32<11>(Insn & 0x07ff); - unsigned Reg = fieldFromInstruction(Insn, 21, 5); - unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned RegNo = fieldFromInstruction(Insn, 21, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 16, 5); - Reg = getReg(Decoder, Mips::COP2RegClassID, Reg); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, 
BaseNo); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1514,11 +1514,11 @@ static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff); - unsigned Rt = fieldFromInstruction(Insn, 16, 5); - unsigned Base = fieldFromInstruction(Insn, 21, 5); + unsigned RtNo = fieldFromInstruction(Insn, 16, 5); + unsigned BaseNo = fieldFromInstruction(Insn, 21, 5); - Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt); - Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + MCRegister Rt = getReg(Decoder, Mips::GPR32RegClassID, RtNo); + MCRegister Base = getReg(Decoder, Mips::GPR32RegClassID, BaseNo); if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){ Inst.addOperand(MCOperand::createReg(Rt)); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index 6b013de274772..fd8eb33e20b26 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -67,7 +67,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() { Streamer->popSection(); } -void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg, +void MipsRegInfoRecord::SetPhysRegUsed(MCRegister Reg, const MCRegisterInfo *MCRegInfo) { unsigned Value = 0; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 1e1b9703d8062..01f18acf050d7 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -126,9 +126,9 @@ void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips3D() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMips3D() { forbidModuleDirective(); } -void MipsTargetStreamer::emitDirectiveCpAdd(unsigned RegNo) {} -void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} -void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetStreamer::emitDirectiveCpAdd(MCRegister Reg) {} +void MipsTargetStreamer::emitDirectiveCpLoad(MCRegister Reg) {} +void MipsTargetStreamer::emitDirectiveCpLocal(MCRegister Reg) { // .cplocal $reg // This directive forces use of the alternate register for the context pointer. // For example @@ -141,17 +141,17 @@ void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { if (!getABI().IsN32() && !getABI().IsN64()) return; - GPReg = RegNo; + GPReg = Reg; forbidModuleDirective(); } bool MipsTargetStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { forbidModuleDirective(); return true; } -void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, +void MipsTargetStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { } void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation, @@ -324,7 +324,7 @@ void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc, /// Emit a store instruction with an immediate offset.
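/// If the offset does not fit in 16 bits, it is synthesized using the assembler temporary obtained from GetATReg, as the body below shows.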
void MipsTargetStreamer::emitStoreWithImmOffset( unsigned Opcode, MCRegister SrcReg, MCRegister BaseReg, int64_t Offset, - function_ref<unsigned()> GetATReg, SMLoc IDLoc, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { if (isInt<16>(Offset)) { emitRRI(Opcode, SrcReg, BaseReg, Offset, IDLoc, STI); @@ -729,38 +729,38 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, OS << "," << FPUTopSavedRegOff << '\n'; } -void MipsTargetAsmStreamer::emitDirectiveCpAdd(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpAdd(MCRegister Reg) { OS << "\t.cpadd\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; forbidModuleDirective(); } -void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpLoad(MCRegister Reg) { OS << "\t.cpload\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; forbidModuleDirective(); } -void MipsTargetAsmStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetAsmStreamer::emitDirectiveCpLocal(MCRegister Reg) { OS << "\t.cplocal\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; - MipsTargetStreamer::emitDirectiveCpLocal(RegNo); + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << "\n"; + MipsTargetStreamer::emitDirectiveCpLocal(Reg); } bool MipsTargetAsmStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI); OS << "\t.cprestore\t" << Offset << "\n"; return true; } -void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, +void MipsTargetAsmStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { OS << "\t.cpsetup\t$" - << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", "; + << StringRef(MipsInstPrinter::getRegisterName(Reg)).lower() << ", "; if (IsReg) OS << "$" @@ -1229,18 +1229,18 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask, FPROffset = FPUTopSavedRegOff; } -void MipsTargetELFStreamer::emitDirectiveCpAdd(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpAdd(MCRegister Reg) { // .cpadd $reg // This directive inserts code to add $gp to the argument's register // when support for position independent code is enabled. 
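// For example, ".cpadd $4" becomes "addu $4, $4, $gp" (or "daddu" under the N64 ABI), per the emitAddu call below.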
if (!Pic) return; - emitAddu(RegNo, RegNo, GPReg, getABI().IsN64(), &STI); + emitAddu(Reg, Reg, GPReg, getABI().IsN64(), &STI); forbidModuleDirective(); } -void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpLoad(MCRegister Reg) { // .cpload $reg // This directive expands to: // lui $gp, %hi(_gp_disp) @@ -1283,19 +1283,19 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { TmpInst.setOpcode(Mips::ADDu); TmpInst.addOperand(MCOperand::createReg(GPReg)); TmpInst.addOperand(MCOperand::createReg(GPReg)); - TmpInst.addOperand(MCOperand::createReg(RegNo)); + TmpInst.addOperand(MCOperand::createReg(Reg)); getStreamer().emitInstruction(TmpInst, STI); forbidModuleDirective(); } -void MipsTargetELFStreamer::emitDirectiveCpLocal(unsigned RegNo) { +void MipsTargetELFStreamer::emitDirectiveCpLocal(MCRegister Reg) { if (Pic) - MipsTargetStreamer::emitDirectiveCpLocal(RegNo); + MipsTargetStreamer::emitDirectiveCpLocal(Reg); } bool MipsTargetELFStreamer::emitDirectiveCpRestore( - int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc, + int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) { MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI); // .cprestore offset @@ -1315,7 +1315,7 @@ bool MipsTargetELFStreamer::emitDirectiveCpRestore( return true; } -void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, +void MipsTargetELFStreamer::emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { @@ -1353,9 +1353,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // (d)addu $gp, $gp, $funcreg if (getABI().IsN32()) - emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + emitRRR(Mips::ADDu, GPReg, GPReg, Reg, SMLoc(), &STI); else - emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + emitRRR(Mips::DADDu, GPReg, GPReg, Reg, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h index b726a80ce6b72..71b5d165a9cb3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h @@ -98,13 +98,13 @@ class MipsTargetStreamer : public MCTargetStreamer { virtual void emitDirectiveSetHardFloat(); // PIC support - virtual void emitDirectiveCpAdd(unsigned RegNo); - virtual void emitDirectiveCpLoad(unsigned RegNo); - virtual void emitDirectiveCpLocal(unsigned RegNo); + virtual void emitDirectiveCpAdd(MCRegister Reg); + virtual void emitDirectiveCpLoad(MCRegister Reg); + virtual void emitDirectiveCpLocal(MCRegister Reg); virtual bool emitDirectiveCpRestore(int Offset, - function_ref<unsigned()> GetATReg, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + virtual void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg); virtual void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister); @@ -164,7 +164,7 @@ class MipsTargetStreamer : public MCTargetStreamer { /// by reporting an error). 
void emitStoreWithImmOffset(unsigned Opcode, MCRegister SrcReg, MCRegister BaseReg, int64_t Offset, - function_ref<unsigned()> GetATReg, SMLoc IDLoc, + function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); void emitLoadWithImmOffset(unsigned Opcode, MCRegister DstReg, MCRegister BaseReg, int64_t Offset, @@ -205,7 +205,7 @@ class MipsTargetStreamer : public MCTargetStreamer { bool FrameInfoSet; int FrameOffset; unsigned FrameReg; - unsigned GPReg; + MCRegister GPReg; unsigned ReturnReg; private: @@ -290,9 +290,9 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { void emitDirectiveSetHardFloat() override; // PIC support - void emitDirectiveCpAdd(unsigned RegNo) override; - void emitDirectiveCpLoad(unsigned RegNo) override; - void emitDirectiveCpLocal(unsigned RegNo) override; + void emitDirectiveCpAdd(MCRegister Reg) override; + void emitDirectiveCpLoad(MCRegister Reg) override; + void emitDirectiveCpLocal(MCRegister Reg) override; /// Emit a .cprestore directive. If the offset is out of range then it will /// be synthesized using the assembler temporary. @@ -301,9 +301,9 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { /// temporary and is only called when the assembler temporary is required. It /// must handle the case where no assembler temporary is available (typically /// by reporting an error). - bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg, + bool emitDirectiveCpRestore(int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) override; - void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister) override; @@ -370,12 +370,12 @@ class MipsTargetELFStreamer : public MipsTargetStreamer { void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; // PIC support - void emitDirectiveCpAdd(unsigned RegNo) override; - void emitDirectiveCpLoad(unsigned RegNo) override; - void emitDirectiveCpLocal(unsigned RegNo) override; - bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg, + void emitDirectiveCpAdd(MCRegister Reg) override; + void emitDirectiveCpLoad(MCRegister Reg) override; + void emitDirectiveCpLocal(MCRegister Reg) override; + bool emitDirectiveCpRestore(int Offset, function_ref<MCRegister()> GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI) override; - void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + void emitDirectiveCpsetup(MCRegister Reg, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; void emitDirectiveCpreturn(unsigned SaveLocation, bool SaveLocationIsRegister) override; diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index 5d08f560c3c36..d23ec57d46e17 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,11 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} - -const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -105,7 +101,6 @@ void 
Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -120,10 +115,12 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void Mips16InstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -405,9 +402,9 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm, } if (SecondRegSaved) copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true); + } else { + Available.reset(SpReg); } - else - Available.reset(SpReg); copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false); BuildMI(MBB, II, DL, get(Mips::AdduRxRyRz16), Reg) .addReg(SpReg, RegState::Kill) diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index 1058e8c25fb5b..4300d086f0614 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const Mips16RegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -56,13 +56,13 @@ class Mips16InstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bffdffa4af6a0..c879c46e49dd4 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -39,8 +39,9 @@ using namespace llvm; // Pin the vtable to this file.
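// (Defining this no-op virtual out of line gives the vtable a single home TU instead of emitting weak copies in every file that includes the header.)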
void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, + const MipsRegisterInfo &RI, unsigned UncondBr) + : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..0b90972977d5e 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirect branch. }; - explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI, + unsigned UncondBrOpc); MCInst getNop() const override; @@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo { /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - virtual const MipsRegisterInfo &getRegisterInfo() const = 0; + const MipsRegisterInfo &getRegisterInfo() const { + return static_cast<const MipsRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; @@ -143,31 +147,28 @@ class MipsInstrInfo : public MipsGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0, Flags); + storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, 0, Flags); } void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0, Flags); + loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, 0, Flags); } virtual void storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, + const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void adjustStackPtr(unsigned SP, int64_t Amount, diff --git a/llvm/lib/Target/Mips/MipsOptionRecord.h b/llvm/lib/Target/Mips/MipsOptionRecord.h index 7897095ef8941..2107baf9f14e5 100644 --- a/llvm/lib/Target/Mips/MipsOptionRecord.h +++ b/llvm/lib/Target/Mips/MipsOptionRecord.h @@ -58,7 +58,7 @@ class MipsRegInfoRecord : public
MipsOptionRecord { ~MipsRegInfoRecord() override = default; void EmitMipsOptionRecord() override; - void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo); + void SetPhysRegUsed(MCRegister Reg, const MCRegisterInfo *MCRegInfo); private: MipsELFStreamer *Streamer; diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index f08704a7e799c..942194cf31d44 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -172,7 +172,7 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) { Register VR = MRI.createVirtualRegister(RC); Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); - TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR, FI, RC, 0); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst) .addReg(VR, RegState::Kill); } @@ -189,7 +189,7 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) { BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR) .addReg(Src, getKillRegState(I->getOperand(0).isKill())); - TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR, true, FI, RC, 0); } void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, @@ -210,9 +210,9 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); - TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR0, FI, RC, 0); BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill); - TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize); + TII.loadRegFromStack(MBB, I, VR1, FI, RC, RegSize); BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill); } @@ -234,9 +234,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src); - TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR0, true, FI, RC, 0); BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill); - TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize); + TII.storeRegToStack(MBB, I, VR1, true, FI, RC, RegSize); } bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) { @@ -321,11 +321,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC2); if (!Subtarget.isLittle()) std::swap(LoReg, HiReg); - TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); - TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, - &RegInfo, 4); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0); + TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, 0); + TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, 4); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, 0); return true; } @@ -385,8 +383,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. 
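// (getMoveF64ViaSpillFI is expected to create the slot on first use and return the same frame index on subsequent calls.)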
int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC); - TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, 0); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, Offset); return true; } @@ -480,8 +478,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (!MBB.isLiveIn(ABI.GetEhDataReg(I))) MBB.addLiveIn(ABI.GetEhDataReg(I)); TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false, - MipsFI->getEhDataRegFI(I), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(I), RC, Register()); } // Emit .cfi_offset directives for eh data registers. @@ -579,8 +576,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(0), PtrRC, 0); // Fetch and Spill Status MBB.addLiveIn(Mips::COP012); @@ -590,8 +586,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(1), PtrRC, 0); // Build the configuration for disabling lower priority interrupts. Non EIC // interrupts need to be masked off with zero, EIC from the Cause register. @@ -657,7 +652,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo()); - const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MipsABIInfo ABI = STI.getABI(); @@ -690,8 +684,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, // Insert instructions that restore eh data registers. for (int J = 0; J < 4; ++J) { TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J), - MipsFI->getEhDataRegFI(J), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(J), RC, Register()); } } @@ -722,17 +715,15 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub( BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); // Restore EPC - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(0), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) .addReg(Mips::K1) .addImm(0); // Restore Status - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(1), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) .addReg(Mips::K1) .addImm(0); @@ -795,7 +786,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. 
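// Do not mark $ra killed when the function takes its return address; the register stays live past the spill.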
bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index dbdbb179a583d..a1d0aa089c089 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} - -const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -213,7 +209,6 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -239,16 +234,16 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -285,10 +280,12 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); } -void MipsSEInstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void MipsSEInstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -317,16 +314,16 @@ void MipsSEInstrInfo::loadRegFromStack( Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (TRI->isTypeLegalForClass(*RC, 
MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -682,8 +679,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI)); - unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI)); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index 2b4f55d184b8b..5c48ccdc27f02 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const MipsSERegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -50,13 +50,12 @@ class MipsSEInstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 1e0f747f8f7fc..95fd05f2a926f 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -66,7 +66,6 @@ void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); void initializeNVPTXLowerUnreachablePass(PassRegistry &); -void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &); void initializeNVPTXProxyRegErasurePass(PassRegistry &); void initializeNVPTXForwardParamsPassPass(PassRegistry &); diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index caef8fe790adb..b832b82cbc30c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -20,7 +20,7 @@ class MemoryLocation; class NVPTXAAResult : public AAResultBase { public: - NVPTXAAResult() {} + NVPTXAAResult() = default; NVPTXAAResult(NVPTXAAResult &&Arg) : AAResultBase(std::move(Arg)) {} /// Handle 
invalidation events from the new pass manager. diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 14ca867023e2a..9bbb3aad89c44 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -89,8 +89,6 @@ #include <cstdint> #include <cstring> #include <string> -#include <utility> -#include <vector> using namespace llvm; diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index c734d3d430073..7f190f33da808 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -28,10 +28,6 @@ using namespace llvm; -namespace llvm { -void initializeGenericToNVVMLegacyPassPass(PassRegistry &); -} - namespace { class GenericToNVVM { public: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7e7ee754c250d..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto scopeName = Context->getSyncScopeName(ID); + assert(scopeName.has_value() && "Scope name must exist."); + + // Build list of supported syncscopes programmatically + SmallVector<StringRef> supportedScopes; + for (const auto &Entry : Scopes) { + if (auto name = Context->getSyncScopeName(Entry.first)) + supportedScopes.push_back(name->empty() ? "<empty string>" : *name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + scopeName.value(), int(ID), + make_range(supportedScopes.begin(), supportedScopes.end()))); } return S->second; } @@ -1871,17 +1881,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } (is_ch ? 
(CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, ))) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ - [&]() -> auto { \ - if (is_mc && is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \ - if (is_ch) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \ - if (is_mc) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ - }() - static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, @@ -1925,112 +1924,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, } } -static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, - bool IsMultiCast, - bool IsCacheHint, bool IsIm2Col) { - if (IsIm2Col) { - switch (Dim) { - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable("Invalid Dimension in im2col mode for " - "GetCpAsyncBulkTensorG2SOpcode."); - } - } else { - switch (Dim) { - case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast, - IsCacheHint, IsShared32); - default: - llvm_unreachable( - "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode."); - } - } -} - -static size_t GetDimsFromIntrinsic(unsigned IID) { - switch (IID) { - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: - return 3; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: - return 4; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: - return 5; - default: - llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic."); - } -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} - // multicast, cache_hint, - // multicast_flag, cache_hint_flag, cta_group_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {8 + dims + im2col_offsets} - size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 10); - // Offsets is always 'NumDims - 2' and only for im2col mode - size_t NumOffsets = IsIm2Col ? 
(NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; - size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} - size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID - - unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); - if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) - report_fatal_error( - formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", - Subtarget->getSmVersion())); - - SDLoc DL(N); - SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs)); - - // Push MultiCast operand, if available - if (IsMultiCast) - Ops.push_back(N->getOperand(MultiCastIdx)); - - // Push CacheHint operand, if available - if (IsCacheHint) - Ops.push_back(N->getOperand(MultiCastIdx + 1)); - - // Flag for CTA Group - Ops.push_back(getI32Imm(CTAGroupVal, DL)); - - // Finally, the chain operand - Ops.push_back(N->getOperand(0)); - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode( - NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col) { @@ -2175,18 +2068,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: - SelectCpAsyncBulkTensorG2SCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index c912e709d0aa0..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { @@ -86,7 +87,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); - void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2f1a7ad2d401f..3e44e47c56ad7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -305,7 +305,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, uint64_t StartingOffset = 0) { 
SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); + ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets, + StartingOffset); for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) { MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); @@ -5420,8 +5421,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); - SmallVector<SDNode *> DeadCopyToRegs; - // Check whether all outputs are either used by an extractelt or are // glue/chain nodes if (!all_of(N->uses(), [&](SDUse &U) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6840c7ae8faf4..db2d96f5ff532 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) - : NVPTXGenInstrInfo(STI), RegInfo() {} + : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dfde0cca0f00c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">; -def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>; @@ -2268,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c923f0ec907e7..50827bd548ad5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> { string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", "); } -// From Global to Shared memory (G2S) -class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> { - string prefix = "cp.async.bulk.tensor"; - string dir = "shared::cluster.global"; - string completion = "mbarrier::complete_tx::bytes"; - string inst_name = prefix - # "." # dim # "d" - # "." # dir - # "." # mode - # "." 
# completion - # !if(mc, ".multicast::cluster", "") - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_" - # dim # "D" - # !if(is_shared32, "_SHARED32", "") - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - def CTAGroupFlags : Operand<i32> { let PrintMethod = "printCTAGroup"; } -multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> { - defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; - defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; - defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; - defvar rc = !if(is_shared32, B32, B64); - - defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); - defvar im2col_dag = !if(!eq(mode, "im2col"), - !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), - (ins)); - defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); - defvar im2col_asm_str = ", {{" # im2col_str # "}}"; - - defvar asm_str = !if(!eq(mode, "im2col"), - !strconcat(asm_str_default, im2col_asm_str), asm_str_default); +def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>; +def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>; - def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, - (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)), - !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>, - Requires<[hasPTX<80>, hasSM<90>]>; -} - -foreach dim = [1, 2, 3, 4, 5] in { - foreach shared32 = [true, false] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name : - CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>; - } - } -} - -multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> { +multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred, + TImmLeaf cta_group_type = tma_cta_group_imm_any> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; defvar dims_str = TMA_DIMS_UTIL<dim>.base_str; defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; @@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> !setdagop(dims_dag, intr), !setdagop(im2col_dag, intr), (intr B16:$mc, B64:$ch)); - defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); - defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); - defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); - defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, 
cta_group_type:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg)); def "" : NVPTXInst<(outs), ins_dag, inst_name # asm_str # ";", @@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> [intr_dag_with_mc_ch]>, Requires<pred>; } + +foreach dim = 1...5 in { + defm TMA_G2S_TILE_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_TILE_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "tile", + [callSubtarget<"hasTMABlackwellSupport">]>; +} foreach dim = 3...5 in { + defm TMA_G2S_IM2COL_CG0_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>], + tma_cta_group_imm0>; + defm TMA_G2S_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_INTR<dim, "im2col", + [callSubtarget<"hasTMABlackwellSupport">]>; foreach mode = ["im2col_w", "im2col_w_128"] in { defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" - : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_INTR<dim, mode, + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> { defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag; @@ -784,7 +740,8 @@ foreach dim = 3...5 in { : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>; defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" - : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>; + : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", + [callSubtarget<"hasTMABlackwellSupport">]>; } defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", [hasPTX<86>, hasSM<100>]>; @@ -835,7 +792,7 @@ foreach dim = 1...5 in { } } defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; def TMAReductionFlags : Operand<i32> { let PrintMethod = "printTmaReductionMode"; @@ -930,11 +887,11 @@ foreach dim = 3...5 in { foreach mode = ["im2col_w", "im2col_w_128"] in { defvar suffix = !toupper(mode) # "_" # dim # "D"; defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode, - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; } } defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", - [hasTMACTAGroupSupport]>; + [callSubtarget<"hasTMABlackwellSupport">]>; //Prefetchu and Prefetch @@ -1605,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>; // Exp2 Log2 // -def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>; -def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>; +def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>; let Predicates = [hasPTX<70>, hasSM<75>] in { - def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>; - def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>; + def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>; + def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>; +} + +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>; + def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>; } def LG2_APPROX_f32 : diff --git 
a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 194dbdc061a96..021b1f6d0bf57 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; - // TMA G2S copy with cta_group::1/2 support - bool hasCpAsyncBulkTensorCTAGroupSupport() const { - // TODO: Update/tidy-up after the family-conditional support arrives - switch (FullSmVersion) { - case 1003: - case 1013: - return PTXVersion >= 86; - case 1033: - return PTXVersion >= 88; - default: - return false; - } + // Checks support for the following in TMA: + // - cta_group::1/2 support + // - im2col_w/w_128 mode support + // - tile_gather4 mode support + // - tile_scatter4 mode support + bool hasTMABlackwellSupport() const { + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 729c077884f3a..64593e6439184 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, // answer. These include: // // - nvvm_cos_approx_{f,ftz_f} - // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_ex2_approx(_ftz) // - nvvm_lg2_approx_{d,f,ftz_f} // - nvvm_sin_approx_{f,ftz_f} // - nvvm_sqrt_approx_{f,ftz_f} diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 4b5cb30fd3036..21d7768af3d06 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -25,9 +25,7 @@ #include "llvm/Support/Alignment.h" #include "llvm/Support/FormatVariadic.h" #include <cstdarg> -#include <set> #include <string> -#include <vector> namespace llvm { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index b27bc3bd49315..a2f981e861511 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -619,11 +619,11 @@ bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { /// getVerboseConditionRegName - This method expands the condition register /// when requested explicitly or targeting Darwin.
const char * -PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, +PPCInstPrinter::getVerboseConditionRegName(MCRegister Reg, unsigned RegEncoding) const { if (!FullRegNames && !MAI.useFullRegisterNames()) return nullptr; - if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) + if (Reg < PPC::CR0EQ || Reg > PPC::CR7UN) return nullptr; const char *CRBits[] = { "lt", "gt", "eq", "un", diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h index 48f66ca26958e..01ff6255f2a03 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -23,7 +23,7 @@ class PPCInstPrinter : public MCInstPrinter { private: bool showRegistersWithPercentPrefix(const char *RegName) const; bool showRegistersWithPrefix() const; - const char *getVerboseConditionRegName(unsigned RegNum, + const char *getVerboseConditionRegName(MCRegister Reg, unsigned RegEncoding) const; public: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index a088096c92a68..db37fbf395096 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -287,11 +287,11 @@ using llvm::MCPhysReg; namespace llvm { namespace PPC { -static inline bool isVFRegister(unsigned Reg) { +static inline bool isVFRegister(MCRegister Reg) { return Reg >= PPC::VF0 && Reg <= PPC::VF31; } -static inline bool isVRRegister(unsigned Reg) { +static inline bool isVRRegister(MCRegister Reg) { return Reg >= PPC::V0 && Reg <= PPC::V31; } diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index bcb3f507e98d6..122738caa6827 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2702,7 +2702,7 @@ static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) { static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { return StringSwitch<bool>(GV->getName()) - .Cases("llvm.global_ctors", "llvm.global_dtors", true) + .Cases({"llvm.global_ctors", "llvm.global_dtors"}, true) .Default(false); } @@ -2750,6 +2750,10 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) return; + // Ignore non-emitted data. + if (GV->getSection() == "llvm.metadata") + return; + // If the Global Variable has the toc-data attribute, it needs to be emitted // when we emit the .toc section. if (GV->hasAttribute("toc-data")) { diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 910bc9d281259..aae3e49f6c70b 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2520,11 +2520,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // saved vector registers. if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - I.getFrameIdx(), RC, TRI); + TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), + RC); else TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, - TRI, Register()); + Register()); } } } @@ -2690,10 +2690,9 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( // saved vector registers. 
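// The spill/reload API change running through these PowerPC hunks mirrors the
// Mips and NVPTX ones above: storeRegToStackSlot/loadRegFromStackSlot (and the
// PPC-specific *NoUpd variants) no longer take a TargetRegisterInfo*, since
// the TargetInstrInfo base is now constructed with a reference to the target's
// register info (see the updated constructors earlier in this patch). A
// minimal before/after sketch of a caller, illustrative only:
//
//   // Before: TRI had to be threaded through every spill.
//   TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FI, RC, TRI,
//                           Register());
//   // After: the same call without the TRI argument.
//   TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FI, RC, Register());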
if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC, - TRI); + TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC); else - TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI, + TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, Register()); assert(I != MBB.begin() && diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 20fc849ea4aa5..dd233e236e17f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -657,6 +657,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); + if (Subtarget.isISA3_0() && isPPC64) { + setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom); + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom); @@ -11917,6 +11928,62 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, return getDataClassTest(LHS, Category, Dl, DAG, Subtarget); } +// Adjust the length value for a load/store with length to account for the +// instructions requiring a left justified length, and for non-byte element +// types requiring scaling by element size. +static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left, + SelectionDAG &DAG) { + SDLoc dl(Val); + EVT VT = Val->getValueType(0); + unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0; + unsigned TypeAdj = llvm::countr_zero<uint32_t>(Bits / 8); + SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT); + return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt); +} + +SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const { + auto VPLD = cast<VPLoadSDNode>(Op); + bool Future = Subtarget.isISAFuture(); + SDLoc dl(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4)); + unsigned IID = Future ? 
Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl; + unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits(); + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32), + VPLD->getOperand(1), Len}; + SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue VPL = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops, + VPLD->getMemoryVT(), VPLD->getMemOperand()); + return VPL; +} + +SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const { + auto VPST = cast<VPStoreSDNode>(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5)); + unsigned EltBits = + Op->getOperand(1).getValueType().getScalarType().getSizeInBits(); + bool Future = Subtarget.isISAFuture(); + unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl; + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = { + VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)), + VPST->getOperand(2), Len}; + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue VPS = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, + VPST->getMemoryVT(), VPST->getMemOperand()); + return VPS; +} + SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -12771,6 +12838,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { if (Op->getFlags().hasNoFPExcept()) return Op; return SDValue(); + case ISD::VP_LOAD: + return LowerVP_LOAD(Op, DAG); + case ISD::VP_STORE: + return LowerVP_STORE(Op, DAG); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 880aca751d7d6..d967018982734 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1345,6 +1345,9 @@ namespace llvm { SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 23d6d8853800f..fe1eea2b33615 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -889,6 +889,7 @@ def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)), (v16i8 (VRLB v16i8:$vA, v16i8:$vB))>; def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)), (v8i16 (VRLH v8i16:$vA, v8i16:$vB))>; +let Predicates = [IsNotISAFuture] in def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)), (v4i32 (VRLW v4i32:$vA, v4i32:$vB))>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc15f1e1..e417ffe6d3677 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, 
$RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), @@ -412,6 +420,11 @@ let Predicates = [HasVSX, IsISAFuture] in { : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), "vucmprlh $VRT, $VRA, $VRB", []>; + def XVRLW : XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvrlw $XT, $XA, $XB", + [(set v4i32:$XT, (int_ppc_vsx_xvrlw v4i32:$XA, + v4i32:$XB))]>; + // AES Acceleration Instructions def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp), (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M), @@ -539,6 +552,10 @@ def : Pat<(int_ppc_vsx_stxvprl v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRL $XTp, $RA, $RB)>; def : Pat<(int_ppc_vsx_stxvprll v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRLL $XTp, $RA, $RB)>; +let Predicates = [HasVSX, IsISAFuture] in { + def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)), (v4i32 (XVRLW v4i32:$vA, + v4i32:$vB))>; +} //---------------------------- Instruction aliases ---------------------------// // Predicate combinations available: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..366a7b6d0135a 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -89,7 +89,7 @@ static cl::opt<bool> EnableFMARegPressureReduction( void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) - : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} @@ -2014,8 +2014,7 @@ void PPCInstrInfo::StoreRegToStackSlot( void PPCInstrInfo::storeRegToStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + bool isKill, int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr *, 4> NewMIs; @@ -2034,8 +2033,7 @@ void PPCInstrInfo::storeRegToStackSlotNoUpd( void PPCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register @@ -2045,7 +2043,7 @@ void PPCInstrInfo::storeRegToStackSlot( // the register is defined using an Altivec instruction and is then used by a // VSX instruction. 
RC = updatedRC(RC); - storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI); + storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC); } void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, @@ -2060,8 +2058,7 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, void PPCInstrInfo::loadRegFromStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr*, 4> NewMIs; DebugLoc DL; @@ -2080,10 +2077,12 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd( NewMIs.back()->addMemOperand(MF, MMO); } -void PPCInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register // using a VSX instruction. The issue with this is that the VSX @@ -2093,7 +2092,7 @@ void PPCInstrInfo::loadRegFromStackSlot( // VSX instruction. RC = updatedRC(RC); - loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI); + loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC); } bool PPCInstrInfo:: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d67fc28935586..8b824bc219ab2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -570,7 +570,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register spill without updating the register class for vector @@ -579,13 +580,13 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register reload without updating the register class for vector @@ -594,8 +595,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; unsigned getStoreOpcodeForSpill(const TargetRegisterClass *RC) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index 
b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 3640d2545b5ac..70df59d01d6c7 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -1316,7 +1316,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { // useless and possible to break some original well-form addressing mode // to make this pre-inc prep for it. if (PointerElementType->isIntegerTy(64)) { - const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L); + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV); if (!LARSCEV || LARSCEV->getLoop() != L) return false; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 85b40727ff296..b3a7c829958ec 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -2023,7 +2023,7 @@ Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *RC = getPointerRegClass(); Register BaseReg = MRI.createVirtualRegister(RC); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -2051,7 +2051,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = MI.getDesc(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index f275802fe1843..7d933588025fe 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -23,7 +23,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/TargetParser/Triple.h" -#include <string> #define GET_SUBTARGETINFO_HEADER #include "PPCGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 000d29610678f..4ff489d482fa5 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ 
b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,8 +296,9 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT) - : CodeGenTargetMachineImpl(T, TT.computeDataLayout(), TT, CPU, - computeFSAdditions(FS, OL, TT), Options, + : CodeGenTargetMachineImpl(T, + TT.computeDataLayout(Options.MCOptions.ABIName), + TT, CPU, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2fba090f2d501..e74f1bdec8008 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -24,6 +24,10 @@ using namespace llvm; #define DEBUG_TYPE "ppctti" +static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl", + cl::desc("Allow vp.load and vp.store for pwr9"), + cl::init(false), cl::Hidden); + static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost", cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden); @@ -912,7 +916,7 @@ bool PPCTTIImpl::areInlineCompatible(const Function *Caller, bool PPCTTIImpl::areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const { + ArrayRef<Type *> Types) const { // We need to ensure that argument promotion does not // attempt to promote pointers to MMA types (__vector_pair @@ -1031,3 +1035,42 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const { return TLI->supportsTailCallFor(CB); } + +// Target hook used by CodeGen to decide whether to expand vector predication +// intrinsics into scalar operations or to use special ISD nodes to represent +// them. The Target will not see the intrinsics. +TargetTransformInfo::VPLegalization +PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const { + using VPLegalization = TargetTransformInfo::VPLegalization; + unsigned Directive = ST->getCPUDirective(); + VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI); + if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE && + (!Pwr9EVL || Directive != PPC::DIR_PWR9)) + return DefaultLegalization; + + if (!ST->isPPC64()) + return DefaultLegalization; + + unsigned IID = PI.getIntrinsicID(); + if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store) + return DefaultLegalization; + + bool IsLoad = IID == Intrinsic::vp_load; + Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType(); + EVT VT = TLI->getValueType(DL, VecTy, true); + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + VT != MVT::v16i8) + return DefaultLegalization; + + auto IsAllTrueMask = [](Value *MaskVal) { + if (Value *SplattedVal = getSplatValue(MaskVal)) + if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) + return ConstValue->isAllOnesValue(); + return false; + }; + unsigned MaskIx = IsLoad ? 
1 : 2; + if (!IsAllTrueMask(PI.getOperand(MaskIx))) + return DefaultLegalization; + + return VPLegalization(VPLegalization::Legal, VPLegalization::Legal); +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 475472ac3720f..f80ebdbce7f64 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -147,9 +147,12 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> { bool areInlineCompatible(const Function *Caller, const Function *Callee) const override; bool areTypesABICompatible(const Function *Caller, const Function *Callee, - const ArrayRef<Type *> &Types) const override; + ArrayRef<Type *> Types) const override; bool supportsTailCallFor(const CallBase *CB) const override; + TargetTransformInfo::VPLegalization + getVPLegalizationStrategy(const VPIntrinsic &PI) const override; + private: // The following constant is used for estimating costs on power9. static const InstructionCost::CostType P9PipelineFlushEstimate = 80; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index edde7ac487da3..10588b9739188 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -352,7 +352,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { } Kind; struct RegOp { - MCRegister RegNum; + MCRegister Reg; bool IsGPRAsFPR; }; @@ -461,20 +461,18 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isReg() const override { return Kind == KindTy::Register; } bool isExpr() const { return Kind == KindTy::Expression; } bool isV0Reg() const { - return Kind == KindTy::Register && Reg.RegNum == RISCV::V0; + return Kind == KindTy::Register && Reg.Reg == RISCV::V0; } bool isAnyReg() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg.Reg)); } bool isAnyRegC() const { return Kind == KindTy::Register && - (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains( - Reg.RegNum) || - RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains( - Reg.RegNum)); + (RISCVMCRegisterClasses[RISCV::GPRCRegClassID].contains(Reg.Reg) || + RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg.Reg)); } bool isImm() const override { return isExpr(); } bool isMem() const override { return false; } @@ -488,35 +486,33 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isGPR() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.Reg); } bool isGPRPair() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains(Reg.Reg); } bool isGPRPairC() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains( - Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRPairCRegClassID].contains(Reg.Reg); } bool isGPRPairNoX0() const { return Kind == KindTy::Register && 
RISCVMCRegisterClasses[RISCV::GPRPairNoX0RegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isGPRF16() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.Reg); } bool isGPRF32() const { return Kind == KindTy::Register && - RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.RegNum); + RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.Reg); } bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } @@ -991,7 +987,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { MCRegister getReg() const override { assert(Kind == KindTy::Register && "Invalid type access!"); - return Reg.RegNum; + return Reg.Reg; } StringRef getSysReg() const { @@ -1047,7 +1043,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { OS << "<fpimm: " << FPImm.Val << ">"; break; case KindTy::Register: - OS << "<reg: " << RegName(Reg.RegNum) << " (" << Reg.RegNum + OS << "<reg: " << RegName(Reg.Reg) << " (" << Reg.Reg.id() << (Reg.IsGPRAsFPR ? ") GPRasFPR>" : ")>"); break; case KindTy::Token: @@ -1099,7 +1095,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { static std::unique_ptr<RISCVOperand> createReg(MCRegister Reg, SMLoc S, SMLoc E, bool IsGPRAsFPR = false) { auto Op = std::make_unique<RISCVOperand>(KindTy::Register); - Op->Reg.RegNum = Reg; + Op->Reg.Reg = Reg; Op->Reg.IsGPRAsFPR = IsGPRAsFPR; Op->StartLoc = S; Op->EndLoc = E; @@ -1335,28 +1331,28 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, bool IsRegVR = RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg); if (IsRegFPR64 && Kind == MCK_FPR128) { - Op.Reg.RegNum = convertFPR64ToFPR128(Reg); + Op.Reg.Reg = convertFPR64ToFPR128(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. if ((IsRegFPR64 && Kind == MCK_FPR32) || (IsRegFPR64C && Kind == MCK_FPR32C)) { - Op.Reg.RegNum = convertFPR64ToFPR32(Reg); + Op.Reg.Reg = convertFPR64ToFPR32(Reg); return Match_Success; } // As the parser couldn't differentiate an FPR16 from an FPR64, coerce the // register from FPR64 to FPR16 if necessary. if (IsRegFPR64 && Kind == MCK_FPR16) { - Op.Reg.RegNum = convertFPR64ToFPR16(Reg); + Op.Reg.Reg = convertFPR64ToFPR16(Reg); return Match_Success; } if (Kind == MCK_GPRAsFPR16 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_H; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_H; return Match_Success; } if (Kind == MCK_GPRAsFPR32 && Op.isGPRAsFPR()) { - Op.Reg.RegNum = Reg - RISCV::X0 + RISCV::X0_W; + Op.Reg.Reg = Reg - RISCV::X0 + RISCV::X0_W; return Match_Success; } @@ -1372,8 +1368,8 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // As the parser couldn't differentiate an VRM2/VRM4/VRM8 from an VR, coerce // the register from VR to VRM2/VRM4/VRM8 if necessary. 
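// These coercions are plain index arithmetic over the generated register
// enums: sibling register files are laid out contiguously, so rebasing an
// index moves a register between views of the same physical file. A hedged
// sketch of the pattern (asGPRF16 is a hypothetical helper; the enum names
// appear verbatim in the hunks above):
//
//   static MCRegister asGPRF16(MCRegister Reg) {
//     return Reg - RISCV::X0 + RISCV::X0_H; // same index, 16-bit FPR view
//   }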
if (IsRegVR && (Kind == MCK_VRM2 || Kind == MCK_VRM4 || Kind == MCK_VRM8)) { - Op.Reg.RegNum = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); - if (!Op.Reg.RegNum) + Op.Reg.Reg = convertVRToVRMx(*getContext().getRegisterInfo(), Reg, Kind); + if (!Op.Reg.Reg) return Match_InvalidOperand; return Match_Success; } diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 0ff178e1f1959..e9088a4d9275c 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen RISCVMoveMerger.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp + RISCVPromoteConstant.cpp RISCVPushPopOptimizer.cpp RISCVRedundantCopyElimination.cpp RISCVRegisterInfo.cpp diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 81981732ee080..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,11 @@ class RISCVInstructionSelector : public InstructionSelector { void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; + void addVectorLoadStoreOperands(MachineInstr &I, + SmallVectorImpl<SrcOp> &SrcOps, + unsigned &CurOp, bool IsMasked, + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -716,6 +721,28 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +void RISCVInstructionSelector::addVectorLoadStoreOperands( + MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride or Index + if (IsStridedOrIndexed) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); + } + + // Mask + if (IsMasked) { + auto MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } +} + bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineIRBuilder &MIB) const { // Find the intrinsic ID. 
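// The new addVectorLoadStoreOperands helper centralizes the operand walk that
// every vector load/store intrinsic case repeats: base pointer, optional
// stride/index (with the index type reported back through IndexVT), optional
// mask, advancing CurOp in place. A hedged usage sketch for a masked indexed
// load, with operand indices as in the cases below:
//
//   unsigned CurOp = 2;            // op 0 = result def, op 1 = intrinsic ID
//   SmallVector<SrcOp, 4> SrcOps;
//   SrcOps.push_back(PassthruReg); // passthru precedes the memory operands
//   LLT IndexVT;
//   addVectorLoadStoreOperands(I, SrcOps, CurOp, /*IsMasked=*/true,
//                              /*IsStridedOrIndexed=*/true, &IndexVT);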
@@ -752,21 +779,7 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( SrcOps.push_back(Register(RISCV::NoRegister)); } - // Base Pointer - auto PtrReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(PtrReg); - - // Stride - if (IsStrided) { - auto StrideReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(StrideReg); - } - - // Mask - if (IsMasked) { - auto MaskReg = I.getOperand(CurOp++).getReg(); - SrcOps.push_back(MaskReg); - } + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); const RISCV::VLEPseudo *P = @@ -795,6 +808,162 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + case Intrinsic::riscv_vsm: + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vsse_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vse_mask || + IntrinID == Intrinsic::riscv_vsse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vsse || + IntrinID == Intrinsic::riscv_vsse_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
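// Editorial note (an assumption about operand order, not stated in the
// patch): unlike the load cases above, store intrinsics carry no passthru,
// so collection starts at operand index 1 with the value being stored --
// e.g. riscv_vsse_mask would be laid out as (store value, ptr, stride, mask,
// vl). The variable below is still named PassthruReg, mirroring the load
// path; something like StoreValReg would be a clearer name.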
+ + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, IsStrided); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VSEPseudo *P = RISCV::getVSEPseudo( + IsMasked, IsStrided, Log2SEW, static_cast<unsigned>(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..d8dcd963050b5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, @@ -701,7 +700,7 @@ enum RLISTENCODE { inline unsigned encodeRegList(MCRegister EndReg, bool IsRVE = false) { assert((!IsRVE || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); - switch (EndReg) { + switch (EndReg.id()) { case RISCV::X1: return RLISTENCODE::RA; case RISCV::X8: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 98c873824bc1d..a2b75e4a42e76 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -11,7 +11,6 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCFixup.h" -#include <utility> #undef RISCV diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 5934c91cb4b9a..fd460e457a415 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -725,7 +725,7 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo, MCOperand MO = MI.getOperand(OpNo); assert(MO.isReg() && "Expected a register."); - switch (MO.getReg()) { + switch (MO.getReg().id()) { default: llvm_unreachable("Invalid mask register."); case RISCV::V0: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 26f434b528584..cedaa8679ff1b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -79,6 +79,32 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, } } + if (STI.hasFeature(RISCV::FeatureStdExtP)) { + // Check if the immediate is packed i8 or i10 + int32_t Bit63To32 = Val >> 32; + int32_t Bit31To0 = Val; + int16_t Bit31To16 = Bit31To0 >> 16; + int16_t Bit15To0 = Bit31To0; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Bit15To0; + if (Bit63To32 == Bit31To0) { + if (IsRV64 && isInt<10>(Bit63To32)) { + Res.emplace_back(RISCV::PLI_W, Bit63To32); + return; + } + if (Bit31To16 == Bit15To0) { + if (isInt<10>(Bit31To16)) { + Res.emplace_back(RISCV::PLI_H, Bit31To16); + return; + } + if (Bit15To8 == Bit7To0) { + Res.emplace_back(RISCV::PLI_B, Bit15To8); + return; + } + } + } + } + if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -562,6 +588,9 @@ OpndKind Inst::getOpndKind() const { case RISCV::LUI: case RISCV::QC_LI: case RISCV::QC_E_LI: + case RISCV::PLI_B: + case RISCV::PLI_H: + case RISCV::PLI_W: return RISCVMatInt::Imm; case RISCV::ADD_UW: return RISCVMatInt::RegX0; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index a82cd650f42fa..5df8edb2ee85a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -21,7 +21,7 @@ namespace RISCVMatInt { enum OpndKind { RegImm, // ADDI/ADDIW/XORI/SLLI/SRLI/SLLI_UW/RORI/BSETI/BCLRI/TH_SRRI - Imm, // LUI/QC_LI/QC_E_LI + Imm, // LUI/QC_LI/QC_E_LI/PLI_B/PLI_H/PLI_W RegReg, // SH1ADD/SH2ADD/SH3ADD/PACK RegX0, // ADD_UW }; diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index ae9410193efe1..51e8e8574ed15 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -20,6 +20,7 @@ namespace llvm { class FunctionPass; class InstructionSelector; +class ModulePass; class PassRegistry; class RISCVRegisterBankInfo; class RISCVSubtarget; @@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &); FunctionPass *createRISCVPreLegalizerCombiner(); void initializeRISCVPreLegalizerCombinerPass(PassRegistry &); +ModulePass *createRISCVPromoteConstantPass(); +void initializeRISCVPromoteConstantPass(PassRegistry &); + FunctionPass *createRISCVVLOptimizerPass(); void initializeRISCVVLOptimizerPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 51180f548ca6d..5d3d9b5c4cf03 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ 
-59,7 +59,6 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n"); @@ -89,7 +88,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); Register X0Reg; - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (RC && RC->contains(RISCV::X0)) { X0Reg = RISCV::X0; } else if (RC && RC->contains(RISCV::X0_W)) { diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 410561855e181..60e0afdd99912 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -127,6 +127,14 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: case RISCV::PseudoCCOR: case RISCV::PseudoCCXOR: + case RISCV::PseudoCCMAX: + case RISCV::PseudoCCMAXU: + case RISCV::PseudoCCMIN: + case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: + case RISCV::PseudoCCLUI: + case RISCV::PseudoCCQC_LI: + case RISCV::PseudoCCQC_E_LI: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -217,6 +225,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, .addImm(0); } else { unsigned NewOpc; + // clang-format off switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); @@ -228,6 +237,14 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCAND: NewOpc = RISCV::AND; break; case RISCV::PseudoCCOR: NewOpc = RISCV::OR; break; case RISCV::PseudoCCXOR: NewOpc = RISCV::XOR; break; + case RISCV::PseudoCCMAX: NewOpc = RISCV::MAX; break; + case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; + case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; + case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; + case RISCV::PseudoCCLUI: NewOpc = RISCV::LUI; break; + case RISCV::PseudoCCQC_LI: NewOpc = RISCV::QC_LI; break; + case RISCV::PseudoCCQC_E_LI: NewOpc = RISCV::QC_E_LI; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; @@ -250,12 +267,16 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCNDS_BFOS: NewOpc = RISCV::NDS_BFOS; break; case RISCV::PseudoCCNDS_BFOZ: NewOpc = RISCV::NDS_BFOZ; break; } + // clang-format on if (NewOpc == RISCV::NDS_BFOZ || NewOpc == RISCV::NDS_BFOS) { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) .add(MI.getOperand(6)) .add(MI.getOperand(7)); + } else if (NewOpc == RISCV::LUI || NewOpc == RISCV::QC_LI || + NewOpc == RISCV::QC_E_LI) { + BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg).add(MI.getOperand(5)); } else { BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index b4556f66473d6..0b964c4808d8a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -956,6 +956,9 @@ def FeatureStdExtSsdbltrp def FeatureStdExtSmepmp : 
RISCVExtension<1, 0, "Enhanced Physical Memory Protection">; +def FeatureStdExtSmpmpmt + : RISCVExperimentalExtension<0, 6, "PMP-based Memory Types Extension">; + def FeatureStdExtSmrnmi : RISCVExtension<1, 0, "Resumable Non-Maskable Interrupts">; def HasStdExtSmrnmi : Predicate<"Subtarget->hasStdExtSmrnmi()">, @@ -1851,6 +1854,16 @@ def TuneShortForwardBranchOpt def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneShortForwardBranchIMinMax + : SubtargetFeature<"short-forward-branch-i-minmax", "HasShortForwardBranchIMinMax", + "true", "Enable short forward branch optimization for min,max instructions in Zbb", + [TuneShortForwardBranchOpt]>; + +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b37b7405a660f..f7fc9528920a6 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -291,12 +291,12 @@ static void emitSiFiveCLICPreemptibleSaves(MachineFunction &MF, // which affects other passes. TII->storeRegToStackSlot(MBB, MBBI, RISCV::X8, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); TII->storeRegToStackSlot(MBB, MBBI, RISCV::X9, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); // Put `mcause` into X8 (s0), and `mepc` into X9 (s1). If either of these are // used in the function, then they will appear in `getUnmanagedCSI` and will @@ -357,14 +357,12 @@ static void emitSiFiveCLICPreemptibleRestores(MachineFunction &MF, // X8 and X9 need to be restored to their values on function entry, which we // saved onto the stack in `emitSiFiveCLICPreemptibleSaves`. - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X9, - RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X8, - RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X9, RVFI->getInterruptCSRFrameIndex(1), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X8, RVFI->getInterruptCSRFrameIndex(0), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); } // Get the ID of the libcall used for spilling and restoring callee saved @@ -789,6 +787,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, // Unroll the probe loop depending on the number of iterations. 
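// Editorial example (values assumed for illustration): with
// RealStackSize = 2064, Offset = 2048 and ProbeSize = 1024, the CFAAdjust
// introduced below is 2064 - 2048 = 16, so the two probe iterations emit
// def_cfa_offset 1040 and 2064 rather than 1024 and 2048 -- the CFA keeps
// accounting for the 16 bytes already allocated before the probe loop.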
if (Offset < ProbeSize * 5) { + uint64_t CFAAdjust = RealStackSize - Offset; + uint64_t CurrentOffset = 0; while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, @@ -802,7 +802,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, CurrentOffset += ProbeSize; if (EmitCFI) - CFIBuilder.buildDefCFAOffset(CurrentOffset); + CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust); } uint64_t Residual = Offset - CurrentOffset; @@ -810,7 +810,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), Flag, getStackAlign()); if (EmitCFI) - CFIBuilder.buildDefCFAOffset(Offset); + CFIBuilder.buildDefCFAOffset(RealStackSize); if (DynAllocation) { // s[d|w] zero, 0(sp) @@ -2175,7 +2175,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), - CS.getFrameIdx(), RC, TRI, Register(), + CS.getFrameIdx(), RC, Register(), MachineInstr::FrameSetup); } }; @@ -2265,8 +2265,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : CSInfo) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameDestroy); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register(), + MachineInstr::FrameDestroy); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..1cbedb7d141e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -991,6 +991,18 @@ static unsigned getSegInstNF(unsigned Intrinsic) { } } +static bool isApplicableToPLI(int Val) { + // Check if the immediate is packed i8 or i10 + int16_t Bit31To16 = Val >> 16; + int16_t Bit15To0 = Val; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Val; + if (Bit31To16 != Bit15To0) + return false; + + return isInt<10>(Bit31To16) || Bit15To8 == Bit7To0; +} + void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom 
node, we have already selected. if (Node->isMachineOpcode()) { @@ -1034,6 +1046,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) Imm = SignExtend64<32>(Imm); + if (Subtarget->enablePExtCodeGen() && isApplicableToPLI(Imm) && + hasAllWUsers(Node)) { + // If it's 4 packed 8-bit integers or 2 packed signed 16-bit integers, we + // can simply copy the lower 32 bits into the upper 32 bits so the value + // can be rematerialized as PLI_B or PLI_H + Imm = ((uint64_t)Imm << 32) | (Imm & 0xFFFFFFFF); + } + ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget).getNode()); return; } @@ -2223,8 +2243,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2477,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, @@ -2654,6 +2674,21 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + if (Subtarget->enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16)) || + (SrcVT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i16 || + SrcVT == MVT::v2i32)) || + (SrcVT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } break; } case ISD::INSERT_SUBVECTOR: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 56881f71934c4..5a081d54d0726 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -87,6 +87,11 @@ static cl::opt<bool> "be combined with a shift"), cl::init(true)); +// TODO: Support more ops +static const unsigned ZvfbfaVPOps[] = {ISD::VP_FNEG, ISD::VP_FABS, + ISD::VP_FCOPYSIGN}; +static const unsigned ZvfbfaOps[] = {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN}; + RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -279,6 +284,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass); } + // Fixed vectors are stored in GPRs for P extension packed operations + if (Subtarget.enablePExtCodeGen()) { + if (Subtarget.is64Bit()) { + addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass); + }
else { + addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); + } + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); @@ -487,6 +504,34 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; + if (Subtarget.enablePExtCodeGen()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + SmallVector<MVT, 2> VTs; + if (Subtarget.is64Bit()) { + VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setOperationAction(ISD::LOAD, MVT::v2i16, Custom); + setOperationAction(ISD::LOAD, MVT::v4i8, Custom); + } else { + VTs.append({MVT::v2i16, MVT::v4i8}); + } + setOperationAction(ISD::UADDSAT, VTs, Legal); + setOperationAction(ISD::SADDSAT, VTs, Legal); + setOperationAction(ISD::USUBSAT, VTs, Legal); + setOperationAction(ISD::SSUBSAT, VTs, Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); + setOperationAction(ISD::BUILD_VECTOR, VTs, Custom); + setOperationAction(ISD::BITCAST, VTs, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); + } + if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); @@ -1208,6 +1253,61 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } }; + // Sets common actions for zvfbfa; some instructions are supported + // natively, so we don't need to promote them.
+ const auto SetZvfbfaActions = [&](MVT VT) { + setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, + Custom); + setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); + setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom); + setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, + Custom); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, + ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, + ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, + ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE, + ISD::VECTOR_COMPRESS}, + VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); + + setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ZvfbfaVPOps, VT, Custom); + + MVT EltVT = VT.getVectorElementType(); + if (isTypeLegal(EltVT)) + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT, + ISD::EXTRACT_VECTOR_ELT}, + VT, Custom); + else + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, + EltVT, Custom); + setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, + ISD::MGATHER, ISD::MSCATTER, ISD::VP_LOAD, + ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, + ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, + ISD::VP_SCATTER}, + VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); + + // Expand FP operations that need libcalls. + setOperationAction(FloatingPointLibCallOps, VT, Expand); + + // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. + if (getLMUL(VT) == RISCVVType::LMUL_8) { + setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); + setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); + } else { + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + setOperationPromotedToType(ZvfhminZvfbfminPromoteVPOps, VT, F32VecVT); + } + }; + if (Subtarget.hasVInstructionsF16()) { for (MVT VT : F16VecVTs) { if (!isTypeLegal(VT)) @@ -1222,7 +1322,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } - if (Subtarget.hasVInstructionsBF16Minimal()) { + if (Subtarget.hasVInstructionsBF16()) { + for (MVT VT : BF16VecVTs) { + if (!isTypeLegal(VT)) + continue; + SetZvfbfaActions(VT); + } + } else if (Subtarget.hasVInstructionsBF16Minimal()) { for (MVT VT : BF16VecVTs) { if (!isTypeLegal(VT)) continue; @@ -1501,6 +1607,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // available. 
setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); } + if (Subtarget.hasStdExtZvfbfa()) { + setOperationAction(ZvfbfaOps, VT, Custom); + setOperationAction(ZvfbfaVPOps, VT, Custom); + } setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); @@ -1706,6 +1816,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false); } +TargetLoweringBase::LegalizeTypeAction +RISCVTargetLowering::getPreferredVectorAction(MVT VT) const { + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) + if (VT == MVT::v2i16 || VT == MVT::v4i8) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { @@ -4321,6 +4440,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT XLenVT = Subtarget.getXLenVT(); SDLoc DL(Op); + // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants + if (Subtarget.enablePExtCodeGen()) { + bool IsPExtVector = + (VT == MVT::v2i16 || VT == MVT::v4i8) || + (Subtarget.is64Bit() && + (VT == MVT::v4i16 || VT == MVT::v8i8 || VT == MVT::v2i32)); + if (IsPExtVector) { + if (SDValue SplatValue = cast<BuildVectorSDNode>(Op)->getSplatValue()) { + if (auto *C = dyn_cast<ConstantSDNode>(SplatValue)) { + int64_t SplatImm = C->getSExtValue(); + bool IsValidImm = false; + + // Check immediate range based on vector type + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + // PLI_B uses an 8-bit signed or unsigned immediate + IsValidImm = isUInt<8>(SplatImm) || isInt<8>(SplatImm); + if (isUInt<8>(SplatImm)) + SplatImm = (int8_t)SplatImm; + } else { + // PLI_H and PLI_W use 10-bit signed immediate + IsValidImm = isInt<10>(SplatImm); + } + + if (IsValidImm) { + SDValue Imm = DAG.getSignedTargetConstant(SplatImm, DL, XLenVT); + return DAG.getNode(RISCVISD::PLI, DL, VT, Imm); + } + } + } + } + } // Proper support for f16 requires Zvfh. bf16 always requires special // handling. We need to cast the scalar to integer and create an integer @@ -7245,7 +7395,11 @@ static bool isPromotedOpNeedingSplit(SDValue Op, return (Op.getValueType() == MVT::nxv32f16 && (Subtarget.hasVInstructionsF16Minimal() && !Subtarget.hasVInstructionsF16())) || - Op.getValueType() == MVT::nxv32bf16; + (Op.getValueType() == MVT::nxv32bf16 && + Subtarget.hasVInstructionsBF16Minimal() && + (!Subtarget.hasVInstructionsBF16() || + (!llvm::is_contained(ZvfbfaOps, Op.getOpcode()) && + !llvm::is_contained(ZvfbfaVPOps, Op.getOpcode())))); } static SDValue SplitVectorOp(SDValue Op, SelectionDAG &DAG) { @@ -7472,6 +7626,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } + if (Subtarget.enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) || + (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 || + Op0VT == MVT::v2i32)) || + (Op0VT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) + return Op; + } + // Consider other scalar<->scalar casts as legal if the types are legal. // Otherwise expand them.
if (!VT.isVector() && !Op0VT.isVector()) { @@ -8144,6 +8311,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, auto *Store = cast<StoreSDNode>(Op); SDValue StoredVal = Store->getValue(); EVT VT = StoredVal.getValueType(); + if (Subtarget.enablePExtCodeGen()) { + if (VT == MVT::v2i16 || VT == MVT::v4i8) { + SDLoc DL(Op); + SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal); + SDValue NewStore = + DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(), + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return NewStore; + } + } if (VT == MVT::f64) { assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -9186,7 +9364,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -10426,6 +10604,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(RISCVISD::FMV_H_X, DL, EltVT, IntExtract); } + if (Subtarget.enablePExtCodeGen() && VecVT.isFixedLengthVector()) { + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + SDValue Extracted = DAG.getBitcast(XLenVT, Vec); + unsigned ElemWidth = EltVT.getSizeInBits(); + SDValue Shamt = DAG.getNode(ISD::MUL, DL, XLenVT, Idx, + DAG.getConstant(ElemWidth, DL, XLenVT)); + return DAG.getNode(ISD::SRL, DL, XLenVT, Extracted, Shamt); + } + // If this is a fixed vector, we need to convert it to a scalable vector. MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { @@ -14568,6 +14757,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) { + SDLoc DL(N); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), MVT::i32, Ld->getMemOperand()); + if (N->getValueType(0) == MVT::v2i16) { + Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } else if (N->getValueType(0) == MVT::v4i8) { + Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } + return; + } + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -14923,6 +15127,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } + case RISCVISD::PASUB: + case RISCVISD::PASUBU: { + MVT VT = N->getSimpleValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(VT == MVT::v2i16 || VT == MVT::v4i8); + MVT NewVT = MVT::v4i16; + if (VT == MVT::v4i8) + NewVT = MVT::v8i8; + SDValue Undef = DAG.getUNDEF(VT); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op0, Undef}); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op1, Undef}); + Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1})); + return; + } case ISD::EXTRACT_VECTOR_ELT: { // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN<SEW, as the SEW element // type is illegal (currently only vXi64 RV32). @@ -15463,7 +15682,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ?
N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15720,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16030,11 +16249,84 @@ static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, VT, Min); } +// Handle P extension averaging subtraction pattern: +// (vXiY (trunc (srl (sub ([s|z]ext vXiY:$a), ([s|z]ext vXiY:$b)), 1))) +// -> PASUB/PASUBU +static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (N0.getOpcode() != ISD::SRL) + return SDValue(); + + MVT VecVT = VT.getSimpleVT(); + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + + // Check if the shift amount is a splat of 1 + SDValue ShAmt = N0.getOperand(1); + auto *BV = dyn_cast<BuildVectorSDNode>(ShAmt.getNode()); + if (!BV) + return SDValue(); + SDValue Splat = BV->getSplatValue(); + if (!Splat) + return SDValue(); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat); + if (!C) + return SDValue(); + if (C->getZExtValue() != 1) + return SDValue(); + + // Check for SUB operation + SDValue Sub = N0.getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue LHS = Sub.getOperand(0); + SDValue RHS = Sub.getOperand(1); + + // Check if both operands are sign/zero extends from the target type + bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND && + RHS.getOpcode() == ISD::SIGN_EXTEND; + bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSignExt && !IsZeroExt) + return SDValue(); + + SDValue A = LHS.getOperand(0); + SDValue B = RHS.getOperand(0); + + // Check if the extends are from our target vector type + if (A.getValueType() != VT || B.getValueType() != VT) + return SDValue(); + + // The signedness of the extends selects between PASUB and PASUBU; the + // mixed case was already rejected above. + unsigned Opc = IsSignExt ? RISCVISD::PASUB : RISCVISD::PASUBU; + + // Create the PASUB/PASUBU node directly + return DAG.getNode(Opc, SDLoc(N), VT, {A, B}); +} + static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (VT.isFixedLengthVector() && Subtarget.enablePExtCodeGen()) + return combinePExtTruncate(N, DAG, Subtarget); + // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero // extending X. This is safe since we only need the LSB after the shift and // shift amounts larger than 31 would produce poison.
If we wait until @@ -16117,6 +16409,46 @@ static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); } +// (and (i1) f, (setcc c, 0, ne)) -> (czero.nez f, c) +// (and (i1) f, (setcc c, 0, eq)) -> (czero.eqz f, c) +// (and (setcc c, 0, ne), (i1) g) -> (czero.nez g, c) +// (and (setcc c, 0, eq), (i1) g) -> (czero.eqz g, c) +static SDValue combineANDOfSETCCToCZERO(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!Subtarget.hasCZEROLike()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + auto IsEqualCompZero = [](SDValue &V) -> bool { + if (V.getOpcode() == ISD::SETCC && isNullConstant(V.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(V.getOperand(2))->get(); + if (ISD::isIntEqualitySetCC(CC)) + return true; + } + return false; + }; + + if (!IsEqualCompZero(N0) || !N0.hasOneUse()) + std::swap(N0, N1); + if (!IsEqualCompZero(N0) || !N0.hasOneUse()) + return SDValue(); + + KnownBits Known = DAG.computeKnownBits(N1); + if (Known.getMaxValue().ugt(1)) + return SDValue(); + + unsigned CzeroOpcode = + (cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETNE) + ? RISCVISD::CZERO_EQZ + : RISCVISD::CZERO_NEZ; + + EVT VT = N->getValueType(0); + SDLoc DL(N); + return DAG.getNode(CzeroOpcode, DL, VT, N1, N0.getOperand(0)); +} + static SDValue reduceANDOfAtomicLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -16180,7 +16512,9 @@ static SDValue performANDCombine(SDNode *N, if (SDValue V = reverseZExtICmpCombine(N, DAG, Subtarget)) return V; - + if (DCI.isAfterLegalizeDAG()) + if (SDValue V = combineANDOfSETCCToCZERO(N, DAG, Subtarget)) + return V; if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) @@ -16495,6 +16829,76 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY, bool AddX, unsigned Shift) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + // Put the shift first if we can fold a zext into the shift forming a slli.uw. + using namespace SDPatternMatch; + if (Shift != 0 && + sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); + Shift = 0; + } + SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + if (ShX != 0) + ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, ShlAdd, + DAG.getTargetConstant(ShX, DL, VT), AddX ? X : ShlAdd); + if (Shift == 0) + return ShlAdd; + // Otherwise, put the shl last so that it can fold with following instructions + // (e.g. sext or add). 
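// Editorial example (illustrative): MulAmt = 200 = (5 * 5) << 3 reaches this
// point as ShX = 2, ShY = 2, AddX = false, Shift = 3, and lowers to
//   sh2add t0, x, x      // t0 = 5x
//   sh2add t1, t0, t0    // t1 = 25x
//   slli   r,  t1, 3     // r  = 200x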
+ return DAG.getNode(ISD::SHL, DL, VT, ShlAdd, DAG.getConstant(Shift, DL, VT)); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt, unsigned Shift) { + switch (MulAmt) { + // 3/5/9 -> (shYadd X, X) + case 3: + return getShlAddShlAdd(N, DAG, 0, 1, /*AddX=*/false, Shift); + case 5: + return getShlAddShlAdd(N, DAG, 0, 2, /*AddX=*/false, Shift); + case 9: + return getShlAddShlAdd(N, DAG, 0, 3, /*AddX=*/false, Shift); + // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X)) + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false, Shift); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false, Shift); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false, Shift); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false, Shift); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false, Shift); + default: + break; + } + + int ShX; + if (int ShY = isShifted359(MulAmt - 1, ShX)) { + assert(ShX != 0 && "MulAmt=4,6,10 handled before"); + // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X) + if (ShX <= 3) + return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true, Shift); + // 2^N * 3/5/9 + 1 -> (add (shYadd (shl X, N), (shl X, N)), X) + if (Shift == 0) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShX, DL, VT)); + SDValue ShlAdd = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getTargetConstant(ShY, DL, VT), Shl); + return DAG.getNode(ISD::ADD, DL, VT, ShlAdd, X); + } + } + return SDValue(); +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,99 +16928,34 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { - int Shift; - if (int ShXAmount = isShifted359(MulAmt, Shift)) { - // 3/5/9 * 2^N -> shl (shXadd X, X), N - SDLoc DL(N); - SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming - // a slli.uw. - if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && - X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { - SDValue Shl = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); - } - // Otherwise, put the shl second so that it can fold with following - // instructions (e.g. sext or add). 
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); - return DAG.getNode(ISD::SHL, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT)); - } - - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + // 3/5/9 * 2^N -> (shl (shXadd X, X), N) + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + // (2/4/8 * 3/5/9 + 1) * 2^N + unsigned Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift, Shift)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's // easy. Then count how many zeros are up to the first bit. - if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { - unsigned ScaleShift = llvm::countr_zero(MulAmt); - if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); - SDLoc DL(N); - SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); - } + SDValue X = N->getOperand(0); + if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(Shift, DL, VT), Shift1); } - // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) - // This is the two instruction form, there are also three instruction - // variants we could implement. e.g. 
- // (2^(1,2,3) * 3,5,9 + 1) << C2 - // 2^(C1>3) * 3,5,9 +/- 1 - if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { - assert(Shift != 0 && "MulAmt=4,6,10 handled before"); - if (Shift <= 3) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); - } - } + // TODO: 2^(C1>3) * 3/5/9 - 1 // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { @@ -16626,9 +16965,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,29 +16983,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } - } } if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt)) @@ -17887,6 +18208,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, SmallVector<SDNode *> Worklist; SmallPtrSet<SDNode *, 8> Inserted; + SmallPtrSet<SDNode *, 8> ExtensionsToRemove; Worklist.push_back(N); Inserted.insert(N); SmallVector<CombineResult> CombinesToApply; @@ -17896,22 +18218,25 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); - auto AppendUsersIfNeeded = [&Worklist, &Subtarget, - &Inserted](const NodeExtensionHelper &Op) { - if (Op.needToPromoteOtherUsers()) { - for (SDUse &Use : Op.OrigOperand->uses()) { - SDNode *TheUser = Use.getUser(); - if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) - return false; - // We only support the first 2 operands of FMA. - if (Use.getOperandNo() >= 2) - return false; - if (Inserted.insert(TheUser).second) - Worklist.push_back(TheUser); - } - } - return true; - }; + auto AppendUsersIfNeeded = + [&Worklist, &Subtarget, &Inserted, + &ExtensionsToRemove](const NodeExtensionHelper &Op) { + if (Op.needToPromoteOtherUsers()) { + // Remember that we're supposed to remove this extension. + ExtensionsToRemove.insert(Op.OrigOperand.getNode()); + for (SDUse &Use : Op.OrigOperand->uses()) { + SDNode *TheUser = Use.getUser(); + if (!NodeExtensionHelper::isSupportedRoot(TheUser, Subtarget)) + return false; + // We only support the first 2 operands of FMA. 
+ if (Use.getOperandNo() >= 2) + return false; + if (Inserted.insert(TheUser).second) + Worklist.push_back(TheUser); + } + } + return true; + }; // Control the compile time by limiting the number of node we look at in // total. @@ -17932,6 +18257,15 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, std::optional<CombineResult> Res = FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); if (Res) { + // If this strategy wouldn't remove an extension we're supposed to + // remove, reject it. + if (!Res->LHSExt.has_value() && + ExtensionsToRemove.contains(LHS.OrigOperand.getNode())) + continue; + if (!Res->RHSExt.has_value() && + ExtensionsToRemove.contains(RHS.OrigOperand.getNode())) + continue; + Matched = true; CombinesToApply.push_back(*Res); // All the inputs that are extended need to be folded, otherwise @@ -19794,7 +20128,9 @@ legalizeScatterGatherIndexType(SDLoc DL, SDValue &Index, // LLVM's legalization take care of the splitting. // FIXME: LLVM can't split VP_GATHER or VP_SCATTER yet. Index = DAG.getNode(ISD::SIGN_EXTEND, DL, - IndexVT.changeVectorElementType(XLenVT), Index); + EVT::getVectorVT(*DAG.getContext(), XLenVT, + IndexVT.getVectorElementCount()), + Index); } IndexType = ISD::UNSIGNED_SCALED; return true; @@ -22096,8 +22432,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register LoReg = MI.getOperand(0).getReg(); Register HiReg = MI.getOperand(1).getReg(); Register SrcReg = MI.getOperand(2).getReg(); @@ -22106,7 +22441,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF); TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC, - RI, Register()); + Register()); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMOLo = MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8)); @@ -22132,8 +22467,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register DstReg = MI.getOperand(0).getReg(); Register LoReg = MI.getOperand(1).getReg(); Register HiReg = MI.getOperand(2).getReg(); @@ -22156,7 +22490,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, .addFrameIndex(FI) .addImm(4) .addMemOperand(MMOHi); - TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI, Register()); + TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, Register()); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -23944,7 +24278,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, .Case("{t0}", RISCV::X5) .Case("{t1}", RISCV::X6) .Case("{t2}", RISCV::X7) - .Cases("{s0}", "{fp}", RISCV::X8) + .Cases({"{s0}", "{fp}"}, RISCV::X8) .Case("{s1}", RISCV::X9) .Case("{a0}", RISCV::X10) .Case("{a1}", RISCV::X11) @@ -23981,38 +24315,38 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // use the ABI names in register constraint lists. 
if (Subtarget.hasStdExtF()) { unsigned FReg = StringSwitch<unsigned>(Constraint.lower()) - .Cases("{f0}", "{ft0}", RISCV::F0_F) - .Cases("{f1}", "{ft1}", RISCV::F1_F) - .Cases("{f2}", "{ft2}", RISCV::F2_F) - .Cases("{f3}", "{ft3}", RISCV::F3_F) - .Cases("{f4}", "{ft4}", RISCV::F4_F) - .Cases("{f5}", "{ft5}", RISCV::F5_F) - .Cases("{f6}", "{ft6}", RISCV::F6_F) - .Cases("{f7}", "{ft7}", RISCV::F7_F) - .Cases("{f8}", "{fs0}", RISCV::F8_F) - .Cases("{f9}", "{fs1}", RISCV::F9_F) - .Cases("{f10}", "{fa0}", RISCV::F10_F) - .Cases("{f11}", "{fa1}", RISCV::F11_F) - .Cases("{f12}", "{fa2}", RISCV::F12_F) - .Cases("{f13}", "{fa3}", RISCV::F13_F) - .Cases("{f14}", "{fa4}", RISCV::F14_F) - .Cases("{f15}", "{fa5}", RISCV::F15_F) - .Cases("{f16}", "{fa6}", RISCV::F16_F) - .Cases("{f17}", "{fa7}", RISCV::F17_F) - .Cases("{f18}", "{fs2}", RISCV::F18_F) - .Cases("{f19}", "{fs3}", RISCV::F19_F) - .Cases("{f20}", "{fs4}", RISCV::F20_F) - .Cases("{f21}", "{fs5}", RISCV::F21_F) - .Cases("{f22}", "{fs6}", RISCV::F22_F) - .Cases("{f23}", "{fs7}", RISCV::F23_F) - .Cases("{f24}", "{fs8}", RISCV::F24_F) - .Cases("{f25}", "{fs9}", RISCV::F25_F) - .Cases("{f26}", "{fs10}", RISCV::F26_F) - .Cases("{f27}", "{fs11}", RISCV::F27_F) - .Cases("{f28}", "{ft8}", RISCV::F28_F) - .Cases("{f29}", "{ft9}", RISCV::F29_F) - .Cases("{f30}", "{ft10}", RISCV::F30_F) - .Cases("{f31}", "{ft11}", RISCV::F31_F) + .Cases({"{f0}", "{ft0}"}, RISCV::F0_F) + .Cases({"{f1}", "{ft1}"}, RISCV::F1_F) + .Cases({"{f2}", "{ft2}"}, RISCV::F2_F) + .Cases({"{f3}", "{ft3}"}, RISCV::F3_F) + .Cases({"{f4}", "{ft4}"}, RISCV::F4_F) + .Cases({"{f5}", "{ft5}"}, RISCV::F5_F) + .Cases({"{f6}", "{ft6}"}, RISCV::F6_F) + .Cases({"{f7}", "{ft7}"}, RISCV::F7_F) + .Cases({"{f8}", "{fs0}"}, RISCV::F8_F) + .Cases({"{f9}", "{fs1}"}, RISCV::F9_F) + .Cases({"{f10}", "{fa0}"}, RISCV::F10_F) + .Cases({"{f11}", "{fa1}"}, RISCV::F11_F) + .Cases({"{f12}", "{fa2}"}, RISCV::F12_F) + .Cases({"{f13}", "{fa3}"}, RISCV::F13_F) + .Cases({"{f14}", "{fa4}"}, RISCV::F14_F) + .Cases({"{f15}", "{fa5}"}, RISCV::F15_F) + .Cases({"{f16}", "{fa6}"}, RISCV::F16_F) + .Cases({"{f17}", "{fa7}"}, RISCV::F17_F) + .Cases({"{f18}", "{fs2}"}, RISCV::F18_F) + .Cases({"{f19}", "{fs3}"}, RISCV::F19_F) + .Cases({"{f20}", "{fs4}"}, RISCV::F20_F) + .Cases({"{f21}", "{fs5}"}, RISCV::F21_F) + .Cases({"{f22}", "{fs6}"}, RISCV::F22_F) + .Cases({"{f23}", "{fs7}"}, RISCV::F23_F) + .Cases({"{f24}", "{fs8}"}, RISCV::F24_F) + .Cases({"{f25}", "{fs9}"}, RISCV::F25_F) + .Cases({"{f26}", "{fs10}"}, RISCV::F26_F) + .Cases({"{f27}", "{fs11}"}, RISCV::F27_F) + .Cases({"{f28}", "{ft8}"}, RISCV::F28_F) + .Cases({"{f29}", "{ft9}"}, RISCV::F29_F) + .Cases({"{f30}", "{ft10}"}, RISCV::F30_F) + .Cases({"{f31}", "{ft11}"}, RISCV::F31_F) .Default(RISCV::NoRegister); if (FReg != RISCV::NoRegister) { assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg"); @@ -25318,3 +25652,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const { } return {}; } + +bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getSizeInBits() <= Subtarget.getXLen(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9e3e2a9443625..5cc427c867cfd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -71,6 +71,9 @@ class RISCVTargetLowering : public TargetLowering { bool preferScalarizeSplat(SDNode *N) 
const override; + /// Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } /// Return the register type for a given MVT, ensuring vectors are treated @@ -465,6 +468,8 @@ class RISCVTargetLowering : public TargetLowering { ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. Factor is the spread amount, and Index is the /// offset applied. diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 636e31c47ddba..bf9de0a4b5604 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (!TII->isAddImmediate(*DeadMI, Reg)) continue; LIS->RemoveMachineInstrFromMaps(*DeadMI); + Register AddReg = DeadMI->getOperand(1).getReg(); DeadMI->eraseFromParent(); + if (AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } } @@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { // Loop over the dead AVL values, and delete them now. This has // to be outside the above loop to avoid invalidating iterators. for (auto *MI : ToDelete) { + assert(MI->getOpcode() == RISCV::ADDI); + Register AddReg = MI->getOperand(1).getReg(); if (LIS) { LIS->removeInterval(MI->getOperand(0).getReg()); LIS->RemoveMachineInstrFromMaps(*MI); } MI->eraseFromParent(); + if (LIS && AddReg.isVirtual()) + LIS->shrinkToUses(&LIS->getInterval(AddReg)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp index a1c8e23793b92..c58a5c07a34f7 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp @@ -48,7 +48,7 @@ class VXRMInfo { } State = Uninitialized; public: - VXRMInfo() {} + VXRMInfo() = default; static VXRMInfo getUnknown() { VXRMInfo Info; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 912b82d294f44..fb914e97e2229 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" @@ -81,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" @@ -637,7 +639,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); @@ -645,8 +646,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode; if 
(RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::SW : RISCV::SD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW + : RISCV::SD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -703,7 +704,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegSpilled += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegSpilled += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, @@ -718,10 +719,12 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void RISCVInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DstReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = @@ -729,8 +732,8 @@ void RISCVInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::LW : RISCV::LD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW + : RISCV::LD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -786,7 +789,7 @@ void RISCVInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegReloaded += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegReloaded += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, @@ -869,7 +872,7 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction &MF, MachineInstr &MI, } } -// This is the version used during inline spilling +// This is the version used during InlineSpiller::spillAroundUses MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, @@ -1377,14 +1380,14 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, report_fatal_error("underestimated function size"); storeRegToStackSlot(MBB, MI, TmpGPR, /*IsKill=*/true, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(MI.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); MI.getOperand(1).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), TmpGPR, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -1699,6 +1702,14 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::AND: return RISCV::PseudoCCAND; case RISCV::OR: return RISCV::PseudoCCOR; case RISCV::XOR: return RISCV::PseudoCCXOR; + case RISCV::MAX: return RISCV::PseudoCCMAX; + case 
RISCV::MAXU: return RISCV::PseudoCCMAXU; + case RISCV::MIN: return RISCV::PseudoCCMIN; + case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; + case RISCV::LUI: return RISCV::PseudoCCLUI; + case RISCV::QC_LI: return RISCV::PseudoCCQC_LI; + case RISCV::QC_E_LI: return RISCV::PseudoCCQC_E_LI; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1735,7 +1746,8 @@ unsigned getPredicatedOpcode(unsigned Opcode) { /// return the defining instruction. static MachineInstr *canFoldAsPredicatedOp(Register Reg, const MachineRegisterInfo &MRI, - const TargetInstrInfo *TII) { + const TargetInstrInfo *TII, + const RISCVSubtarget &STI) { if (!Reg.isVirtual()) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) @@ -1743,6 +1755,15 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) return nullptr; + + if (!STI.hasShortForwardBranchIMinMax() && + (MI->getOpcode() == RISCV::MAX || MI->getOpcode() == RISCV::MIN || + MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) + return nullptr; + + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; @@ -1806,10 +1827,10 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI, MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = - canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this); + canFoldAsPredicatedOp(MI.getOperand(5).getReg(), MRI, this, STI); bool Invert = !DefMI; if (!DefMI) - DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this); + DefMI = canFoldAsPredicatedOp(MI.getOperand(4).getReg(), MRI, this, STI); if (!DefMI) return nullptr; @@ -2897,6 +2918,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM9_LSB000: Ok = isShiftedUInt<6, 3>(Imm); break; + case RISCVOp::OPERAND_SIMM8_UNSIGNED: + Ok = isInt<8>(Imm); + break; case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0); break; @@ -2918,6 +2942,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, // clang-format off CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) + CASE_OPERAND_SIMM(10) CASE_OPERAND_SIMM(11) CASE_OPERAND_SIMM(12) CASE_OPERAND_SIMM(26) @@ -3511,6 +3536,27 @@ RISCVInstrInfo::getOutliningCandidateInfo( Candidate.getMF()->getSubtarget<RISCVSubtarget>().hasStdExtZca() ? 2 : 4; unsigned CallOverhead = 0, FrameOverhead = 0; + // Count the number of CFI instructions in the candidate, if present. + unsigned CFICount = 0; + for (auto &I : Candidate) { + if (I.isCFIInstruction()) + CFICount++; + } + + // Ensure CFI coverage matches by comparing the number of CFIs in the + // candidate with the total number of CFIs in the parent function, for each + // candidate. Outlining only a subset of a function's CFIs would split the + // unwind state across two code regions and lead to incorrect address + // offsets between the outlined body and the remaining code. To preserve + // correct unwind info, we only outline when all CFIs in the function can be + // outlined together.
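The loop just below enforces this rule; a standalone sketch of the invariant it checks (the helper name is illustrative, not an LLVM API):

#include "llvm/CodeGen/MachineFunction.h"

// A candidate containing CFI directives is only outlinable when it covers
// every CFI directive recorded for its parent function; partial coverage
// would split the unwind state between the outlined body and the rest.
static bool cfiCoverageIsOutlinable(unsigned CFIsInCandidate,
                                    const llvm::MachineFunction &MF) {
  return CFIsInCandidate == 0 ||
         CFIsInCandidate == MF.getFrameInstructions().size();
}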
+ for (outliner::Candidate &C : RepeatedSequenceLocs) { + std::vector<MCCFIInstruction> CFIInstructions = + C.getMF()->getFrameInstructions(); + + if (CFICount > 0 && CFICount != CFIInstructions.size()) + return std::nullopt; + } + MachineOutlinerConstructionID MOCI = MachineOutlinerDefault; if (Candidate.back().isReturn()) { MOCI = MachineOutlinerTailCall; @@ -3526,6 +3572,11 @@ RISCVInstrInfo::getOutliningCandidateInfo( FrameOverhead = InstrSizeCExt; } + // If we have CFI instructions, we can only outline if the outlined section + // can be a tail call. + if (MOCI != MachineOutlinerTailCall && CFICount > 0) + return std::nullopt; + for (auto &C : RepeatedSequenceLocs) C.setCallInfo(MOCI, CallOverhead); @@ -3547,13 +3598,11 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI, MBB->getParent()->getSubtarget().getRegisterInfo(); const auto &F = MI.getMF()->getFunction(); - // We can manually strip out CFI instructions later. + // We can only outline CFI instructions if we will tail call the outlined + // function, or fix up the CFI offsets. Currently, CFI instructions are + // outlined only when the outlined function is tail called. if (MI.isCFIInstruction()) - // If current function has exception handling code, we can't outline & - // strip these CFI instructions since it may break .eh_frame section - // needed in unwinding. - return F.needsUnwindTableEntry() ? outliner::InstrType::Illegal - : outliner::InstrType::Invisible; + return outliner::InstrType::Legal; if (cannotInsertTailCall(*MBB) && (MI.isReturn() || isMIModifiesReg(MI, TRI, RISCV::X5))) @@ -3580,21 +3629,6 @@ void RISCVInstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { - // Strip out any CFI instructions - bool Changed = true; - while (Changed) { - Changed = false; - auto I = MBB.begin(); - auto E = MBB.end(); - for (; I != E; ++I) { - if (I->isCFIInstruction()) { - I->removeFromParent(); - Changed = true; - break; - } - } - } - if (OF.FrameConstructionID == MachineOutlinerTailCall) return; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..0ffe015b9fac8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, @@ -113,13 +116,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; using TargetInstrInfo::foldMemoryOperandImpl; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7c89686ebfb3c..9cb53fb27a2d2 100644 ---
a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,8 +889,10 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { +let canFoldAsLoad = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; +} def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>; let IsSignExtendingOpW = 1 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index afac37d6337d4..4ffe3e62ac501 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,6 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { +let canFoldAsLoad = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 6571d998246a7..b30f8ec820c15 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,6 +330,7 @@ class PseudoFROUND<DAGOperand Ty, ValueType vt, ValueType intvt = XLenVT> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { +let canFoldAsLoad = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 4cbbba3aa68cb..7637047aabf2d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,7 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmOp<10>; +def simm10 : RISCVSImmOp<10>, TImmLeaf<XLenVT, "return isInt<10>(Imm);">; def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { let RenderMethod = "addSImm8UnsignedOperands"; @@ -26,7 +26,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { // A 8-bit signed immediate allowing range [-128, 255] // but represented as [-128, 127]. 
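A quick numeric illustration of that aliasing (plain C++; note the narrowing conversion is only guaranteed to be modular since C++20, so treat this as a sketch of the intent):

#include <cstdint>

// 255 (0xFF) and -1 share the same 8-bit pattern, so an operand written as
// 255 assembles to the same encoding as -1, and the stored range can stay
// [-128, 127].
static_assert(static_cast<std::int8_t>(255u) == -1,
              "0xFF wraps to -1 in 8 bits");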
-def simm8_unsigned : RISCVOp { +def simm8_unsigned : RISCVOp, TImmLeaf<XLenVT, "return isInt<8>(Imm);"> { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<8>"; @@ -1463,8 +1463,91 @@ let Predicates = [HasStdExtP, IsRV32] in { def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; -let Predicates = [HasStdExtP] in -def : PatGpr<abs, ABS>; +def SDT_RISCVPLI : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisInt<1>]>; +def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>; +def SDT_RISCVPASUB : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>; +def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUB>; -let Predicates = [HasStdExtP, IsRV64] in -def : PatGpr<riscv_absw, ABSW>; +let Predicates = [HasStdExtP] in { + def : PatGpr<abs, ABS>; + + // Basic 8-bit arithmetic patterns + def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>; + + // Basic 16-bit arithmetic patterns + def: Pat<(XLenVecI16VT (add GPR:$rs1, GPR:$rs2)), (PADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit saturating add/sub patterns + def: Pat<(XLenVecI8VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit saturating add/sub patterns + def: Pat<(XLenVecI16VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit averaging patterns + def: Pat<(XLenVecI8VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit averaging patterns + def: Pat<(XLenVecI16VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit absolute difference patterns + def: Pat<(XLenVecI8VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit absolute difference patterns + def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_H GPR:$rs1, GPR:$rs2)>; + + + // 8-bit PLI SD node pattern + def: Pat<(XLenVecI8VT (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + // 16-bit PLI SD node pattern + def: Pat<(XLenVecI16VT (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; + +} // Predicates 
= [HasStdExtP] + +let Predicates = [HasStdExtP, IsRV32] in { + // Load/Store patterns + def : StPat<store, SW, GPR, v4i8>; + def : StPat<store, SW, GPR, v2i16>; + def : LdPat<load, LW, v4i8>; + def : LdPat<load, LW, v2i16>; +} // Predicates = [HasStdExtP, IsRV32] + +let Predicates = [HasStdExtP, IsRV64] in { + def : PatGpr<riscv_absw, ABSW>; + + // 32-bit PLI SD node pattern + def: Pat<(v2i32 (riscv_pli simm10:$imm10)), (PLI_W simm10:$imm10)>; + + // 32-bit averaging-sub patterns + def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; + + // Load/Store patterns + def : StPat<store, SD, GPR, v8i8>; + def : StPat<store, SD, GPR, v4i16>; + def : StPat<store, SD, GPR, v2i32>; + def : LdPat<load, LD, v8i8>; + def : LdPat<load, LD, v4i16>; + def : LdPat<load, LD, v2i32>; +} // Predicates = [HasStdExtP, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 0114fbdc56302..5b1c13493bbf2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -69,6 +69,17 @@ class SFBALU_ri let Constraints = "$dst = $falsev"; } +class SFBLUI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + uimm20_lui:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + class SFBShift_ri : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, @@ -106,12 +117,19 @@ def PseudoCCSRA : SFBALU_rr; def PseudoCCAND : SFBALU_rr; def PseudoCCOR : SFBALU_rr; def PseudoCCXOR : SFBALU_rr; +def PseudoCCMAX : SFBALU_rr; +def PseudoCCMIN : SFBALU_rr; +def PseudoCCMAXU : SFBALU_rr; +def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; def PseudoCCORI : SFBALU_ri; def PseudoCCXORI : SFBALU_ri; +def PseudoCCLUI : SFBLUI; + def PseudoCCSLLI : SFBShift_ri; def PseudoCCSRLI : SFBShift_ri; def PseudoCCSRAI : SFBShift_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaaee9cf4..c2b25c6294019 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe8025ca..8a38fe2f5ae16 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -815,6 +817,28 @@ class QCIRVInst48EJ<bits<2> func2, string opcodestr> let Inst{6-0} = 0b0011111; } +class SFBQC_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + simm20_li:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 8; + let Constraints = "$dst = $falsev"; +} + +class SFBQC_E_LI + : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, + bare_simm32:$imm), []> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Size = 10; + let Constraints = "$dst = $falsev"; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1306,6 +1330,11 @@ def PseudoQC_E_SH : PseudoStore<"qc.e.sh">; def PseudoQC_E_SW : PseudoStore<"qc.e.sw">; } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasShortForwardBranchOpt] in { +def PseudoCCQC_LI : SFBQC_LI; +def PseudoCCQC_E_LI : SFBQC_E_LI; +} + //===----------------------------------------------------------------------===// // Code Gen Patterns //===----------------------------------------------------------------------===// @@ -1419,8 +1448,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index b9c5b75983b1f..ffb2ac0756da4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -701,5 +701,86 @@ let Predicates = [HasStdExtZvfbfa] in { FRM_DYN, fvti.AVL, fvti.Log2SEW, TA_MA)>; } -} + + foreach vti = AllBF16Vectors in { + // 13.12. 
Vector Floating-Point Sign-Injection Instructions + def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), + (!cast<Instruction>("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. + def : Pat<(fneg (vti.Vector vti.RegClass:$rs)), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs, vti.RegClass:$rs, vti.AVL, vti.Log2SEW, TA_MA)>; + + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2))), + (!cast<Instruction>("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs2)))), + (!cast<Instruction>("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (fneg vti.RegClass:$rs2)))), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(vti.Vector (fcopysign (vti.Vector vti.RegClass:$rs1), + (vti.Vector (fneg (SplatFPOp vti.ScalarRegClass:$rs2))))), + (!cast<Instruction>("PseudoVFSGNJN_ALT_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs1, vti.ScalarRegClass:$rs2, vti.AVL, vti.Log2SEW, TA_MA)>; + + // 13.12. Vector Floating-Point Sign-Injection Instructions + def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJX_ALT_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + // Handle fneg with VFSGNJN using the same input for both operands. 
+ def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJ_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (riscv_fneg_vl vti.RegClass:$rs2, + (vti.Mask true_mask), + VLOpFrag), + srcvalue, + (vti.Mask true_mask), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJN_ALT_VV_"# vti.LMul.MX#"_E"#vti.SEW) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, + vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TA_MA)>; + + def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), + (SplatFPOp vti.ScalarRegClass:$rs2), + vti.RegClass:$passthru, + (vti.Mask VMV0:$vm), + VLOpFrag), + (!cast<Instruction>("PseudoVFSGNJ_ALT_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") + vti.RegClass:$passthru, vti.RegClass:$rs1, + vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, + TAIL_AGNOSTIC)>; + } + } } // Predicates = [HasStdExtZvfbfa] diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp new file mode 100644 index 0000000000000..bf1f69f8e8d93 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp @@ -0,0 +1,213 @@ +//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-promote-const" +#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants" + +STATISTIC(NumPromoted, "Number of constant literals promoted to globals"); +STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants"); + +namespace { + +class RISCVPromoteConstant : public ModulePass { +public: + static char ID; + RISCVPromoteConstant() : ModulePass(ID) {} + + StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + } + + /// Iterate over the functions and promote the double fp constants that + /// would otherwise go into the constant pool to a constant array. + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // TargetMachine and Subtarget are needed to query isFPImmLegal. + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + bool Changed = false; + for (Function &F : M) { + const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F); + const RISCVTargetLowering *TLI = ST.getTargetLowering(); + Changed |= runOnFunction(F, TLI); + } + return Changed; + } + +private: + bool runOnFunction(Function &F, const RISCVTargetLowering *TLI); +}; +} // end anonymous namespace + +char RISCVPromoteConstant::ID = 0; + +INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME, + false, false) + +ModulePass *llvm::createRISCVPromoteConstantPass() { + return new RISCVPromoteConstant(); +} + +bool RISCVPromoteConstant::runOnFunction(Function &F, + const RISCVTargetLowering *TLI) { + if (F.hasOptNone() || F.hasOptSize()) + return false; + + // Bail out and make no transformation if the target doesn't support + // doubles, or if we're not targeting RV64 as we currently see some + // regressions for those targets. + if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64)) + return false; + + // Collect all unique double constants and their uses in the function. Use + // MapVector to preserve insertion order. + MapVector<ConstantFP *, SmallVector<Use *, 8>> ConstUsesMap; + + for (Instruction &I : instructions(F)) { + for (Use &U : I.operands()) { + auto *C = dyn_cast<ConstantFP>(U.get()); + if (!C || !C->getType()->isDoubleTy()) + continue; + // Do not promote if it wouldn't be loaded from the constant pool.
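The check that follows implements this gate; a rough standalone sketch using only the generic TargetLowering interface (the helper name is illustrative, not part of the pass):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"

// Only f64 literals the target cannot materialize as an immediate, and which
// would therefore be loaded from the constant pool, are worth promoting.
static bool worthPromoting(const llvm::ConstantFP &C,
                           const llvm::TargetLowering &TLI) {
  return C.getType()->isDoubleTy() &&
         !TLI.isFPImmLegal(C.getValueAPF(), llvm::MVT::f64,
                           /*ForCodeSize=*/false);
}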
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64, + /*ForCodeSize=*/false)) + continue; + // Do not promote a constant if it is used as an immediate argument + // for an intrinsic. + if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) { + Function *IntrinsicFunc = II->getCalledFunction(); + unsigned OperandIdx = U.getOperandNo(); + if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr( + OperandIdx, Attribute::ImmArg)) { + LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II + << " because operand " << OperandIdx + << " must be an immediate.\n"); + continue; + } + } + // Note: FP args to inline asm would be problematic if we had a + // constraint that required an immediate floating point operand. At the + // time of writing LLVM doesn't recognise such a constraint. + ConstUsesMap[C].push_back(&U); + } + } + + int PromotableConstants = ConstUsesMap.size(); + LLVM_DEBUG(dbgs() << "Found " << PromotableConstants + << " promotable constants in " << F.getName() << "\n"); + // Bail out if no promotable constants found, or if only one is found. + if (PromotableConstants < 2) { + LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable " + "constants found\n"); + return false; + } + + NumPromoted += PromotableConstants; + + // Create a global array containing the promoted constants. + Module *M = F.getParent(); + Type *DoubleTy = Type::getDoubleTy(M->getContext()); + + SmallVector<Constant *, 16> ConstantVector; + for (auto const &Pair : ConstUsesMap) + ConstantVector.push_back(Pair.first); + + ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size()); + Constant *GlobalArrayInitializer = + ConstantArray::get(ArrayTy, ConstantVector); + + auto *GlobalArray = new GlobalVariable( + *M, ArrayTy, + /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer, + ".promoted_doubles." + F.getName()); + + // A cache to hold the loaded value for a given constant within a basic block. + DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads; + + // Replace all uses with the loaded value. + unsigned Idx = 0; + for (auto const &Pair : ConstUsesMap) { + ConstantFP *Const = Pair.first; + const SmallVector<Use *, 8> &Uses = Pair.second; + + for (Use *U : Uses) { + Instruction *UserInst = cast<Instruction>(U->getUser()); + BasicBlock *InsertionBB; + + // If the user is a PHI node, we must insert the load in the + // corresponding predecessor basic block. Otherwise, it's inserted into + // the same block as the use. + if (auto *PN = dyn_cast<PHINode>(UserInst)) + InsertionBB = PN->getIncomingBlock(*U); + else + InsertionBB = UserInst->getParent(); + + if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) { + LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means there is no valid " + "insertion point.\n"); + return false; + } + + auto CacheKey = std::make_pair(Const, InsertionBB); + Value *LoadedVal = nullptr; + + // Re-use a load if it exists in the insertion block. + if (LocalLoads.count(CacheKey)) { + LoadedVal = LocalLoads.at(CacheKey); + } else { + // Otherwise, create a new GEP and Load at the correct insertion point. + // It is always safe to insert in the first insertion point in the BB, + // so do that and let other passes reorder.
+ IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt()); + Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64( + GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr"); + LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val"); + + // Cache the newly created load for this block. + LocalLoads[CacheKey] = LoadedVal; + } + + U->set(LoadedVal); + ++NumPromotedUses; + } + ++Idx; + } + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index e9f43b9a71648..84bb29433fb3b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -438,18 +438,19 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, TypeSize VRegSize = OldLoc.getValue().divideCoefficientBy(NumRegs); Register VLENB = 0; - unsigned PreHandledNum = 0; + unsigned VLENBShift = 0; + unsigned PrevHandledNum = 0; unsigned I = 0; while (I != NumRegs) { auto [LMulHandled, RegClass, Opcode] = getSpillReloadInfo(NumRegs - I, RegEncoding, IsSpill); auto [RegNumHandled, _] = RISCVVType::decodeVLMUL(LMulHandled); bool IsLast = I + RegNumHandled == NumRegs; - if (PreHandledNum) { + if (PrevHandledNum) { Register Step; // Optimize for constant VLEN. if (auto VLEN = STI.getRealVLen()) { - int64_t Offset = *VLEN / 8 * PreHandledNum; + int64_t Offset = *VLEN / 8 * PrevHandledNum; Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); STI.getInstrInfo()->movImm(MBB, II, DL, Step, Offset); } else { @@ -457,15 +458,21 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, VLENB = MRI.createVirtualRegister(&RISCV::GPRRegClass); BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VLENB); } - uint32_t ShiftAmount = Log2_32(PreHandledNum); - if (ShiftAmount == 0) - Step = VLENB; - else { - Step = MRI.createVirtualRegister(&RISCV::GPRRegClass); - BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), Step) - .addReg(VLENB, getKillRegState(IsLast)) - .addImm(ShiftAmount); + uint32_t ShiftAmount = Log2_32(PrevHandledNum); + // To avoid using an extra register, we shift the VLENB register and + // remember how much it has been shifted. We can then use relative + // shifts to adjust to the desired shift amount. 
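A standalone sketch of that bookkeeping (the type and method are illustrative, not LLVM API):

// Keep one live VLENB value plus the amount it is currently shifted by; a
// new power-of-two multiple then costs a single relative shift instead of a
// second scratch register holding a fresh VLENB copy.
struct VlenbShiftTracker {
  unsigned CurrentShift = 0;
  // > 0: emit SLLI by the result; < 0: emit SRLI by its negation; 0: reuse.
  int shiftDeltaTo(unsigned WantedShift) {
    int Delta =
        static_cast<int>(WantedShift) - static_cast<int>(CurrentShift);
    CurrentShift = WantedShift;
    return Delta;
  }
};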
+ if (VLENBShift > ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SRLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(VLENBShift - ShiftAmount); + } else if (VLENBShift < ShiftAmount) { + BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VLENB) + .addReg(VLENB, RegState::Kill) + .addImm(ShiftAmount - VLENBShift); } + VLENBShift = ShiftAmount; + Step = VLENB; } BuildMI(MBB, II, DL, TII->get(RISCV::ADD), NewBase) @@ -489,7 +496,7 @@ void RISCVRegisterInfo::lowerSegmentSpillReload(MachineBasicBlock::iterator II, if (IsSpill) MIB.addReg(Reg, RegState::Implicit); - PreHandledNum = RegNumHandled; + PrevHandledNum = RegNumHandled; RegEncoding += RegNumHandled; I += RegNumHandled; } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 6605a5ccdfde2..87095e75d5dc4 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -222,6 +222,12 @@ def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; def XLenPairFVT : ValueTypeByHwMode<[RV32], [f64]>; + +// P extension +def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64], + [v4i8, v8i8]>; +def XLenVecI16VT : ValueTypeByHwMode<[RV32, RV64], + [v2i16, v4i16]>; def XLenRI : RegInfoByHwMode< [RV32, RV64], [RegInfo<32,32,32>, RegInfo<64,64,64>]>; @@ -238,7 +244,9 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList> } class GPRRegisterClass<dag regList> - : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> { + : RISCVRegisterClass<[XLenVT, XLenFVT, + // P extension packed vector types: + XLenVecI8VT, XLenVecI16VT, v2i32], 32, regList> { let RegInfos = XLenRI; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 24ebbc3007cec..4271a6816e05b 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -654,8 +654,17 @@ foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + + // Pattern for vredsum: 5/5/5/7/11/19/35 + // Pattern for vredand, vredor, vredxor: 4/4/4/6/10/18/34 + // They are grouped together, so we use the worst-case vredsum latency. + // TODO: split vredand, vredor, vredxor into separate scheduling classes.
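The GetLMULValue tables used throughout this scheduling model appear to be indexed by LMUL in the order MF8, MF4, MF2, M1, M2, M4, M8 (an assumption here, inferred from the per-LMUL comments); a hypothetical C++ rendering of the lookup:

#include <array>
#include <cstddef>

enum class LMUL : std::size_t { MF8, MF4, MF2, M1, M2, M4, M8 };

// The integer-reduction latency table above: flat for the fractional LMULs,
// then roughly doubling with each whole-register LMUL step.
constexpr std::array<unsigned, 7> VIRedLat = {5, 5, 5, 7, 11, 19, 35};

constexpr unsigned latencyFor(LMUL Mx) {
  return VIRedLat[static_cast<std::size_t>(Mx)];
}

static_assert(latencyFor(LMUL::M8) == 35, "worst case at LMUL = 8");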
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -663,7 +672,11 @@ foreach mx = SchedMxListWRed in { foreach sew = SchedSEWSet<mx, 0, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defvar VIRedLat = GetLMULValue<[5, 5, 5, 7, 11, 19, 35], mx>.c; + defvar VIRedOcc = GetLMULValue<[1, 1, 2, 2, 4, 10, 35], mx>.c; + let Latency = VIRedLat, ReleaseAtCycles = [VIRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -671,9 +684,36 @@ foreach mx = SchedMxListF in { foreach sew = SchedSEWSet<mx, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + // Latency for vfredmax.vs, vfredmin.vs: 12/12/15/21/33/57 + // Latency for vfredusum.vs is slightly lower for e16/e32 + // We use the worst-case + defvar VFRedLat = GetLMULValue<[12, 12, 12, 15, 21, 33, 57], mx>.c; + defvar VFRedOcc = GetLMULValue<[8, 8, 8, 8, 14, 20, 57], mx>.c; + let Latency = VFRedLat, ReleaseAtCycles = [VFRedOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + // Compute latency based on SEW + defvar VFRedOV_FromLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 12, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c, + !eq(sew, 64) : ConstValueUntilLMULThenDouble<"M1", 12, mx>.c + ); + defvar VFRedOV_FromOcc = !cond( + !eq(sew, 16) : GetLMULValue<[8, 8, 20, 24, 48, 96, 384], mx>.c, + !eq(sew, 32) : GetLMULValue<[8, 8, 8, 12, 24, 48, 192], mx>.c, + !eq(sew, 64) : GetLMULValue<[6, 6, 6, 6, 12, 24, 96], mx>.c + ); + let Latency = VFRedOV_FromLat, ReleaseAtCycles = [VFRedOV_FromOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -681,8 +721,18 @@ foreach mx = SchedMxListFWRed in { foreach sew = SchedSEWSet<mx, 1, 1>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defvar VFRedOVLat = !cond( + !eq(sew, 16) : ConstValueUntilLMULThenDouble<"MF4", 16, mx>.c, + !eq(sew, 32) : ConstValueUntilLMULThenDouble<"MF2", 16, mx>.c, + ); + defvar VFRedOVOcc = !cond( + !eq(sew, 16) : GetLMULValue<[11, 11, 27, 32, 64, 128, 512], mx>.c, + !eq(sew, 32) : GetLMULValue<[11, 11, 11, 16, 32, 64, 256], mx>.c, + ); + let Latency = VFRedOVLat, ReleaseAtCycles = [VFRedOVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } } } @@ -700,39 +750,82 @@ foreach mx = SchedMxList in { } // 16. 
Vector Permutation Instructions +// Slide foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16 + defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c; + let Latency = VSlideUpLat, ReleaseAtCycles =[VSlideUpOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + } - defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17 + defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c; + defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c; + let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + } + // The following group slide up and down together, so we use the worst-case + // (slide down) for all. + let Latency = VSlideDownLat, ReleaseAtCycles =[VSlideDownOcc] in { + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + } } -def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; - -def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; -def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; +// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 6, ReleaseAtCycles = [6] in { + def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; +} -// Gather and Compress -foreach mx = SchedMxList in { - foreach sew = SchedSEWSet<mx>.val in { - defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; - } +// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set based on MX for now +// TODO: Split this into separate WriteRes for each MX +let Latency = 4, ReleaseAtCycles = [4] in { + def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; + def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; + def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; } +// Integer LMUL Gather and Compress foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; + let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; + } + + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 
256], mx>.c; + defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c; + let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + // For sew == 8, latency is double that of the other cases, except for the fractional LMULs (const 4 cycles) + defvar VRGatherEI16Lat = !if(!eq(sew, 8), + GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c, + GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c); + defvar VRGatherEI16Occ = !if(!eq(sew, 8), + GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c, + GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c); + let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + + defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c; + defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c; + let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>; + } + } } // Others diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4cedc649..926cc9ea547a6 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", cl::desc("Use 'mips.ccmov' instruction"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePExtCodeGen( + "enable-p-ext-codegen", + cl::desc("Turn on P Extension codegen (this is a temporary switch where " + "only partial codegen is currently supported)"), + cl::init(false), cl::Hidden); + void RISCVSubtarget::anchor() {} RISCVSubtarget & @@ -104,7 +110,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique<RISCVSelectionDAGInfo>(); } @@ -145,6 +151,10 @@ bool RISCVSubtarget::useConstantPoolForLargeInts() const { return !RISCVDisableUsingConstantPoolForLargeInts; } +bool RISCVSubtarget::enablePExtCodeGen() const { + return HasStdExtP && EnablePExtCodeGen; +} + unsigned RISCVSubtarget::getMaxBuildIntsCost() const { // Loading integer from constant pool needs two instructions (the reason why // the minimum cost is 2): an address calculation instruction and a load diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f0d8e76..29df53c6c9893 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,7 +139,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; @@ -322,6 +321,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } }
+ bool enablePExtCodeGen() const; + // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the // vector hardware implementation which may be less than VLEN. unsigned getDLenFactor() const { diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index ae54ff1515121..16ef67da83128 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVExpandAtomicPseudoPass(*PR); initializeRISCVRedundantCopyEliminationPass(*PR); initializeRISCVAsmPrinterPass(*PR); + initializeRISCVPromoteConstantPass(*PR); } static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { @@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() { } bool RISCVPassConfig::addPreISel() { + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createRISCVPromoteConstantPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { // Add a barrier before instruction selection so that we will not get // deleted block address after enabling default outlining. See D99707 for diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 7bc0b5b394828..dca6e9cffebb0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( if (isa<ScalableVectorType>(Ty)) return InstructionCost::getInvalid(); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) { + return 1; // Treat as single instruction cost for now + } + // A build_vector (which is m1 sized or smaller) can be done in no // worse than one vslide1down.vx per element in the type. We could // in theory do an explode_vector in the inverse manner, but our @@ -1625,6 +1632,14 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsVectorType) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && + (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) { + return 1; // Treat as single instruction cost for now + } + // FIXME: Need to compute legalizing cost for illegal types. The current // code handles only legal types and those which can be trivially // promoted to legal. @@ -1683,7 +1698,8 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), SrcLT.second.getSizeInBits()) || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), - DstLT.second.getSizeInBits()) + DstLT.second.getSizeInBits()) || + SrcLT.first > 1 || DstLT.first > 1) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); // The split cost is handled by the base getCastInstrCost @@ -2140,7 +2156,8 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume memory ops cost scale with the number of vector registers // possibly accessed by the instruction.
Note that BasicTTI already // handles the LT.first term for us. - if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize) + if (ST->hasVInstructions() && LT.second.isVector() && + CostKind != TTI::TCK_CodeSize) BaseCost *= TLI->getLMULCost(LT.second); return Cost + BaseCost; } @@ -2321,6 +2338,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, const Value *Op1) const { assert(Val->isVectorTy() && "This must be a vector type"); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) { + return 1; // Treat as single instruction cost for now + } + if (Opcode != Instruction::ExtractElement && Opcode != Instruction::InsertElement) return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index fdf9a4fe32fe6..e1ff243bb1a47 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -455,7 +455,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { True->getOperand(1).setReg(MI.getOperand(2).getReg()); // If True is masked then its passthru needs to be in VRNoV0. MRI->constrainRegClass(True->getOperand(1).getReg(), - TII->getRegClass(True->getDesc(), 1, TRI)); + TII->getRegClass(True->getDesc(), 1)); } MI.setDesc(TII->get(NewOpc)); @@ -675,7 +675,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (Passthru.getReg().isValid()) MRI->constrainRegClass( Passthru.getReg(), - TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI)); + TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo())); } if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index 3e4a58a20f942..0798483462e18 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -21,6 +21,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include <optional> #include <queue> +#include <unordered_set> #define DEBUG_TYPE "spirv-convergence-region-analysis" diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h index ed0a1e10562a8..7f4e1a1791e9e 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h @@ -20,7 +20,6 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include <optional> -#include <unordered_set> namespace llvm { class IntrinsicInst; diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index eab7b213756b3..79b76165cd57a 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -41,6 +41,7 @@ add_llvm_target(SPIRVCodeGen SPIRVPreLegalizerCombiner.cpp SPIRVPostLegalizer.cpp SPIRVPrepareFunctions.cpp + SPIRVPrepareGlobals.cpp SPIRVRegisterBankInfo.cpp SPIRVRegisterInfo.cpp SPIRVRegularizer.cpp diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h index 0c9c3bc51f433..8f2ad48efa9d7 100644 
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h @@ -16,11 +16,12 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCRegister.h" namespace llvm { class SPIRVInstPrinter : public MCInstPrinter { private: - SmallDenseMap<unsigned, SPIRV::InstructionSet::InstructionSet> ExtInstSetIDs; + SmallDenseMap<MCRegister, SPIRV::InstructionSet::InstructionSet> ExtInstSetIDs; void recordOpExtInstImport(const MCInst *MI); public: diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h index f9ba5e2d55cba..d36453a4f078d 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h @@ -15,7 +15,6 @@ #include "llvm/Support/DataTypes.h" #include <cassert> -#include <memory> namespace llvm { class MCAsmBackend; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp index 0a318e0e01e59..ed6d355670cbd 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVTargetStreamer.cpp @@ -15,4 +15,4 @@ using namespace llvm; SPIRVTargetStreamer::SPIRVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} -SPIRVTargetStreamer::~SPIRVTargetStreamer() {} +SPIRVTargetStreamer::~SPIRVTargetStreamer() = default; diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index efd49b930aa34..fa85ee781c249 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -31,6 +31,7 @@ FunctionPass *createSPIRVPreLegalizerCombiner(); FunctionPass *createSPIRVPreLegalizerPass(); FunctionPass *createSPIRVPostLegalizerPass(); ModulePass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM); +ModulePass *createSPIRVPrepareGlobalsPass(); MachineFunctionPass *createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM); InstructionSelector * createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, @@ -51,6 +52,7 @@ void initializeSPIRVLegalizePointerCastPass(PassRegistry &); void initializeSPIRVRegularizerPass(PassRegistry &); void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); void initializeSPIRVPrepareFunctionsPass(PassRegistry &); +void initializeSPIRVPrepareGlobalsPass(PassRegistry &); void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); void initializeSPIRVLegalizeImplicitBindingPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..970b83de5ee33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); @@ -607,13 +612,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { // Collect the SPIRVTypes for fp16, fp32, and fp64 and the 
constant of // type int32 with 0 value to represent the FP Fast Math Mode. std::vector<const MachineInstr *> SPIRVFloatTypes; - const MachineInstr *ConstZero = nullptr; + const MachineInstr *ConstZeroInt32 = nullptr; for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { - // Skip if the instruction is not OpTypeFloat or OpConstant. unsigned OpCode = MI->getOpcode(); - if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantNull) - continue; // Collect the SPIRV type if it's a float. if (OpCode == SPIRV::OpTypeFloat) { @@ -624,14 +626,18 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { continue; } SPIRVFloatTypes.push_back(MI); - } else { + continue; + } + + if (OpCode == SPIRV::OpConstantNull) { // Check if the constant is int32, if not skip it. const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); - if (!TypeMI || TypeMI->getOperand(1).getImm() != 32) - continue; - - ConstZero = MI; + bool IsInt32Ty = TypeMI && + TypeMI->getOpcode() == SPIRV::OpTypeInt && + TypeMI->getOperand(1).getImm() == 32; + if (IsInt32Ty) + ConstZeroInt32 = MI; } } @@ -652,9 +658,9 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { MCRegister TypeReg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(TypeReg)); - assert(ConstZero && "There should be a constant zero."); + assert(ConstZeroInt32 && "There should be a constant zero."); MCRegister ConstReg = MAI->getRegisterAlias( - ConstZero->getMF(), ConstZero->getOperand(0).getReg()); + ConstZeroInt32->getMF(), ConstZeroInt32->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(ConstReg)); outputMCInst(Inst); } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 56a38bb49b7e7..b2cbdb2ad7375 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call, return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); +} + static bool generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, @@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall, return generatePipeInst(Call.get(), MIRBuilder, GR); case SPIRV::PredicatedLoadStore: return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR); + case SPIRV::BlockingPipes: + return generateBlockingPipesInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index c259ccee359b4..492a98e1995fe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; def Pipe : BuiltinGroup; def PredicatedLoadStore : BuiltinGroup; +def BlockingPipes : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. 
The information in the record @@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; + +//SPV_ALTERA_blocking_pipes +defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>; +defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>; defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+ if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..ac09b937a584a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -29,6 +29,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add}, {"SPV_EXT_shader_atomic_float_min_max", SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_min_max}, + {"SPV_INTEL_16bit_atomics", + SPIRV::Extension::Extension::SPV_INTEL_16bit_atomics}, {"SPV_EXT_arithmetic_fence", SPIRV::Extension::Extension::SPV_EXT_arithmetic_fence}, {"SPV_EXT_demote_to_helper_invocation", @@ -107,6 +109,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +159,11 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}, + {"SPV_ALTERA_blocking_pipes", + SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index a151fd2fbdb7a..599cc35ca2e9d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -767,6 +767,8 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( Type *RefTy = deduceElementTypeHelper(Ref->getPointerOperand(), Visited, UnknownElemTypeI8); maybeAssignPtrType(Ty, I, RefTy, UnknownElemTypeI8); + } else if (auto *Ref = dyn_cast<IntToPtrInst>(I)) { + maybeAssignPtrType(Ty, I, Ref->getDestTy(), UnknownElemTypeI8); } else if (auto *Ref = dyn_cast<BitCastInst>(I)) { if (Type *Src = Ref->getSrcTy(), *Dest = Ref->getDestTy(); isPointerTy(Src) && isPointerTy(Dest)) @@ -2149,7 +2151,9 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, for (const auto &Op : I->operands()) { if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) || // Check GetElementPtrConstantExpr case. 
- (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) { + (isa<ConstantExpr>(Op) && + (isa<GEPOperator>(Op) || + (cast<ConstantExpr>(Op)->getOpcode() == CastInst::IntToPtr)))) { setInsertPointSkippingPhis(B, I); Type *OpTy = Op->getType(); if (isa<UndefValue>(Op) && OpTy->isAggregateType()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h index a329fd5ed9d29..c99d603d340ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h +++ b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h @@ -22,8 +22,6 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include <type_traits> - namespace llvm { namespace SPIRV { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index ba95ad822df75..4f8bf4312a380 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) - : SPIRVGenInstrInfo(STI) {} + : SPIRVGenInstrInfo(STI, RI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a61351eba03f8..03bd61bdf2cf6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr, "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">; def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops), "OpPredicatedStoreINTEL $ptr $object $predicate">; + +// SPV_ALTERA_blocking_pipes +def OpReadPipeBlockingALTERA : Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; +def OpWritePipeBlockingALTERA : Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment), + "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3fea21e6e694c..fc87288a4a212 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1210,8 +1210,16 @@ bool SPIRVInstructionSelector::selectUnOp(Register ResVReg, for (MachineRegisterInfo::def_instr_iterator DefIt = MRI->def_instr_begin(SrcReg); DefIt != MRI->def_instr_end(); DefIt = std::next(DefIt)) { - if ((*DefIt).getOpcode() == TargetOpcode::G_GLOBAL_VALUE || - (*DefIt).getOpcode() == SPIRV::OpVariable) { + unsigned DefOpCode = DefIt->getOpcode(); + if (DefOpCode == SPIRV::ASSIGN_TYPE) { + // We need special handling to look through the type assignment and see + // if this is a constant or a global + if (auto *VRD = getVRegDef(*MRI, DefIt->getOperand(1).getReg())) + DefOpCode = VRD->getOpcode(); + } + if (DefOpCode == TargetOpcode::G_GLOBAL_VALUE || + DefOpCode == TargetOpcode::G_CONSTANT || + DefOpCode == SPIRV::OpVariable || DefOpCode == SPIRV::OpConstantI) { IsGV = true; break; } @@ -3099,9 +3107,10 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp( SmallPtrSet<SPIRVType *, 4> Visited; if (!OpDefine || !OpType || isConstReg(MRI, OpDefine, Visited) || OpDefine->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || + OpDefine->getOpcode() == TargetOpcode::G_INTTOPTR ||
GR.isAggregateType(OpType)) { // The case of G_ADDRSPACE_CAST inside spv_const_composite() is processed - // by selectAddrSpaceCast() + // by selectAddrSpaceCast(), and G_INTTOPTR is processed by selectUnOp() CompositeArgs.push_back(OpReg); continue; } @@ -3151,6 +3160,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectInsertElt(ResVReg, ResType, I); case Intrinsic::spv_gep: return selectGEP(ResVReg, ResType, I); + case Intrinsic::spv_bitcast: { + Register OpReg = I.getOperand(2).getReg(); + SPIRVType *OpType = + OpReg.isValid() ? GR.getSPIRVTypeForVReg(OpReg) : nullptr; + if (!GR.isBitcastCompatible(ResType, OpType)) + report_fatal_error("incompatible result and operand types in a bitcast"); + return selectOpWithSrcs(ResVReg, ResType, I, {OpReg}, SPIRV::OpBitcast); + } case Intrinsic::spv_unref_global: case Intrinsic::spv_init_global: { MachineInstr *MI = MRI->getVRegDef(I.getOperand(1).getReg()); @@ -3508,6 +3525,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 6e444c98de8da..4ce871b6f5e5d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -73,16 +73,23 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); Value *AssignValue = NewLoad; if (TargetType->getElementType() != SourceType->getElementType()) { + const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout(); + [[maybe_unused]] TypeSize TargetTypeSize = + DL.getTypeSizeInBits(TargetType); + [[maybe_unused]] TypeSize SourceTypeSize = + DL.getTypeSizeInBits(SourceType); + assert(TargetTypeSize == SourceTypeSize); AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, {TargetType, SourceType}, {NewLoad}); buildAssignType(B, TargetType, AssignValue); + return AssignValue; } + assert(TargetType->getNumElements() < SourceType->getNumElements()); SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; @@ -109,6 +116,81 @@ class SPIRVLegalizePointerCast : public FunctionPass { return LI; } + // Loads elements from an array and constructs a vector. + Value *loadVectorFromArray(IRBuilder<> &B, FixedVectorType *TargetType, + Value *Source) { + // Load each element of the array. + SmallVector<Value *, 4> LoadedElements; + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. 
+ SmallVector<Type *, 2> Types = {Source->getType(), Source->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(false)); + Args.push_back(Source); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, TargetType->getElementType(), ElementPtr); + + // Load the value from the element pointer. + Value *Load = B.CreateLoad(TargetType->getElementType(), ElementPtr); + buildAssignType(B, TargetType->getElementType(), Load); + LoadedElements.push_back(Load); + } + + // Build the vector from the loaded elements. + Value *NewVector = PoisonValue::get(TargetType); + buildAssignType(B, TargetType, NewVector); + + for (unsigned i = 0; i < TargetType->getNumElements(); ++i) { + Value *Index = B.getInt32(i); + SmallVector<Type *, 4> Types = {TargetType, TargetType, + TargetType->getElementType(), + Index->getType()}; + SmallVector<Value *> Args = {NewVector, LoadedElements[i], Index}; + NewVector = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args}); + buildAssignType(B, TargetType, NewVector); + } + return NewVector; + } + + // Stores elements from a vector into an array. + void storeArrayFromVector(IRBuilder<> &B, Value *SrcVector, + Value *DstArrayPtr, ArrayType *ArrTy, + Align Alignment) { + auto *VecTy = cast<FixedVectorType>(SrcVector->getType()); + + // Ensure the element types of the array and vector are the same. + assert(VecTy->getElementType() == ArrTy->getElementType() && + "Element types of array and vector must be the same."); + + for (unsigned i = 0; i < VecTy->getNumElements(); ++i) { + // Create a GEP to access the i-th element of the array. + SmallVector<Type *, 2> Types = {DstArrayPtr->getType(), + DstArrayPtr->getType()}; + SmallVector<Value *, 4> Args; + Args.push_back(B.getInt1(false)); + Args.push_back(DstArrayPtr); + Args.push_back(B.getInt32(0)); + Args.push_back(ConstantInt::get(B.getInt32Ty(), i)); + auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); + GR->buildAssignPtr(B, ArrTy->getElementType(), ElementPtr); + + // Extract the element from the vector and store it. + Value *Index = B.getInt32(i); + SmallVector<Type *, 3> EltTypes = {VecTy->getElementType(), VecTy, + Index->getType()}; + SmallVector<Value *, 2> EltArgs = {SrcVector, Index}; + Value *Element = + B.CreateIntrinsic(Intrinsic::spv_extractelt, {EltTypes}, {EltArgs}); + buildAssignType(B, VecTy->getElementType(), Element); + + Types = {Element->getType(), ElementPtr->getType()}; + Args = {Element, ElementPtr, B.getInt16(2), B.getInt8(Alignment.value())}; + B.CreateIntrinsic(Intrinsic::spv_store, {Types}, {Args}); + } + } + // Replaces the load instruction to get rid of the ptrcast used as source // operand. 
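For readers less familiar with the SPIR-V intrinsics used above, here is a plain-IRBuilder analogue of loadVectorFromArray. It is illustrative only: the real helper routes every step through spv_gep/spv_insertelt plus the type-assignment bookkeeping, and all names below are made up.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Per-element GEP + load from the array, then insertelement into a vector
// that starts out as poison.
static Value *loadVecFromArraySketch(IRBuilder<> &B, FixedVectorType *VT,
                                     ArrayType *ArrTy, Value *ArrPtr) {
  Value *Vec = PoisonValue::get(VT);
  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
    Value *EltPtr = B.CreateConstInBoundsGEP2_32(ArrTy, ArrPtr, 0, I);
    Value *Elt = B.CreateLoad(VT->getElementType(), EltPtr);
    Vec = B.CreateInsertElement(Vec, Elt, B.getInt32(I));
  }
  return Vec;
}

storeArrayFromVector is the mirror image: extractelement per lane, then a per-element GEP and store into the array.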
void transformLoad(IRBuilder<> &B, LoadInst *LI, Value *CastedOperand, @@ -147,6 +229,8 @@ class SPIRVLegalizePointerCast : public FunctionPass { // - float v = s.m; else if (SST && SST->getTypeAtIndex(0u) == ToTy) Output = loadFirstValueFromAggregate(B, ToTy, OriginalOperand, LI); + else if (SAT && DVT && SAT->getElementType() == DVT->getElementType()) + Output = loadVectorFromArray(B, DVT, OriginalOperand); else llvm_unreachable("Unimplemented implicit down-cast from load."); @@ -281,6 +365,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { auto *S_VT = dyn_cast<FixedVectorType>(FromTy); auto *D_ST = dyn_cast<StructType>(ToTy); auto *D_VT = dyn_cast<FixedVectorType>(ToTy); + auto *D_AT = dyn_cast<ArrayType>(ToTy); B.SetInsertPoint(BadStore); if (D_ST && isTypeFirstElementAggregate(FromTy, D_ST)) @@ -289,6 +374,8 @@ class SPIRVLegalizePointerCast : public FunctionPass { storeVectorFromVector(B, Src, Dst, Alignment); else if (D_VT && !S_VT && FromTy == D_VT->getElementType()) storeToFirstValueAggregate(B, Src, Dst, D_VT, Alignment); + else if (D_AT && S_VT && S_VT->getElementType() == D_AT->getElementType()) + storeArrayFromVector(B, Src, Dst, D_AT, Alignment); else llvm_unreachable("Unsupported ptrcast use in store. Please fix."); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index f7cdfcb65623b..b8cd9c1358f00 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -249,17 +249,18 @@ static InstrSignature instrToSignature(const MachineInstr &MI, InstrSignature Signature{MI.getOpcode()}; for (unsigned i = 0; i < MI.getNumOperands(); ++i) { // The only decorations that can be applied more than once to a given <id> - // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442), - // and CacheControlStoreINTEL (6443). For all the rest of decorations, we - // will only add to the signature the Opcode, the id to which it applies, - // and the decoration id, disregarding any decoration flags. This will - // ensure that any subsequent decoration with the same id will be deemed as - // a duplicate. Then, at the call site, we will be able to handle duplicates - // in the best way. + // or structure member are FuncParamAttr (38), UserSemantic (5635), + // CacheControlLoadINTEL (6442), and CacheControlStoreINTEL (6443). For all + // the rest of decorations, we will only add to the signature the Opcode, + // the id to which it applies, and the decoration id, disregarding any + // decoration flags. This will ensure that any subsequent decoration with + // the same id will be deemed as a duplicate. Then, at the call site, we + // will be able to handle duplicates in the best way. 
unsigned Opcode = MI.getOpcode(); if ((Opcode == SPIRV::OpDecorate) && i >= 2) { unsigned DecorationID = MI.getOperand(1).getImm(); - if (DecorationID != SPIRV::Decoration::UserSemantic && + if (DecorationID != SPIRV::Decoration::FuncParamAttr && + DecorationID != SPIRV::Decoration::UserSemantic && DecorationID != SPIRV::Decoration::CacheControlLoadINTEL && DecorationID != SPIRV::Decoration::CacheControlStoreINTEL) continue; @@ -613,8 +614,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, << FinalFlags << "\n"; MachineInstr *OrigMINonConst = const_cast<MachineInstr *>(OrigMI); MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); - OrigFlagsOp = - MachineOperand::CreateImm(static_cast<unsigned>(FinalFlags)); + OrigFlagsOp = MachineOperand::CreateImm(FinalFlags); return; // Merge done, so we found a duplicate; don't add it to MAI.MS } } @@ -1059,6 +1059,13 @@ static void addOpTypeImageReqs(const MachineInstr &MI, } } +static bool isBFloat16Type(const SPIRVType *TypeDef) { + return TypeDef && TypeDef->getNumOperands() == 3 && + TypeDef->getOpcode() == SPIRV::OpTypeFloat && + TypeDef->getOperand(1).getImm() == 16 && + TypeDef->getOperand(2).getImm() == SPIRV::FPEncoding::BFloat16KHR; +} + // Add requirements for handling atomic float instructions #define ATOM_FLT_REQ_EXT_MSG(ExtName) \ "The atomic float instruction requires the following SPIR-V " \ @@ -1082,11 +1089,21 @@ static void AddAtomicFloatRequirements(const MachineInstr &MI, Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float_add); switch (BitWidth) { case 16: - if (!ST.canUseExtension( - SPIRV::Extension::SPV_EXT_shader_atomic_float16_add)) - report_fatal_error(ATOM_FLT_REQ_EXT_MSG("16_add"), false); - Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float16_add); - Reqs.addCapability(SPIRV::Capability::AtomicFloat16AddEXT); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics)) + report_fatal_error( + "The atomic bfloat16 instruction requires the following SPIR-V " + "extension: SPV_INTEL_16bit_atomics", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics); + Reqs.addCapability(SPIRV::Capability::AtomicBFloat16AddINTEL); + } else { + if (!ST.canUseExtension( + SPIRV::Extension::SPV_EXT_shader_atomic_float16_add)) + report_fatal_error(ATOM_FLT_REQ_EXT_MSG("16_add"), false); + Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float16_add); + Reqs.addCapability(SPIRV::Capability::AtomicFloat16AddEXT); + } break; case 32: Reqs.addCapability(SPIRV::Capability::AtomicFloat32AddEXT); @@ -1105,7 +1122,17 @@ static void AddAtomicFloatRequirements(const MachineInstr &MI, Reqs.addExtension(SPIRV::Extension::SPV_EXT_shader_atomic_float_min_max); switch (BitWidth) { case 16: - Reqs.addCapability(SPIRV::Capability::AtomicFloat16MinMaxEXT); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics)) + report_fatal_error( + "The atomic bfloat16 instruction requires the following SPIR-V " + "extension: SPV_INTEL_16bit_atomics", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_16bit_atomics); + Reqs.addCapability(SPIRV::Capability::AtomicBFloat16MinMaxINTEL); + } else { + Reqs.addCapability(SPIRV::Capability::AtomicFloat16MinMaxEXT); + } break; case 32: Reqs.addCapability(SPIRV::Capability::AtomicFloat32MinMaxEXT); @@ -1329,13 +1356,6 @@ void addPrintfRequirements(const MachineInstr &MI, } } -static bool isBFloat16Type(const SPIRVType *TypeDef) { - return 
TypeDef && TypeDef->getNumOperands() == 3 && - TypeDef->getOpcode() == SPIRV::OpTypeFloat && - TypeDef->getOperand(1).getImm() == 16 && - TypeDef->getOperand(2).getImm() == SPIRV::FPEncoding::BFloat16KHR; -} - void addInstrRequirements(const MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, const SPIRVSubtarget &ST) { @@ -1436,6 +1456,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -1884,6 +1906,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability( SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL); break; + case SPIRV::OpReadPipeBlockingALTERA: + case SPIRV::OpWritePipeBlockingALTERA: + if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) { + Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes); + Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA); + } + break; case SPIRV::OpCooperativeMatrixGetElementCoordINTEL: if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix)) report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the " @@ -2061,7 +2090,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; 
} @@ -2181,6 +2267,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 2d19f6de604e4..44b6c66d361bf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -81,7 +81,7 @@ struct RequirementHandler { void initAvailableCapabilitiesForVulkan(const SPIRVSubtarget &ST); public: - RequirementHandler() {} + RequirementHandler() = default; void clear() { MinimalCaps.clear(); AllCaps.clear(); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index db6f2d61e8f29..d538009f0ecbe 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -192,31 +192,43 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, .addUse(OpReg); } -// We do instruction selections early instead of calling MIB.buildBitcast() -// generating the general op code G_BITCAST. When MachineVerifier validates -// G_BITCAST we see a check of a kind: if Source Type is equal to Destination -// Type then report error "bitcast must change the type". This doesn't take into -// account the notion of a typed pointer that is important for SPIR-V where a -// user may and should use bitcast between pointers with different pointee types -// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast). -// It's important for correct lowering in SPIR-V, because interpretation of the -// data type is not left to instructions that utilize the pointer, but encoded -// by the pointer declaration, and the SPIRV target can and must handle the -// declaration and use of pointers that specify the type of data they point to. -// It's not feasible to improve validation of G_BITCAST using just information -// provided by low level types of source and destination. Therefore we don't -// produce G_BITCAST as the general op code with semantics different from -// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only -// difference would be that CombinerHelper couldn't transform known patterns -// around G_BUILD_VECTOR. See discussion -// in https://github.com/llvm/llvm-project/pull/110270 for even more context. -static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +// We lower G_BITCAST to OpBitcast here to avoid a MachineVerifier error. +// The verifier checks if the source and destination LLTs of a G_BITCAST are +// different, but this check is too strict for SPIR-V's typed pointers, which +// may have the same LLT but different SPIRVType (e.g. pointers to different +// pointee types). By lowering to OpBitcast here, we bypass the verifier's +// check. See discussion in https://github.com/llvm/llvm-project/pull/110270 +// for more context. +// +// We also handle the llvm.spv.bitcast intrinsic here. If the source and +// destination SPIR-V types are the same, we lower it to a COPY to enable +// further optimizations like copy propagation. 
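Before the function itself, a brief concrete illustration of the typed-pointer issue described in the comment above. This is a hedged sketch: the address space and pointer width are arbitrary choices, and the header path for LLT has moved between LLVM versions.

#include "llvm/CodeGenTypes/LowLevelType.h"

using namespace llvm;

static void sameLLTDifferentSPIRVType() {
  // A SPIR-V pointer to i32 and a SPIR-V pointer to float can both lower to
  // the same low-level type, so a G_BITCAST between them looks like a
  // do-nothing bitcast to the MachineVerifier, even though their SPIRVTypes
  // differ.
  LLT PtrToI32 = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);
  LLT PtrToFloat = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);
  assert(PtrToI32 == PtrToFloat && "indistinguishable at the LLT level");
  (void)PtrToI32;
  (void)PtrToFloat;
}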
+static void lowerBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { SmallVector<MachineInstr *, 16> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + SPIRVType *DstType = GR->getSPIRVTypeForVReg(DstReg); + assert( + DstType && + "Expected destination SPIR-V type to have been assigned already."); + SPIRVType *SrcType = GR->getSPIRVTypeForVReg(SrcReg); + assert(SrcType && + "Expected source SPIR-V type to have been assigned already."); + if (DstType == SrcType) { + MIB.setInsertPt(*MI.getParent(), MI); + MIB.buildCopy(DstReg, SrcReg); + ToErase.push_back(&MI); + continue; + } + } + if (MI.getOpcode() != TargetOpcode::G_BITCAST) continue; + MIB.setInsertPt(*MI.getParent(), MI); buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); @@ -237,16 +249,11 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && - !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); ToErase.push_back(&MI); - if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); - continue; - } Register Def = MI.getOperand(0).getReg(); Register Source = MI.getOperand(2).getReg(); Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0); @@ -1089,7 +1096,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { removeImplicitFallthroughs(MF, MIB); insertSpirvDecorations(MF, GR, MIB); insertInlineAsm(MF, GR, ST, MIB); - selectOpBitcasts(MF, GR, MIB); + lowerBitcasts(MF, GR, MIB); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 4e4e6fb4ab791..be88f334d2171 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -56,6 +56,13 @@ class SPIRVPrepareFunctions : public ModulePass { } }; +static cl::list<std::string> SPVAllowUnknownIntrinsics( + "spv-allow-unknown-intrinsics", cl::CommaSeparated, + cl::desc("Emit unknown intrinsics as calls to external functions. 
A " + "comma-separated input list of intrinsic prefixes must be " + "provided, and only intrinsics carrying a listed prefix get " + "emitted as described."), + cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional); } // namespace char SPIRVPrepareFunctions::ID = 0; @@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { EraseFromParent); Changed = true; break; + default: + if (TM.getTargetTriple().getVendor() == Triple::AMD || + any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) { + if (Prefix.empty()) + return false; + return II->getCalledFunction()->getName().starts_with(Prefix); + })) + Changed |= lowerIntrinsicToFunction(II); + break; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp new file mode 100644 index 0000000000000..14b75d7d16a4d --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp @@ -0,0 +1,105 @@ +//===-- SPIRVPrepareGlobals.cpp - Prepare IR SPIRV globals ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pass transforms IR globals that cannot be trivially mapped to SPIRV +// into something that is trival to lower. +// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "SPIRVUtils.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +namespace { + +struct SPIRVPrepareGlobals : public ModulePass { + static char ID; + SPIRVPrepareGlobals() : ModulePass(ID) {} + + StringRef getPassName() const override { + return "SPIRV prepare global variables"; + } + + bool runOnModule(Module &M) override; +}; + +bool tryExtendLLVMBitcodeMarker(GlobalVariable &Bitcode) { + assert(Bitcode.getName() == "llvm.embedded.module"); + + ArrayType *AT = cast<ArrayType>(Bitcode.getValueType()); + if (AT->getNumElements() != 0) + return false; + + ArrayType *AT1 = ArrayType::get(AT->getElementType(), 1); + Constant *OneEltInit = Constant::getNullValue(AT1); + Bitcode.replaceInitializer(OneEltInit); + return true; +} + +// In HIP, dynamic LDS variables are represented using 0-element global arrays +// in the __shared__ language address-space. +// +// extern __shared__ int LDS[]; +// +// These are not representable in SPIRV directly. +// To represent them, for AMD, we use an array with UINT32_MAX-elements. +// These are reverse translated to 0-element arrays. 
+bool tryExtendDynamicLDSGlobal(GlobalVariable &GV) { + constexpr unsigned WorkgroupAS = + storageClassToAddressSpace(SPIRV::StorageClass::Workgroup); + const bool IsWorkgroupExternal = + GV.hasExternalLinkage() && GV.getAddressSpace() == WorkgroupAS; + if (!IsWorkgroupExternal) + return false; + + const ArrayType *AT = dyn_cast<ArrayType>(GV.getValueType()); + if (!AT || AT->getNumElements() != 0) + return false; + + constexpr auto UInt32Max = std::numeric_limits<uint32_t>::max(); + ArrayType *NewAT = ArrayType::get(AT->getElementType(), UInt32Max); + GlobalVariable *NewGV = new GlobalVariable( + *GV.getParent(), NewAT, GV.isConstant(), GV.getLinkage(), nullptr, "", + &GV, GV.getThreadLocalMode(), WorkgroupAS, GV.isExternallyInitialized()); + NewGV->takeName(&GV); + GV.replaceAllUsesWith(NewGV); + GV.eraseFromParent(); + + return true; +} + +bool SPIRVPrepareGlobals::runOnModule(Module &M) { + const bool IsAMD = M.getTargetTriple().getVendor() == Triple::AMD; + if (!IsAMD) + return false; + + bool Changed = false; + if (GlobalVariable *Bitcode = M.getNamedGlobal("llvm.embedded.module")) + Changed |= tryExtendLLVMBitcodeMarker(*Bitcode); + + for (GlobalVariable &GV : make_early_inc_range(M.globals())) + Changed |= tryExtendDynamicLDSGlobal(GV); + + return Changed; +} +char SPIRVPrepareGlobals::ID = 0; + +} // namespace + +INITIALIZE_PASS(SPIRVPrepareGlobals, "prepare-globals", + "SPIRV prepare global variables", false, false) + +namespace llvm { +ModulePass *createSPIRVPrepareGlobalsPass() { + return new SPIRVPrepareGlobals(); +} +} // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. 
initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..f02a587013856 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>; defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>; defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>; -defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; +defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>; defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>; defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>; defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>; @@ -387,6 +387,9 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; +defm SPV_INTEL_16bit_atomics : ExtensionOperand<130, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -564,12 +567,15 @@ defm FloatControls2 defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; +defm AtomicBFloat16AddINTEL : CapabilityOperand<6255, 0, 0, [SPV_INTEL_16bit_atomics], []>; defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; +defm AtomicBFloat16MinMaxINTEL : CapabilityOperand<6256, 0, 0, [SPV_INTEL_16bit_atomics], []>; defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +593,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL 
: CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. +// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -603,6 +614,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>; defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -805,6 +817,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. 
+// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1940,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 7dd0b95cd9763..10bbca225b20a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -59,6 +59,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { initializeSPIRVEmitIntrinsicsPass(PR); initializeSPIRVEmitNonSemanticDIPass(PR); initializeSPIRVPrepareFunctionsPass(PR); + initializeSPIRVPrepareGlobalsPass(PR); initializeSPIRVStripConvergentIntrinsicsPass(PR); } @@ -69,7 +70,7 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { } // Pin SPIRVTargetObjectFile's vtables to this file. 
-SPIRVTargetObjectFile::~SPIRVTargetObjectFile() {} +SPIRVTargetObjectFile::~SPIRVTargetObjectFile() = default; SPIRVTargetMachine::SPIRVTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -172,6 +173,7 @@ void SPIRVPassConfig::addIRPasses() { addPass(createSPIRVRegularizerPass()); addPass(createSPIRVPrepareFunctionsPass(TM)); + addPass(createSPIRVPrepareGlobalsPass()); } void SPIRVPassConfig::addISelPrepare() { @@ -244,7 +246,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 7c1db3cfcd6b4..ef45d31a029d3 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -235,7 +235,7 @@ class SparcOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegisterKind Kind; }; @@ -244,8 +244,8 @@ class SparcOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned OffsetReg; + MCRegister Base; + MCRegister OffsetReg; const MCExpr *Off; }; @@ -326,7 +326,7 @@ class SparcOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -334,12 +334,12 @@ class SparcOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemOffsetReg() const { + MCRegister getMemOffsetReg() const { assert((Kind == k_MemoryReg) && "Invalid access!"); return Mem.OffsetReg; } @@ -376,12 +376,16 @@ class SparcOperand : public MCParsedAsmOperand { void print(raw_ostream &OS, const MCAsmInfo &MAI) const override { switch (Kind) { case k_Token: OS << "Token: " << getToken() << "\n"; break; - case k_Register: OS << "Reg: #" << getReg() << "\n"; break; + case k_Register: + OS << "Reg: #" << getReg().id() << "\n"; + break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; - case k_MemoryReg: OS << "Mem: " << getMemBase() << "+" - << getMemOffsetReg() << "\n"; break; + case k_MemoryReg: + OS << "Mem: " << getMemBase().id() << "+" << getMemOffsetReg().id() + << "\n"; + break; case k_MemoryImm: assert(getMemOff() != nullptr); - OS << "Mem: " << getMemBase() << "+"; + OS << "Mem: " << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOff()); OS << "\n"; break; @@ -432,7 +436,7 @@ class SparcOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(getMemBase())); - assert(getMemOffsetReg() != 0 && "Invalid offset"); + assert(getMemOffsetReg().isValid() && "Invalid offset"); Inst.addOperand(MCOperand::createReg(getMemOffsetReg())); } @@ -480,10 +484,10 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind, + static std::unique_ptr<SparcOperand> CreateReg(MCRegister Reg, unsigned Kind, SMLoc S, SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; 
Op->StartLoc = S; Op->EndLoc = E; @@ -540,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { regIdx = Reg - Sparc::I0 + 24; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Reg = IntPairRegs[regIdx / 2]; Op.Reg.Kind = rk_IntPairReg; return true; } @@ -551,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand { unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = DoubleRegs[regIdx / 2]; + Op.Reg.Reg = DoubleRegs[regIdx / 2]; Op.Reg.Kind = rk_DoubleReg; return true; } @@ -574,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand { Reg = QuadFPRegs[regIdx / 2]; break; } - Op.Reg.RegNum = Reg; + Op.Reg.Reg = Reg; Op.Reg.Kind = rk_QuadReg; return true; } @@ -587,13 +591,13 @@ class SparcOperand : public MCParsedAsmOperand { regIdx = Reg - Sparc::C0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = CoprocPairRegs[regIdx / 2]; + Op.Reg.Reg = CoprocPairRegs[regIdx / 2]; Op.Reg.Kind = rk_CoprocPairReg; return true; } static std::unique_ptr<SparcOperand> - MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMrr(MCRegister Base, std::unique_ptr<SparcOperand> Op) { MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; @@ -602,8 +606,8 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<SparcOperand> - CreateMEMr(unsigned Base, SMLoc S, SMLoc E) { + static std::unique_ptr<SparcOperand> CreateMEMr(MCRegister Base, SMLoc S, + SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_MemoryReg); Op->Mem.Base = Base; Op->Mem.OffsetReg = Sparc::G0; // always 0 @@ -614,11 +618,11 @@ class SparcOperand : public MCParsedAsmOperand { } static std::unique_ptr<SparcOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<SparcOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryImm; Op->Mem.Base = Base; - Op->Mem.OffsetReg = 0; + Op->Mem.OffsetReg = MCRegister(); Op->Mem.Off = Imm; return Op; } diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 7137e5fbff4ff..38b0508885069 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -95,6 +95,9 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", "rd %pc, %XX is slow", [FeatureV9]>; +def TuneNoPredictor : SubtargetFeature<"no-predictor", "HasNoPredictor", "true", + "Processor has no branch predictor, branches stall execution", []>; + //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" @@ -174,12 +177,15 @@ def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2], [TuneSlowRDPC]>; def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS, - FeatureVIS2, FeatureUA2005]>; + FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc, - FeatureVIS, FeatureVIS2, FeatureUA2005]>; + FeatureVIS, FeatureVIS2, FeatureUA2005], + [TuneNoPredictor]>; def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, - FeatureUA2005, FeatureUA2007]>; + FeatureUA2005, FeatureUA2007], + [TuneNoPredictor]>; def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, FeatureUA2005, FeatureUA2007, FeatureOSA2011, diff --git 
a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index cbb7db68f7e7c..ae3c32687c207 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2000,6 +2000,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + // Some processors have no branch predictor and have pipelines longer than + // what can be covered by the delay slot. This results in a stall, so mark + // branches as expensive on those processors. + setJumpIsExpensive(Subtarget->hasNoPredictor()); + // The high cost of branching means that using conditional moves will + // still be profitable even if the condition is predictable. + PredictableSelectIsExpensive = !isJumpExpensive(); + setMinFunctionAlignment(Align(4)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..6596379061e60 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,8 +38,8 @@ static cl::opt<unsigned> void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), - Subtarget(ST) {} + : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -527,7 +527,6 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -564,10 +563,12 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void SparcInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SparcInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index 01d0204734943..273888f427992 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -92,14 +92,13 @@ class SparcInstrInfo : public SparcGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register
getGlobalBaseReg(MachineFunction *MF) const; diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index b1decca0a4f07..f575f6d7da37f 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -21,7 +21,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" -#include <string> #define GET_SUBTARGETINFO_HEADER #include "SparcGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index 275165d2acb07..a24543b699ab4 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -218,7 +218,7 @@ void SystemZInstPrinterCommon::printBDXAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); uint64_t Length = MI->getOperand(OpNum + 2).getImm(); printOperand(DispMO, &MAI, O); @@ -232,9 +232,9 @@ void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); - unsigned Length = MI->getOperand(OpNum + 2).getReg(); + MCRegister Length = MI->getOperand(OpNum + 2).getReg(); printOperand(DispMO, &MAI, O); O << "("; printRegName(O, Length); diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index dcefff99db25b..570bbd884a244 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -360,12 +360,12 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -389,10 +389,10 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at @@ -1157,12 +1157,12 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, 
MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -1189,10 +1189,10 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 5313fba3bed1d..8fc339f59e60a 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -115,11 +115,10 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { } bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { - const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); const MCInstrDesc &MID = MI->getDesc(); unsigned Count = 0; for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { - const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx); if (RC == nullptr) continue; if (OpIdx >= MID.getNumDefs() && diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..eb1ce4a2101d7 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) { void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) - : SystemZGenInstrInfo(sti, -1, -1), + : SystemZGenInstrInfo(sti, RI, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} @@ -1023,8 +1023,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void SystemZInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -1036,10 +1036,12 @@ void SystemZInstrInfo::storeRegToStackSlot( FrameIdx); } -void SystemZInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 7b9ad7b87a14f..4aecdd7498018 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -281,12 +281,14 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index 21a233b2ffa1d..b7a93e7babefe 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -216,6 +216,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI, MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() || MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER || MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() || + MI.getOpcode() == TargetOpcode::RELOC_NONE || // These have a size that may be zero: MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP || MI.getOpcode() == SystemZ::PATCHPOINT || diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h index 9d0adbb81d86d..87ec2564edcfb 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.h @@ -16,7 +16,7 @@ namespace llvm { /// This implementation is used for SystemZ ELF targets. class SystemZELFTargetObjectFile : public TargetLoweringObjectFileELF { public: - SystemZELFTargetObjectFile() {} + SystemZELFTargetObjectFile() = default; /// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp index ec673ef4cda52..7387571418c8d 100644 --- a/llvm/lib/Target/Target.cpp +++ b/llvm/lib/Target/Target.cpp @@ -37,6 +37,7 @@ inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) { void llvm::initializeTarget(PassRegistry &Registry) { initializeTargetLibraryInfoWrapperPassPass(Registry); + initializeRuntimeLibraryInfoWrapperPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); } diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 20f561a8dac34..9b47d237f0702 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -54,7 +54,7 @@ class VEAsmParser : public MCTargetAsmParser { uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - int parseRegisterName(MCRegister (*matchFn)(StringRef)); + MCRegister parseRegisterName(MCRegister (*matchFn)(StringRef)); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -169,7 +169,7 @@ class VEOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; }; struct ImmOp { @@ -177,8 +177,8 @@ class VEOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned IndexReg; + MCRegister Base; + MCRegister IndexReg; const MCExpr *Index; const MCExpr *Offset; }; @@ -342,7 +342,7 @@ class VEOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -350,14 +350,14 @@ class VEOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm || Kind == k_MemoryRegImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemIndexReg() const { + MCRegister getMemIndexReg() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) && "Invalid access!"); return Mem.IndexReg; @@ -415,20 +415,21 @@ class VEOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case k_Register: - OS << "Reg: #" << getReg() << "\n"; + OS << "Reg: #" << getReg().id() << "\n"; break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; case k_MemoryRegRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+"; + OS << "Mem: #" << getMemBase().id() << "+#" << getMemIndexReg().id() + << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; case k_MemoryRegImmImm: assert(getMemIndex() != nullptr && getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemIndex()); OS << "+"; MAI.printExpr(OS, *getMemOffset()); @@ -436,7 +437,7 @@ class VEOperand : public MCParsedAsmOperand { break; case k_MemoryZeroRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: 0+#" << getMemIndexReg() << "+"; + OS << "Mem: 0+#" << getMemIndexReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -450,7 +451,7 @@ class VEOperand : public MCParsedAsmOperand { break; case k_MemoryRegImm: assert(getMemOffset() != nullptr); - OS << 
"Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -606,10 +607,10 @@ class VEOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S, + static std::unique_ptr<VEOperand> CreateReg(MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique<VEOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -653,38 +654,38 @@ class VEOperand : public MCParsedAsmOperand { } static bool MorphToI32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = I32Regs[regIdx]; + Op.Reg.Reg = I32Regs[regIdx]; return true; } static bool MorphToF32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = F32Regs[regIdx]; + Op.Reg.Reg = F32Regs[regIdx]; return true; } static bool MorphToF128Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx % 2 || regIdx > 63) return false; - Op.Reg.RegNum = F128Regs[regIdx / 2]; + Op.Reg.Reg = F128Regs[regIdx / 2]; return true; } static bool MorphToVM512Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::VM0; if (regIdx % 2 || regIdx > 15) return false; - Op.Reg.RegNum = VM512Regs[regIdx / 2]; + Op.Reg.Reg = VM512Regs[regIdx / 2]; return true; } @@ -696,16 +697,16 @@ class VEOperand : public MCParsedAsmOperand { if (regIdx > 31 || MISCRegs[regIdx] == VE::NoRegister) return false; Op.Kind = k_Register; - Op.Reg.RegNum = MISCRegs[regIdx]; + Op.Reg.Reg = MISCRegs[regIdx]; return true; } static std::unique_ptr<VEOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<VEOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; @@ -715,15 +716,16 @@ class VEOperand : public MCParsedAsmOperand { MorphToMEMzi(std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMrri(MCRegister Base, MCRegister Index, + std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegRegImm; Op->Mem.Base = Base; @@ -734,22 +736,22 @@ class VEOperand : public MCParsedAsmOperand { } static std::unique_ptr<VEOperand> - MorphToMEMrii(unsigned Base, const MCExpr *Index, + MorphToMEMrii(MCRegister Base, const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImmImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMzri(MCRegister Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroRegImm; - 
Op->Mem.Base = 0; + Op->Mem.Base = MCRegister(); Op->Mem.IndexReg = Index; Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; @@ -760,8 +762,8 @@ class VEOperand : public MCParsedAsmOperand { MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroImmImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; @@ -815,14 +817,14 @@ bool VEAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, /// Parses a register name using a given matching function. /// Checks for lowercase or uppercase if necessary. -int VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { +MCRegister VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { StringRef Name = Parser.getTok().getString(); - int RegNum = matchFn(Name); + MCRegister RegNum = matchFn(Name); // GCC supports case insensitive register names. All of the VE registers // are all lower case. - if (RegNum == VE::NoRegister) { + if (!RegNum) { RegNum = matchFn(Name.lower()); } diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..b9ac5d6254362 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -35,7 +35,7 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(const VESubtarget &ST) - : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} + : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } @@ -459,7 +459,6 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -519,10 +518,12 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, report_fatal_error("Can't store this register to stack slot"); } -void VEInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 408d3ab9e05f5..cedf7f21011ff 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -92,13 +92,15 @@ class VEInstrInfo : public VEGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// } Stack Spill & Reload diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt
b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index d8bfed9dc0390..651f631c1ee55 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -317,8 +317,8 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - unsigned WAReg = Op.getReg(); - if (int(WAReg) >= 0) + MCRegister WAReg = Op.getReg(); + if (int(WAReg.id()) >= 0) printRegName(O, WAReg); else if (OpNo >= Desc.getNumDefs() && !IsVariadicDef) O << "$pop" << WebAssembly::getWARegStackId(WAReg); diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index fe9a4bada2430..5dc0e3aa91622 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -608,9 +608,9 @@ inline bool isLocalTee(unsigned Opc) { static const unsigned UnusedReg = -1u; // For a given stackified WAReg, return the id number to print with push/pop. 
-unsigned inline getWARegStackId(unsigned Reg) { - assert(Reg & INT32_MIN); - return Reg & INT32_MAX; +unsigned inline getWARegStackId(MCRegister Reg) { + assert(Reg.id() & INT32_MIN); + return Reg.id() & INT32_MAX; } } // end namespace WebAssembly diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h index 7845cdfaebec7..1bfc61f0ab611 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h @@ -76,7 +76,7 @@ class WebAssemblyException { BlockSet.insert(MBB); } ArrayRef<MachineBasicBlock *> getBlocks() const { return Blocks; } - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -96,7 +96,7 @@ class WebAssemblyException { void addSubException(std::unique_ptr<WebAssemblyException> E) { SubExceptions.push_back(std::move(E)); } - using iterator = typename decltype(SubExceptions)::const_iterator; + using iterator = decltype(SubExceptions)::const_iterator; iterator begin() const { return SubExceptions.begin(); } iterator end() const { return SubExceptions.end(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..9d8e09c09e9ea 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; @@ -988,20 +988,36 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { const auto *Trunc = cast<TruncInst>(I); - Register Reg = getRegForValue(Trunc->getOperand(0)); - if (Reg == 0) + const Value *Op = Trunc->getOperand(0); + MVT::SimpleValueType From = getSimpleType(Op->getType()); + MVT::SimpleValueType To = getLegalType(getSimpleType(Trunc->getType())); + Register In = getRegForValue(Op); + if (In == 0) return false; - unsigned FromBitWidth = Trunc->getOperand(0)->getType()->getIntegerBitWidth(); - unsigned ToBitWidth = Trunc->getType()->getIntegerBitWidth(); + auto Truncate = [&](Register Reg) -> unsigned { + if (From == MVT::i64) { + if (To == MVT::i64) + return copyValue(Reg); + + if (To == MVT::i1 || To == MVT::i8 || To == MVT::i16 || To == MVT::i32) { + Register Result = createResultReg(&WebAssembly::I32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(WebAssembly::I32_WRAP_I64), Result) + .addReg(Reg); + return Result; + } + } - if (ToBitWidth <= 32 && (32 < FromBitWidth && FromBitWidth <= 64)) { - Register Result = createResultReg(&WebAssembly::I32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, - TII.get(WebAssembly::I32_WRAP_I64), Result) - .addReg(Reg); - Reg = Result; - } + if (From == MVT::i32) + return copyValue(Reg); + + return 0; + }; + + unsigned Reg = Truncate(In); + if (Reg == 0) + return false; updateValueMap(Trunc, Reg); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,21 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); Type *RtnType = Ty->getReturnType(); // Determine what value to return.
if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. 
-HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. -HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..fc6c2903471a8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa // into conversion ops setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::FP_ROUND, ISD::CONCAT_VECTORS}); + ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND, + ISD::CONCAT_VECTORS}); setTargetDAGCombine(ISD::TRUNCATE); @@ -942,20 +943,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1817,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) @@ -3597,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N, } } +static SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems, + SelectionDAG &DAG) { + SDLoc DL(In); + LLVMContext &Ctx = *DAG.getContext(); + EVT InVT = In.getValueType(); + unsigned NumElems = InVT.getVectorNumElements() * 2; + EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems); + SDValue Concat =
DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT)); + if (NumElems < RequiredNumElems) { + return DoubleVectorWidth(Concat, RequiredNumElems, DAG); + } + return Concat; +} + +static SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutElTy = OutVT.getVectorElementType(); + if (OutElTy != MVT::i8 && OutElTy != MVT::i16) + return SDValue(); + + unsigned NumElems = OutVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + + EVT FPVT = N->getOperand(0)->getValueType(0); + if (FPVT.getVectorElementType() != MVT::f32) + return SDValue(); + + SDLoc DL(N); + + // First, convert to i32. + LLVMContext &Ctx = *DAG.getContext(); + EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems); + SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0)); + APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + // Mask out the high bits. + SDValue Masked = + DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT)); + + if (OutVT.getSizeInBits() < 128) { + // Create a wide enough vector that we can use narrow. + EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16; + unsigned NumRequiredElems = NarrowedVT.getVectorNumElements(); + SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG); + SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG); + return DAG.getBitcast( + OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits())); + } else { + return truncateVectorWithNARROW(OutVT, Masked, DL, DAG); + } +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3623,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performConvertFPCombine(N, DCI.DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 343d90e88950f..8b4e4fbbbd1e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc"
WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index fc82e5b4a61da..304c4f3fcb028 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -41,6 +41,11 @@ defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref), "ref.test\t$type, $ref", "ref.test $type", 0xfb14>, Requires<[HasGC]>; +defm REF_FUNC : I<(outs FUNCREF:$res), (ins function32_op:$func), + (outs), (ins function32_op:$func), [], + "ref.func\t$func", "ref.func $func", 0xd2>, + Requires<[HasReferenceTypes]>; + defm "" : REF_I<FUNCREF, funcref, "func">; defm "" : REF_I<EXTERNREF, externref, "extern">; defm "" : REF_I<EXNREF, exnref, "exn">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index ff4d64693284a..0e913fb1ee669 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -207,13 +207,12 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> { template <> struct CustomMappingTraits<BBNumberMap> { static void inputOne(IO &YamlIO, StringRef Key, BBNumberMap &SrcToUnwindDest) { - YamlIO.mapRequired(Key.str().c_str(), - SrcToUnwindDest[std::atoi(Key.str().c_str())]); + YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]); } static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) { - for (auto KV : SrcToUnwindDest) - YamlIO.mapRequired(std::to_string(KV.first).c_str(), KV.second); + for (auto [Src, Dest] : SrcToUnwindDest) + YamlIO.mapRequired(std::to_string(Src), Dest); } }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 45b0e7dc12263..f3c236ca8c9ce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -532,13 +532,19 @@ struct StaticLibcallNameMap { // FIXME: This is broken if there are ever different triples compiled with // different libcalls. 
RTLIB::RuntimeLibcallsInfo RTCI(TT); - for (RTLIB::Libcall LC : RTLIB::libcalls()) { - StringRef NameLibcall = RTCI.getLibcallName(LC); - if (!NameLibcall.empty() && - getRuntimeLibcallSignatures().Table[LC] != unsupported) { - assert(!Map.contains(NameLibcall) && - "duplicate libcall names in name map"); - Map[NameLibcall] = LC; + + ArrayRef<RuntimeLibcallSignature> Table = + getRuntimeLibcallSignatures().Table; + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTCI.isAvailable(Impl)) + continue; + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + if (Table[LC] != unsupported) { + StringRef NameLibcall = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl); + // FIXME: Map should be to LibcallImpl + if (!Map.insert({NameLibcall, LC}).second) + llvm_unreachable("duplicate libcall names in name map"); } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { +const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h index 
e92bf17641854..96b8a4e33cbb7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h @@ -35,7 +35,7 @@ class SortRegion { virtual MachineBasicBlock *getHeader() const = 0; virtual bool contains(const MachineBasicBlock *MBB) const = 0; virtual unsigned getNumBlocks() const = 0; - using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + using block_iterator = ArrayRef<MachineBasicBlock *>::const_iterator; virtual iterator_range<block_iterator> blocks() const = 0; virtual bool isLoop() const = 0; }; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 92a9812df2127..70f7b889551a4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -119,18 +119,82 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost( } } - // extend_low static constexpr TypeConversionCostTblEntry ConversionTbl[] = { + // extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1}, {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1}, + // 2 x extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2}, + // extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + // 2x extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + // shuffle + {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2}, + {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4}, + {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2}, + {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4}, + // narrow, and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2}, + {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2}, + // narrow, 2x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3}, + // 3x narrow, 4x and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7}, + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7}, + // 7x narrow, 8x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15}, + // convert_i32x4 + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + // extend_low, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + // extend_low x 2, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + // several shuffles + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, +
// trunc_sat, const, and, 3x narrow + {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6}, + {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6}, + // trunc_sat, const, and, narrow + {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4}, + {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4}, + // 2x trunc_sat, const, 2x and, 3x narrow + {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8}, + {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8}, + // 2x trunc_sat, const, 2x and, narrow + {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6}, + {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6}, }; if (const auto *Entry = diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2573066cd5d63..4146c0ec6ab07 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "WebAssemblyTargetMachine.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include <algorithm> namespace llvm { diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 127ee67517aea..bac3692aebf83 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1121,7 +1121,7 @@ class X86AsmParser : public MCTargetAsmParser { void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; - bool Error(SMLoc L, const Twine &Msg, SMRange Range = std::nullopt, + bool Error(SMLoc L, const Twine &Msg, SMRange Range = {}, bool MatchingInlineAsm = false) { MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) { @@ -2470,10 +2470,10 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, // Report back its kind, or IOK_INVALID if does not evaluated as a known one unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { return StringSwitch<unsigned>(Name) - .Cases("TYPE","type",IOK_TYPE) - .Cases("SIZE","size",IOK_SIZE) - .Cases("LENGTH","length",IOK_LENGTH) - .Default(IOK_INVALID); + .Cases({"TYPE", "type"}, IOK_TYPE) + .Cases({"SIZE", "size"}, IOK_SIZE) + .Cases({"LENGTH", "length"}, IOK_LENGTH) + .Default(IOK_INVALID); } /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.
The LENGTH operator @@ -2516,8 +2516,8 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) .Case("type", MOK_TYPE) - .Cases("size", "sizeof", MOK_SIZEOF) - .Cases("length", "lengthof", MOK_LENGTHOF) + .Cases({"size", "sizeof"}, MOK_SIZEOF) + .Cases({"length", "lengthof"}, MOK_LENGTHOF) .Default(MOK_INVALID); } @@ -2581,21 +2581,21 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) - .Cases("BYTE", "byte", 8) - .Cases("WORD", "word", 16) - .Cases("DWORD", "dword", 32) - .Cases("FLOAT", "float", 32) - .Cases("LONG", "long", 32) - .Cases("FWORD", "fword", 48) - .Cases("DOUBLE", "double", 64) - .Cases("QWORD", "qword", 64) - .Cases("MMWORD","mmword", 64) - .Cases("XWORD", "xword", 80) - .Cases("TBYTE", "tbyte", 80) - .Cases("XMMWORD", "xmmword", 128) - .Cases("YMMWORD", "ymmword", 256) - .Cases("ZMMWORD", "zmmword", 512) - .Default(0); + .Cases({"BYTE", "byte"}, 8) + .Cases({"WORD", "word"}, 16) + .Cases({"DWORD", "dword"}, 32) + .Cases({"FLOAT", "float"}, 32) + .Cases({"LONG", "long"}, 32) + .Cases({"FWORD", "fword"}, 48) + .Cases({"DOUBLE", "double"}, 64) + .Cases({"QWORD", "qword"}, 64) + .Cases({"MMWORD", "mmword"}, 64) + .Cases({"XWORD", "xword"}, 80) + .Cases({"TBYTE", "tbyte"}, 80) + .Cases({"XMMWORD", "xmmword"}, 128) + .Cases({"YMMWORD", "ymmword"}, 256) + .Cases({"ZMMWORD", "zmmword"}, 512) + .Default(0); if (Size) { if (SizeStr) *SizeStr = getTok().getString(); @@ -2886,22 +2886,22 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { // otherwise the EFLAGS Condition Code enumerator. 
X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { return StringSwitch<X86::CondCode>(CC) - .Case("o", X86::COND_O) // Overflow - .Case("no", X86::COND_NO) // No Overflow - .Cases("b", "nae", X86::COND_B) // Below/Neither Above nor Equal - .Cases("ae", "nb", X86::COND_AE) // Above or Equal/Not Below - .Cases("e", "z", X86::COND_E) // Equal/Zero - .Cases("ne", "nz", X86::COND_NE) // Not Equal/Not Zero - .Cases("be", "na", X86::COND_BE) // Below or Equal/Not Above - .Cases("a", "nbe", X86::COND_A) // Above/Neither Below nor Equal - .Case("s", X86::COND_S) // Sign - .Case("ns", X86::COND_NS) // No Sign - .Cases("p", "pe", X86::COND_P) // Parity/Parity Even - .Cases("np", "po", X86::COND_NP) // No Parity/Parity Odd - .Cases("l", "nge", X86::COND_L) // Less/Neither Greater nor Equal - .Cases("ge", "nl", X86::COND_GE) // Greater or Equal/Not Less - .Cases("le", "ng", X86::COND_LE) // Less or Equal/Not Greater - .Cases("g", "nle", X86::COND_G) // Greater/Neither Less nor Equal + .Case("o", X86::COND_O) // Overflow + .Case("no", X86::COND_NO) // No Overflow + .Cases({"b", "nae"}, X86::COND_B) // Below/Neither Above nor Equal + .Cases({"ae", "nb"}, X86::COND_AE) // Above or Equal/Not Below + .Cases({"e", "z"}, X86::COND_E) // Equal/Zero + .Cases({"ne", "nz"}, X86::COND_NE) // Not Equal/Not Zero + .Cases({"be", "na"}, X86::COND_BE) // Below or Equal/Not Above + .Cases({"a", "nbe"}, X86::COND_A) // Above/Neither Below nor Equal + .Case("s", X86::COND_S) // Sign + .Case("ns", X86::COND_NS) // No Sign + .Cases({"p", "pe"}, X86::COND_P) // Parity/Parity Even + .Cases({"np", "po"}, X86::COND_NP) // No Parity/Parity Odd + .Cases({"l", "nge"}, X86::COND_L) // Less/Neither Greater nor Equal + .Cases({"ge", "nl"}, X86::COND_GE) // Greater or Equal/Not Less + .Cases({"le", "ng"}, X86::COND_LE) // Less or Equal/Not Greater + .Cases({"g", "nle"}, X86::COND_G) // Greater/Neither Less nor Equal .Default(X86::COND_INVALID); } @@ -4322,7 +4322,7 @@ bool X86AsmParser::matchAndEmitATTInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode // when matching the instruction. if (ForcedDataPrefix == X86::Is32Bit) @@ -4548,7 +4548,7 @@ bool X86AsmParser::matchAndEmitIntelInstruction( SMLoc IDLoc, unsigned &Opcode, MCInst &Inst, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - SMRange EmptyRange = std::nullopt; + SMRange EmptyRange; // Find one unsized memory operand, if present. 
X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 89ac53e0ecac9..a92272573bacd 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -620,37 +620,6 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } - bool isTILEPair() const { - return Kind == Register && - X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); - } - - void addTILEPairOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - MCRegister Reg = getReg(); - switch (Reg.id()) { - default: - llvm_unreachable("Invalid tile register!"); - case X86::TMM0: - case X86::TMM1: - Reg = X86::TMM0_TMM1; - break; - case X86::TMM2: - case X86::TMM3: - Reg = X86::TMM2_TMM3; - break; - case X86::TMM4: - case X86::TMM5: - Reg = X86::TMM4_TMM5; - break; - case X86::TMM6: - case X86::TMM7: - Reg = X86::TMM6_TMM7; - break; - } - Inst.addOperand(MCOperand::createReg(Reg)); - } - void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 4927b453458ef..7d2b5eb900133 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -810,10 +810,6 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ - case TYPE_TMM_PAIR: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2323,7 +2319,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: - case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index dc9af2caa77b1..b0aa70be12d83 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,12 +535,6 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) -#define REGS_TMM_PAIRS \ - ENTRY(TMM0_TMM1) \ - ENTRY(TMM2_TMM3) \ - ENTRY(TMM4_TMM5) \ - ENTRY(TMM6_TMM7) - #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -565,7 +559,6 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ - REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 74de51c7eb1cc..e67b138afafec 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -1391,7 +1391,7 @@ class DarwinX86AsmBackend : public X86AsmBackend { return CU::UNWIND_MODE_DWARF; MCRegister Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); - SavedRegs[SavedRegIdx++] = Reg; + SavedRegs[SavedRegIdx++] = Reg.id(); StackAdjust += OffsetSize; MinAbsOffset = std::min(MinAbsOffset, std::abs(Inst.getOffset())); InstrOffset += PushInstrSize(Reg); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 
1c5f1663d4f52..88dd5431f586b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -451,7 +451,7 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, // the assembly would look something like: // "vp2intersect %zmm5, %zmm7, {%k2, %k3}" // but this can work too. - switch (MI->getOperand(OpNo).getReg()) { + switch (MI->getOperand(OpNo).getReg().id()) { case X86::K0_K1: printRegName(OS, X86::K0); return; @@ -467,22 +467,3 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask pair register name"); } - -void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, - raw_ostream &OS) { - switch (MI->getOperand(OpNo).getReg()) { - case X86::TMM0_TMM1: - printRegName(OS, X86::TMM0); - return; - case X86::TMM2_TMM3: - printRegName(OS, X86::TMM2); - return; - case X86::TMM4_TMM5: - printRegName(OS, X86::TMM4); - return; - case X86::TMM6_TMM7: - printRegName(OS, X86::TMM6); - return; - } - llvm_unreachable("Unknown mask pair register name"); -} diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 2c9467ca7c615..cb55f2f0019b5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -40,7 +40,6 @@ class X86InstPrinterCommon : public MCInstPrinter { const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index af5a69899844c..0c874b7e6d674 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -535,7 +535,7 @@ bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID); const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID); - auto ClearsSuperReg = [=](unsigned RegID) { + auto ClearsSuperReg = [=](MCRegister RegID) { // On X86-64, a general purpose integer register is viewed as a 64-bit // register internal to the processor. // An update to the lower 32 bits of a 64 bit integer register is diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index 9c442319c220f..b722964a571b3 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -55,6 +55,7 @@ struct FPOInstruction { StackAlign, SetFrame, } Op; + // FIXME: This should be a union of MCRegister and unsigned. 
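// Reviewer note: the Reg.id() changes above and below (X86AsmBackend,
// printVKPair, the FPO target streamer) all make a previously implicit
// MCRegister -> unsigned conversion explicit wherever a raw register id is
// stored or switched over. Illustrative shape, mirroring those hunks:
//
//   MCRegister Reg = ...;
//   unsigned RawId = Reg.id();                    // was: unsigned RawId = Reg;
//   switch (MI->getOperand(OpNo).getReg().id()) { // switch on the raw id
//     ...
//   }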
unsigned RegOrOffset; }; @@ -215,7 +216,7 @@ bool X86WinCOFFTargetStreamer::emitFPOSetFrame(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::SetFrame; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } @@ -226,7 +227,7 @@ bool X86WinCOFFTargetStreamer::emitFPOPushReg(MCRegister Reg, SMLoc L) { FPOInstruction Inst; Inst.Label = emitFPOLabel(); Inst.Op = FPOInstruction::PushReg; - Inst.RegOrOffset = Reg; + Inst.RegOrOffset = Reg.id(); CurFPOData->Instructions.push_back(Inst); return false; } diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..200ca80adb232 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86_H #define LLVM_LIB_TARGET_X86_X86_H +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CodeGen.h" @@ -83,7 +84,14 @@ FunctionPass *createX86AvoidStoreForwardingBlocks(); FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands DynAlloca pseudo-instructions. -FunctionPass *createX86DynAllocaExpander(); +class X86DynAllocaExpanderPass + : public PassInfoMixin<X86DynAllocaExpanderPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +FunctionPass *createX86DynAllocaExpanderLegacyPass(); /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); @@ -104,7 +112,15 @@ FunctionPass *createX86LowerTileCopyPass(); /// CALL instruction. The pass does the same for each funclet as well. This /// ensures that the open interval of function start and end PCs contains all /// return addresses for the benefit of the Windows x64 unwinder. -FunctionPass *createX86AvoidTrailingCallPass(); +class X86AvoidTrailingCallPass + : public PassInfoMixin<X86AvoidTrailingCallPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86AvoidTrailingCallLegacyPass(); /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. @@ -158,7 +174,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +204,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. 
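// Reviewer note: the X86.h declarations above follow the common new-PM port
// pattern: the legacy pipeline keeps a create*LegacyPass() factory while a
// PassInfoMixin twin exposes run() for the new pass manager, with both
// flavours sharing one implementation. Minimal sketch (X86ExamplePass and
// runImpl are illustrative names, not part of this patch):
//
//   class X86ExamplePass : public PassInfoMixin<X86ExamplePass> {
//   public:
//     PreservedAnalyses run(MachineFunction &MF,
//                           MachineFunctionAnalysisManager &MFAM) {
//       if (!runImpl(MF)) // shared worker used by both pass flavours
//         return PreservedAnalyses::all();
//       PreservedAnalyses PA = PreservedAnalyses::none();
//       PA.preserveSet<CFGAnalyses>(); // these rewrites leave the CFG intact
//       return PA;
//     }
//   };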
-FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -202,12 +238,12 @@ void initializeX86FixupInstTuningPassPass(PassRegistry &); void initializeX86FixupVectorConstantsPassPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); -void initializeX86AvoidTrailingCallPassPass(PassRegistry &); +void initializeX86AvoidTrailingCallLegacyPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86DAGToDAGISelLegacyPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); -void initializeX86DynAllocaExpanderPass(PassRegistry &); +void initializeX86DynAllocaExpanderLegacyPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FastPreTileConfigPass(PassRegistry &); @@ -220,7 +256,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a1fd366e59444..9e291a6ae431f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -274,9 +274,6 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", "Support AMX-MOVRS instructions", [FeatureAMXTILE]>; -def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", - "Support AMX amx-transpose instructions", - [FeatureAMXTILE]>; def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", "HasAMXAVX512", "true", "Support AMX-AVX512 instructions", @@ -1177,8 +1174,7 @@ def ProcessorFeatures { FeatureAMXMOVRS, FeatureAMXAVX512, FeatureAMXFP8, - FeatureAMXTF32, - FeatureAMXTRANSPOSE]; + FeatureAMXTF32]; list<SubtargetFeature> DMRFeatures = !listconcat(GNRDFeatures, DMRAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index d2e35277419f7..9473e8db3af93 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - Register Reg1 = MRI->createVirtualRegister( - TII->getRegClass(TII->get(NLoadOpcode), 0, TRI)); + Register Reg1 = + 
MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0)); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1) @@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI); + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0); return TRI->getRegSizeInBits(*TRC) / 8; } diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index 2ecf49382d29f..ebd4284f0f37d 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -37,6 +37,8 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" #define AVOIDCALL_DESC "X86 avoid trailing call pass" #define AVOIDCALL_NAME "x86-avoid-trailing-call" @@ -46,9 +48,9 @@ using namespace llvm; namespace { -class X86AvoidTrailingCallPass : public MachineFunctionPass { +class X86AvoidTrailingCallLegacyPass : public MachineFunctionPass { public: - X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + X86AvoidTrailingCallLegacyPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -59,13 +61,14 @@ class X86AvoidTrailingCallPass : public MachineFunctionPass { }; } // end anonymous namespace -char X86AvoidTrailingCallPass::ID = 0; +char X86AvoidTrailingCallLegacyPass::ID = 0; -FunctionPass *llvm::createX86AvoidTrailingCallPass() { - return new X86AvoidTrailingCallPass(); +FunctionPass *llvm::createX86AvoidTrailingCallLegacyPass() { + return new X86AvoidTrailingCallLegacyPass(); } -INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false) +INITIALIZE_PASS(X86AvoidTrailingCallLegacyPass, AVOIDCALL_NAME, AVOIDCALL_DESC, + false, false) // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. 
This logic conservatively assumes @@ -79,7 +82,7 @@ static bool isCallInstruction(const MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); } -bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { +bool UpdatedOnX86AvoidTrailingCallPass(MachineFunction &MF) { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const X86InstrInfo &TII = *STI.getInstrInfo(); assert(STI.isTargetWin64() && "pass only runs on Win64"); @@ -134,3 +137,19 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool X86AvoidTrailingCallLegacyPass::runOnMachineFunction(MachineFunction &MF) { + return UpdatedOnX86AvoidTrailingCallPass(MF); +} + +PreservedAnalyses +X86AvoidTrailingCallPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = UpdatedOnX86AvoidTrailingCallPass(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5adf06ef..ddbd10d8f7eda 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 5d190114615de..2047a53199dd6 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -174,8 +174,8 @@ class InstrReplacerDstCOPY : public InstrConverterBase { MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); - Register Reg = MRI->createVirtualRegister( - TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo())); + Register Reg = + MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0)); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) Bld.add(MO); diff --git a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index c2a06efd4d46e..10f46f71bbbbd 100644 --- a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -20,22 +20,22 @@ #include "X86Subtarget.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionAnalysisManager.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Function.h" using namespace llvm; namespace { -class X86DynAllocaExpander : public MachineFunctionPass { +class X86DynAllocaExpander { public: - X86DynAllocaExpander() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; + bool run(MachineFunction &MF); private: /// Strategies for lowering a 
DynAlloca. @@ -61,22 +61,30 @@ class X86DynAllocaExpander : public MachineFunctionPass { unsigned SlotSize = 0; int64_t StackProbeSize = 0; bool NoStackArgProbe = false; +}; + +class X86DynAllocaExpanderLegacy : public MachineFunctionPass { +public: + X86DynAllocaExpanderLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; +private: StringRef getPassName() const override { return "X86 DynAlloca Expander"; } public: static char ID; }; -char X86DynAllocaExpander::ID = 0; +char X86DynAllocaExpanderLegacy::ID = 0; } // end anonymous namespace -INITIALIZE_PASS(X86DynAllocaExpander, "x86-dyn-alloca-expander", +INITIALIZE_PASS(X86DynAllocaExpanderLegacy, "x86-dyn-alloca-expander", "X86 DynAlloca Expander", false, false) -FunctionPass *llvm::createX86DynAllocaExpander() { - return new X86DynAllocaExpander(); +FunctionPass *llvm::createX86DynAllocaExpanderLegacyPass() { + return new X86DynAllocaExpanderLegacy(); } /// Return the allocation amount for a DynAlloca instruction, or -1 if unknown. @@ -277,7 +285,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { AmountDef->eraseFromParent(); } -bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { +bool X86DynAllocaExpander::run(MachineFunction &MF) { if (!MF.getInfo<X86MachineFunctionInfo>()->hasDynAlloca()) return false; @@ -299,3 +307,19 @@ bool X86DynAllocaExpander::runOnMachineFunction(MachineFunction &MF) { return true; } + +bool X86DynAllocaExpanderLegacy::runOnMachineFunction(MachineFunction &MF) { + return X86DynAllocaExpander().run(MF); +} + +PreservedAnalyses +X86DynAllocaExpanderPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + bool Changed = X86DynAllocaExpander().run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 4a9b824b0db14..e3c44c048f7bf 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -649,149 +649,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } - // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding - // AMX instruction to support it. So, split it to 2 load instructions: - // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> - // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + - // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" - case X86::PTILEPAIRLOAD: { - int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(0).getReg(); - bool DstIsDead = MBBI->getOperand(0).isDead(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) - .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(1 + i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(1 + i)); - } - - // Make sure the first stride reg used in first tileload is alive. 
- MachineOperand &Stride = - MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. - MBB.erase(MBBI); - return true; - } - // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no - // corresponding AMX instruction to support it. So, split it too: - // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> - // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + - // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" - case X86::PTILEPAIRSTORE: { - int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); - Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); - bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); - Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); - Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); - unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; - - MachineInstrBuilder MIBLo = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - MachineInstrBuilder MIBHi = - BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); - - for (int i = 0; i < X86::AddrNumOperands; ++i) { - MIBLo.add(MBBI->getOperand(i)); - if (i == X86::AddrDisp) - MIBHi.addImm(Disp + TmmSize); - else - MIBHi.add(MBBI->getOperand(i)); - } - MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); - MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); - - // Make sure the first stride reg used in first tilestore is alive. - MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); - Stride.setIsKill(false); - - // Split the memory operand, adjusting the offset and size for the halves. - MachineMemOperand *OldMMO = MBBI->memoperands().front(); - MachineFunction *MF = MBB.getParent(); - MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); - MachineMemOperand *MMOHi = - MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); - - MIBLo.setMemRefs(MMOLo); - MIBHi.setMemRefs(MMOHi); - - // Delete the pseudo. 
- MBB.erase(MBBI); - return true; - } - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: { - for (unsigned i = 3; i > 0; --i) - MI.removeOperand(i); - unsigned Opc; - switch (Opcode) { - case X86::PT2RPNTLVWZ0V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RSV: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1V: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - default: - llvm_unreachable("Impossible Opcode!"); - } - MI.setDesc(TII->get(Opc)); - return true; - } - case X86::PTTRANSPOSEDV: - case X86::PTCONJTFP16V: { - for (int i = 2; i > 0; --i) - MI.removeOperand(i); - MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? X86::TTRANSPOSED - : X86::TCONJTFP16)); - return true; - } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: @@ -800,13 +657,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: - case X86::PTTDPBF16PSV: - case X86::PTTDPFP16PSV: - case X86::PTTCMMIMFP16PSV: - case X86::PTTCMMRLFP16PSV: - case X86::PTCONJTCMMIMFP16PSV: case X86::PTMMULTF32PSV: - case X86::PTTMMULTF32PSV: case X86::PTDPBF8PSV: case X86::PTDPBHF8PSV: case X86::PTDPHBF8PSV: @@ -816,6 +667,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + // clang-format off case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; @@ -824,40 +676,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; - case X86::PTTDPBF16PSV: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PSV: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PSV: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PSV: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PSV: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PSV: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PSV: - Opc = X86::TTMMULTF32PS; - break; - case X86::PTDPBF8PSV: - Opc = X86::TDPBF8PS; - break; - case X86::PTDPBHF8PSV: - Opc = X86::TDPBHF8PS; - break; - case X86::PTDPHBF8PSV: - Opc = X86::TDPHBF8PS; - break; - case X86::PTDPHF8PSV: - Opc = X86::TDPHF8PS; - break; - + case X86::PTMMULTF32PSV: Opc = X86::TMMULTF32PS; break; + case X86::PTDPBF8PSV: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PSV: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PSV: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PSV: Opc = X86::TDPHF8PS; break; + // clang-format on default: llvm_unreachable("Unexpected Opcode"); } diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 
787b71d425cb3..25799f4ac0ea0 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); // Don't need shape information for tile store, becasue it is adjacent to // the tile def instruction. - TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI, - Register()); + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register()); ++NumStores; // TODO: update DBG_VALUEs @@ -267,24 +266,16 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } -static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { - if (Reg.isVirtual()) { - unsigned RegClassID = MRI->getRegClass(Reg)->getID(); - if (RegClassID == X86::TILERegClassID) - return 1; - if (RegClassID == X86::TILEPAIRRegClassID) - return 2; - } else { - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; +static bool isTileRegister(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual() && + (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)) { + return true; } - return 0; -} -static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { - return getTileDefNum(MRI, VirtReg) > 0; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + + return false; } static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { @@ -296,7 +287,7 @@ static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { if (!MO.isReg()) return false; - return getTileDefNum(MRI, MO.getReg()) > 0; + return isTileRegister(MRI, MO.getReg()); } static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -636,19 +627,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } - unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); - if (TileDefNum > 1) { - for (unsigned I = 1; I < TileDefNum; I++) { - MachineOperand *ColxMO = &MI.getOperand(2 + I); - MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg()); - if (ColxMI->getParent() == &MBB) { - if (!LastShapeMI) - LastShapeMI = ColxMI; - else if (dominates(MBB, LastShapeMI, ColxMI)) - LastShapeMI = ColxMI; - } - } - } + // If there is user live out of the tilecfg, spill it and reload in // before the user. Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 11d331b11737f..d86ae36aa2a67 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -77,14 +77,14 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. 
if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return 0; + return false; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { @@ -93,24 +93,18 @@ static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // register is not rewritten yet. if (Reg.isVirtual()) { if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return 1; - if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) - return 2; + return true; } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return 1; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return 2; + return true; } - return 0; + return false; } static unsigned getTMMIndex(Register Reg) { if (Reg >= X86::TMM0 && Reg <= X86::TMM7) return Reg - X86::TMM0; - if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) - return (Reg - X86::TMM0_TMM1) * 2; llvm_unreachable("Invalid Tmm Reg!"); } @@ -120,17 +114,14 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - unsigned DefNum = getNumDefTiles(MRI, MI); - if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); - for (unsigned I = 0; I < DefNum; I++) { - MachineOperand &Col = MI.getOperand(2 + I); - ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); - } + MachineOperand &Col = MI.getOperand(2); + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index ab6e6d0687b71..b3bf37a9a462c 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -50,7 +50,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <iterator> #include <utility> diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a66a3213403b4..8bca6344d6521 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameSetup); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(), + MachineInstr::FrameSetup); } return true; @@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register()); } // Clear the stack slot for spill base pointer register. 
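Reviewer note on the spill/reload hunks above: TargetInstrInfo::storeRegToStackSlot and loadRegFromStackSlot no longer take the TargetRegisterInfo pointer, so callers simply drop the TRI argument; the remaining parameters are unchanged. Updated call shape, as in the X86FrameLowering hunks:

  TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, I.getFrameIdx(), RC,
                          Register(), MachineInstr::FrameSetup);
  TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register());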
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4393f6ecaa033..6c16fcfb282e8 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -337,23 +337,8 @@ namespace { // lowering but before ISEL. bool isAMXSDNode(SDNode *N) const { // Check if N is AMX SDNode: - // 1. check specific opcode since these carry MVT::Untyped instead of - // x86amx_type; - // 2. check result type; - // 3. check operand type; - switch (N->getOpcode()) { - default: - break; - case X86::PT2RPNTLVWZ0V: - case X86::PT2RPNTLVWZ0T1V: - case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: - case X86::PT2RPNTLVWZ0RSV: - case X86::PT2RPNTLVWZ0RST1V: - case X86::PT2RPNTLVWZ1RSV: - case X86::PT2RPNTLVWZ1RST1V: - return true; - } + // 1. check result type; + // 2. check operand type; for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { if (N->getValueType(Idx) == MVT::x86amx) return true; @@ -4743,9 +4728,9 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { - SDValue InnerOp = Op->getOperand(0); + SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0)); - if (!getFoldableLogicOp(InnerOp)) + if (!InnerOp) return SDValue(); N0 = InnerOp.getOperand(0); @@ -5398,65 +5383,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } - case Intrinsic::x86_t2rpntlvwz0rs: - case Intrinsic::x86_t2rpntlvwz0rst1: - case Intrinsic::x86_t2rpntlvwz1rs: - case Intrinsic::x86_t2rpntlvwz1rst1: - if (!Subtarget->hasAMXMOVRS()) - break; - [[fallthrough]]; - case Intrinsic::x86_t2rpntlvwz0: - case Intrinsic::x86_t2rpntlvwz0t1: - case Intrinsic::x86_t2rpntlvwz1: - case Intrinsic::x86_t2rpntlvwz1t1: { - if (!Subtarget->hasAMXTRANSPOSE()) - break; - auto *MFI = - CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); - MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); - unsigned Opc; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0: - Opc = X86::PT2RPNTLVWZ0; - break; - case Intrinsic::x86_t2rpntlvwz0t1: - Opc = X86::PT2RPNTLVWZ0T1; - break; - case Intrinsic::x86_t2rpntlvwz1: - Opc = X86::PT2RPNTLVWZ1; - break; - case Intrinsic::x86_t2rpntlvwz1t1: - Opc = X86::PT2RPNTLVWZ1T1; - break; - case Intrinsic::x86_t2rpntlvwz0rs: - Opc = X86::PT2RPNTLVWZ0RS; - break; - case Intrinsic::x86_t2rpntlvwz0rst1: - Opc = X86::PT2RPNTLVWZ0RST1; - break; - case Intrinsic::x86_t2rpntlvwz1rs: - Opc = X86::PT2RPNTLVWZ1RS; - break; - case Intrinsic::x86_t2rpntlvwz1rst1: - Opc = X86::PT2RPNTLVWZ1RST1; - break; - } - // FIXME: Match displacement and scale. 
- unsigned TIndex = Node->getConstantOperandVal(2); - SDValue TReg = getI8Imm(TIndex, dl); - SDValue Base = Node->getOperand(3); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(4); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 49beadae63f03..6483e07afadee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); + setOperationAction(ISD::FSINCOSPI, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -2572,8 +2573,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -2653,6 +2654,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::AVGCEILU, ISD::AVGFLOORS, ISD::AVGFLOORU, + ISD::CTLZ, + ISD::CTTZ, + ISD::CTLZ_ZERO_UNDEF, + ISD::CTTZ_ZERO_UNDEF, ISD::BITREVERSE, ISD::ADD, ISD::FADD, @@ -3454,6 +3459,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; + // If we have a large vector type (even if illegal), don't bitcast to large + // (illegal) scalar types. Better to load fewer vectors and extract. + if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() && + BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0) + return false; + return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } @@ -22861,6 +22872,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); + // Don't do this if we're not supposed to use the FPU. + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (Subtarget.useSoftFloat() || NoImplicitFloatOps) + return SDValue(); + // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may @@ -22883,13 +22901,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. 
- bool NoImplicitFloatOps = - DAG.getMachineFunction().getFunction().hasFnAttribute( - Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX()) || - (OpSize == 512 && Subtarget.useAVX512Regs()))) { + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX()) || + (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened @@ -27946,67 +27960,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } - case Intrinsic::x86_t2rpntlvwz0rs_internal: - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - case Intrinsic::x86_t2rpntlvwz1rs_internal: - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: { - auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); - X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); - unsigned IntNo = Op.getConstantOperandVal(1); - unsigned Opc = 0; - switch (IntNo) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_t2rpntlvwz0_internal: - Opc = X86::PT2RPNTLVWZ0V; - break; - case Intrinsic::x86_t2rpntlvwz0t1_internal: - Opc = X86::PT2RPNTLVWZ0T1V; - break; - case Intrinsic::x86_t2rpntlvwz1_internal: - Opc = X86::PT2RPNTLVWZ1V; - break; - case Intrinsic::x86_t2rpntlvwz1t1_internal: - Opc = X86::PT2RPNTLVWZ1T1V; - break; - case Intrinsic::x86_t2rpntlvwz0rs_internal: - Opc = X86::PT2RPNTLVWZ0RSV; - break; - case Intrinsic::x86_t2rpntlvwz0rst1_internal: - Opc = X86::PT2RPNTLVWZ0RST1V; - break; - case Intrinsic::x86_t2rpntlvwz1rs_internal: - Opc = X86::PT2RPNTLVWZ1RSV; - break; - case Intrinsic::x86_t2rpntlvwz1rst1_internal: - Opc = X86::PT2RPNTLVWZ1RST1V; - break; - } - - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); - - SDValue Ops[] = {Op.getOperand(2), // Row - Op.getOperand(3), // Col0 - Op.getOperand(4), // Col1 - Op.getOperand(5), // Base - DAG.getTargetConstant(1, DL, MVT::i8), // Scale - Op.getOperand(6), // Index - DAG.getTargetConstant(0, DL, MVT::i32), // Disp - DAG.getRegister(0, MVT::i16), // Segment - Op.getOperand(0)}; // Chain - - MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); - SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, - SDValue(Res, 0)); - SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, - SDValue(Res, 0)); - return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); - } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -30966,6 +30919,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. 
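// Reviewer note on the v64i8 shift lowering added below: after the bitcast
// to v32i16, 16-bit lane i holds byte 2*i in its low half and byte 2*i+1 in
// its high half, so AmtLo/AmtHi isolate the even/odd byte shift amounts.
// The final vselect uses mask 0x5555555555555555 (every even bit set), which
// takes even byte lanes from ShiftedLo and odd byte lanes from ShiftedHi,
// restoring the original byte order without a shuffle.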
+ MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33062,60 +33072,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? 
(Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33720,7 +33676,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -37745,10 +37700,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; - auto TMMImmToTMMPair = [](unsigned Imm) { - assert(Imm < 8 && "Illegal tmm pair index."); - return X86::TMM0_TMM1 + Imm / 2; - }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); @@ -38129,53 +38080,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBHF8PS: case X86::PTDPHBF8PS: case X86::PTDPHF8PS: - case X86::PTTDPBF16PS: - case X86::PTTDPFP16PS: - case X86::PTTCMMIMFP16PS: - case X86::PTTCMMRLFP16PS: - case X86::PTCONJTCMMIMFP16PS: - case X86::PTMMULTF32PS: - case X86::PTTMMULTF32PS: { + case X86::PTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); + // clang-format off case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; - case X86::PTCMMIMFP16PS: - Opc = X86::TCMMIMFP16PS; - break; - case X86::PTCMMRLFP16PS: - Opc = X86::TCMMRLFP16PS; - break; + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; - case X86::PTTDPBF16PS: - Opc = X86::TTDPBF16PS; - break; - case X86::PTTDPFP16PS: - Opc = X86::TTDPFP16PS; - break; - case X86::PTTCMMIMFP16PS: - Opc = X86::TTCMMIMFP16PS; - break; - case X86::PTTCMMRLFP16PS: - Opc = X86::TTCMMRLFP16PS; - break; - case X86::PTCONJTCMMIMFP16PS: - Opc = X86::TCONJTCMMIMFP16PS; - break; - case X86::PTMMULTF32PS: - Opc = X86::TMMULTF32PS; - break; - case X86::PTTMMULTF32PS: - Opc = X86::TTMMULTF32PS; - break; + case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break; + // 
clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); @@ -38246,70 +38169,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PT2RPNTLVWZ0: - case X86::PT2RPNTLVWZ0T1: - case X86::PT2RPNTLVWZ1: - case X86::PT2RPNTLVWZ1T1: - case X86::PT2RPNTLVWZ0RS: - case X86::PT2RPNTLVWZ0RST1: - case X86::PT2RPNTLVWZ1RS: - case X86::PT2RPNTLVWZ1RST1: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; -#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); - case X86::PT2RPNTLVWZ0: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); - break; - case X86::PT2RPNTLVWZ0T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); - break; - case X86::PT2RPNTLVWZ1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); - break; - case X86::PT2RPNTLVWZ1T1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); - break; - case X86::PT2RPNTLVWZ0RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); - break; - case X86::PT2RPNTLVWZ0RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); - break; - case X86::PT2RPNTLVWZ1RS: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); - break; - case X86::PT2RPNTLVWZ1RST1: - Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); - break; - } -#undef GET_EGPR_IF_ENABLED - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); - - MIB.add(MI.getOperand(1)); // base - MIB.add(MI.getOperand(2)); // scale - MIB.add(MI.getOperand(3)); // index - MIB.add(MI.getOperand(4)); // displacement - MIB.add(MI.getOperand(5)); // segment - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } - case X86::PTTRANSPOSED: - case X86::PTCONJTFP16: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED - : X86::TCONJTFP16; - - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); - - MI.eraseFromParent(); // The pseudo is gone now. - return BB; - } case X86::PTCVTROWPS2BF16Hrri: case X86::PTCVTROWPS2BF16Lrri: case X86::PTCVTROWPS2PHHrri: @@ -45168,11 +45027,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector<int, 8> Mask; @@ -45198,6 +45062,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( } break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.isVector()) { + APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly, + Depth + 1); + } + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1); + } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); @@ -45242,13 +45116,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE target shuffles. 
case X86ISD::INSERTPS: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VBROADCAST: return false; // SSE comparisons handle all icmp/fcmp cases. // TODO: Add CMPM/MM with test coverage. @@ -53502,40 +53382,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(peekThroughBitcasts(SrcVal)); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53543,6 +53428,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. 
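[Editor's note: the narrowBitOpRMW change above is easier to see on concrete values. Below is a minimal scalar model of the narrowing — illustrative only, independent of the SelectionDAG APIs, with hypothetical helper names: a BTS/BTR/BTC or single-bit insert on a wider-than-legal integer stored in memory only needs to read, modify and write the one aligned 32-bit word containing bit ShAmt, using exactly the AlignAmt/ModuloAmt split the comment above describes.]

// Standalone model of narrowBitOpRMW's arithmetic: a single-bit RMW on a
// wide integer (here i128 as four little-endian uint32_t words) touches
// exactly one 32-bit word. Helper names are hypothetical.
#include <cassert>
#include <cstdint>

static void setBitWide(uint32_t Words[4], unsigned ShAmt) { // BTS pattern
  unsigned AlignAmt = ShAmt & ~31u;  // ShAmt & -32: selects the i32 block
  unsigned ModuloAmt = ShAmt & 31u;  // ShAmt % 32: bit within that block
  Words[AlignAmt / 32] |= 1u << ModuloAmt;
}

static void insertBitWide(uint32_t Words[4], unsigned ShAmt, uint32_t Bit) {
  // Bit insertion: clear the target bit, then OR in the (0 or 1) new bit.
  unsigned Idx = ShAmt / 32, Sh = ShAmt % 32;
  Words[Idx] = (Words[Idx] & ~(1u << Sh)) | ((Bit & 1u) << Sh);
}

int main() {
  uint32_t W[4] = {0, 0, 0, 0};
  setBitWide(W, 70);                // bit 70 lives in word 2, bit 6
  assert(W[2] == (1u << 6) && W[0] == 0 && W[1] == 0 && W[3] == 0);
  insertBitWide(W, 70, 0);          // clear it again via bit insertion
  assert(W[2] == 0);
  return 0;
}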
EVT AmtVT = ShAmt.getValueType(); @@ -53550,6 +53442,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. @@ -53561,18 +53454,41 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. - SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, + MachinePointerInfo(St->getPointerInfo().getAddrSpace()), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + for (SDNode *User : StoredVal->users()) + DCI.AddToWorklist(User); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53801,7 +53717,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } - if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + if (SDValue R = narrowBitOpRMW(St, dl, DAG, DCI, Subtarget)) return R; // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) @@ -54591,6 +54507,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { + using namespace SDPatternMatch; if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); @@ -54600,42 +54517,19 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); - if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) + if (!SSatVal) return SDValue(); - // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs - // of multiplies from even/odd elements. 
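[Editor's note: the removed hand-rolled matcher and its sd_match replacement in this hunk both recognize the same VPMADDUBSW shape: a signed saturation of the sum of products of a zero-extended i8 and a sign-extended i8, taken from even/odd element pairs. A scalar reference model of one output lane — my own helper, not LLVM code:]

// Reference semantics for one output lane of (V)PMADDUBSW: multiply unsigned
// i8 by signed i8 in even/odd pairs, add the two products, signed-saturate
// to i16. This is the pattern detectPMADDUBSW matches.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t pmaddubswLane(uint8_t U0, int8_t S0, uint8_t U1, int8_t S1) {
  int32_t Sum = int32_t(U0) * S0 + int32_t(U1) * S1;             // zext * sext
  Sum = std::clamp(Sum, int32_t(INT16_MIN), int32_t(INT16_MAX)); // ssat
  return int16_t(Sum);
}

int main() {
  // 255*127 + 255*127 = 64770 saturates to 32767.
  assert(pmaddubswLane(255, 127, 255, 127) == INT16_MAX);
  // 255*-128 + 0*0 = -32640 is in range, so no saturation.
  assert(pmaddubswLane(255, -128, 0, 0) == -32640);
  return 0;
}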
- SDValue N0 = SSatVal.getOperand(0); - SDValue N1 = SSatVal.getOperand(1); - - if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - SDValue N10 = N1.getOperand(0); - SDValue N11 = N1.getOperand(1); - + // See if this is a signed saturation of an ADD, adding pairs of multiplies + // from even/odd elements, from zero_extend/sign_extend operands. + // // TODO: Handle constant vectors and use knownbits/computenumsignbits? - // Canonicalize zero_extend to LHS. - if (N01.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N00, N01); - if (N11.getOpcode() == ISD::ZERO_EXTEND) - std::swap(N10, N11); - - // Ensure we have a zero_extend and a sign_extend. - if (N00.getOpcode() != ISD::ZERO_EXTEND || - N01.getOpcode() != ISD::SIGN_EXTEND || - N10.getOpcode() != ISD::ZERO_EXTEND || - N11.getOpcode() != ISD::SIGN_EXTEND) + SDValue N00, N01, N10, N11; + if (!sd_match(SSatVal, + m_Add(m_Mul(m_ZExt(m_Value(N00)), m_SExt(m_Value(N01))), + m_Mul(m_ZExt(m_Value(N10)), m_SExt(m_Value(N11)))))) return SDValue(); - // Peek through the extends. - N00 = N00.getOperand(0); - N01 = N01.getOperand(0); - N10 = N10.getOperand(0); - N11 = N11.getOperand(0); - // Ensure the extend is from vXi8. if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || @@ -54768,9 +54662,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); // Check the shift amount is byte aligned. // Check the truncation doesn't use any shifted in (zero) top bits. + // Check the shift amount doesn't depend on the original load. if (KnownAmt.countMinTrailingZeros() >= 3 && KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() - - VT.getSizeInBits())) { + VT.getSizeInBits()) && + !Ld->isPredecessorOf(ShAmt.getNode())) { EVT PtrVT = Ld->getBasePtr().getValueType(); SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT); SDValue PtrByteOfs = @@ -54779,10 +54675,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewPtr = DAG.getMemBasePlusOffset( Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap); SDValue NewLoad = - DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), + DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -55270,6 +55166,65 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, return combineFneg(N, DAG, DCI, Subtarget); } +// Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512 +// vXi64 CTLZ/CTTZ and VECTOR_COMPRESS. +// Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress +// the result to remove all zero elements (passthru is set to scalar bitwidth if +// all elements are zero) and extract the lowest compressed element. 
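[Editor's note: before the DAG implementation that follows, a scalar model of the CTLZ/CTTZ strategy the comment above describes may help. This is a hedged sketch in plain C++ — a loop stands in for the VECTOR_COMPRESS node, and for CTLZ the real code first reverses the elements so the interesting lane lands at index 0:]

// Scalar model of the i256 CTTZ lowering: per-i64-element CTTZ plus the
// element's bit offset, "compress" away the lanes that were zero, and take
// the first surviving lane; if every lane is zero the passthru (256) wins.
#include <cassert>
#include <cstdint>

static unsigned cttz256(const uint64_t Elts[4]) {
  uint64_t Compressed[4] = {256, 256, 256, 256}; // passthru = scalar bitwidth
  unsigned OutIdx = 0;
  for (unsigned I = 0; I != 4; ++I) {
    if (Elts[I] == 0)
      continue;                           // VECTOR_COMPRESS drops zero lanes
    // GCC/Clang builtin; I is the 64-bit lane index, so I*64 is its offset.
    Compressed[OutIdx++] = __builtin_ctzll(Elts[I]) + I * 64;
  }
  return unsigned(Compressed[0]);         // extract lowest compressed element
}

int main() {
  uint64_t A[4] = {0, 0, 1ull << 5, 1};   // lowest set bit is bit 128 + 5
  assert(cttz256(A) == 133);
  uint64_t Z[4] = {0, 0, 0, 0};
  assert(cttz256(Z) == 256);              // all-zero input -> scalar bitwidth
  return 0;
}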
+static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + unsigned Opc = N->getOpcode(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ || + Opc == ISD::CTTZ_ZERO_UNDEF) && + "Unsupported bit count"); + + if (VT.isScalarInteger() && Subtarget.hasCDI() && + ((SizeInBits == 512 && Subtarget.useAVX512Regs()) || + (SizeInBits == 256 && Subtarget.hasVLX() && + X86::mayFoldLoad(N0, Subtarget)))) { + MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64); + MVT BoolVT = VecVT.changeVectorElementType(MVT::i1); + SDValue Vec = DAG.getBitcast(VecVT, N0); + SDLoc DL(N); + + SmallVector<int, 8> RevMask; + SmallVector<SDValue, 8> Offsets; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) { + RevMask.push_back((int)((E - 1) - I)); + Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64)); + } + + // CTLZ - reverse the elements as we want the top non-zero element at the + // bottom for compression. + unsigned VecOpc = ISD::CTTZ; + if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) { + VecOpc = ISD::CTLZ; + Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask); + } + + SDValue PassThrough = DAG.getUNDEF(VecVT); + if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) + PassThrough = DAG.getConstant(SizeInBits, DL, VecVT); + + SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec, + DAG.getConstant(0, DL, VecVT), ISD::SETNE); + SDValue Cnt = DAG.getNode(VecOpc, DL, VecVT, Vec); + Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt, + DAG.getBuildVector(VecVT, DL, Offsets)); + Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero, + PassThrough); + Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt, + DAG.getVectorIdxConstant(0, DL)); + return DAG.getZExtOrTrunc(Cnt, DL, VT); + } + + return SDValue(); +} + static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60993,6 +60948,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); + case ISD::CTLZ: + case ISD::CTTZ: + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget); case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); case ISD::AVGCEILS: case ISD::AVGCEILU: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 69a5115201ef2..522782abd710f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -338,188 +338,6 @@ let Predicates = [HasAMXFP8, In64BitMode] in { } } -let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { - let mayStore = 1 in - def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; - let mayLoad = 1 in - def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; -} - -multiclass T2RPNTLVW_Base<bits<8> op1, bits<8> op2, string rs, string suffix> { - def Z0#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PS; - def Z0#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz0" #!tolower(rs)# "t1\t{$src, $dst|$dst, 
$src}", []>, PS; - def Z1#rs#suffix : I<op1, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "\t{$src, $dst|$dst, $src}", []>, PD; - def Z1#rs#T1#suffix : I<op2, MRMSrcMemFSIB, (outs TILEPair:$dst), (ins sibmem:$src), - "t2rpntlvwz1" #!tolower(rs)# "t1\t{$src, $dst|$dst, $src}", []>, PD; -} - -let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - -let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in - defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; - -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; - let isPseudo = true in { - def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - } - - def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, TILE:$src), - [(set TILE: $dst, - (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, - TILE:$src))]>; - - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, - sibmem:$src1), []>; - def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_ttransposed timm:$dst, timm:$src)]>; - } - } -} // HasAMXTILE, HasAMXTRANSPOSE - -let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - let Constraints = "$src4 = $dst" in - def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in - def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - let Constraints = "$src4 = 
$dst" in - def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - let usesCustomInserter = 1 in - def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>; -} - -let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XD; - def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8,XS; - def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8,PS; - } - def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD; - - let Constraints = "$src4 = $dst" in { - def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), - [(set TILE: $dst, - (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2, - GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; - } - def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3), - [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>; - - let usesCustomInserter = 1 in { - def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>; - def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), - [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>; - } -} - -let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - let isPseudo = true in { - def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), - []>; - def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), 
- []>; - } - let usesCustomInserter = 1 in { - def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; - } -} // HasAMXMOVRS, HasAMXTRANSPOSE - multiclass TILELOADDRS_Base<string suffix> { def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; @@ -721,29 +539,3 @@ let Predicates = [HasAMXTF32, In64BitMode] in { } } // SchedRW = [WriteSystem] } // HasAMXTF32 - -let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - let Constraints = "$src1 = $dst" in { - def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), - (ins TILE:$src1, TILE:$src2, TILE:$src3), - "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8, PS; - } - let Constraints = "$src4 = $dst" in { - def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst), - (ins GR16:$src1, GR16:$src2, GR16:$src3, - TILE:$src4, TILE:$src5, TILE:$src6), - [(set TILE:$dst, - (int_x86_ttmmultf32ps_internal GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6))]>; - } - let usesCustomInserter = 1 in { - def PTTMMULTF32PS : PseudoI<(outs), - (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_ttmmultf32ps timm:$src1, timm:$src2, - timm:$src3)]>; - } - } // SchedRW = [WriteSystem] -} // HasAMXTF32, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7355716..70564973816b1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> { defm KSET0 : avx512_mask_setop_w<immAllZerosV>; defm KSET1 : avx512_mask_setop_w<immAllOnesV>; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>; + defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3179,34 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; +} +// Submask patterns: lower N bits set in larger mask registers +let Predicates = [HasBWI, HasDQI] in { + // v32i1 submasks + def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>; + def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>; + // v64i1 submasks + def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>; + def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>; + def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D), + VK64)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT, RegisterClass RC, ValueType VT> { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5c23f917d0530..cb0208a4a5f32 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 @@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)), Subtarget(STI), RI(STI.getTargetTriple()) {} -const TargetRegisterClass * -X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI); +const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { + auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum); // If the target does not have egpr, then r16-r31 will be resereved for all // instructions. 
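[Editor's note: a small model — helper names mine — of the two facts the KSET patterns above rely on: kxor/kxnor of any mask register with itself yields all-zeros/all-ones (which is why the expandPostRAPseudo change further down can expand KSET0B/KSET1B through K0), and the decimal immediates in the new bitconvert patterns are exactly the lower-N-bits submasks:]

// x ^ x == 0 and ~(x ^ x) == all-ones for every x, so KSET0*/KSET1* may use
// any input mask register. Helper names are hypothetical.
#include <cassert>
#include <cstdint>

static uint8_t kxorb(uint8_t A, uint8_t B) { return A ^ B; }
static uint8_t kxnorb(uint8_t A, uint8_t B) { return ~(A ^ B); }

int main() {
  for (unsigned K = 0; K != 256; ++K) {
    assert(kxorb(K, K) == 0x00);          // KSET0B expansion
    assert(kxnorb(K, K) == 0xFF);         // KSET1B expansion
  }
  // The decimal immediates in the submask patterns, spelled in hex:
  assert(255u == 0xFFu);                  // low 8 bits of a v32i1/v64i1 mask
  assert(65535u == 0xFFFFu);              // low 16 bits
  assert(4294967295u == 0xFFFFFFFFu);     // low 32 bits of a v64i1 mask
  return 0;
}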
if (!RC || !Subtarget.hasEGPR()) @@ -789,9 +788,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -958,8 +959,7 @@ bool X86InstrInfo::isReMaterializableImpl( void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != MachineBasicBlock::LQR_Dead) { @@ -4544,11 +4544,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED - case 2048: - assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && - "Unknown 2048-byte regclass"); - assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); - return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE; } } @@ -4743,8 +4738,6 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILESTORED: case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: - case X86::PTILEPAIRLOAD: - case X86::PTILEPAIRSTORE: return true; } } @@ -4757,8 +4750,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, default: llvm_unreachable("Unexpected special opcode!"); case X86::TILESTORED: - case X86::TILESTORED_EVEX: - case X86::PTILEPAIRSTORE: { + case X86::TILESTORED_EVEX: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4772,8 +4764,7 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, break; } case X86::TILELOADD: - case X86::TILELOADD_EVEX: - case X86::PTILEPAIRLOAD: { + case X86::TILELOADD_EVEX: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4791,14 +4782,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, void X86InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -4812,15 +4803,17 @@ void X86InstrInfo::storeRegToStackSlot( .setMIFlag(Flags); } -void X86InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Load size exceeds stack slot"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -5562,7 +5555,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; ShouldUpdateCC = true; } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); + unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg)); // Shift amount for min/max constants to adjust for 8/16/32 instruction // sizes. switch (OldCC) { @@ -6361,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. + case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: @@ -7244,7 +7241,6 @@ static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) { MachineOperand &MO = NewMI.getOperand(Idx); @@ -7256,7 +7252,7 @@ static void updateOperandRegConstraints(MachineFunction &MF, continue; auto *NewRC = - MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI)); + MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " @@ -7354,7 +7350,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) { @@ -7379,7 +7375,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. 
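[Editor's note: the storeRegToStackSlot/loadRegFromStackSlot hunks above keep the same alignment rule while switching from the TRI parameter to the member RI. A simplified standalone model of that rule, with stand-in parameters rather than the real MachineFrameInfo/TargetFrameLowering queries:]

// The spill access is treated as aligned if the stack alignment covers
// max(spill size, 16), or if the frame can be realigned and the slot is not
// a fixed object. Simplified model of the logic above.
#include <algorithm>
#include <cassert>
#include <cstdint>

static bool isAlignedSpill(uint32_t SpillSize, uint32_t StackAlign,
                           bool CanRealign, bool IsFixedObject) {
  uint32_t Alignment = std::max<uint32_t>(SpillSize, 16);
  return StackAlign >= Alignment || (CanRealign && !IsFixedObject);
}

int main() {
  // A 64-byte ZMM spill on a 16-byte-aligned stack is unaligned unless the
  // frame can be realigned and the slot is not a fixed object.
  assert(!isAlignedSpill(64, 16, /*CanRealign=*/false, /*IsFixedObject=*/false));
  assert(isAlignedSpill(64, 16, /*CanRealign=*/true, /*IsFixedObject=*/false));
  assert(isAlignedSpill(16, 16, false, true)); // XMM fits 16-byte alignment
  return 0;
}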
if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = @@ -7398,7 +7394,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = @@ -7533,7 +7529,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8127,9 +8123,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; }; - if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1))) MaskReg = Op1.getReg(); - else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2))) MaskReg = Op2.getReg(); if (MaskReg) { @@ -8533,7 +8529,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && @@ -8644,7 +8640,7 @@ bool X86InstrInfo::unfoldMemoryOperand( // Emit the store instruction. 
if (UnfoldStore) { - const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); + const TargetRegisterClass *DstRC = getRegClass(MCID, 0); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; @@ -8676,7 +8672,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; @@ -8727,7 +8723,7 @@ bool X86InstrInfo::unfoldMemoryOperand( std::vector<EVT> VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { - DstRC = getRegClass(MCID, 0, &RI); + DstRC = getRegClass(MCID, 0); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..a547fcd421411 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -246,9 +246,8 @@ class X86InstrInfo final : public X86GenInstrInfo { /// GR*RegClass (definition in TD file) /// -> /// GR*_NOREX2RegClass (Returned register class) - const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -343,8 +342,7 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isReMaterializableImpl(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; /// Given an operand within a MachineInstr, insert preceding code to put it /// into the right format for a particular kind of LEA instruction. 
This may @@ -469,14 +467,14 @@ class X86InstrInfo final : public X86GenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index 5207ecad127a2..6ba07f74d74c5 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -536,10 +536,3 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> { def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> { let ParserMatchClass = VK16PairAsmOperand; } - -let RenderMethod = "addTILEPairOperands" in - def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; } - -def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> { - let ParserMatchClass = TILEPairAsmOperand; -} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index c20bb05018b4d..98104a6fad1a9 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,7 +183,6 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; -def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 090060eaa65e1..3b96e706fb607 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -115,9 +115,9 @@ struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> { static constexpr MachineInstr *const ArgNodeSentinel = nullptr; using GraphT = ImmutableGraph<MachineInstr *, int>; - using Node = typename GraphT::Node; - using Edge = typename GraphT::Edge; - using size_type = typename GraphT::size_type; + using Node = GraphT::Node; + using Edge = GraphT::Edge; + using size_type = GraphT::size_type; MachineGadgetGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges, size_type NodesSize, size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) @@ -191,10 +191,10 @@ template <> struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { using GraphType = MachineGadgetGraph; using Traits = llvm::GraphTraits<GraphType *>; - using NodeRef = typename Traits::NodeRef; - using EdgeRef = typename Traits::EdgeRef; - using ChildIteratorType = typename Traits::ChildIteratorType; - using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + using NodeRef = Traits::NodeRef; + using EdgeRef = 
Traits::EdgeRef; + using ChildIteratorType = Traits::ChildIteratorType; + using ChildEdgeIteratorType = Traits::ChildEdgeIteratorType; DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} @@ -227,9 +227,6 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { } // end namespace llvm -constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; -constexpr int MachineGadgetGraph::GadgetEdgeSentinel; - char X86LoadValueInjectionLoadHardeningPass::ID = 0; void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( @@ -335,7 +332,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( L.computePhiInfo(); GraphBuilder Builder; - using GraphIter = typename GraphBuilder::BuilderNodeRef; + using GraphIter = GraphBuilder::BuilderNodeRef; DenseMap<MachineInstr *, GraphIter> NodeMap; int FenceCount = 0, GadgetCount = 0; auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = 
getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 8ffd454f4f73e..2fc5d38ef5055 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,22 +74,6 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value())); } -// Some instructions may return more than one tiles. -// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal -static unsigned getNumDefTiles(IntrinsicInst *II) { - Type *Ty = II->getType(); - if (Ty->isX86_AMXTy()) - return 1; - - unsigned Num = 0; - for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) { - Type *STy = Ty->getContainedType(i); - if (STy->isX86_AMXTy()) - Num++; - } - return Num; -} - static bool isAMXIntrinsic(Value *I) { auto *II = dyn_cast<IntrinsicInst>(I); if (!II) @@ -98,7 +82,7 @@ static bool isAMXIntrinsic(Value *I) { return false; // Check if return type or parameter is x86_amx. If it is x86_amx // the intrinsic must be x86 amx intrinsics. - if (getNumDefTiles(II) > 0) + if (II->getType()->isX86_AMXTy()) return true; for (Value *V : II->args()) { if (V->getType()->isX86_AMXTy()) @@ -137,27 +121,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { llvm_unreachable("No terminator in the entry block!"); } -class ShapeCalculator { -private: - const TargetMachine *TM = nullptr; - - // In AMX intrinsics we let Shape = {Row, Col}, but the - // RealCol = Col / ElementSize. We may use the RealCol - // as a new Row for other new created AMX intrinsics. 
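[Editor's note: the Col2Row/Row2Col comment above belongs to the removed ShapeCalculator. A constant-folded scalar model of its two conversions — an AMX shape is {Row, Col-in-bytes}, so for a transposed tile of N-byte elements the new row count is Col / N and the new column byte count is Row * N; 4-byte (fp32) granularity shown, helper names mine:]

// Scalar model of the removed getRowFromCol / getColFromRow arithmetic.
#include <cassert>
#include <cstdint>

static uint16_t rowFromCol(uint16_t ColBytes, uint16_t Granularity) {
  return ColBytes / Granularity; // elements per row become rows
}
static uint16_t colFromRow(uint16_t Row, uint16_t Granularity) {
  return Row * Granularity;      // rows become bytes per row
}

int main() {
  // An 8-row x 64-byte fp32 tile transposes to 16 rows x 32 bytes.
  assert(rowFromCol(64, 4) == 16);
  assert(colFromRow(8, 4) == 32);
  return 0;
}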
- std::map<Value *, Value *> Col2Row, Row2Col; - -public: - ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {} - std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo); - std::pair<Value *, Value *> getShape(PHINode *Phi); - Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); - Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity); -}; - -Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Col2Row.find(V); It != Col2Row.end()) - return It->second; +static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) { IRBuilder<> Builder(II); Value *RealRow = nullptr; if (isa<ConstantInt>(V)) @@ -186,47 +150,16 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, getFirstNonAllocaInTheEntryBlock(*II->getFunction())); RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity)); } - Col2Row[V] = RealRow; return RealRow; } -Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V, - unsigned Granularity) { - if (auto It = Row2Col.find(V); It != Row2Col.end()) - return It->second; - IRBuilder<> Builder(II); - Value *RealCol = nullptr; - if (isa<ConstantInt>(V)) - RealCol = - Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity); - else if (isa<Instruction>(V)) { - Builder.SetInsertPoint(cast<Instruction>(V)); - RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity)); - cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V)); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. - IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity)); - } - Row2Col[V] = RealCol; - return RealCol; -} - // TODO: Refine the row and col-in-bytes of tile to row and col of matrix. 
-std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, - unsigned OpNo) { - (void)TM; +std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); - case Intrinsic::x86_t2rpntlvwz0_internal: - case Intrinsic::x86_t2rpntlvwz0t1_internal: - case Intrinsic::x86_t2rpntlvwz1_internal: - case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: @@ -271,13 +204,6 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, } break; } - case Intrinsic::x86_ttransposed_internal: - case Intrinsic::x86_tconjtfp16_internal: { - assert((OpNo == 2) && "Illegal Operand Number."); - Row = getRowFromCol(II, II->getArgOperand(1), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - } case Intrinsic::x86_tcvtrowd2ps_internal: case Intrinsic::x86_tcvtrowps2bf16h_internal: case Intrinsic::x86_tcvtrowps2bf16l_internal: @@ -289,34 +215,12 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, Col = II->getArgOperand(1); break; } - case Intrinsic::x86_ttdpbf16ps_internal: - case Intrinsic::x86_ttdpfp16ps_internal: - case Intrinsic::x86_ttcmmimfp16ps_internal: - case Intrinsic::x86_ttcmmrlfp16ps_internal: - case Intrinsic::x86_tconjtcmmimfp16ps_internal: - case Intrinsic::x86_ttmmultf32ps_internal: { - switch (OpNo) { - case 3: - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - case 4: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = getColFromRow(II, II->getArgOperand(0), 4); - break; - case 5: - Row = getRowFromCol(II, II->getArgOperand(2), 4); - Col = II->getArgOperand(1); - break; - } - break; - } } return std::make_pair(Row, Col); } -std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { +static std::pair<Value *, Value *> getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = U.getOperandNo(); User *V = U.getUser(); @@ -349,15 +253,14 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; - ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map<Value *, Value *> Col2Row, Row2Col; + std::map<Value *, Value *> Col2Row; public: - X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} + X86LowerAMXType(Function &F) : Func(F) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -374,7 +277,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast<IntrinsicInst>(U.getUser()); - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. Value *Stride = Builder.getInt64(64); @@ -454,7 +357,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); @@ -594,18 +497,11 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = dyn_cast<IntrinsicInst>(TileDef); - unsigned Idx = 0; - // Extract tile from multiple tiles' def. - if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) { - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); - } + auto *II = cast<IntrinsicInst>(TileDef); assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -624,20 +520,14 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; - unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0); II = cast<IntrinsicInst>(PhiOp); - } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) { - // Extract tile from multiple tiles' def. - assert(Extr->hasIndices() && "Tile extract miss index!"); - Idx = Extr->getIndices()[0]; - II = cast<IntrinsicInst>(Extr->getOperand(0)); } else { II = cast<IntrinsicInst>(V); } - Value *Row = II->getOperand(Idx); - Value *Col = II->getOperand(Idx + 1); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); Instruction *UserI = cast<Instruction>(U.getUser()); IRBuilder<> Builder(UserI); @@ -848,12 +738,10 @@ namespace { class X86LowerAMXCast { Function &Func; - ShapeCalculator *SC; std::unique_ptr<DominatorTree> DT; public: - X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) - : Func(F), SC(ShapeC), DT(nullptr) {} + X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineTilezero(IntrinsicInst *Cast); @@ -932,7 +820,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = SC->getShape(OldPN); + std::tie(Row, Col) = getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col)) @@ -1063,19 +951,6 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } -static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx, - bool IsRow) { - if (!isAMXIntrinsic(Inst)) - return nullptr; - - auto *II = cast<IntrinsicInst>(Inst); - if (IsRow) - return II->getOperand(0); - - assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!"); - return II->getOperand(ShapeIdx + 1); -} - // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -1090,38 +965,13 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { if (!Tile->hasOneUse()) return false; - // We don't fetch shape from tilestore, we only get shape from tiledef, - // so we can set the max tile shape to tilestore for special cases. + auto *II = cast<IntrinsicInst>(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); - Value *Row = nullptr; - Value *Col = nullptr; - - if (isAMXIntrinsic(Tile)) { - auto *II = cast<IntrinsicInst>(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Row = II->getOperand(0); - Col = II->getOperand(1); - } else { - // Now we supported multi-tiles value in structure, so we may get tile - // from extracting multi-tiles structure. - // For example: - // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1, - // i16 %2, i16 %3, i8* %4, i64 %5) - // %7 = extractvalue { x86_amx, x86_amx } %6, 0 - // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7) - // store <256 x i32> %8, <256 x i32>* %0, align 1024 - // - // TODO: Currently we only handle extractvalue case, enhance me for other - // cases if possible. - auto *II = cast<ExtractValueInst>(Tile); - assert(II && "We meet unhandle source in fetching tile value!"); - unsigned ShapeIdx = II->getIndices()[0]; - Value *Tiles = II->getOperand(0); - Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true); - Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false); - } - assert(Row && Col && "Shape got failed!"); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1146,7 +996,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { // shape information through def-use chain. if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(LD); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1189,7 +1039,7 @@ bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) { if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = SC->getShape(II, OpNo); + std::tie(Row, Col) = getShape(II, OpNo); IRBuilder<> Builder(Cast); Value *NewInst = @@ -1384,7 +1234,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
   Value *Row = nullptr, *Col = nullptr;
-  std::tie(Row, Col) = SC->getShape(II, OpNo);
+  std::tie(Row, Col) = getShape(II, OpNo);
   std::array<Value *, 4> Args = {
       Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
   Value *NewInst =
@@ -1445,14 +1295,13 @@ bool lowerAmxType(Function &F, const TargetMachine *TM,
     return false;
   bool C = false;
-  ShapeCalculator SC(TM);
-  X86LowerAMXCast LAC(F, &SC);
+  X86LowerAMXCast LAC(F);
   C |= LAC.combineAMXcast(TLI);
   // There might be remaining AMXcast after combineAMXcast and they should be
   // handled elegantly.
   C |= LAC.transformAllAMXCast();
-  X86LowerAMXType LAT(F, &SC);
+  X86LowerAMXType LAT(F);
   C |= LAT.visit();
   // Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 167bed132cd12..c9646053afac1 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA(
     // example MOV8mr_NOREX. We could constrain the register class of the LEA
     // def to suit MI, however since this case is very rare and hard to
     // reproduce in a test it's just more reliable to skip the LEA.
-    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) !=
+    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) !=
        MRI->getRegClass(DefMI->getOperand(0).getReg()))
      continue;
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index a25e4e0f464a4..898c83cf9b468 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -16,10 +16,12 @@
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -30,39 +32,44 @@ using namespace llvm;
 namespace {
-class X86PartialReduction : public FunctionPass {
+class X86PartialReduction {
+  const X86TargetMachine *TM;
   const DataLayout *DL = nullptr;
   const X86Subtarget *ST = nullptr;
+public:
+  X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {}
+  bool run(Function &F);
+
+private:
+  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
+  bool trySADReplacement(Instruction *Op);
+};
+
+class X86PartialReductionLegacy : public FunctionPass {
 public:
   static char ID; // Pass identification, replacement for typeid.
-  X86PartialReduction() : FunctionPass(ID) { }
+  X86PartialReductionLegacy() : FunctionPass(ID) {}
-  bool runOnFunction(Function &Fn) override;
+  bool runOnFunction(Function &F) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
   }
-  StringRef getPassName() const override {
-    return "X86 Partial Reduction";
-  }
-
-private:
-  bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
-  bool trySADReplacement(Instruction *Op);
+  StringRef getPassName() const override { return "X86 Partial Reduction"; }
 };
 }
-FunctionPass *llvm::createX86PartialReductionPass() {
-  return new X86PartialReduction();
+FunctionPass *llvm::createX86PartialReductionLegacyPass() {
+  return new X86PartialReductionLegacy();
 }
-char X86PartialReduction::ID = 0;
+char X86PartialReductionLegacy::ID = 0;
-INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
-                "X86 Partial Reduction", false, false)
+INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction",
+                false, false)
 // This function should be aligned with detectExtMul() in X86ISelLowering.cpp.
 static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
@@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
   }
 }
-bool X86PartialReduction::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  auto &TM = TPC->getTM<X86TargetMachine>();
-  ST = TM.getSubtargetImpl(F);
-
+bool X86PartialReduction::run(Function &F) {
+  ST = TM->getSubtargetImpl(F);
   DL = &F.getDataLayout();
   bool MadeChange = false;
@@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) {
   return MadeChange;
 }
+
+bool X86PartialReductionLegacy::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F);
+}
+
+PreservedAnalyses X86PartialReductionPass::run(Function &F,
+                                               FunctionAnalysisManager &FAM) {
+  bool Changed = X86PartialReduction(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index fc25d55d3059a..0d7095b18daa8 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -15,20 +15,22 @@
 #ifndef FUNCTION_PASS
 #define FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this))
 FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this))
 #undef FUNCTION_PASS
 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
-DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
 DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
 #undef DUMMY_FUNCTION_PASS
 #ifndef MACHINE_FUNCTION_PASS
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpanderPass())
 MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
 #undef MACHINE_FUNCTION_PASS
@@ -36,13 +38,11 @@ MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
 #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
 #endif
 DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
-DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
 DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
 DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
 DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
-DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
 DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
 DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
 DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 2a1c49957bf7a..8a1d00d2f6427 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -141,15 +141,10 @@ class X86PreTileConfig : public MachineFunctionPass {
     if (!MO.isReg() || !MO.getReg().isVirtual())
       return false;
-    unsigned Shapes = 0;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
-      Shapes = 1;
-    if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
-      Shapes = 2;
-    if (!Shapes)
+    if (MRI->getRegClass(MO.getReg())->getID() != X86::TILERegClassID)
       return false;
-    collectShapeInfo(MI, Shapes);
+    collectShapeInfo(MI);
     return true;
   }
@@ -165,7 +160,7 @@ class X86PreTileConfig : public MachineFunctionPass {
   }
   /// Collect the shape def information for later use.
-  void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
+  void collectShapeInfo(MachineInstr &MI);
   /// Try to hoist shapes defined below AMX instructions.
   bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -231,7 +226,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Pre-configure", false, false)
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
   auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
     MIRef MIR(MI, MBB);
     auto &Refs = ShapeBBs[MBB];
@@ -240,10 +235,8 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
     Refs.insert(I, MIR);
   };
-  // All shapes have same row in multi-tile operand.
-  SmallVector<Register, 8> WorkList;
-  for (unsigned I = 1; I < Shapes + 2; ++I)
-    WorkList.push_back(MI.getOperand(I).getReg());
+  SmallVector<Register, 8> WorkList(
+      {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
   while (!WorkList.empty()) {
     Register R = WorkList.pop_back_val();
     MachineInstr *DefMI = MRI->getVRegDef(R);
@@ -252,13 +245,6 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
     if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
       continue;
-    // This happens when column = 0 in multi-tile operand.
-    if (DefMI->getOpcode() == X86::COPY) {
-      MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
-      if (MI && MI->isMoveImmediate())
-        continue;
-    }
-
     if (DefMI->isPHI()) {
       for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
         if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 76979e37c4618..72f38133e21ff 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -597,10 +597,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
       Reserved.set(*AI);
   }
-  // Reserve low half pair registers in case they are used by RA aggressively.
-  Reserved.set(X86::TMM0_TMM1);
-  Reserved.set(X86::TMM2_TMM3);
-
   assert(checkAllSuperRegsMarked(Reserved,
                                  {X86::SIL, X86::DIL, X86::BPL, X86::SPL,
                                   X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
@@ -621,7 +617,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
   // and try to return the minimum number of registers supported by the target.
   static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
                     (X86::K6_K7 + 1 == X86::TMMCFG) &&
-                    (X86::TMM6_TMM7 + 1 == X86::R16) &&
+                    (X86::TMM7 + 1 == X86::R16) &&
                     (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
                 "Register number may be incorrect");
@@ -694,8 +690,7 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
 }
 bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
-  return RC->getID() == X86::TILERegClassID ||
-         RC->getID() == X86::TILEPAIRRegClassID;
+  return RC->getID() == X86::TILERegClassID;
 }
 void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
@@ -1062,17 +1057,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PTDPFP16PSV:
   case X86::PTCMMIMFP16PSV:
   case X86::PTCMMRLFP16PSV:
-  case X86::PTTRANSPOSEDV:
-  case X86::PTTDPBF16PSV:
-  case X86::PTTDPFP16PSV:
-  case X86::PTTCMMIMFP16PSV:
-  case X86::PTTCMMRLFP16PSV:
-  case X86::PTCONJTCMMIMFP16PSV:
-  case X86::PTCONJTFP16V:
   case X86::PTILELOADDRSV:
   case X86::PTILELOADDRST1V:
   case X86::PTMMULTF32PSV:
-  case X86::PTTMMULTF32PSV:
   case X86::PTDPBF8PSV:
   case X86::PTDPBHF8PSV:
   case X86::PTDPHBF8PSV:
@@ -1083,56 +1070,7 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
     VRM->assignVirt2Shape(VirtReg, Shape);
     return Shape;
   }
-  case X86::PT2RPNTLVWZ0V:
-  case X86::PT2RPNTLVWZ0T1V:
-  case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V:
-  case X86::PT2RPNTLVWZ0RSV:
-  case X86::PT2RPNTLVWZ0RST1V:
-  case X86::PT2RPNTLVWZ1RSV:
-  case X86::PT2RPNTLVWZ1RST1V: {
-    MachineOperand &MO1 = MI->getOperand(1);
-    MachineOperand &MO2 = MI->getOperand(2);
-    MachineOperand &MO3 = MI->getOperand(3);
-    ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
-    VRM->assignVirt2Shape(VirtReg, Shape);
-    return Shape;
-  }
-  }
-
-static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
-  unsigned PhysShapeNum = PhysShape.getShapeNum();
-  unsigned VirtShapeNum = VirtShape.getShapeNum();
-
-  if (PhysShapeNum < VirtShapeNum)
-    return false;
-
-  if (PhysShapeNum == VirtShapeNum) {
-    if (PhysShapeNum == 1)
-      return PhysShape == VirtShape;
-
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
-      if (VShape != PShape)
-        return false;
-    }
-    return true;
-  }
-
-  // Hint subreg of mult-tile reg to single tile reg.
-  if (VirtShapeNum == 1) {
-    for (unsigned I = 0; I < PhysShapeNum; I++) {
-      ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
-      if (VirtShape == PShape)
-        return true;
-    }
   }
-
-  // Note: Currently we have no requirement for case of
-  // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
-  return false;
 }
 bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
@@ -1153,7 +1091,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
   if (!VRM)
     return BaseImplRetVal;
-  if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
+  if (ID != X86::TILERegClassID) {
     if (DisableRegAllocNDDHints || !ST.hasNDD() ||
         !TRI.isGeneralPurposeRegisterClass(&RC))
       return BaseImplRetVal;
@@ -1204,7 +1142,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
       return;
     }
     ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
-    if (canHintShape(PhysShape, VirtShape))
+    if (PhysShape == VirtShape)
       Hints.push_back(PhysReg);
   };
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 99b7910131dc5..692e42ae5e752 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -30,8 +30,6 @@ let Namespace = "X86" in {
   def sub_ymm : SubRegIndex<256>;
   def sub_mask_0 : SubRegIndex<-1>;
   def sub_mask_1 : SubRegIndex<-1, -1>;
-  def sub_t0 : SubRegIndex<8192>;
-  def sub_t1 : SubRegIndex<8192, 8192>;
 }
//===----------------------------------------------------------------------===//
@@ -432,10 +430,6 @@ def TMM4: X86Reg<"tmm4", 4>;
 def TMM5: X86Reg<"tmm5", 5>;
 def TMM6: X86Reg<"tmm6", 6>;
 def TMM7: X86Reg<"tmm7", 7>;
-// TMM register pairs
-def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
-                            [(add TMM0, TMM2, TMM4, TMM6),
-                             (add TMM1, TMM3, TMM5, TMM7)]>;
 }
 // Floating point stack registers. These don't map one-to-one to the FP
@@ -862,9 +856,6 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-// Need check alignment 3rd operand size=1024*2*8
-let isAllocatable = 1 in
-def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
//===----------------------------------------------------------------------===//
// Register categories.
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index e0b3b61e29175..829a32eb37118 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -54,7 +54,6 @@
 #include <cassert>
 #include <iterator>
 #include <optional>
-#include <utility>
 using namespace llvm;
@@ -841,7 +840,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) {
   unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
       Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
   const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
-  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo());
+  return TII.getRegClass(MCID, Index);
 }
 void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..c1214149dfa1d 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -50,7 +50,6 @@
 #include "llvm/Transforms/CFGuard.h"
 #include <memory>
 #include <optional>
-#include <string>
 using namespace llvm;
@@ -90,14 +89,14 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
   initializeX86AvoidSFBPassPass(PR);
-  initializeX86AvoidTrailingCallPassPass(PR);
+  initializeX86AvoidTrailingCallLegacyPassPass(PR);
   initializeX86SpeculativeLoadHardeningPassPass(PR);
   initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
   initializeX86FlagsCopyLoweringPassPass(PR);
   initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
   initializeX86LoadValueInjectionRetHardeningPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
-  initializeX86PartialReductionPass(PR);
+  initializeX86PartialReductionLegacyPass(PR);
   initializePseudoProbeInserterPass(PR);
   initializeX86ReturnThunksPass(PR);
   initializeX86DAGToDAGISelLegacyPass(PR);
@@ -105,7 +104,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86AsmPrinterPass(PR);
   initializeX86FixupInstTuningPassPass(PR);
   initializeX86FixupVectorConstantsPassPass(PR);
-  initializeX86DynAllocaExpanderPass(PR);
+  initializeX86DynAllocaExpanderLegacyPass(PR);
   initializeX86SuppressAPXForRelocationPassPass(PR);
   initializeX86WinEHUnwindV2Pass(PR);
 }
@@ -422,14 +421,14 @@ void X86PassConfig::addIRPasses() {
   // We add both pass anyway and when these two passes run, we skip the pass
   // based on the option level and option attribute.
-  addPass(createX86LowerAMXIntrinsicsPass());
+  addPass(createX86LowerAMXIntrinsicsLegacyPass());
   addPass(createX86LowerAMXTypeLegacyPass());
   TargetPassConfig::addIRPasses();
   if (TM->getOptLevel() != CodeGenOptLevel::None) {
     addPass(createInterleavedAccessPass());
-    addPass(createX86PartialReductionPass());
+    addPass(createX86PartialReductionLegacyPass());
   }
   // Add passes that handle indirect branch removal and insertion of a retpoline
@@ -517,7 +516,7 @@ void X86PassConfig::addPreRegAlloc() {
     addPass(createX86SpeculativeLoadHardeningPass());
     addPass(createX86FlagsCopyLoweringPass());
-    addPass(createX86DynAllocaExpander());
+    addPass(createX86DynAllocaExpanderLegacyPass());
   if (getOptLevel() != CodeGenOptLevel::None)
     addPass(createX86PreTileConfigPass());
@@ -589,7 +588,7 @@ void X86PassConfig::addPreEmitPass2() {
   // Insert extra int3 instructions after trailing call instructions to avoid
   // issues in the unwinder.
   if (TT.isOSWindows() && TT.isX86_64())
-    addPass(createX86AvoidTrailingCallPass());
+    addPass(createX86AvoidTrailingCallLegacyPass());
   // Verify basic block incoming and outgoing cfa offset and register values and
   // correct CFA calculation rule where needed by inserting appropriate CFI
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3d8d0a236a3c1..0b1430e373fc7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6562,7 +6562,7 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
 bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
                                        const Function *Callee,
-                                       const ArrayRef<Type *> &Types) const {
+                                       ArrayRef<Type *> Types) const {
   if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
     return false;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 133b3668a46c8..de5e1c297b1e4 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -296,7 +296,7 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const override;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
-                             const ArrayRef<Type *> &Type) const override;
+                             ArrayRef<Type *> Type) const override;
   uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
     return ST->getMaxInlineSizeThreshold();
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 17a44dde6480f..09ef8fbc12de9 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -74,63 +74,6 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure",
                     false, false)
-unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
-  if (Reg.isVirtual()) {
-    unsigned RegClassID = MRI->getRegClass(Reg)->getID();
-    if (RegClassID == X86::TILERegClassID)
-      return 1;
-    if (RegClassID == X86::TILEPAIRRegClassID)
-      return 2;
-  } else {
-    if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-      return 1;
-    if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
-      return 2;
-  }
-  return 0;
-}
-
-static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
-                                 Register VirtReg,
-                                 SmallVector<ShapeT, 8> &Phys2Shapes) {
-  unsigned Num = getAMXRegNum(MRI, VirtReg);
-  MCRegister PhysReg = VRM.getPhys(VirtReg);
-  if (!PhysReg)
-    return;
-
-  if (Num == 1) {
-    unsigned Index = PhysReg - X86::TMM0;
-    if (!Phys2Shapes[Index].isValid()) {
-      ShapeT Shape = VRM.getShape(VirtReg);
-      Phys2Shapes[Index] = std::move(Shape);
-      return;
-    }
-  }
-  // Split tile pair shape info to 2 single tile shape info. e.g:
-  // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes.
-  if (Num == 2) {
-    unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
-    unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
-
-    ShapeT Shape = VRM.getShape(VirtReg);
-    assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
-
-    if (!Phys2Shapes[Index0].isValid()) {
-      ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
-      Phys2Shapes[Index0] = std::move(Shape0);
-    }
-
-    if (!Phys2Shapes[Index1].isValid()) {
-      ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
-      Phys2Shapes[Index1] = std::move(Shape1);
-    }
-  }
-}
-
-static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
-  return getAMXRegNum(MRI, Reg) > 0;
-}
-
 bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   // Early exit in the common case of non-AMX code.
@@ -138,7 +81,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
     return false;
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const X86RegisterInfo *TRI = ST.getRegisterInfo();
   const TargetInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
@@ -176,24 +119,29 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
   assert(ConstMI && "Cannot find an insertion point");
   unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
-  SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
+  SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
   for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
     Register VirtReg = Register::index2VirtReg(I);
     if (MRI.reg_nodbg_empty(VirtReg))
       continue;
-    if (!isAMXRegClass(&MRI, VirtReg))
+    if (!TRI->isTileRegisterClass(MRI.getRegClass(VirtReg)))
+      continue;
+    MCRegister PhysReg = VRM.getPhys(VirtReg);
+    if (!PhysReg)
       continue;
-    collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
+    unsigned Index = PhysReg - X86::TMM0;
+    if (!Phys2Virt[Index])
+      Phys2Virt[Index] = VirtReg;
   }
   // Fill in the shape of each tile physical register.
   for (unsigned I = 0; I < AMXRegNum; ++I) {
-    ShapeT Shape = Phys2Shapes[I];
-    if (!Shape.isValid())
+    if (!Phys2Virt[I])
       continue;
     DebugLoc DL;
     bool IsRow = true;
     MachineInstr *NewMI = nullptr;
+    ShapeT Shape = VRM.getShape(Phys2Virt[I]);
     for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
       // Here is the data format for the tile config.
      // 0      palette
@@ -222,14 +170,7 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
                "Cannot initialize with different shapes");
         continue;
       }
-      if (DefMI.getOperand(1).isImm()) {
-        Imm = DefMI.getOperand(1).getImm();
-      } else {
-        assert(DefMI.getOpcode() == X86::MOV32r0 &&
-               "The opcode is assumed to be MOV32r0 if the operand is not "
-               "immediate.");
-        Imm = 0;
-      }
+      Imm = DefMI.getOperand(1).getImm();
       NewMI = addFrameReference(
                   BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index f6f7e92d98578..2f28ab36aa193 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -66,7 +66,7 @@ namespace {
                               MachineBasicBlock &MBB);
     void addDirtySuccessor(MachineBasicBlock &MBB);
-    using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+    enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
     static const char* getBlockExitStateName(BlockExitState ST);
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 096ad08d8a3c9..0e00db495256c 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -69,7 +69,7 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
   return true;
 }
-static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
+static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
   const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
   return RegInfo->getRegClass(RC).getRegister(RegNo);
 }
@@ -79,7 +79,7 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               const MCDisassembler *Decoder) {
   if (RegNo > 11)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
@@ -89,7 +89,7 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                              const MCDisassembler *Decoder) {
   if (RegNo > 15)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index cdb5186d23d3c..351a221c92ebd 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -432,7 +432,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters(
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC,
                             Register());
     if (emitFrameMoves) {
       auto Store = MI;
@@ -458,8 +458,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(
            "LR & FP are always handled in emitEpilogue");
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI,
-                             Register());
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, Register());
     assert(MI != MBB.begin() &&
            "loadRegFromStackSlot didn't insert any code!");
     // Insert in reverse order.  loadRegFromStackSlot can insert multiple
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 1a9133aad4dd3..075910c84fb84 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -43,7 +43,7 @@ namespace XCore {
 void XCoreInstrInfo::anchor() {}
 XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST)
-    : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
       RI() {}
 static bool isZeroImm(const MachineOperand &op) {
@@ -355,8 +355,8 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void XCoreInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
     bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end() && !I->isDebugInstr())
     DL = I->getDebugLoc();
@@ -377,7 +377,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           Register DestReg, int FrameIndex,
                                           const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 3543392653786..c4e399ebd3fd8 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -71,13 +71,15 @@ class XCoreInstrInfo : public XCoreGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
       int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   bool reverseBranchCondition(
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4ebd2a729..5977a276b1236 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
   case Xtensa::SSIP:
   case Xtensa::LSI:
   case Xtensa::LSIP:
-
+  case Xtensa::S32C1I:
     if (Res & 0x3) {
       report_fatal_error("Unexpected operand value!");
     }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e730707dcb78..8d0fd078b2696 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
     return FeatureBits[Xtensa::FeatureWindowed];
   case Xtensa::ATOMCTL:
   case Xtensa::SCOMPARE1:
-    return FeatureBits[Xtensa::FeatureWindowed];
+    return FeatureBits[Xtensa::FeatureS32C1I];
   case Xtensa::NoRegister:
     return false;
   }
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
index cf9a2a052978d..1c0dc66a46144 100644
--- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
@@ -314,7 +314,7 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters(
     bool IsKill = !IsA0AndRetAddrIsTaken;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII.storeRegToStackSlot(EntryBlock, MI, Reg, IsKill, CSI[i].getFrameIdx(),
-                            RC, TRI, Register());
+                            RC, Register());
   }
   return true;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f2cd58e..d7b05acea9411 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
 }
 XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
-    : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
+    : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN,
+                         Xtensa::ADJCALLSTACKUP),
       RI(STI), STI(STI) {}
 Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
@@ -114,21 +115,38 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest, bool RenamableSrc) const {
-  // The MOV instruction is not present in core ISA,
+  unsigned Opcode;
+
+  // The MOV instruction is not present in core ISA for AR registers,
   // so use OR instruction.
-  if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+  if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
     BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+      Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::MOV_S;
+  else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+           Xtensa::ARRegClass.contains(DestReg))
+    Opcode = Xtensa::RFR;
+  else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+           Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::WFR;
   else
     report_fatal_error("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
 }
 void XtensaInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
     bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -137,10 +155,12 @@ void XtensaInstrInfo::storeRegToStackSlot(
   addFrameReference(MIB, FrameIdx);
 }
-void XtensaInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void XtensaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           Register DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -526,12 +546,12 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
          "function code size is significantly larger than estimated");
   storeRegToStackSlot(MBB, L32R, ScavRegister, /*IsKill=*/true, FrameIndex,
-                      &Xtensa::ARRegClass, &RI, Register());
+                      &Xtensa::ARRegClass, Register());
   RI.eliminateFrameIndex(std::prev(L32R.getIterator()),
                          /*SpAdj=*/0, /*FIOperandNum=*/1);
   loadRegFromStackSlot(RestoreBB, RestoreBB.end(), ScavRegister, FrameIndex,
-                       &Xtensa::ARRegClass, &RI, Register());
+                       &Xtensa::ARRegClass, Register());
   RI.eliminateFrameIndex(RestoreBB.back(),
                          /*SpAdj=*/0, /*FIOperandNum=*/1);
   JumpToMBB = &RestoreBB;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
index 1808cb36d8a9b..0b46d6ce2fdb7 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
@@ -56,14 +56,13 @@ class XtensaInstrInfo : public XtensaGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
      int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
      MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   // Get the load and store opcodes for a given register class and offset.
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0849fc7d55a32..c164762de2966 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2192,7 +2192,6 @@ StringMap<bool> sys::getHostCPUFeatures() {
   bool HasLeaf1E = MaxLevel >= 0x1e &&
                    !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
   Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
-  Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
   Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
   Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
   Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index d51044529a49d..f74d670df4306 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -48,9 +48,9 @@ StringRef normalizeCPUName(StringRef CPUName) {
   // accepting it. Clang has always ignored it and passed the
   // generic CPU ID to the back end.
   return StringSwitch<StringRef>(CPUName)
-      .Cases("common", "405", "generic")
-      .Cases("ppc440", "440fp", "440")
-      .Cases("630", "power3", "pwr3")
+      .Cases({"common", "405"}, "generic")
+      .Cases({"ppc440", "440fp"}, "440")
+      .Cases({"630", "power3"}, "pwr3")
       .Case("G3", "g3")
       .Case("G4", "g4")
       .Case("G4+", "g4+")
@@ -69,7 +69,7 @@ StringRef normalizeCPUName(StringRef CPUName) {
       .Case("power9", "pwr9")
       .Case("power10", "pwr10")
       .Case("power11", "pwr11")
-      .Cases("powerpc", "powerpc32", "ppc")
+      .Cases({"powerpc", "powerpc32"}, "ppc")
       .Case("powerpc64", "ppc64")
       .Case("powerpc64le", "ppc64le")
       .Default(CPUName);
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index f08a0c0ddd680..94ae64c6d3eed 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -14,7 +14,6 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
-#include <array>
 #include <atomic>
 #include <optional>
 #include <string>
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index d765d9ccb284d..d7359234b02f7 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -208,7 +208,7 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) {
   return Ret;
 }
-static std::string computePowerDataLayout(const Triple &T) {
+static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) {
   bool is64Bit = T.isPPC64();
   std::string Ret;
@@ -228,7 +228,8 @@ static std::string computePowerDataLayout(const Triple &T) {
   // If the target ABI uses function descriptors, then the alignment of function
   // pointers depends on the alignment used to emit the descriptor. Otherwise,
   // function pointers are aligned to 32 bits because the instructions must be.
-  if ((T.getArch() == Triple::ppc64 && !T.isPPC64ELFv2ABI())) {
+  if ((T.getArch() == Triple::ppc64 &&
+       (!T.isPPC64ELFv2ABI() && ABIName != "elfv2"))) {
     Ret += "-Fi64";
   } else if (T.isOSAIX()) {
     Ret += is64Bit ? "-Fi64" : "-Fi32";
@@ -573,7 +574,7 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
   case Triple::ppcle:
   case Triple::ppc64:
   case Triple::ppc64le:
-    return computePowerDataLayout(*this);
+    return computePowerDataLayout(*this, ABIName);
   case Triple::r600:
   case Triple::amdgcn:
     return computeAMDDataLayout(*this);
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 975a271cfd6dc..96bef0e574a45 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -174,8 +174,8 @@ constexpr GPUInfo AMDGCNGPUs[] = {
   {{"gfx1153"}, {"gfx1153"}, GK_GFX1153, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
   {{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
   {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
-  {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
-  {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+  {{"gfx1250"}, {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
+  {{"gfx1251"}, {"gfx1251"}, GK_GFX1251, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK_ALWAYS},
   {{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
   {{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index b13c795c1649c..37e8ad986aa55 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -143,7 +143,7 @@ constexpr FeatureBitset FeaturesDiamondRapids =
     FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
     FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
     FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS |
-    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32 | FeatureAMX_TRANSPOSE;
+    FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
 // Intel Atom processors.
 // Bonnell has feature parity with Core2 and adds MOVBE.
@@ -615,7 +615,6 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
-constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
     FeatureAMX_TILE | FeatureAVX10_2;
diff --git a/llvm/lib/TargetParser/XtensaTargetParser.cpp b/llvm/lib/TargetParser/XtensaTargetParser.cpp
index 25725f2688cf3..208722ae06037 100644
--- a/llvm/lib/TargetParser/XtensaTargetParser.cpp
+++ b/llvm/lib/TargetParser/XtensaTargetParser.cpp
@@ -13,6 +13,7 @@
 #include "llvm/TargetParser/XtensaTargetParser.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include <vector>
 namespace llvm {
diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
index cda07e81faf1e..f55bc9c1a28c2 100644
--- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
+++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
@@ -32,7 +32,7 @@ using namespace llvm::MachO;
 using namespace llvm::MachO::DylibReader;
 using TripleVec = std::vector<Triple>;
-static typename TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
+static TripleVec::iterator emplace(TripleVec &Container, Triple &&T) {
   auto I = partition_point(Container, [=](const Triple &CT) {
     return std::forward_as_tuple(CT.getArch(), CT.getOS(), CT.getEnvironment()) <
diff --git a/llvm/lib/TextAPI/RecordVisitor.cpp b/llvm/lib/TextAPI/RecordVisitor.cpp
index d333b33092263..24971a70f2ddf 100644
--- a/llvm/lib/TextAPI/RecordVisitor.cpp
+++ b/llvm/lib/TextAPI/RecordVisitor.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
 using namespace llvm::MachO;
-RecordVisitor::~RecordVisitor() {}
+RecordVisitor::~RecordVisitor() = default;
 void RecordVisitor::visitObjCInterface(const ObjCInterfaceRecord &) {}
 void RecordVisitor::visitObjCCategory(const ObjCCategoryRecord &) {}
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4b2a47c..b575d76e897d2 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
       IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
   // We can't know the precise weights here, as they would depend on the value
   // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
-  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
-                                              DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
   Type *IndexTy = DL.getIndexType(Call->getType());
   SmallVector<DominatorTree::UpdateType, 8> Updates;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index e05fe28cb91f5..1e549f122b6ba 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -77,7 +77,7 @@ class BaseCloner {
       : OrigF(OrigF), Suffix(Suffix), Shape(Shape), FKind(FKind),
         Builder(OrigF.getContext()), TTI(TTI) {}
-  virtual ~BaseCloner() {}
+  virtual ~BaseCloner() = default;
   /// Create a clone for a continuation lowering.
   static Function *createClone(Function &OrigF, const Twine &Suffix,
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 50485615a9d4c..a6ac7610a2c7a 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -3619,7 +3619,7 @@ struct AAIntraFnReachabilityFunction final
       return true;
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
@@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final
 // ------------------------ Align Argument Attribute ------------------------
 namespace {
+
 static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
                                     Value &AssociatedValue, const Use *U,
                                     const Instruction *I, bool &TrackUse) {
@@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
     TrackUse = true;
     return 0;
   }
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::ptrmask: {
+      // Is it appropriate to pull attribute in initialization?
+      const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+          QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE);
+      const auto *AlignAA = A.getAAFor<AAAlign>(
+          QueryingAA, IRPosition::value(*II), DepClassTy::NONE);
+      if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) {
+        unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(),
+                                       Value::MaxAlignmentExponent);
+        Align ConstAlign(UINT64_C(1) << ShiftValue);
+        if (ConstAlign >= AlignAA->getKnownAlign())
+          return Align(1).value();
+      }
+      if (AlignAA)
+        return AlignAA->getKnownAlign().value();
+      break;
+    }
+    default:
+      break;
+    }
   MaybeAlign MA;
   if (const auto *CB = dyn_cast<CallBase>(I)) {
@@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final
   AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
       : Base(IRP, A) {}
+  ChangeStatus updateImpl(Attributor &A) override {
+    Instruction *I = getIRPosition().getCtxI();
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::ptrmask: {
+        Align Alignment;
+        bool Valid = false;
+
+        const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>(
+            *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED);
+        if (ConstVals && ConstVals->isValidState()) {
+          unsigned ShiftValue =
+              std::min(ConstVals->getAssumedMinTrailingZeros(),
+                       Value::MaxAlignmentExponent);
+          Alignment = Align(UINT64_C(1) << ShiftValue);
+          Valid = true;
+        }
+
+        const auto *AlignAA =
+            A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+                                DepClassTy::REQUIRED);
+        if (AlignAA && AlignAA->isValidState()) {
+          Alignment = std::max(AlignAA->getAssumedAlign(), Alignment);
+          Valid = true;
+        }
+
+        if (Valid)
+          return clampStateAndIndicateChange<StateType>(
+              this->getState(),
+              std::min(this->getAssumedAlign(), Alignment).value());
+        break;
+      }
+      default:
+        break;
+      }
+    }
+    return Base::updateImpl(A);
+  };
   /// See AbstractAttribute::trackStatistics()
   void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
 };
@@ -10701,7 +10762,7 @@ struct AAInterFnReachabilityFunction
     auto *NonConstThis = const_cast<AAInterFnReachabilityFunction *>(this);
     RQITy StackRQI(A, From, To, ExclusionSet, false);
-    typename RQITy::Reachable Result;
+    RQITy::Reachable Result;
     if (!NonConstThis->checkQueryCache(A, StackRQI, Result))
       return NonConstThis->isReachableImpl(A, StackRQI,
                                            /*IsTemporaryRQI=*/true);
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index aa1346d9ee56a..94663ff928a0b 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -78,7 +78,6 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
-#include <memory>
 #include <set>
 #include <string>
 #include <system_error>
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 894d83fa530b1..d35ae4730a9f3 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1034,11 +1034,11 @@ class IndexCallsiteContextGraph
 } // namespace
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
 template <>
-struct llvm::DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<CallsiteContextGraph<
     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
 template <>
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index d7eb745c81317..2a87a0f9aaa99 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -208,7 +208,7 @@ namespace KernelInfo {
 // };
 #define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 KERNEL_ENVIRONMENT_IDX(Configuration, 0)
 KERNEL_ENVIRONMENT_IDX(Ident, 1)
@@ -216,7 +216,7 @@ KERNEL_ENVIRONMENT_IDX(Ident, 1)
 #undef KERNEL_ENVIRONMENT_IDX
 #define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
-  constexpr const unsigned MEMBER##Idx = IDX;
+  constexpr unsigned MEMBER##Idx = IDX;
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
 KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
@@ -258,7 +258,7 @@ KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
 GlobalVariable *
 getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
-  constexpr const int InitKernelEnvironmentArgNo = 0;
+  constexpr int InitKernelEnvironmentArgNo = 0;
   return cast<GlobalVariable>(
       KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
           ->stripPointerCasts());
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index e39e311dd795f..8e76b79cc8f87 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -83,7 +83,6 @@
 #include <cstdint>
 #include <functional>
 #include <limits>
-#include <map>
 #include <memory>
 #include <queue>
 #include <string>
@@ -2293,7 +2292,6 @@ bool SampleProfileLoader::runOnFunction(Function &F,
   // count value.
   if (!F.getEntryCount())
     F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
-  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
                   .getManager();
   ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index 70b8614826826..b9fb7a3ae4b5b 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -18,6 +18,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/LongestCommonSequence.h"
+#include <unordered_set>
+
 using namespace llvm;
 using namespace sampleprof;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 3ddf182149e57..cbaff294819a2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3997,6 +3997,27 @@ static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I,
   return nullptr;
 }
+/// Fold select(X >s 0, 0, -X) | smax(X, 0) --> abs(X)
+///      select(X <s 0, -X, 0) | smax(X, 0) --> abs(X)
+static Value *FoldOrOfSelectSmaxToAbs(BinaryOperator &I,
+                                      InstCombiner::BuilderTy &Builder) {
+  Value *X;
+  Value *Sel;
+  if (match(&I,
+            m_c_Or(m_Value(Sel), m_OneUse(m_SMax(m_Value(X), m_ZeroInt()))))) {
+    auto NegX = m_Neg(m_Specific(X));
+    if (match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(X),
+                                           m_ZeroInt()),
+                            m_ZeroInt(), NegX)) ||
+        match(Sel, m_Select(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(X),
+                                           m_ZeroInt()),
+                            NegX, m_ZeroInt())))
+      return Builder.CreateBinaryIntrinsic(Intrinsic::abs, X,
+                                           Builder.getFalse());
+  }
+  return nullptr;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -4545,6 +4566,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + if (Value *Res = FoldOrOfSelectSmaxToAbs(I, Builder)) + return replaceInstUsesWith(I, Res); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 92fca90ddb88a..8e4edefec42fd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" @@ -4091,6 +4092,70 @@ Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) { return visitCallBase(CBI); } +static Value *optimizeModularFormat(CallInst *CI, IRBuilderBase &B) { + if (!CI->hasFnAttr("modular-format")) + return nullptr; + + SmallVector<StringRef> Args( + llvm::split(CI->getFnAttr("modular-format").getValueAsString(), ',')); + // TODO: Make use of the first two arguments + unsigned FirstArgIdx; + [[maybe_unused]] bool Error; + Error = Args[2].getAsInteger(10, FirstArgIdx); + assert(!Error && "invalid first arg index"); + --FirstArgIdx; + StringRef FnName = Args[3]; + StringRef ImplName = Args[4]; + ArrayRef<StringRef> AllAspects = ArrayRef<StringRef>(Args).drop_front(5); + + if (AllAspects.empty()) + return nullptr; + + SmallVector<StringRef> NeededAspects; + for (StringRef Aspect : AllAspects) { + if (Aspect == "float") { + if (llvm::any_of( + llvm::make_range(std::next(CI->arg_begin(), FirstArgIdx), + CI->arg_end()), + [](Value *V) { return V->getType()->isFloatingPointTy(); })) + NeededAspects.push_back("float"); + } else { + // Unknown aspects are always considered to be needed. + NeededAspects.push_back(Aspect); + } + } + + if (NeededAspects.size() == AllAspects.size()) + return nullptr; + + Module *M = CI->getModule(); + LLVMContext &Ctx = M->getContext(); + Function *Callee = CI->getCalledFunction(); + FunctionCallee ModularFn = M->getOrInsertFunction( + FnName, Callee->getFunctionType(), + Callee->getAttributes().removeFnAttribute(Ctx, "modular-format")); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(ModularFn); + New->removeFnAttr("modular-format"); + B.Insert(New); + + const auto ReferenceAspect = [&](StringRef Aspect) { + SmallString<20> Name = ImplName; + Name += '_'; + Name += Aspect; + Function *RelocNoneFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::reloc_none); + B.CreateCall(RelocNoneFn, + {MetadataAsValue::get(Ctx, MDString::get(Ctx, Name))}); + }; + + llvm::sort(NeededAspects); + for (StringRef Request : NeededAspects) + ReferenceAspect(Request); + + return New; +} + Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; @@ -4112,6 +4177,10 @@ Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); } + if (Value *With = optimizeModularFormat(CI, Builder)) { + ++NumSimplified; + return CI->use_empty() ? 
CI : replaceInstUsesWith(*CI, With); + } return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index d85e4f7590197..9bdd8cb71f7f3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -479,7 +479,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr); - setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F); return Sel; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..5bc9c28bed141 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1758,6 +1758,9 @@ static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI, m_Specific(Op), m_Value(V))) && isGuaranteedNotToBeUndefOrPoison(V)) { // Pass + } else if (match(Op, m_ZExt(m_Specific(SI->getCondition())))) { + V = IsTrueArm ? 
ConstantInt::get(Op->getType(), 1) + : ConstantInt::getNullValue(Op->getType()); } else { V = Op; } @@ -2261,11 +2264,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) @@ -5624,8 +5627,15 @@ bool InstCombinerImpl::run() { for (Use &U : I->uses()) { User *User = U.getUser(); - if (User->isDroppable()) - continue; + if (User->isDroppable()) { + // Do not sink if there are dereferenceable assumes that would be + // removed. + auto *II = dyn_cast<IntrinsicInst>(User); + if (!II || II->getIntrinsicID() != Intrinsic::assume || + !II->getOperandBundle("dereferenceable")) + continue; + } + if (NumUsers > MaxSinkNumUsers) return std::nullopt; diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 0688bc7ac08eb..726d94b27a7f2 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1992,6 +1992,8 @@ void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond, // Use logical and to avoid propagating poison from later conditions. MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond); + setExplicitlyUnknownBranchWeightsIfProfiled( + *cast<Instruction>(MergedCondition), DEBUG_TYPE); } void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) { diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index b5548d4f24a2f..8c8d16a6e3d25 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1944,6 +1944,10 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::ProtectedVisibility); + } NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp index 2f256dfd7b0e2..b72d41a748857 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp @@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) { return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column); } +static AllocationType getAllocType(const AllocationInfo *AllocInfo) { + return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), + AllocInfo->Info.getAllocCount(), + AllocInfo->Info.getTotalLifetime()); +} + static AllocationType addCallStack(CallStackTrie &AllocTrie, const AllocationInfo *AllocInfo, uint64_t FullStackId) { SmallVector<uint64_t> StackIds; for (const auto &StackFrame : AllocInfo->CallStack) StackIds.push_back(computeStackId(StackFrame)); - auto AllocType =
getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(), - AllocInfo->Info.getAllocCount(), - AllocInfo->Info.getTotalLifetime()); + auto AllocType = getAllocType(AllocInfo); std::vector<ContextTotalSize> ContextSizeInfo; if (recordContextSizeInfoForAnalysis()) { auto TotalSize = AllocInfo->Info.getTotalSize(); @@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI, const std::set<const AllocationInfo *> &AllocInfoSet, std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo> &FullStackIdToAllocMatchInfo) { + // TODO: Remove this once the profile creation logic deduplicates contexts + // that are the same other than the IsInlineFrame bool. Until then, keep the + // largest. + DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo; + for (auto *AllocInfo : AllocInfoSet) { + auto FullStackId = computeFullStackId(AllocInfo->CallStack); + auto [It, Inserted] = + UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo}); + // If inserted entry, done. + if (Inserted) + continue; + // Keep the larger one, or the noncold one if they are the same size. + auto CurSize = It->second->Info.getTotalSize(); + auto NewSize = AllocInfo->Info.getTotalSize(); + if ((CurSize > NewSize) || + (CurSize == NewSize && + getAllocType(AllocInfo) != AllocationType::NotCold)) + continue; + It->second = AllocInfo; + } // We may match this instruction's location list to multiple MIB // contexts. Add them to a Trie specialized for trimming the contexts to // the minimal needed to disambiguate contexts with unique behavior. CallStackTrie AllocTrie(&ORE, MaxColdSize); uint64_t TotalSize = 0; uint64_t TotalColdSize = 0; - for (auto *AllocInfo : AllocInfoSet) { + for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) { // Check the full inlined call stack against this one. // If we found and thus matched all frames on the call, include // this MIB. if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; - uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis()) - FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); if (AllocType == AllocationType::Cold) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). 
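As a minimal standalone illustration of the distinction this comment draws (editorial, not part of the patch): an integer multiplication by a known zero is fully defined regardless of the other operand, while a floating-point zero cannot purify a NaN operand. The handler itself, now taking the ZeroPurifies flag, follows.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
      // Integer: 0 * x == 0 for every bit pattern x, so a known-zero
      // operand makes the product independent of the other operand.
      for (uint32_t x : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
        assert(0u * x == 0u);
      // Floating point: 0.0 * NaN == NaN, so an uninitialized operand
      // must keep the result poisoned.
      assert(std::isnan(0.0 * std::nan("")));
      return 0;
    }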
void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). - Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
// (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 80e77e099c695..a2fad021e0480 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -161,7 +161,7 @@ template <char NsanTypeId> class ShadowTypeConfigImpl : public ShadowTypeConfig { public: char getNsanTypeId() const override { return NsanTypeId; } - static constexpr const char kNsanTypeId = NsanTypeId; + static constexpr char kNsanTypeId = NsanTypeId; }; // `double` (`d`) shadow type. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af53fa0bae468..02f06bebb8f0d 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -734,7 +734,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); // Reserve bit 60-63 for other information purpose. 
- FunctionHash &= 0x0FFFFFFFFFFFFFFF; + FunctionHash &= NamedInstrProfRecord::FUNC_HASH_MASK; if (IsCS) NamedInstrProfRecord::setCSFlagInHash(FunctionHash); LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n" diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp index 78d4a57ecea87..87eba5f2c5242 100644 --- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp @@ -58,6 +58,18 @@ static cl::opt<bool> cl::desc("Writes always set the type"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClOutlineInstrumentation( + "tysan-outline-instrumentation", + cl::desc("Uses function calls for all TySan instrumentation, reducing " + "ELF size"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClVerifyOutlinedInstrumentation( + "tysan-verify-outlined-instrumentation", + cl::desc("Check types twice with both inlined instrumentation and " + "function calls. This verifies that they behave the same."), + cl::Hidden, cl::init(false)); + STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses"); namespace { @@ -105,12 +117,16 @@ struct TypeSanitizer { Regex AnonNameRegex; Type *IntptrTy; uint64_t PtrShift; - IntegerType *OrdTy; + IntegerType *OrdTy, *U64Ty; /// Callbacks to run-time library are computed in initializeCallbacks. FunctionCallee TysanCheck; FunctionCallee TysanCtorFunction; + FunctionCallee TysanIntrumentMemInst; + FunctionCallee TysanInstrumentWithShadowUpdate; + FunctionCallee TysanSetShadowType; + /// Callback to set types for globals. Function *TysanGlobalsSetTypeFunction; }; @@ -130,6 +146,8 @@ TypeSanitizer::TypeSanitizer(Module &M) void TypeSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(M.getContext()); OrdTy = IRB.getInt32Ty(); + U64Ty = IRB.getInt64Ty(); + Type *BoolType = IRB.getInt1Ty(); AttributeList Attr; Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind); @@ -144,6 +162,30 @@ void TypeSanitizer::initializeCallbacks(Module &M) { TysanCtorFunction = M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy()); + + TysanIntrumentMemInst = M.getOrInsertFunction( + "__tysan_instrument_mem_inst", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer of data to write + U64Ty, // Size of the data in bytes + BoolType // Do we need to call memmove + ); + + TysanInstrumentWithShadowUpdate = M.getOrInsertFunction( + "__tysan_instrument_with_shadow_update", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer to data to be read + IRB.getPtrTy(), // Pointer to type descriptor + BoolType, // Do we need to type check this + U64Ty, // Size of data we access in bytes + OrdTy // Flags + ); + + TysanSetShadowType = M.getOrInsertFunction( + "__tysan_set_shadow_type", Attr, IRB.getVoidTy(), + IRB.getPtrTy(), // Pointer of data to be written to + IRB.getPtrTy(), // Pointer to the new type descriptor + U64Ty // Size of data we access in bytes + ); } void TypeSanitizer::instrumentGlobals(Module &M) { @@ -587,6 +629,29 @@ bool TypeSanitizer::instrumentWithShadowUpdate( Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy()); + if (ClOutlineInstrumentation) { + if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) { + // We need to check the type here. If the type is unknown, then the read + // sets the type. If the type is known, then it is checked.
If the type + // doesn't match, then we call the runtime type check (which may yet + // determine that the mismatch is okay). + + Constant *Flags = + ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1)); + + IRB.CreateCall(TysanInstrumentWithShadowUpdate, + {Ptr, TD, + SanitizeFunction ? IRB.getTrue() : IRB.getFalse(), + IRB.getInt64(AccessSize), Flags}); + } else if (ForceSetType || IsWrite) { + // In the mode where writes always set the type, for a write (which does + // not also read), we just set the type. + IRB.CreateCall(TysanSetShadowType, {Ptr, TD, IRB.getInt64(AccessSize)}); + } + + return true; + } + Value *ShadowDataInt = convertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift, ShadowBase, AppMemMask); Type *Int8PtrPtrTy = PointerType::get(IRB.getContext(), 0); @@ -838,37 +903,47 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase, } } - if (!ShadowBase) - ShadowBase = getShadowBase(*F); - if (!AppMemMask) - AppMemMask = getAppMemMask(*F); + if (ClOutlineInstrumentation) { + if (!Src) + Src = ConstantPointerNull::get(IRB.getPtrTy()); - Value *ShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); - - if (!Src) { - IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift), - Align(1ull << PtrShift)); + IRB.CreateCall( + TysanIntrumentMemInst, + {Dest, Src, Size, NeedsMemMove ? IRB.getTrue() : IRB.getFalse()}); return true; - } - - Value *SrcShadowDataInt = IRB.CreateAdd( - IRB.CreateShl( - IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), - PtrShift), - ShadowBase); - Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); - - if (NeedsMemMove) { - IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); } else { - IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, - Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + if (!ShadowBase) + ShadowBase = getShadowBase(*F); + if (!AppMemMask) + AppMemMask = getAppMemMask(*F); + + Value *ShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy()); + + if (!Src) { + IRB.CreateMemSet(ShadowData, IRB.getInt8(0), + IRB.CreateShl(Size, PtrShift), Align(1ull << PtrShift)); + return true; + } + + Value *SrcShadowDataInt = IRB.CreateAdd( + IRB.CreateShl( + IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask), + PtrShift), + ShadowBase); + Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy()); + + if (NeedsMemMove) { + IRB.CreateMemMove(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } else { + IRB.CreateMemCpy(ShadowData, Align(1ull << PtrShift), SrcShadowData, + Align(1ull << PtrShift), IRB.CreateShl(Size, PtrShift)); + } } return true; @@ -890,6 +965,16 @@ PreservedAnalyses TypeSanitizerPass::run(Module &M, for (Function &F : M) { const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F); TySan.sanitizeFunction(F, TLI); + if (ClVerifyOutlinedInstrumentation && ClOutlineInstrumentation) { + // Outlined instrumentation is a new option, and so this exists to + // verify there is no difference in behaviour between the options. 
+ // If the outlined instrumentation triggers a verification failure + // when the original inlined instrumentation does not, or vice versa, + // then there is a discrepancy which should be investigated. + ClOutlineInstrumentation = false; + TySan.sanitizeFunction(F, TLI); + ClOutlineInstrumentation = true; + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index 89980d54ee897..4a7144fe6c77a 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -78,11 +78,16 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { SmallVector<OperandBundleDef> KeptBundles; unsigned NumBundles = Assume->getNumOperandBundles(); for (unsigned I = 0; I != NumBundles; ++I) { - auto IsDead = [](OperandBundleUse Bundle) { + auto IsDead = [&](OperandBundleUse Bundle) { // "ignore" operand bundles are always dead. if (Bundle.getTagName() == "ignore") return true; + // "dereferenceable" operand bundles are only dropped if requested + // (e.g., after loop vectorization has run). + if (Bundle.getTagName() == "dereferenceable") + return DropDereferenceable; + // Bundles without arguments do not affect any specific values. // Always keep them for now. if (Bundle.Inputs.empty()) @@ -122,7 +127,8 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. - if (match(Cond, m_Intrinsic<Intrinsic::type_test>())) + if (match(Cond, m_Intrinsic<Intrinsic::type_test>()) || + match(Cond, m_Intrinsic<Intrinsic::public_type_test>())) continue; SmallVector<Value *> Affected; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index a06f8325c90bf..d564e32e26526 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -514,7 +514,7 @@ class ValueTable { class GVNSink { public: - GVNSink() {} + GVNSink() = default; bool run(Function &F) { LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 4ba4ba3850e58..eab1d4975ac96 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -196,6 +196,18 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } +// Ensure we stay within the bounds of fp values that can be represented as +// integers without gaps, which are 2^24 and 2^53 for IEEE-754 single and double +// precision respectively (both on negative and positive side). +static bool isRepresentableAsExactInteger(ConstantFP *FPVal, int64_t IntVal) { + const auto &InitValueFltSema = FPVal->getValueAPF().getSemantics(); + if (!APFloat::isIEEELikeFP(InitValueFltSema)) + return false; + + return isUIntN(APFloat::semanticsPrecision(InitValueFltSema), + AbsoluteValue(IntVal)); +} + /// If the loop has floating induction variable then insert corresponding /// integer induction variable if possible.
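The 2^24 and 2^53 bounds used by the new check correspond to the 24-bit and 53-bit significands of IEEE-754 single and double precision: past those magnitudes, adjacent representable values are more than 1 apart, so a floating-point counter would skip integers that the rewritten integer IV would visit. A standalone illustration (editorial, not part of the patch):

    #include <cassert>

    int main() {
      // At 2^24 the spacing between adjacent floats becomes 2, so
      // 2^24 + 1 rounds back to 2^24 under round-to-nearest-even.
      float f = 16777216.0f; // 2^24
      assert(f + 1.0f == f);
      // Doubles hit the same wall at 2^53.
      double d = 9007199254740992.0; // 2^53
      assert(d + 1.0 == d);
      return 0;
    }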
/// For example, @@ -212,7 +224,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; - if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) + if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue) || + !isRepresentableAsExactInteger(InitValueVal, InitValue)) return false; // Check IV increment. Reject this PN if increment operation is not @@ -262,7 +275,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; if (ExitValueVal == nullptr || - !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) + !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue) || + !isRepresentableAsExactInteger(ExitValueVal, ExitValue)) return false; // Find new predicate for integer comparison. diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1099aa335e4c5..0c8b9043fcbbb 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -65,7 +65,6 @@ #include <cassert> #include <list> #include <tuple> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9e17020..9ffa602416b05 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. 
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 019536ca91ae0..1730ec067c2cc 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -105,6 +106,7 @@ STATISTIC( STATISTIC(NumShiftUntilZero, "Number of uncountable loops recognized as 'shift until zero' idiom"); +namespace llvm { bool DisableLIRP::All; static cl::opt<bool, true> DisableLIRPAll("disable-" DEBUG_TYPE "-all", @@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic( cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; + +} // namespace llvm + namespace { class LoopIdiomRecognize { @@ -297,8 +303,6 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - std::optional<PolynomialInfo> HR; - LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -3199,7 +3203,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { // The loop trip count check. auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, CurLoop->getName() + ".ivcheck"); - Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1)) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. + setBranchWeights(*BI, BranchWeights, + /*IsExpected=*/false); + } + LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. @@ -3368,10 +3386,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, /// %start = <...> /// %extraoffset = <...> /// <...> -/// br label %for.cond +/// br label %loop /// /// loop: -/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] +/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] /// %nbits = add nsw i8 %iv, %extraoffset /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 @@ -3533,7 +3551,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() { // The loop terminator. Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); - Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + SmallVector<uint32_t> BranchWeights; + const bool HasBranchWeights = + !ProfcheckDisableMetadataFixes && + extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights); + + auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); + if (HasBranchWeights) { + if (InvertedCond) + std::swap(BranchWeights[0], BranchWeights[1]); + // We're not changing the loop profile, so we can reuse the original loop's + // profile. 
+ setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false); + } LoopHeaderBB->getTerminator()->eraseFromParent(); // Populate the IV PHI. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index b9546c5fa236b..e902b71776973 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl { DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); ++NumLoopExitsDeleted; } + // We don't really need to add branch weights to DummySwitch, because all + // but one branches are just a temporary artifact - see the comment on top + // of this function. But, it's easy to estimate the weights, and it helps + // maintain a property of the overall compiler - that the branch weights + // don't "just get dropped" accidentally (i.e. 
profcheck) + if (DummySwitch->getParent()->getParent()->hasProfileData()) { + SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases()); + // default. 100% probability, the rest are dead. + DummyBranchWeights[0] = 1; + setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false); + } assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?"); if (Loop *OuterLoop = LI.getLoopFor(Preheader)) { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 3487e812a68a3..7e70ba274f161 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -245,11 +245,14 @@ raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) { } // namespace -static bool isUniformShape(Value *V) { +static bool isShapePreserving(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return true; + if (isa<SelectInst>(I)) + return true; + if (I->isBinaryOp()) return true; @@ -300,6 +303,16 @@ static bool isUniformShape(Value *V) { } } +/// Return an iterator over the operands of \p I that should share shape +/// information with \p I. +static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) { + assert(isShapePreserving(I) && + "Can't retrieve shaped operands for an instruction that does not " + "preserve shape information"); + auto Ops = I->operands(); + return isa<SelectInst>(I) ? drop_begin(Ops) : Ops; +} + /// Return the ShapeInfo for the result of \p I, if it can be determined. static std::optional<ShapeInfo> computeShapeInfoForInst(Instruction *I, @@ -329,9 +342,8 @@ computeShapeInfoForInst(Instruction *I, return OpShape->second; } - if (isUniformShape(I) || isa<SelectInst>(I)) { - auto Ops = I->operands(); - auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops; + if (isShapePreserving(I)) { + auto ShapedOps = getShapedOperandsForInst(I); // Find the first operand that has a known shape and use that. for (auto &Op : ShapedOps) { auto OpShape = ShapeMap.find(Op.get()); @@ -710,10 +722,9 @@ class LowerMatrixIntrinsics { case Intrinsic::matrix_column_major_store: return true; default: - return isUniformShape(II); + break; } - return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) || - isa<SelectInst>(V); + return isShapePreserving(V) || isa<StoreInst>(V) || isa<LoadInst>(V); } /// Propagate the shape information of instructions to their users. @@ -800,9 +811,8 @@ class LowerMatrixIntrinsics { } else if (isa<StoreInst>(V)) { // Nothing to do. We forward-propagated to this so we would just // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V) || isa<SelectInst>(V)) { - auto Ops = cast<Instruction>(V)->operands(); - auto ShapedOps = isa<SelectInst>(V) ?
drop_begin(Ops) : Ops; + } else if (isShapePreserving(V)) { + auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V)); // Propagate to all operands. ShapeInfo Shape = ShapeMap[V]; for (Use &U : ShapedOps) { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879f4d47e..0f3e66476f055 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -329,15 +330,14 @@ static void buildPartialUnswitchConditionalBranch( HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) : nullptr); if (!HasBranchWeights) - setExplicitlyUnknownBranchWeightsIfProfiled( - *BR, *BR->getParent()->getParent(), DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Copy a set of loop invariant values, and conditionally branch on them. static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +377,18 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2525,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, @@ -2820,9 +2830,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? 
MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3186,10 +3201,14 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 0f3978f56045e..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -143,8 +143,8 @@ struct SubGraphTraits { class WrappedSuccIterator : public iterator_adaptor_base< WrappedSuccIterator, BaseSuccIterator, - typename std::iterator_traits<BaseSuccIterator>::iterator_category, - NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + std::iterator_traits<BaseSuccIterator>::iterator_category, NodeRef, + std::ptrdiff_t, NodeRef *, NodeRef> { SmallDenseSet<RegionNode *> *Nodes; public: @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended use cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
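Viewed on its own (a hypothetical refactoring for illustration, not the patch's code), the guard added below amounts to a small predicate over the region entry:

    // Sketch: structurization is skipped for the top-level region and for
    // regions entered through a callbr terminator, per the comment above.
    static bool shouldSkipRegion(const Region *R) {
      if (R->isTopLevelRegion())
        return true;
      return isa<CallBrInst>(R->getEntry()->getTerminator());
    }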
+ if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 42b1fdf17f389..8aa8aa2c60800 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -39,36 +39,36 @@ using namespace llvm; STATISTIC(NumBroken, "Number of blocks inserted"); namespace { - struct BreakCriticalEdges : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BreakCriticalEdges() : FunctionPass(ID) { - initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); - } +struct BreakCriticalEdges : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BreakCriticalEdges() : FunctionPass(ID) { + initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + bool runOnFunction(Function &F) override { + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); - auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; + auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>(); + auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - unsigned N = - SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); - NumBroken += N; - return N > 0; - } + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + unsigned N = SplitAllCriticalEdges( + F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT)); + NumBroken += N; + return N > 0; + } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - // No loop canonicalization guarantees are broken by this pass. - AU.addPreservedID(LoopSimplifyID); - } - }; -} + // No loop canonicalization guarantees are broken by this pass. + AU.addPreservedID(LoopSimplifyID); + } +}; +} // namespace char BreakCriticalEdges::ID = 0; INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", @@ -76,6 +76,7 @@ INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", // Publicly exposed interface to pass... 
char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; + FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); } diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 573a78150ff3d..02b73e85d783f 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1283,6 +1283,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_ilogbl: case LibFunc_logf: case LibFunc_logl: + case LibFunc_nextafter: + case LibFunc_nextafterf: + case LibFunc_nextafterl: + case LibFunc_nexttoward: + case LibFunc_nexttowardf: + case LibFunc_nexttowardl: case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 7343c7913ecd0..9f6d89e97180f 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -40,22 +40,22 @@ using namespace llvm; namespace { - struct QuotRemPair { - Value *Quotient; - Value *Remainder; - - QuotRemPair(Value *InQuotient, Value *InRemainder) - : Quotient(InQuotient), Remainder(InRemainder) {} - }; - - /// A quotient and remainder, plus a BB from which they logically "originate". - /// If you use Quotient or Remainder in a Phi node, you should use BB as its - /// corresponding predecessor. - struct QuotRemWithBB { - BasicBlock *BB = nullptr; - Value *Quotient = nullptr; - Value *Remainder = nullptr; - }; +struct QuotRemPair { + Value *Quotient; + Value *Remainder; + + QuotRemPair(Value *InQuotient, Value *InRemainder) + : Quotient(InQuotient), Remainder(InRemainder) {} +}; + +/// A quotient and remainder, plus a BB from which they logically "originate". +/// If you use Quotient or Remainder in a Phi node, you should use BB as its +/// corresponding predecessor. +struct QuotRemWithBB { + BasicBlock *BB = nullptr; + Value *Quotient = nullptr; + Value *Remainder = nullptr; +}; using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; using BypassWidthsTy = DenseMap<unsigned, unsigned>; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..0ca1fa2425a53 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -63,7 +63,6 @@ #include <cstdint> #include <iterator> #include <map> -#include <utility> #include <vector> using namespace llvm; @@ -933,6 +932,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..dd8706cfb2855 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); - for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { - if (Impl == RTLIB::Unsupported) + for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) { + if (!RTLCI.isAvailable(Impl)) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? 
DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 61ffb49a8c010..8da6a980ca6f5 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -378,7 +378,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (P != Preheader) BackedgeBlocks.push_back(P); } - // Create and insert the new backedge block... + // Create and insert the new backedge block. BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); @@ -737,39 +737,39 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { - struct LoopSimplify : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - LoopSimplify() : FunctionPass(ID) { - initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); - } +struct LoopSimplify : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + LoopSimplify() : FunctionPass(ID) { + initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); - // We need loop information to identify the loops... - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + // We need loop information to identify the loops. + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreservedID(LCSSAID); - AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. - AU.addPreserved<BranchProbabilityInfoWrapperPass>(); - AU.addPreserved<MemorySSAWrapperPass>(); - } + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreservedID(LCSSAID); + AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + AU.addPreserved<BranchProbabilityInfoWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); + } - /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. - void verifyAnalysis() const override; - }; -} + /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. + void verifyAnalysis() const override; +}; +} // namespace char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", @@ -780,12 +780,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) -// Publicly exposed interface to pass... +// Publicly exposed interface to pass. 
char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// runOnFunction - Run down all loops in the CFG (recursively, but we could do -/// it in any convenient order) inserting preheaders... +/// it in any convenient order) inserting preheaders. /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4fe736ac29b0a..5b94897f4342f 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -66,7 +66,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <assert.h> #include <numeric> -#include <type_traits> #include <vector> namespace llvm { @@ -499,9 +498,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); - unsigned EstimatedLoopInvocationWeight = 0; std::optional<unsigned> OriginalTripCount = - llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight); + llvm::getLoopEstimatedTripCount(L); + BranchProbability OriginalLoopProb = llvm::getLoopProbability(L); // Effectively "DCE" unrolled iterations that are beyond the max tripcount // and will never be executed. @@ -592,11 +591,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, : isEpilogProfitable(L); if (ULO.Runtime && - !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, - EpilogProfitability, ULO.UnrollRemainder, - ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, - PreserveLCSSA, ULO.SCEVExpansionBudget, - ULO.RuntimeUnrollMultiExit, RemainderLoop)) { + !UnrollRuntimeLoopRemainder( + L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, + ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI, + PreserveLCSSA, ULO.SCEVExpansionBudget, ULO.RuntimeUnrollMultiExit, + RemainderLoop, OriginalTripCount, OriginalLoopProb)) { if (ULO.Force) ULO.Runtime = false; else { @@ -1130,11 +1129,46 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LI->erase(L); // We shouldn't try to use `L` anymore. L = nullptr; - } else if (OriginalTripCount) { - // Update the trip count. Note that the remainder has already logic - // computing it in `UnrollRuntimeLoopRemainder`. - setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count, - EstimatedLoopInvocationWeight); + } else { + // Update metadata for the loop's branch weights and estimated trip count: + // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch + // weights, latch branch weights, and estimated trip count of the + // remainder loop it creates. It also sets the branch weights for the + // unrolled loop guard it creates. The branch weights for the unrolled + // loop latch are adjusted below. FIXME: Handle prologue loops. + // - Otherwise, if unrolled loop iteration latches become unconditional, + // branch weights are adjusted above. FIXME: Actually handle such + // unconditional latches. + // - Otherwise, the original loop's branch weights are correct for the + // unrolled loop, so do not adjust them. + // - In all cases, the unrolled loop's estimated trip count is set below. + // + // As an example of the last case, consider what happens if the unroll count + // is 4 for a loop with an estimated trip count of 10 when we do not create + // a remainder loop and all iterations' latches remain conditional. 
Each + // unrolled iteration's latch still has the same probability of exiting the + // loop as it did when in the original loop, and thus it should still have + // the same branch weights. Each unrolled iteration's non-zero probability + // of exiting already appropriately reduces the probability of reaching the + // remaining iterations just as it did in the original loop. Trying to also + // adjust the branch weights of the final unrolled iteration's latch (i.e., + // the backedge for the unrolled loop as a whole) to reflect its new trip + // count of 3 will erroneously further reduce its block frequencies. + // However, in case an analysis later needs to estimate the trip count of + // the unrolled loop as a whole without considering the branch weights for + // each unrolled iteration's latch within it, we store the new trip count as + // separate metadata. + if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) { + // Where p is always the probability of executing at least 1 more + // iteration, the probability for at least n more iterations is p^n. + setLoopProbability(L, OriginalLoopProb.pow(ULO.Count)); + } + if (OriginalTripCount) { + unsigned NewTripCount = *OriginalTripCount / ULO.Count; + if (!ULO.Runtime && *OriginalTripCount % ULO.Count) + ++NewTripCount; + setLoopEstimatedTripCount(L, NewTripCount); + } } // LoopInfo should not be valid, confirm that. diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ca90bb65f5708..1e614bd29ee6e 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include <assert.h> #include <memory> -#include <type_traits> #include <vector> using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 7a2b8da6ffd21..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cmath> using namespace llvm; @@ -195,6 +196,42 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, } } +/// Assume, due to our position in the remainder loop or its guard, anywhere +/// from 0 to \p N more iterations can possibly execute. Among such cases in +/// the original loop (with loop probability \p OriginalLoopProb), what is the +/// probability of executing at least one more iteration? +static BranchProbability +probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). 
In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + + // Each of these variables holds the original loop's probability that the + // number of iterations it will execute is some m in the specified range. + BranchProbability ProbOne = OriginalLoopProb; // 1 <= m + BranchProbability ProbTooMany = ProbOne.pow(N + 1); // N + 1 <= m + BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N + BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N + return ProbOneNotTooMany / ProbNotTooMany; +} + /// Connect the unrolling epilog code to the original loop. /// The unrolling epilog code contains code to execute the /// 'extra' iterations if the run-time trip count modulo the @@ -221,7 +258,8 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE, - unsigned Count, AssumptionCache &AC) { + unsigned Count, AssumptionCache &AC, + BranchProbability OriginalLoopProb) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); @@ -332,12 +370,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, PreserveLCSSA); // Add the branch to the exit block (around the epilog loop) MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if (OriginalLoopProb.isUnknown() && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume equal distribution in interval [0, Count). MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(1, Count - 1); } - B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + BranchInst *RemainderLoopGuard = + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights); + if (!OriginalLoopProb.isUnknown()) { + setBranchProbability(RemainderLoopGuard, + probOfNextInRemainder(OriginalLoopProb, Count - 1), + /*ForFirstTarget=*/true); + } InsertPt->eraseFromParent(); if (DT) { auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit); @@ -357,14 +402,15 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// InsertTop should be new preheader, InsertBot new loop exit. /// Returns the new cloned loop that is created. -static Loop * -CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, - const bool UnrollRemainder, - BasicBlock *InsertTop, - BasicBlock *InsertBot, BasicBlock *Preheader, +static Loop *CloneLoopBlocks(Loop *L, Value *NewIter, + const bool UseEpilogRemainder, + const bool UnrollRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI, unsigned Count) { + DominatorTree *DT, LoopInfo *LI, unsigned Count, + std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { StringRef suffix = UseEpilogRemainder ? 
"epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,7 +465,8 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next"); Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp"); MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*LatchBR)) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*LatchBR)) { uint32_t ExitWeight; uint32_t BackEdgeWeight; if (Count >= 3) { @@ -437,7 +484,29 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, MDBuilder MDB(Builder.getContext()); BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight); } - Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + BranchInst *RemainderLoopLatch = + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // Compute the total frequency of the original loop body from the + // remainder iterations. Once we've reached them, the first of them + // always executes, so its frequency and probability are 1. + double FreqRemIters = 1; + if (Count > 2) { + BranchProbability ProbReaching = BranchProbability::getOne(); + for (unsigned N = Count - 2; N >= 1; --N) { + ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N); + FreqRemIters += double(ProbReaching.getNumerator()) / + ProbReaching.getDenominator(); + } + } + // Solve for the loop probability that would produce that frequency. + // Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters. + double ProbDouble = 1 - 1 / FreqRemIters; + BranchProbability Prob = BranchProbability::getBranchProbability( + std::round(ProbDouble * BranchProbability::getDenominator()), + BranchProbability::getDenominator()); + setBranchProbability(RemainderLoopLatch, Prob, /*ForFirstTarget=*/true); + } NewIdx->addIncoming(Zero, InsertTop); NewIdx->addIncoming(IdxNext, NewBB); LatchBR->eraseFromParent(); @@ -461,6 +530,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + if (OriginalTripCount && UseEpilogRemainder) + setLoopEstimatedTripCount(NewLoop, *OriginalTripCount % Count); + // Add unroll disable metadata to disable future unrolling for this loop. if (!UnrollRemainder) NewLoop->setLoopAlreadyUnrolled(); @@ -588,7 +660,8 @@ bool llvm::UnrollRuntimeLoopRemainder( LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, bool PreserveLCSSA, unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit, - Loop **ResultLoop) { + Loop **ResultLoop, std::optional<unsigned> OriginalTripCount, + BranchProbability OriginalLoopProb) { LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); LLVM_DEBUG(L->dump()); LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" @@ -808,12 +881,23 @@ bool llvm::UnrollRuntimeLoopRemainder( BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; // Branch to either remainder (extra iterations) loop or unrolling loop. MDNode *BranchWeights = nullptr; - if (hasBranchWeightMD(*Latch->getTerminator())) { + if ((OriginalLoopProb.isUnknown() || !UseEpilogRemainder) && + hasBranchWeightMD(*Latch->getTerminator())) { // Assume loop is nearly always entered. 
MDBuilder MDB(B.getContext()); BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights); } - B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + BranchInst *UnrollingLoopGuard = + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights); + if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) { + // The original loop's first iteration always happens. Compute the + // probability of the original loop executing Count-1 iterations after that + // to complete the first iteration of the unrolled loop. + BranchProbability ProbOne = OriginalLoopProb; + BranchProbability ProbRest = ProbOne.pow(Count - 1); + setBranchProbability(UnrollingLoopGuard, ProbRest, + /*ForFirstTarget=*/false); + } PreHeaderBR->eraseFromParent(); if (DT) { if (UseEpilogRemainder) @@ -840,9 +924,10 @@ bool llvm::UnrollRuntimeLoopRemainder( // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - Loop *remainderLoop = CloneLoopBlocks( - L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot, - NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count); + Loop *remainderLoop = + CloneLoopBlocks(L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, + LI, Count, OriginalTripCount, OriginalLoopProb); // Insert the cloned blocks into the function. F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end()); @@ -941,7 +1026,8 @@ bool llvm::UnrollRuntimeLoopRemainder( // Connect the epilog code to the original loop and update the // PHI functions. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, - NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC); + NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count, *AC, + OriginalLoopProb); // Update counter in loop for unrolling. // Use an incrementing IV. Pre-incr/post-incr is backedge/trip count. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b6ba82288aeb4..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -962,13 +962,54 @@ bool llvm::setLoopEstimatedTripCount( if (LatchBranch->getSuccessor(0) != L->getHeader()) std::swap(BackedgeTakenWeight, LatchExitWeight); - MDBuilder MDB(LatchBranch->getContext()); - // Set/Update profile metadata. 
- LatchBranch->setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight)); + setBranchWeights(*LatchBranch, {BackedgeTakenWeight, LatchExitWeight}, + /*IsExpected=*/false); + + return true; +} + +BranchProbability llvm::getLoopProbability(Loop *L) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return BranchProbability::getUnknown(); + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return getBranchProbability(LatchBranch, FirstTargetIsLoop); +} +bool llvm::setLoopProbability(Loop *L, BranchProbability P) { + BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); + if (!LatchBranch) + return false; + bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader(); + return setBranchProbability(LatchBranch, P, FirstTargetIsLoop); +} + +BranchProbability llvm::getBranchProbability(BranchInst *B, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return BranchProbability::getUnknown(); + uint64_t Weight0, Weight1; + if (!extractBranchWeights(*B, Weight0, Weight1)) + return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); + if (!ForFirstTarget) + std::swap(Weight0, Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); +} + +bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, + bool ForFirstTarget) { + if (B->getNumSuccessors() != 2) + return false; + BranchProbability Prob0 = P; + BranchProbability Prob1 = P.getCompl(); + if (!ForFirstTarget) + std::swap(Prob0, Prob1); + setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()}, + /*IsExpected=*/false); return true; } diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index ec2e6c1ab796b..9c8b6ef83e56d 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -109,8 +110,12 @@ void LoopVersioning::versionLoop( // Insert the conditional branch based on the result of the memchecks. Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); Builder.SetInsertPoint(OrigTerm); - Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader()); + auto *BI = + Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(), + VersionedLoop->getLoopPreheader()); + // We don't know what the probability of executing the versioned vs the + // unversioned variants is. + setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. 
This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b03fb6213d61c..ed2a5c292fa54 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -80,6 +80,7 @@
 #include <algorithm>
 #include <cassert>
 #include <climits>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -301,7 +302,9 @@ class SimplifyCFGOpt {
   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
                                              IRBuilder<> &Builder);
-
+  bool tryToSimplifyUncondBranchWithICmpSelectInIt(ICmpInst *ICI,
+                                                   SelectInst *Select,
+                                                   IRBuilder<> &Builder);
   bool hoistCommonCodeFromSuccessors(Instruction *TI, bool AllInstsEqOnly);
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
@@ -777,8 +780,10 @@ struct ConstantComparesGatherer {
       return false;

     // Add all values from the range to the set
-    for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+    APInt Tmp = Span.getLower();
+    do
       Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+    while (++Tmp != Span.getUpper());
     UsedICmps++;
     return true;
@@ -5020,16 +5025,65 @@ bool SimplifyCFGOpt::simplifyIndirectBrOnSelect(IndirectBrInst *IBI,
 /// the PHI, merging the third icmp into the switch.
 bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
     ICmpInst *ICI, IRBuilder<> &Builder) {
+  // Select == nullptr means we assume that there is a hidden no-op select
+  // instruction of `_ = select %icmp, true, false` after `%icmp = icmp ...`
+  return tryToSimplifyUncondBranchWithICmpSelectInIt(ICI, nullptr, Builder);
+}
+
+/// Similar to tryToSimplifyUncondBranchWithICmpInIt, but handles a more
+/// generic case. This is called when we find an icmp instruction (a
+/// seteq/setne with a constant) and its following select instruction as the
+/// only TWO instructions in a block that ends with an uncond branch. We are
+/// looking for a very specific pattern that occurs when "
+///   if (A == 1) return C1;
+///   if (A == 2) return C2;
+///   if (A < 3) return C3;
+///   return C4;
+/// " gets simplified. In this case, we merge the first two "branches of icmp"
+/// into a switch, but then the default value goes to an uncond block with a lt
+/// icmp and select in it, as InstCombine cannot simplify "A < 3" as "A == 2".
+/// After SimplifyCFG and other subsequent optimizations (e.g., SCCP), we might
+/// get something like:
+///
+/// case1:
+///   switch i8 %A, label %DEFAULT [ i8 0, label %end i8 1, label %case2 ]
+/// case2:
+///   br label %end
+/// DEFAULT:
+///   %tmp = icmp eq i8 %A, 2
+///   %val = select i1 %tmp, i8 C3, i8 C4
+///   br label %end
+/// end:
+///   _ = phi i8 [ C1, %case1 ], [ C2, %case2 ], [ %val, %DEFAULT ]
+///
+/// We prefer to split the edge to 'end' so that there are TWO entries of C3/C4
+/// to the PHI, merging the icmp & select into the switch, as follows:
+///
+/// case1:
+///   switch i8 %A, label %DEFAULT [
+///     i8 0, label %end
+///     i8 1, label %case2
+///     i8 2, label %case3
+///   ]
+/// case2:
+///   br label %end
+/// case3:
+///   br label %end
+/// DEFAULT:
+///   br label %end
+/// end:
+///   _ = phi i8 [ C1, %case1 ], [ C2, %case2 ], [ C3, %case3 ], [ C4, %DEFAULT]
+bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpSelectInIt(
+    ICmpInst *ICI, SelectInst *Select, IRBuilder<> &Builder) {
   BasicBlock *BB = ICI->getParent();

-  // If the block has any PHIs in it or the icmp has multiple uses, it is too
-  // complex.
- if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) + // If the block has any PHIs in it or the icmp/select has multiple uses, it is + // too complex. + /// TODO: support multi-phis in succ BB of select's BB. + if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse() || + (Select && !Select->hasOneUse())) return false; - Value *V = ICI->getOperand(0); - ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1)); - // The pattern we're looking for is where our only predecessor is a switch on // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. @@ -5037,8 +5091,36 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) return false; + Value *IcmpCond; + ConstantInt *NewCaseVal; + CmpPredicate Predicate; + + // Match icmp X, C + if (!match(ICI, + m_ICmp(Predicate, m_Value(IcmpCond), m_ConstantInt(NewCaseVal)))) + return false; + + Value *SelectCond, *SelectTrueVal, *SelectFalseVal; + Instruction *User; + if (!Select) { + // If Select == nullptr, we can assume that there is a hidden no-op select + // just after icmp + SelectCond = ICI; + SelectTrueVal = Builder.getTrue(); + SelectFalseVal = Builder.getFalse(); + User = ICI->user_back(); + } else { + SelectCond = Select->getCondition(); + // Check if the select condition is the same as the icmp condition. + if (SelectCond != ICI) + return false; + SelectTrueVal = Select->getTrueValue(); + SelectFalseVal = Select->getFalseValue(); + User = Select->user_back(); + } + SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); - if (SI->getCondition() != V) + if (SI->getCondition() != IcmpCond) return false; // If BB is reachable on a non-default case, then we simply know the value of @@ -5060,9 +5142,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( // Ok, the block is reachable from the default dest. If the constant we're // comparing exists in one of the other edges, then we can constant fold ICI // and zap it. - if (SI->findCaseValue(Cst) != SI->case_default()) { + if (SI->findCaseValue(NewCaseVal) != SI->case_default()) { Value *V; - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (Predicate == ICmpInst::ICMP_EQ) V = ConstantInt::getFalse(BB->getContext()); else V = ConstantInt::getTrue(BB->getContext()); @@ -5073,25 +5155,30 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( return requestResimplify(); } - // The use of the icmp has to be in the 'end' block, by the only PHI node in + // The use of the select has to be in the 'end' block, by the only PHI node in // the block. BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); - PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back()); + PHINode *PHIUse = dyn_cast<PHINode>(User); if (PHIUse == nullptr || PHIUse != &SuccBlock->front() || isa<PHINode>(++BasicBlock::iterator(PHIUse))) return false; - // If the icmp is a SETEQ, then the default dest gets false, the new edge gets - // true in the PHI. - Constant *DefaultCst = ConstantInt::getTrue(BB->getContext()); - Constant *NewCst = ConstantInt::getFalse(BB->getContext()); + // If the icmp is a SETEQ, then the default dest gets SelectFalseVal, the new + // edge gets SelectTrueVal in the PHI. 
+ Value *DefaultCst = SelectFalseVal; + Value *NewCst = SelectTrueVal; - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (ICI->getPredicate() == ICmpInst::ICMP_NE) std::swap(DefaultCst, NewCst); - // Replace ICI (which is used by the PHI for the default value) with true or - // false depending on if it is EQ or NE. - ICI->replaceAllUsesWith(DefaultCst); + // Replace Select (which is used by the PHI for the default value) with + // SelectFalseVal or SelectTrueVal depending on if ICI is EQ or NE. + if (Select) { + Select->replaceAllUsesWith(DefaultCst); + Select->eraseFromParent(); + } else { + ICI->replaceAllUsesWith(DefaultCst); + } ICI->eraseFromParent(); SmallVector<DominatorTree::UpdateType, 2> Updates; @@ -5108,7 +5195,7 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt( NewW = ((uint64_t(*W0) + 1) >> 1); SIW.setSuccessorWeight(0, *NewW); } - SIW.addCase(Cst, NewBB, NewW); + SIW.addCase(NewCaseVal, NewBB, NewW); if (DTU) Updates.push_back({DominatorTree::Insert, Pred, NewBB}); } @@ -5211,8 +5298,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, // We don't have any info about this condition. auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB) : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(), - DEBUG_TYPE); + setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE); OldTI->eraseFromParent(); @@ -5955,7 +6041,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Update weight for the newly-created conditional branch. - if (hasBranchWeightMD(*SI)) { + if (hasBranchWeightMD(*SI) && NewBI->isConditional()) { SmallVector<uint64_t, 8> Weights; getBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { @@ -5977,14 +6063,14 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, } // Prune obsolete incoming values off the successors' PHI nodes. 
- for (auto BBI = Dest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(Dest->phis())) { unsigned PreviousEdges = Cases->size(); if (Dest == SI->getDefaultDest()) ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } - for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { + for (auto &PHI : make_early_inc_range(OtherDest->phis())) { unsigned PreviousEdges = OtherCases->size(); if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; @@ -5993,7 +6079,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, if (NewBI->isUnconditional()) ++E; for (unsigned I = 0; I != E; ++I) - cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); + PHI.removeIncomingValue(SI->getParent()); } // Clean up the default block - it may have phis or other instructions before @@ -6019,6 +6105,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6038,15 +6126,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6057,33 +6148,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. 
- if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7569,6 +7668,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
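(A minimal demonstration, not part of the patch, of the MissingCaseVal XOR trick retained in eliminateDeadSwitchCases above: over a full power-of-two domain with more than one unknown bit, every bit position is set in exactly half the values, so XOR of the whole domain is zero and XOR-ing the present case values yields the single missing one. Simplified here to a dense 3-bit domain.)

#include <cassert>
#include <cstdint>

int main() {
  // 3 unknown bits => domain {0..7}. Note the trick needs more than one
  // unknown bit: for the 1-bit domain {0, 1} the full XOR is 1, not 0.
  const uint64_t Missing = 5;
  uint64_t Acc = 0;
  for (uint64_t V = 0; V < 8; ++V)
    if (V != Missing)
      Acc ^= V; // mirrors: MissingCaseVal ^= Case.getCaseValue()...
  assert(Acc == Missing);
  return 0;
}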
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -7632,7 +7806,38 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, auto *DefaultCaseBB = SI->getDefaultDest(); BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU); auto It = OrigBB->getTerminator()->getIterator(); + SmallVector<uint32_t> Weights; + auto HasWeights = + !ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights); auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It); + if (HasWeights && any_of(Weights, [](const auto &V) { return V != 0; })) { + // IsPow2 covers a subset of the cases in which we'd go to the default + // label. The other is those powers of 2 that don't appear in the case + // statement. We don't know the distribution of the values coming in, so + // the safest is to split 50-50 the original probability to `default`. + uint64_t OrigDenominator = + sum_of(map_range(Weights, StaticCastTo<uint64_t>)); + SmallVector<uint64_t> NewWeights(2); + NewWeights[1] = Weights[0] / 2; + NewWeights[0] = OrigDenominator - NewWeights[1]; + setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false); + // The probability of executing the default block stays constant. It was + // p_d = Weights[0] / OrigDenominator + // we rewrite as W/D + // We want to find the probability of the default branch of the switch + // statement. Let's call it X. We have W/D = W/2D + X * (1-W/2D) + // i.e. the original probability is the probability we go to the default + // branch from the BI branch, or we take the default branch on the SI. + // Meaning X = W / (2D - W), or (W/2) / (D - W/2) + // This matches using W/2 for the default branch probability numerator and + // D-W/2 as the denominator. + Weights[0] = NewWeights[1]; + uint64_t CasesDenominator = OrigDenominator - Weights[0]; + for (auto &W : drop_begin(Weights)) + W = NewWeights[0] * static_cast<double>(W) / CasesDenominator; + + setBranchWeights(*SI, Weights, /*IsExpected=*/false); + } // BI is handling the default case for SI, and so should share its DebugLoc. 
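(A quick numeric check, outside the patch, of the weight-splitting algebra in the simplifySwitchOfPowersOfTwo hunk above; D and W are illustrative stand-ins for OrigDenominator and Weights[0].)

#include <cassert>
#include <cmath>

int main() {
  const double D = 100, W = 20; // original denominator and default weight
  double PDirect = (W / 2) / D; // new branch jumps straight to default
  // The remaining mass reaches the switch, whose default keeps weight W/2
  // out of a reduced total of D - W/2.
  double PViaSwitch = (1 - PDirect) * ((W / 2) / (D - W / 2));
  // The overall probability of reaching the default block is preserved:
  // 0.1 + 0.9 * (10/90) == 0.2 == W/D.
  assert(std::fabs(PDirect + PViaSwitch - W / D) < 1e-12);
  return 0;
}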
BI->setDebugLoc(SI->getDebugLoc()); It->eraseFromParent(); @@ -8010,6 +8215,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } @@ -8178,13 +8386,18 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, // If the only instruction in the block is a seteq/setne comparison against a // constant, try to simplify the block. - if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) + if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) { ++I; if (I->isTerminator() && tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder)) return true; + if (isa<SelectInst>(I) && I->getNextNode()->isTerminator() && + tryToSimplifyUncondBranchWithICmpSelectInIt(ICI, cast<SelectInst>(I), + Builder)) + return true; } + } // See if we can merge an empty landing pad block with another which is // equivalent. diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. 
if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 8d8a60b6918fe..9021d8b289baf 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -77,7 +77,7 @@ struct WorklistEntry { }; struct AppendingGVTy { GlobalVariable *GV; - Constant *InitPrefix; + GlobalVariable *OldGV; }; struct AliasOrIFuncTy { GlobalValue *GV; @@ -162,7 +162,7 @@ class Mapper { void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MCID); - void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + void scheduleMapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID); @@ -173,7 +173,7 @@ class Mapper { void flush(); private: - void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + void mapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers); @@ -944,7 +944,7 @@ void Mapper::flush() { drop_begin(AppendingInits, PrefixSize)); AppendingInits.resize(PrefixSize); mapAppendingVariable(*E.Data.AppendingGV.GV, - E.Data.AppendingGV.InitPrefix, + E.Data.AppendingGV.OldGV, E.AppendingGVIsOldCtorDtor, ArrayRef(NewInits)); break; } @@ -1094,15 +1094,21 @@ void Mapper::remapFunction(Function &F) { } } -void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, +void Mapper::mapAppendingVariable(GlobalVariable &GV, GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers) { + Constant *InitPrefix = + (OldGV && !OldGV->isDeclaration()) ? 
OldGV->getInitializer() : nullptr; + SmallVector<Constant *, 16> Elements; if (InitPrefix) { unsigned NumElements = cast<ArrayType>(InitPrefix->getType())->getNumElements(); for (unsigned I = 0; I != NumElements; ++I) Elements.push_back(InitPrefix->getAggregateElement(I)); + OldGV->setInitializer(nullptr); + if (InitPrefix->hasUseList() && InitPrefix->use_empty()) + InitPrefix->destroyConstant(); } PointerType *VoidPtrTy; @@ -1148,7 +1154,7 @@ void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, } void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID) { @@ -1159,7 +1165,7 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, WE.Kind = WorklistEntry::MapAppendingVar; WE.MCID = MCID; WE.Data.AppendingGV.GV = &GV; - WE.Data.AppendingGV.InitPrefix = InitPrefix; + WE.Data.AppendingGV.OldGV = OldGV; WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor; WE.AppendingGVNumNewMembers = NewMembers.size(); Worklist.push_back(WE); @@ -1282,12 +1288,12 @@ void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV, } void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV, - Constant *InitPrefix, + GlobalVariable *OldGV, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID) { getAsMapper(pImpl)->scheduleMapAppendingVariable( - GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID); + GV, OldGV, IsOldCtorDtor, NewMembers, MCID); } void ValueMapper::scheduleMapGlobalAlias(GlobalAlias &GA, Constant &Aliasee, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..e522d2f617d8a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -2096,24 +2097,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); - // TODO: handle non-reduction outside users when tail is folded by masking. - for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop or - // are the live-out of a reduction. 
- if (ReductionLiveOuts.count(AE)) - continue; - for (User *U : AE->users()) { - Instruction *UI = cast<Instruction>(U); - if (TheLoop->contains(UI)) - continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); - return false; - } - } - for (const auto &Entry : getInductionVars()) { PHINode *OrigPhi = Entry.first; for (User *U : OrigPhi->users()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3fed003282f2b..04b05627fa769 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -167,7 +167,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, Flags, DL, Name)); + new VPInstruction(Opcode, Operands, Flags, {}, DL, Name)); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, @@ -184,7 +184,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( - new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); + new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name)); } VPInstruction *createNot(VPValue *Operand, @@ -205,7 +205,7 @@ class VPBuilder { return tryInsertInstruction(new VPInstruction( Instruction::BinaryOps::Or, {LHS, RHS}, - VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); + VPRecipeWithIRFlags::DisjointFlagsTy(false), {}, DL, Name)); } VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, @@ -221,7 +221,7 @@ class VPBuilder { std::optional<FastMathFlags> FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, DL, Name) + *FMFs, {}, DL, Name) : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, Name); return tryInsertInstruction(Select); @@ -235,7 +235,7 @@ class VPBuilder { assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::ICmp, {A, B}, Pred, {}, DL, Name)); } /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A @@ -246,7 +246,7 @@ class VPBuilder { assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( - new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + new VPInstruction(Instruction::FCmp, {A, B}, Pred, {}, DL, Name)); } VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -254,7 +254,7 @@ class VPBuilder { const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, - GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset, @@ -262,7 +262,7 @@ class VPBuilder { DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( - VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name)); + VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, {}, DL, Name)); } VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset, @@ -270,7 +270,7 @@ class VPBuilder { const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset}, - 
GEPNoWrapFlags::none(), DL, Name)); + GEPNoWrapFlags::none(), {}, DL, Name)); } VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL, @@ -280,8 +280,7 @@ class VPBuilder { VPValue *createElementCount(Type *Ty, ElementCount EC) { VPlan &Plan = *getInsertBlock()->getPlan(); - VPValue *RuntimeEC = - Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue())); + VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue()); if (EC.isScalable()) { VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty); RuntimeEC = EC.getKnownMinValue() == 1 @@ -304,9 +303,11 @@ class VPBuilder { } VPInstruction *createScalarCast(Instruction::CastOps Opcode, VPValue *Op, - Type *ResultTy, DebugLoc DL) { + Type *ResultTy, DebugLoc DL, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL)); + new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 505fb435e91e6..835b0995cc4fc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask."))); +cl::opt<bool> llvm::EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide lane masks when used for control flow in " + "tail-folded loops")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -1232,6 +1237,30 @@ class LoopVectorizationCostModel { /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) const; + /// A helper function that returns how much we should divide the cost of a + /// predicated block by. Typically this is the reciprocal of the block + /// probability, i.e. if we return X we are assuming the predicated block will + /// execute once for every X iterations of the loop header so the block should + /// only contribute 1/X of its cost to the total cost calculation, but when + /// optimizing for code size it will just be 1 as code size costs don't depend + /// on execution probabilities. + /// + /// TODO: We should use actual block probability here, if available. + /// Currently, we always assume predicated blocks have a 50% chance of + /// executing, apart from blocks that are only predicated due to tail folding. + inline unsigned + getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, + BasicBlock *BB) const { + // If a block wasn't originally predicated but was predicated due to + // e.g. tail folding, don't divide the cost. Tail folded loops may still be + // predicated in the final vector loop iteration, but for most loops that + // don't have low trip counts we can expect their probability to be close to + // zero. + if (!Legal->blockNeedsPredication(BB)) + return 1; + return CostKind == TTI::TCK_CodeSize ? 1 : 2; + } + /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. 
/// First result is for scalarization (will be invalid for scalable @@ -1290,6 +1319,12 @@ class LoopVectorizationCostModel { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + /// Returns true if tail-folding is preferred over a scalar epilogue. + bool preferPredicatedLoop() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate; + } + /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { if (!ChosenTailFoldingStyle) @@ -1350,6 +1385,17 @@ class LoopVectorizationCostModel { return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Returns true if the use of wide lane masks is requested and the loop is + /// using tail-folding with a lane mask for control flow. + bool useWideActiveLaneMask() const { + if (!EnableWideActiveLaneMask) + return false; + + TailFoldingStyle TF = getTailFoldingStyle(); + return TF == TailFoldingStyle::DataAndControlFlow || + TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + } + /// Return maximum safe number of elements to be processed per vector /// iteration, which do not prevent store-load forwarding and are safe with /// regard to the memory dependencies. Required for EVL-based VPlans to @@ -2887,7 +2933,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); + ScalarizationCost = + ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent()); } InstructionCost SafeDivisorCost = 0; @@ -3908,7 +3955,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4166,7 +4213,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -4179,18 +4226,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Selects are only modelled in the legacy cost model for safe // divisors. 
case Instruction::Select: { - VPValue *VPV = VPI->getVPSingleValue(); - if (VPV->getNumUsers() == 1) { - if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) { - switch (WR->getOpcode()) { - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - continue; - default: - break; - } + if (auto *WR = + dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; } } C += VPI->cost(VF, CostCtx); @@ -4535,7 +4580,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!CM.isScalarEpilogueAllowed()) + // Only interleave tail-folded loops if wide lane masks are requested, as the + // overhead of multiple instructions to calculate the predicate is likely + // not beneficial. If a scalar epilogue is not allowed for any other reason, + // do not interleave. + if (!CM.isScalarEpilogueAllowed() && + !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask())) return 1; if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -5032,7 +5082,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( } // Scale the total scalar cost by block probability. - ScalarCost /= getPredBlockCostDivisor(CostKind); + ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. @@ -5082,10 +5132,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute // the predicated block, if it is an if-else block. Thus, scale the block's - // cost by the probability of executing it. blockNeedsPredication from - // Legal is used so as to not include all blocks in tail folded loops. - if (VF.isScalar() && Legal->blockNeedsPredication(BB)) - BlockCost /= getPredBlockCostDivisor(CostKind); + // cost by the probability of executing it. + // getPredBlockCostDivisor will return 1 for blocks that are only predicated + // by the header mask when folding the tail. + if (VF.isScalar()) + BlockCost /= getPredBlockCostDivisor(CostKind, BB); Cost += BlockCost; } @@ -5164,7 +5215,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. 
if (isPredicatedInst(I)) { - Cost /= getPredBlockCostDivisor(CostKind); + Cost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Add the cost of an i1 extract and a branch auto *VecI1Ty = @@ -6732,6 +6783,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { SkipCostComputation.contains(UI); } +unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const { + return CM.getPredBlockCostDivisor(CostKind, BB); +} + InstructionCost LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx) const { @@ -6876,7 +6931,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), + OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -6918,11 +6974,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // the more accurate VPlan-based cost model. for (VPRecipeBase &R : *Plan.getVectorPreheader()) { auto *VPI = dyn_cast<VPInstruction>(&R); - if (!VPI || VPI->getOpcode() != Instruction::Select || - VPI->getNumUsers() != 1) + if (!VPI || VPI->getOpcode() != Instruction::Select) continue; - if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) { + if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) { switch (WR->getOpcode()) { case Instruction::UDiv: case Instruction::SDiv: @@ -7110,12 +7165,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops. assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -7251,7 +7307,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); + TTI.getRegisterBitWidth(BestVF.isScalable() + ? 
TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7486,11 +7544,12 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { }); } -VPWidenMemoryRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && +VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, + VFRange &Range) { + assert((VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) && "Must be called with either a load or store"); + Instruction *I = VPI->getUnderlyingInstr(); auto WillWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = @@ -7520,7 +7579,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; - VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; + VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0) + : VPI->getOperand(1); if (Consecutive) { auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); @@ -7534,78 +7594,86 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, CM.foldTailByMasking() || !GEP ? GEPNoWrapFlags::none() : GEP->getNoWrapFlags().withoutNoUnsignedWrap(); - VectorPtr = - new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), - /*Stride*/ -1, Flags, I->getDebugLoc()); + VectorPtr = new VPVectorEndPointerRecipe( + Ptr, &Plan.getVF(), getLoadStoreType(I), + /*Stride*/ -1, Flags, VPI->getDebugLoc()); } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VPI->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast<LoadInst>(I)) + if (VPI->getOpcode() == Instruction::Load) { + auto *Load = cast<LoadInst>(I); return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - Load->getAlign(), VPIRMetadata(*Load, LVer), - I->getDebugLoc()); + VPIRMetadata(*Load, LVer), I->getDebugLoc()); + } StoreInst *Store = cast<StoreInst>(I); - return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, Store->getAlign(), - VPIRMetadata(*Store, LVer), I->getDebugLoc()); + return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, + Consecutive, Reverse, + VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); } -/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also -/// insert a recipe to expand the step for the induction recipe. +/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will +/// also insert a recipe to expand the step for the induction recipe. 
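+/// As a sketch, after this runs both the phi and its increment consume the
+/// same expanded step:
+///   WIDEN-INDUCTION %iv = phi %start, %step
+///   EMIT %iv.next = add %iv, %step
+/// so the increment is recognisable directly in VPlan without peeking through
+/// a redundant splat of the step. (Names are illustrative only.)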
static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { - assert(IndDesc.getStartValue() == - Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); +createWidenInductionRecipes(VPInstruction *PhiR, + const InductionDescriptor &IndDesc, VPlan &Plan, + ScalarEvolution &SE, Loop &OrigLoop) { assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && "step must be loop invariant"); + VPValue *Start = PhiR->getOperand(0); + assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && + "Start VPValue must match IndDesc's start value"); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, TruncI, - TruncI->getDebugLoc()); - } - assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); + + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. + using namespace llvm::VPlanPatternMatch; + if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) + PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); + + PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr()); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, Phi->getDebugLoc()); + IndDesc, PhiR->getDebugLoc()); } -VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPHeaderPHIRecipe * +VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) { + auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr()); // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, - *PSE.getSE(), *OrigLoop); + return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop); // Check if this is pointer induction. If so, build the recipe for it. if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); return new VPWidenPointerInductionRecipe( - Phi, Operands[0], Step, &Plan.getVFxUF(), *II, + Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isScalarAfterVectorization(Phi, VF); }, Range), - Phi->getDebugLoc()); + VPI->getDebugLoc()); } return nullptr; } -VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPWidenIntOrFpInductionRecipe * +VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, + VFRange &Range) { + auto *I = cast<TruncInst>(VPI->getUnderlyingInstr()); // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -7620,21 +7688,24 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( }; }; - if (LoopVectorizationPlanner::getDecisionAndClampRange( - IsOptimizableIVTruncate(I), Range)) { + if (!LoopVectorizationPlanner::getDecisionAndClampRange( + IsOptimizableIVTruncate(I), Range)) + return nullptr; - auto *Phi = cast<PHINode>(I->getOperand(0)); - const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), - *OrigLoop); - } - return nullptr; + auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>( + VPI->getOperand(0)->getDefiningRecipe()); + PHINode *Phi = WidenIV->getPHINode(); + VPValue *Start = WidenIV->getStartValue(); + const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor(); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), + IndDesc, I, VPI->getDebugLoc()); } -VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, - ArrayRef<VPValue *> Operands, +VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, VFRange &Range) { + CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr()); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -7651,7 +7722,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); + SmallVector<VPValue *, 4> Ops(VPI->op_begin(), + VPI->op_begin() + CI->arg_size()); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = @@ -7663,7 +7735,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Range); if (ShouldUseVectorIntrinsic) return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), - CI->getDebugLoc()); + VPI->getDebugLoc()); Function *Variant = nullptr; std::optional<unsigned> MaskPos; @@ -7710,13 +7782,13 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Mask = getBlockInMask(Builder.getInsertBlock()); else Mask = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); + ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); Ops.insert(Ops.begin() + *MaskPos, Mask); } - Ops.push_back(Operands.back()); - return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); + Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1)); + return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc()); } return nullptr; @@ -7736,9 +7808,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef<VPValue *> Operands) { - switch (I->getOpcode()) { +VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { + auto *I = VPI->getUnderlyingInstr(); + switch (VPI->getOpcode()) { default: return nullptr; case Instruction::SDiv: @@ -7748,11 +7820,11 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
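// Per-lane sketch of the transform (invented names):
//   %safe = select <VF x i1> %mask, %rhs, splat(1)
//   %res  = udiv %lhs, %safe
// Lanes where the mask is false divide by 1 and therefore cannot trap on a
// zero or poison divisor; their results are unused anyway.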
if (CM.isPredicatedInst(I)) { - SmallVector<VPValue *> Ops(Operands); + SmallVector<VPValue *> Ops(VPI->operands()); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); - VPValue *One = - Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); - auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); + VPValue *One = Plan.getConstantInt(I->getType(), 1u); + auto *SafeRHS = + Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); } @@ -7777,8 +7849,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Sub: case Instruction::Xor: case Instruction::Freeze: { - SmallVector<VPValue *> NewOps(Operands); - if (Instruction::isBinaryOp(I->getOpcode())) { + SmallVector<VPValue *> NewOps(VPI->operands()); + if (Instruction::isBinaryOp(VPI->getOpcode())) { // The legacy cost model uses SCEV to check if some of the operands are // constants. To match the legacy cost model's behavior, use SCEV to try // to replace operands with constants. @@ -7795,7 +7867,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return Plan.getOrAddLiveIn(C->getValue()); }; // For Mul, the legacy cost model checks both operands. - if (I->getOpcode() == Instruction::Mul) + if (VPI->getOpcode() == Instruction::Mul) NewOps[0] = GetConstantViaSCEV(NewOps[0]); // For other binops, the legacy cost model only checks the second operand. NewOps[1] = GetConstantViaSCEV(NewOps[1]); @@ -7803,20 +7875,18 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return new VPWidenRecipe(*I, NewOps); } case Instruction::ExtractValue: { - SmallVector<VPValue *> NewOps(Operands); - Type *I32Ty = IntegerType::getInt32Ty(I->getContext()); + SmallVector<VPValue *> NewOps(VPI->operands()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; - NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false))); + NewOps.push_back(Plan.getConstantInt(32, Idx)); return new VPWidenRecipe(*I, NewOps); } }; } -VPHistogramRecipe * -VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, - ArrayRef<VPValue *> Operands) { +VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, + VPInstruction *VPI) { // FIXME: Support other operations. unsigned Opcode = HI->Update->getOpcode(); assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && @@ -7824,7 +7894,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, SmallVector<VPValue *, 3> HGramOps; // Bucket address. - HGramOps.push_back(Operands[1]); + HGramOps.push_back(VPI->getOperand(1)); // Increment value. 
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1))); @@ -7833,12 +7903,12 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, if (Legal->isMaskRequired(HI->Store)) HGramOps.push_back(getBlockInMask(Builder.getInsertBlock())); - return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc()); + return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc()); } -VPReplicateRecipe * -VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { +VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, + VFRange &Range) { + auto *I = VPI->getUnderlyingInstr(); bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -7894,8 +7964,8 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask, - VPIRMetadata(*I, LVer)); + auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, + BlockInMask, VPIRMetadata(*I, LVer)); return Recipe; } @@ -7934,6 +8004,26 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second); } + + // Check that all partial reductions in a chain are only used by other + // partial reductions with the same scale factor. Otherwise we end up creating + // users of scaled reductions where the types of the other operands don't + // match. + for (const auto &[Chain, Scale] : PartialReductionChains) { + auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) { + auto *UI = cast<Instruction>(U); + if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) { + return all_of(UI->users(), [ScaleVal, this](const User *U) { + auto *UI = cast<Instruction>(U); + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal; + }); + } + return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal || + !OrigLoop->contains(UI->getParent()); + }; + if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx)) + ScaledReductionMap.erase(Chain.Reduction); + } } bool VPRecipeBuilder::getScaledReductions( @@ -8056,8 +8146,6 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. 
VPRecipeBase *Recipe; - Instruction *Instr = R->getUnderlyingInstr(); - SmallVector<VPValue *, 4> Operands(R->operands()); if (auto *PhiR = dyn_cast<VPPhi>(R)) { VPBasicBlock *Parent = PhiR->getParent(); [[maybe_unused]] VPRegionBlock *LoopRegionOf = @@ -8065,15 +8153,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && "Non-header phis should have been handled during predication"); auto *Phi = cast<PHINode>(R->getUnderlyingInstr()); - assert(Operands.size() == 2 && "Must have 2 operands for header phis"); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis"); + if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range))) return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && "can only widen reductions and fixed-order recurrences here"); - VPValue *StartV = Operands[0]; + VPValue *StartV = R->getOperand(0); if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); assert(RdxDesc.getRecurrenceStartValue() == @@ -8093,13 +8181,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Add backedge value. - PhiRecipe->addOperand(Operands[1]); + PhiRecipe->addOperand(R->getOperand(1)); return PhiRecipe; } assert(!R->isPhi() && "only VPPhi nodes expected at this point"); - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Operands, Range))) + auto *VPI = cast<VPInstruction>(R); + Instruction *Instr = R->getUnderlyingInstr(); + if (VPI->getOpcode() == Instruction::Trunc && + (Recipe = tryToOptimizeInductionTruncate(VPI, Range))) return Recipe; // All widen recipes below deal only with VF > 1. 
@@ -8107,82 +8197,71 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, [&](ElementCount VF) { return VF.isScalar(); }, Range)) return nullptr; - if (auto *CI = dyn_cast<CallInst>(Instr)) - return tryToWidenCall(CI, Operands, Range); + if (VPI->getOpcode() == Instruction::Call) + return tryToWidenCall(VPI, Range); - if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) - if (auto HistInfo = Legal->getHistogramInfo(SI)) - return tryToWidenHistogram(*HistInfo, Operands); + if (VPI->getOpcode() == Instruction::Store) + if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) + return tryToWidenHistogram(*HistInfo, VPI); - if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return tryToWidenMemory(Instr, Operands, Range); + if (VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) + return tryToWidenMemory(VPI, Range); - if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) { - if (auto PartialRed = - tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value())) - return PartialRed; - } + if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) + return tryToCreatePartialReduction(VPI, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; - if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) - return new VPWidenGEPRecipe(GEP, Operands); + if (VPI->getOpcode() == Instruction::GetElementPtr) + return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands()); - if (auto *SI = dyn_cast<SelectInst>(Instr)) { - return new VPWidenSelectRecipe(*SI, Operands); - } + if (VPI->getOpcode() == Instruction::Select) + return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands()); - if (auto *CI = dyn_cast<CastInst>(Instr)) { - return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), - *CI); + if (Instruction::isCast(VPI->getOpcode())) { + auto *CastR = cast<VPInstructionWithType>(R); + auto *CI = cast<CastInst>(Instr); + return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), + CastR->getResultType(), *CI); } - return tryToWiden(Instr, Operands); + return tryToWiden(VPI); } VPRecipeBase * -VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef<VPValue *> Operands, +VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor) { - assert(Operands.size() == 2 && + assert(Reduction->getNumOperands() == 2 && "Unexpected number of operands for partial reduction"); - VPValue *BinOp = Operands[0]; - VPValue *Accumulator = Operands[1]; - VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); - if (isa<VPReductionPHIRecipe>(BinOpRecipe) || - isa<VPPartialReductionRecipe>(BinOpRecipe)) + VPValue *BinOp = Reduction->getOperand(0); + VPValue *Accumulator = Reduction->getOperand(1); + if (isa<VPReductionPHIRecipe>(BinOp) || isa<VPPartialReductionRecipe>(BinOp)) std::swap(BinOp, Accumulator); - if (ScaleFactor != - vputils::getVFScaleFactor(Accumulator->getDefiningRecipe())) - return nullptr; + assert(ScaleFactor == + vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && + "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); + auto *ReductionI = Reduction->getUnderlyingInstr(); if (ReductionOpcode == Instruction::Sub) { - auto *const Zero = ConstantInt::get(Reduction->getType(), 0); + auto *const Zero = ConstantInt::get(ReductionI->getType(), 0); SmallVector<VPValue *, 2> Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); 
Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*Reduction, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } VPValue *Cond = nullptr; - if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) { - assert((ReductionOpcode == Instruction::Add || - ReductionOpcode == Instruction::Sub) && - "Expected an ADD or SUB operation for predicated partial " - "reductions (because the neutral element in the mask is zero)!"); + if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); - BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); - } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, - ScaleFactor, Reduction); + ScaleFactor, ReductionI); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8367,7 +8446,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); if (!Recipe) - Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range); + Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef), + Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) { @@ -8401,20 +8481,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - // Update wide induction increments to use the same step as the corresponding - // wide induction. This enables detecting induction increments directly in - // VPlan and removes redundant splats. - for (const auto &[Phi, ID] : Legal->getInductionVars()) { - auto *IVInc = cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); - if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) - continue; - VPWidenInductionRecipe *WideIV = - cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); - VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); - R->setOperand(1, WideIV->getStepValue()); - } - // TODO: We can't call runPass on these transforms yet, due to verifier // failures. VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range); @@ -8441,7 +8507,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. 
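// (Sketch: an extended-reduction abstracts 'reduce.add(ext %x)' and a
// mulacc-reduction abstracts 'reduce.add(mul(ext %a, ext %b))' into a single
// recipe each, so such chains are costed as one operation rather than two or
// three.)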
if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -8555,6 +8621,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { void LoopVectorizationPlanner::adjustRecipesForReductions( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { using namespace VPlanPatternMatch; + VPTypeAnalysis TypeInfo(*Plan); VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); @@ -8639,9 +8706,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); VecOp = FMulRecipe; } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && - CurrentLinkI->getOpcode() == Instruction::Sub) { - Type *PhiTy = PhiR->getUnderlyingValue()->getType(); - auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0)); + match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) { + Type *PhiTy = TypeInfo.inferScalarType(PhiR); + auto *Zero = Plan->getConstantInt(PhiTy, 0); VPWidenRecipe *Sub = new VPWidenRecipe( Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, VPIRMetadata(), CurrentLinkI->getDebugLoc()); @@ -8716,7 +8783,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor( cast<PHINode>(PhiR->getUnderlyingInstr())); - Type *PhiTy = PhiR->getUnderlyingValue()->getType(); + Type *PhiTy = TypeInfo.inferScalarType(PhiR); // If tail is folded by masking, introduce selects between the phi // and the users outside the vector region of each reduction, at the // beginning of the dedicated latch block. @@ -8726,7 +8793,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) { + !isa<VPPartialReductionRecipe>(OrigExitingVPV)) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional<FastMathFlags> FMFs = PhiTy->isFloatingPointTy() @@ -8823,7 +8890,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (FinalReductionResult == U || Parent->getParent()) continue; U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult); - if (match(U, m_ExtractLastElement(m_VPValue()))) + if (match(U, m_CombineOr(m_ExtractLastElement(m_VPValue()), + m_ExtractLane(m_VPValue(), m_VPValue())))) cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult); } @@ -8855,8 +8923,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ToDelete.push_back(Select); // Convert the reduction phi to operate on bools. 
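// (Sketch: an any-of reduction 'rdx = select(%cmp, %val, rdx)' is re-phrased
// as an i1 or-reduction that starts at false and ORs in %cmp each iteration;
// the selected and start values are re-materialised after the loop.)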
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); + PhiR->setOperand(0, Plan->getFalse()); continue; } @@ -8878,9 +8945,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned ScaleFactor = RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) .value_or(1); - Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); - auto *ScaleFactorVPV = - Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor)); + auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor); VPValue *StartV = PHBuilder.createNaryOp( VPInstruction::ReductionStartVector, {PhiR->getStartValue(), Iden, ScaleFactorVPV}, @@ -9911,7 +9976,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind, *CM.PSE.getSE(), L); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1b55a3b235228..cc53b0dd3577e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5368,7 +5368,6 @@ class BoUpSLP { Lane = P.first->ReorderIndices[Lane]; assert(Lane < static_cast<int>(P.first->Scalars.size()) && "Couldn't find extract lane"); - SmallVector<unsigned> OpIndices; for (unsigned OpIdx : seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( P.first->getMainOp()))) { @@ -8815,7 +8814,6 @@ void BoUpSLP::buildExternalUses( const ExtraValueToDebugLocsMap &ExternallyUsedValues) { const size_t NumVectScalars = ScalarToTreeEntries.size() + 1; DenseMap<Value *, unsigned> ScalarToExtUses; - SmallPtrSet<Value *, 4> ExternalUsers; // Collect the values that we need to extract from the tree. for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -16846,6 +16844,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } return false; }; + auto CheckNonSchedulableOrdering = [&](const TreeEntry *E, + Instruction *InsertPt) { + return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() && + !TEUseEI.UserTE->isCopyableElement( + const_cast<Instruction *>(TEInsertPt)) && + isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) && + InsertPt->getNextNode() == TEInsertPt && + (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) || + !isUsedOutsideBlock(InsertPt)); + }; for (Value *V : VL) { if (isConstant(V) || !VisitedValue.insert(V).second) continue; @@ -16928,6 +16936,11 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // The node is reused - exit. if (CheckAndUseSameNode(TEPtr)) break; + // If the parent node has copyable elements and its last instruction is + // used outside the block, and the candidate's last instruction immediately + // precedes it, skip the candidate to preserve the def-use chain.
+ if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt)) + continue; VToTEs.insert(TEPtr); } if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) { @@ -16963,7 +16976,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (none_of(TE->CombinedEntriesWithIndices, [&](const auto &P) { return P.first == VTE->Idx; })) { Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) || + CheckNonSchedulableOrdering(VTE, &LastBundleInst)) continue; } // The node is reused - exit. @@ -20975,6 +20989,52 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable, the current node is copyable, and + // any of the parent's instructions is used across several basic blocks or + // by a bin-op node, cancel scheduling: it may introduce wrong def-use + // dependencies into the analysis, leading to a crash. + // Non-scheduled nodes may not have an associated ScheduleData model, which + // may cause the dependency analysis to be skipped. + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; + if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->hasCopyableElements() && + EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() && + all_of(VL, [&](Value *V) { + if (S.isCopyableElement(V)) + return true; + return isUsedOutsideBlock(V); + })) + return std::nullopt; + // If any instruction is used only outside its block and its operand is + // placed immediately before it, do not schedule: it may create a wrong + // def-use chain. + if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) { + if (isa<PoisonValue>(V) || S.isCopyableElement(V)) + return false; + if (isUsedOutsideBlock(V)) { + for (Value *Op : cast<Instruction>(V)->operands()) { + auto *I = dyn_cast<Instruction>(Op); + if (!I) + continue; + return SLP->isVectorized(I) && I->getNextNode() == V; + } + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { @@ -21194,7 +21254,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); SmallVector<ScheduleData *> ControlDependentMembers; - SmallPtrSet<Instruction *, 4> Visited; for (Value *V : VL) { if (S.isNonSchedulable(V)) continue; @@ -22134,6 +22193,27 @@ bool BoUpSLP::collectValuesToDemote( {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); + if (E.isAltShuffle()) { + // Combining these opcodes may lead to incorrect analysis, skip for now.
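+ // E.g. demoting a lshr from i32 to i16 changes which bits are shifted in:
+ // (0x00012345 lshr 8) is 0x123 in i32, but truncating to i16 first yields
+ // (0x2345 lshr 8) = 0x23, so shifts, divisions and remainders cannot be
+ // demoted without additional checks.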
+ auto IsDangerousOpcode = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Shl: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + return false; + }; + if (IsDangerousOpcode(E.getAltOpcode())) + return FinalAnalysis(); + } + switch (E.getOpcode()) { // We can always demote truncations and extensions. Since truncations can diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 9c869dd1bbdca..d354933f9d4ec 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -92,7 +92,7 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { DGNode::print(OS, false); if (PrintDeps) { // Print memory preds. - static constexpr const unsigned Indent = 4; + static constexpr unsigned Indent = 4; for (auto *Pred : MemPreds) OS.indent(Indent) << "<-" << *Pred->getInstruction() << "\n"; } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 86dbd2171a560..5534da902b968 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -25,14 +25,14 @@ static cl::opt<bool> "emit new instructions (*very* expensive).")); #endif // NDEBUG -static constexpr const unsigned long StopAtDisabled = +static constexpr unsigned long StopAtDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopAt("sbvec-stop-at", cl::init(StopAtDisabled), cl::Hidden, cl::desc("Vectorize if the invocation count is < than this. 0 " "disables vectorization.")); -static constexpr const unsigned long StopBundleDisabled = +static constexpr unsigned long StopBundleDisabled = std::numeric_limits<unsigned long>::max(); static cl::opt<unsigned long> StopBundle("sbvec-stop-bndl", cl::init(StopBundleDisabled), cl::Hidden, diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ed2f80ba8900a..2de692143c1b6 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -43,7 +43,7 @@ cl::opt<std::string> AllowFiles( "sbvec-allow-files", cl::init(".*"), cl::Hidden, cl::desc("Run the vectorizer only on file paths that match any in the " "list of comma-separated regex's.")); -static constexpr const char AllowFilesDelim = ','; +static constexpr char AllowFilesDelim = ','; SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 41878e3c648e3..a7000aff06379 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -93,42 +93,37 @@ class VPRecipeBuilder { /// Range. The function should not be called for memory instructions or calls. 
bool shouldWiden(Instruction *I, VFRange &Range) const; - /// Check if the load or store instruction \p I should widened for \p + /// Check if the load or store instruction \p VPI should be widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I, - ArrayRef<VPValue *> Operands, - VFRange &Range); + VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range); - /// Check if an induction recipe should be constructed for \p Phi. If so build + /// Check if an induction recipe should be constructed for \p VPI. If so build /// and return it. If not, return null. - VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi, - ArrayRef<VPValue *> Operands, + VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range); - /// Optimize the special case where the operand of \p I is a constant integer - /// induction variable. + /// Optimize the special case where the operand of \p VPI is a constant + /// integer induction variable. VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands, - VFRange &Range); + tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range); - /// Handle call instructions. If \p CI can be widened for \p Range.Start, + /// Handle call instructions. If \p VPI can be widened for \p Range.Start, /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be /// decreased to ensure same decision from \p Range.Start to \p Range.End. - VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range); + VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range); - /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe - /// if it can. The function should only be called if the cost-model indicates - /// that widening should be performed. - VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands); + /// Check if \p VPI has an opcode that can be widened and return a + /// VPWidenRecipe if it can. The function should only be called if the + /// cost-model indicates that widening should be performed. + VPWidenRecipe *tryToWiden(VPInstruction *VPI); /// Makes Histogram count operations safe for vectorization, by emitting a /// llvm.experimental.vector.histogram.add intrinsic in place of the /// Load + Add|Sub + Store operations that perform the histogram in the /// original scalar loop. VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, - ArrayRef<VPValue *> Operands); + VPInstruction *VPI); /// Examines reduction operations to see if the target can use a cheaper /// operation with a wider per-iteration input VF and narrower PHI VF. @@ -171,8 +166,7 @@ class VPRecipeBuilder { /// Create and return a partial reduction recipe for a reduction instruction /// along with binary operation and reduction phi operands. - VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef<VPValue *> Operands, + VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor); /// Set the recipe created for given ingredient. @@ -197,12 +191,10 @@ class VPRecipeBuilder { return Ingredient2Recipe[I]; } - /// Build a VPReplicationRecipe for \p I using \p Operands. If it is - /// predicated, add the mask as last operand. Range.End may be decreased to - /// ensure same recipe behavior from \p Range.Start to \p Range.End.
- VPReplicateRecipe *handleReplication(Instruction *I, - ArrayRef<VPValue *> Operands, - VFRange &Range); + /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask + /// as last operand. Range.End may be decreased to ensure same recipe behavior + /// from \p Range.Start to \p Range.End. + VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range); VPValue *getVPValueOrAddLiveIn(Value *V) { if (auto *I = dyn_cast<Instruction>(V)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 428a8f4c1348f..62dacf912e210 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -52,10 +52,6 @@ using namespace llvm; using namespace llvm::VPlanPatternMatch; -namespace llvm { -extern cl::opt<bool> EnableVPlanNativePath; -} - /// @{ /// Metadata attribute names const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; @@ -103,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) VPValue::~VPValue() { assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) + if (VPDef *Def = getDefiningRecipe()) Def->removeDefinedValue(this); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def)) + if (const VPRecipeBase *R = getDefiningRecipe()) R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def); + const VPRecipeBase *Instr = getDefiningRecipe(); VPSlotTracker SlotTracker( (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); print(dbgs(), SlotTracker); @@ -304,18 +300,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { } bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1); - // Check if there is a scalar value for the selected lane. - if (!hasScalarValue(Def, LastLane)) { - // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be a single scalar. - assert((isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe, - VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && - "unexpected recipe found to be invariant"); - IsSingleScalar = true; - LastLane = 0; - } // We need to construct the vector value for a single-scalar value by // broadcasting the scalar to all lanes. @@ -1443,7 +1428,7 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { void VPSlotTracker::assignName(const VPValue *V) { assert(!VPValue2Name.contains(V) && "VPValue already has a name!"); auto *UV = V->getUnderlyingValue(); - auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe()); + auto *VPI = dyn_cast_or_null<VPInstruction>(V); if (!UV && !(VPI && !VPI->getName().empty())) { VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str(); NextSlot++; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1f10058ab4a9a..08f77b75400bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -939,7 +939,7 @@ class VPIRMetadata { SmallVector<std::pair<unsigned, MDNode *>> Metadata; public: - VPIRMetadata() {} + VPIRMetadata() = default; /// Adds metatadata that can be preserved from the original instruction /// \p I. 
@@ -950,18 +950,20 @@ class VPIRMetadata { VPIRMetadata(Instruction &I, LoopVersioning *LVer); /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + VPIRMetadata(const VPIRMetadata &Other) = default; - VPIRMetadata &operator=(const VPIRMetadata &Other) { - Metadata = Other.Metadata; - return *this; - } + VPIRMetadata &operator=(const VPIRMetadata &Other) = default; /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; /// Add metadata with kind \p Kind and \p Node. void addMetadata(unsigned Kind, MDNode *Node) { + assert(none_of(Metadata, + [Kind](const std::pair<unsigned, MDNode *> &P) { + return P.first == Kind; + }) && + "Kind must appear at most once in Metadata"); Metadata.emplace_back(Kind, Node); } @@ -1045,6 +1047,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // It produces the lane index across all unrolled iterations. Unrolling will // add all copies of its original operand as additional operands. FirstActiveLane, + // Calculates the last active lane index of the vector predicate operands. + // The predicates must be prefix-masks (all 1s before all 0s). Used when + // tail-folding to extract the correct live-out value from the last active + // iteration. It produces the lane index across all unrolled iterations. + // Unrolling will add all copies of its original operand as additional + // operands. + LastActiveLane, // The opcodes below are used for VPInstructionWithType. // @@ -1107,14 +1116,14 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, - const VPIRFlags &Flags, DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = ""); + const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = new VPInstruction(Opcode, Operands, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1166,10 +1175,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, bool opcodeMayReadOrWriteFromMemory() const; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override; + bool usesFirstPartOnly(const VPValue *Op) const override; /// Returns true if this VPInstruction produces a scalar value from a vector, /// e.g. by performing a reduction or extracting a lane. 
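// (For LastActiveLane above: a prefix mask such as <1,1,1,0> yields
// popcount(mask) - 1 = 2, the index of the last active lane; with unrolling
// the index is computed across the concatenated masks of all parts.)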
@@ -1196,7 +1205,14 @@ class VPInstructionWithType : public VPInstruction { VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, DL, Name), ResultTy(ResultTy) {} + : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), + ResultTy(ResultTy) {} + + VPInstructionWithType(unsigned Opcode, ArrayRef<VPValue *> Operands, + Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, + const VPIRMetadata &Metadata, const Twine &Name = "") + : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), + ResultTy(ResultTy) {} static inline bool classof(const VPRecipeBase *R) { // VPInstructionWithType are VPInstructions with specific opcodes requiring @@ -1221,10 +1237,9 @@ class VPInstructionWithType : public VPInstruction { } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1390,13 +1405,13 @@ class VPIRInstruction : public VPRecipeBase { return true; } - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1625,7 +1640,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override; + bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1722,7 +1737,9 @@ class VPHistogramRecipe : public VPRecipeBase { #endif }; -/// A recipe for widening select instructions. +/// A recipe for widening select instructions. Supports both wide vector and +/// single-scalar conditions, matching the behavior of LLVM IR's select +/// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenSelectRecipe(SelectInst &I, ArrayRef<VPValue *> Operands) @@ -1757,15 +1774,11 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, return getOperand(0); } - bool isInvariantCond() const { - return getCond()->isDefinedOutsideLoopRegions(); - } - /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return Op == getCond() && isInvariantCond(); + return Op == getCond() && Op->isDefinedOutsideLoopRegions(); } }; @@ -1828,7 +1841,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Op == getOperand(0)) @@ -1865,7 +1878,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, void execute(VPTransformState &State) override; - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -1879,7 +1892,7 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -1917,14 +1930,14 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, Type *getSourceElementType() const { return SourceElementTy; } - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } /// Returns true if the recipe only uses the first part of operand \p Op. - bool onlyFirstPartUsed(const VPValue *Op) const override { + bool usesFirstPartOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); assert(getNumOperands() <= 2 && "must have at most two operands"); @@ -2105,7 +2118,7 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe { } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // The recipe creates its own wide start value, so it only requests the @@ -2320,7 +2333,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); @@ -2394,7 +2407,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); @@ -2463,13 +2476,13 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { #endif /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); // Recursing through Blend recipes only, must terminate at header phi's the // latest. 
return all_of(users(), - [this](VPUser *U) { return U->onlyFirstLaneUsed(this); }); + [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } }; @@ -2557,7 +2570,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase, VPCostContext &Ctx) const override; /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override = 0; + bool usesFirstLaneOnly(const VPValue *Op) const override = 0; /// Returns the number of stored operands of this interleave group. Returns 0 /// for load interleave groups. @@ -2603,7 +2616,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { VPSlotTracker &SlotTracker) const override; #endif - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); @@ -2651,7 +2664,7 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { #endif /// The recipe only uses the first lane of the address, and EVL operand. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) || @@ -2857,7 +2870,7 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { VPValue *getEVL() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getEVL(); @@ -2919,7 +2932,7 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, bool isPredicated() const { return IsPredicated; } /// Returns true if the recipe only uses the first lane of operand \p Op. 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return isSingleScalar();
@@ -3201,11 +3214,14 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse, Align Alignment,
+                      bool Consecutive, bool Reverse,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+        Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+        Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+           "Reversed access without VPVectorEndPointerRecipe address?");
   }
 
 public:
@@ -3265,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse, Align Alignment,
+                    bool Consecutive, bool Reverse,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Alignment, Metadata, DL),
+                            Reverse, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, getAlign(),
-                                 *this, getDebugLoc());
+                                 getMask(), Consecutive, Reverse, *this,
+                                 getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3291,7 +3307,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive load operations only demand the first lane of
@@ -3307,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
-                            L.getAlign(), L, L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+                            L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3332,7 +3348,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened loads only demand the first lane of EVL and consecutive loads
@@ -3346,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Alignment, Metadata, DL) {
+                            Consecutive, Reverse, Metadata, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, getAlign(), *this, getDebugLoc());
+                                  Reverse, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3373,7 +3389,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive stores only demand the first lane of their address,
@@ -3390,7 +3406,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+                            S.isReverse(), S, S.getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -3416,7 +3432,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     if (Op == getEVL()) {
@@ -3500,14 +3516,14 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
   }
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -3582,7 +3598,7 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3692,7 +3708,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3757,7 +3773,7 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, VPValue *getStepValue() const { return getOperand(1); } /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { + bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; @@ -3977,7 +3993,7 @@ class VPIRBasicBlock : public VPBasicBlock { IRBB(IRBB) {} public: - ~VPIRBasicBlock() override {} + ~VPIRBasicBlock() override = default; static inline bool classof(const VPBlockBase *V) { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; @@ -4029,7 +4045,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { IsReplicator(IsReplicator) {} public: - ~VPRegionBlock() override {} + ~VPRegionBlock() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -4109,6 +4125,12 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { const VPCanonicalIVPHIRecipe *getCanonicalIV() const { return const_cast<VPRegionBlock *>(this)->getCanonicalIV(); } + + /// Return the type of the canonical IV for loop regions. + Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); } + const Type *getCanonicalIVType() const { + return getCanonicalIV()->getScalarType(); + } }; inline VPRegionBlock *VPRecipeBase::getRegion() { @@ -4387,15 +4409,25 @@ class VPlan { } /// Return a VPValue wrapping i1 true. - VPValue *getTrue() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getTrue(Ctx)); - } + VPValue *getTrue() { return getConstantInt(1, 1); } /// Return a VPValue wrapping i1 false. - VPValue *getFalse() { - LLVMContext &Ctx = getContext(); - return getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + VPValue *getFalse() { return getConstantInt(1, 0); } + + /// Return a VPValue wrapping a ConstantInt with the given type and value. + VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) { + return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value. + VPValue *getConstantInt(unsigned BitWidth, uint64_t Val, + bool IsSigned = false) { + return getConstantInt(APInt(BitWidth, Val, IsSigned)); + } + + /// Return a VPValue wrapping a ConstantInt with the given APInt value. + VPValue *getConstantInt(const APInt &Val) { + return getOrAddLiveIn(ConstantInt::get(getContext(), Val)); } /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. 
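The getConstantInt overloads added above fold the recurring getOrAddLiveIn(ConstantInt::get(...)) pattern into a single call; the later hunks in VPlanConstruction.cpp and VPlanTransforms.cpp switch their call sites over to them. A minimal illustrative sketch of the three forms (not part of the patch; Plan, IVTy, and UF are assumed to be a VPlan, an integer Type *, and an unsigned already in scope):

    // i1 constants via the existing helpers, now routed through getConstantInt.
    VPValue *True = Plan.getTrue();                             // i1 1
    // By explicit type, by bitwidth, or by APInt.
    VPValue *Step = Plan.getConstantInt(IVTy, 1);               // IVTy 1
    VPValue *Mult = Plan.getConstantInt(64, UF);                // i64 UF
    VPValue *Ones = Plan.getConstantInt(APInt::getAllOnes(32)); // i32 -1

All three forms ultimately route through getOrAddLiveIn, so repeated requests for the same constant yield the same live-in VPValue.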
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..fb0b029de3d41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -115,6 +115,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::ExtractLane: return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 65688a3f0b6be..92ff0dcf67927 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/MDBuilder.h" #define DEBUG_TYPE "vplan" @@ -233,10 +234,15 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, for (Value *Op : Inst->operands()) VPOperands.push_back(getOrCreateVPOperand(Op)); - // Build VPInstruction for any arbitrary Instruction without specific - // representation in VPlan. - NewR = cast<VPInstruction>( - VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst)); + if (auto *CI = dyn_cast<CastInst>(Inst)) { + NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], + CI->getType(), CI->getDebugLoc()); + NewR->setUnderlyingValue(CI); + } else { + // Build VPInstruction for any arbitrary Instruction without specific + // representation in VPlan. + NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst); + } } IRDef2VPValue[Inst] = NewR; @@ -612,8 +618,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan, if (!RequiresScalarEpilogueCheck) Cmp = Plan.getFalse(); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); + Cmp = Plan.getTrue(); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -712,8 +717,8 @@ void VPlanTransforms::addMinimumIterationCheck( // additional overflow check is required before entering the vector loop. // Get the maximum unsigned value for the type. 
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get( - TripCountTy, cast<IntegerType>(TripCountTy)->getMask())); + VPValue *MaxUIntTripCount = + Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask()); VPValue *DistanceToMax = Builder.createNaryOp( Instruction::Sub, {MaxUIntTripCount, TripCountVPV}, DebugLoc::getUnknown()); @@ -793,8 +798,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { - auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>( - RedPhiR->getBackedgeValue()->getDefiningRecipe()); + auto *MinMaxR = + dyn_cast_or_null<VPRecipeWithIRFlags>(RedPhiR->getBackedgeValue()); if (!MinMaxR) return nullptr; @@ -824,7 +829,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { }; VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); - VPReductionPHIRecipe *RedPhiR = nullptr; + SmallVector<std::pair<VPReductionPHIRecipe *, VPValue *>> + MinMaxNumReductionsToHandle; bool HasUnsupportedPhi = false; for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R)) @@ -835,19 +841,20 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { HasUnsupportedPhi = true; continue; } - // For now, only a single reduction is supported. - // TODO: Support multiple MaxNum/MinNum reductions and other reductions. - if (RedPhiR) - return false; if (!RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( Cur->getRecurrenceKind())) { HasUnsupportedPhi = true; continue; } - RedPhiR = Cur; + + VPValue *MinMaxOp = GetMinMaxCompareValue(Cur); + if (!MinMaxOp) + return false; + + MinMaxNumReductionsToHandle.emplace_back(Cur, MinMaxOp); } - if (!RedPhiR) + if (MinMaxNumReductionsToHandle.empty()) return true; // We won't be able to resume execution in the scalar tail, if there are @@ -856,14 +863,6 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { if (HasUnsupportedPhi || !Plan.hasScalarTail()) return false; - VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR); - if (!MinMaxOp) - return false; - - assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( - RedPhiR->getRecurrenceKind()) && - "unsupported reduction"); - /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. This requires all /// recipes up to early exit point be side-effect free as they are @@ -880,52 +879,68 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { } VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); - VPBuilder Builder(LatchVPBB->getTerminator()); - auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator()); - assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + VPBuilder LatchBuilder(LatchVPBB->getTerminator()); + VPValue *AllNaNLanes = nullptr; + SmallPtrSet<VPValue *, 2> RdxResults; + for (const auto &[_, MinMaxOp] : MinMaxNumReductionsToHandle) { + VPValue *RedNaNLanes = + LatchBuilder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp); + AllNaNLanes = AllNaNLanes ? 
LatchBuilder.createOr(AllNaNLanes, RedNaNLanes)
+                              : RedNaNLanes;
+  }
+
+  VPValue *AnyNaNLane =
+      LatchBuilder.createNaryOp(VPInstruction::AnyOf, {AllNaNLanes});
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->begin());
+  for (const auto &[RedPhiR, _] : MinMaxNumReductionsToHandle) {
+    assert(RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
+               RedPhiR->getRecurrenceKind()) &&
+           "unsupported reduction");
+
+    // If we exit early due to NaNs, compute the final reduction result based on
+    // the reduction phi at the beginning of the last vector iteration.
+    auto *RdxResult = find_singleton<VPSingleDefRecipe>(
+        RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+          auto *VPI = dyn_cast<VPInstruction>(U);
+          if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+            return VPI;
+          return nullptr;
+        });
+
+    auto *NewSel = MiddleBuilder.createSelect(AnyNaNLane, RedPhiR,
+                                              RdxResult->getOperand(1));
+    RdxResult->setOperand(1, NewSel);
+    assert(!RdxResults.contains(RdxResult) && "RdxResult already used");
+    RdxResults.insert(RdxResult);
+  }
+
+  auto *LatchExitingBranch = LatchVPBB->getTerminator();
+  assert(match(LatchExitingBranch, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
          "Unexpected terminator");
-  auto *IsLatchExitTaken =
-      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
-                         LatchExitingBranch->getOperand(1));
-
-  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
-  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
-  auto *AnyExitTaken =
-      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
-  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+  auto *IsLatchExitTaken = LatchBuilder.createICmp(
+      CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+      LatchExitingBranch->getOperand(1));
+  auto *AnyExitTaken = LatchBuilder.createNaryOp(
+      Instruction::Or, {AnyNaNLane, IsLatchExitTaken});
+  LatchBuilder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
   LatchExitingBranch->eraseFromParent();
 
-  // If we exit early due to NaNs, compute the final reduction result based on
-  // the reduction phi at the beginning of the last vector iteration.
-  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
-      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
-        auto *VPI = dyn_cast<VPInstruction>(U);
-        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
-          return VPI;
-        return nullptr;
-      });
-
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
-  auto *NewSel =
-      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
-  RdxResult->setOperand(1, NewSel);
-
-  auto *ScalarPH = Plan.getScalarPreheader();
-  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
+  // Update resume phis for inductions in the scalar preheader. If AnyNaNLane is
   // true, resume from the start of the last vector iteration via the
   // canonical IV, otherwise from the original value.
- for (auto &R : ScalarPH->phis()) { + for (auto &R : Plan.getScalarPreheader()->phis()) { auto *ResumeR = cast<VPPhi>(&R); VPValue *VecV = ResumeR->getOperand(0); - if (VecV == RdxResult) + if (RdxResults.contains(VecV)) continue; if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) { if (DerivedIV->getNumUsers() == 1 && DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { - auto *NewSel = Builder.createSelect( - AnyNaN, LoopRegion->getCanonicalIV(), &Plan.getVectorTripCount()); - DerivedIV->moveAfter(&*Builder.getInsertPoint()); + auto *NewSel = + MiddleBuilder.createSelect(AnyNaNLane, LoopRegion->getCanonicalIV(), + &Plan.getVectorTripCount()); + DerivedIV->moveAfter(&*MiddleBuilder.getInsertPoint()); DerivedIV->setOperand(1, NewSel); continue; } @@ -937,15 +952,16 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { "FMaxNum/FMinNum reduction.\n"); return false; } - auto *NewSel = - Builder.createSelect(AnyNaN, LoopRegion->getCanonicalIV(), VecV); + auto *NewSel = MiddleBuilder.createSelect( + AnyNaNLane, LoopRegion->getCanonicalIV(), VecV); ResumeR->setOperand(0, NewSel); } auto *MiddleTerm = MiddleVPBB->getTerminator(); - Builder.setInsertPoint(MiddleTerm); + MiddleBuilder.setInsertPoint(MiddleTerm); VPValue *MiddleCond = MiddleTerm->getOperand(0); - VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN)); + VPValue *NewCond = + MiddleBuilder.createAnd(MiddleCond, MiddleBuilder.createNot(AnyNaNLane)); MiddleTerm->setOperand(0, NewCond); return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9ebdd04..caabfa7275b69 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -/// A helper function that returns how much we should divide the cost of a -/// predicated block by. Typically this is the reciprocal of the block -/// probability, i.e. if we return X we are assuming the predicated block will -/// execute once for every X iterations of the loop header so the block should -/// only contribute 1/X of its cost to the total cost calculation, but when -/// optimizing for code size it will just be 1 as code size costs don't depend -/// on execution probabilities. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned -getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) { - return CostKind == TTI::TCK_CodeSize ? 1 : 2; -} - /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. 
The range includes start and excludes end, e.g.,: /// [1, 16) = {1, 2, 4, 8} @@ -350,13 +335,14 @@ struct VPCostContext { SmallPtrSet<Instruction *, 8> SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -366,6 +352,10 @@ struct VPCostContext { /// has already been pre-computed. bool skipCostComputation(Instruction *UI, bool IsVector) const; + /// \returns how much the cost of a predicated block should be divided by. + /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor. + unsigned getPredBlockCostDivisor(BasicBlock *BB) const; + /// Returns the OperandInfo for \p V, if it is a live-in. TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index b5b98c64543e4..f34c99b84b1aa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -313,7 +313,8 @@ struct Recipe_match { // Check for recipes that do not have opcodes. if constexpr (std::is_same_v<RecipeTy, VPScalarIVStepsRecipe> || std::is_same_v<RecipeTy, VPCanonicalIVPHIRecipe> || - std::is_same_v<RecipeTy, VPDerivedIVRecipe>) + std::is_same_v<RecipeTy, VPDerivedIVRecipe> || + std::is_same_v<RecipeTy, VPVectorEndPointerRecipe>) return DefR; else return DefR && DefR->getOpcode() == Opcode; @@ -394,12 +395,24 @@ m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline VPInstruction_match<VPInstruction::ExtractLane, Op0_t, Op1_t> +m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1) { + return m_VPInstruction<VPInstruction::ExtractLane>(Op0, Op1); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t> m_ExtractLastLanePerPart(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::ExtractPenultimateElement, Op0_t> +m_ExtractPenultimateElement(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::ExtractPenultimateElement>(Op0); +} + template <typename Op0_t, typename Op1_t, typename Op2_t> inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t> m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { @@ -428,6 +441,16 @@ m_FirstActiveLane(const Op0_t &Op0) { return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0); } +template <typename Op0_t> +inline VPInstruction_match<VPInstruction::LastActiveLane, Op0_t> +m_LastActiveLane(const Op0_t &Op0) { + return m_VPInstruction<VPInstruction::LastActiveLane>(Op0); +} + +inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() { + return m_VPInstruction<VPInstruction::StepVector>(); +} + template <unsigned Opcode, typename Op0_t> inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) { return 
AllRecipe_match<Opcode, Op0_t>(Op0); @@ -473,6 +496,12 @@ m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) { return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline AllRecipe_match<Instruction::Add, Op0_t, Op1_t> m_Add(const Op0_t &Op0, + const Op1_t &Op1) { + return m_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1); +} + template <typename Op0_t, typename Op1_t> inline AllRecipe_commutative_match<Instruction::Add, Op0_t, Op1_t> m_c_Add(const Op0_t &Op0, const Op1_t &Op1) { @@ -686,6 +715,64 @@ m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return VPDerivedIV_match<Op0_t, Op1_t, Op2_t>({Op0, Op1, Op2}); } +template <typename Addr_t, typename Mask_t> struct Load_match { + Addr_t Addr; + Mask_t Mask; + + Load_match(Addr_t Addr, Mask_t Mask) : Addr(Addr), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Load = dyn_cast<VPWidenLoadRecipe>(V); + if (!Load || !Addr.match(Load->getAddr()) || !Load->isMasked() || + !Mask.match(Load->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked load. +template <typename Addr_t, typename Mask_t> +inline Load_match<Addr_t, Mask_t> m_MaskedLoad(const Addr_t &Addr, + const Mask_t &Mask) { + return Load_match<Addr_t, Mask_t>(Addr, Mask); +} + +template <typename Addr_t, typename Val_t, typename Mask_t> struct Store_match { + Addr_t Addr; + Val_t Val; + Mask_t Mask; + + Store_match(Addr_t Addr, Val_t Val, Mask_t Mask) + : Addr(Addr), Val(Val), Mask(Mask) {} + + template <typename OpTy> bool match(const OpTy *V) const { + auto *Store = dyn_cast<VPWidenStoreRecipe>(V); + if (!Store || !Addr.match(Store->getAddr()) || + !Val.match(Store->getStoredValue()) || !Store->isMasked() || + !Mask.match(Store->getMask())) + return false; + return true; + } +}; + +/// Match a (possibly reversed) masked store. +template <typename Addr_t, typename Val_t, typename Mask_t> +inline Store_match<Addr_t, Val_t, Mask_t> +m_MaskedStore(const Addr_t &Addr, const Val_t &Val, const Mask_t &Mask) { + return Store_match<Addr_t, Val_t, Mask_t>(Addr, Val, Mask); +} + +template <typename Op0_t, typename Op1_t> +using VectorEndPointerRecipe_match = + Recipe_match<std::tuple<Op0_t, Op1_t>, 0, + /*Commutative*/ false, VPVectorEndPointerRecipe>; + +template <typename Op0_t, typename Op1_t> +VectorEndPointerRecipe_match<Op0_t, Op1_t> m_VecEndPtr(const Op0_t &Op0, + const Op1_t &Op1) { + return VectorEndPointerRecipe_match<Op0_t, Op1_t>(Op0, Op1); +} + /// Match a call argument at a given argument index. template <typename Opnd_t> struct Argument_match { /// Call argument index to match. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index fb17d5dd62b9d..3579af21d8b07 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -44,11 +44,6 @@ class VPPredicator { /// possibly inserting new recipes at \p Dst (using Builder's insertion point) VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); - /// Returns the *entry* mask for \p VPBB. - VPValue *getBlockInMask(VPBasicBlock *VPBB) const { - return BlockMaskCache.lookup(VPBB); - } - /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not /// already have a mask. void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { @@ -68,6 +63,11 @@ class VPPredicator { } public: + /// Returns the *entry* mask for \p VPBB. 
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { return EdgeMaskCache.lookup({Src, Dst}); @@ -301,5 +301,34 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { PrevVPBB = VPBB; } + + // If we folded the tail and introduced a header mask, any extract of the + // last element must be updated to extract from the last active lane of the + // header mask instead (i.e., the lane corresponding to the last active + // iteration). + if (FoldTail) { + assert(Plan.getExitBlocks().size() == 1 && + "only a single-exit block is supported currently"); + VPBasicBlock *EB = Plan.getExitBlocks().front(); + assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() && + "the exit block must have middle block as single predecessor"); + + VPBuilder B(Plan.getMiddleBlock()->getTerminator()); + for (auto &P : EB->phis()) { + auto *ExitIRI = cast<VPIRPhi>(&P); + VPValue *Inc = ExitIRI->getIncomingValue(0); + VPValue *Op; + if (!match(Inc, m_ExtractLastElement(m_VPValue(Op)))) + continue; + + // Compute the index of the last active lane. + VPValue *HeaderMask = Predicator.getBlockInMask(Header); + VPValue *LastActiveLane = + B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask); + auto *Ext = + B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op}); + Inc->replaceAllUsesWith(Ext); + } + } return Predicator.getBlockMaskCache(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c802047ea..5e46659227262 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -162,8 +162,12 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPVectorEndPointerSC: return false; - case VPInstructionSC: - return mayWriteToMemory(); + case VPInstructionSC: { + auto *VPI = cast<VPInstruction>(this); + return mayWriteToMemory() || + VPI->getOpcode() == VPInstruction::BranchOnCount || + VPI->getOpcode() == VPInstruction::BranchOnCond; + } case VPWidenCallSC: { Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction(); return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn(); @@ -307,18 +311,27 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, std::optional<unsigned> Opcode; VPValue *Op = getVecOp(); uint64_t MulConst; + + InstructionCost CondCost = 0; + if (isConditional()) { + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *VecTy = Ctx.Types.inferScalarType(Op); + auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); + CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, + Pred, Ctx.CostKind); + } + // If the partial reduction is predicated, a select will be operand 1. // If it isn't predicated and the mul isn't operating on a constant, then it // should have been turned into a VPExpressionRecipe. // FIXME: Replace the entire function with this once all partial reduction // variants are bundled into VPExpressionRecipe. 
-  if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
-      !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
+  if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
     auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
     auto *InputType = Ctx.Types.inferScalarType(getVecOp());
-    return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
-                                           PhiType, VF, TTI::PR_None,
-                                           TTI::PR_None, {}, Ctx.CostKind);
+    return CondCost + Ctx.TTI.getPartialReductionCost(
+                          getOpcode(), InputType, InputType, PhiType, VF,
+                          TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind);
   }
 
   VPRecipeBase *OpR = Op->getDefiningRecipe();
@@ -343,7 +356,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   // recipe.
   auto HandleWiden = [&](VPWidenRecipe *Widen) {
     if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
-      Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
+      Widen = dyn_cast<VPWidenRecipe>(Op);
     }
     Opcode = Widen->getOpcode();
     VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
@@ -368,21 +381,21 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
     ExtAType = GetExtendKind(OpR);
   } else if (isa<VPReductionPHIRecipe>(OpR)) {
-    auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
-    if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
+    if (auto RedPhiOp1R = dyn_cast_or_null<VPWidenCastRecipe>(getOperand(1))) {
      InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
      ExtAType = GetExtendKind(RedPhiOp1R);
-    } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
+    } else if (auto Widen = dyn_cast_or_null<VPWidenRecipe>(getOperand(1)))
       HandleWiden(Widen);
   } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
     HandleWiden(Widen);
   } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
-    return Reduction->computeCost(VF, Ctx);
+    return CondCost + Reduction->computeCost(VF, Ctx);
   }
   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
-  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
-                                         PhiType, VF, ExtAType, ExtBType,
-                                         Opcode, Ctx.CostKind);
+  return CondCost + Ctx.TTI.getPartialReductionCost(
+                        getOpcode(), InputTypeA, InputTypeB, PhiType, VF,
+                        ExtAType, ExtBType, Opcode, Ctx.CostKind);
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
@@ -391,12 +404,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   assert(getOpcode() == Instruction::Add &&
          "Unhandled partial reduction opcode");
 
-  Value *BinOpVal = State.get(getOperand(1));
-  Value *PhiVal = State.get(getOperand(0));
+  Value *BinOpVal = State.get(getVecOp());
+  Value *PhiVal = State.get(getChainOp());
   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
 
   Type *RetTy = PhiVal->getType();
 
+  if (isConditional()) {
+    Value *Cond = State.get(getCondOp());
+    Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+    BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero);
+  }
+
   CallInst *V =
       Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
                               {PhiVal, BinOpVal}, nullptr, "partial.reduce");
@@ -490,10 +509,10 @@ template class VPUnrollPartAccessor<3>;
 }
 
 VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                             const VPIRFlags &Flags, DebugLoc DL,
-                             const Twine &Name)
+                             const VPIRFlags &Flags, const VPIRMetadata &MD,
+                             DebugLoc DL, const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(MD), Opcode(Opcode),
Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); assert((getNumOperandsForOpcode(Opcode) == -1u || @@ -528,6 +547,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ExtractLastLanePerPart: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::Not: case VPInstruction::Unpack: return 1; @@ -655,7 +675,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -1135,6 +1157,29 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::LastActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); + // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. + auto *PredTy = toVectorTy(ScalarTy, VF); + IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, + Type::getInt64Ty(Ctx.LLVMCtx), + {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); + InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); + // Add cost of NOT operation on the predicate. + Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Xor, PredTy, Ctx.CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None}); + // Add cost of SUB operation on the index. 
+ Cost += Ctx.TTI.getArithmeticInstrCost( + Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind); + return Cost; + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector<int> Mask(VF.getKnownMinValue()); @@ -1189,6 +1234,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == Instruction::ExtractElement || getOpcode() == VPInstruction::ExtractLane || getOpcode() == VPInstruction::FirstActiveLane || + getOpcode() == VPInstruction::LastActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || getOpcode() == VPInstruction::ComputeReductionResult || @@ -1241,6 +1287,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::Select: case Instruction::PHI: case VPInstruction::AnyOf: + case VPInstruction::BranchOnCond: + case VPInstruction::BranchOnCount: case VPInstruction::Broadcast: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: @@ -1252,6 +1300,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: + case VPInstruction::LastActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1268,7 +1317,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { } } -bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) return vputils::onlyFirstLaneUsed(this); @@ -1317,7 +1366,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { llvm_unreachable("switch should return"); } -bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { +bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) return vputils::onlyFirstPartUsed(this); @@ -1428,6 +1477,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstActiveLane: O << "first-active-lane"; break; + case VPInstruction::LastActiveLane: + O << "last-active-lane"; + break; case VPInstruction::ReductionStartVector: O << "reduction-start-vector"; break; @@ -1684,7 +1736,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (!VFTy->getParamType(I.index())->isVectorTy()) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); Args.push_back(Arg); } @@ -1753,7 +1805,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.TTI)) Arg = State.get(I.value(), VPLane(0)); else - Arg = State.get(I.value(), onlyFirstLaneUsed(I.value())); + Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), State.TTI)) TysForDecl.push_back(Arg->getType()); @@ -1835,7 +1887,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } -bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const { +bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { 
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
     auto [Idx, V] = X;
@@ -1962,16 +2014,13 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
   getOperand(1)->printAsOperand(O, SlotTracker);
   O << ", ";
   getOperand(2)->printAsOperand(O, SlotTracker);
-  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
+  O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)"
+                                           : "");
 }
 #endif
 
 void VPWidenSelectRecipe::execute(VPTransformState &State) {
-  // The condition can be loop invariant but still defined inside the
-  // loop. This means that we can't just use the original 'cond' value.
-  // We have to take the 'vectorized' value and pick the first lane.
-  // Instcombine will make this a no-op.
-  Value *Cond = State.get(getCond(), isInvariantCond());
+  Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond()));
 
   Value *Op0 = State.get(getOperand(1));
   Value *Op1 = State.get(getOperand(2));
@@ -2372,9 +2421,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
-         getScalarType() == CanIV->getScalarType();
+         getScalarType() == getRegion()->getCanonicalIVType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3167,26 +3215,30 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost, and nullptr otherwise. Computing SCEVs for VPValues is
+/// incomplete and may return SCEVCouldNotCompute in cases where the legacy
+/// cost model can compute a SCEV; in those cases we fall back to the legacy
+/// cost model.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
   auto *PtrR = Ptr->getDefiningRecipe();
-  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
-                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+  if (!PtrR || !((isa<VPReplicateRecipe>(Ptr) &&
+                  cast<VPReplicateRecipe>(Ptr)->getOpcode() ==
                       Instruction::GetElementPtr) ||
-                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 isa<VPWidenGEPRecipe>(Ptr) ||
                  match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
-    return false;
+    return nullptr;
 
   // We are looking for a GEP where all indices are either loop invariant or
   // inductions.
   for (VPValue *Opd : drop_begin(PtrR->operands())) {
     if (!Opd->isDefinedOutsideLoopRegions() &&
         !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return false;
+      return nullptr;
   }
 
-  return true;
+  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
 }
 
 /// Returns true if \p V is used as part of the address of another load or
@@ -3341,7 +3393,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
       // Scale the cost by the probability of executing the predicated blocks.
       // This assumes the predicated block for each vector lane is equally
      // likely.
- ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); + ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent()); return ScalarCost; } case Instruction::Load: @@ -3354,9 +3406,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3425,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h index 77ff36cc2c600..44972c68ba9c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.h +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -89,8 +89,7 @@ class VPlanSlp { /// Width of the widest combined bundle in bits. unsigned WidestBundleBits = 0; - using MultiNodeOpTy = - typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; + using MultiNodeOpTy = std::pair<VPInstruction *, SmallVector<VPValue *, 4>>; // Input operand bundles for the current multi node. Each multi node operand // bundle contains values not matching the multi node's opcode. They will diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4d98014622224..e8fea6851dae5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,10 +40,6 @@ using namespace llvm; using namespace VPlanPatternMatch; -static cl::opt<bool> EnableWideActiveLaneMask( - "enable-wide-lane-mask", cl::init(false), cl::Hidden, - cl::desc("Enable use of wide get active lane mask instructions")); - bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlan &Plan, function_ref<const InductionDescriptor *(PHINode *)> @@ -91,14 +87,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, Load->getAlign(), - VPIRMetadata(*Load), Ingredient.getDebugLoc()); + false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - Store->getAlign(), VPIRMetadata(*Store), - Ingredient.getDebugLoc()); + VPIRMetadata(*Store), Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) { @@ -151,59 +146,64 @@ static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) { static bool sinkScalarOperands(VPlan &Plan) { auto Iter = vp_depth_first_deep(Plan.getEntry()); + bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; + + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe 
*>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. + if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) + return; + + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) + return; + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) + if (!ScalarVFOnly && RepR->isSingleScalar()) + return; + + WorkList.insert({SinkTo, Candidate}); + }; + // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) continue; - VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]); - if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) + VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); + if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { + for (auto &Recipe : *VPBB) for (VPValue *Op : Recipe.operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({VPBB, Def}); - } + InsertIfValidSinkCandidate(VPBB, Op); } - bool ScalarVFOnly = Plan.hasScalarVFOnly(); // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - if (SinkCandidate->getParent() == SinkTo || - SinkCandidate->mayHaveSideEffects() || - SinkCandidate->mayReadOrWriteMemory()) - continue; - if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isSingleScalar()) - continue; - } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate)) - continue; - bool NeedsDuplicating = false; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must be uniform-after-vectorization ( - // i.e., only first lane is used) . In the latter case, we need to duplicate - // SinkCandidate. - auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, - SinkCandidate](VPUser *U) { - auto *UI = cast<VPRecipeBase>(U); - if (UI->getParent() == SinkTo) - return true; - NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. - return NeedsDuplicating && - isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate); - }; - if (!all_of(SinkCandidate->users(), CanSinkWithUser)) + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In + // the latter case, we need to duplicate SinkCandidate. 
+ auto UsersOutsideSinkTo = + make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { + return cast<VPRecipeBase>(U)->getParent() != SinkTo; + }); + if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) { + return !U->usesFirstLaneOnly(SinkCandidate); + })) continue; + bool NeedsDuplicating = !UsersOutsideSinkTo.empty(); if (NeedsDuplicating) { if (ScalarVFOnly) @@ -228,9 +228,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; @@ -582,10 +580,13 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // Check if R is a dead VPPhi <-> update cycle and remove it. auto *PhiR = dyn_cast<VPPhi>(&R); - if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1) + if (!PhiR || PhiR->getNumOperands() != 2) + continue; + VPUser *PhiUser = PhiR->getSingleUser(); + if (!PhiUser) continue; VPValue *Incoming = PhiR->getOperand(1); - if (*PhiR->user_begin() != Incoming->getDefiningRecipe() || + if (PhiUser != Incoming->getDefiningRecipe() || Incoming->getNumUsers() != 1) continue; PhiR->replaceAllUsesWith(PhiR->getOperand(0)); @@ -699,8 +700,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { continue; const InductionDescriptor &ID = PtrIV->getInductionDescriptor(); - VPValue *StartV = - Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0)); + VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0); VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, @@ -805,8 +805,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, ScalarEvolution &SE) { VPValue *Incoming, *Mask; - if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>( - m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming)))) + if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)), + m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming, SE); @@ -820,7 +820,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // Calculate the final index. VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); auto *CanonicalIV = LoopRegion->getCanonicalIV(); - Type *CanonicalIVType = CanonicalIV->getScalarType(); + Type *CanonicalIVType = LoopRegion->getCanonicalIVType(); VPBuilder B(cast<VPBasicBlock>(PredVPBB)); DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc(); @@ -836,7 +836,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, // changed it means the exit is using the incremented value, so we need to // add the step. 
if (Incoming != WideIV) { - VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1)); + VPValue *One = Plan.getConstantInt(CanonicalIVType, 1); EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL); } @@ -882,7 +882,7 @@ static VPValue *optimizeLatchExitInductionUser( return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); if (ScalarTy->isPointerTy()) { Type *StepTy = TypeInfo.inferScalarType(Step); - auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0)); + auto *Zero = Plan.getConstantInt(StepTy, 0); return B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), DebugLoc::getUnknown(), "ind.escape"); @@ -1057,13 +1057,9 @@ static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, return nullptr; } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - VPlan *Plan = R.getParent()->getPlan(); - - auto *Def = dyn_cast<VPSingleDefRecipe>(&R); - if (!Def) - return; +/// Try to simplify VPSingleDefRecipe \p Def. +static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { + VPlan *Plan = Def->getParent()->getPlan(); // Simplification of live-in IR values for SingleDef recipes using // InstSimplifyFolder. @@ -1073,7 +1069,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(V); // Fold PredPHI LiveIn -> LiveIn. - if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { + if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) { VPValue *Op = PredPHI->getOperand(0); if (Op->isLiveIn()) PredPHI->replaceAllUsesWith(Op); @@ -1092,12 +1088,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue())) ? Instruction::SExt : Instruction::ZExt; auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) { // UnderlyingExt has distinct return type, used to retain legacy cost. Ext->setUnderlyingValue(UnderlyingExt); } @@ -1160,7 +1156,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Builder.createLogicalAnd(X, Builder.createOr(Y, Z))); // x && !x -> 0 - if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) + if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) return Def->replaceAllUsesWith(Plan->getFalse()); if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) @@ -1188,8 +1184,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return Def->replaceAllUsesWith(A); if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt()))) - return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) - : R.getOperand(0)); + return Def->replaceAllUsesWith( + Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0)); if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) @@ -1218,12 +1214,23 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // If Cmp doesn't have a debug location, use the one from the negation, // to preserve the location. 
-      if (!Cmp->getDebugLoc() && R.getDebugLoc())
-        Cmp->setDebugLoc(R.getDebugLoc());
+      if (!Cmp->getDebugLoc() && Def->getDebugLoc())
+        Cmp->setDebugLoc(Def->getDebugLoc());
       }
     }
   }
 
+  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
+  // This is useful for fmax/fmin without fast-math flags, where we need to
+  // check if any operand is NaN.
+  if (match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
+                                          m_Deferred(X)),
+                            m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
+                                          m_Deferred(Y))))) {
+    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
+    return Def->replaceAllUsesWith(NewCmp);
+  }
+
   // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
   if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
        match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
@@ -1245,7 +1252,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                   m_VPValue(X), m_VPValue())) &&
       match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
-      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
+      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
     Def->setOperand(1, Def->getOperand(0));
     Def->setOperand(0, Y);
     return;
@@ -1253,35 +1260,50 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 
   if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
     if (Phi->getOperand(0) == Phi->getOperand(1))
-      Def->replaceAllUsesWith(Phi->getOperand(0));
+      Phi->replaceAllUsesWith(Phi->getOperand(0));
     return;
   }
 
   // Look through ExtractLastElement (BuildVector ....).
-  if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
-                            m_ExtractLastLanePerPart(m_BuildVector())))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
+                             m_ExtractLastLanePerPart(m_BuildVector())))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 1));
     return;
   }
 
   // Look through ExtractPenultimateElement (BuildVector ....).
-  if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
-                    m_BuildVector()))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(
         BuildVector->getOperand(BuildVector->getNumOperands() - 2));
     return;
   }
 
   uint64_t Idx;
-  if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
-    auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
+  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
+    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
     Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
     return;
   }
 
+  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
+    Def->replaceAllUsesWith(
+        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
+    return;
+  }
+
+  // Look through broadcasts of a single scalar used as a select condition; in
+  // that case the scalar condition can be used directly.
+ if (match(Def, + m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) { + assert(vputils::isSingleScalar(C) && + "broadcast operand must be single-scalar"); + Def->setOperand(0, C); + return; + } + if (auto *Phi = dyn_cast<VPPhi>(Def)) { if (Phi->getNumOperands() == 1) Phi->replaceAllUsesWith(Phi->getOperand(0)); @@ -1298,7 +1320,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { isa<VPPhi>(X)) { auto *Phi = cast<VPPhi>(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) { + Phi->getSingleUser() == Def) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1306,7 +1328,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } // VPVectorPointer for part 0 can be replaced by its start pointer. - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) { if (VecPtr->isFirstPart()) { VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); return; @@ -1361,9 +1383,9 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R)) + simplifyRecipe(Def, TypeInfo); } } @@ -1407,10 +1429,26 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // broadcasts. if (!vputils::isSingleScalar(RepOrWidenR) || !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { - return U->usesScalars(RepOrWidenR) || - match(cast<VPRecipeBase>(U), - m_CombineOr(m_ExtractLastElement(m_VPValue()), - m_ExtractLastLanePerPart(m_VPValue()))); + if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) { + // A VPWidenStore has no users itself, and stores are always + // profitable to widen: hence, permitting its address and mask + // operands, as well as a single-scalar stored value, is an + // important leaf condition. The assert must hold as we checked + // the RepOrWidenR operand against vputils::isSingleScalar. + assert(RepOrWidenR != Store->getStoredValue() || + vputils::isSingleScalar(Store->getStoredValue())); + return true; + } + + if (auto *VPI = dyn_cast<VPInstruction>(U)) { + unsigned Opcode = VPI->getOpcode(); + if (Opcode == VPInstruction::ExtractLastElement || + Opcode == VPInstruction::ExtractLastLanePerPart || + Opcode == VPInstruction::ExtractPenultimateElement) + return true; + } + + return U->usesScalars(RepOrWidenR); })) continue; @@ -1419,6 +1457,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { true /*IsSingleScalar*/); Clone->insertBefore(RepOrWidenR); RepOrWidenR->replaceAllUsesWith(Clone); + if (isDeadRecipe(*RepOrWidenR)) + RepOrWidenR->eraseFromParent(); } } } @@ -1565,22 +1605,23 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, // Currently only handle cases where the single user is a header-mask // comparison with the backedge-taken-count. - if (!match(*WideIV->user_begin(), - m_ICmp(m_Specific(WideIV), - m_Broadcast( - m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) + VPUser *SingleUser = WideIV->getSingleUser(); + if (!SingleUser || + !match(SingleUser, m_ICmp(m_Specific(WideIV), + m_Broadcast(m_Specific( + Plan.getOrCreateBackedgeTakenCount()))))) continue; // Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); + auto *NewStart = Plan.getConstantInt(NewIVTy, 0); WideIV->setStartValue(NewStart); - auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); + auto *NewStep = Plan.getConstantInt(NewIVTy, 1); WideIV->setStepValue(NewStep); auto *NewBTC = new VPWidenCastRecipe( Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); Plan.getVectorPreheader()->appendRecipe(NewBTC); - auto *Cmp = cast<VPInstruction>(*WideIV->user_begin()); + auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser()); Cmp->setOperand(1, NewBTC); MadeChange = true; @@ -1693,8 +1734,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, // When using wide lane masks, the return type of the get.active.lane.mask // intrinsic is VF x UF (last operand). - VPValue *ALMMultiplier = - Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF)); + VPValue *ALMMultiplier = Plan.getConstantInt(64, UF); EntryALM->setOperand(2, ALMMultiplier); LoopALM->setOperand(2, ALMMultiplier); @@ -1731,17 +1771,17 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, if (match(Term, m_BranchOnCount()) || match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( m_VPValue(), m_VPValue(), m_VPValue()))))) { - // Try to simplify the branch condition if TC <= VF * UF when the latch - // terminator is BranchOnCount or BranchOnCond where the input is - // Not(ActiveLaneMask). - const SCEV *TripCount = - vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TripCount) && + // Try to simplify the branch condition if VectorTC <= VF * UF when the + // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)). + const SCEV *VectorTripCount = + vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE); + if (isa<SCEVCouldNotCompute>(VectorTripCount)) + VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(VectorTripCount) && "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); - const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); - if (TripCount->isZero() || - !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) + const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements); + if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C)) return false; } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) { // For BranchOnCond, check if we can prove the condition to be true using VF @@ -2015,6 +2055,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); + + // Check for users extracting at the penultimate active lane of the FOR. + // If only a single lane is active in the current iteration, we need to + // select the last element from the previous iteration (from the FOR phi + // directly). 
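+ // e.g. (illustrative): for a last-active-lane value %lal, the user
+ //   extract-lane %lal, %recur.splice
+ // is rewritten to
+ //   %penult = extract-lane (%lal - 1), %for.backedge-value
+ //   %last   = extract-last-element %for.phi
+ //   select (%lal == 0), %last, %penult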
+ for (VPUser *U : RecurSplice->users()) { + if (!match(U, m_ExtractLane(m_LastActiveLane(m_VPValue()), + m_Specific(RecurSplice)))) + continue; + + VPBuilder B(cast<VPInstruction>(U)); + VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0); + Type *I64Ty = Type::getInt64Ty(Plan.getContext()); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0)); + VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); + VPValue *PenultimateIndex = + B.createNaryOp(Instruction::Sub, {LastActiveLane, One}); + VPValue *PenultimateLastIter = + B.createNaryOp(VPInstruction::ExtractLane, + {PenultimateIndex, FOR->getBackedgeValue()}); + VPValue *LastPrevIter = + B.createNaryOp(VPInstruction::ExtractLastElement, FOR); + VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero); + VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter); + cast<VPInstruction>(U)->replaceAllUsesWith(Sel); + } } return true; } @@ -2400,8 +2466,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1)); + VPValue *ALMMultiplier = + Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1); auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC, ALMMultiplier}, DL, "active.lane.mask.entry"); @@ -2501,7 +2567,7 @@ void VPlanTransforms::addActiveLaneMask( } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); VPValue *ALMMultiplier = Plan.getOrAddLiveIn( - ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1)); + ConstantInt::get(LoopRegion->getCanonicalIVType(), 1)); LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, @@ -2515,90 +2581,108 @@ void VPlanTransforms::addActiveLaneMask( HeaderMask->eraseFromParent(); } +template <typename Op0_t, typename Op1_t> struct RemoveMask_match { + Op0_t In; + Op1_t &Out; + + RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {} + + template <typename OpTy> bool match(OpTy *V) const { + if (m_Specific(In).match(V)) { + Out = nullptr; + return true; + } + if (m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V)) + return true; + return false; + } +}; + +/// Match the mask \p In directly, or a logical-and of \p In and another mask. +/// On a match, \p Out is set to the remaining mask, or to nullptr if \p In +/// matched by itself. +template <typename Op0_t, typename Op1_t> +static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In, + Op1_t &Out) { + return RemoveMask_match<Op0_t, Op1_t>(In, Out); } + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. /// \p HeaderMask Header Mask. /// \p CurRecipe Recipe to be transformed. /// \p TypeInfo VPlan-based type analysis. -/// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, - VPTypeAnalysis &TypeInfo, - VPValue &AllOneMask, VPValue &EVL) { - // FIXME: Don't transform recipes to EVL recipes if they're not masked by the - // header mask.
- auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { - assert(OrigMask && "Unmasked recipe when folding tail"); - // HeaderMask will be handled using EVL. - VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) - return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; - }; + VPTypeAnalysis &TypeInfo, VPValue &EVL) { + VPlan *Plan = CurRecipe.getParent()->getPlan(); + VPValue *Addr, *Mask, *EndPtr; /// Adjust any end pointers so that they point to the end of EVL lanes not VF. - auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * { - auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr); - if (!EndPtr) - return Addr; - assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() && - "VPVectorEndPointerRecipe with non-VF VF operand?"); - assert( - all_of(EndPtr->users(), - [](VPUser *U) { - return cast<VPWidenMemoryRecipe>(U)->isReverse(); - }) && - "VPVectorEndPointRecipe not used by reversed widened memory recipe?"); - VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone(); - EVLAddr->insertBefore(&CurRecipe); - EVLAddr->setOperand(1, &EVL); - return EVLAddr; + auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) { + auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone(); + EVLEndPtr->insertBefore(&CurRecipe); + EVLEndPtr->setOperand(1, &EVL); + return EVLEndPtr; }; - return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) - .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) { - VPValue *NewMask = GetNewMask(L->getMask()); - VPValue *NewAddr = GetNewAddr(L->getAddr()); - return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); - }) - .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) { - VPValue *NewMask = GetNewMask(S->getMask()); - VPValue *NewAddr = GetNewAddr(S->getAddr()); - return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask); - }) - .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) { - VPValue *NewMask = GetNewMask(IR->getMask()); - return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); - }) - .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) { - VPValue *NewMask = GetNewMask(Red->getCondOp()); - return new VPReductionEVLRecipe(*Red, EVL, NewMask); - }) - .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * { - VPValue *LHS, *RHS; - // Transform select with a header mask condition - // select(header_mask, LHS, RHS) - // into vector predication merge. 
- // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, + m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenLoadRecipe>(CurRecipe).isReverse()) + return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + !cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr, + EVL, Mask); + + if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(), + m_RemoveMask(HeaderMask, Mask))) && + match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) && + cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) + return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), + AdjustEndPtr(EndPtr), EVL, Mask); + + if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe)) + if (Rdx->isConditional() && + match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask))) + return new VPReductionEVLRecipe(*Rdx, EVL, Mask); + + if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe)) + if (Interleave->getMask() && + match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask))) + return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask); + + VPValue *LHS, *RHS; + if (match(&CurRecipe, + m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS), m_VPValue(RHS)))) - return nullptr; - // Use all true as the condition because this transformation is - // limited to selects whose condition is a header mask. - return new VPWidenIntrinsicRecipe( - Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), VPI->getDebugLoc()); - }) - .Default([&](VPRecipeBase *R) { return nullptr; }); + return new VPWidenIntrinsicRecipe( + Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL}, + TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + + return nullptr; } /// Replace recipes with their EVL variants. 
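/// For example (illustrative), a widened load guarded only by the header mask
/// becomes an EVL-based load, with the mask folded into the explicit length:
///   WIDEN load %addr, %header-mask  ->  WIDEN vp.load %addr, %evl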
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan); - VPValue *AllOneMask = Plan.getTrue(); VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); @@ -2658,7 +2742,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, + {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); @@ -2692,7 +2776,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPUser *U : collectUsersRecursively(EVLMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL); if (!EVLRecipe) continue; @@ -2773,7 +2857,7 @@ void VPlanTransforms::addExplicitVectorLength( VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); - auto *CanIVTy = CanonicalIVPHI->getScalarType(); + auto *CanIVTy = LoopRegion->getCanonicalIVType(); VPValue *StartV = CanonicalIVPHI->getStartValue(); // Create the ExplicitVectorLengthPhi recipe in the main loop. @@ -2788,8 +2872,7 @@ void VPlanTransforms::addExplicitVectorLength( if (MaxSafeElements) { // Support for MaxSafeDist for correct loop emission. - VPValue *AVLSafe = - Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements)); + VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements); VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(), "safe_avl"); @@ -2902,9 +2985,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext); VPBuilder Builder(LatchExitingBr); - VPValue *Cmp = - Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, - Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy))); + VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, + Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); } @@ -2928,8 +3010,7 @@ void VPlanTransforms::replaceSymbolicStrides( // Only handle constant strides for now. continue; - auto *CI = - Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst)); + auto *CI = Plan.getConstantInt(*StrideConst); if (VPValue *StrideVPV = Plan.getLiveIn(StrideV)) StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); @@ -2944,7 +3025,7 @@ void VPlanTransforms::replaceSymbolicStrides( unsigned BW = U->getType()->getScalarSizeInBits(); APInt C = isa<SExtInst>(U) ? 
StrideConst->sext(BW) : StrideConst->zext(BW); - VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + VPValue *CI = Plan.getConstantInt(C); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } RewriteMap[StrideV] = PSE.getSCEV(StrideV); @@ -3123,8 +3204,7 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = - Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); + VPValue *OffsetVPV = Plan.getConstantInt(-Offset); VPBuilder B(InsertPos); Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW); } @@ -3390,6 +3470,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { ToRemove.push_back(Expr); } + // Expand LastActiveLane into Not + FirstActiveLane + Sub. + auto *LastActiveL = dyn_cast<VPInstruction>(&R); + if (LastActiveL && + LastActiveL->getOpcode() == VPInstruction::LastActiveLane) { + // Create Not(Mask) for all operands. + SmallVector<VPValue *, 2> NotMasks; + for (VPValue *Op : LastActiveL->operands()) { + VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc()); + NotMasks.push_back(NotMask); + } + + // Create FirstActiveLane on the inverted masks. + VPValue *FirstInactiveLane = Builder.createNaryOp( + VPInstruction::FirstActiveLane, NotMasks, + LastActiveL->getDebugLoc(), "first.inactive.lane"); + + // Subtract 1 to get the last active lane. + VPValue *One = Plan.getOrAddLiveIn( + ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1)); + VPValue *LastLane = Builder.createNaryOp( + Instruction::Sub, {FirstInactiveLane, One}, + LastActiveL->getDebugLoc(), "last.active.lane"); + + LastActiveL->replaceAllUsesWith(LastLane); + ToRemove.push_back(LastActiveL); + continue; + } + VPValue *VectorStep; VPValue *ScalarStep; if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>( @@ -3681,11 +3789,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { - auto *RecipeA = - dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); - auto *RecipeB = - dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); - auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe()); + auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(A); + auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(B); + auto *Mul = cast<VPWidenRecipe>(VecOp); // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); @@ -3710,10 +3816,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Match reduce.add(ext(mul(A, B))). 
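// e.g. (illustrative): reduce.add(zext(mul(zext(%a), zext(%b)))) can be
// abstracted into a single extended multiply-accumulate expression recipe.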
if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { - auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe()); - auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe()); - auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe()); + auto *Ext = cast<VPWidenCastRecipe>(VecOp); + auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)); + auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A); + auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B); // reduce.add(ext(mul(ext, const))) // -> reduce.add(ext(mul(ext, ext(const)))) @@ -3865,8 +3971,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan, VPBuilder Builder(VectorPH, VectorPH->begin()); auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); auto *TCMO = Builder.createNaryOp( - Instruction::Sub, - {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))}, + Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)}, DebugLoc::getCompilerGenerated(), "trip.count.minus.1"); BTC->replaceAllUsesWith(TCMO); } @@ -3991,9 +4096,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (TailByMasking) { TC = Builder.createNaryOp( Instruction::Add, - {TC, Builder.createNaryOp( - Instruction::Sub, - {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})}, + {TC, Builder.createNaryOp(Instruction::Sub, + {Step, Plan.getConstantInt(TCTy, 1)})}, DebugLoc::getCompilerGenerated(), "n.rnd.up"); } @@ -4015,8 +4119,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan, if (RequiresScalarEpilogue) { assert(!TailByMasking && "requiring scalar epilogue is not supported with tail folding"); - VPValue *IsZero = Builder.createICmp( - CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0))); + VPValue *IsZero = + Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0)); R = Builder.createSelect(IsZero, Step, R); } @@ -4054,7 +4158,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, } VF.replaceAllUsesWith(RuntimeVF); - VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF())); + VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF()); VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF}); VFxUF.replaceAllUsesWith(MulByUF); } @@ -4111,13 +4215,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { /// is defined at \p Idx of a load interleave group. static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx) { - auto *DefR = OpV->getDefiningRecipe(); - if (!DefR) - return WideMember0->getOperand(OpIdx) == OpV; - if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR)) - return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; - - if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR)) + VPValue *Member0Op = WideMember0->getOperand(OpIdx); + VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe(); + if (!Member0OpR) + return Member0Op == OpV; + if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR)) + return !W->getMask() && Member0Op == OpV; + if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR)) return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } @@ -4126,8 +4230,9 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, /// members both equal to \p VF. The interleave group must also access the full /// vector width \p VectorRegWidth.
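/// For example (illustrative), with VF = 4 and i32 members, a full group with
/// factor 4 and 4 members spans 4 x 32 = 128 bits, which must equal
/// \p VectorRegWidth.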
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, - unsigned VF, VPTypeAnalysis &TypeInfo, - unsigned VectorRegWidth) { + ElementCount VF, + VPTypeAnalysis &TypeInfo, + TypeSize VectorRegWidth) { if (!InterleaveR || InterleaveR->getMask()) return false; @@ -4149,9 +4254,11 @@ static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, return false; } - unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF; - auto IG = InterleaveR->getInterleaveGroup(); - return IG->getFactor() == VF && IG->getNumMembers() == VF && + unsigned VFMin = VF.getKnownMinValue(); + TypeSize GroupSize = TypeSize::get( + GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable()); + const auto *IG = InterleaveR->getInterleaveGroup(); + return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin && GroupSize == VectorRegWidth; } @@ -4163,18 +4270,70 @@ static bool isAlreadyNarrow(VPValue *VPV) { return RepR && RepR->isSingleScalar(); } +// Convert a wide recipe defining a VPValue \p V feeding an interleave group to +// a narrow variant. +static VPValue * +narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) { + auto *R = V->getDefiningRecipe(); + if (!R || NarrowedOps.contains(V)) + return V; + + if (isAlreadyNarrow(V)) + return V; + + if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) { + for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) + WideMember0->setOperand( + Idx, + narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps)); + return V; + } + + if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { + // Narrow interleave group to wide load, as transformed VPlan will only + // process one original iteration. + auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); + auto *L = new VPWidenLoadRecipe( + *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, + /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); + L->insertBefore(LoadGroup); + NarrowedOps.insert(L); + return L; + } + + if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { + assert(RepR->isSingleScalar() && + isa<LoadInst>(RepR->getUnderlyingInstr()) && + "must be a single scalar load"); + NarrowedOps.insert(RepR); + return RepR; + } + + auto *WideLoad = cast<VPWidenLoadRecipe>(R); + VPValue *PtrOp = WideLoad->getAddr(); + if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) + PtrOp = VecPtr->getOperand(0); + // Narrow wide load to uniform scalar load, as transformed VPlan will only + // process one original iteration. + auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, + /*IsUniform*/ true, + /*Mask*/ nullptr, *WideLoad); + N->insertBefore(WideLoad); + NarrowedOps.insert(N); + return N; +} + void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth) { + TypeSize VectorRegWidth) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0) return; VPTypeAnalysis TypeInfo(Plan); - unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) + if (isa<VPCanonicalIVPHIRecipe>(&R)) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && @@ -4206,7 +4365,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, continue; // Bail out on non-consecutive interleave groups. 
- if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo, + if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo, VectorRegWidth)) return; @@ -4240,12 +4399,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Check if all values feeding InterleaveR are matching wide recipes whose // operands can be narrowed. - auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( - InterleaveR->getStoredValues()[0]->getDefiningRecipe()); + auto *WideMember0 = + dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]); if (!WideMember0) return; for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { - auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); + auto *R = dyn_cast_or_null<VPWidenRecipe>(V); if (!R || R->getOpcode() != WideMember0->getOpcode() || R->getNumOperands() > 2) return; @@ -4264,65 +4423,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe. SmallPtrSet<VPValue *, 4> NarrowedOps; - auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * { - auto *R = V->getDefiningRecipe(); - if (!R || NarrowedOps.contains(V)) - return V; - if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) { - // Narrow interleave group to wide load, as transformed VPlan will only - // process one original iteration. - auto *LI = - cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()); - auto *L = new VPWidenLoadRecipe( - *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, - /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc()); - L->insertBefore(LoadGroup); - NarrowedOps.insert(L); - return L; - } - - if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) { - assert(RepR->isSingleScalar() && - isa<LoadInst>(RepR->getUnderlyingInstr()) && - "must be a single scalar load"); - NarrowedOps.insert(RepR); - return RepR; - } - auto *WideLoad = cast<VPWidenLoadRecipe>(R); - - VPValue *PtrOp = WideLoad->getAddr(); - if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp)) - PtrOp = VecPtr->getOperand(0); - // Narrow wide load to uniform scalar load, as transformed VPlan will only - // process one original iteration. - auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp}, - /*IsUniform*/ true, - /*Mask*/ nullptr, *WideLoad); - N->insertBefore(WideLoad); - NarrowedOps.insert(N); - return N; - }; - // Narrow operation tree rooted at store groups.
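// e.g. (illustrative): a factor-2 store group whose members are
//   mul(load-group member 0, %x) and mul(load-group member 1, %x)
// is rewritten to a single wide store fed by mul(wide-load, %x), so that
// each vector lane covers one original loop iteration.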
for (auto *StoreGroup : StoreGroups) { - VPValue *Res = nullptr; - VPValue *Member0 = StoreGroup->getStoredValues()[0]; - if (isAlreadyNarrow(Member0)) { - Res = Member0; - } else if (auto *WideMember0 = - dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) { - for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) - WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx))); - Res = WideMember0; - } else { - Res = NarrowOp(Member0); - } - + VPValue *Res = + narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps); auto *SI = cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()); auto *S = new VPWidenStoreRecipe( *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, - /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc()); + /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); S->insertBefore(StoreGroup); StoreGroup->eraseFromParent(); } @@ -4334,17 +4443,17 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, VPBuilder PHBuilder(Plan.getVectorPreheader()); VPValue *UF = Plan.getOrAddLiveIn( - ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF())); + ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF())); if (VF.isScalable()) { VPValue *VScale = PHBuilder.createElementCount( - CanIV->getScalarType(), ElementCount::getScalable(1)); + VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1)); VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF}); Inc->setOperand(1, VScaleUF); Plan.getVF().replaceAllUsesWith(VScale); } else { Inc->setOperand(1, UF); Plan.getVF().replaceAllUsesWith( - Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); + Plan.getConstantInt(CanIV->getScalarType(), 1)); } removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b28559b620e13..5e67d8fd2a0eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -32,6 +32,7 @@ class VPRecipeBuilder; struct VFRange; extern cl::opt<bool> VerifyEachVPlan; +extern cl::opt<bool> EnableWideActiveLaneMask; struct VPlanTransforms { /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra @@ -348,7 +349,7 @@ struct VPlanTransforms { /// form of loop-aware SLP, where we use interleave groups to identify /// candidates. static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, - unsigned VectorRegWidth); + TypeSize VectorRegWidth); /// Predicate and linearize the control-flow in the only loop region of /// \p Plan. 
If \p FoldTail is true, create a mask guarding the loop diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index cfd1a741ee841..221ca4ab05370 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -68,10 +68,9 @@ class UnrollState { void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); - VPValue *getConstantVPV(unsigned Part) { - Type *CanIVIntTy = - Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType(); - return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); + VPValue *getConstantInt(unsigned Part) { + Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); + return Plan.getConstantInt(CanIVIntTy, Part); } public: @@ -138,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) { - ScalarIVSteps->addOperand(getConstantVPV(Part)); + ScalarIVSteps->addOperand(getConstantInt(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); @@ -250,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, for (unsigned Part = 1; Part != UF; ++Part) VPV2Parts[VPI][Part - 1] = StartV; } - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); } else { assert(isa<VPActiveLaneMaskPHIRecipe>(R) && "unexpected header phi recipe not needing unrolled part"); @@ -319,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) || match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) - Copy->addOperand(getConstantVPV(Part)); + Copy->addOperand(getConstantInt(Part)); if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R)) Copy->setOperand(0, R.getOperand(0)); @@ -353,6 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { VPValue *Op1; if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) || match(&R, m_FirstActiveLane(m_VPValue(Op1))) || + match(&R, m_LastActiveLane(m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>( m_VPValue(), m_VPValue(), m_VPValue(Op1))) || match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>( @@ -365,17 +365,21 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } VPValue *Op0; - if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>( - m_VPValue(Op0), m_VPValue(Op1)))) { + if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) { addUniformForAllParts(cast<VPInstruction>(&R)); for (unsigned Part = 1; Part != UF; ++Part) R.addOperand(getValueForPart(Op1, Part)); continue; } if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) || - match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( - m_VPValue(Op0)))) { + match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) { addUniformForAllParts(cast<VPSingleDefRecipe>(&R)); + if (isa<VPFirstOrderRecurrencePHIRecipe>(Op0)) { + assert(match(&R, m_ExtractLastElement(m_VPValue())) && + "can only extract last element of FOR"); + continue; + } + if (Plan.hasScalarVFOnly()) { auto *I = cast<VPInstruction>(&R); // Extracting from end with VF = 1 implies retrieving the last or @@ -475,8 +479,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, if (LaneDefs != Def2LaneDefs.end()) return LaneDefs->second[Lane.getKnownLane()]; - VPValue *Idx = - 
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); } @@ -510,8 +513,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane())); continue; } - VPValue *Idx = - Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane())); + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane()); VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx}); NewOps.push_back(Ext); } @@ -585,7 +587,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { /// Users that only demand the first lane can use the definition for lane /// 0. DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) { - return U.onlyFirstLaneUsed(DefR); + return U.usesFirstLaneOnly(DefR); }); // Update each build vector user that currently has DefR as its only diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7def3ed..e22c5dfdb9f38 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); }); } bool vputils::onlyFirstPartUsed(const VPValue *Def) { return all_of(Def->users(), - [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); }); + [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); }); } bool vputils::onlyScalarValuesUsed(const VPValue *Def) { @@ -32,22 +32,17 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) { } VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { - VPValue *Expanded = nullptr; if (auto *E = dyn_cast<SCEVConstant>(Expr)) - Expanded = Plan.getOrAddLiveIn(E->getValue()); - else { - auto *U = dyn_cast<SCEVUnknown>(Expr); - // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction - // value. Otherwise the value may be defined in a loop and using it directly - // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA - // form. - if (U && !isa<Instruction>(U->getValue())) { - Expanded = Plan.getOrAddLiveIn(U->getValue()); - } else { - Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe()); - } - } + return Plan.getOrAddLiveIn(E->getValue()); + // Skip SCEV expansion if Expr is a SCEVUnknown wrapping a non-instruction + // value. Otherwise the value may be defined in a loop and using it directly + // will break LCSSA form. The SCEV expansion takes care of preserving LCSSA + // form. 
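// e.g. (illustrative): a SCEVUnknown wrapping a function argument or a global
// can be used as a live-in directly, while one wrapping an instruction may be
// defined inside a loop and must go through VPExpandSCEVRecipe.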
+ auto *U = dyn_cast<SCEVUnknown>(Expr); + if (U && !isa<Instruction>(U->getValue())) + return Plan.getOrAddLiveIn(U->getValue()); + auto *Expanded = new VPExpandSCEVRecipe(Expr); + Plan.getEntry()->appendRecipe(Expanded); return Expanded; } @@ -75,7 +70,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +82,53 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe()) .Case<VPExpandSCEVRecipe>( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) + return SE.getCouldNotCompute(); + + return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step) || + !Step->isOne()) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa<SCEVCouldNotCompute>(Base)) + return SE.getCouldNotCompute(); + + SmallVector<const SCEV *> IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa<SCEVCouldNotCompute>(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413da9079..c21a0e70c1392 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. 
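/// For example (illustrative), a VPCanonicalIVPHIRecipe in loop \p L yields
/// the add-recurrence {start,+,1}<L>.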
-const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used. diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 83e3fcaaeee2b..09fdf5a731816 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -150,6 +150,13 @@ class LLVM_ABI_FOR_TEST VPValue { bool hasOneUse() const { return getNumUsers() == 1; } + /// Return the single user of this value, or nullptr if there is not exactly + /// one user. + VPUser *getSingleUser() { return hasOneUse() ? *user_begin() : nullptr; } + const VPUser *getSingleUser() const { + return hasOneUse() ? *user_begin() : nullptr; + } + void replaceAllUsesWith(VPValue *New); /// Go through the uses list for this VPValue and make each use point to \p @@ -274,12 +281,12 @@ class VPUser { virtual bool usesScalars(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return onlyFirstLaneUsed(Op); + return usesFirstLaneOnly(Op); } /// Returns true if the VPUser only uses the first lane of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstLaneUsed(const VPValue *Op) const { + virtual bool usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; @@ -287,7 +294,7 @@ class VPUser { /// Returns true if the VPUser only uses the first part of operand \p Op. /// Conservatively returns false. - virtual bool onlyFirstPartUsed(const VPValue *Op) const { + virtual bool usesFirstPartOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..2d63d2a787f88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -18,6 +18,7 @@ #include "VPlanDominatorTree.h" #include "VPlanHelpers.h" #include "VPlanPatternMatch.h" +#include "VPlanUtils.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" @@ -44,6 +45,9 @@ class VPlanVerifier { /// incoming value into EVL's recipe. bool verifyEVLRecipe(const VPInstruction &EVL) const; + /// Verify that \p LastActiveLane's operands are guaranteed to be prefix + /// masks. + bool verifyLastActiveLaneRecipe(const VPInstruction &LastActiveLane) const; + bool verifyVPBasicBlock(const VPBasicBlock *VPBB); bool verifyBlock(const VPBlockBase *VPB); @@ -221,6 +225,44 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { }); } +bool VPlanVerifier::verifyLastActiveLaneRecipe( + const VPInstruction &LastActiveLane) const { + assert(LastActiveLane.getOpcode() == VPInstruction::LastActiveLane && + "must be called with VPInstruction::LastActiveLane"); + + if (LastActiveLane.getNumOperands() < 1) { + errs() << "LastActiveLane must have at least one operand\n"; + return false; + } + + const VPlan &Plan = *LastActiveLane.getParent()->getPlan(); + // All operands must be prefix masks. Currently we check for header masks or + // EVL-derived masks, as those are the only operands seen in practice, + // but this may need updating in the future.
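+  // e.g. (illustrative) accepted prefix masks include a header mask such as
+  //   icmp ule wide-canonical-iv, broadcast(backedge-taken-count)
+  // and an EVL-derived mask such as
+  //   icmp ult step-vector, broadcast(evl)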
+ for (VPValue *Op : LastActiveLane.operands()) { + if (vputils::isHeaderMask(Op, Plan)) + continue; + + // Masks derived from EVL are also fine. + auto BroadcastOrEVL = + m_CombineOr(m_Broadcast(m_EVL(m_VPValue())), m_EVL(m_VPValue())); + if (match(Op, m_CombineOr(m_ICmp(m_StepVector(), BroadcastOrEVL), + m_ICmp(BroadcastOrEVL, m_StepVector())))) + continue; + + errs() << "LastActiveLane operand "; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + VPSlotTracker Tracker(&Plan); + Op->printAsOperand(errs(), Tracker); +#endif + errs() << " must be prefix mask (a header mask or an " + "EVL-derived mask currently)\n"; + return false; + } + + return true; +} + bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; @@ -252,6 +294,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { @@ -306,6 +355,10 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } break; + case VPInstruction::LastActiveLane: + if (!verifyLastActiveLaneRecipe(*VPI)) + return false; + break; default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..f1890e4f5fb95 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -129,7 +129,9 @@ class VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldBinopOfReductions(Instruction &I); bool foldSingleElementStore(Instruction &I); - bool scalarizeLoadExtract(Instruction &I); + bool scalarizeLoad(Instruction &I); + bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr); + bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr); bool scalarizeExtExtract(Instruction &I); bool foldConcatOfBoolMasks(Instruction &I); bool foldPermuteOfBinops(Instruction &I); @@ -696,11 +698,11 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { /// shuffle. bool VectorCombine::foldInsExtFNeg(Instruction &I) { // Match an insert (op (extract)) pattern. - Value *DestVec; - uint64_t Index; + Value *DstVec; + uint64_t ExtIdx, InsIdx; Instruction *FNeg; - if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), - m_ConstantInt(Index)))) + if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)), + m_ConstantInt(InsIdx)))) return false; // Note: This handles the canonical fneg instruction and "fsub -0.0, X". 
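// e.g. (illustrative) with ExtIdx = InsIdx = 2 and <4 x float> vectors:
//   %e = extractelement <4 x float> %src, i64 2
//   %n = fneg float %e
//   %i = insertelement <4 x float> %dst, float %n, i64 2
// becomes
//   %fn = fneg <4 x float> %src
//   %i  = shufflevector <4 x float> %dst, <4 x float> %fn,
//                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>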
@@ -708,67 +710,74 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Instruction *Extract; if (!match(FNeg, m_FNeg(m_CombineAnd( m_Instruction(Extract), - m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) + m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)))))) return false; - auto *VecTy = cast<FixedVectorType>(I.getType()); - auto *ScalarTy = VecTy->getScalarType(); + auto *DstVecTy = cast<FixedVectorType>(DstVec->getType()); + auto *DstVecScalarTy = DstVecTy->getScalarType(); auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType()); - if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType()) + if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType()) return false; - // Ignore bogus insert/extract index. - unsigned NumElts = VecTy->getNumElements(); - if (Index >= NumElts) + // Ignore if insert/extract index is out of bounds or destination vector has + // one element + unsigned NumDstElts = DstVecTy->getNumElements(); + unsigned NumSrcElts = SrcVecTy->getNumElements(); + if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1) return false; // We are inserting the negated element into the same lane that we extracted // from. This is equivalent to a select-shuffle that chooses all but the // negated element from the destination vector. - SmallVector<int> Mask(NumElts); + SmallVector<int> Mask(NumDstElts); std::iota(Mask.begin(), Mask.end(), 0); - Mask[Index] = Index + NumElts; + Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts; InstructionCost OldCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + - TTI.getVectorInstrCost(I, VecTy, CostKind, Index); + TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) + + TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx); // If the extract has one use, it will be eliminated, so count it in the // original cost. If it has more than one use, ignore the cost because it will // be the same before/after. if (Extract->hasOneUse()) - OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); + OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx); InstructionCost NewCost = - TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy, - Mask, CostKind); + TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy, + DstVecTy, Mask, CostKind); - bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; + bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts; // If the lengths of the two vectors are not equal, // we need to add a length-change vector. Add this cost. 
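// e.g. (illustrative): negating a lane of a <2 x float> source for insertion
// into a <4 x float> destination first pads the negated source to 4 lanes via
// a poison-filled single-source shuffle before the final two-source shuffle.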
SmallVector<int> SrcMask; if (NeedLenChg) { - SrcMask.assign(NumElts, PoisonMaskElem); - SrcMask[Index] = Index; + SrcMask.assign(NumDstElts, PoisonMaskElem); + SrcMask[ExtIdx % NumDstElts] = ExtIdx; NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - VecTy, SrcVecTy, SrcMask, CostKind); + DstVecTy, SrcVecTy, SrcMask, CostKind); } + LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); if (NewCost > OldCost) return false; - Value *NewShuf; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index + Value *NewShuf, *LenChgShuf = nullptr; + // insertelt DstVec, (fneg (extractelt SrcVec, ExtIdx)), InsIdx Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); if (NeedLenChg) { - // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); - NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); + // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask + LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask); + NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask); + Worklist.pushValue(LenChgShuf); } else { - // shuffle DestVec, (fneg SrcVec), Mask - NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + // shuffle DstVec, (fneg SrcVec), Mask + NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask); } + Worklist.pushValue(VecFNeg); replaceValue(I, *NewShuf); return true; } @@ -1845,11 +1854,9 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { return false; } -/// Try to scalarize vector loads feeding extractelement instructions. -bool VectorCombine::scalarizeLoadExtract(Instruction &I) { - if (!TTI.allowVectorElementIndexingUsingGEP()) - return false; - +/// Try to scalarize vector loads feeding extractelement or bitcast +/// instructions. +bool VectorCombine::scalarizeLoad(Instruction &I) { Value *Ptr; if (!match(&I, m_Load(m_Value(Ptr)))) return false; @@ -1859,35 +1866,30 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; - InstructionCost OriginalCost = - TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), - LI->getPointerAddressSpace(), CostKind); - InstructionCost ScalarizedCost = 0; - + bool AllExtracts = true; + bool AllBitcasts = true; Instruction *LastCheckedInst = LI; unsigned NumInstChecked = 0; - DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; - auto FailureGuard = make_scope_exit([&]() { - // If the transform is aborted, discard the ScalarizationResults. - for (auto &Pair : NeedFreeze) - Pair.second.discard(); - }); - // Check if all users of the load are extracts with no memory modifications - // between the load and the extract. Compute the cost of both the original - // code and the scalarized version. + // Check what type of users we have (they must all be either extracts or + // bitcasts) and ensure no memory modifications between the load and + // its users. for (User *U : LI->users()) { - auto *UI = dyn_cast<ExtractElementInst>(U); + auto *UI = dyn_cast<Instruction>(U); if (!UI || UI->getParent() != LI->getParent()) return false; - // If any extract is waiting to be erased, then bail out as this will + // If any user is waiting to be erased, then bail out as this will // distort the cost calculation and possibly lead to infinite loops.
if (UI->use_empty()) return false; - // Check if any instruction between the load and the extract may modify - // memory. + if (!isa<ExtractElementInst>(UI)) + AllExtracts = false; + if (!isa<BitCastInst>(UI)) + AllBitcasts = false; + + // Check if any instruction between the load and the user may modify memory. if (LastCheckedInst->comesBefore(UI)) { for (Instruction &I : make_range(std::next(LI->getIterator()), UI->getIterator())) { @@ -1899,6 +1901,35 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { } LastCheckedInst = UI; } + } + + if (AllExtracts) + return scalarizeLoadExtract(LI, VecTy, Ptr); + if (AllBitcasts) + return scalarizeLoadBitcast(LI, VecTy, Ptr); + return false; +} + +/// Try to scalarize vector loads feeding extractelement instructions. +bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, + Value *Ptr) { + if (!TTI.allowVectorElementIndexingUsingGEP()) + return false; + + DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; + auto FailureGuard = make_scope_exit([&]() { + // If the transform is aborted, discard the ScalarizationResults. + for (auto &Pair : NeedFreeze) + Pair.second.discard(); + }); + + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + InstructionCost ScalarizedCost = 0; + + for (User *U : LI->users()) { + auto *UI = cast<ExtractElementInst>(U); auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT); @@ -1920,7 +1951,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { nullptr, nullptr, CostKind); } - LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I + LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI << "\n LoadExtractCost: " << OriginalCost << " vs ScalarizedCost: " << ScalarizedCost << "\n"); @@ -1966,6 +1997,72 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to scalarize vector loads feeding bitcast instructions. +bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, + Value *Ptr) { + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + + Type *TargetScalarType = nullptr; + unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy); + + for (User *U : LI->users()) { + auto *BC = cast<BitCastInst>(U); + + Type *DestTy = BC->getDestTy(); + if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy()) + return false; + + unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy); + if (DestBitWidth != VecBitWidth) + return false; + + // All bitcasts must target the same scalar type. 
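+    // e.g. (illustrative):
+    //   %v = load <4 x i32>, ptr %p
+    //   %a = bitcast <4 x i32> %v to i128
+    //   %b = bitcast <4 x i32> %v to i128
+    // can be rewritten as a single scalar load:
+    //   %s = load i128, ptr %p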
+ if (!TargetScalarType) + TargetScalarType = DestTy; + else if (TargetScalarType != DestTy) + return false; + + OriginalCost += + TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy, + TTI.getCastContextHint(BC), CostKind, BC); + } + + if (!TargetScalarType) + return false; + + assert(!LI->user_empty() && "Unexpected load without bitcast users"); + InstructionCost ScalarizedCost = + TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(), + LI->getPointerAddressSpace(), CostKind); + + LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI + << "\n OriginalCost: " << OriginalCost + << " vs ScalarizedCost: " << ScalarizedCost << "\n"); + + if (ScalarizedCost >= OriginalCost) + return false; + + // Ensure we add the load back to the worklist BEFORE its users so they + // can be erased in the correct order. + Worklist.push(LI); + + Builder.SetInsertPoint(LI); + auto *ScalarLoad = + Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar"); + ScalarLoad->setAlignment(LI->getAlign()); + ScalarLoad->copyMetadata(*LI); + + // Replace all bitcast users with the scalar load. + for (User *U : LI->users()) { + auto *BC = cast<BitCastInst>(U); + replaceValue(*BC, *ScalarLoad, false); + } + + return true; +} + bool VectorCombine::scalarizeExtExtract(Instruction &I) { if (!TTI.allowVectorElementIndexingUsingGEP()) return false; @@ -2017,8 +2114,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze.
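+    // (Illustrative example: if every lane of a <2 x i16> source is
+    // extracted in this block and each extract feeds an operation for which
+    // poison is immediate UB, the unfrozen program is already UB whenever
+    // poison occurs, so skipping the freeze is safe.)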
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); @@ -4555,7 +4675,7 @@ bool VectorCombine::run() { if (IsVectorType) { if (scalarizeOpOrCmp(I)) return true; - if (scalarizeLoadExtract(I)) + if (scalarizeLoad(I)) return true; if (scalarizeExtExtract(I)) return true; diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: 
%sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found 
costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: 
Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 
for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext 
<8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found 
costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 
for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x 
i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> 
%zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 
CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found 
costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost 
Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> diff --git a/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll 
b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll new file mode 100644 index 0000000000000..9efcf912076b0 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/extract-last-active.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=NEON +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=SVE +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme -force-streaming | FileCheck %s --check-prefix=SME-STREAMING + +define void @extractions() { +; NEON-LABEL: 'extractions' +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat 
poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> 
poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SVE-LABEL: 'extractions' +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 
@llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SVE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half 
@llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SME-STREAMING-LABEL: 'extractions' +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> 
poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call float 
@llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) +; SME-STREAMING-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ;; Legal types + %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison) + %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison) + %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison) + %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison) + %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison) + %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison) + %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison) + %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison) + %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison) + %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison) + %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 
x i32> poison, <vscale x 4 x i1> poison, i32 poison) + %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison) + %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison) + %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison) + %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison) + %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison) + + ;; Wider-than-legal + %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison) + %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison) + %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison) + %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison) + %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison) + %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison) + %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison) + %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison) + %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison) + %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison) + %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison) + %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison) + %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison) + %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison) + %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison) + %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison) + + ;; Narrower-than-legal + %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison) + %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison) + %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison) + %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison) + %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> 
poison, <4 x i1> poison, half poison) + %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison) + %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison) + %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison) + %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison) + %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison) + %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison) + %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison) + %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison) + %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison) + %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison) + %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison) + + ret void +} diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll index fa53a184e317b..1920fc9b4a640 100644 --- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll +++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll @@ -1,17 +1,6 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s 
-D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 
= call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: 
%nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll index 1c40354892191..d9e26dc47b53f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll @@ -1,218 +1,346 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s +; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BASE +; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve-b16b16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BF16 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fadd() { ; CHECK-LABEL: 'fadd' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 
RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fadd <vscale x 4 x half> undef, undef - %V8F16 = fadd <vscale x 8 x half> undef, undef - %V16F16 = fadd <vscale x 16 x half> undef, undef + %V4F16 = fadd <vscale x 4 x half> poison, poison + %V8F16 = fadd <vscale x 8 x half> poison, poison + %V16F16 = fadd <vscale x 16 x half> poison, poison - %V1F32 = fadd <vscale x 1 x float> undef, undef - %V2F32 = fadd <vscale x 2 x float> undef, undef - %V4F32 = fadd <vscale x 4 x float> undef, undef - %V8F32 = fadd <vscale x 8 x float> undef, undef + %V1F32 = fadd <vscale x 1 x float> poison, poison + %V2F32 = fadd <vscale x 2 x float> poison, poison + %V4F32 = fadd <vscale x 4 x float> poison, poison + %V8F32 = fadd <vscale x 8 x float> poison, poison - %V2F64 = fadd <vscale x 2 x double> undef, undef - %V4F64 = fadd <vscale x 4 x double> undef, undef + %V2F64 = fadd <vscale x 2 x double> poison, poison + %V4F64 = fadd <vscale x 4 x double> poison, poison ret void } +define void @fadd_bf16() { +; CHECK-LABEL: 'fadd_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison + + ret void +} + + define void @fsub() { ; CHECK-LABEL: 'fsub' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale 
x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fsub <vscale x 4 x half> undef, undef - %V8F16 = fsub <vscale x 8 x half> undef, undef - %V16F16 = fsub <vscale x 16 x half> undef, undef + %V4F16 = fsub <vscale x 4 x half> poison, poison + %V8F16 = fsub <vscale x 8 x half> poison, poison + %V16F16 = fsub <vscale x 16 x half> poison, poison - %V1F32 = fsub <vscale x 1 x float> undef, undef - %V2F32 = fsub <vscale x 2 x float> undef, undef - %V4F32 = fsub <vscale x 4 x float> undef, undef - %V8F32 = fsub <vscale x 8 x float> undef, undef + %V1F32 = fsub <vscale x 1 x float> poison, poison + %V2F32 = fsub <vscale x 2 x float> poison, poison + %V4F32 = fsub <vscale x 4 x float> poison, poison + %V8F32 = fsub <vscale x 8 x float> poison, poison - %V2F64 = fsub <vscale x 2 x double> undef, undef - %V4F64 = fsub <vscale x 4 x double> undef, undef + %V2F64 = fsub <vscale x 2 x double> poison, poison + %V4F64 = fsub <vscale x 4 x double> poison, poison + + ret void +} + +define void @fsub_bf16() { +; CHECK-LABEL: 'fsub_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison ret void } define void @fneg() { ; CHECK-LABEL: 'fneg' -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: 
%V2F64 = fneg <vscale x 2 x double> undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fneg <vscale x 2 x double> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V2F16 = fneg <vscale x 2 x half> undef - %V4F16 = fneg <vscale x 4 x half> undef - %V8F16 = fneg <vscale x 8 x half> undef - %V16F16 = fneg <vscale x 16 x half> undef + %V2F16 = fneg <vscale x 2 x half> poison + %V4F16 = fneg <vscale x 4 x half> poison + %V8F16 = fneg <vscale x 8 x half> poison + %V16F16 = fneg <vscale x 16 x half> poison - %V2F32 = fneg <vscale x 2 x float> undef - %V4F32 = fneg <vscale x 4 x float> undef - %V8F32 = fneg <vscale x 8 x float> undef + %V2F32 = fneg <vscale x 2 x float> poison + %V4F32 = fneg <vscale x 4 x float> poison + %V8F32 = fneg <vscale x 8 x float> poison - %V2F64 = fneg <vscale x 2 x double> undef - %V4F64 = fneg <vscale x 4 x double> undef + %V2F64 = fneg <vscale x 2 x double> poison + %V4F64 = fneg <vscale x 4 x double> poison + + ret void +} + +define void @fneg_bf16() { +; CHECK-LABEL: 'fneg_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV2BF16 = fneg <vscale x 2 x bfloat> poison + %NXV4BF16 = fneg <vscale x 4 x bfloat> poison + %NXV8BF16 = fneg <vscale x 8 x bfloat> poison + %NXV16BF16 = fneg <vscale x 16 x bfloat> poison ret void } define void @fmul() { ; CHECK-LABEL: 'fmul' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 
for: %V2F32 = fmul <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fmul <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fmul <vscale x 4 x half> undef, undef - %V8F16 = fmul <vscale x 8 x half> undef, undef - %V16F16 = fmul <vscale x 16 x half> undef, undef + %V4F16 = fmul <vscale x 4 x half> poison, poison + %V8F16 = fmul <vscale x 8 x half> poison, poison + %V16F16 = fmul <vscale x 16 x half> poison, poison - %V2F32 = fmul <vscale x 2 x float> undef, undef - %V4F32 = fmul <vscale x 4 x float> undef, undef - %V8F32 = fmul <vscale x 8 x float> undef, undef + %V2F32 = fmul <vscale x 2 x float> poison, poison + %V4F32 = fmul <vscale x 4 x float> poison, poison + %V8F32 = fmul <vscale x 8 x float> poison, poison - %V2F64 = fmul <vscale x 2 x double> undef, undef - %V4F64 = fmul <vscale x 4 x double> undef, undef + %V2F64 = fmul <vscale x 2 x double> poison, poison + %V4F64 = fmul <vscale x 4 x double> poison, poison + + ret void +} + +define void @fmul_bf16() { +; CHECK-LABEL: 'fmul_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison ret void } define void @fdiv() { ; CHECK-LABEL: 'fdiv' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 
for: %V8F16 = fdiv <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = fdiv <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = fdiv <vscale x 4 x half> undef, undef - %V8F16 = fdiv <vscale x 8 x half> undef, undef - %V16F16 = fdiv <vscale x 16 x half> undef, undef + %V4F16 = fdiv <vscale x 4 x half> poison, poison + %V8F16 = fdiv <vscale x 8 x half> poison, poison + %V16F16 = fdiv <vscale x 16 x half> poison, poison - %V2F32 = fdiv <vscale x 2 x float> undef, undef - %V4F32 = fdiv <vscale x 4 x float> undef, undef - %V8F32 = fdiv <vscale x 8 x float> undef, undef + %V2F32 = fdiv <vscale x 2 x float> poison, poison + %V4F32 = fdiv <vscale x 4 x float> poison, poison + %V8F32 = fdiv <vscale x 8 x float> poison, poison - %V2F64 = fdiv <vscale x 2 x double> undef, undef - %V4F64 = fdiv <vscale x 4 x double> undef, undef + %V2F64 = fdiv <vscale x 2 x double> poison, poison + %V4F64 = fdiv <vscale x 4 x double> poison, poison + + ret void +} + +define void @fdiv_bf16() { +; CHECK-LABEL: 'fdiv_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison ret void } define void @frem() { ; CHECK-LABEL: 'frem' -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> poison, poison ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = frem <vscale x 4 x half> undef, undef - %V8F16 = frem <vscale x 8 x half> undef, undef - %V16F16 = frem <vscale x 16 x half> undef, undef + %V4F16 = frem <vscale x 4 x half> poison, poison + %V8F16 = frem <vscale x 8 x half> poison, poison + %V16F16 = frem <vscale x 16 x half> poison, poison - %V2F32 = frem <vscale x 2 x float> undef, undef - %V4F32 = frem <vscale x 4 x float> undef, undef - %V8F32 = frem <vscale x 8 x float> undef, undef + %V2F32 = frem <vscale x 2 x float> poison, poison + %V4F32 = frem <vscale x 4 x float> poison, poison + %V8F32 = frem <vscale x 8 x float> poison, poison - %V2F64 = frem <vscale x 2 x double> undef, undef - %V4F64 = frem <vscale x 4 x double> undef, undef + %V2F64 = frem <vscale x 2 x double> poison, poison + %V4F64 = frem <vscale x 4 x double> poison, poison + + ret void +} + +define void @frem_bf16() { +; CHECK-LABEL: 'frem_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret 
void +; + %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison + %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison + %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison ret void } define void @fma() { ; CHECK-LABEL: 'fma' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) +; CHECK-NEXT: 
Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) - %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) - %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) + %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) + %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) + %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) - %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) - %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) - %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) + %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) + %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) + %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) - %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) - %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) + %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) + %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) + + ret void +} + +define void @fma_bf16() { +; CHECK-BASE-LABEL: 'fma_bf16' +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fma_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; 
CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) + %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) + %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) ret void } define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 
SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; - %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef) - %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef) - %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef) + %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison) + %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison) + %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison) - %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef) - %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef) - %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef) + %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison) + %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison) + %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison) - %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef) - %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef) + %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison) + %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison) + + ret void +} + +define void @fmuladd_bf16() { +; CHECK-BASE-LABEL: 'fmuladd_bf16' +; CHECK-BASE-NEXT: 
Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fmuladd_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison) + %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison) + %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison) ret void } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll index df40a962d0def..e128987c5ab8d 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll @@ -1,19 +1,8 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s -D#VBITS=128 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 
-disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024 -; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048 ; VBITS represents the useful bit size of a vector register from the code diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index e64bce2d9c9e5..6dacd59f07fde 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -6239,3 +6239,13 @@ define void @legalization_crash() { fptoui <192 x float> undef to <192 x i1> ret void } + +; Test that types that need to be split go through BasicTTIImpl. +define void @BitInt_crash() { +; ZVE64X-LABEL: 'BitInt_crash' +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 2043 for instruction: %1 = bitcast <16 x i64> poison to <512 x i2> +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + bitcast <16 x i64> poison to <512 x i2> + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll index 4346507ba8f90..181a4494b036e 100644 --- a/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll +++ b/llvm/test/Analysis/DependenceAnalysis/SimpleSIVNoValidityCheck.ll @@ -210,7 +210,7 @@ define void @t3(i64 %n, i64 %m, i64 %lb, ptr %a) { ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: %2 = load i32, ptr %arrayidx6, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx6, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 -; CHECK-NEXT: da analyze - consistent anti [1 -2]! +; CHECK-NEXT: da analyze - anti [1 *]! ; CHECK-NEXT: Src: store i32 %2, ptr %arrayidx8, align 4 --> Dst: store i32 %2, ptr %arrayidx8, align 4 ; CHECK-NEXT: da analyze - none! 
; diff --git a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll index 44bd9b7727910..71b93826ac260 100644 --- a/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll +++ b/llvm/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL +; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa -da-enable-dependence-test=strong-siv 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STRONG-SIV target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.6.0" @@ -423,19 +425,33 @@ for.end: ; preds = %for.body ;; *B++ = A[i + 2*n]; define void @strong9(ptr %A, ptr %B, i64 %n) nounwind uwtable ssp { -; CHECK-LABEL: 'strong9' -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - confused! -; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-ALL-LABEL: 'strong9' +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - confused! +; CHECK-ALL-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong9' +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %conv, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - flow [*|<]! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %conv, ptr %arrayidx, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! 
+; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx2, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: %0 = load i32, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - confused! +; CHECK-STRONG-SIV-NEXT: Src: store i32 %0, ptr %B.addr.02, align 4 --> Dst: store i32 %0, ptr %B.addr.02, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - none! ; entry: %cmp1 = icmp eq i64 %n, 0 @@ -512,3 +528,45 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %for.body ret void } + + +;; for (long unsigned i = 0; i < 9223372036854775806; i++) +;; for (long unsigned j = 0; j < 2147483640; j++) +;; if (i < 3000000000) +;; A[i] = 0; +; +; FIXME: DependenceAnalysis fails to detect the dependency between A[i] and +; itself, and the issue is not caused by the Strong SIV. +define void @strong11(ptr %A) nounwind uwtable ssp { +; CHECK-ALL-LABEL: 'strong11' +; CHECK-ALL-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strong11' +; CHECK-STRONG-SIV-NEXT: Src: store i32 0, ptr %arrayidx, align 4 --> Dst: store i32 0, ptr %arrayidx, align 4 +; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [0 S]! +; +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 + %i.017 = phi i64 [ 0, %entry ], [ %inc8, %for.cond.cleanup3 ] + %cmp5 = icmp samesign ult i64 %i.017, 3000000000 + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %i.017 + br i1 %cmp5, label %for.body4.us, label %for.cond.cleanup3 + +for.body4.us: ; preds = %for.cond1.preheader, %for.body4.us + %j.016.us = phi i64 [ %inc.us, %for.body4.us ], [ 0, %for.cond1.preheader ] + store i32 0, ptr %arrayidx, align 4 + %inc.us = add nuw nsw i64 %j.016.us, 1 + %exitcond.not = icmp eq i64 %inc.us, 2147483640 + br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4.us + +for.cond.cleanup: ; preds = %for.cond.cleanup3 + ret void + +for.cond.cleanup3: ; preds = %for.body4.us, %for.cond1.preheader + %inc8 = add nuw nsw i64 %i.017, 1 + %exitcond19.not = icmp eq i64 %inc8, 9223372036854775806 + br i1 %exitcond19.not, label %for.cond.cleanup, label %for.cond1.preheader +} diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll index 7411dc9f5c053..df42c757a3b63 100644 --- a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll +++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll @@ -298,7 +298,8 @@ exit: } ; The value of step reccurence is not invariant with respect to the outer most -; loop (the i-loop). +; loop (the i-loop). It is theoretically multivariate monotonic by definition, +; but we cannot handle non-affine addrec for now. ; ; offset_i = 0; ; for (int i = 0; i < 100; i++) { @@ -312,7 +313,8 @@ define void @step_is_variant(ptr %a) { ; CHECK-NEXT: Monotonicity check: ; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 ; CHECK-NEXT: Expr: {%offset.i,+,1}<nuw><nsw><%loop.j> -; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: %offset.i ; CHECK-EMPTY: ; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 ; CHECK-NEXT: da analyze - confused! 
@@ -346,6 +348,56 @@ exit: ret void } +; The value of the step recurrence is not invariant with respect to the outermost +; loop (the i-loop). Actually, `offset_i` is not monotonic. +; +; offset_i = 0; +; for (int i = 0; i < 100; i++) { +; for (int j = 0; j < 100; j++) +; a[offset_i + j] = 0; +; offset_i += (i % 2 == 0) ? -1 : 3; +; } +; +define void @step_is_variant2(ptr %a) { +; CHECK-LABEL: 'step_is_variant2' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%offset.i,+,1}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: %offset.i +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused! +; +entry: + br label %loop.i.header + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ] + %step.i.0 = phi i64 [ -1, %entry ], [ %step.i.1, %loop.i.latch ] + %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ] + br label %loop.j + +loop.j: + %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ] + %offset = add nsw i64 %offset.i, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, 100 + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %offset.i.next = add nsw i64 %offset.i, %step.i.0 + %exitcond.i = icmp eq i64 %i.inc, 100 + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + ; The AddRec doesn't have nsw flag for the j-loop, since the store may not be ; executed. ; diff --git a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll index 66880b5a553ec..f7f869ddbbe82 100644 --- a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll +++ b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll @@ -1,12 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 | FileCheck %s define void @f1() { ; CHECK-LABEL: 'f1' -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent output [S]! -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent flow [|<]! -; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent input [S]! ; entry: @@ -34,11 +35,11 @@ exit: ; preds = %for.2.body define void @f2() { ; CHECK-LABEL: 'f2' -; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent output [S]!
-; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 -; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]! -; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]! +; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 ; CHECK-NEXT: da analyze - consistent input [S]! ; entry: diff --git a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll index bf0fafcbfd6c9..6fd71ac8fe414 100644 --- a/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll +++ b/llvm/test/Analysis/DependenceAnalysis/strong-siv-overflow.ll @@ -12,19 +12,24 @@ ; A[2*i - 4] = 2; ; } ; -; FIXME: DependenceAnalysis currently detects no dependency between the two -; stores, but it does exist. For example, each store will access A[0] when i -; is 1 and 2 respectively. -; The root cause is that the product of the BTC and the coefficient -; ((1LL << 62) - 1 and 2) overflows in a signed sense. +; FIXME: DependenceAnalysis fails to detect the dependency between the two +; stores, and the issue is not caused by the Strong SIV. define void @strongsiv_const_ovfl(ptr %A) { -; CHECK-LABEL: 'strongsiv_const_ovfl' -; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-NEXT: da analyze - none! -; CHECK-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 -; CHECK-NEXT: da analyze - none! +; CHECK-ALL-LABEL: 'strongsiv_const_ovfl' +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; CHECK-ALL-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-ALL-NEXT: da analyze - none! +; +; CHECK-STRONG-SIV-LABEL: 'strongsiv_const_ovfl' +; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.0, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - none! +; CHECK-STRONG-SIV-NEXT: Src: store i8 1, ptr %gep.0, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - consistent output [1]! +; CHECK-STRONG-SIV-NEXT: Src: store i8 2, ptr %gep.1, align 1 --> Dst: store i8 2, ptr %gep.1, align 1 +; CHECK-STRONG-SIV-NEXT: da analyze - none! ; entry: br label %loop.header @@ -64,5 +69,4 @@ exit: ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-ALL: {{.*}} -; CHECK-STRONG-SIV: {{.*}} +; CHECK: {{.*}} diff --git a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll index c5ff9884a0c62..75be96380f078 100644 --- a/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll +++ b/llvm/test/Analysis/DependenceAnalysis/symbolic-rdiv-overflow.ll @@ -13,7 +13,7 @@ ; FIXME: DependenceAnalysis currently detects no dependency between the two ; stores, but it does exist. 
For example, each store will access A[0] when i ; is 1 and 0 respectively. -; The root cause is that the product of the BTC and the coefficient +; The root cause is that the product of the BTC and the coefficient ; ((1LL << 62) - 1 and 2) overflows in a signed sense. define void @symbolicrdiv_prod_ovfl(ptr %A) { ; CHECK-ALL-LABEL: 'symbolicrdiv_prod_ovfl' @@ -75,10 +75,10 @@ exit: ; FIXME: DependenceAnalysis currently detects no dependency between the two ; stores, but it does exist. For example, ; -; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60 +; memory access | i == 2^61 | i == 2^61 + 2^59 | i == 2^61 + 2^60 ; -------------------------|-----------|------------------|------------------- -; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61] -; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60] +; A[2*i - 2^62] (offset0) | | A[2^60] | A[2^61] +; A[-i + 2^62] (offset1) | A[2^61] | | A[2^60] ; ; The root cause is that the calculation of the differenct between the two ; constants (-2^62 and 2^62) overflows in a signed sense. diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 362586af4f9b7..4fc506f1f5edf 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -87,6 +87,11 @@ declare void @llvm.nvvm.barrier(i32, i32) declare void @llvm.nvvm.barrier.sync(i32) declare void @llvm.nvvm.barrier.sync.cnt(i32, i32) +declare float @llvm.nvvm.ex2.approx.f(float) +declare double @llvm.nvvm.ex2.approx.d(double) +declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -355,3 +360,15 @@ define void @cta_barriers(i32 %x, i32 %y) { call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y) ret void } + +define void @nvvm_ex2_approx(float %a, double %b, half %c, <2 x half> %d) { +; CHECK: call float @llvm.nvvm.ex2.approx.f32(float %a) +; CHECK: call double @llvm.nvvm.ex2.approx.f64(double %b) +; CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %d) +; CHECK: call float @llvm.nvvm.ex2.approx.ftz.f32(float %a) + %r1 = call float @llvm.nvvm.ex2.approx.f(float %a) + %r2 = call double @llvm.nvvm.ex2.approx.d(double %b) + %r3 = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %d) + %r4 = call float @llvm.nvvm.ex2.approx.ftz.f(float %a) + ret void +} diff --git a/llvm/test/Assembler/metadata-annotations.ll b/llvm/test/Assembler/metadata-annotations.ll index 4fd471338cd0a..2a08a17849dbd 100644 --- a/llvm/test/Assembler/metadata-annotations.ll +++ b/llvm/test/Assembler/metadata-annotations.ll @@ -1,9 +1,23 @@ ; RUN: llvm-as < %s | llvm-dis --materialize-metadata --show-annotations | FileCheck %s +; CHECK: @global_var = global i32 1 +; CHECK: @alias = alias i32, ptr @global_var +; CHECK: @ifunc = ifunc i32 (), ptr @ifunc_resolver +@global_var = global i32 1 +@alias = alias i32, ptr @global_var +@ifunc = ifunc i32 (), ptr @ifunc_resolver + +; CHECK: ; Materializable +; CHECK-NEXT: define ptr @ifunc_resolver() {} +define ptr @ifunc_resolver() { + ret ptr @defined_function +} + ; CHECK: ; Materializable -; CHECK-NEXT: define dso_local i32 @test() {} -define dso_local i32 @test() { -entry: - ret i32 0 +; CHECK-NEXT: define void @defined_function() {} +define void @defined_function() { + ret void } +; CHECK: declare void @declared_function() +declare void @declared_function() diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 
+521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index f01422e3b0990..e547c3429058b 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -30,6 +30,7 @@ llvm_canonicalize_cmake_booleans( LLVM_INCLUDE_SPIRV_TOOLS_TESTS LLVM_APPEND_VC_REV LLVM_HAS_LOGF128 + LLVM_ENABLE_ONDISK_CAS ) configure_lit_site_cfg( @@ -81,6 +82,7 @@ set(LLVM_TEST_DEPENDS llvm-bcanalyzer llvm-bitcode-strip llvm-c-test + llvm-cas llvm-cat llvm-cfi-verify llvm-cgdata diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir index 6b84a8488e478..1950e602ec83a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir @@ -1440,3 +1440,50 @@ body: | %freeze:_(<4 x s32>) = G_FREEZE %extract $q0 = COPY %freeze(<4 x s32>) RET_ReallyLR implicit $x0 +... +--- +name: ubfx_does_not_generate_poison +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: ubfx_does_not_generate_poison + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[FREEZE]], %c1(s64), %c1 + ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %c1:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_UBFX %0, %c1, %c1 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... +--- +name: sbfx_does_not_generate_poison +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: sbfx_does_not_generate_poison + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: %c1:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]] + ; CHECK-NEXT: [[SBFX:%[0-9]+]]:_(s64) = G_SBFX [[FREEZE]], %c1(s64), %c1 + ; CHECK-NEXT: $x0 = COPY [[SBFX]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %c1:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_SBFX %0, %c1, %c1 + %2:_(s64) = G_FREEZE %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir index 3f2bb1eed572b..94ea12d3c66d9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-buildvector.mir @@ -22,7 +22,7 @@ body: | ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 ; CHECK-NEXT: %2:_ KnownBits:0000?01? 
SignBits:4 ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 - ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:00001010 SignBits:4 %0:_(s8) = G_CONSTANT i8 3 %1:_(s8) = G_CONSTANT i8 10 %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir new file mode 100644 index 0000000000000..ab576dfccc40c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-extract-vector.mir @@ -0,0 +1,133 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s + +--- +name: all_knownbits_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @all_knownbits_const_idx + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %4:_ KnownBits:00001010 SignBits:4 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = G_CONSTANT i64 1 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %idx +... +--- +name: all_knownbits +body: | + bb.0: + ; CHECK-LABEL: name: @all_knownbits + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:0000?01? SignBits:4 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = COPY $d0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %idx +... +--- +name: no_knownbits_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @no_knownbits_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s8) = G_EXTRACT_VECTOR_ELT %0, %idx +... +--- +name: no_knownbits +body: | + bb.0: + ; CHECK-LABEL: name: @no_knownbits + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %idx:_(s64) = COPY $d1 + %1:_(s8) = G_EXTRACT_VECTOR_ELT %0, %idx +... +--- +name: zext_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @zext_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %zext0:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + %0:_(<2 x s8>) = COPY $h0 + %zext0:_(<2 x s16>) = G_ZEXT %0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %zext0, %idx +... +--- +name: zext +body: | + bb.0: + + ; CHECK-LABEL: name: @zext + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %zext0:_ KnownBits:00000000???????? 
SignBits:8 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + %0:_(<2 x s8>) = COPY $h0 + %zext0:_(<2 x s16>) = G_ZEXT %0 + %idx:_(s64) = COPY $d1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %zext0, %idx +... +--- +name: sext_const_idx +body: | + bb.0: + ; CHECK-LABEL: name: @sext_const_idx + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %sext0:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %sext0:_(<2 x s16>) = G_SEXT %0 + %idx:_(s64) = G_CONSTANT i64 1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %sext0, %idx +... +--- +name: sext +body: | + bb.0: + ; CHECK-LABEL: name: @sext + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %sext0:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %idx:_ KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<2 x s8>) = COPY $h0 + %sext0:_(<2 x s16>) = G_SEXT %0 + %idx:_(s64) = COPY $d1 + %1:_(s16) = G_EXTRACT_VECTOR_ELT %sext0, %idx +... +--- +# Verifies known bit computation bails if return type differs from vector +# element type. Without bailing, the 8 lowest bits of %4 would be known. +name: bail_on_different_return_type +body: | + bb.0: + ; CHECK-LABEL: name: @bail_on_different_return_type + ; CHECK-NEXT: %0:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00001010 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000?01? SignBits:4 + ; CHECK-NEXT: %idx:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000001 SignBits:63 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? 
SignBits:1 + %0:_(s8) = G_CONSTANT i8 3 + %1:_(s8) = G_CONSTANT i8 10 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0, %1 + %idx:_(s64) = G_CONSTANT i64 1 + %3:_(s16) = G_EXTRACT_VECTOR_ELT %2, %idx diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll index e086ab92421fb..33ea74912251e 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll @@ -52,12 +52,11 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3 ; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3 ; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d ; CHECK-NEXT: ret %div = sdiv <2 x i64> %a, %b %mul = mul <2 x i64> %c, %d diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll new file mode 100644 index 0000000000000..8d1abdd5380db --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s + +define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: fmmla.v8f16.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <8 x half> %vfmmla1.i +} + diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll new file mode 100644 index 0000000000000..4c33567732687 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s + +define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: fmmla.v4f32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x float> %vfmmla1.i +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index b54f262dbbf4a..4894932d3c9b1 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -755,199 +755,117 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-SD-NEXT: cbz w2, .LBB6_3 ; CHECK-SD-NEXT: // %bb.1: // %iter.check -; CHECK-SD-NEXT: str 
x25, [sp, #-64]! // 8-byte Folded Spill -; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w23, -40 -; CHECK-SD-NEXT: .cfi_offset w24, -48 -; CHECK-SD-NEXT: .cfi_offset w25, -64 -; CHECK-SD-NEXT: sxtb x9, w1 ; CHECK-SD-NEXT: cmp w2, #3 -; CHECK-SD-NEXT: mov w10, w2 +; CHECK-SD-NEXT: mov w9, w2 ; CHECK-SD-NEXT: b.hi .LBB6_4 ; CHECK-SD-NEXT: // %bb.2: -; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x10, xzr ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: b .LBB6_13 ; CHECK-SD-NEXT: .LBB6_3: -; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: mov x8, xzr +; CHECK-SD-NEXT: mov x0, x8 ; CHECK-SD-NEXT: ret ; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check -; CHECK-SD-NEXT: dup v0.2d, x9 ; CHECK-SD-NEXT: cmp w2, #16 ; CHECK-SD-NEXT: b.hs .LBB6_6 ; CHECK-SD-NEXT: // %bb.5: -; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x10, xzr ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: b .LBB6_10 ; CHECK-SD-NEXT: .LBB6_6: // %vector.ph +; CHECK-SD-NEXT: mov w8, w1 +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x8, v0.d[1] -; CHECK-SD-NEXT: and x12, x10, #0xc +; CHECK-SD-NEXT: sxtb x8, w8 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 -; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SD-NEXT: and x11, x9, #0xc ; CHECK-SD-NEXT: movi v7.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x15, x0 ; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 -; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 -; CHECK-SD-NEXT: and x16, x10, #0xfffffff0 -; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 -; CHECK-SD-NEXT: fmov x13, d0 -; CHECK-SD-NEXT: fmov x14, d0 +; CHECK-SD-NEXT: and x10, x9, #0xfffffff0 +; CHECK-SD-NEXT: dup v16.4s, w8 +; CHECK-SD-NEXT: mov x8, x0 +; CHECK-SD-NEXT: and x12, x9, #0xfffffff0 ; CHECK-SD-NEXT: .LBB6_7: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldr q17, [x15], #16 -; CHECK-SD-NEXT: subs x16, x16, #16 +; CHECK-SD-NEXT: ldr q17, [x8], #16 +; CHECK-SD-NEXT: subs x12, x12, #16 ; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0 -; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0 -; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0 -; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0 -; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0 -; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0 -; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0 -; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0 -; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0 -; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0 -; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0 -; CHECK-SD-NEXT: fmov x17, d21 -; CHECK-SD-NEXT: mov x2, v21.d[1] -; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0 -; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0 -; CHECK-SD-NEXT: fmov x18, d22 -; CHECK-SD-NEXT: fmov x1, d17 -; CHECK-SD-NEXT: fmov x3, d23 -; CHECK-SD-NEXT: fmov x21, d20 -; CHECK-SD-NEXT: fmov x22, d18 -; CHECK-SD-NEXT: fmov x19, d21 -; CHECK-SD-NEXT: mul x17, x13, x17 -; CHECK-SD-NEXT: mov x4, v22.d[1] -; CHECK-SD-NEXT: fmov x24, d19 -; 
CHECK-SD-NEXT: mov x5, v23.d[1] -; CHECK-SD-NEXT: mov x6, v21.d[1] -; CHECK-SD-NEXT: mov x7, v20.d[1] -; CHECK-SD-NEXT: mov x20, v18.d[1] -; CHECK-SD-NEXT: mov x23, v19.d[1] -; CHECK-SD-NEXT: mov x25, v17.d[1] -; CHECK-SD-NEXT: mul x18, x14, x18 -; CHECK-SD-NEXT: mul x1, x13, x1 -; CHECK-SD-NEXT: fmov d17, x17 -; CHECK-SD-NEXT: mul x3, x13, x3 -; CHECK-SD-NEXT: fmov d18, x18 -; CHECK-SD-NEXT: mul x19, x13, x19 -; CHECK-SD-NEXT: fmov d19, x1 -; CHECK-SD-NEXT: mul x21, x13, x21 -; CHECK-SD-NEXT: fmov d20, x3 -; CHECK-SD-NEXT: mul x22, x13, x22 -; CHECK-SD-NEXT: fmov d21, x19 -; CHECK-SD-NEXT: mul x24, x13, x24 -; CHECK-SD-NEXT: fmov d24, x21 -; CHECK-SD-NEXT: mul x2, x8, x2 -; CHECK-SD-NEXT: fmov d22, x22 -; CHECK-SD-NEXT: mul x4, x8, x4 -; CHECK-SD-NEXT: fmov d23, x24 -; CHECK-SD-NEXT: mul x5, x8, x5 -; CHECK-SD-NEXT: mov v17.d[1], x2 -; CHECK-SD-NEXT: mul x6, x8, x6 -; CHECK-SD-NEXT: mov v18.d[1], x4 -; CHECK-SD-NEXT: mul x7, x8, x7 -; CHECK-SD-NEXT: mov v20.d[1], x5 -; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d -; CHECK-SD-NEXT: mul x20, x8, x20 -; CHECK-SD-NEXT: mov v21.d[1], x6 -; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d -; CHECK-SD-NEXT: mul x23, x8, x23 -; CHECK-SD-NEXT: mov v24.d[1], x7 -; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d -; CHECK-SD-NEXT: mul x17, x8, x25 -; CHECK-SD-NEXT: mov v22.d[1], x20 -; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d -; CHECK-SD-NEXT: mov v23.d[1], x23 -; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d -; CHECK-SD-NEXT: mov v19.d[1], x17 -; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d -; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d -; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d +; CHECK-SD-NEXT: ushll2 v17.8h, v17.16b, #0 +; CHECK-SD-NEXT: ushll2 v19.4s, v18.8h, #0 +; CHECK-SD-NEXT: ushll v20.4s, v17.4h, #0 +; CHECK-SD-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-SD-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-SD-NEXT: smlal2 v2.2d, v16.4s, v19.4s +; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s +; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s +; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s +; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s +; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s +; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s +; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s ; CHECK-SD-NEXT: b.ne .LBB6_7 ; CHECK-SD-NEXT: // %bb.8: // %middle.block -; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d -; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d -; CHECK-SD-NEXT: cmp x11, x10 -; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d -; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d +; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT: cmp x10, x9 ; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d -; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d ; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d -; CHECK-SD-NEXT: addp d1, v1.2d -; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15 ; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check -; CHECK-SD-NEXT: cbz x12, .LBB6_13 +; CHECK-SD-NEXT: cbz x11, .LBB6_13 ; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: mov w11, w1 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: mov x13, x11 +; CHECK-SD-NEXT: sxtb x11, w11 ; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff -; CHECK-SD-NEXT: fmov x14, d0 -; CHECK-SD-NEXT: and x11, x10, #0xfffffffc -; CHECK-SD-NEXT: fmov x15, 
d0 -; CHECK-SD-NEXT: sub x12, x13, x11 -; CHECK-SD-NEXT: add x13, x0, x13 -; CHECK-SD-NEXT: mov v1.d[0], x8 -; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: dup v2.2s, w11 +; CHECK-SD-NEXT: mov x11, x10 +; CHECK-SD-NEXT: and x10, x9, #0xfffffffc +; CHECK-SD-NEXT: mov v0.d[0], x8 +; CHECK-SD-NEXT: sub x8, x11, x10 +; CHECK-SD-NEXT: add x11, x0, x11 ; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldr s0, [x13], #4 -; CHECK-SD-NEXT: adds x12, x12, #4 -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0 -; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT: ldr s4, [x11], #4 +; CHECK-SD-NEXT: adds x8, x8, #4 +; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-SD-NEXT: ushll v5.2d, v4.2s, #0 +; CHECK-SD-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-SD-NEXT: and v5.16b, v5.16b, v3.16b ; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: fmov x16, d4 -; CHECK-SD-NEXT: fmov x18, d0 -; CHECK-SD-NEXT: mov x17, v4.d[1] -; CHECK-SD-NEXT: mov x1, v0.d[1] -; CHECK-SD-NEXT: mul x16, x14, x16 -; CHECK-SD-NEXT: mul x18, x15, x18 -; CHECK-SD-NEXT: mul x17, x8, x17 -; CHECK-SD-NEXT: fmov d0, x16 -; CHECK-SD-NEXT: mul x1, x8, x1 -; CHECK-SD-NEXT: fmov d4, x18 -; CHECK-SD-NEXT: mov v0.d[1], x17 -; CHECK-SD-NEXT: mov v4.d[1], x1 -; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-SD-NEXT: xtn v5.2s, v5.2d +; CHECK-SD-NEXT: xtn v4.2s, v4.2d +; CHECK-SD-NEXT: smlal v1.2d, v2.2s, v4.2s +; CHECK-SD-NEXT: smlal v0.2d, v2.2s, v5.2s ; CHECK-SD-NEXT: b.ne .LBB6_11 ; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block -; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d -; CHECK-SD-NEXT: cmp x11, x10 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: cmp x10, x9 ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: b.eq .LBB6_15 ; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader -; CHECK-SD-NEXT: sub x10, x10, x11 -; CHECK-SD-NEXT: add x11, x0, x11 +; CHECK-SD-NEXT: sxtb x11, w1 +; CHECK-SD-NEXT: sub x9, x9, x10 +; CHECK-SD-NEXT: add x10, x0, x10 ; CHECK-SD-NEXT: .LBB6_14: // %for.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldrb w12, [x11], #1 -; CHECK-SD-NEXT: subs x10, x10, #1 -; CHECK-SD-NEXT: smaddl x8, w12, w9, x8 +; CHECK-SD-NEXT: ldrb w12, [x10], #1 +; CHECK-SD-NEXT: subs x9, x9, #1 +; CHECK-SD-NEXT: smaddl x8, w12, w11, x8 ; CHECK-SD-NEXT: b.ne .LBB6_14 -; CHECK-SD-NEXT: .LBB6_15: -; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT: .LBB6_15: // %for.cond.cleanup ; CHECK-SD-NEXT: mov x0, x8 ; CHECK-SD-NEXT: ret ; @@ -957,63 +875,64 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-GI-NEXT: cbz w2, .LBB6_7 ; CHECK-GI-NEXT: // %bb.1: // %iter.check ; CHECK-GI-NEXT: movi d0, #0000000000000000 -; CHECK-GI-NEXT: sxtb x9, w1 -; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: mov x10, xzr ; CHECK-GI-NEXT: cmp w2, #4 -; CHECK-GI-NEXT: mov w10, w2 +; CHECK-GI-NEXT: mov w9, w2 ; CHECK-GI-NEXT: b.lo .LBB6_12 ; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check ; CHECK-GI-NEXT: movi d0, #0000000000000000 -; CHECK-GI-NEXT: 
dup v1.2d, x9 -; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: mov x10, xzr ; CHECK-GI-NEXT: cmp w2, #16 ; CHECK-GI-NEXT: b.lo .LBB6_9 ; CHECK-GI-NEXT: // %bb.3: // %vector.ph +; CHECK-GI-NEXT: mov w8, w1 ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-NEXT: xtn v2.2s, v1.2d -; CHECK-GI-NEXT: and x8, x10, #0xc +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 -; CHECK-GI-NEXT: and x11, x10, #0xfffffff0 -; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v6.2d, #0000000000000000 -; CHECK-GI-NEXT: mov x12, x0 +; CHECK-GI-NEXT: and x10, x9, #0xfffffff0 +; CHECK-GI-NEXT: dup v5.2d, x8 ; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 -; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 -; CHECK-GI-NEXT: and x13, x10, #0xfffffff0 -; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 +; CHECK-GI-NEXT: and x8, x9, #0xc +; CHECK-GI-NEXT: mov x11, x0 +; CHECK-GI-NEXT: and x12, x9, #0xfffffff0 +; CHECK-GI-NEXT: xtn v16.2s, v5.2d +; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-NEXT: .LBB6_4: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldr q18, [x12], #16 -; CHECK-GI-NEXT: subs x13, x13, #16 -; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 -; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0 -; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0 -; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0 +; CHECK-GI-NEXT: ldr q17, [x11], #16 +; CHECK-GI-NEXT: subs x12, x12, #16 +; CHECK-GI-NEXT: ushll v18.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0 +; CHECK-GI-NEXT: ushll v19.4s, v18.4h, #0 ; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0 -; CHECK-GI-NEXT: mov d22, v20.d[1] -; CHECK-GI-NEXT: mov d23, v19.d[1] -; CHECK-GI-NEXT: mov d24, v21.d[1] -; CHECK-GI-NEXT: mov d25, v18.d[1] -; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s -; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s -; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s -; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s -; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s -; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s -; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s -; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s +; CHECK-GI-NEXT: ushll v20.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT: mov d21, v19.d[1] +; CHECK-GI-NEXT: mov d22, v18.d[1] +; CHECK-GI-NEXT: mov d23, v20.d[1] +; CHECK-GI-NEXT: mov d24, v17.d[1] +; CHECK-GI-NEXT: smlal v0.2d, v16.2s, v19.2s +; CHECK-GI-NEXT: smlal v2.2d, v16.2s, v18.2s +; CHECK-GI-NEXT: smlal v4.2d, v16.2s, v20.2s +; CHECK-GI-NEXT: smlal v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT: smlal v1.2d, v16.2s, v21.2s +; CHECK-GI-NEXT: smlal v3.2d, v16.2s, v22.2s +; CHECK-GI-NEXT: smlal v5.2d, v16.2s, v23.2s +; CHECK-GI-NEXT: smlal v7.2d, v16.2s, v24.2s ; CHECK-GI-NEXT: b.ne .LBB6_4 ; CHECK-GI-NEXT: // %bb.5: // %middle.block -; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: cmp x10, x9 ; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d -; CHECK-GI-NEXT: cmp x11, x10 ; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d -; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d ; 
CHECK-GI-NEXT: addp d0, v0.2d ; CHECK-GI-NEXT: b.ne .LBB6_8 ; CHECK-GI-NEXT: // %bb.6: @@ -1027,50 +946,54 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none ; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check ; CHECK-GI-NEXT: cbz x8, .LBB6_12 ; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph +; CHECK-GI-NEXT: mov w8, w1 ; CHECK-GI-NEXT: mov v0.d[1], xzr -; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT: mov x12, x11 -; CHECK-GI-NEXT: xtn v1.2s, v1.2d -; CHECK-GI-NEXT: and x11, x10, #0xfffffffc -; CHECK-GI-NEXT: sub x8, x12, x11 -; CHECK-GI-NEXT: add x12, x0, x12 +; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: mov x11, x10 +; CHECK-GI-NEXT: and x10, x9, #0xfffffffc +; CHECK-GI-NEXT: dup v2.2d, x8 +; CHECK-GI-NEXT: sub x8, x11, x10 +; CHECK-GI-NEXT: add x11, x0, x11 +; CHECK-GI-NEXT: xtn v2.2s, v2.2d ; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldr w13, [x12], #4 +; CHECK-GI-NEXT: ldr w12, [x11], #4 ; CHECK-GI-NEXT: adds x8, x8, #4 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: uxtb w13, w13 +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: uxtb w12, w12 ; CHECK-GI-NEXT: mov b4, v3.b[2] ; CHECK-GI-NEXT: mov b5, v3.b[1] ; CHECK-GI-NEXT: mov b6, v3.b[3] -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: fmov w14, s4 -; CHECK-GI-NEXT: fmov w15, s5 -; CHECK-GI-NEXT: fmov w16, s6 +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: fmov w13, s4 +; CHECK-GI-NEXT: fmov w14, s5 +; CHECK-GI-NEXT: fmov w15, s6 +; CHECK-GI-NEXT: uxtb w13, w13 ; CHECK-GI-NEXT: uxtb w14, w14 ; CHECK-GI-NEXT: uxtb w15, w15 -; CHECK-GI-NEXT: uxtb w16, w16 -; CHECK-GI-NEXT: fmov s4, w14 -; CHECK-GI-NEXT: mov v3.s[1], w15 -; CHECK-GI-NEXT: mov v4.s[1], w16 -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s -; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s +; CHECK-GI-NEXT: fmov s4, w13 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: mov v4.s[1], w15 +; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT: smlal v1.2d, v2.2s, v4.2s ; CHECK-GI-NEXT: b.ne .LBB6_10 ; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block -; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: cmp x11, x10 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: cmp x10, x9 ; CHECK-GI-NEXT: addp d0, v0.2d ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: b.eq .LBB6_14 ; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader -; CHECK-GI-NEXT: sub x10, x10, x11 -; CHECK-GI-NEXT: add x11, x0, x11 +; CHECK-GI-NEXT: sxtb x11, w1 +; CHECK-GI-NEXT: sub x9, x9, x10 +; CHECK-GI-NEXT: add x10, x0, x10 ; CHECK-GI-NEXT: .LBB6_13: // %for.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldrb w8, [x11], #1 +; CHECK-GI-NEXT: ldrb w8, [x10], #1 ; CHECK-GI-NEXT: fmov x12, d0 -; CHECK-GI-NEXT: subs x10, x10, #1 -; CHECK-GI-NEXT: madd x8, x8, x9, x12 +; CHECK-GI-NEXT: subs x9, x9, #1 +; CHECK-GI-NEXT: madd x8, x8, x11, x12 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: b.ne .LBB6_13 ; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll index a84d666c1be6b..d1bcad4724e48 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll @@ -24,8 +24,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i 
%ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vabd_ext = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vabd_ext, %acc_phi @@ -65,8 +65,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi @@ -116,8 +116,8 @@ loop: %acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -160,8 +160,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -198,8 +198,8 @@ loop: ; Load values from ptr1 and ptr2 %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 ; Perform the intrinsic operation %vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd @@ -237,8 +237,8 @@ loop: %acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <2 x i32>, <2 x i32>* %ptr1_i, align 1 - %b = load <2 x i32>, <2 x i32>* %ptr2_i, align 1 + %a = load <2 x i32>, ptr %ptr1_i, align 1 + %b = load <2 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) %acc_next = add <2 x i32> %acc_phi, %vabd %next_i = add i32 %i, 2 @@ -272,8 +272,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, 
align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -307,8 +307,8 @@ loop: %acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) %acc_next = add <16 x i8> %acc_phi, %vabd %next_i = add i32 %i, 16 @@ -342,8 +342,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -377,8 +377,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -411,8 +411,8 @@ loop: %acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %acc_next = add <4 x i16> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -445,8 +445,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -480,8 +480,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vmov = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vmov, %acc_phi @@ -516,8 +516,8 @@ loop: %acc_phi = phi <4 x i32> [ 
zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 0cd885e599817..e85e808921c87 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1 +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v8i8_v8i16: @@ -1832,14 +1829,33 @@ entry: } define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { -; CHECK-LABEL: pmlsl2_v8i16_uzp1: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr q2, [x1, #16] -; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b -; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: ret +; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-NEON: // %bb.0: +; CHECK-NEON-NEXT: ldr q2, [x1, #16] +; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b +; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-NEON-NEXT: str q0, [x0] +; CHECK-NEON-NEXT: ret +; +; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-SVE: // %bb.0: +; CHECK-SVE-NEXT: ldr q2, [x1, #16] +; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b +; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SVE-NEXT: str q0, [x0] +; CHECK-SVE-NEXT: ret +; +; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q2, [x1, #16] +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: xtn v2.8b, v2.8h +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret %5 = getelementptr inbounds i32, ptr %3, i64 4 %6 = load <8 x i16>, ptr %5, align 4 %7 = trunc <8 x i16> %6 to <8 x i8> @@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { } define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { -; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b -; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b -; CHECK-NEXT: add v0.8h, v3.8h, v0.8h -; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: ret 
+; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-NEON: // %bb.0: // %entry +; CHECK-NEON-NEXT: ldp q2, q3, [x1] +; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b +; CHECK-NEON-NEXT: pmull v3.8h, v0.8b, v2.8b +; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-NEON-NEXT: add v0.8h, v3.8h, v0.8h +; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-NEON-NEXT: str q0, [x0] +; CHECK-NEON-NEXT: ret +; +; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ldp q2, q3, [x1] +; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b +; CHECK-SVE-NEXT: pmull v3.8h, v0.8b, v2.8b +; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b +; CHECK-SVE-NEXT: add v0.8h, v3.8h, v0.8h +; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-SVE-NEXT: str q0, [x0] +; CHECK-SVE-NEXT: ret +; +; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp q2, q3, [x1] +; CHECK-GI-NEXT: mov d4, v0.d[1] +; CHECK-GI-NEXT: xtn v2.8b, v2.8h +; CHECK-GI-NEXT: xtn v3.8b, v3.8h +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b +; CHECK-GI-NEXT: pmull v2.8h, v4.8b, v3.8b +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: ret entry: %5 = load <8 x i16>, ptr %3, align 4 %6 = trunc <8 x i16> %5 to <8 x i8> diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. -define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index 2a8b3ce2ae10b..8cb319b2c3368 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,11 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64 +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 @@ -2721,14 +2716,24 @@ entry: } define i128 @test_vmull_p64(i64 %a, i64 %b) #4 { -; CHECK-LABEL: test_vmull_p64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: 
pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: mov x1, v0.d[1] -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vmull_p64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: mov x1, v0.d[1] +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vmull_p64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: ret entry: %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) %vmull3.i = bitcast <16 x i8> %vmull2.i to i128 @@ -2736,12 +2741,22 @@ entry: } define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 { -; CHECK-LABEL: test_vmull_high_p64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d -; CHECK-NEXT: mov x1, v0.d[1] -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vmull_high_p64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d +; CHECK-SD-NEXT: mov x1, v0.d[1] +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vmull_high_p64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: ret entry: %0 = extractelement <2 x i64> %a, i32 1 %1 = extractelement <2 x i64> %b, i32 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir new file mode 100644 index 0000000000000..8c31e7c2d1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s +--- | + define i32 @test01() nounwind { + entry: + %0 = select i1 true, i32 1, i32 0 + %1 = and i32 %0, 65535 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %if.then, label %if.end + + if.then: ; preds = %entry + ret i32 1 + + if.end: ; preds = %entry + ret i32 0 + } +... +--- +name: test01 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32common } +body: | + ; CHECK-LABEL: name: test01 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $w0 = MOVi32imm 1 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: $w0 = MOVi32imm 0 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + bb.0.entry: + successors: %bb.2.if.end, %bb.1.if.then + + %0 = MOVi32imm 1 + %1 = ANDWri killed %1, 15 + $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv + Bcc 12, %bb.2.if.end, implicit $nzcv + + bb.1.if.then: + $w0 = MOVi32imm 1 + RET_ReallyLR implicit $w0 + + bb.2.if.end: + $w0 = MOVi32imm 0 + RET_ReallyLR implicit $w0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index e6df9f2fb2c56..90abc7d389c13 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -2,44 +2,35 @@ ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for pmull8h -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for commutable_pmull8h -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_1s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_low -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_high -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_low -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_high -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1 -; CHECK-GI-NEXT: warning: 
Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_commutable_pmull_64 +; CHECK-GI: warning: Instruction selection used fallback path for sqdmulh_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull8h: @@ -2895,11 +2886,18 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { } define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { -; CHECK-LABEL: pmull_from_extract_dup_high: -; CHECK: 
// %bb.0: -; CHECK-NEXT: dup v1.16b, w0 -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: pmull_from_extract_dup_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.16b, w0 +; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pmull_from_extract_dup_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.8b, w0 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -2924,12 +2922,20 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) } define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) { -; CHECK-LABEL: pmull_from_extract_duplane_high: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: dup v1.16b, v1.b[0] -; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: pmull_from_extract_duplane_high: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: dup v1.16b, v1.b[0] +; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pmull_from_extract_duplane_high: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: dup v1.8b, v1.b[0] +; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -3245,21 +3251,35 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { } define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { -; CHECK-LABEL: test_pmull_64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_pmull_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: ret %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) ret <16 x i8> %val } define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { -; CHECK-LABEL: test_pmull_high_64: -; CHECK: // %bb.0: -; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_pmull_high_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_high_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: mov d1, v1.d[1] +; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: ret %l_hi = extractelement <2 x i64> %l, i32 1 %r_hi = extractelement <2 x i64> %r, i32 1 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) @@ -3267,13 +3287,22 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { } define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind { -; CHECK-LABEL: test_commutable_pmull_64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: fmov 
d1, x0 -; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d -; CHECK-NEXT: add v0.16b, v0.16b, v0.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_commutable_pmull_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov d0, x1 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_commutable_pmull_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov d0, x0 +; CHECK-GI-NEXT: fmov d1, x1 +; CHECK-GI-NEXT: pmull v2.1q, v0.1d, v1.1d +; CHECK-GI-NEXT: pmull v0.1q, v1.1d, v0.1d +; CHECK-GI-NEXT: add v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: ret %1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) %2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l) %3 = add <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index dbbfbea9176f6..f725c19081deb 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -188,11 +188,11 @@ entry: define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x0] -; CHECK-NEXT: ld1r { v2.8b }, [x1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[2], v2.h[0] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll index bdbc99e2d98b0..75e7ac902274d 100644 --- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll @@ -2,15 +2,58 @@ declare void @called() declare void @escaped() -define void @f(ptr %dst) { +define void @f(ptr %dst, ptr readonly %f) { call void @called() +; CHECK: bl "#called" store ptr @escaped, ptr %dst - ret void + call void %f() +; CHECK: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: str x8, [x20] +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg] +; CHECK-NEXT: mov x11, +; CHECK-NEXT: blr x8 +; CHECK-NEXT: blr x11 + ret void } +; CHECK-LABEL: .def "#called$exit_thunk"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk" +; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk" +; CHECK-NEXT: .weak_anti_dep called +; CHECK-NEXT: called = "#called" +; CHECK-NEXT: .weak_anti_dep "#called" +; CHECK-NEXT: "#called" = "#called$exit_thunk" +; CHECK-NEXT: .seh_proc "#called$exit_thunk" +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll index 608fe29e17398..d421b3f17caf8 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll @@ -54,9 +54,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -65,9 +65,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll index 10f0e10f11d66..a9da1253de01d 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll @@ -19,9 +19,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -30,9 +30,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll index 9986af7eb231c..7ab2aba8d75e2 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll +++ 
b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll @@ -12,9 +12,9 @@ define i32 @0(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -23,9 +23,9 @@ entry: define i32 @1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index 533e831de0df8..258eaabee9376 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -14,13 +14,12 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmul z7.d, z0.d, z1.d ; CHECK-NEXT: fmul z1.d, z6.d, z1.d -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z2.d +; CHECK-NEXT: fmad z6.d, p0/m, z2.d, z7.d ; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d ; CHECK-NEXT: fadd z2.d, z2.d, z0.d -; CHECK-NEXT: fadd z1.d, z3.d, z1.d +; CHECK-NEXT: fadd z1.d, z6.d, z1.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret @@ -225,17 +224,14 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale ; CHECK-NEXT: fmul z1.d, z25.d, z1.d ; CHECK-NEXT: fmul z3.d, z4.d, z24.d ; CHECK-NEXT: fmul z24.d, z5.d, z24.d -; CHECK-NEXT: movprfx z7, z26 -; CHECK-NEXT: fmla z7.d, p0/m, z25.d, z2.d +; CHECK-NEXT: fmad z25.d, p0/m, z2.d, z26.d ; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z5.d -; CHECK-NEXT: movprfx z2, z24 -; CHECK-NEXT: fnmls z2.d, p0/m, z4.d, z6.d -; CHECK-NEXT: fadd z2.d, z0.d, z2.d -; CHECK-NEXT: fadd z1.d, z7.d, z1.d -; CHECK-NEXT: zip1 z0.d, z2.d, z1.d -; CHECK-NEXT: zip2 z1.d, z2.d, z1.d +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z5.d +; CHECK-NEXT: fnmsb z4.d, p0/m, z6.d, z24.d +; CHECK-NEXT: fadd z1.d, z0.d, z4.d +; CHECK-NEXT: fadd z2.d, z25.d, z3.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll index 1eed9722f57be..b68c0094f84de 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -200,12 +200,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, 
<vscale ; CHECK-NEXT: fmul z3.d, z2.d, z25.d ; CHECK-NEXT: fmul z25.d, z24.d, z25.d ; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z0.d -; CHECK-NEXT: movprfx z24, z25 -; CHECK-NEXT: fmla z24.d, p0/m, z26.d, z1.d -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fmla z6.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z25.d, p0/m, z26.d, z1.d +; CHECK-NEXT: fmla z25.d, p0/m, z5.d, z4.d ; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z6.d +; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z25.d ; CHECK-NEXT: fmsb z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll index c2fc959d8e101..583391cd22ef7 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll @@ -17,11 +17,10 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x ; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h -; CHECK-NEXT: zip2 z2.d, z0.d, z1.d -; CHECK-NEXT: zip1 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s +; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: zip2 z1.d, z0.d, z2.d +; CHECK-NEXT: zip1 z0.d, z0.d, z2.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 061fd07489284..00b0095e4309c 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -18,11 +18,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4 ; CHECK-NEXT: uzp2 z1.d, z1.d, z3.d ; CHECK-NEXT: mul z5.d, z2.d, z0.d ; CHECK-NEXT: mul z2.d, z2.d, z4.d -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: mla z3.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mad z4.d, p0/m, z1.d, z5.d ; CHECK-NEXT: msb z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z0.d, z3.d -; CHECK-NEXT: zip1 z0.d, z0.d, z3.d +; CHECK-NEXT: zip2 z1.d, z0.d, z4.d +; CHECK-NEXT: zip1 z0.d, z0.d, z4.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/divrem.ll b/llvm/test/CodeGen/AArch64/divrem.ll index 5cd7e098d00bb..e3cbd17dc4c3f 100644 --- a/llvm/test/CodeGen/AArch64/divrem.ll +++ b/llvm/test/CodeGen/AArch64/divrem.ll @@ -2,7 +2,7 @@ ; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and ; should not generate select error. 
-define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, ptr %z) { ; CHECK-LABEL: test_udivrem ; CHECK-DAG: udivrem ; CHECK-NOT: LLVM ERROR: Cannot select @@ -12,10 +12,10 @@ define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { ret <2 x i32> %1 } -define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { +define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: test_sdivrem ; CHECK-DAG: sdivrem - %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > store <4 x i32> %div, ptr %y %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index 35eafe8b7d99c..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -68,13 +68,9 @@ # CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4) # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 # # CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 @@ -83,14 +79,10 @@ # CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0 # CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -100,38 +92,26 @@ # ASM: str x29, [sp, #-16]! 
# ASM-NEXT: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: .cfi_def_cfa_offset 2080 +# ASM-NEXT: addvl sp, sp, #-3 # ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: .cfi_def_cfa wsp, 1056 -# ASM-NEXT: add sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM: add sp, sp, #2064 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldr x29, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w29 # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +2080 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -182,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, 
implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! 
+# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -270,13 +241,9 @@ body: | # CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5) # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 # # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 @@ -286,14 +253,10 @@ body: | # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0 # CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23 # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -303,38 +266,27 @@ body: | # ASM: str x29, [sp, #-16]! 
# ASM-NEXT: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: .cfi_def_cfa_offset 2080 +# ASM-NEXT: addvl sp, sp, #-3 # ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: .cfi_def_cfa wsp, 1056 -# ASM-NEXT: add sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM: add sp, sp, #2064 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldr x29, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w29 +# ASM-NEXT: ret # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +2080 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -385,10 +337,8 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg # # CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 @@ -396,10 +346,8 @@ body: | # CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3 # CHECK-NEXT: STR_PXI $p0, $fp, -1 # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# 
CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 @@ -414,15 +362,11 @@ body: | # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 -# ASM-NEXT: sub sp, sp, #1024 -# ASM-NEXT: addvl sp, sp, #-1 -# ASM-NEXT: sub sp, sp, #1040 -# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: sub sp, sp, #2064 +# ASM-NEXT: addvl sp, sp, #-3 # -# ASM: addvl sp, sp, #2 -# ASM-NEXT: add sp, sp, #1024 -# ASM-NEXT: addvl sp, sp, #1 -# ASM-NEXT: add sp, sp, #1040 +# ASM: add sp, sp, #2064 +# ASM-NEXT: addvl sp, sp, #3 # ASM-NEXT: .cfi_def_cfa wsp, 16 # ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll index e1bc7426ad63e..179df026e25d6 100644 --- a/llvm/test/CodeGen/AArch64/frem-power2.ll +++ b/llvm/test/CodeGen/AArch64/frem-power2.ll @@ -85,6 +85,84 @@ entry: ret float %fmod } +define float @frem2_exp(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl expf +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: bl expf +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + +define float @frem2_exp2(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp2: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl exp2f +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp2: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: bl exp2f +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp2.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + +define float @frem2_exp10(float %x) #0 { +; CHECK-SD-LABEL: frem2_exp10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: bl exp10f +; CHECK-SD-NEXT: fmov s1, #0.50000000 +; CHECK-SD-NEXT: fmov s2, #-2.00000000 +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: frintz s1, s1 +; CHECK-SD-NEXT: fmadd s0, s1, s2, s0 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: frem2_exp10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: bl exp10f +; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: b fmodf +entry: + %a = tail call float @llvm.exp10.f32(float %x) + %fmod = frem float %a, 2.0 + ret float %fmod +} + define half @hrem2_nsz(half %x) { ; CHECK-SD-LABEL: hrem2_nsz: ; CHECK-SD: // %bb.0: // %entry @@ -630,3 +708,5 @@ entry: %fmod = frem float -12.50, %y ret float %fmod } + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 7f07ef476b8aa..1db776ea6f616 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -3537,27 +3537,22 @@ define <7 x i32> @rotl_v7i32_c(<7 x i32> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 ; CHECK-SD-NEXT: fmov s1, w4 -; CHECK-SD-NEXT: adrp x8, .LCPI108_0 -; CHECK-SD-NEXT: adrp x9, .LCPI108_1 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI108_0] -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI108_1] ; CHECK-SD-NEXT: mov v0.s[1], w1 ; CHECK-SD-NEXT: mov v1.s[1], w5 ; CHECK-SD-NEXT: mov v0.s[2], w2 ; CHECK-SD-NEXT: mov v1.s[2], w6 ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-SD-NEXT: shl v4.4s, v0.4s, #3 -; CHECK-SD-NEXT: usra v4.4s, v0.4s, #29 -; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: mov w1, v4.s[1] -; CHECK-SD-NEXT: mov w2, v4.s[2] -; CHECK-SD-NEXT: mov w3, v4.s[3] -; CHECK-SD-NEXT: mov w5, v0.s[1] -; CHECK-SD-NEXT: mov w6, v0.s[2] -; CHECK-SD-NEXT: fmov w0, s4 -; CHECK-SD-NEXT: fmov w4, s0 +; CHECK-SD-NEXT: shl v3.4s, v1.4s, #3 +; CHECK-SD-NEXT: usra v3.4s, v1.4s, #29 +; CHECK-SD-NEXT: shl v2.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w5, v3.s[1] +; CHECK-SD-NEXT: mov w6, v3.s[2] +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: usra v2.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w1, v2.s[1] +; CHECK-SD-NEXT: mov w2, v2.s[2] +; CHECK-SD-NEXT: mov w3, v2.s[3] +; CHECK-SD-NEXT: fmov w0, s2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rotl_v7i32_c: @@ -3614,27 +3609,22 @@ define <7 x i32> @rotr_v7i32_c(<7 x i32> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 ; CHECK-SD-NEXT: fmov s1, w4 -; CHECK-SD-NEXT: adrp x8, .LCPI109_0 -; CHECK-SD-NEXT: adrp x9, .LCPI109_1 -; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI109_0] -; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI109_1] ; CHECK-SD-NEXT: mov v0.s[1], w1 ; CHECK-SD-NEXT: mov v1.s[1], w5 ; CHECK-SD-NEXT: mov v0.s[2], w2 ; CHECK-SD-NEXT: mov v1.s[2], w6 ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-SD-NEXT: shl v4.4s, v0.4s, #29 -; CHECK-SD-NEXT: usra v4.4s, v0.4s, #3 -; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: mov w1, v4.s[1] -; CHECK-SD-NEXT: mov w2, v4.s[2] -; CHECK-SD-NEXT: mov w3, v4.s[3] -; CHECK-SD-NEXT: mov w5, v0.s[1] -; CHECK-SD-NEXT: mov w6, v0.s[2] -; CHECK-SD-NEXT: fmov w0, s4 -; CHECK-SD-NEXT: fmov w4, s0 +; CHECK-SD-NEXT: shl v3.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v3.4s, v1.4s, #3 +; CHECK-SD-NEXT: shl v2.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w5, v3.s[1] +; CHECK-SD-NEXT: mov w6, v3.s[2] +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: usra v2.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w1, v2.s[1] +; CHECK-SD-NEXT: mov w2, v2.s[2] +; CHECK-SD-NEXT: mov w3, v2.s[3] +; CHECK-SD-NEXT: fmov w0, s2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rotr_v7i32_c: @@ -4132,36 +4122,31 @@ define <7 x i32> @fshl_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; 
CHECK-SD-LABEL: fshl_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI134_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI134_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI134_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI134_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #3 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #29 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #29 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #29 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] @@ -4225,36 +4210,31 @@ define <7 x i32> @fshr_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; CHECK-SD-LABEL: fshr_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI135_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI135_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI135_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI135_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #3 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; 
CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #3 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #3 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll index df4889b6f09de..bd6c168ce8776 100644 --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE -; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64 +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) @@ -521,12 +518,12 @@ entry: } define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { -; CHECK-LABEL: test_pmull_high_p8_128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, x3 -; CHECK-NEXT: fmov d1, x1 -; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b -; CHECK-NEXT: ret +; CHECK-LE-LABEL: test_pmull_high_p8_128: +; CHECK-LE: // %bb.0: // %entry +; CHECK-LE-NEXT: fmov d0, x3 +; CHECK-LE-NEXT: fmov d1, x1 +; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b +; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_128: ; CHECK-BE: // %bb.0: // %entry @@ -538,6 +535,15 @@ define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) { ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h ; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-BE-NEXT: ret +; +; CHECK-GI-LABEL: test_pmull_high_p8_128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.d[0], x0 +; CHECK-GI-NEXT: mov v1.d[0], x2 +; CHECK-GI-NEXT: mov v0.d[1], x1 +; CHECK-GI-NEXT: mov v1.d[1], x3 +; CHECK-GI-NEXT: pmull2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: ret entry: %a = bitcast i128 %aa to <16 x i8> %b = bitcast i128 %bb to <16 x i8> diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8297fa2d4e3f9 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) { ret i1 %3 } +define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: 
tst w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse_4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bic w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 
%3, %not + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bic x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) { +; 
CHECK-LABEL: test_disjoint2_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + ; negative test define i1 @lt3_u8(i8 %0) { ; CHECK-LABEL: lt3_u8: diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll index 91cf605613b9e..c0c8894ce1f6b 100644 --- a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll +++ b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll @@ -85,7 +85,7 @@ define i64 @test_ldrsw_ldursw(ptr %p) #0 { ; CHECK-NEXT: add.2d v0, v[[V0]], v[[V1]] ; CHECK-NEXT: ret define <2 x i64> @test_ldrq_ldruq_invalidoffset(ptr %p) #0 { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 3 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
+--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... +--- +name: impdef_both_same +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both_same + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index 47fae5a01c931..f0abbaac2e68c 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -1148,11 +1148,10 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x ; CHECK-NEXT: and z3.d, z3.d, #0x3f ; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: lslr z5.d, p0/m, z5.d, z1.d ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d ; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: orr z1.d, z5.d, z1.d ; CHECK-NEXT: ret %fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b) ret <vscale x 4 x i64> %fshl diff --git a/llvm/test/CodeGen/AArch64/llvm.sincos.ll b/llvm/test/CodeGen/AArch64/llvm.sincos.ll index f1dcb2a478a0d..21da8645b9b16 100644 --- a/llvm/test/CodeGen/AArch64/llvm.sincos.ll +++ b/llvm/test/CodeGen/AArch64/llvm.sincos.ll @@ -215,6 +215,133 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) nounwind { ret { <2 x half>, <2 x half> } %result } +define { <3 x half>, <3 x half> } @test_sincos_v3f16(<3 x half> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-NEXT: fcvt s0, h1 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #28 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #44 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #60 +; CHECK-NEXT: add x1, sp, #56 +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s2, s0, [sp, #32] +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-NEXT: ldp s3, s1, [sp, #24] +; CHECK-NEXT: fcvt h4, s0 +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h0, s1 +; CHECK-NEXT: fcvt h1, s3 +; CHECK-NEXT: ldp s5, s3, [sp, #40] +; CHECK-NEXT: fcvt h3, s3 +; CHECK-NEXT: mov v0.h[1], v4.h[0] +; CHECK-NEXT: fcvt h4, s5 +; CHECK-NEXT: mov v1.h[1], v2.h[0] +; CHECK-NEXT: ldp s5, s2, [sp, #56] +; CHECK-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NEXT: fcvt h2, s2 +; CHECK-NEXT: fcvt h3, s5 +; CHECK-NEXT: mov v1.h[2], v4.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] +; CHECK-NEXT: mov v1.h[3], v3.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f16: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: sub sp, sp, #80 +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 def $q0 +; NO-LIBCALL-NEXT: mov h1, v0.h[1] +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: str x30, [sp, 
#64] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: fcvt s8, h1 +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: fcvt s9, h1 +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[2] +; NO-LIBCALL-NEXT: fcvt s10, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: mov h1, v1.h[3] +; NO-LIBCALL-NEXT: fcvt s11, h1 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl sinf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s8 +; NO-LIBCALL-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s9 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v0.h[1], v1.h[0] +; NO-LIBCALL-NEXT: str q0, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov s0, s10 +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fcvt h0, s0 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[2], v0.h[0] +; NO-LIBCALL-NEXT: fmov s0, s11 +; NO-LIBCALL-NEXT: str q1, [sp] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: bl cosf +; NO-LIBCALL-NEXT: fmov s1, s0 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NO-LIBCALL-NEXT: fcvt h2, s1 +; NO-LIBCALL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: mov v1.h[3], v2.h[0] +; NO-LIBCALL-NEXT: // kill: def $d1 killed $d1 killed $q1 +; NO-LIBCALL-NEXT: add sp, sp, #80 +; NO-LIBCALL-NEXT: ret + %result = call { <3 x half>, <3 x half> } @llvm.sincos.v3f16(<3 x half> %a) + ret { <3 x half>, <3 x half> } %result +} + define { float, float } @test_sincos_f32(float %a) nounwind { ; CHECK-LABEL: test_sincos_f32: ; CHECK: // %bb.0: @@ -493,3 +620,71 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) nounwi %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } + +define { <3 x double>, <3 x double> } @test_sincos_v3f64(<3 x double> %a) nounwind { +; CHECK-LABEL: test_sincos_v3f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: add x0, sp, #16 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: fmov d8, d2 +; CHECK-NEXT: fmov d9, d1 +; CHECK-NEXT: bl sincos +; 
CHECK-NEXT: fmov d0, d9 +; CHECK-NEXT: add x0, sp, #32 +; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: add x0, sp, #72 +; CHECK-NEXT: add x1, sp, #40 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldp d3, d0, [sp, #8] +; CHECK-NEXT: ldr d2, [sp, #72] +; CHECK-NEXT: ldp d4, d1, [sp, #24] +; CHECK-NEXT: ldr d5, [sp, #40] +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; +; NO-LIBCALL-LABEL: test_sincos_v3f64: +; NO-LIBCALL: // %bb.0: +; NO-LIBCALL-NEXT: stp d13, d12, [sp, #-64]! // 16-byte Folded Spill +; NO-LIBCALL-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; NO-LIBCALL-NEXT: fmov d8, d2 +; NO-LIBCALL-NEXT: fmov d9, d1 +; NO-LIBCALL-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d11, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d12, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl sin +; NO-LIBCALL-NEXT: fmov d13, d0 +; NO-LIBCALL-NEXT: fmov d0, d10 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d10, d0 +; NO-LIBCALL-NEXT: fmov d0, d9 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d9, d0 +; NO-LIBCALL-NEXT: fmov d0, d8 +; NO-LIBCALL-NEXT: bl cos +; NO-LIBCALL-NEXT: fmov d5, d0 +; NO-LIBCALL-NEXT: fmov d0, d11 +; NO-LIBCALL-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d3, d10 +; NO-LIBCALL-NEXT: fmov d4, d9 +; NO-LIBCALL-NEXT: fmov d1, d12 +; NO-LIBCALL-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: fmov d2, d13 +; NO-LIBCALL-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ldp d13, d12, [sp], #64 // 16-byte Folded Reload +; NO-LIBCALL-NEXT: ret + %result = call { <3 x double>, <3 x double> } @llvm.sincos.v3f64(<3 x double> %a) + ret { <3 x double>, <3 x double> } %result +} diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll new file mode 100644 index 0000000000000..d074d9ae24641 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll @@ -0,0 +1,13 @@ +; RUN: not llc -mtriple=aarch64-gnu-linux -filetype=null %s 2>&1 | FileCheck %s + +; CHECK: error: no libcall available for fsincospi +define { float, float } @test_sincospi_f32(float %a) { + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +; CHECK: error: no libcall available for fsincospi +define { double, double } @test_sincospi_f64(double %a) { + %result = call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll index d1d7d92adc05a..b386df077c09d 100644 --- a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll +++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll @@ -1,268 +1,250 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=arm64-apple-macosx10.9 < %s | FileCheck %s -define { half, half } @test_sincospi_f16(half %a) { +define { half, half } @test_sincospi_f16(half %a) #0 { ; 
CHECK-LABEL: test_sincospi_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: fcvt h1, s1 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) ret { half, half } %result } -define half @test_sincospi_f16_only_use_sin(half %a) { +define half @test_sincospi_f16_only_use_sin(half %a) #0 { ; CHECK-LABEL: test_sincospi_f16_only_use_sin: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } -define half @test_sincospi_f16_only_use_cos(half %a) { +define half @test_sincospi_f16_only_use_cos(half %a) #0 { ; CHECK-LABEL: test_sincospi_f16_only_use_cos: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldr s0, [sp, #8] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { half, half } @llvm.sincospi.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } -define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) { +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f16: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #36 -; CHECK-NEXT: add x1, sp, #32 -; CHECK-NEXT: fcvt s0, h1 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h1, v0[1] +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: fcvt s0, h1 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #20 +; CHECK-NEXT: add x1, sp, #16 ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: mov h0, v0[2] +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload ; CHECK-NEXT: add x0, sp, #44 ; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add x0, sp, #60 -; CHECK-NEXT: add x1, sp, #56 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: mov h0, v0[3] ; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s2, s0, [sp, #32] -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ldp s3, s1, [sp, #24] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s2, s0, [sp, #24] +; CHECK-NEXT: ldp s3, s1, [sp, #16] +; CHECK-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload ; CHECK-NEXT: fcvt h4, s0 ; CHECK-NEXT: fcvt h2, s2 ; CHECK-NEXT: fcvt h0, s1 ; CHECK-NEXT: fcvt h1, s3 -; CHECK-NEXT: ldp s5, s3, [sp, #40] +; CHECK-NEXT: ldp s5, s3, [sp, #32] ; CHECK-NEXT: fcvt h3, s3 -; CHECK-NEXT: mov v0.h[1], v4.h[0] +; CHECK-NEXT: mov.h v0[1], v4[0] ; CHECK-NEXT: fcvt h4, s5 -; CHECK-NEXT: mov v1.h[1], v2.h[0] -; CHECK-NEXT: ldp s5, s2, [sp, #56] -; CHECK-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NEXT: mov.h v1[1], v2[0] +; CHECK-NEXT: ldp s5, s2, [sp, #40] +; CHECK-NEXT: mov.h v0[2], v3[0] ; CHECK-NEXT: fcvt h2, s2 ; CHECK-NEXT: fcvt h3, s5 -; CHECK-NEXT: mov v1.h[2], v4.h[0] -; CHECK-NEXT: mov v0.h[3], v2.h[0] -; CHECK-NEXT: mov v1.h[3], 
v3.h[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: mov.h v1[2], v4[0] +; CHECK-NEXT: mov.h v0[3], v2[0] +; CHECK-NEXT: mov.h v1[3], v3[0] +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } -define { float, float } @test_sincospi_f32(float %a) { +define { float, float } @test_sincospi_f32(float %a) #0 { ; CHECK-LABEL: test_sincospi_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: add x0, sp, #12 ; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospif +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospif ; CHECK-NEXT: ldp s1, s0, [sp, #8] -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { float, float } @llvm.sincospi.f32(float %a) ret { float, float } %result } -define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) { +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { ; CHECK-LABEL: test_sincospi_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: add x0, sp, #20 -; CHECK-NEXT: add x1, sp, #16 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #28 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #36 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: add x19, sp, #36 +; CHECK-NEXT: add x20, sp, #32 +; CHECK-NEXT: mov s0, v0[1] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload ; CHECK-NEXT: add x0, sp, #44 ; CHECK-NEXT: add x1, sp, #40 ; CHECK-NEXT: add x21, sp, #44 ; CHECK-NEXT: add x22, sp, #40 -; CHECK-NEXT: mov s0, v0.s[2] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s1, s0, [sp, #16] -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v1.s }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s 
}[2], [x21] -; CHECK-NEXT: ld1 { v1.s }[2], [x22] -; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: mov s0, v0[2] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s1, s0, [sp, #24] +; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[1], [x19] +; CHECK-NEXT: ld1.s { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[2], [x21] +; CHECK-NEXT: ld1.s { v1 }[2], [x22] +; CHECK-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) ret { <3 x float>, <3 x float> } %result } -define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) { +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f32: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: add x0, sp, #44 -; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: add x0, sp, #28 ; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #28 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: bl sincospif -; CHECK-NEXT: ldp s1, s0, [sp, #40] -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v1.s }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #20 +; CHECK-NEXT: add x1, sp, #16 +; CHECK-NEXT: add x19, sp, #20 +; CHECK-NEXT: add x20, sp, #16 +; CHECK-NEXT: mov s0, v0[1] +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldp s1, s0, [sp, #24] +; CHECK-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.s { v0 }[1], [x19] +; CHECK-NEXT: ld1.s { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #32] ; 16-byte Folded Reload +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } -define { double, double } @test_sincospi_f64(double %a) { +define { double, double } @test_sincospi_f64(double %a) #0 { ; CHECK-LABEL: test_sincospi_f64: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: add x0, sp, 
#24 -; CHECK-NEXT: add x1, sp, #8 -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr d0, [sp, #24] -; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add x0, sp, #8 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldp d1, d0, [sp] +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %result = call { double, double } @llvm.sincospi.f64(double %a) ret { double, double } %result } -define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) { +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { ; CHECK-LABEL: test_sincospi_v2f64: -; CHECK: // %bb.0: +; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: add x0, sp, #56 -; CHECK-NEXT: add x1, sp, #40 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add x0, sp, #32 -; CHECK-NEXT: add x1, sp, #24 -; CHECK-NEXT: add x19, sp, #32 -; CHECK-NEXT: add x20, sp, #24 -; CHECK-NEXT: mov d0, v0.d[1] -; CHECK-NEXT: bl sincospi -; CHECK-NEXT: ldr d0, [sp, #56] -; CHECK-NEXT: ldr d1, [sp, #40] -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v0.d }[1], [x19] -; CHECK-NEXT: ld1 { v1.d }[1], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #40 +; CHECK-NEXT: add x1, sp, #32 +; CHECK-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldr q0, [sp] ; 16-byte Folded Reload +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #16 +; CHECK-NEXT: add x19, sp, #24 +; CHECK-NEXT: add x20, sp, #16 +; CHECK-NEXT: mov d0, v0[1] +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: ldp d1, d0, [sp, #32] +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: ld1.d { v0 }[1], [x19] +; CHECK-NEXT: ld1.d { v1 }[1], [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir new file mode 100644 index 0000000000000..c96a0385c3a4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure machine combiner doesn't drop subregister indexes. 
+ +--- +name: reassociate_adds2_reassoc +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1, $q2, $q3 + + ; CHECK-LABEL: name: reassociate_adds2_reassoc + ; CHECK: liveins: $q0, $q1, $q2, $q3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr128 = COPY $q3 + ; CHECK-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY]].ssub, [[COPY1]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY2]].ssub, [[COPY3]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr2:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr killed [[FADDSrr1]], killed [[FADDSrr]], implicit $fpcr + ; CHECK-NEXT: $s0 = COPY [[FADDSrr2]] + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %0:fpr128 = COPY $q0 + %1:fpr128 = COPY $q1 + %2:fpr128 = COPY $q2 + %3:fpr128 = COPY $q3 + %4:fpr32 = nsz reassoc nofpexcept FADDSrr %0.ssub, %1.ssub, implicit $fpcr + %5:fpr32 = nsz reassoc nofpexcept FADDSrr %2.ssub, killed %4, implicit $fpcr + %6:fpr32 = nsz reassoc nofpexcept FADDSrr killed %5, %3.ssub, implicit $fpcr + $s0 = COPY %6 + RET_ReallyLR implicit $s0 + +... diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll index 3230c9e946da7..b3a7ec961b736 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w20, w20, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: b .LBB0_4 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs 
w21, w21, #1 ; CHECK-NEXT: b.ne .LBB2_2 -; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir b/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir new file mode 100644 index 0000000000000..c397953b68f5e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-adrp-got-split.mir @@ -0,0 +1,133 @@ +# RUN: llc -mtriple=aarch64--- -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s +--- | + + @x = common global i32 0, align 4 + + define i32 @adrp_add() #0 { + ret i32 0 + } + + define i32 @adrp_ldr() #0 { + ret i32 0 + } + + attributes #0 = { noinline noredzone } +... +--- +# Check that main function body doesn't split ADRP pair +# +# CHECK-LABEL: name: adrp_add +# CHECK-DAG: bb.0: +# CHECK: BL @OUTLINED_FUNCTION_[[F0:[0-9]+]] +# CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F2:[0-9]+]] +# CHECK-NEXT: $lr = ORRXri $xzr, 1 +name: adrp_add +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.2: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 + $lr = ORRXri $xzr, 1 + bb.3: + liveins: $lr + RET undef $lr +... 
+--- +# Check that main function body doesn't split ADRP pair +# +# CHECK-LABEL: name: adrp_ldr +# CHECK-DAG: bb.0: +# CHECK: BL @OUTLINED_FUNCTION_[[F0]] +# CHECK-NEXT: BL @OUTLINED_FUNCTION_[[F1:[0-9]+]] +# CHECK-NEXT: $lr = ORRXri $xzr, 1 +name: adrp_ldr +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.2: + liveins: $lr + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $w12 = ORRWri $wzr, 1 + $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x + $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + $lr = ORRXri $xzr, 1 + bb.3: + liveins: $lr + RET undef $lr + +# Check that no outlined function splits the ADRP pair apart +# +# CHECK: OUTLINED_FUNCTION_[[F0]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: RET $lr + +# CHECK: OUTLINED_FUNCTION_[[F1]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x +# CHECK-NEXT: $x12 = LDRXui $x9, target-flags(aarch64-pageoff, aarch64-got) @x + +# CHECK: name: OUTLINED_FUNCTION_[[F2]] +# CHECK-DAG: bb.0 +# CHECK: $w12 = ORRWri $wzr, 1 +# CHECK-NEXT: $x9 = ADRP target-flags(aarch64-page, aarch64-got) @x +# CHECK-NEXT: $x12 = ADDXri $x9, target-flags(aarch64-pageoff, aarch64-got) @x, 0 diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir index b7fbdc09c1dd1..a635231fef7fb 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir @@ -6,9 +6,9 @@ # #; define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 -# %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 -# store i4 %9, i4* %5 store i4 %9, i4* %5 store i4 %9, i4* %5 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 +# %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 +# store i4 %9, ptr %5 store i4 %9, ptr %5 store i4 %9, ptr %5 # ... ... ... # } } } # @@ -16,7 +16,7 @@ # # define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 # call void @outlined_function_1_1 call void @outlined_function_1_1 call void @outlined_function_1_1 # ... ... ...
# } } } diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll index 700a060ef968f..0a10e80d998cd 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll @@ -15,10 +15,10 @@ ; RUN: llc %s -o - -O0 -mtriple=aarch64-unknown -mcpu=ampere1b | FileCheck %s -define void @test_cmp_bcc_fusion(i32 %x, i32 %y, i32* %arr) { +define void @test_cmp_bcc_fusion(i32 %x, i32 %y, ptr %arr) { entry: %cmp = icmp eq i32 %x, %y - store i32 %x, i32* %arr, align 4 + store i32 %x, ptr %arr, align 4 br i1 %cmp, label %if_true, label %if_false if_true: diff --git a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll index 93a50ec305e1e..64cb3603f53a1 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd-extract.ll @@ -734,7 +734,7 @@ define <1 x i64> @mullu_v2i32_0(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: fmov x9, d1 -; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: umull x8, w8, w9 ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll index 5be9394f61b30..4f657865e9f05 100644 --- a/llvm/test/CodeGen/AArch64/neon-mov.ll +++ b/llvm/test/CodeGen/AArch64/neon-mov.ll @@ -76,6 +76,15 @@ define <2 x i32> @movi2s_lsl16() { ret <2 x i32> <i32 16711680, i32 16711680> } +define <2 x i32> @movi2s_fneg() { +; CHECK-LABEL: movi2s_fneg: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2s, #240, lsl #8 +; CHECK-NEXT: fneg v0.2s, v0.2s +; CHECK-NEXT: ret + ret <2 x i32> <i32 2147545088, i32 2147545088> +} + define <2 x i32> @movi2s_lsl24() { ; CHECK-LABEL: movi2s_lsl24: ; CHECK: // %bb.0: @@ -149,6 +158,33 @@ define <4 x i16> @movi4h_lsl8() { ret <4 x i16> <i16 65280, i16 65280, i16 65280, i16 65280> } +define <4 x i16> @movi4h_fneg() { +; CHECK-NOFP16-SD-LABEL: movi4h_fneg: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: movi v0.4h, #127, lsl #8 +; CHECK-NOFP16-SD-NEXT: fneg v0.2s, v0.2s +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: movi4h_fneg: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: movi v0.4h, #127, lsl #8 +; CHECK-FP16-SD-NEXT: fneg v0.2s, v0.2s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: movi4h_fneg: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI18_0] +; CHECK-NOFP16-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: movi4h_fneg: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI18_0] +; CHECK-FP16-GI-NEXT: 
ret + ret <4 x i16> <i16 32512, i16 65280, i16 32512, i16 65280> +} + define <8 x i16> @movi8h_lsl0() { ; CHECK-LABEL: movi8h_lsl0: ; CHECK: // %bb.0: @@ -180,14 +216,14 @@ define <8 x i16> @movi8h_fneg() { ; ; CHECK-NOFP16-GI-LABEL: movi8h_fneg: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI19_0 -; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI21_0 +; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI21_0] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: movi8h_fneg: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI19_0 -; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI21_0 +; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI21_0] ; CHECK-FP16-GI-NEXT: ret ret <8 x i16> <i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280> } @@ -275,6 +311,27 @@ define <4 x i16> @mvni4h_lsl8() { ret <4 x i16> <i16 61439, i16 61439, i16 61439, i16 61439> } +define <4 x i16> @mvni4h_neg() { +; CHECK-NOFP16-SD-LABEL: mvni4h_neg: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: mov w8, #33008 // =0x80f0 +; CHECK-NOFP16-SD-NEXT: dup v0.4h, w8 +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-LABEL: mvni4h_neg: +; CHECK-FP16: // %bb.0: +; CHECK-FP16-NEXT: movi v0.4h, #240 +; CHECK-FP16-NEXT: fneg v0.4h, v0.4h +; CHECK-FP16-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: mvni4h_neg: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI32_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI32_0] +; CHECK-NOFP16-GI-NEXT: ret + ret <4 x i16> <i16 33008, i16 33008, i16 33008, i16 33008> +} + define <8 x i16> @mvni8h_lsl0() { ; CHECK-LABEL: mvni8h_lsl0: ; CHECK: // %bb.0: @@ -306,8 +363,8 @@ define <8 x i16> @mvni8h_neg() { ; ; CHECK-NOFP16-GI-LABEL: mvni8h_neg: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI32_0 -; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI32_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI35_0 +; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI35_0] ; CHECK-NOFP16-GI-NEXT: ret ret <8 x i16> <i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008, i16 33008> } @@ -486,6 +543,33 @@ define <2 x double> @fmov2d_neg0() { ret <2 x double> <double -0.0, double -0.0> } +define <1 x double> @fmov1d_neg0() { +; CHECK-NOFP16-SD-LABEL: fmov1d_neg0: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: movi d0, #0000000000000000 +; CHECK-NOFP16-SD-NEXT: fneg d0, d0 +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: fmov1d_neg0: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: movi d0, #0000000000000000 +; CHECK-FP16-SD-NEXT: fneg d0, d0 +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: fmov1d_neg0: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NOFP16-GI-NEXT: fmov d0, x8 +; CHECK-NOFP16-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: fmov1d_neg0: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-FP16-GI-NEXT: fmov d0, x8 +; CHECK-FP16-GI-NEXT: ret + ret <1 x double> <double -0.0> +} + define <2 x i32> @movi1d_1() { ; CHECK-NOFP16-SD-LABEL: movi1d_1: ; CHECK-NOFP16-SD: // %bb.0: @@ -499,14 +583,14 @@ define <2 x i32> @movi1d_1() { ; ; CHECK-NOFP16-GI-LABEL: movi1d_1: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI52_0 -; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI52_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, 
:lo12:.LCPI56_0] ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: movi1d_1: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI52_0 -; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI52_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI56_0] ; CHECK-FP16-GI-NEXT: ret ret <2 x i32> <i32 -65536, i32 65535> } @@ -517,31 +601,31 @@ define <2 x i32> @movi1d() { ; CHECK-NOFP16-SD-LABEL: movi1d: ; CHECK-NOFP16-SD: // %bb.0: ; CHECK-NOFP16-SD-NEXT: movi d1, #0x00ffffffff0000 -; CHECK-NOFP16-SD-NEXT: adrp x8, .LCPI53_0 -; CHECK-NOFP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI53_0] +; CHECK-NOFP16-SD-NEXT: adrp x8, .LCPI57_0 +; CHECK-NOFP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI57_0] ; CHECK-NOFP16-SD-NEXT: b test_movi1d ; ; CHECK-FP16-SD-LABEL: movi1d: ; CHECK-FP16-SD: // %bb.0: ; CHECK-FP16-SD-NEXT: movi d1, #0x00ffffffff0000 -; CHECK-FP16-SD-NEXT: adrp x8, .LCPI53_0 -; CHECK-FP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI53_0] +; CHECK-FP16-SD-NEXT: adrp x8, .LCPI57_0 +; CHECK-FP16-SD-NEXT: ldr d0, [x8, :lo12:.LCPI57_0] ; CHECK-FP16-SD-NEXT: b test_movi1d ; ; CHECK-NOFP16-GI-LABEL: movi1d: ; CHECK-NOFP16-GI: // %bb.0: -; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI53_1 -; CHECK-NOFP16-GI-NEXT: adrp x9, .LCPI53_0 -; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI53_1] -; CHECK-NOFP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI53_0] +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI57_1 +; CHECK-NOFP16-GI-NEXT: adrp x9, .LCPI57_0 +; CHECK-NOFP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI57_1] +; CHECK-NOFP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI57_0] ; CHECK-NOFP16-GI-NEXT: b test_movi1d ; ; CHECK-FP16-GI-LABEL: movi1d: ; CHECK-FP16-GI: // %bb.0: -; CHECK-FP16-GI-NEXT: adrp x8, .LCPI53_1 -; CHECK-FP16-GI-NEXT: adrp x9, .LCPI53_0 -; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI53_1] -; CHECK-FP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI53_0] +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI57_1 +; CHECK-FP16-GI-NEXT: adrp x9, .LCPI57_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI57_1] +; CHECK-FP16-GI-NEXT: ldr d1, [x9, :lo12:.LCPI57_0] ; CHECK-FP16-GI-NEXT: b test_movi1d %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>) ret <2 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll index b7dde881291bb..1a85f803b9e57 100644 --- a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll @@ -19,7 +19,7 @@ define void @test_nopair_st(ptr %ptr, <2 x double> %v1, <2 x double> %v2) { ; SLOW-NOT: ldp ; FAST: ldp define <2 x i64> @test_nopair_ld(ptr %p) { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll new file mode 100644 index 0000000000000..e784ead2c9e5a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll @@ -0,0 +1,315 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: 
neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i32_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i32_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i32 + ret i32 %t3 +} + +define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i32_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i32 + ret i32 %t3 +} + +define i64 
@vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i8> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vmask_popcount_i64_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <16 x i8> %a, %b + %t1 = bitcast <16 x i1> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i16> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vmask_popcount_i64_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h +; CHECK-NEXT: saddlv s0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <8 x i16> %a, %b + %t1 = bitcast <8 x i1> %mask to i8 + %t2 = call i8 @llvm.ctpop(i8 %t1) + %t3 = zext i8 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i32> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vmask_popcount_i64_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <4 x i32> %a, %b + %t1 = bitcast <4 x i1> %mask to i4 + %t2 = call i4 @llvm.ctpop(i4 %t1) + %t3 = zext i4 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret + %mask = icmp slt <1 x i64> %a, %b + %t1 = bitcast <1 x i1> %mask to i1 + %t2 = call i1 @llvm.ctpop(i1 %t1) + %t3 = zext i1 %t2 to i64 + ret i64 %t3 +} + +define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vmask_popcount_i64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: addp v0.2s, 
v0.2s, v0.2s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %mask = icmp slt <2 x i64> %a, %b + %t1 = bitcast <2 x i1> %mask to i2 + %t2 = call i2 @llvm.ctpop(i2 %t1) + %t3 = zext i2 %t2 to i64 + ret i64 %t3 +} + +define i32 @non_vmask_popcount_1(half %a) { +; CHECK-LABEL: non_vmask_popcount_1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %t1 = bitcast half %a to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} + +define i32 @non_vmask_popcount_2(<8 x i16> %a) { +; CHECK-LABEL: non_vmask_popcount_2: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: and w8, w8, #0x3 +; CHECK-NEXT: bfi w8, w9, #2, #2 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: bfi w8, w10, #4, #2 +; CHECK-NEXT: umov w10, v0.b[4] +; CHECK-NEXT: bfi w8, w9, #6, #2 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: bfi w8, w10, #8, #2 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: bfi w8, w9, #10, #2 +; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: bfi w8, w10, #12, #2 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %mask = trunc <8 x i16> %a to <8 x i2> + %t1 = bitcast <8 x i2> %mask to i16 + %t2 = call i16 @llvm.ctpop(i16 %t1) + %t3 = zext i16 %t2 to i32 + ret i32 %t3 +} diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll new file mode 100644 index 0000000000000..dc23f51987635 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr166870.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O3 < %s -mtriple=aarch64 | FileCheck %s + +; The seemingly redundant mov where src_reg == dst_reg shouldn't be removed, +; because it has the effect of zeroing the upper bits in x8. + +define i32 @ham(i32 %arg, i1 %arg1, i1 %arg2, ptr %arg3) nounwind { +; CHECK-LABEL: ham: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w1, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %bb4 +; CHECK-NEXT: tbnz w2, #0, .LBB0_3 +; CHECK-NEXT: // %bb.2: // %bb5 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov w21, w1 +; CHECK-NEXT: mov w20, w0 +; CHECK-NEXT: bl zot +; CHECK-NEXT: tbz w21, #0, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %bb6 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov w8, w20 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w8, w8 +; CHECK-NEXT: mov w21, w8 +; CHECK-NEXT: .LBB0_5: // %bb7 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: strb w20, [x19] +; CHECK-NEXT: cbnz x21, .LBB0_5 +; CHECK-NEXT: // %bb.6: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: bl quux +; CHECK-NEXT: b .LBB0_5 +bb: + br i1 %arg1, label %bb6, label %bb4 + +bb4: + %load = load ptr, ptr null, align 8 + br i1 %arg2, label %bb6, label %bb5 + +bb5: + %call = call i32 @zot() #0 + %zext = zext i32 %arg to i64 + br i1 %arg1, label %bb6, label %bb7 + +bb6: + ret i32 0 + +bb7: + store i8 0, ptr %arg3, align 1 + %icmp = icmp eq i64 %zext, 0 + br i1 %icmp, label %bb8, label %bb7 + +bb8: + call void @quux() + br label %bb7 +} + +declare i32 @zot() + +declare void @quux() + +attributes #0 = { returns_twice } diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll index 7f0968c8eb339..f77ada4eae022 100644 --- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll +++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s +; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s declare void @standard_cc_func() declare preserve_mostcc void @preserve_mostcc_func() @@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func() define preserve_mostcc void @preserve_mostcc1() nounwind { entry: ;CHECK-LABEL: preserve_mostcc1 -;CHECK-NOT: stp -;CHECK-NOT: str -;CHECK: str x15 -;CHECK-NEXT: stp x14, x13, -;CHECK-NEXT: stp x12, x11, -;CHECK-NEXT: stp x10, x9, -;CHECK: bl _standard_cc_func +;CHECK-DARWIN-NOT: stp +;CHECK-DARWIN-NOT: str +;CHECK-DARWIN: str x15 +;CHECK-DARWIN-NEXT: stp x14, x13, +;CHECK-DARWIN-NEXT: stp x12, x11, +;CHECK-DARWIN-NEXT: stp x10, x9, +;CHECK-WIN: stp x15, x14 +;CHECK-WIN-NEXT: stp x13, x12, +;CHECK-WIN-NEXT: stp x11, x10, +;CHECK-WIN-NEXT: stp x9, x30 +;CHECK: bl {{_?}}standard_cc_func call void @standard_cc_func() -;CHECK: ldp x10, x9, -;CHECK-NEXT: ldp x12, x11, -;CHECK-NEXT: ldp x14, x13, -;CHECK-NEXT: ldr x15 +;CHECK-DARWIN: ldp x10, x9, +;CHECK-DARWIN-NEXT: ldp x12, x11, +;CHECK-DARWIN-NEXT: ldp x14, x13, +;CHECK-DARWIN-NEXT: ldr x15 +;CHECK-WIN: ldp x9, x30 +;CHECK-WIN-NEXT: ldp x11, x10, +;CHECK-WIN-NEXT: ldp x13, x12, +;CHECK-WIN-NEXT: ldp x15, x14, ret void } @@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind { entry: ;CHECK-LABEL: preserve_mostcc2 ;CHECK-NOT: x14 -;CHECK: stp x29, x30, +;CHECK-DARWIN: stp x29, x30, +;CHECK-WIN: str x30 ;CHECK-NOT: x14 -;CHECK: bl _preserve_mostcc_func +;CHECK: bl {{_?}}preserve_mostcc_func call preserve_mostcc void @preserve_mostcc_func() ret void } diff --git 
a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll index 0356a46ec1050..df5e1a9f1ee10 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braaz x16 -define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 0) ] ret i32 %tmp0 } @@ -26,7 +26,7 @@ define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brabz x16 -define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 0) ] ret i32 %tmp0 } @@ -36,7 +36,7 @@ define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: braa x16, x17 -define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 42) ] ret i32 %tmp0 } @@ -46,7 +46,7 @@ define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: brab x16, x17 -define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 42) ] ret i32 %tmp0 } @@ -60,8 +60,8 @@ define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ia_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %tmp0) ] ret i32 %tmp1 } @@ -75,8 +75,8 @@ define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ib_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %tmp0) ] ret i32 %tmp1 } @@ -85,7 +85,7 @@ define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ia_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp0 } @@ -94,7 +94,7 @@ define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ib_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp0 } @@ -103,8 +103,8 @@ define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ia_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp1 } @@ -113,8 +113,8 @@ define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) 
#0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ib_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp1 } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll index 9cf77b125e107..950db5fd6381f 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll @@ -4,18 +4,18 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "arm64e-apple-iphoneos" -declare i8* @foo0(i32) -declare i8* @foo1() +declare ptr @foo0(i32) +declare ptr @foo1() -declare void @llvm.objc.release(i8*) -declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) -declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*) +declare void @llvm.objc.release(ptr) +declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr) +declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr) -declare void @foo2(i8*) +declare void @foo2(ptr) declare void @foo(i64, i64, i64) -define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -23,14 +23,14 @@ define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa_unsafeClaim(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_unsafeClaim ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -38,14 +38,14 @@ define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_unsafeClaimAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraa_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #45431 @@ -53,14 +53,14 @@ define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* 
()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blraa_multiarg(ptr %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]] @@ -71,28 +71,28 @@ define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 % ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blrab(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blrab ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrab [[ADDR]], x1 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrab_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrab_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #256 @@ -100,42 +100,42 @@ define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraaz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraaz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraaz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraaz [[ADDR]] ; CHECK-NEXT: mov 
x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrabz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrabz [[ADDR]] ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blrabz_multiarg(ptr %arg0, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]], [ @@ -146,9 +146,9 @@ define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll index 932cc946db0ea..02c643f101913 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll @@ -87,7 +87,7 @@ ; CHECK-MACHO-NEXT: _g.offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) +@g.offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) ; CHECK-ELF-LABEL: .globl g.big_offset.ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -99,7 +99,7 @@ ; CHECK-MACHO-NEXT: _g.big_offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+2147549185)@AUTH(da,0) -@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) +@g.big_offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) ; CHECK-ELF-LABEL: .globl g.weird_ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -111,7 +111,7 @@ ; CHECK-MACHO-NEXT: _g.weird_ref.da.0: ; 
CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) +@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) ; CHECK-ELF-LABEL: .globl g_weak.ref.ia.42 ; CHECK-ELF-NEXT: .p2align 3 diff --git a/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir new file mode 100644 index 0000000000000..05f583e2e692f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/regalloc-hint-movprfx.mir @@ -0,0 +1,67 @@ +# RUN: llc -mtriple=aarch64 -mattr=+sve -start-before=greedy -stop-after=virtregrewriter -debug-only=regalloc %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DBG +# REQUIRES: asserts + +# Check that the register allocator gets hints to reuse registers of one of its operands. +--- +name: prioritize_movprfx_hints +tracksRegLiveness: true +isSSA: false +noVRegs: false +body: | + bb.0.entry: + liveins: $z0, $z1, $z2, $z3, $p0 + + ; DBG: Machine code for function prioritize_movprfx_hints + ; + ; DBG: selectOrSplit ZPR:%4 + ; DBG-NEXT: hints: $z0 $z1{{$}} + ; + ; DBG: selectOrSplit ZPR:%5 + ; DBG-NEXT: hints: $z2 $z3{{$}} + ; + ; DBG: [%0 -> $z3] ZPR + ; DBG: [%1 -> $z2] ZPR + ; DBG: [%2 -> $z1] ZPR + ; DBG: [%3 -> $z0] ZPR + ; DBG: [%4 -> $z0] ZPR + ; DBG: [%5 -> $z2] ZPR + ; DBG: [%6 -> $z0] ZPR + %0:zpr = COPY $z3 + %1:zpr = COPY $z2 + %2:zpr = COPY $z1 + %3:zpr = COPY $z0 + %4:zpr = SDIV_ZPZZ_D_UNDEF $p0, %3:zpr, %2:zpr + %5:zpr = MUL_ZPZZ_D_UNDEF $p0, %1:zpr, %0:zpr + %6:zpr = MUL_ZPZZ_D_UNDEF $p0, %5:zpr, %4:zpr + $z0 = COPY %6:zpr + RET_ReallyLR implicit $z0 +... + +# Check that the register allocator prioritises hints that are set by the register +# allocator itself (i.e. to use z4 for the result register). +--- +name: prioritize_regalloc_hints +isSSA: false +noVRegs: false +body: | + bb.0.entry: + %0:zpr = FDUP_ZI_S 0, implicit $vg + %1:zpr = FDUP_ZI_S 16, implicit $vg + %2:zpr = FDUP_ZI_S 32, implicit $vg + %3:ppr_3b = PTRUE_S 31, implicit $vg + + ; DBG: Machine code for function prioritize_regalloc_hints + ; + ; DBG: selectOrSplit ZPR:%4 + ; DBG-NEXT: hints: $z4{{$}} + ; + ; DBG: [%0 -> $z0] ZPR + ; DBG: [%1 -> $z1] ZPR + ; DBG: [%2 -> $z2] ZPR + ; DBG: [%3 -> $p0] PPR_3b + ; DBG: [%4 -> $z4] ZPR + + %4:zpr = FMLA_ZPZZZ_S_UNDEF %3, %0, %1, %2 + $z4 = COPY %4 + RET_ReallyLR implicit $z4 +... diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;.
diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll new file mode 100644 index 0000000000000..fff63c1709218 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sbc.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s +; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s + +target triple = "aarch64-none-linux-gnu" + +define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_basic_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_basic_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_mixed_i32_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i32_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_mixed_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_only_borrow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_only_borrow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w2, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + ret i32 %res +} + +define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_sext_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sext_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; 
CHECK-GI-NEXT: add w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = sext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = add i32 %sub, %carry + ret i32 %res +} + +; FIXME: This case could be supported with reversed operands to the CMP. +define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_ugt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, hi +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_ugt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ugt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, lt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp slt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_sgt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, gt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_sgt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp sgt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_setcc_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_setcc_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i1 %cc) + ret i32 %res +} + +define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_carry_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_carry_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i32 %carry) + ret i32 %res +} + +define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_multiple_sub_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: sbc w19, w2, w3 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_sub_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sub w19, w2, w3 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + tail call void @use(i32 %sub) + ret i32 %res +} + +define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) { +; CHECK-SD-LABEL: test_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: cmp w8, w1, uxtb +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxtb +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i8 %a, %b + %carry = zext i1 %cc to i8 + %sub = sub i8 %x, %y + %res = sub i8 %sub, %carry + ret i8 %res +} + +define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) { +; CHECK-SD-LABEL: test_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w1, uxth +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxth +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i16 %a, %b + %carry = zext i1 %cc to i16 + %sub = sub i16 %x, %y + %res = sub i16 %sub, %carry + ret i16 %res +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { +; CHECK-SD-LABEL: test_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v4.4s, #1 +; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %cc = icmp ult <4 x i32> %a, %b + %carry = zext <4 x i1> %cc to <4 x i32> + %sub = sub <4 x i32> %x, %y + %res = sub <4 x i32> %sub, %carry + ret <4 x i32> %res +} + +declare void @use() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll new file mode 100644 index 0000000000000..54f8e3f4c5a64 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll @@ -0,0 +1,50 @@ +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype obj -o - %s | llvm-readobj -u - | FileCheck %s -check-prefix CHECK-UNWIND + +declare dso_local void @g(ptr noundef) +define dso_local preserve_mostcc void @f(ptr noundef %p) #0 { +entry: + %p.addr = alloca ptr, align 8 + store ptr %p, ptr %p.addr, align 8 + %0 = load ptr, ptr %p.addr, align 8 + call void @g(ptr noundef %0) + ret void +} + +attributes #0 = { nounwind uwtable(sync) } + +; CHECK: str x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK: str x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: stp x10, x11, [sp, #32] +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: stp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: stp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: .seh_endprologue + +; CHECK: .seh_startepilogue +; CHECK: ldp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: ldp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: ldp x10, x11, [sp, #32] +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: ldr x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: ldr x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 + +; CHECK: .seh_endepilogue + +; CHECK-UNWIND: Prologue [ +; CHECK-UNWIND: 0xe74e04 ; stp x14, x15, [sp, #64] +; CHECK-UNWIND: 0xe74c03 ; stp x12, x13, [sp, #48] +; CHECK-UNWIND: 0xe74a02 ; stp x10, x11, [sp, #32] +; CHECK-UNWIND: 0xe70903 ; str x9, [sp, #24] +; CHECK-UNWIND: 0xd2c2 ; str x30, [sp, #16] +; CHECK-UNWIND: 0x05 ; sub sp, #80 +; CHECK-UNWIND: 0xe4 ; end +; CHECK-UNWIND: ] diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index b947c943ba448..72f6646930624 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 @@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +;
CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 @@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 @@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: 
smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 @@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -674,12 +662,11 @@ 
define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 @@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 @@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll 
index f2163ad15bafc..df88f37195ed6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index 690a39d12e6f1..9d8b077e9268e 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -19,20 +19,16 @@ define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vec ; CHECK-LABEL: zpr_and_ppr_local: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: add x8, sp, #2048 ; CHECK-NEXT: str p0, [x8, #15, mul vl] ; CHECK-NEXT: add x8, sp, #1024 ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -62,20 +58,16 @@ define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> % ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 ; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -103,17 +95,15 @@ define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch6 ; CHECK-LABEL: fpr_and_ppr_local: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1040 ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: add x8, sp, #2064 ; CHECK-NEXT: str p0, [x8, #7, mul vl] ; CHECK-NEXT: str d0, [sp, #1032] -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1040 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -144,17 +134,15 @@ define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aar ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: sub sp, sp, #1040 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str d0, [sp, #1032] -; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: add sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1040 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> @@ -749,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -793,11 +768,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x ; CHECK-LABEL: zpr_and_ppr_local_stack_probing: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str xzr, [sp] -; CHECK-NEXT: sub sp, sp, #1824 -; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #2848 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 @@ -806,10 +778,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x ; CHECK-NEXT: add x8, sp, #1824 ; CHECK-NEXT: str z0, [x8] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1024 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: add sp, sp, #1824 +; CHECK-NEXT: add sp, sp, #2848 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible" @@ -822,3 +792,313 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. +define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + ret void +} + +; Only PPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with +; hazard padding after the PPR callee saves (1024) and after the FPR local area +; (1024) -- coalesced to 2048. Only PPRs restored by moving the SP to +; `FP + addvl #-1`. +define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..05450468f87a7 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -975,8 +975,8 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta ; ; CHECK64-LABEL: svecc_csr_d8: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #64 @@ -988,30 +988,50 @@ define i32 @svecc_csr_d8(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_psta ; CHECK64-NEXT: //NO_APP ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2064 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{d8}"() #1 ret i32 0 @@ -1039,8 +1059,8 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps ; ; CHECK64-LABEL: svecc_csr_d8d9: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str z9, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill @@ -1055,33 +1075,56 @@ define i32 @svecc_csr_d8d9(i32 noundef %num, <vscale x 4 x i32> %vs) "aarch64_ps ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: ldr z9, [sp] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #2 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8d9: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-2 -; CHECK1024-NEXT: str z9, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z9, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #2 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8d9: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-NOSPLITSVE-NEXT: 
str z9, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8d9: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{d8},~{d9}"() #1 ret i32 0 @@ -1108,8 +1151,8 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta ; ; CHECK64-LABEL: svecc_csr_d8_allocd: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #80 @@ -1122,31 +1165,52 @@ define i32 @svecc_csr_d8_allocd(double %d, <vscale x 4 x i32> %vs) "aarch64_psta ; CHECK64-NEXT: str d0, [sp, #72] ; CHECK64-NEXT: add sp, sp, #80 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_allocd: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str d0, [sp, #1032] -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_allocd: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str d0, [sp, #1032] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_allocd: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str d0, [sp, #1032] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca double tail call void asm sideeffect "", "~{d8}"() #1 @@ -1176,8 +1240,8 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat ; ; CHECK64-LABEL: svecc_csr_d8_alloci64: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #80 @@ -1191,32 +1255,54 @@ define i32 @svecc_csr_d8_alloci64(i64 %d, <vscale x 4 x i32> %vs) "aarch64_pstat ; CHECK64-NEXT: str x8, [sp, #8] ; CHECK64-NEXT: add sp, sp, #80 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_alloci64: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str x8, [sp, #8] -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_alloci64: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 
+ 8 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_alloci64: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i64 tail call void asm sideeffect "", "~{d8}"() #1 @@ -1247,8 +1333,8 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, <vscale x 4 x i32> %vs) "aarch64_p ; ; CHECK64-LABEL: svecc_csr_d8_allocnxv4i32: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK64-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: str z8, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #64 @@ -1265,35 +1351,60 @@ define i32 @svecc_csr_d8_allocnxv4i32(i64 %d, <vscale x 4 x i32> %vs) "aarch64_p ; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 ; CHECK64-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 +; CHECK64-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_d8_allocnxv4i32: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str z8, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG -; CHECK1024-NEXT: .cfi_offset w29, -16 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 -; CHECK1024-NEXT: mov z0.s, #0 // =0x0 -; CHECK1024-NEXT: add x8, sp, #1024 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str z0, [x8] -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-NOSPLITSVE-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str z0, [x8] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1040 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_csr_d8_allocnxv4i32: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1040 +; CHECK1024-SPLITSVE-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-SPLITSVE-NEXT: add x8, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str z0, [x8] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #1 +; CHECK1024-SPLITSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca <vscale x 4 x i32> tail call void asm sideeffect "", "~{d8}"() #1 @@ -1360,11 +1471,11 @@ define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, <vscale x 4 x i3 ; ; CHECK64-LABEL: svecc_csr_x18_25_d8_15_allocdi64: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x25, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK64-NEXT: stp x29, x25, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #-8 ; CHECK64-NEXT: str z15, [sp] // 16-byte Folded Spill ; CHECK64-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill @@ -1409,80 +1520,139 @@ define i32 @svecc_csr_x18_25_d8_15_allocdi64(i64 %d, double %e, <vscale x 4 x i3 ; CHECK64-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK64-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #64 ; CHECK64-NEXT: addvl sp, sp, #8 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x25, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_csr_x18_25_d8_15_allocdi64: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-8 -; CHECK1024-NEXT: str z15, [sp] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1056 -; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w21, -24 -; CHECK1024-NEXT: .cfi_offset w22, -32 -; CHECK1024-NEXT: .cfi_offset w23, -40 -; CHECK1024-NEXT: .cfi_offset w24, -48 -; CHECK1024-NEXT: .cfi_offset w25, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 
0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: mov w0, wzr -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: str x8, [sp, #8] -; CHECK1024-NEXT: str d0, [sp, #1048] -; CHECK1024-NEXT: add sp, sp, #1056 -; CHECK1024-NEXT: ldr z15, [sp] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #8 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x25, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x24, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x23, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x22, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x21, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-8 +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, 
sp, #1056 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w21, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w22, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w23, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w24, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w25, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: mov w0, wzr +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-NOSPLITSVE-NEXT: str d0, [sp, #1048] +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1056 +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #8 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x21, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x22, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x23, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x24, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x25, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; 
CHECK1024-SPLITSVE-LABEL: svecc_csr_x18_25_d8_15_allocdi64: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x25, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-8 +; CHECK1024-SPLITSVE-NEXT: str z15, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1056 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc0, 0x00, 0x1e, 0x22 // sp + 2144 + 64 * VG +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w21, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w22, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w23, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w24, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w25, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: mov w0, wzr +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: str x8, [sp, #8] +; CHECK1024-SPLITSVE-NEXT: str d0, [sp, #1048] +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1056 +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #3, mul 
vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #8 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i64 %b = alloca double @@ -3512,14 +3682,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3698,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,49 +3711,51 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill 
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 -; CHECK64-NEXT: mov x19, sp -; CHECK64-NEXT: mov w2, w1 -; CHECK64-NEXT: mov w8, w0 -; CHECK64-NEXT: bl __arm_sme_state -; CHECK64-NEXT: mov w8, w8 -; CHECK64-NEXT: mov x9, sp -; CHECK64-NEXT: mov x20, x0 -; CHECK64-NEXT: add x8, x8, #15 -; CHECK64-NEXT: and x8, x8, #0x1fffffff0 -; CHECK64-NEXT: sub x8, x9, x8 -; CHECK64-NEXT: mov sp, x8 -; CHECK64-NEXT: //APP -; CHECK64-NEXT: //NO_APP -; CHECK64-NEXT: tbz w20, #0, .LBB35_2 -; CHECK64-NEXT: // %bb.1: // %entry -; CHECK64-NEXT: smstop sm -; CHECK64-NEXT: .LBB35_2: // %entry -; CHECK64-NEXT: mov x0, x8 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 
0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: mov w2, w1 +; CHECK64-NEXT: mov w8, w0 +; CHECK64-NEXT: bl __arm_sme_state +; CHECK64-NEXT: mov w8, w8 +; CHECK64-NEXT: mov x9, sp +; CHECK64-NEXT: mov x20, x0 +; CHECK64-NEXT: add x8, x8, #15 +; CHECK64-NEXT: and x8, x8, #0x1fffffff0 +; CHECK64-NEXT: sub x8, x9, x8 +; CHECK64-NEXT: mov sp, x8 +; CHECK64-NEXT: //APP +; CHECK64-NEXT: //NO_APP +; CHECK64-NEXT: tbz w20, #0, .LBB35_2 +; CHECK64-NEXT: // %bb.1: // %entry +; CHECK64-NEXT: smstop sm +; CHECK64-NEXT: .LBB35_2: // %entry +; CHECK64-NEXT: mov x0, x8 ; CHECK64-NEXT: mov w1, #45 // =0x2d ; CHECK64-NEXT: bl memset ; CHECK64-NEXT: tbz w20, #0, .LBB35_4 @@ -3595,22 +3766,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; 
CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3803,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 @@ -3649,305 +3820,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: 
addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 
-; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: 
.cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * 
IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; 
CHECK0-NEXT: .cfi_restore z9
-; CHECK0-NEXT: .cfi_restore z10
-; CHECK0-NEXT: .cfi_restore z11
-; CHECK0-NEXT: .cfi_restore z12
-; CHECK0-NEXT: .cfi_restore z13
-; CHECK0-NEXT: .cfi_restore z14
-; CHECK0-NEXT: .cfi_restore z15
-; CHECK0-NEXT: mov sp, x29
-; CHECK0-NEXT: .cfi_def_cfa wsp, 64
-; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
-; CHECK0-NEXT: .cfi_def_cfa_offset 0
-; CHECK0-NEXT: .cfi_restore w19
-; CHECK0-NEXT: .cfi_restore w26
-; CHECK0-NEXT: .cfi_restore w27
-; CHECK0-NEXT: .cfi_restore w28
-; CHECK0-NEXT: .cfi_restore vg
-; CHECK0-NEXT: .cfi_restore w30
-; CHECK0-NEXT: .cfi_restore w29
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: svecc_call_realign:
-; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
-; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8
+; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0
+; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x19, sp
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: mov w8, w0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov w8, w8
+; CHECK1024-SPLITSVE-NEXT: mov x9, sp
+; CHECK1024-SPLITSVE-NEXT: mov x20, x0
+; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-SPLITSVE-NEXT: mov sp, x8
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
+entry:
+  %ptr = alloca i8, i32 %P1
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2)
+  ret i32 -396142473
+}
+
+
+define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_realign:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_def_cfa_offset 64
+; CHECK0-NEXT: cntd x9
+; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT: mov x29, sp
+; CHECK0-NEXT: .cfi_def_cfa w29, 64
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w26, -16
+; CHECK0-NEXT: .cfi_offset w27, -24
+; CHECK0-NEXT: .cfi_offset w28, -32
+; CHECK0-NEXT: .cfi_offset vg, -48
+; CHECK0-NEXT: .cfi_offset w30, -56
+; CHECK0-NEXT: .cfi_offset w29, -64
+; CHECK0-NEXT: addvl sp, sp, #-18
+; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
+; CHECK0-NEXT: sub x9, sp, #1024
+; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK0-NEXT: mov w2, w1
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov x19, x0
+; CHECK0-NEXT: //APP
+; CHECK0-NEXT: //NO_APP
+; CHECK0-NEXT: tbz w19, #0, .LBB36_2
+; CHECK0-NEXT: // %bb.1: // %entry
+; CHECK0-NEXT: smstop sm
+; CHECK0-NEXT: .LBB36_2: // %entry
+; CHECK0-NEXT: mov x0, sp
+; CHECK0-NEXT: mov w1, #45 // =0x2d
+; CHECK0-NEXT: bl memset
+; CHECK0-NEXT: tbz w19, #0, .LBB36_4
+; CHECK0-NEXT: // %bb.3: // %entry
+; CHECK0-NEXT: smstart sm
+; CHECK0-NEXT: .LBB36_4: // %entry
+; CHECK0-NEXT: mov w0, #22647 // =0x5877
+; CHECK0-NEXT: movk w0, #59491, lsl #16
+; CHECK0-NEXT: addvl sp, x29, #-18
+; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT: .cfi_restore z8
+; CHECK0-NEXT: .cfi_restore z9
+; CHECK0-NEXT: .cfi_restore z10
+; CHECK0-NEXT: .cfi_restore z11
+; CHECK0-NEXT: .cfi_restore z12
+; CHECK0-NEXT: .cfi_restore z13
+; CHECK0-NEXT: .cfi_restore z14
+; CHECK0-NEXT: .cfi_restore z15
+; CHECK0-NEXT: mov sp, x29
+; CHECK0-NEXT: .cfi_def_cfa wsp, 64
+; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT: .cfi_def_cfa_offset 0
+; CHECK0-NEXT: .cfi_restore w19
+; CHECK0-NEXT: .cfi_restore w26
+; CHECK0-NEXT: .cfi_restore w27
+; CHECK0-NEXT: .cfi_restore w28
+; CHECK0-NEXT: .cfi_restore vg
+; CHECK0-NEXT: .cfi_restore w30
+; CHECK0-NEXT: .cfi_restore w29
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: svecc_call_realign:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
+; CHECK64-NEXT: cntd x9
+; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
 ; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w26, -24
-; CHECK64-NEXT: .cfi_offset w27, -32
-; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w26, -16
+; CHECK64-NEXT: .cfi_offset w27, -24
+; CHECK64-NEXT: .cfi_offset w28, -32
 ; CHECK64-NEXT: .cfi_offset vg, -48
 ; CHECK64-NEXT: .cfi_offset w30, -56
 ; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -3960,30 +4270,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
 ; CHECK64-NEXT: sub x9, sp, #1088
 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0
 ; CHECK64-NEXT: mov w2, w1
@@ -4006,34 +4318,23 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: sub x8, x29, #64
 ; CHECK64-NEXT: movk w0, #59491, lsl #16
 ; CHECK64-NEXT: addvl sp, x8, #-18
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, x29, #-2
 ; CHECK64-NEXT: .cfi_restore z8
 ; CHECK64-NEXT: .cfi_restore z9
 ; CHECK64-NEXT: .cfi_restore z10
@@ -4042,12 +4343,23 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: .cfi_restore z13
 ; CHECK64-NEXT: .cfi_restore z14
 ; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: sub sp, x29, #64
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK64-NEXT: .cfi_def_cfa_offset 0
 ; CHECK64-NEXT: .cfi_restore w19
 ; CHECK64-NEXT: .cfi_restore w26
@@ -4058,140 +4370,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT: .cfi_restore w29
 ; CHECK64-NEXT: ret
 ;
-; CHECK1024-LABEL: svecc_call_realign:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub x9, sp, #2048
-; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK1024-NEXT: mov w2, w1
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB36_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB36_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: sub x8, x29, #1024
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: addvl sp, x8, #-18
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: sub sp, x29, #1024
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
 entry:
   %ptr = alloca i8, i32 1000, align 32
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
@@ -4311,13 +4753,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
 ;
 ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca:
 ; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
-; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
+; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-2
 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -4330,41 +4771,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
 ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT: sub sp, sp, #112
 ; CHECK64-NEXT: addvl sp, sp, #-1
 ; CHECK64-NEXT: mov x19, sp
 ; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w20, -24
-; CHECK64-NEXT: .cfi_offset w26, -32
-; CHECK64-NEXT: .cfi_offset w27, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w20, -16
+; CHECK64-NEXT: .cfi_offset w26, -24
+; CHECK64-NEXT: .cfi_offset w27, -32
 ; CHECK64-NEXT: .cfi_offset w28, -48
 ; CHECK64-NEXT: .cfi_offset w30, -56
 ; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT:
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4828,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4857,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll index 995d2545c6359..26221d0c26eb2 100644 --- a/llvm/test/CodeGen/AArch64/stackmap.ll +++ b/llvm/test/CodeGen/AArch64/stackmap.ll @@ -81,14 +81,14 @@ ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 0580f5e0b019a..582e8456c05b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) { ; NOB16B16-LABEL: fmla_nxv4bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv4bf16: @@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) { ; NOB16B16-LABEL: fmla_nxv8bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z3.s, z1.h -; NOB16B16-NEXT: uunpkhi z4.s, z0.h -; NOB16B16-NEXT: uunpkhi z5.s, z2.h +; 
NOB16B16-NEXT: uunpkhi z3.s, z2.h +; NOB16B16-NEXT: uunpklo z2.s, z2.h +; NOB16B16-NEXT: uunpkhi z4.s, z1.h +; NOB16B16-NEXT: uunpkhi z5.s, z0.h ; NOB16B16-NEXT: uunpklo z1.s, z1.h ; NOB16B16-NEXT: uunpklo z0.s, z0.h -; NOB16B16-NEXT: uunpklo z2.s, z2.h ; NOB16B16-NEXT: ptrue p0.s ; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z4.s, z4.s, #16 -; NOB16B16-NEXT: lsl z5.s, z5.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv8bf16: diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 5c58eab391972..16e8feb0dc5bb 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -1,79 +1,175 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16 target triple = "aarch64-unknown-linux-gnu" define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z4.s, z2.h +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %res = fadd contract <vscale x 8 x bfloat> %acc, %mul ret <vscale x 8 x bfloat> %res } define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %res = fadd contract <vscale x 4 x bfloat> %acc, %mul ret <vscale x 4 x bfloat> %res } define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, 
<vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmla_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %res = fadd contract <vscale x 2 x bfloat> %acc, %mul ret <vscale x 2 x bfloat> %res } define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %res = fsub contract <vscale x 8 x bfloat> %acc, %mul ret <vscale x 8 x bfloat> %res } define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %res = fsub contract <vscale x 4 x bfloat> %acc, %mul ret <vscale x 4 x bfloat> %res } define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmls_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %res = fsub contract <vscale x 2 
x bfloat> %acc, %mul ret <vscale x 2 x bfloat> %res } define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpkhi z6.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %add = fadd contract <vscale x 8 x bfloat> %acc, %mul %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %add, <vscale x 8 x bfloat> %acc @@ -81,10 +177,17 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale } define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %add = fadd contract <vscale x 4 x bfloat> %acc, %mul %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %add, <vscale x 4 x bfloat> %acc @@ -92,10 +195,20 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale } define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmla_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %add = fadd contract <vscale x 2 x bfloat> %acc, %mul %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %add, <vscale x 2 x bfloat> %acc @@ -103,10 +216,31 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale } define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv8bf16: +; SVE: // %bb.0: +; 
SVE-NEXT: ptrue p1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z6.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 8 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 8 x bfloat> %acc, %mul %res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %sub, <vscale x 8 x bfloat> %acc @@ -114,10 +248,19 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale } define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 4 x bfloat> %acc, %mul %res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %sub, <vscale x 4 x bfloat> %acc @@ -125,10 +268,21 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale } define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) { -; CHECK-LABEL: fmls_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract <vscale x 2 x bfloat> %m1, %m2 %sub = fsub contract <vscale x 2 x bfloat> %acc, %mul %res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %sub, <vscale x 2 x bfloat> %acc @@ -136,33 +290,90 @@ define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale } define <vscale x 8 x bfloat> @fadd_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: 
lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd nsz <vscale x 8 x bfloat> %a, %sel ret <vscale x 8 x bfloat> %fadd } define <vscale x 8 x bfloat> @fsub_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub <vscale x 8 x bfloat> %a, %sel ret <vscale x 8 x bfloat> %fsub } define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz %fadd = fadd <vscale x 8 x bfloat> %a, %sel @@ -170,11 +381,30 @@ define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a } define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h 
+; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %nz %fsub = fsub nsz <vscale x 8 x bfloat> %a, %sel @@ -182,13 +412,46 @@ define <vscale x 8 x bfloat> @fsub_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a } define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: movi v3.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: movi v3.2d, #0000000000000000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd contract <vscale x 8 x bfloat> %a, %sel @@ -196,12 +459,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, < } define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; 
SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub contract <vscale x 8 x bfloat> %a, %sel @@ -209,12 +501,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nxv8bf16(<vscale x 8 x bfloat> %a, < } define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fadd = fadd nsz contract <vscale x 8 x bfloat> %a, %sel @@ -222,12 +543,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> % } define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; 
SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> zeroinitializer %fsub = fsub nsz contract <vscale x 8 x bfloat> %a, %sel @@ -235,12 +585,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_nsz_nxv8bf16(<vscale x 8 x bfloat> % } define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -249,15 +628,50 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa } define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: fmov h3, w8 -; CHECK-NEXT: mov z3.h, h3 -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret 
+; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: mov w8, #32768 // =0x8000 +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: fmov h3, w8 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: mov z3.h, h3 +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: mov w8, #32768 // =0x8000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: fmov h3, w8 +; SVE-B16B16-NEXT: mov z3.h, h3 +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -266,12 +680,41 @@ define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nxv8bf16(<vscale x 8 x bfloa } define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz @@ -280,12 +723,41 @@ define <vscale x 8 x bfloat> @fadd_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 
x b } define <vscale x 8 x bfloat> @fsub_sel_fmul_negzero_nsz_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul <vscale x 8 x bfloat> %b, %c %nz = fneg <vscale x 8 x bfloat> zeroinitializer %sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %fmul, <vscale x 8 x bfloat> %nz diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index c340df1385124..0cc2e04bfb315 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -12,6 +12,26 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane15_16xi8: ; CHECK: // %bb.0: @@ -21,6 +41,26 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane16_16xi8: ; CHECK: // %bb.0: @@ -31,6 +71,32 @@ define i8 @test_lane16_16xi8(<vscale x 16 x i8> 
%a) #0 { ret i8 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane16_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. +define i64 @test_lane16_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i64 + ret i64 %c +} + define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane0_8xi16: ; CHECK: // %bb.0: @@ -40,6 +106,26 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane7_8xi16: ; CHECK: // %bb.0: @@ -49,6 +135,26 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane8_8xi16: ; CHECK: // %bb.0: @@ -59,6 +165,32 @@ define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane8_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. 
+define i64 @test_lane8_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i64 + ret i64 %c +} + define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 { ; CHECK-LABEL: test_lane0_4xi32: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691760af3..52a77cb396909 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e10313773c73e..72994100b2970 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) { define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) { define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: 
mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0) ret <8 x i1> %mask @@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) { define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v0.b[1], v0.b[1] ; CHECK-NEXT: mov v0.b[2], v1.b[2] ; CHECK-NEXT: mov v0.b[3], v1.b[3] ; CHECK-NEXT: mov v0.b[4], v1.b[4] @@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) { ; CHECK-NEXT: mov v0.b[13], v1.b[13] ; CHECK-NEXT: mov v0.b[14], v1.b[14] ; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1> %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll index 6fbae7edfec0a..2dda03e5c6dab 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll @@ -55,10 +55,9 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: fadd z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v32f16: @@ -154,10 +153,9 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fadd z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v16f32: @@ -253,10 +251,9 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fadd z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fadd z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fadd_v8f64: @@ -660,10 +657,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] ; 
VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z1.h, z2.h -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.h, p0/m, z3.h, z4.h +; VBITS_GE_256-NEXT: fmad z3.h, p0/m, z4.h, z5.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v32f16: @@ -771,10 +767,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.s, p0/m, z3.s, z4.s +; VBITS_GE_256-NEXT: fmad z3.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v16f32: @@ -881,10 +876,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.d, p0/m, z3.d, z4.d +; VBITS_GE_256-NEXT: fmad z3.d, p0/m, z4.d, z5.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v8f64: @@ -990,10 +984,9 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: fmul z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v32f16: @@ -1089,10 +1082,9 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fmul z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v16f32: @@ -1188,10 +1180,9 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fmul z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmul_v8f64: @@ -1827,10 +1818,9 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h +; 
VBITS_GE_256-NEXT: fsub z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v32f16: @@ -1926,10 +1916,9 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: fsub z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v16f32: @@ -2025,10 +2014,9 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fsub z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: fsub z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: fsub z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fsub_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll index e1ec5ee5f6137..633b429db3dfd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll @@ -64,10 +64,9 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z1.h, z2.h -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.h, p0/m, z3.h, z4.h +; VBITS_GE_256-NEXT: fmad z3.h, p0/m, z4.h, z5.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v32f16: @@ -181,10 +180,9 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.s, p0/m, z3.s, z4.s +; VBITS_GE_256-NEXT: fmad z3.s, p0/m, z4.s, z5.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v16f32: @@ -297,10 +295,9 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2] ; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; VBITS_GE_256-NEXT: movprfx z1, z5 -; VBITS_GE_256-NEXT: fmla z1.d, p0/m, z3.d, z4.d +; VBITS_GE_256-NEXT: fmad z3.d, p0/m, z4.d, z5.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fma_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll index de60deeafaf32..90a04995ff15e 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll @@ -55,10 +55,9 @@ define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v32f16: @@ -154,10 +153,9 @@ define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v16f32: @@ -253,10 +251,9 @@ define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmaxnm_v8f64: @@ -356,10 +353,9 @@ define void @fminnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fminnm z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v32f16: @@ -455,10 +451,9 @@ define void @fminnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fminnm z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v16f32: @@ -554,10 +549,9 @@ define void @fminnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fminnm z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fminnm z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fminnm z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fminnm_v8f64: @@ -657,10 +651,9 @@ define void 
@fmax_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmax z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v32f16: @@ -756,10 +749,9 @@ define void @fmax_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmax z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v16f32: @@ -855,10 +847,9 @@ define void @fmax_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmax z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmax z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmax z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmax_v8f64: @@ -958,10 +949,9 @@ define void @fmin_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: fmin z2.h, p0/m, z2.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v32f16: @@ -1057,10 +1047,9 @@ define void @fmin_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.s, p0/m, z1.s, z3.s +; VBITS_EQ_256-NEXT: fmin z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v16f32: @@ -1156,10 +1145,9 @@ define void @fmin_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: fmin z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_256-NEXT: movprfx z1, z2 -; VBITS_EQ_256-NEXT: fmin z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_256-NEXT: fmin z2.d, p0/m, z2.d, z3.d ; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fmin_v8f64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll index 08a974fa2d9f4..a91b392b7230a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll @@ -155,10 +155,9 @@ define void @sabd_v64i8_v64i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: sabd z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: sabd z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: sabd z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sabd_v64i8_v64i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index 58fca3a2cf8b6..736239599836c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -456,10 +456,9 @@ define void @mul_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: mul z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v64i8: @@ -555,10 +554,9 @@ define void @mul_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v32i16: @@ -654,10 +652,9 @@ define void @mul_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v16i32: @@ -759,10 +756,9 @@ define void @mul_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: mul_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll index 4926684ddc2de..c56376887d966 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll @@ -55,10 +55,9 @@ define void @smax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, 
p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smax z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v64i8: @@ -154,10 +153,9 @@ define void @smax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smax z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v32i16: @@ -253,10 +251,9 @@ define void @smax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smax z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v16i32: @@ -360,10 +357,9 @@ define void @smax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smax z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smax z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smax z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smax_v8i64: @@ -463,10 +459,9 @@ define void @smin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smin z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v64i8: @@ -562,10 +557,9 @@ define void @smin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smin z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v32i16: @@ -661,10 +655,9 @@ define void @smin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smin z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { 
z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v16i32: @@ -768,10 +761,9 @@ define void @smin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smin z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smin z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smin z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smin_v8i64: @@ -871,10 +863,9 @@ define void @umax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umax z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v64i8: @@ -970,10 +961,9 @@ define void @umax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umax z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v32i16: @@ -1069,10 +1059,9 @@ define void @umax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umax z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v16i32: @@ -1176,10 +1165,9 @@ define void @umax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umax z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umax z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umax z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umax_v8i64: @@ -1279,10 +1267,9 @@ define void @umin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umin z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v64i8: @@ -1378,10 +1365,9 @@ define void 
@umin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umin z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v32i16: @@ -1477,10 +1463,9 @@ define void @umin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umin z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v16i32: @@ -1584,10 +1569,9 @@ define void @umin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umin z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umin z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umin z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umin_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll index 41cce354cc9de..dfbc23707e418 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -78,10 +78,9 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: smulh z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v64i8: @@ -209,10 +208,9 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: smulh z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v32i16: @@ -340,10 +338,9 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: smulh z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; 
VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v16i32: @@ -471,10 +468,9 @@ define void @smulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: smulh z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: smulh z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: smulh_v8i64: @@ -607,10 +603,9 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: umulh z2.b, p0/m, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v64i8: @@ -739,10 +734,9 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h +; VBITS_GE_256-NEXT: umulh z2.h, p0/m, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v32i16: @@ -870,10 +864,9 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s +; VBITS_GE_256-NEXT: umulh z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v16i32: @@ -1001,10 +994,9 @@ define void @umulh_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z2 -; VBITS_GE_256-NEXT: umulh z1.d, p0/m, z1.d, z3.d +; VBITS_GE_256-NEXT: umulh z2.d, p0/m, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: umulh_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 27be84419d59e..14204e965fb4d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -616,10 +616,9 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z4.s ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s +; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z4.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; 
VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: srem_v16i32:
@@ -744,11 +743,10 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT: movprfx z18, z16
 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; VBITS_GE_128-NEXT: movprfx z1, z2
-; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT: ret
 ;
@@ -765,10 +763,9 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: srem_v8i64:
@@ -1434,10 +1431,9 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
+; VBITS_GE_256-NEXT: mls z3.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: urem_v16i32:
@@ -1562,11 +1558,10 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-NEXT: movprfx z18, z16
 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; VBITS_GE_128-NEXT: movprfx z1, z2
-; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT: mls z2.d, p0/m, z19.d, z3.d
 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT: ret
 ;
@@ -1583,10 +1578,9 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: movprfx z5, z3
 ; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z3
-; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT: mls z3.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: urem_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 0fa8c8f50e29c..a8afa90df96e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -57,10 +57,9 @@ define void @ashr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: asr z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v64i8:
@@ -158,10 +157,9 @@ define void @ashr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: asr z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v32i16:
@@ -259,10 +257,9 @@ define void @ashr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: asr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v16i32:
@@ -360,10 +357,9 @@ define void @ashr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: asr z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: asr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: asr z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ashr_v8i64:
@@ -465,10 +461,9 @@ define void @lshr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: lsr z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v64i8:
@@ -566,10 +561,9 @@ define void @lshr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: lsr z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v32i16:
@@ -667,10 +661,9 @@ define void @lshr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: lsr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v16i32:
@@ -768,10 +761,9 @@ define void @lshr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsr z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsr z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: lsr z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: lshr_v8i64:
@@ -871,10 +863,9 @@ define void @shl_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.b, p0/m, z0.b, z1.b
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.b, p0/m, z1.b, z3.b
+; VBITS_GE_256-NEXT: lsl z2.b, p0/m, z2.b, z3.b
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v64i8:
@@ -970,10 +961,9 @@ define void @shl_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT: lsl z2.h, p0/m, z2.h, z3.h
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v32i16:
@@ -1069,10 +1059,9 @@ define void @shl_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT: lsl z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v16i32:
@@ -1168,10 +1157,9 @@ define void @shl_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: lsl z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_256-NEXT: movprfx z1, z2
-; VBITS_GE_256-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT: lsl z2.d, p0/m, z2.d, z3.d
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shl_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
index becddaea31267..b2ed8de369146 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
@@ -1,19 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index 41e4a38fad90b..8e807cda7166d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
 ; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[2]
+; CHECK-NEXT: mov v0.h[2], w8
 ; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
 %el0 = extractelement <vscale x 4 x i1> %a, i32 0
 %el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index ba4a3a2042305..bd8f432579a08 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: tbnz w1, #0, .LBB1_2
 ; CHECK-NEXT: // %bb.1: // %vector.body
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: umov w8, v0.b[8]
-; CHECK-NEXT: mov v1.b[1], v0.b[1]
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16
+; CHECK-NEXT: umov w8, v2.b[8]
+; CHECK-NEXT: mov v0.b[1], v2.b[1]
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16
 ; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov v1.b[2], v0.b[2]
-; CHECK-NEXT: mov v2.b[1], v0.b[9]
-; CHECK-NEXT: mov v1.b[3], v0.b[3]
-; CHECK-NEXT: mov v2.b[2], v0.b[10]
-; CHECK-NEXT: mov v1.b[4], v0.b[4]
-; CHECK-NEXT: mov v2.b[3], v0.b[11]
-; CHECK-NEXT: mov v1.b[5], v0.b[5]
-; CHECK-NEXT: mov v2.b[4], v0.b[12]
-; CHECK-NEXT: mov v1.b[6], v0.b[6]
-; CHECK-NEXT: mov v2.b[5], v0.b[13]
-; CHECK-NEXT: mov v1.b[7], v0.b[7]
-; CHECK-NEXT: mov v2.b[6], v0.b[14]
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: mov v2.b[7], v0.b[15]
-; CHECK-NEXT: uunpklo z0.h, z3.b
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.b[2], v2.b[2]
+; CHECK-NEXT: mov v1.b[1], v2.b[9]
+; CHECK-NEXT: mov v0.b[3], v2.b[3]
+; CHECK-NEXT: mov v1.b[2], v2.b[10]
+; CHECK-NEXT: mov v0.b[4], v2.b[4]
+; CHECK-NEXT: mov v1.b[3], v2.b[11]
+; CHECK-NEXT: mov v0.b[5], v2.b[5]
+; CHECK-NEXT: mov v1.b[4], v2.b[12]
+; CHECK-NEXT: mov v0.b[6], v2.b[6]
+; CHECK-NEXT: mov v1.b[5], v2.b[13]
+; CHECK-NEXT: mov v0.b[7], v2.b[7]
+; CHECK-NEXT: mov v1.b[6], v2.b[14]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: mov v1.b[7], v2.b[15]
+; CHECK-NEXT: uunpklo z2.h, z3.b
 ; CHECK-NEXT: uunpklo z3.h, z4.b
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uunpklo z2.h, z2.b
 ; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: uunpklo z1.h, z1.b
 ; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
 ; CHECK-NEXT: lsl z0.s, z0.s, #31
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: lsl z2.s, z2.s, #31
 ; CHECK-NEXT: lsl z3.s, z3.s, #31
-; CHECK-NEXT: asr z1.s, z1.s, #31
 ; CHECK-NEXT: asr z0.s, z0.s, #31
+; CHECK-NEXT: asr z2.s, z2.s, #31
 ; CHECK-NEXT: asr z3.s, z3.s, #31
-; CHECK-NEXT: lsl z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0
-; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0
+; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT: asr z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0
-; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p3, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: asr z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p3, [x0]
+; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT: .LBB1_2: // %exit
 ; CHECK-NEXT: ret
 %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 124f81e7864d1..39fe92aae0619 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT: whilelt p0.s, wzr, w0
 ; CHECK-NEXT: b.pl .LBB0_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: cntw x9
+; CHECK-NEXT: mov w9, wzr
+; CHECK-NEXT: cntw x8
 ; CHECK-NEXT: .LBB0_2: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w8, w0
-; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: whilelt p0.s, w9, w0
+; CHECK-NEXT: add w9, w9, w8
 ; CHECK-NEXT: b.mi .LBB0_2
 ; CHECK-NEXT: .LBB0_3: // %exit
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index f2c882c370eab..20c06f0a1aff5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -193,9 +193,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
@@ -397,9 +396,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
@@ -479,9 +477,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
@@ -703,9 +700,8 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v16f16:
@@ -907,9 +903,8 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v8f32:
@@ -989,9 +984,8 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fdiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fdiv_v4f64:
@@ -1253,9 +1247,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.h, p0/m, z4.h, z5.h
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
@@ -1501,9 +1494,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
@@ -1595,9 +1587,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
@@ -1824,9 +1815,8 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v16f16:
@@ -2028,9 +2018,8 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v8f32:
@@ -2110,9 +2099,8 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmul_v4f64:
@@ -3152,9 +3140,8 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v16f16:
@@ -3356,9 +3343,8 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v8f32:
@@ -3438,9 +3424,8 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fsub z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fsub_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 680cb4fb0a791..dbacd77315198 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -208,9 +208,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.h, p0/m, z3.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.h, p0/m, z4.h, z5.h
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v16f16:
@@ -526,9 +525,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v8f32:
@@ -642,9 +640,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: movprfx z1, z5
-; CHECK-NEXT: fmla z1.d, p0/m, z3.d, z4.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmad z3.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: stp q0, q3, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fma_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 84aea185917fa..e53d6a9081154 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -143,9 +143,8 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
@@ -347,9 +346,8 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
@@ -448,9 +446,8 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
@@ -622,9 +619,8 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v16f16:
@@ -826,9 +822,8 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v8f32:
@@ -927,9 +922,8 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fminnm_v4f64:
@@ -1101,9 +1095,8 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v16f16:
@@ -1305,9 +1298,8 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v8f32:
@@ -1406,9 +1398,8 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmax_v4f64:
@@ -1580,9 +1571,8 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v16f16:
@@ -1784,9 +1774,8 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v8f32:
@@ -1885,9 +1874,8 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fmin_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 4360f3a12014a..02b5469c0ff85 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -975,9 +975,8 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v32i8:
@@ -1286,9 +1285,8 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v16i16:
@@ -1467,9 +1465,8 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v8i32:
@@ -1599,9 +1596,8 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: mul z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: mul z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: mul_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 1fdcd4f826870..8e1d61b51e2bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -779,9 +779,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v8i32:
@@ -886,9 +885,8 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v4i64:
@@ -1693,9 +1691,8 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v8i32:
@@ -1800,9 +1797,8 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: udivr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 1bca7dd09d9b7..d858d8171926e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -179,9 +179,8 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v32i8:
@@ -473,9 +472,8 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v16i16:
@@ -651,9 +649,8 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v8i32:
@@ -771,9 +768,8 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smax_v4i64:
@@ -985,9 +981,8 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v32i8:
@@ -1279,9 +1274,8 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v16i16:
@@ -1457,9 +1451,8 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v8i32:
@@ -1577,9 +1570,8 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: smin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: smin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: smin_v4i64:
@@ -1791,9 +1783,8 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v32i8:
@@ -2085,9 +2076,8 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v16i16:
@@ -2263,9 +2253,8 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v8i32:
@@ -2383,9 +2372,8 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umax z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umax z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umax_v4i64:
@@ -2597,9 +2585,8 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v32i8:
@@ -2891,9 +2878,8 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v16i16:
@@ -3069,9 +3055,8 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v8i32:
@@ -3189,9 +3174,8 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: umin z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: umin z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: umin_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 0c97eedd4362d..85b7b4d010062 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -294,9 +294,8 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v32i8:
@@ -755,9 +754,8 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v16i16:
@@ -1001,9 +999,8 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v8i32:
@@ -1159,9 +1156,8 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: smulh z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: smulh z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: smulh z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: smulh_v4i64:
@@ -1494,9 +1490,8 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.b, vl16
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.b, p0/m, z1.b, z3.b
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.b, p0/m, z2.b, z3.b
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v32i8:
@@ -1954,9 +1949,8 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.h, vl8
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.h, p0/m, z1.h, z3.h
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.h, p0/m, z2.h, z3.h
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v16i16:
@@ -2200,9 +2194,8 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.s, vl4
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.s, p0/m, z1.s, z3.s
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.s, p0/m, z2.s, z3.s
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v8i32:
@@ -2358,9 +2351,8 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE-NEXT: ptrue p0.d, vl2
 ; SVE-NEXT: ldp q1, q2, [x0]
 ; SVE-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; SVE-NEXT: movprfx z1, z2
-; SVE-NEXT: umulh z1.d, p0/m, z1.d, z3.d
-; SVE-NEXT: stp q0, q1, [x0]
+; SVE-NEXT: umulh z2.d, p0/m, z2.d, z3.d
+; SVE-NEXT: stp q0, q2, [x0]
 ; SVE-NEXT: ret
 ;
 ; SVE2-LABEL: umulh_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 372f6a06bf64b..c4b6c0e6e924c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -883,9 +883,8 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.s, p0/m, z5.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v8i32:
@@ -1013,9 +1012,8 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v4i64:
@@ -1933,9 +1931,8 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.s, p0/m, z5.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v8i32:
@@ -2063,9 +2060,8 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: movprfx z5, z2
 ; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: mls z2.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index d0f99211e80fc..4cf8945575ded 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -195,9 +195,8 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v32i8:
@@ -476,9 +475,8 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v16i16:
@@ -632,9 +630,8 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v8i32:
@@ -739,9 +736,8 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: asr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: asr z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: ashr_v4i64:
@@ -965,9 +961,8 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v32i8:
@@ -1246,9 +1241,8 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v16i16:
@@ -1402,9 +1396,8 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v8i32:
@@ -1509,9 +1502,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: lshr_v4i64:
@@ -1764,9 +1756,8 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z3.b
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v32i8:
@@ -2014,9 +2005,8 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v16i16:
@@ -2170,9 +2160,8 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v8i32:
@@ -2277,9 +2266,8 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: shl_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index 74e5fe7352cfd..e9b2f539b30cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -954,9 +954,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.h, p0/m, z2.h, z3.h
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v16f16:
@@ -1170,9 +1169,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v8f32:
@@ -1258,9 +1256,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: stp q0, q2, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: fadd_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index e0e88c47fb55c..e78671aaddf18 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -526,10 +526,9 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z5.d, z0.d, z2.d
 ; CHECK-NEXT: trn2 z1.d, z1.d, z3.d
 ; CHECK-NEXT: trn2 z0.d, z0.d, z2.d
-; CHECK-NEXT: movprfx z2, z4
-; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT: fadd z4.d, p0/m, z4.d, z5.d
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: stp q4, q0, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: zip_v4f64:
@@ -2159,10 +2158,9 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK-NEXT: zip1 z5.d, z0.d, z2.d
 ; CHECK-NEXT: trn2 z1.d, z1.d, z3.d
 ; CHECK-NEXT: trn2 z0.d, z0.d, z2.d
-; CHECK-NEXT: movprfx z2, z4
-; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z5.d
+; CHECK-NEXT: fadd z4.d, p0/m, z4.d, z5.d
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: stp q4, q0, [x0]
 ; CHECK-NEXT: ret
 ;
 ; NONEON-NOSVE-LABEL: zip_vscale2_4:
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 6af26067cd6d6..0472d5c1935f5 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -36,10 +36,9 @@ define i32 @test(<vscale x 32 x i8> %bin.rdx, <vscale x 32 x i8> %bin.rdx2) {
 ; CHECK-NEXT: mla z0.s, p0/m, z25.s, z24.s
 ; CHECK-NEXT: mad z2.s, p0/m, z6.s, z4.s
 ; CHECK-NEXT: mad z1.s, p0/m, z3.s, z26.s
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: mla z3.s, p0/m, z28.s, z7.s
+; CHECK-NEXT: mla z5.s, p0/m, z28.s, z7.s
 ; CHECK-NEXT: add z0.s, z2.s, z0.s
-; CHECK-NEXT: add z1.s, z3.s, z1.s
+; CHECK-NEXT: add z1.s, z5.s, z1.s
 ; CHECK-NEXT: add z0.s, z1.s, z0.s
 ; CHECK-NEXT: uaddv d0, p0, z0.s
 ; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 888e94d42f449..8f6f4510d8388 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -157,10 +157,9 @@ define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z3.d, z3.d, #0x3f
 ; CHECK-NEXT: and z2.d, z2.d, #0x3f
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT: lslr z3.d, p0/m, z3.d, z0.d
 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: orr z0.d, z3.d, z0.d
 ; CHECK-NEXT: ret
 %a = xor <vscale x 2 x i64> %x, %y
 %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
new file mode 100644
index 0000000000000..9dbe096ebdb57
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define <vscale x 4 x float> @fdot_wide_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; SVE2-LABEL: fdot_wide_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z3.s, z1.h
+; SVE2-NEXT: uunpklo z4.s, z2.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: uunpkhi z2.s, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fcvt z4.s, p0/m, z4.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fmul z3.s, z3.s, z4.s
+; SVE2-NEXT: fmul z1.s, z1.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z3.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
+ %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 4 x float> @fdot_splat_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
+; SVE2-LABEL: fdot_splat_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z2.s, z1.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fadd z0.s, z0.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_splat_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fmov z2.h, #1.00000000
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 8 x half> @partial_reduce_nxv8f16(<vscale x 8 x half> %acc, <vscale x 16 x half> %a) {
+; CHECK-LABEL: partial_reduce_nxv8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x half> @llvm.vector.partial.reduce.fadd(<vscale x 8 x half> %acc, <vscale x 16 x half> %a)
+ ret <vscale x 8 x half> %partial.reduce
+}
+
+define <vscale x 4 x float> @partial_reduce_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x float> %a) {
+; CHECK-LABEL: partial_reduce_nxv4f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 2 x double> @partial_reduce_nxv2f64(<vscale x 2 x double> %acc, <vscale x 4 x double> %a) {
+; CHECK-LABEL: partial_reduce_nxv2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 2 x double> @llvm.vector.partial.reduce.fadd(<vscale x 2 x double> %acc, <vscale x 4 x double> %a)
+ ret <vscale x 2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
new file mode 100644
index 0000000000000..89216ce2cb72b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) {
+; SVE2-LABEL: fdot_wide_v8f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl8
+; SVE2-NEXT: mov x8, #8 // =0x8
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v8f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl8
+; SVE2P1-NEXT: ptrue p1.h, vl16
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <8 x float>, ptr %accptr
+ %a = load <16 x half>, ptr %aptr
+ %b = load <16 x half>, ptr %bptr
+ %a.wide = fpext <16 x half> %a to <16 x float>
+ %b.wide = fpext <16 x half> %b to <16 x float>
+ %mult = fmul <16 x float> %a.wide, %b.wide
+ %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult)
+ store <8 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) {
+; SVE2-LABEL: fdot_wide_v16f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl16
+; SVE2-NEXT: mov x8, #16 // =0x10
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v16f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl16
+; SVE2P1-NEXT: ptrue p1.h, vl32
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <16 x float>, ptr %accptr
+ %a = load <32 x half>, ptr %aptr
+ %b = load <32 x half>, ptr %bptr
+ %a.wide = fpext <32 x half> %a to <32 x float>
+ %b.wide = fpext <32 x half> %b to <32 x float>
+ %mult = fmul <32 x float> %a.wide, %b.wide
+ %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult)
+ store <16 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) {
+; SVE2-LABEL: fdot_wide_v32f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl32
+; SVE2-NEXT: mov x8, #32 // =0x20
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v32f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl32
+; SVE2P1-NEXT: ptrue p1.h, vl64
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <32 x float>, ptr %accptr
+ %a = load <64 x half>, ptr %aptr
+ %b = load <64 x half>, ptr %bptr
+ %a.wide = fpext <64 x half> %a to <64 x float>
+ %b.wide = fpext <64 x half> %b to <64 x float>
+ %mult = fmul <64 x float> %a.wide, %b.wide
+ %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult)
+ store <32 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) {
+; SVE2-LABEL: fdot_wide_v64f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl64
+; SVE2-NEXT: mov x8, #64 // =0x40
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v64f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl64
+; SVE2P1-NEXT: ptrue p1.h, vl128
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <64 x float>, ptr %accptr
+ %a = load <128 x half>, ptr %aptr
+ %b = load <128 x half>, ptr %bptr
+ %a.wide = fpext <128 x half> %a to <128 x float>
+ %b.wide = fpext <128 x half> %b to <128 x float>
+ %mult = fmul <128 x float> %a.wide, %b.wide
+ %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult)
+ store <64 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: fixed_fdot_wide:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NEXT: fcvtl v4.4s, v2.4h
+; CHECK-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-NEXT: fcvtl2 v2.4s, v2.8h
+; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
+; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = fpext <8 x half> %a to <8 x float>
+ %b.wide = fpext <8 x half> %b to <8 x float>
+ %mult = fmul <8 x float> %a.wide, %b.wide
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult)
+ ret <4 x float> %partial.reduce
+}
+
+define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) {
+; CHECK-LABEL: partial_reduce_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a)
+ ret <8 x half> %partial.reduce
+}
+
+define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) {
+; CHECK-LABEL: partial_reduce_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a)
+ ret <4 x float> %partial.reduce
+}
+
+define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) {
+; CHECK-LABEL: partial_reduce_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a)
+ ret <2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..fe3eee06db65e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,151 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64
@llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test : i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Negative test : i16 is not a legal type on AArch64 (the sub is promoted to i32), so the fold does not apply +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w0, w8, w9, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Negative test : i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: and w10, w8, #0xffff +; CHECK-NEXT: strh w8, [x2] +; CHECK-NEXT: cmp w10, w9 +; CHECK-NEXT: csel w0, w10, w9, lo +; CHECK-NEXT: ret + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Negative test, vector types : umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b +; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch : umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 %b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: csel x0, x8, x0, lo +; CHECK-NEXT: ret + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll new file mode 100644 index
0000000000000..6696f94d404c5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE + +define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: smax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: smin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: umax_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umax_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: umin_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: umin_v2i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl2 +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-SVE-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %0 +} + +define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){ +; CHECK-LABEL: smax_v1i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmgt d2, d0, d1 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v1i64: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: ptrue p0.d, vl1 +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-SVE-NEXT: ret 
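+; Neon has no 64-bit integer min/max instructions, so the v2i64 and v1i64 cases above lower to a compare (cmgt/cmhi) plus bif select; with +sve they use the predicated SVE smax/smin/umax/umin instead.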
+entry: + %0 = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %0 +} + +; This is legal for Neon, so this should use the Neon smax. +define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: smax_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +; +; CHECK-SVE-LABEL: smax_v4i32: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-SVE-NEXT: ret +entry: + %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a3..935189dec48ac 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8
@@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr 
%p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index dd01112d97a18..c1e6b4fffa82d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX10-NEXT: s_mov_b32 s8, exec_lo +; GFX10-NEXT: s_mov_b32 s9, s5 ; GFX10-NEXT: s_add_i32 s6, s6, 1 -; GFX10-NEXT: s_xor_b32 s8, s5, s8 +; GFX10-NEXT: s_xor_b32 s5, s5, s8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, s5 -; GFX10-NEXT: s_mov_b32 s5, s8 -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_and_b32 s8, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit @@ -240,11 +240,11 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr11 ; GFX10-NEXT: ; implicit-def: $sgpr9 @@ -345,8 +345,8 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-LABEL: divergent_i1_icmp_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow @@ -457,8 +457,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: ; implicit-def: $sgpr4 ; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: s_branch .LBB6_2 @@ -534,8 +534,8 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: ; 
implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index fd08ab88990ed..484536bd27f4e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -106,8 +106,8 @@ exit: define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) { ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_1: ; %Flow @@ -180,8 +180,8 @@ exit: define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX10-LABEL: loop_with_2breaks: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 @@ -278,8 +278,8 @@ exit: define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) { ; GFX10-LABEL: loop_with_3breaks: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 @@ -404,8 +404,8 @@ exit: define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) { ; GFX10-LABEL: loop_with_div_break_with_body: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: ; implicit-def: $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index d13d6a19d332a..69baf613fdfe5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -101,8 +101,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr10 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: s_branch .LBB2_3 @@ -197,14 +197,14 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. 
; GFX10-LABEL: nested_loops_temporal_divergence_inner: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB3_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB3_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: v_mov_b32_e32 v6, s10 @@ -239,13 +239,13 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n. ; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,14 +288,14 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. ; GFX10-LABEL: nested_loops_temporal_divergence_outer: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB4_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB4_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: v_mov_b32_e32 v6, s10 @@ -330,13 +330,13 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n. 
; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -379,15 +379,15 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-LABEL: nested_loops_temporal_divergence_both: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: ; implicit-def: $sgpr9 ; GFX10-NEXT: .LBB5_1: ; %OuterHeader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB5_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, s5 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: v_mov_b32_e32 v8, s10 ; GFX10-NEXT: v_mov_b32_e32 v9, s11 @@ -421,13 +421,13 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: s_add_i32 s6, s6, 1 ; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX10-NEXT: flat_store_byte v[8:9], v0 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 ; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: flat_store_byte v[6:7], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll new file mode 100644 index 0000000000000..e440beed1da79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; 
GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps half @fadd_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fadd_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fadd_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fadd_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fadd_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps float @fadd_s32_div(float %a, float %b) { +; GCN-LABEL: fadd_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fadd_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_add_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part 
epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fadd_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fadd_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s2 +; GFX12-NEXT: s_add_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fadd_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll new file mode 100644 index 0000000000000..cf9524b860fd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +@flat = external global i32, align 4 +@global = external addrspace(1) global i32, align 4 +@lds = addrspace(3) global i32 poison, align 4 +@constant = external addrspace(4) constant i32, align 4 +@buf = external addrspace(8) global i8 + +define ptr @global_value_as0_external() { +; GCN-LABEL: global_value_as0_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, flat@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, flat@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr @flat +} + +define ptr addrspace(1) @global_value_as1_external() { +; GCN-LABEL: global_value_as1_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, global@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, global@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(1) @global +} + +define ptr addrspace(4) @global_value_as4_external() { +; GCN-LABEL: global_value_as4_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, constant@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, constant@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret 
ptr addrspace(4) @constant +} + +define amdgpu_kernel void @global_value_as3_lds_kernel(ptr addrspace(1) %out) { +; GCN-LABEL: global_value_as3_lds_kernel: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[0:1] +; GCN-NEXT: s_endpgm + %addr = ptrtoint ptr addrspace(3) @lds to i32 + store i32 %addr, ptr addrspace(1) %out + ret void +} + +define void @global_value_as8_buffer_store(i32 %val) { +; GCN-LABEL: global_value_as8_buffer_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %val, ptr addrspace(8) @buf, i32 0, i32 0, i32 0) + ret void +} + +define i32 @global_value_as8_buffer_load(i32 %offset) { +; GCN-LABEL: global_value_as8_buffer_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %val = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) @buf, i32 %offset, i32 0, i32 0) + ret i32 %val +} + +declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 1a7ccf0835686..588802cbd56c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) { ; GFX7-LABEL: fcmp_uniform_select: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir index 67cc0169af619..b6652f605be19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s --- name: test_copy_scc_vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index e5cd0710359ac..70ff92f8eda92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] @@ -94,7 +94,7 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -119,7 +119,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 
$1, 1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 1245194 /* regdef:VGPR_32 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 3407882 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 2818058 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42) ret void @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: 
[[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %12, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %12, 1835017 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 1245194 /* regdef:VGPR_32 */, def %12, 1245194 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 @@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll index 82886ab9e7d55..e1ac8ba5e6db4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s ; FIXME: Merge with DAG test @lds.external = external unnamed_addr addrspace(3) global [0 x i32] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index cabb37c330b4a..3396eaedf359e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,8 +1,8 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s ; CHECK: error: lds: unsupported initializer for address space diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 02d0e521e3b00..6facdfdec64ae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) { ret <4 x i32> %res } -define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) { +define i16 @abs_vgpr_i16(i16 %arg) { ; GFX6-LABEL: abs_vgpr_i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; 
GFX8-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0 ; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } -define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) { +define i32 @abs_vgpr_i32(i32 %arg) { ; GFX6-LABEL: abs_vgpr_i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } -define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { +define i64 @abs_vgpr_i64(i64 %arg) { ; GFX6-LABEL: abs_vgpr_i64: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 @@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } -define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { +define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-LABEL: abs_vgpr_v4i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 @@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 @@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX8-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v4i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2 @@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX10-NEXT: v_max_i32_e32 v1, v1, v5 ; GFX10-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX10-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v4i32: ; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1 ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) { ret <2 x i8> %res } -define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { +define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_max_i16 v0, v0, v2 ; 
GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) { ret <3 x i8> %res } -define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { +define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v3i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v3 ; GFX10-NEXT: v_max_i16 v1, v1, v4 ; GFX10-NEXT: v_max_i16 v2, v2, v5 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_max_i16 v1, v1, v4 ; GFX1250-NEXT: v_max_i16 v2, v2, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; 
GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ret <2 x i16> %res } -define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { +define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ret <3 x i16> %res } -define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { +define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX8-NEXT: 
v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 5720b882f4e73..cc21305a5a193 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s +; RUN: llc -verify-machineinstrs -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, 
v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..61a61376d7ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..9a90faf723461 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: @@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1] +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: 
s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: @@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; GFX10-NEXT: global_load_dword v4, v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: @@ 
-135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -165,10 +158,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +172,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 
v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: @@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: @@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..3eecaccf0308f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ 
-740,12 +741,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1] +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -753,26 +755,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -783,16 +785,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX12-NEXT: v_mov_b32_e32 v2, v8 +; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +810,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1073,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 -; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; 
GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1117,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] -; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1133,19 +1138,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, 
v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i128: @@ -1157,11 +1162,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 ; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 ; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14] ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] @@ -1176,28 +1181,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 ; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i128: @@ -1210,16 +1213,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: 
v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2404,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 
v[22:23], s[6:7], v5, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v22, v26 +; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], 
s[12:13], v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v11 +; GFX7-NEXT: v_mov_b32_e32 v2, v12 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX8-NEXT: 
v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v22, v26 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX8-NEXT: 
v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v11 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; 
GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] ; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] ; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 
v0, s[12:13], v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2609,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 +; GFX10-NEXT: v_mov_b32_e32 v19, v3 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] +; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] +; GFX10-NEXT: v_mov_b32_e32 v23, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 
-; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v3 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] +; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] +; GFX10-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,70 +2677,68 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 -; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 ; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, 
v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] +; GFX11-NEXT: v_mov_b32_e32 v25, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 -; GFX11-NEXT: 
v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2752,100 +2749,98 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 ; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v23, v25 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX12-NEXT: v_mov_b32_e32 v20, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] +; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2855,87 +2850,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 
v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2946,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: 
global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3127,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3176,24 +3176,24 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], 
null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir index 137488f24a331..7ca3869b535e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5(s32) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5(s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -33,7 +33,7 @@ body: | %2:vgpr(s32) = COPY %1(s32) %3:vgpr(s32) = G_FMUL %0, %2 %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5:vgpr_32 + INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5:vgpr_32 %6:vgpr(s32) = COPY %4(s32) %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32) $vgpr0 = COPY %7(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir index f372c1f81948f..59716a250ff59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-wave-address.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -regbankselect-fast -o - %s | FileCheck %s # TODO: We could use scalar --- @@ -25,8 +25,7 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = 
G_IMPLICIT_DEF ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:sgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY [[AMDGPU_WAVE_ADDRESS]](p5) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) - ; CHECK-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p1) :: (store (p5), addrspace 1) + ; CHECK-NEXT: G_STORE [[COPY]](p5), [[DEF]](p1) :: (store (p5), addrspace 1) %0:_(p1) = G_IMPLICIT_DEF %1:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32 G_STORE %1, %0 :: (store (p5), addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir index a50c7fe0748b8..fc86dd884fac0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s +# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s --- | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 5240bf4f3a1d7..9aaa9635a7da1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -547,8 +547,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; ; NEW_RBS-LABEL: loop_with_2breaks: ; NEW_RBS: ; %bb.0: ; %entry -; NEW_RBS-NEXT: s_mov_b32 s4, 0 ; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: s_mov_b32 s4, 0 ; NEW_RBS-NEXT: ; implicit-def: $sgpr5 ; NEW_RBS-NEXT: s_branch .LBB16_3 ; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..b7c84f1389197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,126 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 
v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 
v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; 
CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -218,67 +216,67 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +289,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], 
s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; 
GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] -; GISEL-NEXT: 
v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v20, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, 
v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: 
v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +659,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 
v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v16 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; 
CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +763,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v16, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +832,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: 
v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: 
v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; 
CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1041,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: 
v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; 
CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1125,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: 
v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1186,77 +1178,75 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: 
v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 @@ -1269,149 +1259,147 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, 
vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; 
GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: 
v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1424,27 +1412,26 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1452,41 +1439,40 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; 
CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -1504,12 +1490,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v1, v19, v1 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc @@ -1519,106 +1505,105 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 
v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], 
v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, 
vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1611,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1664,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v7, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 
v12, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5 ; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13 ; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9 +; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13 +; CHECK-NEXT: v_mul_hi_u32 v4, v12, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; 
CHECK-NEXT: v_mul_hi_u32 v6, v12, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; 
CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v10, 0 -; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 -; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6 +; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v13, 0 +; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v13, v11 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: 
v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; 
GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: 
v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0 +; GISEL-NEXT: 
v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 
v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2117,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; CGP-NEXT: v_trunc_f32_e32 v12, v11 -; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v17, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v17, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 +; CGP-NEXT: 
v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v18 ; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v9, v18 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8 ; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v10, v19, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v19, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v19, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, 
vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v18, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2292,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc 
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v15 ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v6, v12 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 
1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v15, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2481,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2514,194 @@ define <2 x i64> 
@v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, 
vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2757,47 +2730,47 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v6 +; CGP-NEXT: v_and_b32_e32 v9, 0xffffff, v2 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0 -; CGP-NEXT: v_mul_f32_e32 
v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v6 +; CGP-NEXT: v_rcp_f32_e32 v8, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v2, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v5 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..9d6ffc9bbc0dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -172,68 +172,68 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_subb_u32 s15, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; 
GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, 
s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -332,67 +333,66 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 
0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, 
vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, 
vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 @@ -1308,71 +1308,71 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: 
v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; 
GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,207 +1385,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9 +; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: s_add_u32 s10, s18, s6 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 
-; GFX8-NEXT: s_addc_u32 s1, s19, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s10 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 -; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_addc_u32 s11, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s2, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s3, s8 +; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v11, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, 
v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1619,69 +1618,70 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,205 +1693,203 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; 
GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: s_add_u32 s10, s18, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: s_addc_u32 s11, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s3, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: 
v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: 
v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: 
v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 
v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm @@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 ; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] ; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s18, s18, s8 ; GFX10-NEXT: s_addc_u32 s19, s19, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s2, s2, s10 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_addc_u32 s3, s3, s10 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b32 s9, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1940,256 +1938,253 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 +; GFX10-NEXT: v_trunc_f32_e32 v5, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2] +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4] +; 
GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5] +; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 +; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 -; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 +; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: 
v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 ; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 ; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 +; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 
v5, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0 +; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8 +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20 +; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: 
v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2] +; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v17, v14, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0 +; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, 
v10, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: 
v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12 +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..39cf7b01fd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 
v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -212,67 +212,67 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 
v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: 
v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,261 +372,257 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v13, 
vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; 
GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; 
GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; 
GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: 
v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,128 +647,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v2 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; 
CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v16 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_xor_b32_e32 v17, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v14, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: 
v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +816,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v12, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +973,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; 
CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 
v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1056,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; 
CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1112,71 +1108,69 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; 
GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1195,122 +1189,120 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: 
v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 
v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 
v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1346,27 +1338,26 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: 
v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1374,41 +1365,40 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; 
CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -1426,119 +1416,118 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, 
vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 
v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, 
vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1562,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; 
CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1645,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CHECK-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1708,71 +1697,69 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], 
s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1791,122 +1778,120 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc 
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 +; 
GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; 
GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1942,27 +1927,26 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1970,41 +1954,40 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: 
v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 ; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 @@ -2022,119 +2005,118 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, 
vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; 
CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2175,128 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v7, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; 
CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, 
vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: 
v_cvt_u32_f32_e32 v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; 
GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; 
GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; 
GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; 
CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v18, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v18, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v18, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v18, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2726,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v17 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v17 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 
+2791,115 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v12 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 
v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v12 +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v6, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v11, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v5 +; CGP-NEXT: 
v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2908,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +2974,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3005,192 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; 
GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 
+; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9 ; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, v[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; 
GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 
v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3230,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: 
v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 1812e17800e71..10e83b70a57d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s3, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..c50b491bcb074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -129,68 +129,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, 
vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; 
GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, 
s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -268,66 +268,67 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; 
GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: 
v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, 
s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 @@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm @@ -1005,72 
+1005,70 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: 
v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,138 +1081,138 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 +; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; 
GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_trunc_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX8-NEXT: s_sub_u32 s8, 0, s14 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: s_subb_u32 s9, 0, s15 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, 
v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 +; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, 
vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] @@ -1274,65 +1272,66 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, 
v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, 
v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,135 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: 
v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc +; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, 
v7, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 -; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] @@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 +; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 @@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v8 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: 
v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7 +; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 +; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0 +; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0 +; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4] +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 -; GFX10-NEXT: 
v_cmp_eq_u32_e64 s0, s5, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6] +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1 +; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo +; 
GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0 +; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index b33b8a7d8cd72..4a22a911c60b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; 
GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 74552a500ac51..d3ebd92f0677b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -4317,22 +4315,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 @@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -4542,129 +4540,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, 
off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5286,6 +5284,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5302,9 +5304,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,7 +5436,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5493,7 +5491,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5508,7 +5506,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5520,149 +5518,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded 
Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5670,7 +5666,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5698,7 +5696,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6755,7 +6753,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -6776,10 +6778,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7416,7 +7414,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10666,7 +10664,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11599,7 +11597,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11812,13 +11810,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11979,44 +11990,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -12025,11 +12022,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12632,7 +12629,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ 
-12646,8 +12642,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13327,13 +13323,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13470,34 +13478,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: 
buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13983,7 +13977,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14561,13 +14554,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14709,34 +14716,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15223,7 +15216,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16362,7 +16354,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16395,7 +16387,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17336,7 +17328,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17369,7 +17361,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18086,24 +18078,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18114,10 +18095,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18127,7 +18120,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18722,13 +18714,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: 
v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18956,11 +18948,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18970,11 +18962,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18982,6 +18971,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19190,12 +19181,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19213,6 +19198,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19222,7 +19213,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19235,7 +19226,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -19820,8 +19810,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -20000,16 +19990,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20036,9 +20028,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20054,14 +20045,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20073,10 +20066,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 
offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20089,10 +20083,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20106,17 +20102,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20132,45 +20133,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20221,18 +20201,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20246,6 +20214,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20683,7 +20663,7 @@ define inreg <32 x i32> 
@bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20716,7 +20696,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21573,7 +21553,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21606,7 +21586,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21624,7 +21604,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21657,7 +21637,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22514,7 +22494,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22547,7 +22527,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23717,8 +23697,17 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: s_mov_b32 s78, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: v_readlane_b32 s92, v21, 0 +; SI-NEXT: v_readlane_b32 s93, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23780,16 +23769,16 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s62, s84, 16 ; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 ; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s81, 16 +; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s80, 16 +; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s71, 16 +; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s70, 16 ; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s94, s29, 16 ; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 @@ -23814,8 +23803,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s66, s19, 16 ; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s93, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -23824,228 +23813,228 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_readlane_b32 s6, v21, 2 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_readlane_b32 s6, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: v_readlane_b32 s98, v20, 34 ; SI-NEXT: v_readlane_b32 s97, v20, 33 ; SI-NEXT: v_readlane_b32 s96, v20, 32 @@ -26129,7 +26118,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26146,9 +26138,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -26714,7 +26703,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -27322,562 +27311,737 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: 
v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; 
SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 
v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 
v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 
v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, 
v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; 
SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB19_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -27905,36 +28069,39 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB19_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB19_3 @@ -27943,580 +28110,600 @@ define inreg <32 x i32> 
@bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: 
v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; 
VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; 
VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, 
v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 
0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB19_4: ; VI-NEXT: s_branch .LBB19_2 @@ -29181,7 +29368,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -29214,7 +29401,7 @@ define inreg <32 x i32> 
@bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -29247,7 +29434,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -30049,7 +30236,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -30082,7 +30269,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -30115,7 +30302,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -30155,7 +30342,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -30188,7 +30375,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -30221,7 +30408,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -30913,7 +31100,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -30946,7 +31133,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -30979,7 +31166,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -34732,7 +34919,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -34765,7 +34952,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 
offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -34798,7 +34985,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -34876,7 +35063,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -34909,7 +35096,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -34942,7 +35129,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -35000,6 +35187,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -35016,10 +35207,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, 
off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -35051,14 +35238,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -35103,7 +35289,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 @@ -35356,7 +35541,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -36338,7 +36523,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -36370,12 +36561,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36391,7 +36576,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 
%b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -36608,7 +36792,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -37782,7 +37965,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -37815,7 +37998,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -37848,7 +38031,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -37926,7 +38109,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -37959,7 +38142,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; 
GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -37992,7 +38175,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -40033,22 +40216,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -40181,6 +40348,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -40212,14 +40395,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -40451,7 +40633,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -41245,22 +41426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -41365,6 +41530,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -41470,129 +41651,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte 
Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 
v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -42214,6 +42395,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-LABEL: bitcast_v32f32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -42230,9 +42415,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -42365,7 +42547,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; 
GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -42421,7 +42602,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -42436,7 +42617,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -42448,149 +42629,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: 
v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42598,7 +42777,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42626,7 +42807,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -43666,7 +43847,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43687,10 +43872,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44310,7 +44491,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44770,27 +44951,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44842,24 +45007,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44868,6 +45042,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44875,6 +45051,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44885,6 +45062,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45408,33 +45587,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 +; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s46 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s58 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: 
s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45468,6 +45647,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45809,17 +45989,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46687,6 +46866,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46764,6 +46947,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46841,6 +47027,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46859,40 +47048,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -46946,11 +47101,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -48123,10 +48302,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 -; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48175,6 +48352,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48222,6 +48400,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48646,7 +48825,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: 
scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48681,7 +48860,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49601,7 +49780,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49663,7 +49842,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49876,13 +50055,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50043,44 +50235,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, 
v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50089,11 +50267,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; 
SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50696,7 +50874,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50710,8 +50887,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51391,13 +51568,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51534,34 +51723,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52047,7 +52222,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52625,13 +52799,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, 
s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52773,34 +52961,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], 
s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53287,7 +53461,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54426,7 +54599,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54459,7 +54632,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55400,7 +55573,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55433,7 +55606,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56150,24 +56323,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56178,10 +56340,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56191,7 +56365,6 @@ define inreg <32 x float> 
@bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56786,13 +56959,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -57020,11 +57193,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57034,11 +57207,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57046,6 +57216,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57254,12 +57426,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57277,6 +57443,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -57286,7 +57458,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57299,7 +57471,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57884,8 +58055,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, 
v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58064,16 +58235,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58100,9 +58273,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58118,14 +58290,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58137,10 +58311,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58153,10 +58328,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58170,17 +58347,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded 
Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58196,45 +58378,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58285,18 +58446,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58310,6 +58459,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58747,7 +58908,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58780,7 +58941,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59637,7 +59798,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59670,7 +59831,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59688,7 +59849,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
@@ -59721,7 +59882,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
@@ -60578,7 +60739,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
@@ -60611,7 +60772,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
+; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
@@ -61932,214 +62093,213 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT: v_readlane_b32 s99, v63, 35
 ; SI-NEXT: v_readlane_b32 s98, v63, 34
 ; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -62176,22 +62336,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
 ; SI-NEXT: v_readlane_b32 s34, v63, 2
 ; SI-NEXT: v_readlane_b32 s31, v63, 1
 ; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -64239,7 +64400,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -64256,9 +64420,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -64824,7 +64985,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -65432,562 +65593,737 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v30
-; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: v_mov_b32_e32 v40, v12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; SI-NEXT: v_mov_b32_e32 v55, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mov_b32_e32 v43, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mov_b32_e32 v54, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v41, v23
+; SI-NEXT: v_mov_b32_e32 v29, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB43_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB43_3
 ; SI-NEXT: .LBB43_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB43_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB43_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; 
SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -66015,36 +66351,39 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 @@ -66053,580 +66392,600 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; 
VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, 
vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; 
VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, 
vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, 
v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; 
VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; 
VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: 
v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, 
v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: ; VI-NEXT: s_branch .LBB43_2 @@ -67291,7 +67650,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67324,7 +67683,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67357,7 +67716,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68159,7 +68518,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68192,7 +68551,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68225,7 +68584,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68265,7 +68624,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68298,7 +68657,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68331,7 +68690,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, 
s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -69023,7 +69382,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69056,7 +69415,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69089,7 +69448,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -72813,7 +73172,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72846,7 +73205,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72879,7 +73238,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72957,7 +73316,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; 
GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -72990,7 +73349,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -73023,7 +73382,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73081,6 +73440,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73097,10 +73460,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73132,14 +73491,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73184,7 +73542,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: 
s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73437,7 +73794,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -74373,7 +74730,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74405,12 +74768,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74426,7 +74783,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74643,7 +74999,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75817,7 +76172,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -75850,7 +76205,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75883,7 +76238,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75961,7 +76316,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75994,7 +76349,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -76027,7 +76382,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77054,22 +77409,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77202,6 +77541,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77233,14 +77588,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77501,7 +77855,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -78266,22 +78619,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78386,6 +78723,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78491,129 +78844,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 
4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79235,6 +79588,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79251,9 +79608,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79386,7 +79740,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79442,7 +79795,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79457,7 +79810,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79469,149 +79822,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79619,7 +79970,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79676,7 +80029,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt 
vmcnt(44) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -80712,7 +81065,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80733,10 +81090,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81381,7 +81734,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84631,7 +84984,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85566,7 +85919,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85779,13 +86132,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85946,44 +86312,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85992,11 +86344,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86599,7 +86951,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86613,8 +86964,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87294,13 +87645,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, 
s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87437,34 +87800,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87950,7 +88299,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88528,13 +88876,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88676,34 +89038,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89190,7 +89538,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90329,7 +90676,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90362,7 +90709,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91303,7 +91650,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91336,7 +91683,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -92053,24 +92400,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92081,10 +92417,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92094,7 +92442,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92689,13 +93036,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92923,11 +93270,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: 
s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92937,11 +93284,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92949,6 +93293,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93157,12 +93503,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93180,6 +93520,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93189,7 +93535,7 @@ define inreg <16 x i64> 
@bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93202,7 +93548,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -93787,8 +94132,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93967,16 +94312,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -94003,9 +94350,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte 
Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -94021,14 +94367,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -94040,10 +94388,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94056,10 +94405,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 
v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94073,17 +94424,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94099,45 +94455,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], 
s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94188,18 +94523,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94213,6 +94536,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94650,7 +94985,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94683,7 +95018,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95540,7 +95875,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95573,7 +95908,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95591,7 +95926,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95624,7 +95959,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96481,7 +96816,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96514,7 +96849,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97697,229 +98032,229 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_writelane_b32 v21, s8, 3 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s68
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_readlane_b32 s4, v21, 2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: v_readlane_b32 s4, v21, 2
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT: v_readlane_b32 s4, v21, 0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: v_readlane_b32 s99, v20, 35
@@ -100084,7 +100419,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -100101,9 +100439,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -100669,7 +101004,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -101277,562 +101612,737 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v30
-; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: v_mov_b32_e32 v40, v12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; SI-NEXT: v_mov_b32_e32 v55, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mov_b32_e32 v43, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mov_b32_e32 v54, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v41, v23
+; SI-NEXT: v_mov_b32_e32 v29, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB63_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB63_3
 ; SI-NEXT: .LBB63_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB63_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB63_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47
 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33
 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37
 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-;
+-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
+-;
SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 
0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -101860,36 +102370,39 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB63_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB63_3 @@ -101898,580 +102411,600 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, 
v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: 
v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 
16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: 
v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, 
v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: 
v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: 
v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, 
v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_4: ; VI-NEXT: s_branch .LBB63_2 @@ -103136,7 +103669,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103169,7 +103702,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 
offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103202,7 +103735,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -104004,7 +104537,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -104037,7 +104570,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -104070,7 +104603,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104110,7 +104643,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104143,7 +104676,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 
-; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104176,7 +104709,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104868,7 +105401,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104901,7 +105434,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104934,7 +105467,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -108700,7 +109233,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108733,7 +109266,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: 
scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -108766,7 +109299,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108844,7 +109377,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108877,7 +109410,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108910,7 +109443,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108968,6 +109501,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108984,10 +109521,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: 
$vgpr63 @@ -109019,14 +109552,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109099,7 +109631,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109322,7 +109853,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -110320,7 +110851,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110352,12 +110889,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -110373,7 +110904,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; 
SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110590,7 +111120,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111764,7 +112293,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111797,7 +112326,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111830,7 +112359,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111908,7 +112437,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111941,7 +112470,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111974,7 +112503,7 @@ 
define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -112032,22 +112561,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112180,6 +112693,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112211,14 +112740,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112449,7 +112977,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -113228,22 +113755,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113346,6 +113857,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113448,132 +113975,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b64 v[39:40], 24, v[23:24] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -114184,6 +114711,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114200,9 +114731,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114335,7 +114863,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114395,7 +114922,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114408,7 +114935,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(49) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114416,152 +114943,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114571,6 +115097,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114599,7 +115126,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -115628,7 +116155,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115649,10 +116180,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116272,7 +116799,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, 
off, s32 offset:20 @@ -117056,6 +117583,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117148,6 +117680,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117204,20 +117740,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, 
s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117260,165 +117845,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 27 -; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -118690,6 +119218,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 
12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118727,6 +119259,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118764,6 +119299,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118779,52 +119317,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118834,6 +119326,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118846,17 +119339,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118886,13 +119379,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119906,6 +120434,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119982,6 +120516,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 @@ -120040,71 +120578,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -120119,6 +120596,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120143,7 +120624,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120181,6 +120661,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120202,6 +120730,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120252,22 +120782,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -120599,7 +121127,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120634,7 +121162,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121542,7 +122070,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121605,7 +122133,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121818,13 +122346,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) 
expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121985,44 +122526,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 
; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -122031,11 +122558,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122638,7 +123165,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122652,8 +123178,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123333,13 +123859,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123476,34 +124014,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123989,7 +124513,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:508 ; 4-byte Folded Reload @@ -124567,13 +125090,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124715,34 +125252,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort 
v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125229,7 +125752,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126368,7 +126890,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126401,7 +126923,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127342,7 +127864,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127375,7 +127897,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: 
scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128092,24 +128614,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128120,10 +128631,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 
s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128133,7 +128656,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128728,13 +129250,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128962,11 +129484,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128976,11 +129498,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128988,6 +129507,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129196,12 +129717,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129219,6 +129734,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129228,7 +129749,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129241,7 +129762,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, 
vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129826,8 +130346,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -130006,16 +130526,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -130042,9 +130564,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -130060,14 +130581,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 
; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -130079,10 +130602,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130095,10 +130619,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130112,17 +130638,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130138,45 +130669,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130227,18 +130737,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:580 ; 4-byte Folded Reload @@ -130252,6 +130750,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130689,7 +131199,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130722,7 +131232,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131579,7 +132089,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131612,7 +132122,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; 
GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131630,7 +132140,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131663,7 +132173,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132520,7 +133030,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132553,7 +133063,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132588,22 +133098,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132672,6 +133166,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132703,7 +133213,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132713,7 +133223,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132843,7 +133353,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 
v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -133554,94 +134063,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 ; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], 
s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, 
v1 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB77_5 @@ -133716,15 +134223,18 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v1, s59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s36 -; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v1, s58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -133732,328 +134242,329 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v5, s59 -; SI-NEXT: v_mov_b32_e32 v4, s58 -; SI-NEXT: v_mov_b32_e32 v9, s57 -; SI-NEXT: v_mov_b32_e32 v6, s56 -; SI-NEXT: v_mov_b32_e32 v13, s99 -; SI-NEXT: v_mov_b32_e32 v10, s98 -; SI-NEXT: v_mov_b32_e32 v17, s97 -; SI-NEXT: v_mov_b32_e32 v14, s96 -; SI-NEXT: v_mov_b32_e32 v21, s87 -; SI-NEXT: v_mov_b32_e32 v18, s86 -; SI-NEXT: v_mov_b32_e32 v25, s85 -; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v37, s71 -; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v39, s69 -; SI-NEXT: v_mov_b32_e32 v38, s68 -; SI-NEXT: v_mov_b32_e32 v53, s67 -; SI-NEXT: v_mov_b32_e32 v48, s66 -; SI-NEXT: v_mov_b32_e32 v55, s65 -; SI-NEXT: v_mov_b32_e32 v54, s64 -; SI-NEXT: v_mov_b32_e32 v43, s55 -; SI-NEXT: v_mov_b32_e32 v40, s54 -; SI-NEXT: v_mov_b32_e32 v45, s53 -; SI-NEXT: v_mov_b32_e32 v44, s52 -; SI-NEXT: v_mov_b32_e32 v47, s51 -; SI-NEXT: v_mov_b32_e32 v46, s50 -; SI-NEXT: v_mov_b32_e32 v57, s49 -; SI-NEXT: v_mov_b32_e32 v56, s48 -; SI-NEXT: v_mov_b32_e32 v61, s39 -; SI-NEXT: v_mov_b32_e32 v58, s38 -; SI-NEXT: v_mov_b32_e32 v8, s35 -; SI-NEXT: v_mov_b32_e32 v24, s31 -; SI-NEXT: v_mov_b32_e32 v23, s30 +; SI-NEXT: v_mov_b32_e32 v6, s99 +; SI-NEXT: v_mov_b32_e32 v5, s98 +; SI-NEXT: v_mov_b32_e32 v8, s97 +; SI-NEXT: v_mov_b32_e32 v7, s96 +; SI-NEXT: v_mov_b32_e32 v10, s87 +; SI-NEXT: v_mov_b32_e32 v9, s86 +; SI-NEXT: v_mov_b32_e32 v12, s85 +; SI-NEXT: v_mov_b32_e32 v11, s84 +; SI-NEXT: v_mov_b32_e32 v14, s83 +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: v_mov_b32_e32 v16, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v18, s71 +; SI-NEXT: v_mov_b32_e32 v17, s70 +; SI-NEXT: 
v_mov_b32_e32 v20, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 +; SI-NEXT: v_mov_b32_e32 v22, s67 +; SI-NEXT: v_mov_b32_e32 v21, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v26, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v30, s51 +; SI-NEXT: v_mov_b32_e32 v29, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v34, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v16, s91 -; SI-NEXT: v_mov_b32_e32 v15, s90 -; SI-NEXT: v_mov_b32_e32 v28, s89 -; SI-NEXT: v_mov_b32_e32 v27, s88 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v11, s77 -; SI-NEXT: v_mov_b32_e32 v12, s76 -; SI-NEXT: v_mov_b32_e32 v32, s75 -; SI-NEXT: v_mov_b32_e32 v31, s74 -; SI-NEXT: v_mov_b32_e32 v19, s73 -; SI-NEXT: v_mov_b32_e32 v20, s72 -; SI-NEXT: v_mov_b32_e32 v36, s63 -; SI-NEXT: v_mov_b32_e32 v35, s62 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: 
v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -136071,7 +136582,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -136088,9 +136602,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136656,7 +137167,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -137264,562 +137775,737 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; 
SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; 
SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, 
v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: 
v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, 
off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: 
v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -137847,36 +138533,39 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB79_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB79_3 @@ -137885,580 +138574,600 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, 
v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; 
VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 
16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; 
VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB79_4: ; VI-NEXT: s_branch .LBB79_2 @@ -139123,7 +139832,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139156,7 +139865,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139189,7 +139898,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139991,7 +140700,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -140024,7 +140733,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -140057,7 +140766,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140097,7 +140806,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140130,7 +140839,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140163,7 +140872,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, 
s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140855,7 +141564,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140888,7 +141597,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140921,7 +141630,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140978,22 +141687,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: 
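The annotations added after each s_clause in these hunks spell out the spill size: s_clause IMM fuses the next IMM+1 memory instructions into a single clause, and each scratch_store_b32 or scratch_load_b32 moves 4 bytes, so 0x1f is a 128-byte folded spill, 0x6 is 28 bytes, 0x8 is 36, and 0x9 is 40. A throwaway C++ check of that arithmetic (helper name is made up):

#include <cstdio>

// s_clause IMM covers the IMM+1 following memory instructions;
// every scratch_store_b32/scratch_load_b32 transfers 4 bytes.
static unsigned clauseBytes(unsigned imm) { return (imm + 1) * 4; }

int main() {
  const unsigned imms[] = {0x1f, 0x6, 0x8, 0x9, 0xf};
  for (unsigned imm : imms)
    std::printf("s_clause 0x%x -> %u-byte Folded Spill/Reload\n", imm,
                clauseBytes(imm));
  return 0;
}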
buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -141062,6 +141755,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -141093,7 +141802,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141144,7 +141853,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141314,7 +142022,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -144567,7 +145274,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 
offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144600,7 +145307,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144633,7 +145340,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144711,7 +145418,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144744,7 +145451,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144777,7 +145484,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144835,6 +145542,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144851,10 +145562,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 
offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144886,14 +145593,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144937,7 +145643,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145175,7 +145880,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -146031,7 +146736,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -146063,12 +146774,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; 
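The s_waitcnt rewrites in these SI hunks track the reordered memory traffic: vmcnt(N) stalls until at most N vector-memory operations remain outstanding, retired in issue order, so once the sixteen callee-save spill stores are issued after the three incoming loads instead of before them, a wait of vmcnt(14) (in place of the old vmcnt(1)) is what leaves the oldest operations, including the load of v33, guaranteed complete. A toy C++ model of the counter, with the op counts taken from the hunk and everything else illustrative:

#include <cstdio>

// Toy vmcnt model: each VMEM op bumps the outstanding count; a wait of
// vmcnt(N) retires the oldest ops until at most N remain in flight.
int main() {
  int outstanding = 0;
  auto issue = [&](const char *op, int n) {
    outstanding += n;
    std::printf("issue %2d x %-26s outstanding=%d\n", n, op, outstanding);
  };
  auto wait = [&](int n) {
    int retired = outstanding > n ? outstanding - n : 0;
    outstanding -= retired;
    std::printf("s_waitcnt vmcnt(%d): %d oldest ops retired\n", n, retired);
  };
  issue("buffer_load_dword", 3);           // v32, v33, v31
  issue("buffer_store_dword (spill)", 16); // v40..v63 callee saves
  wait(14); // retires the 5 oldest ops, so all three loads are done
  return 0;
}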
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -146084,7 +146789,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146301,7 +147005,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147475,7 +148178,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147508,7 +148211,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147541,7 +148244,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147619,7 +148322,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147652,7 +148355,7 @@ define inreg <16 x double> 
@bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147685,7 +148388,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147895,6 +148598,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147904,7 +148609,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147944,38 +148649,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147991,11 +148697,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -148017,14 +148724,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -148032,11 +148731,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: 
v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -148045,9 +148748,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -148057,7 +148762,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149940,8 +150645,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -150037,13 +150742,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150171,14 +150888,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150186,26 +150908,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150214,35 +150916,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload 
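The v_or_b32_sdwa lines below rebuild dwords out of <128 x i8> pieces without separate shift instructions: SDWA picks a byte or word sub-register of each source (zero-extended), ORs them, and dst_sel:WORD_1 with dst_unused:UNUSED_PAD drops the 16-bit result into the high word of the destination while zeroing the rest. A small C++ emulation covering just the selects this code uses, with the enum and helper names invented for the sketch:

#include <cstdint>
#include <cstdio>

enum Sel { BYTE_0, WORD_0, WORD_1, DWORD };

// Operand select: extract the chosen sub-register, zero-extended.
static uint32_t sdwaSel(uint32_t v, Sel s) {
  switch (s) {
  case BYTE_0: return v & 0xff;
  case WORD_0: return v & 0xffff;
  case WORD_1: return (v >> 16) & 0xffff;
  default:     return v;
  }
}

// OR the selected operands, then place the result per dst_sel
// (only the two dst_sel cases appearing in this hunk are modeled).
static uint32_t v_or_b32_sdwa(uint32_t s0, Sel sel0, uint32_t s1, Sel sel1,
                              Sel dst) {
  uint32_t r = sdwaSel(s0, sel0) | sdwaSel(s1, sel1);
  return dst == WORD_1 ? (r & 0xffff) << 16 : r; // UNUSED_PAD zeroes rest
}

int main() {
  uint32_t lo8 = 0x12, hi8 = 0x3400; // hi8 came from v_lshlrev_b16 8, x
  uint32_t hiWord = v_or_b32_sdwa(lo8, BYTE_0, hi8, DWORD, WORD_1);
  uint32_t packed = v_or_b32_sdwa(0x5678, WORD_0, hiWord, DWORD, DWORD);
  std::printf("0x%08x\n", packed); // prints 0x34125678
  return 0;
}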
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: 
$vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150275,39 +150999,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: 
$vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150473,17 +151177,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151168,8 +151864,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151280,13 +151976,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 
4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151419,14 +152129,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151434,26 +152149,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151462,36 +152157,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; 
GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151514,49 +152235,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
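On GFX9 the final 16-bit pairing uses v_perm_b32 against the selector 0x5040100 held in s6: each selector byte chooses one byte out of the pair {src0:src1}, with src1 supplying bytes 0-3 and src0 bytes 4-7, so 0x05040100 concatenates the two low words. A C++ sketch of that selector semantics (constant-producing selector values of 8 and above are left out):

#include <cstdint>
#include <cstdio>

// Toy v_perm_b32: selector byte i picks byte sel[i] out of the 8-byte
// pool {src0:src1}; only byte-picking selectors (0..7) are modeled.
static uint32_t v_perm_b32(uint32_t src0, uint32_t src1, uint32_t sel) {
  uint64_t pool = ((uint64_t)src0 << 32) | src1;
  uint32_t d = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t s = (sel >> (8 * i)) & 0xff;
    d |= (uint32_t)((pool >> (8 * s)) & 0xff) << (8 * i);
  }
  return d;
}

int main() {
  // 0x5040100 packs the low words: (src0 & 0xffff) << 16 | (src1 & 0xffff)
  std::printf("0x%08x\n", v_perm_b32(0xAAAA1234, 0xBBBB5678, 0x5040100));
  // prints 0x12345678
  return 0;
}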
v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151722,17 +152419,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -153078,7 +153767,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -153111,7 +153800,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153940,7 +154629,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153973,7 +154662,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -154017,8 +154706,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -154027,14 +154716,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v44, s19, 0 +; SI-NEXT: v_writelane_b32 v44, s18, 1 +; SI-NEXT: v_writelane_b32 v44, s17, 2 +; SI-NEXT: v_writelane_b32 v44, s16, 3 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -154059,7 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: s_mov_b32 s74, 
s29 +; SI-NEXT: s_mov_b32 s78, s28 ; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 @@ -154070,39 +154759,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, s51, 5 +; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_writelane_b32 v43, s52, 7 ; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_writelane_b32 v43, s53, 8 ; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_writelane_b32 v43, s54, 9 +; SI-NEXT: v_writelane_b32 v43, s55, 10 +; SI-NEXT: s_mov_b32 s57, s24 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -154110,33 +154803,34 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: 
v_writelane_b32 v43, s4, 5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 6 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_writelane_b32 v44, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s77, v4 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154147,22 +154841,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s34, v16 ; SI-NEXT: v_readfirstlane_b32 s35, v15 ; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s24, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154175,33 +154870,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s75, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s61, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s63, v37 ; SI-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s56, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s43, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s46, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s42, v49 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s13, v50 ; SI-NEXT: s_waitcnt vmcnt(6) @@ -154214,49 +154909,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s88, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_writelane_b32 v44, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_writelane_b32 v44, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_writelane_b32 v44, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_writelane_b32 v44, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_writelane_b32 v44, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: 
v_writelane_b32 v44, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154269,43 +154962,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_writelane_b32 v44, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: v_writelane_b32 v44, s4, 33 +; SI-NEXT: v_writelane_b32 v44, s22, 34 +; SI-NEXT: v_writelane_b32 v44, s23, 35 +; SI-NEXT: v_writelane_b32 v44, s73, 36 +; SI-NEXT: v_writelane_b32 v44, s20, 37 +; SI-NEXT: v_writelane_b32 v44, s47, 38 +; SI-NEXT: v_writelane_b32 v44, s76, 39 +; SI-NEXT: v_writelane_b32 v44, s25, 40 +; SI-NEXT: v_writelane_b32 v44, s57, 41 +; SI-NEXT: v_writelane_b32 v44, s74, 42 +; SI-NEXT: v_writelane_b32 v44, s78, 43 +; SI-NEXT: v_writelane_b32 v44, s24, 44 +; SI-NEXT: v_writelane_b32 v44, s16, 45 +; SI-NEXT: v_writelane_b32 v44, s17, 46 +; SI-NEXT: v_writelane_b32 v44, s18, 47 +; SI-NEXT: v_writelane_b32 v44, s19, 48 +; SI-NEXT: v_writelane_b32 v44, s77, 49 +; SI-NEXT: v_writelane_b32 v44, s89, 50 +; SI-NEXT: v_writelane_b32 v44, s90, 51 +; SI-NEXT: v_writelane_b32 v44, s91, 52 +; SI-NEXT: v_writelane_b32 v44, s92, 53 +; SI-NEXT: v_writelane_b32 v44, s93, 54 +; SI-NEXT: v_writelane_b32 v44, s94, 55 +; SI-NEXT: v_writelane_b32 v44, s95, 56 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: 
v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -154313,7 +155004,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -154344,32 +155035,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 +; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 +; SI-NEXT: v_writelane_b32 v44, s30, 59 +; SI-NEXT: v_writelane_b32 v44, s31, 60 +; SI-NEXT: v_writelane_b32 v44, s34, 61 +; SI-NEXT: v_writelane_b32 v44, s35, 62 +; SI-NEXT: v_writelane_b32 v44, s36, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s62, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: v_readfirstlane_b32 s81, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s99, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: v_readfirstlane_b32 s82, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -154381,9 +155071,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: v_readfirstlane_b32 s96, v50 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s7, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 @@ -154392,7 +155082,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s97, v32 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -154413,144 +155103,146 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v43, s64, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v43, s65, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, s8, 17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_writelane_b32 v43, s67, 13 +; SI-NEXT: v_writelane_b32 v43, s84, 14 +; SI-NEXT: v_writelane_b32 v43, s85, 15 +; SI-NEXT: v_writelane_b32 v43, s86, 16 +; SI-NEXT: v_writelane_b32 v43, s87, 17 +; SI-NEXT: v_writelane_b32 v43, s8, 18 +; SI-NEXT: v_writelane_b32 v43, s99, 19 +; SI-NEXT: v_writelane_b32 v43, s12, 20 +; SI-NEXT: v_writelane_b32 v43, s44, 21 +; SI-NEXT: v_writelane_b32 v43, s97, 22 +; SI-NEXT: v_writelane_b32 v43, s15, 23 +; SI-NEXT: v_writelane_b32 v43, s96, 24 +; SI-NEXT: v_writelane_b32 v43, s98, 25 +; SI-NEXT: v_writelane_b32 v43, s83, 26 +; SI-NEXT: v_writelane_b32 v43, s82, 27 +; SI-NEXT: v_writelane_b32 v43, s9, 28 +; SI-NEXT: v_writelane_b32 v43, s81, 29 +; SI-NEXT: v_writelane_b32 v43, s80, 30 +; SI-NEXT: v_writelane_b32 v43, s7, 31 +; SI-NEXT: v_writelane_b32 v43, s72, 32 +; SI-NEXT: v_writelane_b32 v43, s26, 33 +; SI-NEXT: v_writelane_b32 v43, s41, 34 +; SI-NEXT: v_writelane_b32 v43, s14, 35 +; SI-NEXT: v_writelane_b32 v43, s69, 36 +; SI-NEXT: v_writelane_b32 v43, s71, 37 +; SI-NEXT: v_writelane_b32 v43, s70, 38 +; SI-NEXT: v_writelane_b32 v43, s68, 39 +; SI-NEXT: v_writelane_b32 v43, s60, 40 +; SI-NEXT: v_writelane_b32 v43, s62, 41 +; SI-NEXT: v_writelane_b32 v43, s11, 42 +; SI-NEXT: v_writelane_b32 v43, s10, 43 +; SI-NEXT: v_writelane_b32 v43, s58, 44 +; SI-NEXT: v_writelane_b32 v43, s66, 45 +; SI-NEXT: v_writelane_b32 v43, s29, 46 +; SI-NEXT: v_writelane_b32 v43, s28, 47 +; SI-NEXT: v_writelane_b32 v43, s27, 48 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: v_readlane_b32 s4, v44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: 
v_readlane_b32 s5, v43, 2 +; SI-NEXT: v_readlane_b32 s5, v44, 2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 60 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_writelane_b32 v43, s4, 61 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: v_writelane_b32 v43, s4, 62 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s78, 0xff +; SI-NEXT: s_lshl_b32 s6, s74, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: v_writelane_b32 v43, s4, 63 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s77, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 0 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 1 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_or_b32 s76, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_or_b32 s77, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_or_b32 s25, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_or_b32 s74, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_or_b32 s78, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: 
s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_mov_b32 s31, s88 +; SI-NEXT: s_or_b32 s88, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_or_b32 s89, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 49 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_mov_b32 s73, s79 +; SI-NEXT: s_or_b32 s79, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 @@ -154561,7 +155253,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_or_b32 s95, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -154577,217 +155269,226 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_and_b32 s19, s96, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s12, 50 +; SI-NEXT: s_or_b32 s12, s20, s19 ; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s81, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 52 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 54 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s60, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s9, 53 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, 
s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s57, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 55 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 56 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: v_writelane_b32 v43, s9, 57 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 33 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 -; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 32 +; SI-NEXT: s_or_b32 s10, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: v_readlane_b32 s9, v44, 31 ; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v44, 30 ; SI-NEXT: s_or_b32 s86, s19, s20 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v44, 29 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 28 +; SI-NEXT: s_or_b32 s47, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: v_readlane_b32 s9, v44, 27 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 26 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 25 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 24 +; SI-NEXT: s_or_b32 s24, s20, s19 +; SI-NEXT: s_mov_b32 s92, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 23 +; SI-NEXT: s_mov_b32 s36, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v44, 22 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_mov_b32 s62, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 21 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_mov_b32 s30, s11 +; SI-NEXT: 
s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 20 +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_mov_b32 s91, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 19 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_mov_b32 s35, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 18 +; SI-NEXT: s_mov_b32 s4, s46 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_mov_b32 s52, s73 +; SI-NEXT: s_or_b32 s73, s20, s19 +; SI-NEXT: s_and_b32 s19, s31, 0xff ; SI-NEXT: s_lshl_b32 s20, s45, 8 ; SI-NEXT: s_or_b32 s26, s19, s20 ; SI-NEXT: s_and_b32 s19, s13, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_lshl_b32 s20, s42, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_or_b32 s42, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s59, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 ; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_lshl_b32 s20, s61, 8 +; SI-NEXT: v_readlane_b32 s93, v44, 17 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s59, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 -; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 16 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: s_mov_b32 s45, 
s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 10 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: v_readlane_b32 s7, v44, 10 +; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 6 +; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s43, s93 +; SI-NEXT: s_mov_b32 s93, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_mov_b32 s34, s4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: v_readlane_b32 s4, v43, 60 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s63, s95 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s61, s8 +; SI-NEXT: s_mov_b32 s60, s40 ; SI-NEXT: s_mov_b32 s12, s7 ; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_or_b32 s15, s20, s19 ; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s5, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -154799,16 +155500,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v43, 58 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v43, 59 +; SI-NEXT: v_readlane_b32 s66, v43, 63 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 
s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, v43, 62 +; SI-NEXT: v_readlane_b32 s65, v43, 61 +; SI-NEXT: v_readlane_b32 s64, v42, 0 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -154823,10 +155524,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s72, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s82, 8 +; SI-NEXT: s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s96, 24 @@ -154835,10 +155536,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s14, 8 +; SI-NEXT: s_add_i32 s17, s81, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -154849,7 +155550,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_lshl_b32 s17, s39, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -154859,150 +155560,143 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s50, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s49, 8 +; SI-NEXT: s_add_i32 s19, s60, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s51, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s55, 8 +; SI-NEXT: s_add_i32 s22, s54, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: 
s_lshl_b32 s20, s53, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_lshl_b32 s22, s35, 8 +; SI-NEXT: s_add_i32 s23, s38, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 -; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s22, s52, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_lshl_b32 s23, s36, 8 +; SI-NEXT: s_add_i32 s60, s62, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 -; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: v_readlane_b32 s7, v43, 48 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: v_readlane_b32 s7, v43, 47 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 33 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 44 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: 
v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 45 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: v_readlane_b32 s7, v43, 44 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s7, v43, 43 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: v_readlane_b32 s7, v43, 42 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: v_readlane_b32 s7, v43, 39 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: v_readlane_b32 s7, v43, 36 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: v_readlane_b32 s7, v43, 35 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v43, 41 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v43, 40 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: v_readlane_b32 s7, v43, 38 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: v_readlane_b32 s7, v43, 37 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -155017,15 +155711,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 32 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -155033,47 +155727,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 33 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 17 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 34 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: 
s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 21 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 22 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -155081,15 +155775,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: v_readlane_b32 s9, v43, 20 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: v_readlane_b32 s11, v43, 18 +; SI-NEXT: v_readlane_b32 s12, v43, 15 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: v_readlane_b32 s11, v43, 16 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -155097,15 +155791,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: v_readlane_b32 s11, v43, 14 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 -; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: v_readlane_b32 s12, v43, 13 +; SI-NEXT: v_readlane_b32 s13, v43, 11 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 12 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -155113,16 +155807,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s12, v43, 10 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: v_readlane_b32 s16, v42, 6 +; SI-NEXT: v_readlane_b32 s13, v43, 9 +; SI-NEXT: v_readlane_b32 s16, v43, 7 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: 
v_readlane_b32 s13, v42, 7 +; SI-NEXT: v_readlane_b32 s13, v43, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155130,16 +155824,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s13, v43, 6 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: v_readlane_b32 s17, v43, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 4 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155147,16 +155841,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s16, v43, 2 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 1 +; SI-NEXT: v_readlane_b32 s18, v44, 63 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 0 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -155165,16 +155859,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: v_readlane_b32 s16, v44, 62 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v44, 61 +; SI-NEXT: v_readlane_b32 s19, v44, 59 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v44, 60 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155182,16 +155876,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v44, 58 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: v_readlane_b32 s19, v44, 57 +; SI-NEXT: v_readlane_b32 s20, v44, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: v_readlane_b32 s19, v44, 56 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, 
s19, 24 @@ -155199,15 +155893,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s19, v44, 54 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 53 +; SI-NEXT: v_readlane_b32 s21, v44, 51 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 52 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155215,16 +155909,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s20, v44, 50 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: v_readlane_b32 s21, v44, 49 +; SI-NEXT: v_readlane_b32 s22, v44, 47 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 48 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155233,16 +155927,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 43 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: v_readlane_b32 s22, v44, 42 +; SI-NEXT: v_readlane_b32 s23, v44, 45 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 46 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155251,15 +155945,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 41 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 40 +; SI-NEXT: v_readlane_b32 s24, v44, 38 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155268,361 +155962,367 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; 
SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 37 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 36 +; SI-NEXT: v_readlane_b32 s25, v44, 34 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 35 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 3 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: v_readlane_b32 s24, v44, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: v_readlane_b32 s25, v44, 2 +; SI-NEXT: v_readlane_b32 s26, v44, 1 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s17, 16 +; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 49 +; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 50 +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: v_writelane_b32 v43, s7, 52 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_or_b32 
s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s22, 16 +; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s12, 16 +; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s47, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 
s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 -; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s15, 16 +; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: v_writelane_b32 v43, s7, 57 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v43, 49 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 
v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 57 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 
; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -155666,6 +156366,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -155676,99 +156377,109 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s92, v44, 24 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 20 +; SI-NEXT: s_mov_b32 s90, s88 +; SI-NEXT: v_readlane_b32 s36, v44, 23 +; SI-NEXT: v_readlane_b32 
s35, v44, 19 +; SI-NEXT: v_readlane_b32 s62, v44, 22 +; SI-NEXT: v_readlane_b32 s38, v44, 18 +; SI-NEXT: s_mov_b32 s34, s46 +; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: v_readlane_b32 s72, v44, 10 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: v_readlane_b32 s30, v44, 21 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s52, s79 +; SI-NEXT: v_readlane_b32 s98, v44, 6 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: v_readlane_b32 s43, v44, 17 +; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: v_readlane_b32 s41, v44, 14 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: v_readlane_b32 s14, v44, 13 +; SI-NEXT: v_readlane_b32 s44, v44, 5 +; SI-NEXT: v_readlane_b32 s9, v44, 11 +; SI-NEXT: v_readlane_b32 s81, v44, 12 +; SI-NEXT: v_readlane_b32 s82, v44, 9 +; SI-NEXT: v_readlane_b32 s10, v44, 16 +; SI-NEXT: v_readlane_b32 s12, v44, 4 +; SI-NEXT: v_readlane_b32 s96, v44, 7 +; SI-NEXT: v_readlane_b32 s83, v44, 8 +; SI-NEXT: v_readlane_b32 s71, v44, 15 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; kill: killed $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; 
implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -155894,33 +156605,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155965,52 +156696,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort 
v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -156030,6 +156715,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -156038,7 +156724,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -156070,6 +156755,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -156094,15 +156798,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156152,10 +156859,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156163,50 +156871,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156221,13 +156916,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156249,21 +156943,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156281,11 +156982,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156318,7 +157018,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156339,6 +157038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156930,29 +157630,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: 
buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -157016,82 +157738,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: 
buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte 
Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -157112,6 +157794,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157365,14 +158054,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157382,7 +158070,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte 
Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157394,6 +158081,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157859,7 +158547,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158589,7 +159277,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158631,7 +159319,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159415,7 +160103,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161506,6 +162194,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 
4-byte Folded Spill @@ -161522,9 +162213,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161713,166 +162401,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 ; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: 
v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161885,6 +162572,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162518,27 +163206,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 
offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -163282,49 +163970,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163338,6 +163988,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163355,130 +164006,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163571,16 +164260,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, 
v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163735,7 +164419,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: s_waitcnt vmcnt(50) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163750,7 +164434,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163891,8 +164574,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163958,6 +164643,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163965,7 +164651,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163994,24 +164680,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -164031,12 +164707,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -164045,6 +164728,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -164052,12 +164753,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -164101,7 +164798,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164134,74 +164830,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164214,15 +164887,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164231,6 +164915,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164255,31 +164942,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164294,6 +164983,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164302,6 +164995,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164310,38 +165005,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164679,7 +165365,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164712,7 +165402,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164741,10 +165431,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165778,7 +166464,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165811,7 +166497,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165846,7 +166532,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165869,10 +166559,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166991,7 +167677,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -167038,40 +167724,43 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 @@ -167104,1627 +167793,1841 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_mov_b32_e32 v46, v21 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 
v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 -; 
SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, 
s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_mov_b32_e32 v42, v37 -; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_readfirstlane_b32 s5, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_readfirstlane_b32 s5, v47 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58 -; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 -; SI-NEXT: s_lshr_b64 
s[28:29], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v4, v58 -; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_readfirstlane_b32 s5, v58 -; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v23, v22 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v41 -; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v59 -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8 -; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v60 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshr_b32 s5, s4, 16 
+; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: s_lshr_b32 s7, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: s_lshr_b32 s65, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: s_lshr_b32 s69, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b32 s91, s4, 16 +; SI-NEXT: v_mov_b32_e32 v30, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshr_b32 s37, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23 -; SI-NEXT: v_mov_b32_e32 v5, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: v_readfirstlane_b32 s5, v25 -; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v2, v30, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_readfirstlane_b32 s5, v22 -; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: 
v_readfirstlane_b32 s5, v17 -; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b32 s89, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: s_lshr_b32 s57, s4, 16 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s79, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_mov_b32_e32 v20, v21 +; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v23 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: v_mov_b32_e32 v21, v25 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: 
v_mov_b32_e32 v44, v1 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v29 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v40 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: v_readfirstlane_b32 s88, v60 +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_readfirstlane_b32 s68, v48 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: v_readfirstlane_b32 s90, v30 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: v_readfirstlane_b32 s36, v3 +; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: v_readfirstlane_b32 s56, v7 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s74 +; SI-NEXT: v_readfirstlane_b32 s72, v19 +; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s60 +; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 +; SI-NEXT: s_mov_b32 s63, s54 +; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s42 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_readfirstlane_b32 s28, v26 +; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s26 +; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: v_readfirstlane_b32 s18, v49 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_mov_b32_e32 v1, v56 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 +; SI-NEXT: s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s64, s74 +; SI-NEXT: s_lshr_b32 s27, s74, 8 +; SI-NEXT: s_mov_b32 s90, s60 +; SI-NEXT: s_lshr_b32 s28, s60, 8 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_mov_b32 s68, s42 +; SI-NEXT: s_mov_b32 s56, s26 ; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43 -; SI-NEXT: v_mov_b32_e32 v31, v20 -; SI-NEXT: v_mov_b32_e32 v20, v34 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 
-; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s5, v30 -; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_readfirstlane_b32 s5, v57 -; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_readfirstlane_b32 s5, v46 -; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_mov_b32_e32 v32, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9 +; 
SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: s_lshr_b32 s23, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_lshr_b32 s13, s4, 16 +; SI-NEXT: s_mov_b32 s5, s13 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: s_mov_b32 s88, vcc_lo +; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9 +; SI-NEXT: s_lshr_b64 vcc, s[86:87], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15 +; SI-NEXT: s_lshr_b64 vcc, s[80:81], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21 +; SI-NEXT: s_lshr_b64 vcc, s[66:67], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27 +; SI-NEXT: s_lshr_b64 vcc, s[52:53], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24 +; SI-NEXT: 
v_writelane_b32 v62, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 34 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 35 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33 +; SI-NEXT: s_lshr_b64 vcc, s[30:31], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 40 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 41 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 38 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 39 +; SI-NEXT: s_lshr_b64 vcc, s[50:51], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 36 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 37 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 44 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 45 +; SI-NEXT: s_lshr_b64 vcc, s[92:93], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 42 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 43 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 53 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 50 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 51 +; SI-NEXT: s_lshr_b64 vcc, s[76:77], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 59 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 57 +; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 54 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 55 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 1 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 62 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 63 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 60 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 61 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11 +; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8 +; SI-NEXT: s_mov_b32 s17, s14 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9 +; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 19 +; SI-NEXT: s_lshr_b64 vcc, s[16:17], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17 +; SI-NEXT: s_lshr_b64 vcc, 
s[16:17], 8 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 24 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 21 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 33 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23 +; SI-NEXT: s_lshr_b32 s22, s42, 8 +; SI-NEXT: s_lshr_b32 s21, s26, 8 +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_mov_b32 s36, s14 +; SI-NEXT: s_lshr_b32 s15, s14, 8 +; SI-NEXT: s_mov_b32 s14, s20 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57 +; SI-NEXT: v_mov_b32_e32 v59, v30 +; SI-NEXT: v_mov_b32_e32 v31, v51 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v30, v39 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v21, v20 +; SI-NEXT: v_mov_b32_e32 v34, v18 +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v37, v17 +; SI-NEXT: v_mov_b32_e32 v51, v33 +; SI-NEXT: v_mov_b32_e32 v17, v9 +; SI-NEXT: v_mov_b32_e32 v9, v10 +; SI-NEXT: v_mov_b32_e32 v26, v25 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v55, v49 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: v_mov_b32_e32 v40, v36 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_writelane_b32 v62, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_writelane_b32 v62, s5, 9 
+; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v7, v37 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s5, 19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: v_writelane_b32 v62, s5, 1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: 
v_mov_b32_e32 v20, v2 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_writelane_b32 v62, s5, 47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_writelane_b32 v62, s5, 49 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: 
$sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_writelane_b32 v62, s4, 50 +; SI-NEXT: v_writelane_b32 v62, s5, 51 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 52 +; SI-NEXT: v_writelane_b32 v62, s5, 53 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 54 +; SI-NEXT: v_writelane_b32 v62, s5, 55 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 56 +; SI-NEXT: v_writelane_b32 v62, s5, 57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 58 +; SI-NEXT: v_writelane_b32 v62, s5, 59 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 60 +; SI-NEXT: v_writelane_b32 v62, s5, 61 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 62 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: v_writelane_b32 v61, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_writelane_b32 v61, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s5, 19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: v_writelane_b32 v61, s20, 28 +; SI-NEXT: v_writelane_b32 v61, s21, 29 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s20, 30 +; SI-NEXT: v_writelane_b32 v61, s21, 31 +; SI-NEXT: v_writelane_b32 v61, s88, 32 +; SI-NEXT: v_writelane_b32 v61, s89, 33 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s4, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s52, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: 
s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s86, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s80, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 +; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_writelane_b32 v61, s6, 26 +; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_writelane_b32 v61, s7, 27 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s66, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 
0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b32 s19, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_mov_b32 s17, s26 +; SI-NEXT: s_mov_b32 s11, s20 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s22, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_readfirstlane_b32 s6, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b32 s23, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 -; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s38, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s90, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s30, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s76, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s62, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_readfirstlane_b32 s87, v46 -; SI-NEXT: v_readfirstlane_b32 s81, v57 -; SI-NEXT: v_readfirstlane_b32 s67, v30 -; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8 -; SI-NEXT: s_lshr_b64 s[66:67], s[80:81], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s53, v17 -; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_readfirstlane_b32 s56, v21 -; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: s_lshr_b32 s29, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s44, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s45, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s58, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: v_readfirstlane_b32 s39, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_lshr_b32 s59, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s73, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_lshr_b32 s79, s6, 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 +; SI-NEXT: s_mov_b32 s63, s54 +; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s60 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s42 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8 +; SI-NEXT: s_lshr_b32 s22, s60, 8 +; SI-NEXT: s_lshr_b32 s21, s42, 8 +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s31, v25 -; SI-NEXT: v_readfirstlane_b32 s26, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v60, v12, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s91, v60 -; SI-NEXT: v_readfirstlane_b32 s14, v27 -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v29, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s8, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 
s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s74 +; SI-NEXT: s_lshr_b32 s28, s74, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s77, v61 -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s48 +; SI-NEXT: s_lshr_b32 s27, s48, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s63, v59 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 ; 
SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_alignbit_b32 v41, v49, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 +; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v24 +; SI-NEXT: s_lshr_b32 s89, s6, 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s56, s42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s88, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_lshr_b32 s37, s6, 16 +; SI-NEXT: s_mov_b32 s88, vcc_lo ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_alignbit_b32 v58, v32, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s43, v58 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 +; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_mov_b32 s36, s26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; 
SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s90, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s91, s6, 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_mov_b32 s90, s74 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v47, v45, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v47 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s68, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s69, s6, 16 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_mov_b32 s68, s60 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v19, v11, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s21, v19 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[86:87], 
s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s65, s6, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_mov_b32 s64, s48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v14, v52, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v11, v44, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[90:91], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[30:31], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[38:39], 24 -; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 4 +; SI-NEXT: v_writelane_b32 v62, s15, 5 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 2 +; SI-NEXT: v_writelane_b32 v62, s15, 3 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 0 +; SI-NEXT: v_writelane_b32 v62, s15, 1 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 10 +; SI-NEXT: v_writelane_b32 v62, s15, 11 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 8 +; SI-NEXT: v_writelane_b32 v62, s15, 9 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 6 +; SI-NEXT: v_writelane_b32 v62, s15, 7 +; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 16 +; SI-NEXT: v_writelane_b32 v62, s15, 17 +; SI-NEXT: 
s_lshr_b64 s[14:15], s[80:81], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 14
+; SI-NEXT: v_writelane_b32 v62, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 12
+; SI-NEXT: v_writelane_b32 v62, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 22
+; SI-NEXT: v_writelane_b32 v62, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 20
+; SI-NEXT: v_writelane_b32 v62, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 18
+; SI-NEXT: v_writelane_b32 v62, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 28
+; SI-NEXT: v_writelane_b32 v62, s15, 29
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 26
+; SI-NEXT: v_writelane_b32 v62, s15, 27
+; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 24
+; SI-NEXT: v_writelane_b32 v62, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 34
+; SI-NEXT: v_writelane_b32 v62, s15, 35
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 32
+; SI-NEXT: v_writelane_b32 v62, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 30
+; SI-NEXT: v_writelane_b32 v62, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 40
+; SI-NEXT: v_writelane_b32 v62, s15, 41
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 38
+; SI-NEXT: v_writelane_b32 v62, s15, 39
+; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 36
+; SI-NEXT: v_writelane_b32 v62, s15, 37
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 46
+; SI-NEXT: v_writelane_b32 v62, s15, 47
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 44
+; SI-NEXT: v_writelane_b32 v62, s15, 45
+; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 42
+; SI-NEXT: v_writelane_b32 v62, s15, 43
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 52
+; SI-NEXT: v_writelane_b32 v62, s15, 53
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 50
+; SI-NEXT: v_writelane_b32 v62, s15, 51
+; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 48
+; SI-NEXT: v_writelane_b32 v62, s15, 49
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 24
+; SI-NEXT: v_writelane_b32 v62, s14, 58
+; SI-NEXT: v_writelane_b32 v62, s15, 59
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 56
+; SI-NEXT: v_writelane_b32 v62, s15, 57
+; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 54
+; SI-NEXT: v_writelane_b32 v62, s15, 55
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 0
+; SI-NEXT: v_writelane_b32 v61, s15, 1
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 16
+; SI-NEXT: v_writelane_b32 v62, s14, 62
+; SI-NEXT: v_writelane_b32 v62, s15, 63
+; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 8
+; SI-NEXT: v_writelane_b32 v62, s14, 60
+; SI-NEXT: v_writelane_b32 v62, s15, 61
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 6
+; SI-NEXT: v_writelane_b32 v61, s15, 7
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 4
+; SI-NEXT: v_writelane_b32 v61, s15, 5
+; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 2
+; SI-NEXT: v_writelane_b32 v61, s15, 3
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 12
+; SI-NEXT: v_writelane_b32 v61, s15, 13
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 10
+; SI-NEXT: v_writelane_b32 v61, s15, 11
+; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 8
+; SI-NEXT: v_writelane_b32 v61, s15, 9
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 18
+; SI-NEXT: v_writelane_b32 v61, s15, 19
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 16
+; SI-NEXT: v_writelane_b32 v61, s15, 17
+; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 14
+; SI-NEXT: v_writelane_b32 v61, s15, 15
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 24
+; SI-NEXT: v_writelane_b32 v61, s15, 25
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 22
+; SI-NEXT: v_writelane_b32 v61, s15, 23
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 20
+; SI-NEXT: v_writelane_b32 v61, s15, 21
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24
+; SI-NEXT: v_writelane_b32 v61, s14, 32
+; SI-NEXT: v_writelane_b32 v61, s15, 33
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT: v_writelane_b32 v61, s14, 30
+; SI-NEXT: v_writelane_b32 v61, s15, 31
+; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8
+; SI-NEXT: v_writelane_b32 v61, s14, 28
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5
+; SI-NEXT: v_writelane_b32 v61, s15, 29
+; SI-NEXT: s_lshr_b32 s78, s96, 8
+; SI-NEXT: s_lshr_b32 s15, s26, 8
+; SI-NEXT: s_mov_b32 s14, s20
+; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8
+; SI-NEXT: v_mov_b32_e32 v14, v4
+; SI-NEXT: v_mov_b32_e32 v4, v6
 ; SI-NEXT: .LBB91_5: ; %end
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
-; SI-NEXT: s_lshl_b32 s5, s10, 8
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s6, 0xff
-; SI-NEXT: v_readlane_b32 s6, v62, 0
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s6, 24
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v44
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s8, 0xff
+; SI-NEXT: v_readlane_b32 s8, v62, 0
+; SI-NEXT: v_readlane_b32 s9, v62, 1
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 2
+; SI-NEXT: v_readlane_b32 s9, v62, 3
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: s_and_b32 s5, s96, 0xff
+; SI-NEXT: s_lshl_b32 s8, s78, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s7, 0xff
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s16, 8
-; SI-NEXT: s_lshl_b32 s6, s8, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: s_lshl_b32 s4, s4, 8
-; SI-NEXT: v_readlane_b32 s7, v62, 1
-; SI-NEXT: v_readlane_b32 s99, v63, 35
-; SI-NEXT: v_readlane_b32 s97, v63, 33
-; SI-NEXT: v_readlane_b32 s87, v63, 31
-; SI-NEXT: v_readlane_b32 s85, v63, 29
-; SI-NEXT: v_readlane_b32 s83, v63, 27
-; SI-NEXT: v_readlane_b32 s81, v63, 25
-; SI-NEXT: v_readlane_b32 s71, v63, 23
-; SI-NEXT: v_readlane_b32 s69, v63, 21
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s65, v63, 17
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s53, v63, 13
-; SI-NEXT: v_readlane_b32 s51, v63, 11
-; SI-NEXT: v_readlane_b32 s49, v63, 9
-; SI-NEXT: v_readlane_b32 s39, v63, 7
-; SI-NEXT: v_readlane_b32 s37, v63, 5
-; SI-NEXT: v_readlane_b32 s35, v63, 3
-; SI-NEXT: v_readlane_b32 s31, v63, 1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 6
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s12, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: s_and_b32 s5, s86, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 7
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 9
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s84, 0xff
+; SI-NEXT: s_lshl_b32 s8, s61, 8
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s65, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 12
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s80, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 13
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 14
+; SI-NEXT: v_readlane_b32 s9, v62, 15
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 16
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s70, 0xff
+; SI-NEXT: s_lshl_b32 s8, s72, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s22, 8
-; SI-NEXT: s_lshl_b32 s6, s14, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s18, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s69, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 18
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s66, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 19
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 17
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 20
+; SI-NEXT: v_readlane_b32 s9, v62, 21
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 22
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s28, 8
-; SI-NEXT: s_lshl_b32 s6, s20, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s38, 0xff
+; SI-NEXT: s_lshl_b32 s8, s75, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s24, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s91, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 24
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s52, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 25
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 23
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 26
+; SI-NEXT: v_readlane_b32 s9, v62, 27
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 28
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v47
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s98, 0xff
+; SI-NEXT: s_lshl_b32 s8, s58, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s44, 8
-; SI-NEXT: s_lshl_b32 s6, s26, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s40, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s37, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 30
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s30, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 31
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s61, v62, 29
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 32
+; SI-NEXT: v_readlane_b32 s9, v62, 33
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s60, v62, 34
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s60, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v58
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s82, 0xff
+; SI-NEXT: s_lshl_b32 s8, s43, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s58, 8
-; SI-NEXT: s_lshl_b32 s6, s42, 24
-; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s46, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s89, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 36
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s50, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 37
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 38
+; SI-NEXT: v_readlane_b32 s9, v62, 39
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s42, v62, 40
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s42, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v49
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s72, 8
-; SI-NEXT: s_lshl_b32 s6, s56, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v3, v3, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s94, 0xff
+; SI-NEXT: s_lshl_b32 s8, s44, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s57, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 42
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s92, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 43
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s43, v62, 41
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 44
+; SI-NEXT: v_readlane_b32 s9, v62, 45
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s42, v62, 46
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s42, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v59
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s78, 8
-; SI-NEXT: s_lshl_b32 s6, s62, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s64, 0xff
+; SI-NEXT: s_lshl_b32 s8, s27, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s74, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s79, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 48
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s76, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 49
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 50
+; SI-NEXT: v_readlane_b32 s9, v62, 51
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v62, 52
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v61
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s92, 8
-; SI-NEXT: s_lshl_b32 s6, s76, 24
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s90, 0xff
+; SI-NEXT: s_lshl_b32 s8, s28, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s88, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s73, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 54
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s62, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 55
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v62, 53
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 56
+; SI-NEXT: v_readlane_b32 s9, v62, 57
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v62, 58
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v60
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s34, 8
-; SI-NEXT: s_lshl_b32 s6, s90, 24
-; SI-NEXT: v_readlane_b32 s34, v63, 2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s54, 0xff
+; SI-NEXT: s_lshl_b32 s8, s74, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s94, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s59, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v62, 60
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s46, 0xff
+; SI-NEXT: v_readlane_b32 s9, v62, 61
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v62, 59
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v62, 62
+; SI-NEXT: v_readlane_b32 s9, v62, 63
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v61, 0
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s38, 8
-; SI-NEXT: s_lshl_b32 s6, s30, 24
-; SI-NEXT: v_readlane_b32 s38, v63, 6
-; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s68, 0xff
+; SI-NEXT: s_lshl_b32 s8, s22, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s36, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s45, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 2
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s40, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 3
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s27, v61, 1
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 4
+; SI-NEXT: v_readlane_b32 s9, v61, 5
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s26, v61, 6
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s26, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT: s_lshl_b32 s5, s52, 8
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s6, s48, 24
-; SI-NEXT: v_readlane_b32 s52, v63, 12
-; SI-NEXT: v_readlane_b32 s48, v63, 8
-; SI-NEXT: v_readlane_b32 s36, v63, 4
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s56, 0xff
+; SI-NEXT: s_lshl_b32 s8, s21, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s50, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s29, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 8
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s24, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 9
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 10
+; SI-NEXT: v_readlane_b32 s9, v61, 11
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s20, v61, 12
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s20, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v17
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s68, 8
-; SI-NEXT: s_lshl_b32 s6, s54, 24
-; SI-NEXT: v_readlane_b32 s68, v63, 20
-; SI-NEXT: v_readlane_b32 s54, v63, 14
-; SI-NEXT: v_readlane_b32 s50, v63, 10
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: s_and_b32 s5, s34, 0xff
+; SI-NEXT: s_lshl_b32 s8, s18, 8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s64, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s23, 0xff
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 14
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_and_b32 s5, s16, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 15
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 16
+; SI-NEXT: v_readlane_b32 s9, v61, 17
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s16, v61, 18
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s16, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_lshl_b32 s5, s82, 8
-; SI-NEXT: s_lshl_b32 s6, s66, 24
-; SI-NEXT: v_readlane_b32 s82, v63, 26
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: s_or_b32 s5, s5, s8
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s70, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s36, 0xff
+; SI-NEXT: s_lshl_b32 s8, s15, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: s_and_b32 s8, s19, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v57
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s96, 8
-; SI-NEXT: s_lshl_b32 s6, s80, 24
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT: s_and_b32 s4, s4, 0xff
+; SI-NEXT: v_readlane_b32 s61, v62, 35
+; SI-NEXT: v_readlane_b32 s43, v62, 47
+; SI-NEXT: v_readlane_b32 s27, v61, 7
+; SI-NEXT: v_readlane_b32 s21, v61, 13
+; SI-NEXT: v_readlane_b32 s17, v61, 19
+; SI-NEXT: v_readlane_b32 s99, v63, 35
+; SI-NEXT: v_readlane_b32 s98, v63, 34
+; SI-NEXT: v_readlane_b32 s97, v63, 33
 ; SI-NEXT: v_readlane_b32 s96, v63, 32
+; SI-NEXT: v_readlane_b32 s87, v63, 31
+; SI-NEXT: v_readlane_b32 s86, v63, 30
+; SI-NEXT: v_readlane_b32 s85, v63, 29
+; SI-NEXT: v_readlane_b32 s84, v63, 28
+; SI-NEXT: v_readlane_b32 s83, v63, 27
+; SI-NEXT: v_readlane_b32 s82, v63, 26
+; SI-NEXT: v_readlane_b32 s81, v63, 25
 ; SI-NEXT: v_readlane_b32 s80, v63, 24
+; SI-NEXT: v_readlane_b32 s71, v63, 23
 ; SI-NEXT: v_readlane_b32 s70, v63, 22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_readlane_b32 s69, v63, 21
+; SI-NEXT: v_readlane_b32 s68, v63, 20
+; SI-NEXT: v_readlane_b32 s67, v63, 19
+; SI-NEXT: v_readlane_b32 s66, v63, 18
+; SI-NEXT: v_readlane_b32 s65, v63, 17
+; SI-NEXT: v_readlane_b32 s64, v63, 16
+; SI-NEXT: v_readlane_b32 s55, v63, 15
+; SI-NEXT: v_readlane_b32 s54, v63, 14
+; SI-NEXT: v_readlane_b32 s53, v63, 13
+; SI-NEXT: v_readlane_b32 s52, v63, 12
+; SI-NEXT: v_readlane_b32 s51, v63, 11
+; SI-NEXT: v_readlane_b32 s50, v63, 10
+; SI-NEXT: v_readlane_b32 s49, v63, 9
+; SI-NEXT: v_readlane_b32 s48, v63, 8
+; SI-NEXT: v_readlane_b32 s39, v63, 7
+; SI-NEXT: v_readlane_b32 s38, v63, 6
+; SI-NEXT: v_readlane_b32 s37, v63, 5
+; SI-NEXT: v_readlane_b32 s36, v63, 4
+; SI-NEXT: v_readlane_b32 s35, v63, 3
+; SI-NEXT: v_readlane_b32 s34, v63, 2
+; SI-NEXT: v_readlane_b32 s31, v63, 1
+; SI-NEXT: v_readlane_b32 s30, v63, 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 20
+; SI-NEXT: v_or_b32_e32 v1, s5, v1
+; SI-NEXT: s_and_b32 s5, s10, 0xff
+; SI-NEXT: v_readlane_b32 s9, v61, 21
+; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 22
+; SI-NEXT: v_readlane_b32 s9, v61, 23
+; SI-NEXT: s_and_b32 s8, s8, 0xff
+; SI-NEXT: v_readlane_b32 s10, v61, 24
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s9, s10, 24
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_or_b32 s8, s9, s8
+; SI-NEXT: s_or_b32 s5, s5, s8
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s5, s14, 0xff
+; SI-NEXT: s_lshl_b32 s8, s12, 8
+; SI-NEXT: s_or_b32 s5, s5, s8
+; SI-NEXT: v_readlane_b32 s8, v61, 26
+; SI-NEXT: v_readlane_b32 s9, v61, 27
+; SI-NEXT: s_and_b32 s8, s9, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT: v_readlane_b32 s11, v61, 25
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s8, v1
+; SI-NEXT: v_readlane_b32 s8, v61, 28
+; SI-NEXT: v_readlane_b32 s9, v61, 29
 ; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: s_and_b32 s5, s84, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
+; SI-NEXT: v_readlane_b32 s8, v61, 30
+; SI-NEXT: v_readlane_b32 s9, v61, 31
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s8, 0xff
+; SI-NEXT: v_readlane_b32 s8, v61, 32
 ; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_or_b32_e32 v1, s5, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v46
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: s_lshl_b32 s5, s86, 24
-; SI-NEXT: v_readlane_b32 s86, v63, 30
-; SI-NEXT: v_readlane_b32 s84, v63, 28
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT: s_lshl_b32 s8, s8, 24
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_or_b32 s5, s8, s5
+; SI-NEXT: s_or_b32 s4, s4, s5
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35
+; SI-NEXT: s_and_b32 s4, s88, 0xff
+; SI-NEXT: s_lshl_b32 s5, s6, 8
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s13, 0xff
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: v_readlane_b32 s9, v61, 33
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_or_b32_e32 v1, s4, v1
-; SI-NEXT: s_and_b32 s4, s98, 0xff
-; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT: v_or_b32_e32 v1, s5, v1
 ; SI-NEXT: v_or_b32_e32 v1, s4, v1
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
-; SI-NEXT: v_readlane_b32 s98, v63, 34
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[4:5]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -168733,8 +169636,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT: s_mov_b64 exec, s[4:5]
 ; VI-NEXT: v_writelane_b32 v63, s30, 0
 ; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -168769,209 +169673,225 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_writelane_b32 v63, s86, 30
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
 ; VI-NEXT: v_writelane_b32 v63, s87, 31
-; VI-NEXT: v_readfirstlane_b32 s44, v3
-; VI-NEXT: v_readfirstlane_b32 s45, v4
-; VI-NEXT: v_readfirstlane_b32 s42, v5
-; VI-NEXT: v_readfirstlane_b32 s43, v6
-; VI-NEXT: v_readfirstlane_b32 s40, v7
-; VI-NEXT: v_readfirstlane_b32 s41, v8
-; VI-NEXT: v_readfirstlane_b32 s14, v9
-; VI-NEXT: v_readfirstlane_b32 s15, v10
-; VI-NEXT: v_readfirstlane_b32 s12, v11
-; VI-NEXT: v_readfirstlane_b32 s13, v12
-; VI-NEXT: v_readfirstlane_b32 s10, v13
-; VI-NEXT: v_readfirstlane_b32 s11, v14
-; VI-NEXT: v_readfirstlane_b32 s8, v15
-; VI-NEXT: v_readfirstlane_b32 s9, v16
-; VI-NEXT: v_readfirstlane_b32 s6, v17
-; VI-NEXT: v_readfirstlane_b32 s7, v18
+; VI-NEXT: v_readfirstlane_b32 s48, v3
+; VI-NEXT: v_readfirstlane_b32 s49, v4
+; VI-NEXT: v_readfirstlane_b32 s38, v5
+; VI-NEXT: v_readfirstlane_b32 s39, v6
+; VI-NEXT: v_readfirstlane_b32 s36, v7
+; VI-NEXT: v_readfirstlane_b32 s37, v8
+; VI-NEXT: v_readfirstlane_b32 s34, v9
+; VI-NEXT: v_readfirstlane_b32 s35, v10
+; VI-NEXT: v_readfirstlane_b32 s30, v11
+; VI-NEXT: v_readfirstlane_b32 s31, v12
+; VI-NEXT: v_readfirstlane_b32 s90, v13
+; VI-NEXT: v_readfirstlane_b32 s91, v14
+; VI-NEXT: v_readfirstlane_b32 s88, v15
+; VI-NEXT: v_readfirstlane_b32 s89, v16
+; VI-NEXT: v_readfirstlane_b32 s76, v17
+; VI-NEXT: v_readfirstlane_b32 s77, v18
 ; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[46:47], vcc, exec
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
 ; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
 ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
 ; VI-NEXT: s_cbranch_scc0 .LBB91_3
 ; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s5, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 57
-; VI-NEXT: s_lshr_b32 s46, s5, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 56
-; VI-NEXT: s_lshr_b32 s46, s5, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 55
-; VI-NEXT: s_lshr_b32 s46, s4, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 54
-; VI-NEXT: s_lshr_b32 s46, s4, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 53
-; VI-NEXT: s_lshr_b32 s46, s29, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 52
-; VI-NEXT: s_lshr_b32 s46, s29, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 51
-; VI-NEXT: s_lshr_b32 s46, s29, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 50
-; VI-NEXT: s_lshr_b32 s46, s28, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 49
-; VI-NEXT: s_lshr_b32 s46, s28, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 48
-; VI-NEXT: s_lshr_b32 s46, s27, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 47
-; VI-NEXT: s_lshr_b32 s46, s27, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 46
-; VI-NEXT: s_lshr_b32 s46, s27, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 45
-; VI-NEXT: s_lshr_b32 s46, s26, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 44
-; VI-NEXT: s_lshr_b32 s46, s26, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 43
-; VI-NEXT: s_lshr_b32 s46, s25, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 42
-; VI-NEXT: s_lshr_b32 s46, s25, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 41
-; VI-NEXT: s_lshr_b32 s46, s25, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 40
-; VI-NEXT: s_lshr_b32 s46, s24, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 39
-; VI-NEXT: s_lshr_b32 s46, s24, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 38
-; VI-NEXT: s_lshr_b32 s46, s23, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 37
-; VI-NEXT: s_lshr_b32 s46, s23, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 36
-; VI-NEXT: s_lshr_b32 s46, s23, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 35
-; VI-NEXT: s_lshr_b32 s46, s22, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 34
-; VI-NEXT: s_lshr_b32 s46, s22, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 33
-; VI-NEXT: s_lshr_b32 s46, s21, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 32
-; VI-NEXT: s_lshr_b32 s46, s21, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 31
-; VI-NEXT: s_lshr_b32 s46, s21, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 30
-; VI-NEXT: s_lshr_b32 s46, s20, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 29
-; VI-NEXT: s_lshr_b32 s46, s20, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 28
-; VI-NEXT: s_lshr_b32 s46, s19, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 27
-; VI-NEXT: s_lshr_b32 s46, s19, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 26
-; VI-NEXT: s_lshr_b32 s46, s19, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 25
-; VI-NEXT: s_lshr_b32 s46, s18, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 24
-; VI-NEXT: s_lshr_b32 s46, s18, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 23
-; VI-NEXT: s_lshr_b32 s46, s17, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 22
-; VI-NEXT: s_lshr_b32 s46, s17, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 21
-; VI-NEXT: s_lshr_b32 s46, s17, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 20
-; VI-NEXT: s_lshr_b32 s46, s16, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 19
-; VI-NEXT: s_lshr_b32 s46, s16, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 18
-; VI-NEXT: s_lshr_b32 s46, s7, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 17
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 16
-; VI-NEXT: s_lshr_b32 s46, s7, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 15
-; VI-NEXT: s_lshr_b32 s46, s6, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 14
-; VI-NEXT: s_lshr_b32 s46, s6, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 13
-; VI-NEXT: s_lshr_b32 s46, s9, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 12
-; VI-NEXT: s_lshr_b32 s46, s9, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 11
-; VI-NEXT: s_lshr_b32 s46, s9, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 10
-; VI-NEXT: s_lshr_b32 s46, s8, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 9
-; VI-NEXT: s_lshr_b32 s46, s8, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 8
-; VI-NEXT: s_lshr_b32 s46, s11, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 7
-; VI-NEXT: s_lshr_b32 s46, s11, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 6
-; VI-NEXT: s_lshr_b32 s46, s11, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 5
-; VI-NEXT: s_lshr_b32 s46, s10, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 4
-; VI-NEXT: s_lshr_b32 s46, s10, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 3
-; VI-NEXT: s_lshr_b32 s46, s13, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 2
-; VI-NEXT: s_lshr_b32 s46, s13, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 1
-; VI-NEXT: s_lshr_b32 s46, s12, 16
-; VI-NEXT: s_lshr_b32 s80, s13, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 0
-; VI-NEXT: s_lshr_b32 s81, s12, 8
-; VI-NEXT: s_lshr_b32 s82, s15, 24
-; VI-NEXT: s_lshr_b32 s83, s15, 16
-; VI-NEXT: s_lshr_b32 s85, s15, 8
-; VI-NEXT: s_lshr_b32 s84, s14, 16
-; VI-NEXT: s_lshr_b32 s86, s14, 8
-; VI-NEXT: s_lshr_b32 s87, s41, 24
-; VI-NEXT: s_lshr_b32 s50, s41, 16
-; VI-NEXT: s_lshr_b32 s52, s41, 8
-; VI-NEXT: s_lshr_b32 s51, s40, 16
-; VI-NEXT: s_lshr_b32 s53, s40, 8
-; VI-NEXT: s_lshr_b32 s54, s43, 24
-; VI-NEXT: s_lshr_b32 s55, s43, 16
-; VI-NEXT: s_lshr_b32 s65, s43, 8
-; VI-NEXT: s_lshr_b32 s64, s42, 16
-; VI-NEXT: s_lshr_b32 s66, s42, 8
-; VI-NEXT: s_lshr_b32 s67, s45, 24
-; VI-NEXT: s_lshr_b32 s68, s45, 16
-; VI-NEXT: s_lshr_b32 s70, s45, 8
-; VI-NEXT: s_lshr_b32 s69, s44, 16
-; VI-NEXT: s_lshr_b32 s71, s44, 8
-; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24
-; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24
-; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24
-; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24
-; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24
-; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24
-; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24
-; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24
-; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24
+; VI-NEXT: s_lshr_b32 s6, s5, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 26
+; VI-NEXT: s_lshr_b32 s6, s5, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 27
+; VI-NEXT: s_lshr_b32 s6, s5, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 28
+; VI-NEXT: s_lshr_b32 s6, s4, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 29
+; VI-NEXT: s_lshr_b32 s6, s4, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 30
+; VI-NEXT: s_lshr_b32 s6, s29, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 31
+; VI-NEXT: s_lshr_b32 s6, s29, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 32
+; VI-NEXT: s_lshr_b32 s6, s29, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 33
+; VI-NEXT: s_lshr_b32 s6, s28, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 34
+; VI-NEXT: s_lshr_b32 s6, s28, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 35
+; VI-NEXT: s_lshr_b32 s6, s27, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 36
+; VI-NEXT: s_lshr_b32 s6, s27, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 37
+; VI-NEXT: s_lshr_b32 s6, s27, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 38
+; VI-NEXT: s_lshr_b32 s6, s26, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 39
+; VI-NEXT: s_lshr_b32 s6, s26, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 40
+; VI-NEXT: s_lshr_b32 s6, s25, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 41
+; VI-NEXT: s_lshr_b32 s6, s25, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 42
+; VI-NEXT: s_lshr_b32 s6, s25, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 43
+; VI-NEXT: s_lshr_b32 s6, s24, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 44
+; VI-NEXT: s_lshr_b32 s6, s24, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 45
+; VI-NEXT: s_lshr_b32 s6, s23, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 46
+; VI-NEXT: s_lshr_b32 s6, s23, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 47
+; VI-NEXT: s_lshr_b32 s6, s23, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 48
+; VI-NEXT: s_lshr_b32 s6, s22, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 49
+; VI-NEXT: s_lshr_b32 s6, s22, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 50
+; VI-NEXT: s_lshr_b32 s6, s21, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 51
+; VI-NEXT: s_lshr_b32 s6, s21, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 52
+; VI-NEXT: s_lshr_b32 s6, s21, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 53
+; VI-NEXT: s_lshr_b32 s6, s20, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 54
+; VI-NEXT: s_lshr_b32 s6, s20, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 55
+; VI-NEXT: s_lshr_b32 s6, s19, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 56
+; VI-NEXT: s_lshr_b32 s6, s19, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 57
+; VI-NEXT: s_lshr_b32 s6, s19, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 58
+; VI-NEXT: s_lshr_b32 s6, s18, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 59
+; VI-NEXT: s_lshr_b32 s6, s18, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 60
+; VI-NEXT: s_lshr_b32 s6, s17, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 61
+; VI-NEXT: s_lshr_b32 s6, s17, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 62
+; VI-NEXT: s_lshr_b32 s6, s17, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 63
+; VI-NEXT: s_lshr_b32 s6, s16, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 0
+; VI-NEXT: s_lshr_b32 s6, s16, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 1
+; VI-NEXT: s_lshr_b32 s6, s39, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 18
+; VI-NEXT: s_lshr_b32 s6, s39, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 19
+; VI-NEXT: s_lshr_b32 s6, s39, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 20
+; VI-NEXT: s_lshr_b32 s6, s38, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 16
+; VI-NEXT: s_lshr_b32 s6, s38, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 17
+; VI-NEXT: s_lshr_b32 s6, s49, 24
+; VI-NEXT: v_writelane_b32 v61, s6, 23
+; VI-NEXT: s_lshr_b32 s6, s49, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 24
+; VI-NEXT: s_lshr_b32 s6, s49, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 25
+; VI-NEXT: s_lshr_b32 s6, s48, 16
+; VI-NEXT: v_writelane_b32 v61, s6, 21
+; VI-NEXT: s_lshr_b32 s6, s48, 8
+; VI-NEXT: v_writelane_b32 v61, s6, 22
+; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 14
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 15
+; VI-NEXT: s_lshr_b64 vcc, s[28:29], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 12
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 13
+; VI-NEXT: s_lshr_b64 vcc, s[26:27], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 10
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 11
+; VI-NEXT: s_lshr_b64 vcc, s[24:25], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 8
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 9
+; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 6
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 7
+; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 4
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 5
+; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 2
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 3
+; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24
+; VI-NEXT: v_writelane_b32 v61, vcc_lo, 0
+; VI-NEXT: s_lshr_b32 s87, s77, 24
+; VI-NEXT: s_lshr_b32 s43, s77, 16
+; VI-NEXT: s_lshr_b32 s42, s77, 8
+; VI-NEXT: s_lshr_b32 s13, s76, 16
+; VI-NEXT: s_lshr_b32 s11, s76, 8
+; VI-NEXT: s_lshr_b32 s86, s89, 24
+; VI-NEXT: s_lshr_b32 s85, s89, 16
+; VI-NEXT: s_lshr_b32 s84, s89, 8
+; VI-NEXT: s_lshr_b32 s9, s88, 16
+; VI-NEXT: s_lshr_b32 s7, s88, 8
+; VI-NEXT: s_lshr_b32 s75, s91, 24
+; VI-NEXT: s_lshr_b32 s74, s91, 16
+; VI-NEXT: s_lshr_b32 s73, s91, 8
+; VI-NEXT: s_lshr_b32 s79, s90, 16
+; VI-NEXT: s_lshr_b32 s78, s90, 8
+; VI-NEXT: s_lshr_b32 s60, s31, 24
+; VI-NEXT: s_lshr_b32 s15, s31, 16
+; VI-NEXT: s_lshr_b32 s14, s31, 8
+; VI-NEXT: s_lshr_b32 s72, s30, 16
+; VI-NEXT: s_lshr_b32 s61, s30, 8
+; VI-NEXT: s_lshr_b32 s63, s35, 24
+; VI-NEXT: s_lshr_b32 s57, s35, 16
+; VI-NEXT: s_lshr_b32 s56, s35, 8
+; VI-NEXT: s_lshr_b32 s83, s34, 16
+; VI-NEXT: s_lshr_b32 s82, s34, 8
+; VI-NEXT: s_lshr_b32 s41, s37, 24
+; VI-NEXT: s_lshr_b32 s47, s37, 16
+; VI-NEXT: s_lshr_b32 s46, s37, 8
+; VI-NEXT: s_lshr_b32 s59, s36, 16
+; VI-NEXT: s_lshr_b32 s45, s36, 8
+; VI-NEXT: v_writelane_b32 v61, vcc_hi, 1
+; VI-NEXT: s_lshr_b64 s[50:51], s[76:77], 24
+; VI-NEXT: s_lshr_b64 s[52:53], s[88:89], 24
+; VI-NEXT: s_lshr_b64 s[54:55], s[90:91], 24
+; VI-NEXT: s_lshr_b64 s[64:65], s[30:31], 24
+; VI-NEXT: s_lshr_b64 s[66:67], s[34:35], 24
+; VI-NEXT: s_lshr_b64 s[68:69], s[36:37], 24
+; VI-NEXT: s_lshr_b64 s[70:71], s[38:39], 24
+; VI-NEXT: s_lshr_b64 s[80:81], s[48:49], 24
+; VI-NEXT: s_mov_b32 s6, s17
+; VI-NEXT: s_mov_b32 s8, s19
+; VI-NEXT: s_mov_b32 s10, s21
+; VI-NEXT: s_mov_b32 s12, s23
+; VI-NEXT: s_mov_b32 s40, s25
+; VI-NEXT: s_mov_b32 s44, s27
+; VI-NEXT: s_mov_b32 s58, s29
+; VI-NEXT: s_mov_b32 s62, s5
 ; VI-NEXT: s_cbranch_execnz .LBB91_4
 ; VI-NEXT: .LBB91_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s46, s45, 16
-; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s46, v31
+; VI-NEXT: s_lshl_b32 s6, s49, 16
+; VI-NEXT: v_mov_b32_e32 v25, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v1, s6, v25
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s45, s45, 0xffff0000
+; VI-NEXT: s_and_b32 s45, s45, 0xffff0000
+; VI-NEXT: s_and_b32 s6, s49, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s45, v31
+; VI-NEXT: v_add_f32_e32 v2, s45, v31
+; VI-NEXT: v_add_f32_e32 v2, s6, v25
 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -168979,53 +169899,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s45, s44, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s45, v31
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[1:2]
+; VI-NEXT: s_lshl_b32 s6, s48, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v25
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s44, s44, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s44, s43, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s6, s48, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v25
 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s43, s43, 0xffff0000
+; VI-NEXT: s_lshl_b32 s6, s39, 16
 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s43, v31
+; VI-NEXT: v_add_f32_e32 v4, s6, v25
 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: s_and_b32 s6, s39, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s43, s42, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s43, v31
-; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s42, s42, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
+; VI-NEXT: v_add_f32_e32 v5, s6, v25
 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
@@ -169033,53 +169933,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: s_lshl_b32 s42, s41, 16
-; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
+; VI-NEXT: s_lshl_b32 s6, s38, 16
+; VI-NEXT: v_add_f32_e32 v5, s6, v25
 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s41, s41, 0xffff0000
+; VI-NEXT: s_and_b32 s6, s38, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s41, v31
+; VI-NEXT: v_add_f32_e32 v6, s6, v25
 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: s_lshl_b32 s6, s37, 16
 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s41, s40, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s41, v31
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s40, s40, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s40, s15, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
+; VI-NEXT: v_add_f32_e32 v7, s6, v25
 ; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s15, s15, 0xffff0000
+;
VI-NEXT: s_and_b32 s6, s37, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 @@ -169087,53 +169967,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s15, s14, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[7:8] +; VI-NEXT: s_lshl_b32 s6, s36, 16 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s14, v31 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s14, s13, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: s_and_b32 s6, s36, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v25 ; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s35, 16 ; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_add_f32_e32 v10, s6, v25 ; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: s_and_b32 s6, s35, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s13, s12, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 @@ -169141,53 +170001,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s12, s11, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: 
v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: s_lshl_b32 s6, s34, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s34, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_add_f32_e32 v12, s6, v25 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s11, s10, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s10, s9, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_add_f32_e32 v13, s6, v25 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 ; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 @@ -169195,53 +170035,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s9, s8, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: s_lshl_b32 s6, s30, 16 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_lshl_b32 s8, s7, 16 -; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 -; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: s_and_b32 s6, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s6, v25 ; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s91, 16 ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_add_f32_e32 v16, s6, v25 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s6, s91, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s7, s6, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 -; VI-NEXT: v_add_f32_e32 v15, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 @@ -169249,53 +170069,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: s_lshl_b32 s6, s90, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s90, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_add_f32_e32 v18, s6, v25 ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: s_lshl_b32 s6, s89, 16 ; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; 
VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_add_f32_e32 v19, s6, v25 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 ; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s89, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 ; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 @@ -169303,863 +170103,1089 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: s_lshl_b32 s6, s88, 16 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: s_and_b32 s6, s88, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v25 ; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 ; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s77, 16 ; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_add_f32_e32 v22, s6, v25 ; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: s_lshl_b32 s6, s76, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v25 +; VI-NEXT: v_bfe_u32 v26, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v24 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v25 +; VI-NEXT: v_readfirstlane_b32 s6, v26 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 
-; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_add_f32_e32 v28, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_add_f32_e32 v30, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, 
s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; 
VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, 
s25, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; 
VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s15, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: s_cselect_b32 s14, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v31, s4, v31 
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v24, v22 +; VI-NEXT: v_add_f32_e32 v25, s4, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_readfirstlane_b32 s4, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v18, v16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v15, v13 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: v_mov_b32_e32 v12, v10 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_mov_b32 s27, s44 +; VI-NEXT: s_mov_b32 s29, s58 +; VI-NEXT: s_mov_b32 s5, s62 +; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[5:6] +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s40 +; VI-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[2:3] +; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s11, s62, 24 +; VI-NEXT: s_lshr_b32 s13, s62, 16 +; VI-NEXT: s_lshr_b32 s14, s62, 8 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: s_lshr_b32 s17, s4, 8 +; VI-NEXT: s_lshr_b32 s19, s58, 24 +; VI-NEXT: s_lshr_b32 s21, s58, 16 +; VI-NEXT: s_lshr_b32 s23, s58, 8 +; VI-NEXT: s_lshr_b32 s25, s28, 16 +; VI-NEXT: s_lshr_b32 s27, s28, 8 +; VI-NEXT: s_lshr_b32 s29, s44, 24 +; VI-NEXT: s_lshr_b32 s41, s44, 16 +; VI-NEXT: s_lshr_b32 s42, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s26, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s40, 24 +; VI-NEXT: s_lshr_b32 s47, s40, 16 +; VI-NEXT: s_lshr_b32 s56, s40, 8 +; VI-NEXT: s_lshr_b32 s57, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s60, s12, 24 +; VI-NEXT: s_lshr_b32 s61, s12, 16 +; VI-NEXT: s_lshr_b32 s63, s12, 8 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 8 +; VI-NEXT: s_lshr_b32 s75, s10, 24 +; VI-NEXT: s_lshr_b32 s76, s10, 16 +; VI-NEXT: s_lshr_b32 s77, s10, 8 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s20, 8 +; VI-NEXT: s_lshr_b32 s89, s8, 24 +; VI-NEXT: s_lshr_b32 s90, s8, 16 +; VI-NEXT: s_lshr_b32 s91, s8, 8 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s18, 8 +; VI-NEXT: s_lshr_b32 vcc_lo, s6, 24 +; VI-NEXT: s_lshr_b32 vcc_hi, s6, 16 +; VI-NEXT: s_lshr_b32 s35, s6, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v13 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 
24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed 
$sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: v_writelane_b32 v61, s6, 0 +; VI-NEXT: v_writelane_b32 v61, s7, 1 +; VI-NEXT: v_writelane_b32 v61, s8, 2 +; VI-NEXT: v_writelane_b32 v61, s9, 3 +; VI-NEXT: v_writelane_b32 v61, s10, 4 +; VI-NEXT: v_writelane_b32 v61, s11, 5 +; VI-NEXT: v_writelane_b32 v61, s12, 6 +; VI-NEXT: v_writelane_b32 v61, s13, 7 +; VI-NEXT: v_writelane_b32 v61, s40, 8 +; VI-NEXT: v_writelane_b32 v61, s41, 9 +; VI-NEXT: v_writelane_b32 v61, s44, 10 +; VI-NEXT: v_writelane_b32 v61, s45, 11 +; VI-NEXT: v_writelane_b32 v61, s58, 12 +; VI-NEXT: v_writelane_b32 v61, s59, 13 +; VI-NEXT: v_writelane_b32 v61, s62, 14 +; VI-NEXT: v_writelane_b32 v61, s63, 15 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr10 +; 
VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 
-; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v40, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 
s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 
v41, s58 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v58, s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v11, s10 -; VI-NEXT: v_mov_b32_e32 v12, s11 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v14, s9 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s7 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v21, s20 -; VI-NEXT: v_mov_b32_e32 v22, s21 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 -; VI-NEXT: v_mov_b32_e32 v25, s24 -; VI-NEXT: v_mov_b32_e32 v26, s25 -; VI-NEXT: v_mov_b32_e32 v27, s26 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v29, s28 -; VI-NEXT: v_mov_b32_e32 v30, s29 -; VI-NEXT: v_mov_b32_e32 v32, s5 -; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s50 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s52 +; VI-NEXT: v_readlane_b32 s5, v61, 16 +; VI-NEXT: v_mov_b32_e32 v57, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 17 +; VI-NEXT: v_mov_b32_e32 v59, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 18 +; VI-NEXT: v_mov_b32_e32 v47, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 19 +; VI-NEXT: v_mov_b32_e32 v56, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 20 +; VI-NEXT: v_mov_b32_e32 v58, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 21 +; VI-NEXT: v_mov_b32_e32 v25, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 22 +; VI-NEXT: v_mov_b32_e32 v27, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 23 +; VI-NEXT: v_mov_b32_e32 v38, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v60, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 24 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v13, s31 +; VI-NEXT: v_readlane_b32 s74, v61, 14 +; VI-NEXT: v_readlane_b32 s78, v61, 12 +; VI-NEXT: v_readlane_b32 s30, v61, 10 +; VI-NEXT: v_mov_b32_e32 v24, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 25 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v7, s37 +; VI-NEXT: v_mov_b32_e32 v5, s38 +; VI-NEXT: v_mov_b32_e32 v4, s39 +; VI-NEXT: v_mov_b32_e32 v2, s48 +; VI-NEXT: v_mov_b32_e32 v1, s49 +; VI-NEXT: v_readlane_b32 s75, v61, 15 +; VI-NEXT: v_readlane_b32 s79, v61, 13 +; VI-NEXT: v_readlane_b32 s31, v61, 11 +; VI-NEXT: v_readlane_b32 s36, v61, 8 +; VI-NEXT: v_readlane_b32 s38, v61, 6 +; VI-NEXT: v_readlane_b32 s48, v61, 4 +; VI-NEXT: v_readlane_b32 s50, v61, 2 +; VI-NEXT: v_readlane_b32 s52, v61, 0 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v15, s11 +; VI-NEXT: v_mov_b32_e32 v3, s87 +; VI-NEXT: v_mov_b32_e32 v6, s43 +; VI-NEXT: v_mov_b32_e32 v9, s42 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v18, s86 +; VI-NEXT: v_mov_b32_e32 v21, s85 +; VI-NEXT: v_mov_b32_e32 v32, s84 +; VI-NEXT: v_mov_b32_e32 v37, s73 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v52, s61 +; VI-NEXT: v_mov_b32_e32 v48, s60 +; VI-NEXT: v_mov_b32_e32 v49, s15 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v40, s83 +; VI-NEXT: v_mov_b32_e32 v41, s82 +; VI-NEXT: v_mov_b32_e32 v53, s63 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v55, s56 +; VI-NEXT: 
v_mov_b32_e32 v45, s59 +; VI-NEXT: v_mov_b32_e32 v46, s45 +; VI-NEXT: v_mov_b32_e32 v42, s41 +; VI-NEXT: v_mov_b32_e32 v43, s47 +; VI-NEXT: v_mov_b32_e32 v44, s46 +; VI-NEXT: v_mov_b32_e32 v26, s5 +; VI-NEXT: v_mov_b32_e32 v23, s76 +; VI-NEXT: v_mov_b32_e32 v22, s77 +; VI-NEXT: v_mov_b32_e32 v20, s88 +; VI-NEXT: v_mov_b32_e32 v19, s89 +; VI-NEXT: v_mov_b32_e32 v17, s90 +; VI-NEXT: v_mov_b32_e32 v16, s91 +; VI-NEXT: v_mov_b32_e32 v11, s34 +; VI-NEXT: v_mov_b32_e32 v10, s35 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s54 +; VI-NEXT: v_mov_b32_e32 v30, s70 +; VI-NEXT: v_mov_b32_e32 v31, s80 +; VI-NEXT: v_readlane_b32 s11, v61, 26 +; VI-NEXT: v_readlane_b32 s13, v61, 27 +; VI-NEXT: v_readlane_b32 s14, v61, 28 +; VI-NEXT: v_readlane_b32 s15, v61, 29 +; VI-NEXT: v_readlane_b32 s17, v61, 30 +; VI-NEXT: v_readlane_b32 s19, v61, 31 +; VI-NEXT: v_readlane_b32 s21, v61, 32 +; VI-NEXT: v_readlane_b32 s23, v61, 33 +; VI-NEXT: v_readlane_b32 s25, v61, 34 +; VI-NEXT: v_readlane_b32 s27, v61, 35 +; VI-NEXT: v_readlane_b32 s29, v61, 36 +; VI-NEXT: v_readlane_b32 s41, v61, 37 +; VI-NEXT: v_readlane_b32 s42, v61, 38 +; VI-NEXT: v_readlane_b32 s43, v61, 39 +; VI-NEXT: v_readlane_b32 s45, v61, 40 +; VI-NEXT: v_readlane_b32 s46, v61, 41 +; VI-NEXT: v_readlane_b32 s47, v61, 42 +; VI-NEXT: v_readlane_b32 s56, v61, 43 +; VI-NEXT: v_readlane_b32 s57, v61, 44 +; VI-NEXT: v_readlane_b32 s59, v61, 45 +; VI-NEXT: v_readlane_b32 s60, v61, 46 +; VI-NEXT: v_readlane_b32 s61, v61, 47 +; VI-NEXT: v_readlane_b32 s63, v61, 48 +; VI-NEXT: v_readlane_b32 s72, v61, 49 +; VI-NEXT: v_readlane_b32 s73, v61, 50 +; VI-NEXT: v_readlane_b32 s75, v61, 51 +; VI-NEXT: v_readlane_b32 s76, v61, 52 +; VI-NEXT: v_readlane_b32 s77, v61, 53 +; VI-NEXT: v_readlane_b32 s79, v61, 54 +; VI-NEXT: v_readlane_b32 s88, v61, 55 +; VI-NEXT: v_readlane_b32 s89, v61, 56 +; VI-NEXT: v_readlane_b32 s90, v61, 57 +; VI-NEXT: v_readlane_b32 s91, v61, 58 +; VI-NEXT: v_readlane_b32 s31, v61, 59 +; VI-NEXT: v_readlane_b32 s34, v61, 60 +; VI-NEXT: v_readlane_b32 s37, v61, 9 +; VI-NEXT: v_readlane_b32 vcc_lo, v61, 61 +; VI-NEXT: v_readlane_b32 vcc_hi, v61, 62 +; VI-NEXT: v_readlane_b32 s35, v61, 63 +; VI-NEXT: v_readlane_b32 s9, v62, 0 +; VI-NEXT: v_readlane_b32 s7, v62, 1 +; VI-NEXT: v_readlane_b32 s39, v61, 7 +; VI-NEXT: v_readlane_b32 s49, v61, 5 +; VI-NEXT: v_readlane_b32 s51, v61, 3 +; VI-NEXT: v_readlane_b32 s53, v61, 1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s64 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s66 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s68 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v28, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, vcc_hi, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s91, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s89, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s88, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s79, 0xff +; VI-NEXT: s_lshl_b32 s7, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s77, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s76, 0xff +; VI-NEXT: s_lshl_b32 s7, s75, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s73, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s72, 0xff +; VI-NEXT: s_lshl_b32 s7, s38, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s61, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s59, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: 
s_and_b32 s6, s57, 0xff +; VI-NEXT: s_lshl_b32 s7, s36, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s43, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s41, 0xff +; VI-NEXT: s_lshl_b32 s7, s29, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s25, 0xff +; VI-NEXT: s_lshl_b32 s7, s78, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s58, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s19, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s15, 0xff +; VI-NEXT: s_lshl_b32 s6, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: s_and_b32 s4, s62, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s11, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v31 +; VI-NEXT: v_add_u32_e32 v28, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -170192,393 +171218,121 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt 
vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
-; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0
-; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v53
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v48
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[4:5]
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -172194,7 +172948,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -173744,7 +174498,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -173757,7 +174511,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
@@ -175314,7 +176068,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
@@ -175488,9 +176242,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
@@ -175508,6 +176259,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220
 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
 ; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr37
@@ -175525,15 +176279,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr21
 ; SI-NEXT: ; implicit-def: $vgpr17
 ; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208
@@ -175669,34 +176423,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
@@ -175716,7 +176473,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120
@@ -175726,7 +176486,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216
@@ -175752,14 +176514,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
 ; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: ; kill: killed $vgpr2
 ; SI-NEXT: ; implicit-def: $vgpr2
@@ -175882,7 +176636,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47
 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
@@ -176540,25 +177293,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT: s_waitcnt vmcnt(6)
 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
@@ -177657,8 +178403,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7
 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9
@@ -177754,13 +178500,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -177888,14 +178646,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0
 ; VI-NEXT: s_waitcnt vmcnt(3)
 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(4)
 ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177903,26 +178666,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52
 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -177931,35 +178674,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(12)
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(11)
 ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: s_waitcnt vmcnt(7)
 ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr43
 ; VI-NEXT: ; implicit-def: $vgpr54
 ; VI-NEXT: ; implicit-def: $vgpr55
 ; VI-NEXT: ; implicit-def: $vgpr40
 ; VI-NEXT: ; implicit-def: $vgpr41
 ; VI-NEXT: ; implicit-def: $vgpr48
 ; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr49
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -177992,39 +178757,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr43
 ; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
@@ -178190,17 +178935,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: ; implicit-def: $vgpr53
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -178885,8 +179622,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
@@ -178997,13 +179734,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
+; GFX9-NEXT: s_waitcnt vmcnt(18)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(18)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(16)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
@@ -179136,14 +179887,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
 ; GFX9-NEXT: s_waitcnt vmcnt(3)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179151,26 +179907,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
-; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -179179,36 +179915,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(14)
 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(13)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr44
 ; GFX9-NEXT: ; implicit-def: $vgpr55
 ; GFX9-NEXT: ; implicit-def: $vgpr54
 ; GFX9-NEXT: ; implicit-def: $vgpr41
 ; GFX9-NEXT: ; implicit-def: $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr38
 ; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr42
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179231,49 +179993,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
 ; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr42
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -179439,17 +180177,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr53
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
@@ -180795,7 +181525,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572
@@ -180828,7 +181558,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444
@@ -181657,7 +182387,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end
 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400
@@ -181690,7 +182420,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516
-; GFX11-FAKE16-NEXT: s_clause 0xf
+; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528
@@ -183515,33 +184245,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0
-; VI-NEXT: s_waitcnt vmcnt(6)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -183586,52 +184336,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68
 ; VI-NEXT: s_waitcnt vmcnt(10)
 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324
 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
@@ -183651,6 +184355,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
@@ -183659,7 +184364,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
;
VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183691,6 +184395,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183715,15 +184438,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183773,10 +184499,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 
x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183784,50 +184511,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, 
s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183842,13 +184556,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183870,21 +184583,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 
; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183902,11 +184622,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183939,7 +184658,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183960,6 +184678,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -184551,29 +185270,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184637,82 +185378,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184733,6 +185434,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184986,14 +185694,13 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -185003,7 +185710,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -185015,6 +185721,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185480,7 +186187,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186210,7 +186917,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186252,7 +186959,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -187036,7 +187743,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -189098,27 +189805,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 ; 
VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -189130,47 +189852,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; 
implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -189179,38 +189892,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0 @@ -189254,8 +189935,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded 
Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189293,28 +190000,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189326,83 +190054,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, 
v52 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v62, v36 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189417,61 +190082,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 
24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_mov_b32_e32 v38, v56 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: 
v_lshrrev_b32_e32 v22, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_mov_b32_e32 v40, v16 -; VI-NEXT: v_mov_b32_e32 v16, v48 -; VI-NEXT: v_mov_b32_e32 v48, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v38, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189490,36 +190188,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189527,13 +190236,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189542,38 +190259,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v49, v9, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189591,11 +190304,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189612,28 +190325,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; 
VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189641,36 +190358,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189679,7 +190373,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189695,8 +190388,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x 
half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189714,21 +190405,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189736,23 +190427,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189760,27 +190442,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189797,26 +190491,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189986,16 +190678,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -190067,14 +190758,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12 @@ -190082,41 +190772,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -190124,6 +190815,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190161,17 +190853,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 @@ -190179,6 +190870,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -190207,22 +190899,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190285,6 +190961,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190315,7 +191008,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190349,7 +191041,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190472,101 +191164,100 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(62) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190582,6 +191273,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190607,7 +191299,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -191633,7 +192325,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191654,10 +192350,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192293,7 +192985,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -194483,8 +195175,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194492,6 +195182,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194499,12 +195190,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194512,14 +195197,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194554,6 +195245,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -195713,42 +196405,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196715,7 +197407,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196750,7 +197442,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197669,7 +198361,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197731,7 +198423,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197782,11 +198474,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197876,7 +198568,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197884,28 +198599,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded 
Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197913,240 +198621,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -198158,15 +198837,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -198202,7 +198885,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198682,15 +199365,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -200009,8 +200692,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -200106,13 +200789,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200240,14 +200935,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200255,26 +200955,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200283,35 +200963,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; 
VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200344,39 +201046,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200542,17 +201224,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201237,8 +201911,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201349,13 +202023,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201488,14 +202176,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201503,26 
+202196,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201531,36 +202204,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201583,49 +202282,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; 
GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201791,17 +202466,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 
offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -203147,7 +203814,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -203180,7 +203847,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -204009,7 +204676,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -204042,7 +204709,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -204087,7 +204754,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte 
Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -204097,9 +204763,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -204148,6 +204814,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -204170,9 +204842,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -204181,30 +204851,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 
s6, v38 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -204218,6 +204886,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204294,39 +204966,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204340,7 +205008,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204367,9 +205035,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: 
v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205938,33 +206603,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 
s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -206009,52 +206694,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -206074,6 +206713,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -206082,7 +206722,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -206114,6 +206753,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -206138,15 +206796,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: 
v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -206196,10 +206857,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -206207,50 +206869,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 
4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206265,13 +206914,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; 
VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206293,21 +206941,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206325,11 +206980,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206362,7 +207016,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -206383,6 +207036,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206974,29 +207628,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 
v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -207060,82 +207736,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, 
s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -207156,6 +207792,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -207409,14 +208052,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207426,7 +208068,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207438,6 +208079,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207903,7 +208545,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: 
bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -208633,7 +209275,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -208675,7 +209317,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209459,7 +210101,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209562,100 +210204,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: 
v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209785,14 +210333,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209809,13 +210372,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209870,12 +210426,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209885,36 
+210468,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209936,6 +210564,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211555,22 +212195,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211588,6 +212212,22 @@ define <128 x i8> 
@bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211884,14 +212524,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211923,10 +212561,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211997,10 +212631,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 
v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -212201,9 +212841,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -212211,38 +212848,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 
 ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
@@ -212255,8 +212889,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
@@ -212325,6 +212957,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_mov_b32_e32 v49, v53
 ; VI-NEXT: v_mov_b32_e32 v53, v38
 ; VI-NEXT: v_mov_b32_e32 v38, v55
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17
 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
@@ -212336,6 +212969,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT: v_mov_b32_e32 v55, v31
 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT: .LBB98_4: ; %end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -212790,22 +213430,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -212868,6 +213492,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: ; implicit-def: $vgpr50
 ; GFX9-NEXT: ; kill: killed $vgpr50
 ; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr44
 ; GFX9-NEXT: ; kill: killed $vgpr50
 ; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -212898,7 +213539,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: ; implicit-def: $vgpr52
 ; GFX9-NEXT: ; implicit-def: $vgpr51
 ; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -212932,7 +213572,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(33)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT: ; implicit-def: $vgpr33
 ; GFX9-NEXT: ; kill: killed $vgpr33
@@ -213055,101 +213695,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
 ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +213804,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +213829,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
 ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
 ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
@@ -214215,7 +214855,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +214880,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +215515,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +215654,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s91, v32
 ; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s35, v37
 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(12)
 ; SI-NEXT: v_readfirstlane_b32 s83, v38
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +215686,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s39, v1
 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
 ; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
 ; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
 ; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s92, v36
 ; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s94, v37
 ; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
 ; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
 ; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
 ; SI-NEXT: v_writelane_b32 v41, s34, 15
 ; SI-NEXT: v_writelane_b32 v41, s36, 16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
 ; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: v_writelane_b32 v41, s48, 18
 ; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +218695,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -219068,7 +219703,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +219738,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5
 ; GFX11-NEXT: s_mov_b32 s99, 0
 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +220657,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v74, off, s32
 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +220719,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221471,6 +222106,8 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-LABEL: bitcast_v64bf16_to_v64f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +222124,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +222373,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +222738,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +222757,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222976,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +223276,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +223836,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -224996,19 +225631,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT: v_writelane_b32 v42, s31, 1
 ; VI-NEXT: v_mov_b32_e32 v31, v17
 ; VI-NEXT: v_mov_b32_e32 v30, v16
-; VI-NEXT: v_mov_b32_e32 v29, v15
+; VI-NEXT: v_mov_b32_e32 v32, v15
+; VI-NEXT: v_mov_b32_e32 v33, v13
+; VI-NEXT: v_mov_b32_e32 v34, v11
+; VI-NEXT: v_mov_b32_e32 v35, v9
+; VI-NEXT: v_mov_b32_e32 v36, v7
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v38, v3
 ; VI-NEXT: v_mov_b32_e32 v28, v14
-; VI-NEXT: v_mov_b32_e32 v27, v13
 ; VI-NEXT: v_mov_b32_e32 v26, v12
-; VI-NEXT: v_mov_b32_e32 v25, v11
 ; VI-NEXT: v_mov_b32_e32 v24, v10
-; VI-NEXT: v_mov_b32_e32 v23, v9
 ; VI-NEXT: v_mov_b32_e32 v22, v8
-; VI-NEXT: v_mov_b32_e32 v21, v7
 ; VI-NEXT: v_mov_b32_e32 v20, v6
-; VI-NEXT: v_mov_b32_e32 v19, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
-; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v48, v4
 ; VI-NEXT: v_mov_b32_e32 v16, v2
 ; VI-NEXT: v_readfirstlane_b32 s30, v0
 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -225019,583 +225654,595 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; VI-NEXT: ; %bb.1: ; %cmp.false
 ; VI-NEXT: s_cbranch_execnz .LBB101_4
 ; VI-NEXT: .LBB101_2: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s28, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s30, 16
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
 ; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s24, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s22, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
 ; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v3, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; VI-NEXT: v_mov_b32_e32 v5, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v7, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_mov_b32_e32 v11, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; VI-NEXT: v_mov_b32_e32 v13, v18
+; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
-; VI-NEXT: v_bfe_u32 v35, v34, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
-; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc
-; VI-NEXT: v_bfe_u32 v35, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
-; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; VI-NEXT: v_bfe_u32 v36, v35, 16, 1
v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, 
v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, 
vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: 
v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, 
v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 
0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 
v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -225619,7 +226266,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: .LBB101_5: ; %end ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: v_readlane_b32 s30, v42, 0 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -229236,1105 +229890,1194 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:44 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 -; 
SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v29, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_mov_b32_e32 v23, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_mov_b32_e32 v51, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v59, v11 -; SI-NEXT: v_mov_b32_e32 v60, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v20 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; 
kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v19, v20 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v61, v2 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_mov_b32_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v50 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 
4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: 
v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 
16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v17, v11 
-; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v19 +; SI-NEXT: v_mov_b32_e32 v8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 -; SI-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -231398,17 +232141,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 ; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 ; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -231418,57 +232176,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 @@ -231526,31 +232290,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, 
s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -231855,6 +232596,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -231871,9 +232614,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232122,7 +232863,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -232488,6 
+233228,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -232504,9 +233247,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232725,7 +233466,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -234101,1088 +234842,1237 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 
1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; 
SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: v_mov_b32_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v54, v50 -; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: v_mov_b32_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: v_mov_b32_e32 v42, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_mov_b32_e32 v5, v19 -; SI-NEXT: v_mov_b32_e32 v7, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_mov_b32_e32 v3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_mov_b32_e32 v1, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte 
Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v50 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v42, v43 -; SI-NEXT: v_mov_b32_e32 v3, v17 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v13, v37 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v19, v48 +; SI-NEXT: v_mov_b32_e32 v63, v7 +; SI-NEXT: v_mov_b32_e32 v58, v53 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v7, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_mov_b32_e32 v17, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 -; SI-NEXT: v_mov_b32_e32 v38, v9 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_mov_b32_e32 v11, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v26 -; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v14, v12, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v18, v14, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v20, v18, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[7:8], 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 -; SI-NEXT: v_alignbit_b32 v26, v59, v25, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 
v[51:52], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v55, v20, 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_alignbit_b32 v35, v43, v32, 16 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[62:63], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_alignbit_b32 v39, v29, v32, 16 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[31:32], v[38:39], 16 -; SI-NEXT: v_lshr_b64 
v[37:38], v[5:6], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[23:24], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_mov_b32_e32 v16, v33 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; SI-NEXT: v_mov_b32_e32 v18, v41 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v39, v59 +; SI-NEXT: v_mov_b32_e32 v40, v60 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 +; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 +; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v27, v30 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_mov_b32_e32 v30, v44 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 
v[44:45], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -235215,19 +236105,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v42, s31, 1
; VI-NEXT: v_mov_b32_e32 v31, v17
; VI-NEXT: v_mov_b32_e32 v30, v16
-; VI-NEXT: v_mov_b32_e32 v29, v15
+; VI-NEXT: v_mov_b32_e32 v32, v15
+; VI-NEXT: v_mov_b32_e32 v33, v13
+; VI-NEXT: v_mov_b32_e32 v34, v11
+; VI-NEXT: v_mov_b32_e32 v35, v9
+; VI-NEXT: v_mov_b32_e32 v36, v7
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v38, v3
; VI-NEXT: v_mov_b32_e32 v28, v14
-; VI-NEXT: v_mov_b32_e32 v27, v13
; VI-NEXT: v_mov_b32_e32 v26, v12
-; VI-NEXT: v_mov_b32_e32 v25, v11
; VI-NEXT: v_mov_b32_e32 v24, v10
-; VI-NEXT: v_mov_b32_e32 v23, v9
; VI-NEXT: v_mov_b32_e32 v22, v8
-; VI-NEXT: v_mov_b32_e32 v21, v7
; VI-NEXT: v_mov_b32_e32 v20, v6
-; VI-NEXT: v_mov_b32_e32 v19, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
-; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v48, v4
; VI-NEXT: v_mov_b32_e32 v16, v2
; VI-NEXT: v_readfirstlane_b32 s30, v0
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -235238,583 +236128,595 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB105_4
; VI-NEXT: .LBB105_2: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: s_lshl_b32 s4, s30, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s28, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_f32_e32 v2, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v0
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s24, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s22, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, s4, v17
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v17
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v17
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc
+; VI-NEXT: v_add_f32_e32 v7, s4, v17
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v3, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v17
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; VI-NEXT: v_mov_b32_e32 v5, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v17
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v7, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; VI-NEXT: v_mov_b32_e32 v9, v18
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; VI-NEXT: v_add_f32_e32 v18, s4, v0
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v17
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19]
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v17
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_mov_b32_e32 v11, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; VI-NEXT: v_mov_b32_e32 v13, v18
+; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
-; VI-NEXT: v_bfe_u32 v35, v34, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
-; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc
-; VI-NEXT: v_bfe_u32 v35, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35
-; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
-; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; VI-NEXT: v_bfe_u32 v36, v35, 16, 1
-; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT: v_bfe_u32 v19, v18, 16, 1
+; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19
+; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc
-; VI-NEXT: v_bfe_u32 v36, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36
-; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19
+; VI-NEXT: v_bfe_u32 v21, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20
-; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; VI-NEXT: v_bfe_u32 v37, v36, 16, 1
-; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36
-; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37
+; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48
+; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc
+; VI-NEXT: v_bfe_u32 v21, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc
+; VI-NEXT: v_bfe_u32 v21, v18, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc
+; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37
+; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; VI-NEXT: v_bfe_u32 v23, v21, 16, 1
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37
+; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50]
+; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v25, v23, 16, 1
+; VI-NEXT: v_mov_b32_e32 v15, v49
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
-; VI-NEXT: v_bfe_u32 v37, v20, 16, 1
-; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20
-; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37
-; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT: v_bfe_u32 v23, v20, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21
-; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; VI-NEXT: v_bfe_u32 v38, v37, 16, 1
-; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37
-; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
-; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
-; VI-NEXT: v_bfe_u32 v38, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21
-; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38
-; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22
-; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; VI-NEXT: v_bfe_u32 v39, v38, 16, 1
-; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39
+; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21
+; VI-NEXT: v_bfe_u32 v21, v20, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23
+; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v36
+; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v25, v23, 16, 1
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v27, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
-; VI-NEXT: v_bfe_u32 v39, v22, 16, 1
-; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39
-; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT: v_bfe_u32 v25, v22, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23
-; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; VI-NEXT: v_bfe_u32 v48, v39, 16, 1
-; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39
-; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
-; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
-; VI-NEXT: v_bfe_u32 v48, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23
-; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48
-; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24
-; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; VI-NEXT: v_bfe_u32 v49, v48, 16, 1
-; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48
-; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49
+; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v23, v22, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25
+; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v27, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35
+; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v29, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
-; VI-NEXT: v_bfe_u32 v49, v24, 16, 1
-; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24
-; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49
-; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT: v_bfe_u32 v27, v24, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25
-; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; VI-NEXT: v_bfe_u32 v50, v49, 16, 1
-; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
-; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
-; VI-NEXT: v_bfe_u32 v50, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50
-; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
-; VI-NEXT: v_bfe_u32 v51, v50, 16, 1
-; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51
+; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v25, v24, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27
+; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34
+; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v29, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34
+; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v34, v29, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29
+; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
-; VI-NEXT: v_bfe_u32 v51, v26, 16, 1
-; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51
+; VI-NEXT: v_bfe_u32 v29, v26, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27
-; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
-; VI-NEXT: v_bfe_u32 v52, v51, 16, 1
-; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51
+; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v27, v26, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
+; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33
+; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v52, v29, 16, 1
+; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29
; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
-; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
-; VI-NEXT: v_bfe_u32 v52, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc
+; VI-NEXT: v_bfe_u32 v52, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33
; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52
-; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28
-; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
-; VI-NEXT: v_bfe_u32 v53, v52, 16, 1
-; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52
-; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53
+; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28
; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
-; VI-NEXT: v_bfe_u32 v53, v28, 16, 1
-; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28
-; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53
+; VI-NEXT: v_bfe_u32 v33, v28, 16, 1
+; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29
-; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53
-; VI-NEXT: v_bfe_u32 v54, v53, 16, 1
-; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53
+; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v29, v28, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
+; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33
+; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v54, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33
; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
-; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc
-; VI-NEXT: v_bfe_u32 v54, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc
+; VI-NEXT: v_bfe_u32 v54, v32, 16, 1
+; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32
; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54
-; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc
+; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc
; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30
-; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
-; VI-NEXT: v_bfe_u32 v55, v54, 16, 1
-; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54
-; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc
; VI-NEXT: v_bfe_u32 v55, v30, 16, 1
; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30
; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
-; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
-; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
-; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55
-; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
-; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
-; VI-NEXT: v_bfe_u32 v40, v31, 16, 1
-; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31
+; VI-NEXT: v_bfe_u32 v55, v54, 16, 1
+; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54
+; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55
+; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_bfe_u32 v40, v30, 16, 1
+; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30
; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40
-; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49]
+; VI-NEXT: v_cndmask_b32_e32 v40, v40, v41, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37]
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31
+; VI-NEXT: v_mov_b32_e32 v37, v48
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51]
+; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35]
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40
+; VI-NEXT: v_mov_b32_e32 v35, v48
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53]
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16
-; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16
-; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16
-; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16
-; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16
-; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16
-; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16
-; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16
-; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16
-; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16
-; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16
-; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16
-; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33]
+; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31]
+; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39]
+; VI-NEXT: v_mov_b32_e32 v33, v48
+; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55]
+; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29]
+; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23]
+; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21]
+; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19]
+; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17]
+; VI-NEXT: v_mov_b32_e32 v31, v50
; VI-NEXT: s_branch .LBB105_5
; VI-NEXT: .LBB105_3:
; VI-NEXT: s_branch .LBB105_2
@@ -235838,7 +236740,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: .LBB105_5: ; %end
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v18, v32
+; VI-NEXT: v_mov_b32_e32 v17, v38
+; VI-NEXT: v_mov_b32_e32 v18, v48
+; VI-NEXT: v_mov_b32_e32 v19, v37
+; VI-NEXT: v_mov_b32_e32 v21, v36
+; VI-NEXT: v_mov_b32_e32 v23, v35
+; VI-NEXT: v_mov_b32_e32 v25, v34
+; VI-NEXT: v_mov_b32_e32 v27, v33
+; VI-NEXT: v_mov_b32_e32 v29, v32
; VI-NEXT: v_readlane_b32 s31, v42, 1
; VI-NEXT: v_readlane_b32 s30, v42, 0
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
@@ -238881,54 +239790,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_writelane_b32 v40, s85, 29
; SI-NEXT: v_writelane_b32 v40, s86, 30
; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: s_mov_b32 s74, s23
+; SI-NEXT: s_mov_b32 s72, s21
+; SI-NEXT: s_mov_b32 s61, s18
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s60, s16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s17, 0
-; SI-NEXT: s_mov_b32 s61, s19
; SI-NEXT: v_writelane_b32 v41, s60, 1
-; SI-NEXT: s_mov_b32 s63, s18
-; SI-NEXT: v_writelane_b32 v41, s61, 2
-; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: v_writelane_b32 v41, s63, 3
+; SI-NEXT: v_writelane_b32 v41, s19, 2
+; SI-NEXT: v_writelane_b32 v41, s61, 3
; SI-NEXT: v_writelane_b32 v41, s72, 4
-; SI-NEXT: s_mov_b32 s74, s23
; SI-NEXT: v_writelane_b32 v41, s20, 5
; SI-NEXT: v_writelane_b32 v41, s74, 6
-; SI-NEXT: s_mov_b32 s75, s25
+; SI-NEXT: s_mov_b32 s76, s25
; SI-NEXT: v_writelane_b32 v41, s22, 7
-; SI-NEXT: v_writelane_b32 v41, s75, 8
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: v_writelane_b32 v41, s76, 8
+; SI-NEXT: s_mov_b32 s78, s27
; SI-NEXT: v_writelane_b32 v41, s24, 9
-; SI-NEXT: v_writelane_b32 v41, s76, 10
-; SI-NEXT: s_mov_b32 s93, s29
+; SI-NEXT: v_writelane_b32 v41, s78, 10
+; SI-NEXT: s_mov_b32 s88, s29
; SI-NEXT: v_writelane_b32 v41, s26, 11
-; SI-NEXT: v_writelane_b32 v41, s93, 12
-; SI-NEXT: v_readfirstlane_b32 s16, v2
+; SI-NEXT: v_writelane_b32 v41, s88, 12
+; SI-NEXT: v_readfirstlane_b32 s77, v2
; SI-NEXT: v_writelane_b32 v41, s28, 13
-; SI-NEXT: v_readfirstlane_b32 s73, v4
-; SI-NEXT: v_writelane_b32 v41, s16, 14
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_writelane_b32 v41, s73, 15
-; SI-NEXT: v_readfirstlane_b32 s90, v6
-; SI-NEXT: v_writelane_b32 v41, s89, 16
-; SI-NEXT: v_readfirstlane_b32 s91, v5
-; SI-NEXT: v_writelane_b32 v41, s90, 17
-; SI-NEXT: v_readfirstlane_b32 s34, v8
-; SI-NEXT: v_writelane_b32 v41, s91, 18
-; SI-NEXT: v_readfirstlane_b32 s35, v7
-; SI-NEXT: v_writelane_b32 v41, s34, 19
-; SI-NEXT: v_readfirstlane_b32 s36, v10
-; SI-NEXT: v_writelane_b32 v41, s35, 20
-; SI-NEXT: v_writelane_b32 v40, s96, 32
-; SI-NEXT: v_readfirstlane_b32 s37, v9
-; SI-NEXT: v_writelane_b32 v41, s36, 21
+; SI-NEXT: v_readfirstlane_b32 s79, v4
+; SI-NEXT: v_writelane_b32 v41, s77, 14
+; SI-NEXT: v_readfirstlane_b32 s90, v3
+; SI-NEXT: v_writelane_b32 v41, s79, 15
+; SI-NEXT: v_readfirstlane_b32 s91, v6
+; SI-NEXT: v_writelane_b32 v41, s90, 16
+; SI-NEXT: v_readfirstlane_b32 s92, v5
+; SI-NEXT: v_writelane_b32 v41, s91, 17
+; SI-NEXT: v_readfirstlane_b32 s93, v8
+; SI-NEXT: v_writelane_b32 v41, s92, 18
+; SI-NEXT: v_readfirstlane_b32 s94, v7
+; SI-NEXT: v_writelane_b32 v41, s93, 19
+; SI-NEXT: v_readfirstlane_b32 s95, v10
+; SI-NEXT: v_writelane_b32 v41, s94, 20
+; SI-NEXT: v_readfirstlane_b32 s30, v9
+; SI-NEXT: v_writelane_b32 v41, s95, 21
+; SI-NEXT: v_readfirstlane_b32 s31, v12
+; SI-NEXT: v_writelane_b32 v41, s30, 22
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s62, v31
+; SI-NEXT: v_readfirstlane_b32 s21, v31
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s80, v32
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s69, v33
+; SI-NEXT: v_readfirstlane_b32 s75, v33
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
@@ -238940,20 +239849,25 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s84, v34
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s68, v35
+; SI-NEXT: v_readfirstlane_b32 s23, v35
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v36
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s87, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
-; SI-NEXT: v_readfirstlane_b32 s6, v37
+; SI-NEXT: v_readfirstlane_b32 s18, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: v_writelane_b32 v41, s31, 23
+; SI-NEXT: v_readfirstlane_b32 s34, v11
+; SI-NEXT: v_readfirstlane_b32 s35, v14
+; SI-NEXT: v_readfirstlane_b32 s36, v13
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_readfirstlane_b32 s37, v16
; SI-NEXT: v_writelane_b32 v40, s97, 33
-; SI-NEXT: v_readfirstlane_b32 s38, v12
-; SI-NEXT: v_writelane_b32 v41, s37, 22
+; SI-NEXT: v_readfirstlane_b32 s38, v15
; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: v_readfirstlane_b32 s14, v30
; SI-NEXT: v_readfirstlane_b32 s15, v29
@@ -238963,21 +239877,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_readfirstlane_b32 s11, v25
; SI-NEXT: v_readfirstlane_b32 s8, v24
; SI-NEXT: v_readfirstlane_b32 s9, v23
-; SI-NEXT: v_readfirstlane_b32 s88, v22
-; SI-NEXT: v_readfirstlane_b32 s29, v21
-; SI-NEXT: v_readfirstlane_b32 s79, v20
-; SI-NEXT: v_readfirstlane_b32 s27, v19
-; SI-NEXT: v_readfirstlane_b32 s78, v18
-; SI-NEXT: v_readfirstlane_b32 s25, v17
-; SI-NEXT: v_readfirstlane_b32 s77, v16
-; SI-NEXT: v_readfirstlane_b32 s23, v15
-; SI-NEXT: v_readfirstlane_b32 s39, v14
-; SI-NEXT: v_readfirstlane_b32 s21, v13
-; SI-NEXT: v_readfirstlane_b32 s19, v11
-; SI-NEXT: v_readfirstlane_b32 s18, v1
-; SI-NEXT: v_writelane_b32 v41, s38, 23
+; SI-NEXT: v_readfirstlane_b32 s89, v22
+; SI-NEXT: v_readfirstlane_b32 s7, v21
+; SI-NEXT: v_readfirstlane_b32 s25, v20
+; SI-NEXT: v_readfirstlane_b32 s29, v19
+; SI-NEXT: v_readfirstlane_b32 s39, v18
+; SI-NEXT: v_readfirstlane_b32 s27, v17
; SI-NEXT: v_writelane_b32 v40, s99, 35
-; SI-NEXT: v_writelane_b32 v41, s39, 24
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s58, v31
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -238997,261 +239903,289 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s42, v34
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_writelane_b32 v41, s5, 24
+; SI-NEXT: v_writelane_b32 v41, s34, 25
+; SI-NEXT: v_writelane_b32 v41, s35, 26
+; SI-NEXT: v_writelane_b32 v41, s36, 27
+; SI-NEXT: v_writelane_b32 v41, s37, 28
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s43, v35
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s40, v36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s41, v37
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_writelane_b32 v41, s38, 29
+; SI-NEXT: v_writelane_b32 v41, s39, 30
; SI-NEXT: s_cbranch_scc0 .LBB107_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshl_b32 s4, s60, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 25
-; SI-NEXT: s_lshl_b32 s4, s63, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 26
-; SI-NEXT: s_lshl_b32 s4, s20, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 27
-; SI-NEXT: s_lshl_b32 s4, s22, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 28
-; SI-NEXT: s_lshl_b32 s4, s24, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 29
-; SI-NEXT: s_lshl_b32 s4, s26, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 30
-; SI-NEXT: s_lshl_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 31
-; SI-NEXT: s_lshl_b32 s4, s18, 16
; SI-NEXT: v_writelane_b32 v41, s4, 32
-; SI-NEXT: s_lshl_b32 s4, s89, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 33
-; SI-NEXT: s_lshl_b32 s4, s91, 16
+; SI-NEXT: s_lshl_b32 s4, s17, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 31
+; SI-NEXT: s_lshl_b32 s4, s61, 16
; SI-NEXT: v_writelane_b32 v41, s4, 34
-; SI-NEXT: s_lshl_b32 s4, s35, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 35
-; SI-NEXT: s_lshl_b32 s4, s37, 16
-; SI-NEXT: s_lshl_b32 s7, s17, 16
-; SI-NEXT: s_lshl_b32 s96, s61, 16
-; SI-NEXT: s_lshl_b32 s99, s72, 16
-; SI-NEXT: s_lshl_b32 s97, s74, 16
-; SI-NEXT: s_lshl_b32 s92, s75, 16
-; SI-NEXT: s_lshl_b32 s94, s76, 16
-; SI-NEXT: s_lshl_b32 s95, s93, 16
-; SI-NEXT: s_lshl_b32 s93, s16, 16
-; SI-NEXT: s_lshl_b32 s30, s73, 16
-; SI-NEXT: s_lshl_b32 s31, s90, 16
-; SI-NEXT: s_lshl_b32 s34, s34, 16
+; SI-NEXT: s_lshl_b32 s4, s19, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 33
+; SI-NEXT: s_lshl_b32 s4, s20, 16
; SI-NEXT: v_writelane_b32 v41, s4, 36
-; SI-NEXT: s_lshl_b32 s35, s36, 16
-; SI-NEXT: s_lshl_b32 s86, s19, 16
-; SI-NEXT: s_lshl_b32 s36, s38, 16
-; SI-NEXT: s_lshl_b32 s22, s21, 16
-; SI-NEXT: s_lshl_b32 s37, s39, 16
-; SI-NEXT: s_lshl_b32 s24, s23, 16
-; SI-NEXT: s_lshl_b32 s38, s77, 16
-; SI-NEXT: s_lshl_b32 s28, s25, 16
-; SI-NEXT: s_lshl_b32 s39, s78, 16
-; SI-NEXT: s_lshl_b32 s61, s27, 16
-; SI-NEXT: s_lshl_b32 s48, s79, 16
-; SI-NEXT: s_lshl_b32 s89, s29, 16
-; SI-NEXT: s_lshl_b32 s49, s88, 16
-; SI-NEXT: s_lshl_b32 s60, s9, 16
-; SI-NEXT: s_lshl_b32 s50, s8, 16
-; SI-NEXT: s_lshl_b32 s90, s11, 16
-; SI-NEXT: s_lshl_b32 s91, s10, 16
-; SI-NEXT: s_lshl_b32 s70, s13, 16
-; SI-NEXT: s_lshl_b32 s51, s12, 16
-; SI-NEXT: s_lshl_b32 s71, s15, 16
-; SI-NEXT: s_lshl_b32 s52, s14, 16
-; SI-NEXT: s_lshl_b32 s20, s41, 16
-; SI-NEXT: s_lshl_b32 s53, s40, 16
-; SI-NEXT: s_lshl_b32 s81, s43, 16
-; SI-NEXT: s_lshl_b32 s54, s42, 16
-; SI-NEXT: s_lshl_b32 s63, s45, 16
-; SI-NEXT: s_lshl_b32 s55, s44, 16
-; SI-NEXT: s_lshl_b32 s72, s47, 16
-; SI-NEXT: s_lshl_b32 s64, s46, 16
-; SI-NEXT: s_lshl_b32 s82, s57, 16
-; SI-NEXT: s_lshl_b32 s65, s56, 16
-; SI-NEXT: s_lshl_b32 s74, s59, 16
-; SI-NEXT: s_lshl_b32 s66, s58, 16
-; SI-NEXT: s_lshl_b32 s75, s87, 16
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: s_lshl_b32 s67, s6, 16
-; SI-NEXT: s_lshl_b32 s76, s83, 16
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: s_lshl_b32 s68, s68, 16
-; SI-NEXT: s_lshl_b32 s85, s84, 16
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: s_lshl_b32 s69, s69, 16
-; SI-NEXT: s_lshl_b32 s17, s80, 16
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: s_lshl_b32 s26, s62, 16
+; SI-NEXT: s_lshl_b32 s4, s72, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 35
+; SI-NEXT: s_lshl_b32 s4, s74, 16
+; SI-NEXT: s_lshl_b32 s16, s22, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 37
+; SI-NEXT: s_lshl_b32 s6, s24, 16
+; SI-NEXT: s_lshl_b32 s73, s76, 16
+; SI-NEXT: s_lshl_b32 s98, s26, 16
+; SI-NEXT: s_lshl_b32 s63, s78, 16
+; SI-NEXT: s_lshl_b32 s96, s28, 16
+; SI-NEXT: s_lshl_b32 s62, s88, 16
+; SI-NEXT: s_lshl_b32 s97, s5, 16
+; SI-NEXT: s_lshl_b32 s99, s77, 16
+; SI-NEXT: s_lshl_b32 s85, s90, 16
+; SI-NEXT: s_lshl_b32 s86, s79, 16
+; SI-NEXT: s_lshl_b32 s81, s92, 16
+; SI-NEXT: s_lshl_b32 s82, s91, 16
+; SI-NEXT: s_lshl_b32 s70, s94, 16
+; SI-NEXT: s_lshl_b32 s71, s93, 16
+; SI-NEXT: s_lshl_b32 s68, s30, 16
+; SI-NEXT: s_lshl_b32 s69, s95, 16
+; SI-NEXT: s_lshl_b32 s66, s34, 16
+; SI-NEXT: s_lshl_b32 s67, s31, 16
+; SI-NEXT: s_lshl_b32 s64, s36, 16
+; SI-NEXT: s_lshl_b32 s65, s35, 16
+; SI-NEXT: s_lshl_b32 s54, s38, 16
+; SI-NEXT: s_lshl_b32 s55, s37, 16
+; SI-NEXT: s_lshl_b32 s52, s27, 16
+; SI-NEXT: s_lshl_b32 s53, s39, 16
+; SI-NEXT: s_lshl_b32 s50, s29, 16
+; SI-NEXT: s_lshl_b32 s51, s25, 16
+; SI-NEXT: s_lshl_b32 s48, s7, 16
+; SI-NEXT: s_lshl_b32 s49, s89, 16
+; SI-NEXT: s_lshl_b32 s38, s9, 16
+; SI-NEXT: s_lshl_b32 s39, s8, 16
+; SI-NEXT: s_lshl_b32 s37, s11, 16
+; SI-NEXT: s_lshl_b32 s35, s10, 16
+; SI-NEXT: s_lshl_b32 s31, s13, 16
+; SI-NEXT: s_lshl_b32 s36, s12, 16
+; SI-NEXT: s_lshl_b32 s95, s15, 16
+; SI-NEXT: s_lshl_b32 s34, s14, 16
+; SI-NEXT: s_lshl_b32 s93, s41, 16
+; SI-NEXT: s_lshl_b32 s30, s40, 16
+; SI-NEXT: s_lshl_b32 s91, s43, 16
+; SI-NEXT: s_lshl_b32 s94, s42, 16
+; SI-NEXT: s_lshl_b32 s92, s45, 16
+; SI-NEXT: s_lshl_b32 s90, s44, 16
+; SI-NEXT: s_lshl_b32 s88, s47, 16
+; SI-NEXT: s_lshl_b32 s28, s46, 16
+; SI-NEXT: s_lshl_b32 s78, s57, 16
+; SI-NEXT: s_lshl_b32 s26, s56, 16
+; SI-NEXT: s_lshl_b32 s76, s59, 16
+; SI-NEXT: s_lshl_b32 s24, s58, 16
+; SI-NEXT: s_lshl_b32 s74, s87, 16
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: s_lshl_b32 s22, s18, 16
+; SI-NEXT: s_lshl_b32 s72, s83, 16
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: s_lshl_b32 s20, s23, 16
+; SI-NEXT: s_lshl_b32 s61, s84, 16
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: s_lshl_b32 s19, s75, 16
+; SI-NEXT: s_lshl_b32 s60, s80, 16
+; SI-NEXT: s_lshl_b32 s17, s21, 16
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB107_3
; SI-NEXT: .LBB107_2:
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr63
; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr99
+; SI-NEXT: ; implicit-def: $sgpr62
; SI-NEXT: ; implicit-def: $sgpr97
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr31
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr99
+; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr71
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr53
; SI-NEXT: ; implicit-def: $sgpr81
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr55
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def:
$sgpr67 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s60 ; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s17, s61 +; SI-NEXT: s_mov_b32 s60, s72 +; SI-NEXT: s_mov_b32 s61, s74 +; SI-NEXT: s_mov_b32 s72, s76 +; SI-NEXT: s_mov_b32 s74, s78 +; SI-NEXT: s_mov_b32 s76, s88 +; SI-NEXT: s_mov_b32 s78, s92 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s37 +; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s36, s38 +; SI-NEXT: s_mov_b32 s37, s39 +; SI-NEXT: s_mov_b32 s38, s48 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s48, s50 +; SI-NEXT: s_mov_b32 s49, s51 +; SI-NEXT: s_mov_b32 s50, s52 +; SI-NEXT: s_mov_b32 s51, s53 +; SI-NEXT: s_mov_b32 s52, s54 +; SI-NEXT: s_mov_b32 s53, s55 +; SI-NEXT: s_mov_b32 s54, s6 +; SI-NEXT: s_mov_b32 s55, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: 
s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_add_i32 s25, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s21, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 23 ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 -; 
SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: v_readlane_b32 s18, v41, 17 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: s_lshl_b32 s20, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: v_readlane_b32 s18, v41, 15 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -239293,42 +240227,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_add_i32 s80, s80, 3 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_and_b32 s4, s80, 0xffff ; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff ; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s28, 31 +; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -239336,13 +240248,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; 
SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s27, 32 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -239355,24 +240265,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 33 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 vcc_hi, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s26, 34 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 +; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -239383,294 +240291,311 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 35 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: 
v_writelane_b32 v41, s25, 36 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s24, 16 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s23, 16 +; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s22, 16 +; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s21, 16 +; SI-NEXT: s_and_b32 s99, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s18, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s20, 16 +; SI-NEXT: s_and_b32 s82, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s17, 16 +; SI-NEXT: s_and_b32 s71, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s6, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s65, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s15, 16 +; SI-NEXT: s_and_b32 s53, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s13, 16 +; SI-NEXT: s_and_b32 s51, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s11, 16 ; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: s_lshl_b32 s48, s9, 16 +; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s7, 16 +; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s35, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s10, 16 +; SI-NEXT: s_and_b32 s34, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s14, 16 +; SI-NEXT: s_and_b32 s94, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s40, 16 +; SI-NEXT: s_and_b32 s92, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s42, 16 +; SI-NEXT: s_and_b32 s90, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s44, 16 +; SI-NEXT: s_and_b32 s28, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s46, 16 +; SI-NEXT: s_and_b32 s26, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s56, 16 +; SI-NEXT: s_and_b32 s24, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_and_b32 s22, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, vcc_hi, 16 +; SI-NEXT: s_and_b32 s20, vcc_lo, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, vcc_lo, 16 +; SI-NEXT: s_and_b32 s19, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s5, 16 +; SI-NEXT: s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 37 ; SI-NEXT: .LBB107_5: 
; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s6, v41, 33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 35 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 @@ -240180,38 +241105,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -240222,7 +241148,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -241300,10 +242225,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241315,7 +242242,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241325,8 +242269,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241335,39 +242284,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: 
v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241385,8 +242317,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241430,11 +242360,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241571,27 +242496,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c6211aae19c1b..22dd3a0438136 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2853,50 +2853,52 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2912,78 +2914,80 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -7392,50 +7396,52 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, 
s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -7451,78 +7457,80 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, 
vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; 
VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -11581,50 +11589,52 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -11640,78 +11650,80 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, 
vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -15349,50 +15361,52 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -15408,78 +15422,80 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; 
VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 
v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -18719,66 +18735,68 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v6, v7, v3, 16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 -; SI-NEXT: v_alignbit_b32 v4, v12, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v5, v12 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: @@ -18790,78 +18808,80 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, 
vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -21770,78 +21790,80 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: 
v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -24532,92 +24554,92 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: 
s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v19, v1, v16, 16 -; SI-NEXT: v_alignbit_b32 v20, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_lshr_b64 v[19:20], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshr_b64 v[22:23], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_alignbit_b32 v21, v2, v26, 16 -; SI-NEXT: v_alignbit_b32 v22, v14, v24, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v21, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v22, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v20, v6, v1, 16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[19:20], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v7 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v8, v21 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: v_mov_b32_e32 v8, v22 ; SI-NEXT: v_mov_b32_e32 v9, v11 ; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v12, v23 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 @@ -24628,142 +24650,143 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s20, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s19, 24 -; VI-NEXT: s_lshr_b32 s11, s19, 16 -; VI-NEXT: s_lshr_b32 s13, s19, 8 -; VI-NEXT: s_lshr_b32 s12, s18, 16 -; VI-NEXT: s_lshr_b32 s14, s18, 8 -; VI-NEXT: s_lshr_b32 s15, s17, 24 -; VI-NEXT: s_lshr_b32 s20, s17, 16 -; VI-NEXT: s_lshr_b32 s22, s17, 8 -; VI-NEXT: s_lshr_b32 s21, s16, 16 -; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b32 s21, s19, 24 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: s_lshr_b32 s22, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v6, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v6 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v6
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v9, v12
+; VI-NEXT: v_mov_b32_e32 v1, v4
+; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v0
; VI-NEXT: s_branch .LBB109_5
; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr23
-; VI-NEXT: ; implicit-def: $sgpr21
-; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr22
-; VI-NEXT: ; implicit-def: $sgpr20
-; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr13
; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr11
; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr23
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: ; implicit-def: $sgpr13
-; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr20
+; VI-NEXT: ; implicit-def: $sgpr21
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v18, s16
-; VI-NEXT: v_mov_b32_e32 v19, s17
-; VI-NEXT: v_mov_b32_e32 v16, s18
-; VI-NEXT: v_mov_b32_e32 v17, s19
-; VI-NEXT: v_mov_b32_e32 v1, s23
-; VI-NEXT: v_mov_b32_e32 v2, s21
-; VI-NEXT: v_mov_b32_e32 v5, s22
-; VI-NEXT: v_mov_b32_e32 v6, s20
-; VI-NEXT: v_mov_b32_e32 v7, s15
-; VI-NEXT: v_mov_b32_e32 v9, s14
-; VI-NEXT: v_mov_b32_e32 v10, s12
-; VI-NEXT: v_mov_b32_e32 v13, s13
-; VI-NEXT: v_mov_b32_e32 v14, s11
-; VI-NEXT: v_mov_b32_e32 v15, s10
-; VI-NEXT: v_mov_b32_e32 v11, s6
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v8, s18
+; VI-NEXT: v_mov_b32_e32 v12, s19
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v9, s22
+; VI-NEXT: v_mov_b32_e32 v15, s21
+; VI-NEXT: v_mov_b32_e32 v14, s20
+; VI-NEXT: v_mov_b32_e32 v13, s15
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v11, s13
+; VI-NEXT: v_mov_b32_e32 v7, s12
+; VI-NEXT: v_mov_b32_e32 v6, s11
+; VI-NEXT: v_mov_b32_e32 v5, s10
+; VI-NEXT: v_mov_b32_e32 v16, s6
+; VI-NEXT: v_mov_b32_e32 v17, s4
; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v18
-; VI-NEXT: v_mov_b32_e32 v4, v19
-; VI-NEXT: v_mov_b32_e32 v8, v16
-; VI-NEXT: v_mov_b32_e32 v12, v17
+; VI-NEXT: v_mov_b32_e32 v3, v17
+; VI-NEXT: v_mov_b32_e32 v1, v11
+; VI-NEXT: v_mov_b32_e32 v11, v16
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 01e397d629ea9..155ec568a65d3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -4052,90 +4052,92 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -4151,150 +4153,153 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -11204,90 +11209,92 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -11303,150 +11310,153 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -17924,90 +17934,92 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB67_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
@@ -18023,150 +18035,153 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s6, v8
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1]
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12]
+; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14]
+; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v10
+; VI-NEXT: v_mov_b32_e32 v3, v9
+; VI-NEXT: v_mov_b32_e32 v5, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -24092,90 +24107,92 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30
+; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16
+; SI-NEXT: v_mov_b32_e32 v7, v32
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_mov_b32_e32 v7, v8
; SI-NEXT: .LBB83_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
@@ -24191,150 +24208,153 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v8
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v8
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v8
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s4, v0
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v8
+; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8,
v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -29752,120 +29772,124 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v10, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v2, 
0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v14, v15, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 -; SI-NEXT: v_alignbit_b32 v12, v24, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v24 +; SI-NEXT: v_mov_b32_e32 v7, v20 +; SI-NEXT: v_mov_b32_e32 v9, v25 +; SI-NEXT: v_mov_b32_e32 v11, v18 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; 
implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; @@ -29878,150 +29902,154 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 
v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: 
v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -35136,150 +35164,154 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; 
VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 
v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: 
v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -40103,186 +40135,205 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; 
SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v48, v1, v32, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_alignbit_b32 v37, v2, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_alignbit_b32 v35, v2, v40, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v36, v22, v54, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v43, 16 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; 
SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v36, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v38, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v48, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v49, v6, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 
v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v13, v20 +; SI-NEXT: v_mov_b32_e32 v20, v40 +; SI-NEXT: v_mov_b32_e32 v24, v42 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v50 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v8, v53 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v12, v54 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v21, v33 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_mov_b32_e32 v29, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40291,26 +40342,26 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cmp_lg_u32 s24, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s23, 24 -; VI-NEXT: s_lshr_b32 s15, s23, 16 -; VI-NEXT: s_lshr_b32 s25, s23, 8 -; VI-NEXT: s_lshr_b32 s24, s22, 16 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: s_lshr_b32 s27, s21, 24 -; VI-NEXT: s_lshr_b32 s28, s21, 16 -; VI-NEXT: s_lshr_b32 s40, s21, 8 -; VI-NEXT: s_lshr_b32 s29, s20, 16 -; VI-NEXT: s_lshr_b32 s41, s20, 8 -; VI-NEXT: s_lshr_b32 s42, s19, 24 -; VI-NEXT: s_lshr_b32 s43, s19, 16 -; VI-NEXT: s_lshr_b32 s45, s19, 8 -; VI-NEXT: s_lshr_b32 s44, s18, 16 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: s_lshr_b32 s47, s17, 24 -; VI-NEXT: s_lshr_b32 s56, s17, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: s_lshr_b32 s57, s16, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s23, 24 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s47, s23, 8 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 8 +; VI-NEXT: s_lshr_b32 s44, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s42, s21, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: s_lshr_b32 s45, s20, 8 +; VI-NEXT: s_lshr_b32 
s29, s19, 24 +; VI-NEXT: s_lshr_b32 s28, s19, 16 +; VI-NEXT: s_lshr_b32 s27, s19, 8 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s40, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: s_lshr_b32 s25, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 @@ -40336,225 +40387,225 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] ; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 
s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; 
VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v25, v28 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_mov_b32_e32 v17, v20 +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; 
VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr25 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v26, s59 +; VI-NEXT: v_mov_b32_e32 v25, s58 +; VI-NEXT: v_mov_b32_e32 v31, s57 +; VI-NEXT: v_mov_b32_e32 v30, s56 +; VI-NEXT: v_mov_b32_e32 v29, s47 +; VI-NEXT: v_mov_b32_e32 v18, s46 +; VI-NEXT: v_mov_b32_e32 
v17, s45 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v10, s41 +; VI-NEXT: v_mov_b32_e32 v9, s40 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v5, s58 -; VI-NEXT: v_mov_b32_e32 v6, s56 -; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 -; VI-NEXT: v_mov_b32_e32 v10, s44 -; VI-NEXT: v_mov_b32_e32 v13, s45 -; VI-NEXT: v_mov_b32_e32 v14, s43 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 -; VI-NEXT: v_mov_b32_e32 v18, s29 -; VI-NEXT: v_mov_b32_e32 v21, s40 -; VI-NEXT: v_mov_b32_e32 v22, s28 -; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 -; VI-NEXT: v_mov_b32_e32 v26, s24 -; VI-NEXT: v_mov_b32_e32 v29, s25 -; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 -; VI-NEXT: v_mov_b32_e32 v27, s10 -; VI-NEXT: v_mov_b32_e32 v19, s8 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v36, s10 +; VI-NEXT: v_mov_b32_e32 v37, s8 +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v32, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v3, v32 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v27, v36 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 9041f64cb17fb..5b42f951b8fa3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17988,6 +17980,14 @@ define <20 x i16> 
@bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: 
$vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: 
v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28508,7 +28496,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD @@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28822,17 +28808,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32829,15 +32809,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 73b57a52201af..8055ea8be5261 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1392,20 +1392,20 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -1421,24 +1421,24 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB15_4 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_3: ; VI-NEXT: s_branch .LBB15_2 @@ -3671,20 +3671,20 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB35_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: @@ -3700,24 +3700,24 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB35_4 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_3: ; VI-NEXT: s_branch .LBB35_2 @@ -5581,24 +5581,25 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB51_2 ; @@ -5611,24 +5612,24 @@ define inreg <2 x i16> 
@bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB51_4 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_3: ; VI-NEXT: s_branch .LBB51_2 @@ -7278,24 +7279,24 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB63_4 ; VI-NEXT: .LBB63_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_3: ; VI-NEXT: s_branch .LBB63_2 @@ -8720,20 +8721,20 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; 
SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB73_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: @@ -8749,24 +8750,24 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_3: ; VI-NEXT: s_branch .LBB73_2 @@ -9336,30 +9337,31 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; 
SI-NEXT: v_mov_b32_e32 v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB77_2 ; @@ -9369,9 +9371,9 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB77_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB77_4 ; VI-NEXT: .LBB77_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 @@ -9392,21 +9394,21 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index d5d2d4aafaa19..08038b90687c0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB1_4 ; VI-NEXT: .LBB1_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB1_3: ; VI-NEXT: s_branch .LBB1_2 @@ -964,16 +964,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB5_4 ; VI-NEXT: .LBB5_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: 
v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB5_3: ; VI-NEXT: s_branch .LBB5_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ee23420c2a662..88d521a0eaa8b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6495,172 +6495,211 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; 
SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6670,11 +6709,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -6682,295 +6721,303 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 
v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; 
VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 
16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6992,10 +7039,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -11440,11 +11487,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11453,6 +11495,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,7 +11531,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11723,7 +11769,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11972,11 +12017,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12016,16 +12061,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 
offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12035,6 +12073,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12044,11 +12089,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12211,7 +12255,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12221,7 +12265,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12428,11 +12471,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12476,16 +12519,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12495,6 +12531,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12504,11 +12547,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12671,7 +12713,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12681,7 +12723,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17323,13 +17364,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17352,9 +17393,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -21386,172 +21427,211 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 
x bfloat> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; 
SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 
0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21561,11 +21641,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -21573,295 +21653,303 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: 
v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, 
vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, 
vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; 
VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -21883,10 +21971,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -26452,11 +26540,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26465,6 +26548,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26496,7 +26584,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26735,7 +26822,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26984,11 +27070,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 
offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27028,16 +27114,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27047,6 +27126,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27056,11 +27142,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD @@ -27223,7 +27308,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27233,7 +27318,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27440,11 +27524,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27488,16 +27572,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27507,6 +27584,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; 
GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27516,11 +27600,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27683,7 +27766,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27693,7 +27776,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31688,13 +31770,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31717,9 +31799,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35785,172 +35867,211 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 
v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; 
SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 
v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; 
SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35960,11 +36081,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s31, v1 @@ -35972,295 +36093,303 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 
v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 
v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: 
s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 
16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, 
vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; 
VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, 
vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -36282,10 +36411,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -40740,11 +40869,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40753,6 +40877,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40784,7 
+40913,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41023,7 +41151,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41272,11 +41399,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41316,16 +41443,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41335,6 +41455,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41344,11 +41471,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41511,7 +41637,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41521,7 +41647,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41728,11 +41853,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41776,16 +41901,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41795,6 +41913,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41804,11 +41929,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41971,7 +42095,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41981,7 +42105,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD @@ -45317,13 +45440,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45346,9 +45469,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -49244,172 +49367,211 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 
v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: 
v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, 
v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49419,11 +49581,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s30, 0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: 
v_readfirstlane_b32 s31, v1 @@ -49431,295 +49593,303 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; 
VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, 
v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: 
v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: 
v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -49741,10 +49911,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s31, v19, 1 -; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -54188,11 +54358,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: 
buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54201,6 +54366,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54232,7 +54402,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54471,7 +54640,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54720,11 +54888,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54764,16 +54932,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54783,6 +54944,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: 
buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54792,11 +54960,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54959,7 +55126,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54969,7 +55136,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55176,11 +55342,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; 
GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55224,16 +55390,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55243,6 +55402,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -55252,11 +55418,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55419,7 +55584,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt 
vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55429,7 +55594,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60580,6 +60744,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60596,8 +60762,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60661,9 +60825,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -61996,189 +62159,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v40, 
1.0, v15 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshr_b64 v[16:17], 
v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 -; SI-NEXT: 
v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 -; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v9, v41 +; SI-NEXT: v_mov_b32_e32 v13, v42 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v21, v44 +; SI-NEXT: v_mov_b32_e32 v25, v45 +; SI-NEXT: v_mov_b32_e32 v29, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62195,48 +62373,55 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v5, v34 -; SI-NEXT: v_mov_b32_e32 v9, v35 -; SI-NEXT: v_mov_b32_e32 v13, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v21, v38 -; SI-NEXT: v_mov_b32_e32 v25, v50 -; SI-NEXT: v_mov_b32_e32 v29, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v11, v49 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: 
$vgpr27 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; @@ -62256,295 +62441,302 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v16 ; VI-NEXT: 
s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: 
v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, s7, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 
v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s5, v16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s5, v16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v11, s5, v16 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: 
v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -64471,44 +64663,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -67768,17 +67960,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67793,25 +68029,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67819,7 +68054,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67833,57 +68067,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67892,7 +68077,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68173,7 +68357,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68198,7 +68381,6 @@ define <32 x i16> 
@bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68222,7 +68404,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68430,8 +68611,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68448,6 +68627,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68458,7 +68639,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70196,13 +70376,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70219,6 +70398,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70242,19 +70422,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70280,7 +70460,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70725,12 +70905,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70747,6 +70921,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70758,11 +70938,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, v36 @@ -70770,7 +70952,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72188,6 +72369,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72204,8 +72387,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72273,9 +72454,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -74902,295 +75082,302 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s5, s30, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s5, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s5, s31, 16
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s5, v1
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_and_b32 s5, s31, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s5, v1
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
+; VI-NEXT: v_add_f32_e32 v4, s4, v16
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, s4, v16
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v16
+; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s26, 16
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v4, 16, 1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s24, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s22, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s20, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: s_lshl_b32 s6, s18, 16
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v16
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT: s_and_b32 s7, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT: v_add_f32_e32 v5, s7, v16
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v0, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5
+; VI-NEXT: s_lshl_b32 s6, s16, 16
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
+; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v3, s6, v16
+; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; VI-NEXT: v_bfe_u32 v0, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s5, s17, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v3, s5, v16
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_bfe_u32 v1, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_and_b32 s5, s19, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; VI-NEXT: v_add_f32_e32 v3, s5, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v1
-; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
-; VI-NEXT: v_add_f32_e32 v16, s4, v1
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_f32_e32 v3, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: s_and_b32 s5, s21, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_add_f32_e32 v5, s5, v16
+; VI-NEXT: v_mov_b32_e32 v1, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc
+; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_bfe_u32 v9, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: s_and_b32 s5, s23, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_add_f32_e32 v7, s5, v16
+; VI-NEXT: v_mov_b32_e32 v3, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_add_f32_e32 v7, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; VI-NEXT: v_bfe_u32 v11, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v7
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: s_and_b32 s5, s25, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_add_f32_e32 v9, s5, v16
+; VI-NEXT: v_mov_b32_e32 v5, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: s_lshl_b32 s4, s25, 16
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: s_and_b32 s5, s27, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_add_f32_e32 v11, s5, v16
+; VI-NEXT: v_mov_b32_e32 v7, v17
+; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: s_lshl_b32 s4, s27, 16
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_add_f32_e32 v11, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
+; VI-NEXT: v_bfe_u32 v15, v11, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: s_and_b32 s5, s29, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v9, v17
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_add_f32_e32 v13, s5, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: s_lshl_b32 s4, s29, 16
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_mov_b32_e32 v11, v17
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc
+; VI-NEXT: v_bfe_u32 v17, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s4, v1
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v13, s4, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT: v_add_f32_e32 v15, s4, v16
+; VI-NEXT: v_bfe_u32 v16, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16]
+; VI-NEXT: v_mov_b32_e32 v13, v17
; VI-NEXT: s_branch .LBB103_5
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -79163,13 +79350,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: s_branch .LBB105_2
; VI-NEXT: .LBB105_4:
-; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v53, s56
; VI-NEXT: v_mov_b32_e32 v52, s42
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s44
+; VI-NEXT: v_mov_b32_e32 v1, s58
; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
@@ -79215,6 +79401,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v42, s76
; VI-NEXT: v_mov_b32_e32 v55, s74
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v54, s57
; VI-NEXT: v_mov_b32_e32 v41, s59
; VI-NEXT: v_mov_b32_e32 v44, s60
@@ -80286,6 +80473,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
@@ -80360,19 +80555,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36
; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37
@@ -80390,7 +80576,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v19, 0xff, v55
; SI-NEXT: v_or_b32_e32 v16, v19, v16
; SI-NEXT: v_cvt_f32_f16_e32 v34, v16
@@ -80403,7 +80589,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v12, 0xff, v18
; SI-NEXT: v_or_b32_e32 v10, v12, v10
; SI-NEXT: v_cvt_f32_f16_e32 v21, v10
-; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v41
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_cvt_f32_f16_e32 v38, v8
@@ -80428,6 +80613,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v0, 0xff, v56
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_cvt_f32_f16_e32 v29, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v6
; SI-NEXT: v_or_b32_e32 v0, v0, v46
; SI-NEXT: v_cvt_f32_f16_e32 v54, v0
@@ -80634,13 +80820,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB106_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_or_b32_e32 v7, v3, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47
; SI-NEXT: v_or_b32_e32 v6, v46, v6
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -80648,12 +80833,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v9, v35, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v39, v7
; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
@@ -80852,13 +81035,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
; SI-NEXT: .LBB106_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -80875,14 +81051,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v8, v33
; SI-NEXT: v_mov_b32_e32 v10, v37
; SI-NEXT: v_mov_b32_e32 v12, v49
; SI-NEXT: v_mov_b32_e32 v14, v53
; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v18, v34
; SI-NEXT: v_mov_b32_e32 v20, v36
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mov_b32_e32 v22, v38
; SI-NEXT: v_mov_b32_e32 v24, v48
; SI-NEXT: v_mov_b32_e32 v26, v50
@@ -84461,22 +84644,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-LABEL: bitcast_v32bf16_to_v64i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32
@@ -84542,6 +84709,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3
@@ -84605,11 +84788,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr58
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr50
@@ -87562,1321 +87743,1481 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: s_mov_b64 exec, s[4:5]
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_writelane_b32 v40, s30, 0
+; SI-NEXT: v_writelane_b32 v40, s31, 1
+; SI-NEXT: v_writelane_b32 v40, s34, 2
+; SI-NEXT: v_writelane_b32 v40, s35, 3
+; SI-NEXT: v_writelane_b32 v40, s36, 4
+; SI-NEXT: v_writelane_b32 v40, s37, 5
+; SI-NEXT: v_writelane_b32 v40, s38, 6
+; SI-NEXT: v_writelane_b32 v40, s39, 7
+; SI-NEXT: v_writelane_b32 v40, s48, 8
+; SI-NEXT: v_writelane_b32 v40, s49, 9
+; SI-NEXT: v_writelane_b32 v40, s50, 10
+; SI-NEXT: v_writelane_b32 v40, s51, 11
+; SI-NEXT: v_writelane_b32 v40, s52, 12
+; SI-NEXT: v_writelane_b32 v40, s53, 13
+; SI-NEXT: v_writelane_b32 v40, s54, 14
+; SI-NEXT: v_writelane_b32 v40, s55, 15
+; SI-NEXT: v_writelane_b32 v40, s64, 16
+; SI-NEXT: v_writelane_b32 v40, s65, 17
+; SI-NEXT: v_writelane_b32 v40, s66, 18
+; SI-NEXT: v_writelane_b32 v40, s67, 19
+; SI-NEXT: v_writelane_b32 v40, s68, 20
+; SI-NEXT: v_writelane_b32 v40, s69, 21
+; SI-NEXT: v_writelane_b32 v40, s70, 22
+; SI-NEXT: v_writelane_b32 v40, s71, 23
+; SI-NEXT: v_writelane_b32 v40, s80, 24
+; SI-NEXT: v_writelane_b32 v40, s81, 25
+; SI-NEXT: v_writelane_b32 v40, s82, 26
+; SI-NEXT: v_writelane_b32 v40, s83, 27
+; SI-NEXT: v_writelane_b32 v40, s84, 28
+; SI-NEXT: v_writelane_b32 v40, s85, 29
+; SI-NEXT: v_writelane_b32 v40, s86, 30
+; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_writelane_b32 v40, s97, 33
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s28
+; SI-NEXT: v_writelane_b32 v40, s99, 35
+; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_alignbit_b32 v23, v1, v3, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
-; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35
-; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57
-; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16
-; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16
-; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16
-; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16
-; SI-NEXT: v_alignbit_b32 v2, v1, v34, 16
-; SI-NEXT: v_readfirstlane_b32 s8, v23
-; SI-NEXT: v_readfirstlane_b32 s9, v24
-; SI-NEXT: v_readfirstlane_b32 s14, v20
-; SI-NEXT: v_readfirstlane_b32 s15, v21
-; SI-NEXT: v_readfirstlane_b32 s20, v17
-; SI-NEXT: v_readfirstlane_b32 s21, v18
-; SI-NEXT: v_readfirstlane_b32 s26, v14
-; SI-NEXT: v_readfirstlane_b32 s27, v15
-; SI-NEXT: v_readfirstlane_b32 s42, v11
-; SI-NEXT: v_readfirstlane_b32 s43, v12
-; SI-NEXT: v_readfirstlane_b32 s56, v8
-; SI-NEXT: v_readfirstlane_b32 s57, v9
-; SI-NEXT: v_readfirstlane_b32 s62, v4
-; SI-NEXT: v_readfirstlane_b32 s63, v5
-; SI-NEXT: v_readfirstlane_b32 s76, v3
-; SI-NEXT: v_readfirstlane_b32 s77, v2
-; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35
-; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT: v_readfirstlane_b32 s4, v19
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v3
+; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v1
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s72, v2
+; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT: s_mov_b32 s75, s76
+; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_writelane_b32 v41, s4, 0
+; SI-NEXT: v_writelane_b32 v41, s5, 1
+; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v7
+; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v11
+; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v8
+; SI-NEXT: s_lshr_b32 s45, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v13
+; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v20
+; SI-NEXT: s_lshr_b32 s25, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v24
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v25
+; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v22
+; SI-NEXT: s_lshr_b32 s41, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v28
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v29
+; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v26
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v30
+; SI-NEXT: s_lshr_b32 s11, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v18
+; SI-NEXT: v_readfirstlane_b32 s58, v5
+; SI-NEXT: v_readfirstlane_b32 s44, v9
+; SI-NEXT: v_readfirstlane_b32 s24, v21
+; SI-NEXT: v_readfirstlane_b32 s40, v23
+; SI-NEXT: v_readfirstlane_b32 s18, v27
+; SI-NEXT: v_readfirstlane_b32 s10, v31
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT: s_mov_b32 s61, s62
+; SI-NEXT: s_mov_b32 s47, s56
+; SI-NEXT: s_mov_b32 s27, s42
+; SI-NEXT: s_mov_b32 s17, s22
+; SI-NEXT: s_mov_b32 s21, s28
+; SI-NEXT: s_mov_b32 s13, s14
+; SI-NEXT: s_mov_b32 s7, s8
+; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT: s_lshr_b32 s24, s76, 8
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT: s_lshr_b32 s23, s62, 8
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20
+; SI-NEXT: s_lshr_b32 s18, s56, 8
+; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22
+; SI-NEXT: s_lshr_b32 s17, s42, 8
+; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26
+; SI-NEXT: s_lshr_b32 s15, s22, 8
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30
+; SI-NEXT: s_lshr_b32 s10, s28, 8
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18
+; SI-NEXT: s_lshr_b32 s9, s14, 8
+; SI-NEXT: s_lshr_b32 s4, s8, 8
+; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v14
+; SI-NEXT:
v_and_b32_e32 v14, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s25, s4, 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s44, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s45, s4, 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s58, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 -; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_lshr_b64 s[60:61], s[58:59], 16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_readfirstlane_b32 s58, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: v_readfirstlane_b32 s76, v3 -; SI-NEXT: v_readfirstlane_b32 s77, v2 -; SI-NEXT: v_readfirstlane_b32 s62, v4 -; SI-NEXT: v_readfirstlane_b32 s63, v5 -; SI-NEXT: v_readfirstlane_b32 s56, v8 -; SI-NEXT: v_readfirstlane_b32 s57, v9 -; SI-NEXT: v_readfirstlane_b32 s42, v11 -; SI-NEXT: v_readfirstlane_b32 s43, v12 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 -; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s20, v17 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: v_readfirstlane_b32 s27, v15 -; SI-NEXT: v_readfirstlane_b32 s21, v18 -; SI-NEXT: v_readfirstlane_b32 s14, v20 -; SI-NEXT: v_readfirstlane_b32 s8, v23 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v21 -; SI-NEXT: v_readfirstlane_b32 s9, v24 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 
s[16:17], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s44, v9 +; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24 +; SI-NEXT: s_mov_b32 s7, s8 +; SI-NEXT: s_mov_b32 s13, s14 +; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s17, s22 +; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_mov_b32 s61, s62 +; SI-NEXT: v_writelane_b32 v41, s78, 0 +; SI-NEXT: v_writelane_b32 v41, s79, 1 +; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: s_lshr_b32 s18, s56, 8 +; SI-NEXT: s_lshr_b32 s17, s42, 8 +; SI-NEXT: s_lshr_b32 s15, s22, 8 +; SI-NEXT: s_lshr_b32 s10, s28, 8 +; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 +; SI-NEXT: s_lshr_b64 
s[80:81], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v23, s5, v23 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s4, s4, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_lshl_b32 s4, s16, 8 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30 -; SI-NEXT: v_or_b32_e32 v20, s4, v20 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s8, 24 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: s_and_b32 s7, s74, 0xff +; SI-NEXT: s_lshl_b32 s13, s92, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s88, 0xff +; SI-NEXT: v_readlane_b32 s74, v41, 0 +; SI-NEXT: s_lshl_b32 s21, s74, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s13, s24, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s73, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 +; SI-NEXT: v_or_b32_e32 v2, s13, v2 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s7, v2 +; SI-NEXT: s_and_b32 s7, s60, 0xff +; SI-NEXT: s_lshl_b32 s13, s30, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s94, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s90, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_lshl_b32 s4, s22, 8 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_or_b32_e32 v17, s4, v17 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s14, 24 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: 
s_and_b32 s7, s62, 0xff +; SI-NEXT: s_lshl_b32 s13, s23, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s46, 0xff +; SI-NEXT: s_lshl_b32 s13, s38, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s21, s34, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s21, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s56, 0xff +; SI-NEXT: s_lshl_b32 s13, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_lshl_b32 s4, s28, 8 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s20, 24 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, s52, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s50, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s18, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_lshl_b32 s13, s17, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_lshl_b32 s4, s44, 8 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s26, 24 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 
28, v0 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s13, s66, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s64, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s54, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s16, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s13, s15, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_lshl_b32 s4, s58, 8 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s42, 24 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s41, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s13, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s13, s78, 8 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_and_b32 s13, s70, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s15, s68, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_lshl_b32 s4, s72, 8 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: 
s_lshl_b32 s5, s56, 24 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s19, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s10, s98, 8 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s10, s12, s10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_lshl_b32 s4, s76, 8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s62, 24 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s11, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s7, s84, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s82, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s80, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s75, v41, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; 
implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s4, 0 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; 
implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: v_writelane_b32 v63, s36, 4 -; VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s68, 20 +; VI-NEXT: v_writelane_b32 v4, s69, 21 +; VI-NEXT: v_writelane_b32 v4, s70, 22 +; VI-NEXT: v_writelane_b32 v4, s71, 23 +; VI-NEXT: v_writelane_b32 v4, s80, 24 +; VI-NEXT: v_writelane_b32 v4, s81, 25 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s82, 26 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; 
VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: v_writelane_b32 v4, s83, 27 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s56, s5, 24 -; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 -; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s7, s5, 24 +; VI-NEXT: s_lshr_b32 s9, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s29, 24 +; VI-NEXT: s_lshr_b32 s47, s29, 16 +; VI-NEXT: s_lshr_b32 s57, s29, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: 
s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s27, 24 +; VI-NEXT: s_lshr_b32 s91, s27, 16 +; VI-NEXT: s_lshr_b32 s30, s27, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s25, 24 +; VI-NEXT: s_lshr_b32 s36, s25, 16 +; VI-NEXT: s_lshr_b32 s37, s25, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s23, 24 +; VI-NEXT: s_lshr_b32 s49, s23, 16 +; VI-NEXT: s_lshr_b32 s50, s23, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s55, s21, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s19, 24 +; VI-NEXT: s_lshr_b32 s67, s19, 16 +; VI-NEXT: s_lshr_b32 s68, s19, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s17, 24 +; VI-NEXT: s_lshr_b32 s80, s17, 16 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s14, s25 +; VI-NEXT: s_mov_b32 s40, s27 +; VI-NEXT: s_mov_b32 s46, s29 +; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; 
VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; 
VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, 
s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], 
s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: 
v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s43, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v15, s4, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 
16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: s_branch .LBB109_5 -; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: s_lshr_b32 s43, s4, 16 +; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s14 +; VI-NEXT: s_mov_b32 s27, s40 +; VI-NEXT: s_mov_b32 s29, s46 +; VI-NEXT: s_mov_b32 s5, s56 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b32 s7, s56, 24 +; VI-NEXT: s_lshr_b32 s9, s56, 16 +; VI-NEXT: s_lshr_b32 s11, s56, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s46, 24 +; VI-NEXT: s_lshr_b32 s47, s46, 16 +; VI-NEXT: s_lshr_b32 s57, s46, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s40, 24 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s40, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s14, 24 +; VI-NEXT: s_lshr_b32 s36, s14, 16 +; VI-NEXT: s_lshr_b32 s37, s14, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s12, 24 +; VI-NEXT: s_lshr_b32 s49, s12, 16 +; VI-NEXT: s_lshr_b32 s50, s12, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s10, 24 +; VI-NEXT: s_lshr_b32 s54, s10, 16 +; VI-NEXT: s_lshr_b32 s55, s10, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s8, 24 +; VI-NEXT: s_lshr_b32 s67, s8, 16 +; VI-NEXT: s_lshr_b32 s68, s8, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s6, 24 +; VI-NEXT: s_lshr_b32 s80, s6, 16 +; VI-NEXT: s_lshr_b32 s81, s6, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s16, s83, 8 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_lshl_b32 s16, s76, 8 +; VI-NEXT: s_and_b32 s17, s82, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 
s6, s81, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s80, 0xff +; VI-NEXT: s_lshl_b32 s16, s71, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s70, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s69, 0xff +; VI-NEXT: s_lshl_b32 s16, s74, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s68, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s67, 0xff +; VI-NEXT: s_lshl_b32 s8, s66, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s64, 0xff +; VI-NEXT: s_lshl_b32 s8, s72, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s55, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s54, 0xff +; VI-NEXT: s_lshl_b32 s8, s53, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s52, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s51, 0xff +; VI-NEXT: s_lshl_b32 s8, s62, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s50, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s49, 0xff +; VI-NEXT: s_lshl_b32 s8, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s39, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s38, 0xff +; VI-NEXT: s_lshl_b32 s8, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s6, s37, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s36, 0xff +; VI-NEXT: 
s_lshl_b32 s8, s35, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s8, s58, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s90, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s8, s44, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s57, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s56, 0xff +; VI-NEXT: s_lshl_b32 s5, s11, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s9, 0xff +; VI-NEXT: s_lshl_b32 s6, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s83, v4, 27 +; VI-NEXT: v_readlane_b32 s82, v4, 26 +; VI-NEXT: v_readlane_b32 s81, v4, 25 +; VI-NEXT: v_readlane_b32 s80, v4, 24 +; VI-NEXT: v_readlane_b32 s71, v4, 23 +; VI-NEXT: v_readlane_b32 s70, v4, 22 +; VI-NEXT: v_readlane_b32 s69, v4, 21 +; VI-NEXT: v_readlane_b32 s68, v4, 20 +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; 
VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 -; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr61 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr11 
+; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB109_2 -; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s42 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 -; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 -; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v18, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 -; VI-NEXT: v_mov_b32_e32 v57, s53 -; VI-NEXT: v_mov_b32_e32 v47, s51 -; VI-NEXT: v_mov_b32_e32 v56, s50 -; VI-NEXT: v_mov_b32_e32 v46, s49 -; VI-NEXT: v_mov_b32_e32 v45, s39 -; VI-NEXT: v_mov_b32_e32 v44, s48 -; VI-NEXT: v_mov_b32_e32 v42, s38 -; VI-NEXT: v_mov_b32_e32 v43, s37 -; VI-NEXT: v_mov_b32_e32 v41, s36 -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_mov_b32_e32 v55, s35 -; VI-NEXT: v_mov_b32_e32 v53, s31 -; VI-NEXT: v_mov_b32_e32 v54, s30 -; VI-NEXT: v_mov_b32_e32 v52, s91 -; VI-NEXT: v_mov_b32_e32 v51, s89 -; VI-NEXT: v_mov_b32_e32 v50, s90 -; VI-NEXT: v_mov_b32_e32 v48, s88 -; VI-NEXT: v_mov_b32_e32 v49, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v38, s76 -; VI-NEXT: v_mov_b32_e32 v37, s77 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v34, s73 -; VI-NEXT: v_mov_b32_e32 v33, s63 -; VI-NEXT: v_mov_b32_e32 v32, s72 -; VI-NEXT: v_mov_b32_e32 v30, s62 -; VI-NEXT: v_mov_b32_e32 v31, s61 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_mov_b32_e32 v28, s58 -; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v25, s57 -; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 -; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; GFX9: ; %bb.0: @@ -90429,6 +90770,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90458,28 +90801,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 
24, v24 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90496,8 +90841,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90513,16 +90856,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90803,7 +91138,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90829,7 +91163,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: 
v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90841,7 +91174,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90854,7 +91186,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90868,7 +91199,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ -91086,11 +91416,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91109,6 +91436,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5d4df4bde1af8..7bd2c7a628ebd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; 
%bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 44cfd6c28ca6a..8964ebd9cbd70 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -4914,7 +4911,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -4947,7 +4944,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -4980,7 +4977,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -5073,7 +5070,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -5106,7 +5103,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -5139,7 +5136,7 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -8520,7 +8517,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -8553,7 +8550,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -8586,7 +8583,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -8679,7 +8676,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -8712,7 +8709,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -8745,7 +8742,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -11740,6 +11737,17 @@ 
define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13113,7 +13107,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -13146,7 +13140,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -13179,7 +13173,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -13272,7 +13266,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -13305,7 +13299,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -13338,7 +13332,7 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -16833,7 +16827,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -16866,7 +16860,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -16899,7 +16893,7 @@ define inreg 
<20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -16992,7 +16986,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -17025,7 +17019,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -17058,7 +17052,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -19249,6 +19243,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19270,17 +19275,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill 
-; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19302,13 +19296,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -20622,7 +20613,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -20655,7 +20646,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -20688,7 +20679,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -20781,7 +20772,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -20814,7 
+20805,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -20847,7 +20838,7 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -24238,7 +24229,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -24271,7 +24262,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -24304,7 +24295,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -24397,7 +24388,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -24430,7 +24421,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -24463,7 +24454,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -25988,6 +25979,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26009,17 +26011,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26041,13 +26032,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27361,7 +27349,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -27394,7 +27382,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -27427,7 +27415,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -27520,7 +27508,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -27553,7 +27541,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -27586,7 +27574,7 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -31014,7 +31002,7 @@ define inreg <10 x double> 
@bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288 @@ -31047,7 +31035,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160 @@ -31080,7 +31068,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32 @@ -31173,7 +31161,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8 @@ -31206,7 +31194,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136 @@ -31239,7 +31227,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xa +; GFX11-TRUE16-NEXT: s_clause 0xa ; 44-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264 @@ -31389,6 +31377,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31405,17 +31404,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31472,7 +31460,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31523,7 +31511,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31623,7 +31610,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31643,7 +31629,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; 
SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 14e17ce49cca0..3d9c7681b3132 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2272,30 +2272,30 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2311,42 +2311,43 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 
v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -5460,30 +5461,30 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -5499,42 +5500,43 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: 
v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -8361,30 +8363,30 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: 
v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -8400,42 +8402,43 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -10937,30 +10940,30 @@ define inreg <2 x float> 
@bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -10976,42 +10979,43 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v4 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; 
VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v4 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -13150,39 +13154,40 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: @@ -13194,42 +13199,43 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; 
%cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -15062,42 +15068,43 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 
0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -16737,52 +16744,52 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16 -; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v9 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v10 +; SI-NEXT: v_mov_b32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB109_2 ; @@ -16793,11 +16800,11 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s8, s17, 24 -; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_lshr_b32 s10, s17, 8 -; VI-NEXT: s_lshr_b32 s9, s16, 16 -; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s8, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 @@ -16810,58 +16817,59 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[5:6] 
+; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16 -; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v10, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 ; VI-NEXT: v_mov_b32_e32 v4, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s9 -; VI-NEXT: v_mov_b32_e32 v5, s10 -; VI-NEXT: v_mov_b32_e32 v7, s8 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 87d5157b3c340..ed407c1e20c14 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: 
v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -5329,7 +5328,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -5362,7 +5361,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -5395,7 +5394,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -5496,7 +5495,7 @@ 
define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -5529,7 +5528,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -5562,7 +5561,7 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -9311,7 +9310,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -9344,7 +9343,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -9377,7 +9376,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -9478,7 +9477,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: 
v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -9511,7 +9510,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -9544,7 +9543,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -14292,7 +14290,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -14325,7 +14323,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -14358,7 +14356,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -14459,7 +14457,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -14492,7 +14490,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -14525,7 +14523,7 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 
0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -18407,7 +18405,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -18440,7 +18438,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -18473,7 +18471,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -18574,7 +18572,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -18607,7 +18605,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -18640,7 +18638,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -21004,6 +21002,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21026,17 +21035,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -21054,9 +21052,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -22541,7 +22538,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -22574,7 +22571,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -22607,7 +22604,7 
@@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -22708,7 +22705,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -22741,7 +22738,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -22774,7 +22771,7 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -26535,7 +26532,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -26568,7 +26565,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -26601,7 +26598,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -26702,7 +26699,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -26735,7 +26732,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -26768,7 +26765,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -28420,6 +28417,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28442,17 +28450,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28470,9 +28467,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -29957,7 +29953,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -29990,7 +29986,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -30023,7 +30019,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -30124,7 +30120,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -30157,7 +30153,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: 
s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -30190,7 +30186,7 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 @@ -33996,7 +33992,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296 @@ -34029,7 +34025,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168 @@ -34062,7 +34058,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40 @@ -34163,7 +34159,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8 @@ -34196,7 +34192,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, 
off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136 @@ -34229,7 +34225,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xc +; GFX11-TRUE16-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index fb2e94fc3b87a..9ec3f5c00ee23 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 
16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -5806,7 +5799,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -5839,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -5872,7 +5865,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -5979,7 +5972,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -6012,7 +6005,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 
offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -6045,7 +6038,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -10214,7 +10207,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -10247,7 +10240,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -10280,7 +10273,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -10387,7 +10380,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -10420,7 +10413,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -10453,7 +10446,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 
v60, off, s32 offset:264 @@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -15643,7 +15629,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16: ; 
%bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -15676,7 +15662,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -15709,7 +15695,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -15816,7 +15802,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -15849,7 +15835,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -15882,7 +15868,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -18157,6 +18143,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18173,8 +18161,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18201,34 +18187,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -20192,7 +20178,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -20225,7 +20211,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 
offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -20258,7 +20244,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -20365,7 +20351,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -20398,7 +20384,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -20431,7 +20417,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -22982,6 +22968,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23006,22 +23008,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23037,21 +23023,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -24743,7 +24722,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -24776,7 +24755,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v106, s32 offset:176 @@ -24809,7 +24788,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -24916,7 +24895,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -24949,7 +24928,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -24982,7 +24961,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -27128,6 +27107,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27144,8 +27125,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27172,34 +27151,34 @@ define <12 x i64> 
@bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -29163,7 +29142,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -29196,7 +29175,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -29229,7 +29208,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -29336,7 +29315,7 @@ define inreg <12 x i64> 
@bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -29369,7 +29348,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -29402,7 +29381,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -31199,6 +31178,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31223,22 +31218,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31254,21 +31233,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -32960,7 +32932,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -32993,7 +32965,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -33026,7 +32998,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -33133,7 +33105,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -33166,7 +33138,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -33199,7 +33171,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -35392,6 +35364,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35408,8 +35382,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35436,34 +35408,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -37427,7 +37399,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304 @@ -37460,7 +37432,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176 @@ -37493,7 +37465,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48 @@ -37600,7 +37572,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8 @@ -37633,7 +37605,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v126, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136 @@ -37666,7 +37638,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xe +; GFX11-TRUE16-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264 @@ -41255,6 +41227,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41271,11 +41248,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41320,16 +41292,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 07cdbef82d892..c7a199328012d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, 
v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5266,6 +5261,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded 
Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -6287,7 +6282,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -6320,7 +6315,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -6353,7 +6348,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -6465,7 +6460,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -6498,7 +6493,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -6531,7 +6526,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, 
v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10463,7 +10453,6 @@ define inreg <26 x i32> 
@bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -11113,7 +11102,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -11146,7 +11135,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -11179,7 +11168,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -11291,7 +11280,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -11324,7 +11313,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -11357,7 +11346,7 @@ define inreg <26 x i32> 
@bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15164,9 +15148,10 @@ define <26 x float> 
@bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17022,7 +17006,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -17055,7 +17039,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -17088,7 +17072,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -17200,7 +17184,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 
v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -17233,7 +17217,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -17266,7 +17250,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -22006,7 +21984,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -22039,7 +22017,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -22072,7 +22050,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -22184,7 +22162,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: 
s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -22217,7 +22195,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -22250,7 +22228,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25076,17 +25054,12 @@ define <13 
x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz 
.LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25948,6 +25921,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -26969,7 +26942,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -27002,7 +26975,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ 
-27035,7 +27008,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -27147,7 +27120,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -27180,7 +27153,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -27213,7 +27186,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc @@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -31810,7 +31777,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -31843,7 +31810,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -31876,7 +31843,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -31988,7 +31955,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -32021,7 +31988,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -32054,7 +32021,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ 
-34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -35999,7 +35961,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -36032,7 +35994,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -36065,7 +36027,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -36177,7 +36139,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -36210,7 +36172,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -36243,7 +36205,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -40074,14 +40036,28 @@ define 
inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -40892,7 +40848,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -40925,7 +40881,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -40958,7 +40914,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -41070,7 +41026,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -41103,7 +41059,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -41136,7 +41092,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 @@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 8eb71e90f8504..77df03dcdcd9f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -6780,7 +6779,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -6813,7 +6812,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -6846,7 +6845,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -6960,7 +6959,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, 
v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -6993,7 +6992,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -7026,7 +7025,7 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -12044,7 +12041,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -12077,7 +12074,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -12110,7 +12107,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -12224,7 +12221,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -12257,7 +12254,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -12290,7 +12287,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -16290,6 
+16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -18405,7 +18401,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -18438,7 +18434,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -18471,7 +18467,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 
offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -18585,7 +18581,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -18618,7 +18614,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -18651,7 +18647,7 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22564,6 +22560,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: 
; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, 
v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -23827,7 +23821,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -23860,7 +23854,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -23893,7 +23887,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -24007,7 +24001,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -24040,7 +24034,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -24073,7 +24067,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -29195,7 +29188,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -29228,7 +29221,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 
offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -29261,7 +29254,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -29375,7 +29368,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -29408,7 +29401,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -29441,7 +29434,7 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; 
GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 
v39, v11 @@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -34473,7 +34464,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -34506,7 +34497,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -34539,7 +34530,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -34653,7 +34644,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; 
GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -34686,7 +34677,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -34719,7 +34710,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36927,11 +36923,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -39013,7 +39003,7 @@ define inreg 
<14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -39046,7 +39036,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -39079,7 +39069,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -39193,7 +39183,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -39226,7 +39216,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -39259,7 +39249,7 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -44344,7 +44332,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -44377,7 +44365,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -44410,7 +44398,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte 
Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -44524,7 +44512,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -44557,7 +44545,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -44590,7 +44578,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 93c11f13ce3ce..c9e5771240078 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ 
-6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7241,7 +7235,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -7274,7 +7268,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -7307,7 +7301,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -7424,7 +7418,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -7457,7 +7451,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -7490,7 +7484,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 
v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 
%b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -12993,7 +12982,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -13026,7 +13015,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -13059,7 +13048,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -13176,7 +13165,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -13209,7 +13198,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte 
Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -13242,7 +13231,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -19779,7 +19762,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -19812,7 +19795,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -19845,7 +19828,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -19962,7 +19945,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -19995,7 +19978,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -20028,7 +20011,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, 
off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, 
v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23185,6 +23167,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24687,12 +24670,35 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -24702,7 +24708,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24711,7 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24720,7 +24726,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -24731,38 +24737,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -25692,7 +25670,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -25725,7 +25703,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x 
half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -25758,7 +25736,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -25875,7 +25853,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -25908,7 +25886,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -25941,7 +25919,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29304,27 +29295,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz 
.LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -31449,7 +31421,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -31482,7 +31454,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 
@@ -31515,7 +31487,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -31632,7 +31604,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -31665,7 +31637,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -31698,7 +31670,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; 
SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; 
GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 
offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 
@@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37218,7 +37185,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -37251,7 +37218,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -37284,7 +37251,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -37401,7 +37368,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -37434,7 +37401,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -37467,7 +37434,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; 
GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -42097,7 +42058,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -42130,7 +42091,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -42163,7 +42124,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -42280,7 +42241,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -42313,7 +42274,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -42346,7 +42307,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46905,12 +46866,35 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -46920,7 +46904,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46929,7 +46913,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46938,7 +46922,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -46949,38 +46933,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -47910,7 +47866,7 @@ define inreg <15 x double> 
@bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308 @@ -47943,7 +47899,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180 @@ -47976,7 +47932,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52 @@ -48093,7 +48049,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 @@ -48126,7 +48082,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136 @@ -48159,7 +48115,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264 @@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; 
SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: 
v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 6ada0cb8c46f1..ab629e1a4d269 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -2214,40 +2214,40 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: @@ -2263,60 +2263,61 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB11_4 ; VI-NEXT: .LBB11_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 
v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB11_3: ; VI-NEXT: s_branch .LBB11_2 @@ -5430,40 +5431,40 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, 
v2, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: @@ -5479,60 +5480,61 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB27_4 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v5 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v5 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; VI-NEXT: v_add_f32_e32 v1, 
s4, v5 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_3: ; VI-NEXT: s_branch .LBB27_2 @@ -8098,70 +8100,71 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16 -; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 -; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 -; SI-NEXT: v_lshr_b64 
v[1:2], v[12:13], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; SI-NEXT: .LBB39_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v12 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v8, v13 +; SI-NEXT: v_mov_b32_e32 v9, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_branch .LBB39_2 ; @@ -8171,110 +8174,110 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s19, 0 ; VI-NEXT: s_cbranch_scc0 .LBB39_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s19, s16, 8 -; VI-NEXT: s_lshr_b32 s10, s18, 16 -; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s19, s18, 16 +; VI-NEXT: s_lshr_b32 s15, s18, 8 ; VI-NEXT: s_lshr_b32 s12, s17, 24 -; VI-NEXT: s_lshr_b32 s13, s17, 16 -; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 ; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB39_4 ; VI-NEXT: .LBB39_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, 
v[0:1] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB39_5 ; VI-NEXT: .LBB39_3: -; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr13 ; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr19 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: s_branch .LBB39_2 ; VI-NEXT: .LBB39_4: -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_mov_b32_e32 v15, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v13, s15 ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v7, s12 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v14, s4 ; VI-NEXT: .LBB39_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v14 -; VI-NEXT: v_mov_b32_e32 v4, v15 +; VI-NEXT: v_mov_b32_e32 v3, v14 ; VI-NEXT: v_mov_b32_e32 v9, v13 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11854,33 +11857,32 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -11889,25 +11891,27 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB49_3: ; VI-NEXT: s_branch .LBB49_2 @@ -12813,50 +12817,52 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 
0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[7:8], v[1:2], 16 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB53_2 ; @@ -12869,33 +12875,32 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB53_4 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -12904,25 +12909,27 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB53_3: ; VI-NEXT: s_branch .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) @llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. 
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c34cc55e..df77e7de43bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -40,34 +40,33 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32: @@ -138,31 +137,30 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: 
v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32: @@ -242,40 +240,39 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s8, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s4, 0, s8 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 s1, s2, s3 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_abs_i32 s6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 31 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_abs_i32 s0, s2 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s2, s0, s8 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s6 +; GFX6-NEXT: s_sub_i32 s5, s7, s5 +; GFX6-NEXT: s_sub_i32 s7, s5, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s2, s0 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: @@ -360,36 +357,35 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_abs_i32 s5, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_abs_i32 s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s3 -; GFX6-NEXT: s_sub_i32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s5 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s5, s7, s6 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32: @@ -5462,15 +5458,14 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_add_i32 s0, s3, 12 -; GFX6-NEXT: s_lshr_b32 s0, s2, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_add_i32 s5, s5, 12 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: @@ -5503,16 +5498,15 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_lshr_b32 s1, s3, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_lshr_b32 s5, s5, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: @@ -5546,19 +5540,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; 
GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: @@ -5855,16 +5848,15 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 -; GFX6-NEXT: s_add_i32 s0, s0, -1 -; GFX6-NEXT: s_and_b32 s0, s2, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 +; GFX6-NEXT: s_add_i32 s5, s5, -1 +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: @@ -5898,16 +5890,15 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_and_b32 s0, s2, 0xfff -; GFX6-NEXT: s_and_b32 s1, s3, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s5, 0xfff +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: @@ -6187,41 +6178,40 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_abs_i32 s8, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s4, 0, s8 -; GFX6-NEXT: s_abs_i32 s9, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 +; GFX6-NEXT: s_abs_i32 s6, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s7, s4 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; 
GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s9, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_mul_i32 s8, s8, s6 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 +; GFX6-NEXT: s_sub_i32 s8, s7, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s7, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_xor_b32 s0, s2, s3 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: @@ -6279,22 +6269,21 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 -; GFX6-NEXT: s_ashr_i32 s1, s1, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s5, s5, s7 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: s_ashr_i32 s5, s5, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: @@ -6334,22 +6323,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; 
GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: @@ -6700,37 +6688,36 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s4, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s2 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s5 +; GFX6-NEXT: s_abs_i32 s5, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: s_abs_i32 s6, s4 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s0, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s3 -; GFX6-NEXT: s_cmp_ge_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s1, s2, 31 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s5 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s5 +; GFX6-NEXT: s_cmp_ge_u32 s6, s5 +; GFX6-NEXT: s_cselect_b32 s5, s7, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: @@ -6785,24 +6772,23 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 -; 
GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s3, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_add_i32 s6, s4, s6 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s6, s5, s7 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: @@ -7772,7 +7758,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31 @@ -7782,8 +7767,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_sub_u32 s12, 0, s10 -; GFX6-NEXT: s_subb_u32 s13, 0, s11 +; GFX6-NEXT: s_sub_u32 s0, 0, s10 +; GFX6-NEXT: s_subb_u32 s1, 0, s11 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -7792,128 +7777,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s0 -; GFX6-NEXT: s_mul_i32 s16, s12, s0 -; GFX6-NEXT: s_add_i32 s1, s17, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s1, s1, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s17, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s15, s15, s17 -; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s18, v4 -; GFX6-NEXT: s_add_u32 s15, s15, s16 -; GFX6-NEXT: s_addc_u32 s15, s17, s18 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_add_u32 s1, s15, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s0, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 -; GFX6-NEXT: s_add_i32 s0, s0, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: 
v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s0 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s1, s15, s12 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s0, s14, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s15, s16, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s12 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s13, s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s14, s1, s2 +; GFX6-NEXT: s_mul_i32 s15, s0, s2 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15 +; GFX6-NEXT: s_add_i32 s13, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_mul_i32 s16, s2, s13 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s14, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s15, s12, s15 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s17, v4 +; GFX6-NEXT: s_add_u32 s14, s14, s15 +; GFX6-NEXT: s_addc_u32 s14, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: s_addc_u32 s15, s15, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s13, s14, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s15 +; GFX6-NEXT: s_add_u32 s13, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s14 +; GFX6-NEXT: s_mul_i32 s14, s0, s12 +; GFX6-NEXT: s_mul_i32 s1, s1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_add_i32 s1, s14, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s0, s12, s0 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s0, s15, s0 +; GFX6-NEXT: s_addc_u32 s0, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s14 +; GFX6-NEXT: s_add_u32 s14, s13, s0 +; GFX6-NEXT: s_addc_u32 s15, s12, s1 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s7, s12 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, 
v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s14 +; GFX6-NEXT: s_mul_i32 s1, s6, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s16, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s15, s7, s15 +; GFX6-NEXT: s_mul_i32 s14, s7, s14 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s15 +; GFX6-NEXT: s_add_u32 s1, s1, s14 ; GFX6-NEXT: s_addc_u32 s1, s4, s16 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_mul_i32 s14, s7, s15 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s18, s6, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s11 ; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_cmp_ge_u32 s18, s10 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s11 +; GFX6-NEXT: s_cselect_b32 s17, s18, s19 +; GFX6-NEXT: s_add_u32 s18, s14, 1 +; GFX6-NEXT: s_addc_u32 s19, s15, 0 +; GFX6-NEXT: s_add_u32 s20, s14, 2 +; GFX6-NEXT: s_addc_u32 s21, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, s20, s18 +; GFX6-NEXT: s_cselect_b32 s18, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_subb_u32 s4, s7, s16 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 @@ -7921,13 +7899,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s4, s6, s5 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s5, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s17, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8278,8 +8257,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s14, 0, s6 -; GFX6-NEXT: s_subb_u32 s15, 0, s7 +; GFX6-NEXT: s_sub_u32 s12, 0, s6 +; GFX6-NEXT: s_subb_u32 s13, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8288,69 +8267,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s13, s14, s16 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_mul_i32 s17, s15, s12 -; GFX6-NEXT: s_mul_i32 s18, s14, s12 -; GFX6-NEXT: s_add_i32 s13, s19, s13 +; GFX6-NEXT: s_mul_i32 s17, s13, s15 +; GFX6-NEXT: s_mul_i32 s18, s12, s15 +; GFX6-NEXT: s_add_i32 s16, s19, s16 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18 -; GFX6-NEXT: s_add_i32 s13, s13, s17 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: s_add_i32 s16, s16, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v3 -; GFX6-NEXT: s_mul_i32 s20, s12, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_mul_i32 s20, s15, s16 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16 ; GFX6-NEXT: s_add_u32 s17, s17, s20 ; GFX6-NEXT: v_readfirstlane_b32 s20, v0 -; GFX6-NEXT: s_mul_i32 s18, s16, s18 +; GFX6-NEXT: s_mul_i32 s18, s14, s18 ; GFX6-NEXT: s_addc_u32 s20, 0, s20 ; GFX6-NEXT: v_readfirstlane_b32 s19, v4 ; GFX6-NEXT: s_add_u32 s17, s17, s18 ; GFX6-NEXT: s_addc_u32 s17, s20, s19 ; GFX6-NEXT: v_readfirstlane_b32 s18, v1 ; GFX6-NEXT: s_addc_u32 s18, s18, 0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_add_u32 s16, s17, s16 ; GFX6-NEXT: s_addc_u32 s17, 0, s18 -; GFX6-NEXT: s_add_u32 s18, s12, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s16, s16, s17 -; GFX6-NEXT: s_mul_i32 s12, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s15, s15, s18 -; GFX6-NEXT: s_mul_i32 s13, s14, s18 -; GFX6-NEXT: s_add_i32 s12, s12, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0 -; GFX6-NEXT: s_mul_i32 s15, s18, s12 -; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_add_u32 s15, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_addc_u32 s14, s14, s17 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 +; GFX6-NEXT: s_mul_i32 s13, s13, s15 ; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_addc_u32 s17, 
0, s17 -; GFX6-NEXT: v_readfirstlane_b32 s14, v3 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: s_addc_u32 s13, s17, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s12, s16, s12 -; GFX6-NEXT: s_add_u32 s12, s13, s12 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_add_u32 s15, s18, s12 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s14, s16, s14 +; GFX6-NEXT: s_add_i32 s16, s17, s16 +; GFX6-NEXT: s_mul_i32 s12, s12, s15 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX6-NEXT: s_mul_i32 s17, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_add_u32 s17, s19, s17 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_mul_i32 s12, s14, s12 +; GFX6-NEXT: s_addc_u32 s18, 0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: s_addc_u32 s12, s18, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s14, s13 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s15, s15, s12 +; GFX6-NEXT: s_addc_u32 s14, s14, s13 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 @@ -8374,40 +8349,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s20, s8, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s6 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s19, 
s20, s21 +; GFX6-NEXT: s_add_u32 s20, s17, 1 +; GFX6-NEXT: s_addc_u32 s21, s16, 0 +; GFX6-NEXT: s_add_u32 s22, s17, 2 +; GFX6-NEXT: s_addc_u32 s23, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 @@ -8415,12 +8387,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 s7, s20, s16 +; GFX6-NEXT: s_cselect_b32 s6, s19, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8428,8 +8400,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s12, 0, s8 -; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: s_sub_u32 s2, 0, s8 +; GFX6-NEXT: s_subb_u32 s3, 0, s9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8438,128 +8410,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s13, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s13, s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: s_add_i32 s13, s13, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s16, s0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s18, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s16, s18, s16 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 -; GFX6-NEXT: s_addc_u32 s4, s5, s18 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: 
s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 -; GFX6-NEXT: s_mul_i32 s2, s12, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s13, s13, s5 -; GFX6-NEXT: s_mul_i32 s3, s12, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s16, s1 +; GFX6-NEXT: s_addc_u32 s1, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s12, s13 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s5, s12, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s16 +; GFX6-NEXT: s_mul_i32 s2, s2, s16 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX6-NEXT: s_mul_i32 s12, s16, s3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s12, s4, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s12, s2 +; GFX6-NEXT: s_addc_u32 s2, s13, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s16, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s2, s10, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; 
GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 -; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s13, s11, s13 +; GFX6-NEXT: s_add_u32 s16, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s12 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s20, s10, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s9 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s8 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s9 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s16, 1 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_add_u32 s22, s16, 2 +; GFX6-NEXT: s_addc_u32 s23, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 @@ -8567,15 +8532,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s20, s17 +; GFX6-NEXT: s_cselect_b32 s8, s19, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; 
GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9015,105 +8980,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s10, 0, s8 -; GFX6-NEXT: s_subb_u32 s11, 0, s9 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_mul_i32 s13, s11, s0 -; GFX6-NEXT: s_mul_i32 s14, s10, s0 -; GFX6-NEXT: s_add_i32 s1, s15, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14 -; GFX6-NEXT: s_add_i32 s1, s1, s13 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14 -; GFX6-NEXT: v_readfirstlane_b32 s13, v3 -; GFX6-NEXT: s_mul_i32 s15, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s13, s13, s15 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: s_mul_i32 s14, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s16, v4 -; GFX6-NEXT: s_add_u32 s13, s13, s14 -; GFX6-NEXT: s_addc_u32 s13, s15, s16 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s14 -; GFX6-NEXT: s_add_u32 s14, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s13 -; GFX6-NEXT: s_mul_i32 s0, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s11, s11, s14 -; GFX6-NEXT: s_mul_i32 s1, s10, s14 -; GFX6-NEXT: s_add_i32 s0, s0, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_mul_i32 s11, s14, s0 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s11, s15, s11 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s13 -; GFX6-NEXT: v_readfirstlane_b32 s10, v3 -; GFX6-NEXT: s_add_u32 s1, s11, s1 -; GFX6-NEXT: s_addc_u32 s1, s13, s10 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s0, s12, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_add_u32 s13, s14, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s11, 
s0, s10 +; GFX6-NEXT: v_readfirstlane_b32 s14, v2 +; GFX6-NEXT: s_mul_i32 s12, s1, s2 +; GFX6-NEXT: s_mul_i32 s13, s0, s2 +; GFX6-NEXT: s_add_i32 s11, s14, s11 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13 +; GFX6-NEXT: s_add_i32 s11, s11, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_mul_i32 s14, s2, s11 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11 +; GFX6-NEXT: s_add_u32 s12, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s13, s10, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s15, v4 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s12, s14, s15 +; GFX6-NEXT: v_readfirstlane_b32 s13, v1 +; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_mul_i32 s11, s10, s11 +; GFX6-NEXT: s_add_u32 s11, s12, s11 +; GFX6-NEXT: s_addc_u32 s12, 0, s13 +; GFX6-NEXT: s_add_u32 s11, s2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s10, s10, s12 +; GFX6-NEXT: s_mul_i32 s12, s0, s10 +; GFX6-NEXT: s_mul_i32 s1, s1, s11 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_add_i32 s1, s12, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: s_mul_i32 s13, s11, s1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s0, s10, s0 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s0, s13, s0 +; GFX6-NEXT: s_addc_u32 s0, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s1, s10, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s11, s0 +; GFX6-NEXT: s_addc_u32 s13, s10, s1 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s7, s10 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s12 +; GFX6-NEXT: s_mul_i32 s1, s6, s13 ; GFX6-NEXT: v_readfirstlane_b32 s14, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s14, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s13, s7, s13 +; GFX6-NEXT: s_mul_i32 s12, s7, s12 ; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s12 ; GFX6-NEXT: s_addc_u32 s1, s4, s14 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s12, s7, s12 +; GFX6-NEXT: s_mul_i32 s12, s7, s13 ; GFX6-NEXT: s_add_u32 s12, s1, s12 ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 @@ -9128,11 +9088,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: 
s_or_b32 s12, s4, s5 ; GFX6-NEXT: s_subb_u32 s15, s13, s9 ; GFX6-NEXT: s_sub_u32 s16, s6, s8 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s15, 0 ; GFX6-NEXT: s_cmp_ge_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, -1, 0 @@ -9141,13 +9099,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, s19, s18 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, s9 +; GFX6-NEXT: s_sub_u32 s13, s16, s8 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s13, s13, s16 ; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_subb_u32 s4, s7, s14 @@ -9164,6 +9120,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9405,8 +9362,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: s_sub_u32 s6, 0, s2 +; GFX6-NEXT: s_subb_u32 s7, 0, s3 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9415,69 +9372,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s7, s12, s14 +; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 ; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s6 -; GFX6-NEXT: s_mul_i32 s16, s12, s6 -; GFX6-NEXT: s_add_i32 s7, s17, s7 +; GFX6-NEXT: s_mul_i32 s15, s7, s13 +; GFX6-NEXT: s_mul_i32 s16, s6, s13 +; GFX6-NEXT: s_add_i32 s14, s17, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s7, s7, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7 +; GFX6-NEXT: s_add_i32 s14, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s18, s6, s7 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s7 +; GFX6-NEXT: s_mul_i32 s18, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14 ; GFX6-NEXT: s_add_u32 s15, s15, s18 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_mul_i32 s16, s12, s16 ; GFX6-NEXT: s_addc_u32 s18, 0, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v4 ; GFX6-NEXT: s_add_u32 s15, s15, s16 ; GFX6-NEXT: s_addc_u32 s15, s18, s17 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_add_u32 s7, s15, s7 
+; GFX6-NEXT: s_mul_i32 s14, s12, s14 +; GFX6-NEXT: s_add_u32 s14, s15, s14 ; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s6, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s6, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_add_i32 s6, s7, s6 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s7, s12, s16 -; GFX6-NEXT: s_add_i32 s6, s6, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s6 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_add_u32 s13, s13, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s15 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 +; GFX6-NEXT: s_mul_i32 s7, s7, s13 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s7, s13, s7 -; GFX6-NEXT: s_addc_u32 s7, s15, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s6, s14, s6 -; GFX6-NEXT: s_add_u32 s6, s7, s6 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s16, s6 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s12, s14, s12 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s6, s6, s13 +; GFX6-NEXT: s_add_i32 s7, s14, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s7 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s6, s12, s6 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s6, s15, s6 +; GFX6-NEXT: s_addc_u32 s6, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s7, s12, s7 +; GFX6-NEXT: s_add_u32 s6, s6, s7 +; GFX6-NEXT: s_addc_u32 s7, 0, s14 +; GFX6-NEXT: s_add_u32 s13, s13, s6 +; GFX6-NEXT: s_addc_u32 s12, s12, s7 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -9514,11 +9467,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s14, s3 ; GFX6-NEXT: s_sub_u32 s18, s8, s2 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9527,13 +9478,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; 
GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_subb_u32 s14, s17, s3 +; GFX6-NEXT: s_sub_u32 s15, s18, s2 +; GFX6-NEXT: s_subb_u32 s14, s14, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s15, s15, s18 ; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 @@ -9556,8 +9505,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s8, 0, s6 -; GFX6-NEXT: s_subb_u32 s9, 0, s7 +; GFX6-NEXT: s_sub_u32 s2, 0, s6 +; GFX6-NEXT: s_subb_u32 s3, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9566,70 +9515,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s9, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s9, s2, s8 +; GFX6-NEXT: v_readfirstlane_b32 s12, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s9, s12, s9 +; GFX6-NEXT: s_add_i32 s9, s9, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s12, s0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s16, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9 +; GFX6-NEXT: s_add_u32 s12, s16, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: s_mul_i32 s1, s8, s1 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 -; GFX6-NEXT: s_addc_u32 s4, s5, s16 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 -; GFX6-NEXT: s_mul_i32 s2, s8, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s9, s9, s5 -; GFX6-NEXT: s_mul_i32 s3, s8, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s12, s1 +; GFX6-NEXT: s_addc_u32 s1, s13, s16 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: 
s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s9, s8, s9 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_addc_u32 s9, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s8, s9 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_add_i32 s5, s8, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s12 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s9, s5, s2 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_mul_i32 s8, s12, s3 ; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: v_readfirstlane_b32 s8, v3 -; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 -; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: s_addc_u32 s8, s8, 0 +; GFX6-NEXT: s_add_u32 s8, s13, s8 +; GFX6-NEXT: v_readfirstlane_b32 s9, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, 0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s8, s2 +; GFX6-NEXT: s_addc_u32 s2, s9, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s12, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 @@ -9667,11 +9612,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 ; GFX6-NEXT: s_subb_u32 s17, s12, s7 ; GFX6-NEXT: s_sub_u32 s18, s8, s6 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9680,13 +9623,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_subb_u32 s12, s17, s7 +; GFX6-NEXT: s_sub_u32 s13, s18, s6 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s13, s13, s18 ; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 
0000000000000..7669ae21f6635 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,510 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ;
=>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; 
GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; 
GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, 
s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: 
; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. ; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index c962c05d24ad0..5d79696572cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: -; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 +; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0) +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]] ; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] ; PASS-CHECK: [[WORK]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: -; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] +; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]]) +; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]] ; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]] ; PASS-CHECK: [[WORK]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index a7e828c95d69f..402ccd91fed8d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) { ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v = call i32 @llvm.amdgcn.readfirstlane(i32 7) @@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3 ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) @@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou ; ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5) +; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5) +; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) @@ -388,7 +396,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { ; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; PASS-CHECK-NEXT: 
[[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) -; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane( @@ -396,7 +405,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) { ; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]]) -; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]]) +; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( ; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 -; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]]) +; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; PASS-CHECK-NEXT: ret void ; ; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random( ; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456 -; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4 +; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]]) +; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 ; DCE-CHECK-NEXT: ret void ; %random = xor i32 123, 456 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 29bfc253e2e7e..fe9ec8e6ef52a 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -123,27 +123,25 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_and_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm %and = and i32 %a, %b store i32 %and, ptr addrspace(1) %out, align 4 @@ -189,36 +187,34 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, ; GFX6-LABEL: s_and_multi_use_constant_i32_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX6-NEXT: s_add_i32 s0, s0, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, 0x12d687 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_multi_use_constant_i32_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x12d687 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, 0x12d687 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm %and = and i32 %a, 1234567 @@ -236,32 +232,30 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, ; GFX6-LABEL: s_and_multi_use_constant_i32_1: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX6-NEXT: s_add_i32 s0, s0, s3 -; GFX6-NEXT: s_add_i32 s0, s0, 0x12d687 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_add_i32 s4, s4, 0x12d687 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_and_multi_use_constant_i32_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_and_b32 s0, s2, 0x12d687 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_add_i32 s0, s0, 
0x12d687 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_and_b32 s4, s4, 0x12d687 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_add_i32 s4, s4, 0x12d687 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm %and = and i32 %a, 1234567 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 18cf120a1d299..61645200690f5 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -27,24 +27,26 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_lshr_b32 s7, s9, 16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 8 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:12 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_lshr_b32 s4, s4, 24 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 9a4040a25419a..49977a4c64784 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v0, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 @@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: s_lshl_b32 s2, s2, 2 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a95881067b93..ff74d1f71616d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -6208,10 +6208,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 08a4f0cdad18f..f5ca24f59a286 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -5182,13 +5182,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 30a78648c186a..39618b05e6c71 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 30ad46d959b7e..393e9fecbb308 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: 
flat_load_dwordx4 v[0:3], v[28:29] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -46065,18 +46065,18 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46087,13 +46087,13 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -46203,22 +46203,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 +; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46229,21 +46229,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index c14678cafc7a4..c0d5f8a9d1c3b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -120,17 +120,17 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s2, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_sub_i32 s2, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s2 +; SI-NEXT: s_lshr_b32 s4, s4, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ubfe_sub_i32: @@ -160,20 +160,20 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s3, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_sub_i32 s3, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s3 +; SI-NEXT: s_lshr_b32 s5, s4, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; @@ -322,17 +322,17 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_ashr_i32 s2, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_sub_i32 s2, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s2 +; SI-NEXT: s_ashr_i32 s4, s4, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sbfe_sub_i32: @@ -362,20 +362,20 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_lshl_b32 s2, s2, s3 -; SI-NEXT: s_ashr_i32 s3, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_sub_i32 s3, 32, s5 +; SI-NEXT: s_lshl_b32 s4, s4, s3 +; SI-NEXT: s_ashr_i32 s5, s4, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index bd76f34832f0a..7326adae8cbcb 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -284,16 +284,15 @@ define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %o ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index a12b5ea4c0c21..172e07f6b792c 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -6,14 +6,13 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_bfm_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_bfm_pattern: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index d4f56175d790c..e33b9ab0eda9e 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -362,31 +362,29 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_brev_b32 s0, s3 -; 
SI-NEXT: s_brev_b32 s1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_brev_b32 s5, s5 +; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_mov_b32 s4, s0 -; FLAT-NEXT: s_mov_b32 s5, s1 -; FLAT-NEXT: s_brev_b32 s0, s3 -; FLAT-NEXT: s_brev_b32 s1, s2 -; FLAT-NEXT: v_mov_b32_e32 v0, s1 -; FLAT-NEXT: v_mov_b32_e32 v1, s0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_mov_b64 s[4:5], s[2:3] +; FLAT-NEXT: s_brev_b32 s5, s5 +; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i32: @@ -405,16 +403,14 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_brev_b32 s3, s3 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-FLAT-NEXT: s_mov_b32 s4, s0 -; GFX11-FLAT-NEXT: s_mov_b32 s5, s1 -; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 8a39d9c517b50..34c0159dd3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A 
v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index a2f02052cbf36..4cf92b0127131 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A 
v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index f8655a702180e..f465e3c505c02 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -280,7 +280,7 @@ bb0: br i1 %tmp, label %bb2, label %bb3 bb2: - store volatile i32 17, ptr addrspace(1) undef + store volatile i32 17, ptr addrspace(1) poison br label %bb4 bb3: @@ -375,7 +375,7 @@ bb0: br i1 %cmp0, label %bb2, label %bb1 bb1: - %val = load volatile i32, ptr addrspace(4) undef + %val = load volatile i32, ptr addrspace(4) poison %cmp1 = icmp eq i32 %val, 3 br i1 %cmp1, label %bb3, label %bb2 @@ -512,7 +512,7 @@ loop_body: br label %loop ret: - store volatile i32 7, ptr addrspace(1) undef + store volatile i32 7, ptr addrspace(1) poison ret void } @@ -622,7 +622,7 @@ bb14: ; preds = %bb13, %bb9 br label %bb19 bb19: ; preds = %bb14, %bb13, %bb9 - %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ] + %tmp20 = phi i32 [ poison, %bb9 ], [ poison, %bb13 ], [ %tmp18, %bb14 ] %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5 store i32 %tmp20, ptr addrspace(1) %tmp21, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 142290a39f8f4..361bc78759bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2382,17 +2382,17 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt 
vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..04f8ad8a02303 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen 
offset:240 @@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 ; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX1100-NEXT: buffer_load_b128 
v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 763f436997c21..37f4094806637 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -186,10 +186,12 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -255,16 +257,15 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_lshl_b32 s1, s2, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index d7d697ef85b9f..00baf0a44368d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1026,102 +1026,100 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc -; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, 
vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 -; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v1, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GFX9-NEXT: 
v_add_co_u32_e64 v10, s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB10_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll new file mode 100644 index 0000000000000..04e472419ca61 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s + +; We've separated this file from call-args-inreg.ll since GlobalISel does not support the bfloat type. +; Ideally, we should merge the two files once that support lands. 
+ +declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0 +declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0 + +define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { +; GFX9-LABEL: test_call_external_void_func_bf16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_external_void_func_bf16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_bf16_inreg(bfloat inreg %arg) + ret void +} + +define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 { +; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 
s[30:31], s[18:19] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_writelane_b32 v40, s1, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index d1cede64ce71d..f96007ae513bd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GISEL %s declare hidden void @external_void_func_i8_inreg(i8 inreg) #0 declare hidden void @external_void_func_i16_inreg(i32 inreg) #0 @@ -12,11 +14,9 @@ declare hidden void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0 declare hidden void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0 declare hidden void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0 declare hidden void @external_void_func_f16_inreg(half inreg) #0 -declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0 declare hidden void @external_void_func_f32_inreg(float inreg) #0 declare hidden void @external_void_func_f64_inreg(double inreg) #0 declare hidden void @external_void_func_v2f16_inreg(<2 x half> inreg) #0 -declare hidden void 
@external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0 declare hidden void @external_void_func_v3f16_inreg(<3 x half> inreg) #0 declare hidden void @external_void_func_v4f16_inreg(<4 x half> inreg) #0 @@ -212,35 +212,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { } define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_i64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -273,35 +244,6 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { } define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -334,36 +276,6 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v3i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s19, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s19, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v3i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -396,37 +308,6 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s20, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s20, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -459,41 +340,6 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { } define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s24, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[26:27], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[26:27] -; GFX9-NEXT: v_writelane_b32 v40, s24, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 
0 -; GFX9-NEXT: s_getpc_b64 s[24:25] -; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: s_mov_b32 s17, s21 -; GFX9-NEXT: s_mov_b32 s18, s22 -; GFX9-NEXT: s_mov_b32 s19, s23 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -585,66 +431,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ret void } -define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_bf16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_call_external_void_func_bf16_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 
-; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - call void @external_void_func_bf16_inreg(bfloat inreg %arg) - ret void -} - define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_f32_inreg: ; GFX9: ; %bb.0: @@ -706,35 +492,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { } define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_f64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -826,97 +583,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ret void } - -define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte 
Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) - ret void -} - define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -949,35 +616,6 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 } define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v4f16_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; 
GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v4f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1010,35 +648,6 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 } define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_p0_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_p0_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1071,35 +680,6 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { } define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_p1_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_p1_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1192,37 +772,6 @@ define void 
@test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) } define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2p1_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s20, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s20, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2p1_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1255,35 +804,6 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre } define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 { -; GFX9-LABEL: test_call_external_void_func_v2p5_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_v2p5_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1316,38 +836,6 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre } define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 { -; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s21, s33 -; 
GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s21, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1380,46 +868,6 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre } define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 { -; GFX9-LABEL: test_call_external_void_func_a15i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s29, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-NEXT: v_writelane_b32 v40, s29, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[40:41] -; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s19 -; GFX9-NEXT: s_mov_b32 s2, s18 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: s_mov_b32 s17, s21 -; GFX9-NEXT: s_mov_b32 s18, s22 -; GFX9-NEXT: s_mov_b32 s19, s23 -; GFX9-NEXT: s_mov_b32 s20, s24 -; GFX9-NEXT: s_mov_b32 s21, s25 -; GFX9-NEXT: s_mov_b32 s22, s26 -; GFX9-NEXT: s_mov_b32 s23, s27 -; GFX9-NEXT: s_mov_b32 s24, s28 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1454,47 +902,6 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; FIXME: This should also fail define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 { -; GFX9-LABEL: 
test_call_external_void_func_a15i32_inreg_i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s21, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-NEXT: v_writelane_b32 v40, s21, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s9 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 -; GFX9-NEXT: s_mov_b32 s8, s15 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s15, s19 -; GFX9-NEXT: s_mov_b32 s16, s20 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1529,3 +936,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 8e12e7e03947b..c407f7645315d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=CI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s +; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s declare hidden void @external_void_func_i1(i1) #0 declare hidden void @external_void_func_i1_signext(i1 signext) #0 @@ -100,24 +101,24 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: 
test_call_external_void_func_i1_imm: ; GFX11: ; %bb.0: @@ -145,6 +146,25 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i1(i1 true) ret void } @@ -196,28 +216,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i1_signext: ; GFX11: ; %bb.0: @@ -253,6 +273,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 
s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) poison call void @external_void_func_i1_signext(i1 signext %var) ret void @@ -306,28 +349,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i1_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i1_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i1_zeroext: ; GFX11: ; %bb.0: @@ -363,6 +406,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: v_and_b32_e32 v0, 1, v0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i1_zeroext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], 
s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) poison call void @external_void_func_i1_zeroext(i1 zeroext %var) ret void @@ -407,24 +473,24 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm: ; GFX11-TRUE16: ; %bb.0: @@ -463,6 +529,25 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i8(i8 123) ret void } @@ -513,27 +598,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: 
buffer_load_sbyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i8_signext: ; GFX11: ; %bb.0: @@ -567,6 +652,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) poison call void @external_void_func_i8_signext(i8 signext %var) ret void @@ -617,27 +724,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i8_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: 
s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i8_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i8_zeroext: ; GFX11: ; %bb.0: @@ -671,6 +778,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i8_zeroext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) poison call void @external_void_func_i8_zeroext(i8 zeroext %var) ret void @@ -715,24 +844,24 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 
+; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm: ; GFX11-TRUE16: ; %bb.0: @@ -771,6 +900,25 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i16(i16 123) ret void } @@ -820,27 +968,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_signext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: 
s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i16_signext: ; GFX11: ; %bb.0: @@ -874,6 +1022,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_signext: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) poison call void @external_void_func_i16_signext(i16 signext %var) ret void @@ -924,27 +1094,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i16_zeroext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i16_zeroext: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i16_zeroext: ; GFX11: ; %bb.0: @@ -978,6 +1148,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i16_zeroext: +; GISEL: ; 
%bb.0: +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) poison call void @external_void_func_i16_zeroext(i16 zeroext %var) ret void @@ -1022,24 +1214,24 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i32_imm: ; GFX11: ; %bb.0: @@ -1067,6 +1259,25 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 
s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i32(i32 42) ret void } @@ -1112,25 +1323,25 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_i64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_i64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i64_imm: ; GFX11: ; %bb.0: @@ -1159,6 +1370,26 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_i64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_i64(i64 123) ret void } @@ -1208,27 +1439,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, 
s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i64: ; GFX11: ; %bb.0: @@ -1262,6 +1493,31 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null call void @external_void_func_v2i64(<2 x i64> %val) ret void @@ -1312,27 +1568,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: 
s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i64_imm: ; GFX11: ; %bb.0: @@ -1364,6 +1620,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) ret void } @@ -1417,29 +1695,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: 
s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v4, 1 +; SDAG-NEXT: v_mov_b32_e32 v5, 2 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i64: ; GFX11: ; %bb.0: @@ -1476,6 +1754,33 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v4, 1 +; GISEL-NEXT: v_mov_b32_e32 v5, 2 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 poison>, <3 x i32> <i32 0, i32 1, i32 2> @@ -1536,31 +1841,31 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i64: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], 0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 
s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v4, 1 +; SDAG-NEXT: v_mov_b32_e32 v5, 2 +; SDAG-NEXT: v_mov_b32_e32 v6, 3 +; SDAG-NEXT: v_mov_b32_e32 v7, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i64: ; GFX11: ; %bb.0: @@ -1600,6 +1905,35 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v4, 1 +; GISEL-NEXT: v_mov_b32_e32 v5, 2 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: v_mov_b32_e32 v6, 3 +; GISEL-NEXT: v_mov_b32_e32 v7, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> call void @external_void_func_v4i64(<4 x i64> %val) @@ -1645,24 +1979,24 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, 
external_void_func_f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm: ; GFX11-TRUE16: ; %bb.0: @@ -1701,6 +2035,25 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x4400 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) ret void } @@ -1744,24 +2097,24 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 4.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f32_imm: ; GFX11: ; %bb.0: @@ -1789,6 +2142,25 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: 
s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) ret void } @@ -1834,25 +2206,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f32_imm: ; GFX11: ; %bb.0: @@ -1881,6 +2253,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) ret void } @@ -1928,26 +2320,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: 
test_call_external_void_func_v3f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 4.0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f32_imm: ; GFX11: ; %bb.0: @@ -1978,6 +2370,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>) ret void } @@ -2029,28 +2442,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v5f32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 
s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v5f32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 4.0 +; SDAG-NEXT: v_mov_b32_e32 v3, -1.0 +; SDAG-NEXT: v_mov_b32_e32 v4, 0.5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v5f32_imm: ; GFX11: ; %bb.0: @@ -2084,6 +2497,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v5f32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: v_mov_b32_e32 v3, -1.0 +; GISEL-NEXT: v_mov_b32_e32 v4, 0.5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) ret void } @@ -2129,25 +2565,25 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 
s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x40100000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f64_imm: ; GFX11: ; %bb.0: @@ -2176,6 +2612,26 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) ret void } @@ -2225,27 +2681,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; 
SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f64_imm: ; GFX11: ; %bb.0: @@ -2277,6 +2733,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) ret void } @@ -2330,29 +2808,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f64_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f64_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0x40200000 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f64_imm: ; GFX11: ; %bb.0: @@ -2387,6 +2865,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; 
HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f64_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) ret void } @@ -2436,26 +2938,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i16: ; GFX11: ; %bb.0: @@ -2487,6 +2989,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 
s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) poison call void @external_void_func_v2i16(<2 x i16> %val) ret void @@ -2539,26 +3062,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i16: ; GFX11: ; %bb.0: @@ -2590,6 +3113,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) poison call void @external_void_func_v3i16(<3 x i16> %val) ret 
void @@ -2643,26 +3188,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f16: ; GFX11: ; %bb.0: @@ -2694,6 +3239,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) poison call void @external_void_func_v3f16(<3 x half> %val) ret void @@ -2741,25 +3308,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 
s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 +; SDAG-NEXT: v_mov_b32_e32 v1, 3 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i16_imm: ; GFX11: ; %bb.0: @@ -2788,6 +3355,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 +; GISEL-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) ret void } @@ -2834,25 +3421,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3f16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3f16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; 
SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x4400 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3f16_imm: ; GFX11: ; %bb.0: @@ -2882,6 +3469,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3f16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x4400 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>) ret void } @@ -2934,26 +3541,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: 
test_call_external_void_func_v4i16: ; GFX11: ; %bb.0: @@ -2985,6 +3592,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) poison call void @external_void_func_v4i16(<4 x i16> %val) ret void @@ -3033,25 +3662,25 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i16_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i16_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x40003 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i16_imm: ; GFX11: ; %bb.0: @@ -3081,6 +3710,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i16_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; 
GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x40003 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) ret void } @@ -3132,26 +3781,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f16: ; GFX11: ; %bb.0: @@ -3183,6 +3832,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) poison call void @external_void_func_v2f16(<2 x half> %val) ret 
void @@ -3231,26 +3901,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i32: ; GFX11: ; %bb.0: @@ -3282,6 +3952,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) poison call void @external_void_func_v2i32(<2 x i32> %val) ret void @@ -3328,25 +4020,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v2i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 
s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v2i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i32_imm: ; GFX11: ; %bb.0: @@ -3375,6 +4067,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v2i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) ret void } @@ -3422,26 +4134,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; 
SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v1, 4 +; SDAG-NEXT: v_mov_b32_e32 v2, 5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i32_imm: ; GFX11: ; %bb.0: @@ -3472,6 +4184,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>) ret void } @@ -3521,27 +4254,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v3i32_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: v_mov_b32_e32 v3, 6 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v3i32_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v1, 4 +; SDAG-NEXT: v_mov_b32_e32 v2, 5 +; SDAG-NEXT: v_mov_b32_e32 v3, 6 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: 
s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i32_i32: ; GFX11: ; %bb.0: @@ -3573,6 +4306,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v3i32_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: v_mov_b32_e32 v3, 6 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) ret void } @@ -3620,26 +4375,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i32: ; GFX11: ; %bb.0: @@ -3671,6 +4426,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; 
GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) poison call void @external_void_func_v4i32(<4 x i32> %val) ret void @@ -3721,27 +4500,27 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v4i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v4i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v4i32_imm: ; GFX11: ; %bb.0: @@ -3773,6 +4552,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v4i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GISEL-NEXT: 
v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) ret void } @@ -3824,28 +4625,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v5i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v5i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: v_mov_b32_e32 v4, 5 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v5i32_imm: ; GFX11: ; %bb.0: @@ -3879,6 +4680,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v5i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) ret void } @@ -3932,29 +4756,29 @@ define 
amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v8i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v8i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v8i32: ; GFX11: ; %bb.0: @@ -3993,6 +4817,36 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v8i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: v_mov_b32_e32 v4, s12 +; GISEL-NEXT: v_mov_b32_e32 v5, s13 +; GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GISEL-NEXT: v_mov_b32_e32 v7, s15 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr 
addrspace(4) poison %val = load <8 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v8i32(<8 x i32> %val) @@ -4052,31 +4906,31 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v8i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_mov_b32_e32 v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: v_mov_b32_e32 v5, 6 -; GFX9-NEXT: v_mov_b32_e32 v6, 7 -; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v8i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: v_mov_b32_e32 v0, 1 +; SDAG-NEXT: v_mov_b32_e32 v1, 2 +; SDAG-NEXT: v_mov_b32_e32 v2, 3 +; SDAG-NEXT: v_mov_b32_e32 v3, 4 +; SDAG-NEXT: v_mov_b32_e32 v4, 5 +; SDAG-NEXT: v_mov_b32_e32 v5, 6 +; SDAG-NEXT: v_mov_b32_e32 v6, 7 +; SDAG-NEXT: v_mov_b32_e32 v7, 8 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v8i32_imm: ; GFX11: ; %bb.0: @@ -4114,6 +4968,32 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v8i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: v_mov_b32_e32 v2, 3 +; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: v_mov_b32_e32 v5, 6 +; GISEL-NEXT: v_mov_b32_e32 v6, 7 +; GISEL-NEXT: v_mov_b32_e32 v7, 8 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> <i32 1, 
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) ret void } @@ -4171,31 +5051,31 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v16i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v16i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v16i32: ; GFX11: ; %bb.0: @@ -4238,6 +5118,44 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v16i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: 
v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: v_mov_b32_e32 v4, s12 +; GISEL-NEXT: v_mov_b32_e32 v5, s13 +; GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GISEL-NEXT: v_mov_b32_e32 v7, s15 +; GISEL-NEXT: v_mov_b32_e32 v8, s16 +; GISEL-NEXT: v_mov_b32_e32 v9, s17 +; GISEL-NEXT: v_mov_b32_e32 v10, s18 +; GISEL-NEXT: v_mov_b32_e32 v11, s19 +; GISEL-NEXT: v_mov_b32_e32 v12, s20 +; GISEL-NEXT: v_mov_b32_e32 v13, s21 +; GISEL-NEXT: v_mov_b32_e32 v14, s22 +; GISEL-NEXT: v_mov_b32_e32 v15, s23 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <16 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v16i32(<16 x i32> %val) @@ -4253,6 +5171,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4260,7 +5179,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +5190,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,6 +5203,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4292,7 +5211,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,42 +5222,42 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: 
test_call_external_void_func_v32i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v32i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-NEXT: s_mov_b32 s6, -1 +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(6) +; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32: ; GFX11: ; %bb.0: @@ -4394,6 +5312,62 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v32i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s3 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <32 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v32i32(<32 x i32> %val) @@ -4471,40 +5445,40 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v32i32_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v32i32_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-NEXT: s_mov_b32 s6, -1 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(8) +; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(8) +; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32_i32: ; GFX11: ; %bb.0: @@ -4566,6 +5540,67 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v32i32_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 
v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison %val0 = load <32 x i32>, ptr addrspace(1) %ptr0 %val1 = load i32, ptr addrspace(1) poison @@ -4622,29 +5657,29 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_i32_func_i32_imm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s5 -; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_mov_b32 s39, 0xf000 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_i32_func_i32_imm: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s50, -1 +; SDAG-NEXT: s_mov_b32 s51, 0xe00000 +; SDAG-NEXT: s_add_u32 s48, s48, s5 +; SDAG-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; SDAG-NEXT: s_addc_u32 s49, s49, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 s39, 0xf000 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_i32_func_i32_imm: ; GFX11: ; %bb.0: @@ -4682,6 +5717,30 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; HSA-NEXT: 
s_waitcnt vmcnt(0) ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_i32_func_i32_imm: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s50, -1 +; GISEL-NEXT: s_mov_b32 s51, 0xe00000 +; GISEL-NEXT: s_add_u32 s48, s48, s5 +; GISEL-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GISEL-NEXT: s_addc_u32 s49, s49, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[2:3], s[50:51] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xf000 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out ret void @@ -4736,29 +5795,29 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX11: ; %bb.0: @@ -4797,6 +5856,30 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_call_external_void_func_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison %val = load { i8, i32 }, ptr addrspace(1) %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) @@ -4860,34 +5943,34 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_movk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_movk_i32 s32, 0x400 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], 
s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX11-TRUE16: ; %bb.0: @@ -4948,6 +6031,35 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_movk_i32 s32, 0x400 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %val = alloca { i8, i32 }, align 8, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1 @@ -5034,44 +6146,44 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s5 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[4:5] -; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s5 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 3 +; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; SDAG-NEXT: s_movk_i32 s32, 0x800 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; SDAG-NEXT: v_mov_b32_e32 v0, 8 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX11-TRUE16: ; %bb.0: @@ -5170,6 +6282,45 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s5 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 +; GISEL-NEXT: s_movk_i32 s32, 0x800 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 
s[2:3], s[38:39] +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %in.val = alloca { i8, i32 }, align 8, addrspace(5) %out.val = alloca { i8, i32 }, align 8, addrspace(5) %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0 @@ -5272,47 +6423,47 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: test_call_external_void_func_v16i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v16 -; GFX9-NEXT: v_mov_b32_e32 v2, v17 -; GFX9-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: test_call_external_void_func_v16i8: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s38, -1 +; SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_addc_u32 s37, s37, 0 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v12, v3 +; SDAG-NEXT: v_mov_b32_e32 v1, v16 +; SDAG-NEXT: v_mov_b32_e32 v2, v17 +; SDAG-NEXT: v_mov_b32_e32 v3, v18 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v16i8: ; GFX11: ; %bb.0: @@ -5384,6 +6535,56 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: v_mov_b32_e32 v3, v18 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: test_call_external_void_func_v16i8: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s38, -1 +; GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GISEL-NEXT: s_add_u32 s36, s36, s3 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_lshr_b32 s8, s0, 8 +; GISEL-NEXT: s_lshr_b32 s9, s0, 16 +; GISEL-NEXT: s_lshr_b32 s10, s0, 24 +; GISEL-NEXT: s_lshr_b32 s11, s1, 8 +; GISEL-NEXT: s_lshr_b32 s12, s1, 16 +; GISEL-NEXT: s_lshr_b32 s13, s1, 24 +; GISEL-NEXT: s_lshr_b32 s14, s2, 8 +; GISEL-NEXT: s_lshr_b32 s15, s2, 16 +; GISEL-NEXT: s_lshr_b32 s16, s2, 24 +; GISEL-NEXT: s_lshr_b32 s17, s3, 8 +; GISEL-NEXT: s_lshr_b32 s18, s3, 16 +; GISEL-NEXT: s_lshr_b32 s19, s3, 24 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v4, s1 +; GISEL-NEXT: v_mov_b32_e32 v8, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v1, s8 +; GISEL-NEXT: v_mov_b32_e32 v2, s9 +; GISEL-NEXT: v_mov_b32_e32 v3, s10 +; GISEL-NEXT: v_mov_b32_e32 v5, s11 +; GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GISEL-NEXT: v_mov_b32_e32 v7, s13 +; GISEL-NEXT: v_mov_b32_e32 v9, s14 +; GISEL-NEXT: v_mov_b32_e32 v10, s15 +; GISEL-NEXT: v_mov_b32_e32 v11, s16 +; GISEL-NEXT: v_mov_b32_e32 v13, s17 +; GISEL-NEXT: v_mov_b32_e32 v14, s18 +; GISEL-NEXT: v_mov_b32_e32 v15, s19 +; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison %val = load <16 x i8>, ptr addrspace(1) %ptr call void @external_void_func_v16i8(<16 x i8> %val) @@ -5509,64 +6710,64 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; -; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 
-; GFX9-NEXT: s_mov_b32 s54, -1 -; GFX9-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-NEXT: s_add_u32 s52, s52, s5 -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_addc_u32 s53, s53, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s23 -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] -; GFX9-NEXT: v_mov_b32_e32 v0, s36 -; GFX9-NEXT: v_mov_b32_e32 v1, s37 -; GFX9-NEXT: v_mov_b32_e32 v2, s38 -; GFX9-NEXT: v_mov_b32_e32 v3, s39 -; GFX9-NEXT: v_mov_b32_e32 v4, s40 -; GFX9-NEXT: v_mov_b32_e32 v5, s41 -; GFX9-NEXT: v_mov_b32_e32 v6, s42 -; GFX9-NEXT: v_mov_b32_e32 v7, s43 -; GFX9-NEXT: v_mov_b32_e32 v8, s44 -; GFX9-NEXT: v_mov_b32_e32 v9, s45 -; GFX9-NEXT: v_mov_b32_e32 v10, s46 -; GFX9-NEXT: v_mov_b32_e32 v11, s47 -; GFX9-NEXT: v_mov_b32_e32 v12, s48 -; GFX9-NEXT: v_mov_b32_e32 v13, s49 -; GFX9-NEXT: v_mov_b32_e32 v14, s50 -; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s16 -; GFX9-NEXT: v_mov_b32_e32 v25, s17 -; GFX9-NEXT: v_mov_b32_e32 v26, s18 -; GFX9-NEXT: v_mov_b32_e32 v27, s19 -; GFX9-NEXT: v_mov_b32_e32 v28, s20 -; GFX9-NEXT: v_mov_b32_e32 v29, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: s_endpgm +; SDAG-LABEL: stack_passed_arg_alignment_v32i32_f64: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; SDAG-NEXT: s_mov_b32 s54, -1 +; SDAG-NEXT: s_mov_b32 s55, 0xe00000 +; SDAG-NEXT: s_add_u32 s52, s52, s5 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; SDAG-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_addc_u32 s53, s53, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s23 +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; SDAG-NEXT: v_mov_b32_e32 v0, s5 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] +; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[2:3], s[54:55] +; SDAG-NEXT: v_mov_b32_e32 v0, s36 +; SDAG-NEXT: v_mov_b32_e32 v1, s37 +; SDAG-NEXT: v_mov_b32_e32 v2, s38 +; SDAG-NEXT: v_mov_b32_e32 v3, s39 +; SDAG-NEXT: v_mov_b32_e32 v4, s40 +; SDAG-NEXT: v_mov_b32_e32 v5, s41 +; SDAG-NEXT: v_mov_b32_e32 v6, s42 +; SDAG-NEXT: v_mov_b32_e32 v7, s43 +; 
SDAG-NEXT: v_mov_b32_e32 v8, s44 +; SDAG-NEXT: v_mov_b32_e32 v9, s45 +; SDAG-NEXT: v_mov_b32_e32 v10, s46 +; SDAG-NEXT: v_mov_b32_e32 v11, s47 +; SDAG-NEXT: v_mov_b32_e32 v12, s48 +; SDAG-NEXT: v_mov_b32_e32 v13, s49 +; SDAG-NEXT: v_mov_b32_e32 v14, s50 +; SDAG-NEXT: v_mov_b32_e32 v15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX11: ; %bb.0: ; %entry @@ -5662,6 +6863,65 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: v_mov_b32_e32 v30, s22 ; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] ; HSA-NEXT: s_endpgm +; +; GISEL-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GISEL-NEXT: s_mov_b32 s54, -1 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa4 +; GISEL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; GISEL-NEXT: s_mov_b32 s55, 0xe00000 +; GISEL-NEXT: s_add_u32 s52, s52, s5 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s23 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, s36 +; GISEL-NEXT: v_mov_b32_e32 v1, s37 +; GISEL-NEXT: v_mov_b32_e32 v2, s38 +; GISEL-NEXT: v_mov_b32_e32 v3, s39 +; GISEL-NEXT: v_mov_b32_e32 v4, s40 +; GISEL-NEXT: v_mov_b32_e32 v5, s41 +; GISEL-NEXT: v_mov_b32_e32 v6, s42 +; GISEL-NEXT: v_mov_b32_e32 v7, s43 +; GISEL-NEXT: v_mov_b32_e32 v8, s44 +; GISEL-NEXT: v_mov_b32_e32 v9, s45 +; GISEL-NEXT: v_mov_b32_e32 v10, s46 +; GISEL-NEXT: v_mov_b32_e32 v11, s47 +; GISEL-NEXT: v_mov_b32_e32 v12, s48 +; GISEL-NEXT: v_mov_b32_e32 v13, s49 +; GISEL-NEXT: v_mov_b32_e32 v14, s50 +; GISEL-NEXT: v_mov_b32_e32 v15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] +; GISEL-NEXT: v_mov_b32_e32 v20, s12 +; GISEL-NEXT: v_mov_b32_e32 v21, s13 +; GISEL-NEXT: v_mov_b32_e32 v22, s14 +; GISEL-NEXT: v_mov_b32_e32 v23, s15 +; GISEL-NEXT: v_mov_b32_e32 v24, s16 +; GISEL-NEXT: v_mov_b32_e32 v25, s17 +; GISEL-NEXT: v_mov_b32_e32 v26, s18 +; GISEL-NEXT: v_mov_b32_e32 v27, s19 +; GISEL-NEXT: v_mov_b32_e32 v28, s20 +; GISEL-NEXT: v_mov_b32_e32 v29, s21 +; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_swappc_b64 s[30:31], 
s[4:5] +; GISEL-NEXT: s_endpgm entry: call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void @@ -5702,22 +6962,22 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 ; CI-NEXT: s_setpc_b64 s[4:5] ; -; GFX9-LABEL: tail_call_byval_align16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_setpc_b64 s[4:5] +; SDAG-LABEL: tail_call_byval_align16: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SDAG-NEXT: s_waitcnt vmcnt(2) +; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; SDAG-NEXT: s_waitcnt vmcnt(1) +; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; SDAG-NEXT: s_setpc_b64 s[4:5] ; ; GFX11-LABEL: tail_call_byval_align16: ; GFX11: ; %bb.0: ; %entry @@ -5749,6 +7009,23 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 ; HSA-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-LABEL: tail_call_byval_align16: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; GISEL-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca) diff --git a/llvm/test/CodeGen/AMDGPU/call-c-function.ll b/llvm/test/CodeGen/AMDGPU/call-c-function.ll index e1bb3eab25efd..4fbc7271ba0c5 100644 --- a/llvm/test/CodeGen/AMDGPU/call-c-function.ll +++ b/llvm/test/CodeGen/AMDGPU/call-c-function.ll @@ -1,21 +1,68 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 
-o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=SDAG -enable-var-scope %s +; RUN: llc -global-isel=1 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GISEL -enable-var-scope %s ; Test that we don't explode on calls from shaders to functions with the C calling convention. define amdgpu_ps void @amdgpu_ps_call_default_cc() { - ; CHECK-LABEL: name: amdgpu_ps_call_default_cc - ; CHECK: bb.0.main_body: - ; CHECK-NEXT: S_ENDPGM 0 + ; SDAG-LABEL: name: amdgpu_ps_call_default_cc + ; SDAG: bb.0.main_body: + ; SDAG-NEXT: S_ENDPGM 0 + ; + ; GISEL-LABEL: name: amdgpu_ps_call_default_cc + ; GISEL: bb.1.main_body: + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]] + ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]] + ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]] + ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]] + ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]] + ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]] + ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: S_ENDPGM 0 main_body: call void null() ret void } define amdgpu_gfx void @amdgpu_gfx_call_default_cc() { - ; CHECK-LABEL: name: amdgpu_gfx_call_default_cc - ; CHECK: bb.0.main_body: - ; CHECK-NEXT: SI_RETURN + ; SDAG-LABEL: name: amdgpu_gfx_call_default_cc + ; SDAG: bb.0.main_body: + ; SDAG-NEXT: SI_RETURN + ; + ; GISEL-LABEL: name: amdgpu_gfx_call_default_cc + ; GISEL: bb.1.main_body: + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]] + ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]] + ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]] + ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]] + ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]] + ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]] + ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]] + ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0 + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit 
$sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: SI_RETURN main_body: call void null() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll index 5f324df30f7e2..fe0b0188d2d37 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -1,84 +1,341 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s -; GCN-LABEL: {{^}}test_bitcast_return_type_noinline: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 { +; SDAG-LABEL: test_bitcast_return_type_noinline: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_return_type_noinline: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ret_i32_noinline() %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline: -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 { +; SDAG-LABEL: test_bitcast_return_type_alwaysinline: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_return_type_alwaysinline: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ret_i32_alwaysinline() %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_argument_type: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_argument_type() #0 { +; SDAG-LABEL: test_bitcast_argument_type: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_argument_type: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] 
+; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call i32 @ident_i32(float 2.0) %op = add i32 %val, 1 store volatile i32 %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 { +; SDAG-LABEL: test_bitcast_argument_and_return_types: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_argument_and_return_types: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @ident_i32(float 2.0) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}use_workitem_id_x: -; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 -; GCN-NEXT: s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { +; GCN-LABEL: use_workitem_id_x: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %id = call i32 @llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op } -; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x: -; GCN: v_mov_b32_e32 v31, v0 -; GCN: s_getpc_b64 -; 
GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 -; GCN: v_mov_b32_e32 v0, 9 -; GCN: s_swappc_b64 -; GCN: v_add_f32_e32 define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 { +; SDAG-LABEL: test_bitcast_use_workitem_id_x: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: v_mov_b32_e32 v31, v0 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12 +; SDAG-NEXT: v_mov_b32_e32 v0, 9 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_bitcast_use_workitem_id_x: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 9 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = call float @use_workitem_id_x(i32 9) %op = fadd float %val, 1.0 store volatile float %op, ptr addrspace(1) poison ret void } -; GCN-LABEL: {{^}}test_invoke: -; GCN: s_getpc_b64 -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4 -; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12 -; GCN: s_swappc_b64 @_ZTIi = external global ptr declare i32 @__gxx_personality_v0(...) 
define amdgpu_kernel void @test_invoke() #0 personality ptr @__gxx_personality_v0 { +; SDAG-LABEL: test_invoke: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, 2.0 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SDAG-NEXT: flat_store_dword v[0:1], v0 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_invoke: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GISEL-NEXT: flat_store_dword v[0:1], v0 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm %val = invoke float @ident_i32(float 2.0) to label %continue unwind label %broken @@ -96,14 +353,28 @@ continue: ; arguments before we lower any calls to them. 
define hidden i32 @ret_i32_noinline() #0 { +; GCN-LABEL: ret_i32_noinline: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 4 } define hidden i32 @ret_i32_alwaysinline() #1 { +; GCN-LABEL: ret_i32_alwaysinline: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 4 +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 4 } define hidden i32 @ident_i32(i32 %i) #0 { +; GCN-LABEL: ident_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret i32 %i } diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll index ffe536d347c53..4b5a49fc0c2e9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll +++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL %s ; Check that call / asm get an implicit-def $mode added to them in ; strictfp functions. @@ -7,46 +8,80 @@ declare protected void @maybe_defs_mode() #0 define float @call_changes_mode(float %x, float %y) #0 { - ; CHECK-LABEL: name: call_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] - ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + ; SDAG-LABEL: name: call_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc + ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; SDAG-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] + ; SDAG-NEXT: $sgpr30_sgpr31 = SI_CALL killed 
[[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode + ; SDAG-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; SDAG-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GISEL-LABEL: name: call_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] + ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc + ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; GISEL-NEXT: SI_RETURN implicit $vgpr0 call void @maybe_defs_mode() %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") ret float %val } define void @tail_call_changes_mode() #0 { - ; CHECK-LABEL: name: tail_call_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc - ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode + ; SDAG-LABEL: name: tail_call_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc + ; SDAG-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode + ; + ; GISEL-LABEL: name: tail_call_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]] + ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc + ; GISEL-NEXT: SI_TCRETURN [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3 tail call void @maybe_defs_mode() ret void } define float @asm_changes_mode(float %x, float %y) #0 { - ; CHECK-LABEL: name: asm_changes_mode - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, 
implicit-def $mode - ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] - ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + ; SDAG-LABEL: name: asm_changes_mode + ; SDAG: bb.0 (%ir-block.0): + ; SDAG-NEXT: liveins: $vgpr0, $vgpr1 + ; SDAG-NEXT: {{ $}} + ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SDAG-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode + ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; SDAG-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GISEL-LABEL: name: asm_changes_mode + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode + ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]] + ; GISEL-NEXT: SI_RETURN implicit $vgpr0 call void asm sideeffect "; maybe defs mode", ""() %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") ret float %val diff --git a/llvm/test/CodeGen/AMDGPU/call-encoding.ll b/llvm/test/CodeGen/AMDGPU/call-encoding.ll index 6954c340ca287..6c36c2424a66e 100644 --- a/llvm/test/CodeGen/AMDGPU/call-encoding.ll +++ b/llvm/test/CodeGen/AMDGPU/call-encoding.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s ; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=hawaii -d - | FileCheck --check-prefixes=GCN,CI %s ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 4df10497bcd27..b250227735bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -1,8 +1,13 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s -; RUN: sed 
's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s ; Make sure to run a GPU with the SGPR allocation bug. 
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index 61a195f9c314f..aed1079158154 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,GISEL %s declare hidden void @external_void_func_void() #3 @@ -223,41 +227,6 @@ define hidden void @void_func_void_clobber_vcc() #2 { } define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: s_add_u32 s8, s4, 8 -; FLATSCR-NEXT: s_addc_u32 s9, s5, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; def vcc -; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35] -; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc -; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 -; FLATSCR-NEXT: ; kill: killed 
$vgpr0_vgpr1 -; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use vcc -; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() call void @void_func_void_clobber_vcc() %val0 = load volatile i32, ptr addrspace(1) poison @@ -463,51 +432,11 @@ define hidden void @void_func_void_clobber_s34() #2 { } define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_s33: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s33() ret void } define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { -; FLATSCR-LABEL: test_call_void_func_void_clobber_s34: -; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLATSCR-NEXT: s_mov_b32 s14, s12 -; FLATSCR-NEXT: s_mov_b32 s13, s11 -; FLATSCR-NEXT: s_mov_b32 s12, s10 -; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] -; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] -; FLATSCR-NEXT: s_getpc_b64 s[16:17] -; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12 -; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] -; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s34() ret void } @@ -748,3 +677,6 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll index c0f74fd85f0e6..21c3696ae98a9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -1,7 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s + +; Ideally, we would also like to test GlobalISel with gfx11, but we are currently blocked on llvm-project#166501. declare void @external_void_func_void() #0 diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll index ea2bba1673a0b..e2ca278d687be 100644 --- a/llvm/test/CodeGen/AMDGPU/call-skip.ll +++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s ; A call should be skipped if all lanes are zero, since we don't know ; what side effects should be avoided inside the call.
@@ -6,12 +8,37 @@ define hidden void @func() #1 { ret void } -; GCN-LABEL: {{^}}if_call: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] -; GCN: s_swappc_b64 -; GCN: [[END]]: define void @if_call(i32 %flag) #0 { +; GCN-LABEL: if_call: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s20, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v1, s30, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v1, s31, 1 +; GCN-NEXT: s_and_saveexec_b64 s[16:17], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %call +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v1, 1 +; GCN-NEXT: v_readlane_b32 s30, v1, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, s20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %cc = icmp eq i32 %flag, 0 br i1 %cc, label %call, label %end @@ -23,12 +50,20 @@ end: ret void } -; GCN-LABEL: {{^}}if_asm: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] -; GCN: ; sample asm -; GCN: [[END]]: define void @if_asm(i32 %flag) #0 { +; GCN-LABEL: if_asm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %call +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; sample asm +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] %cc = icmp eq i32 %flag, 0 br i1 %cc, label %call, label %end @@ -40,11 +75,58 @@ end: ret void } -; GCN-LABEL: {{^}}if_call_kernel: -; GCN: s_and_saveexec_b64 -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN: s_swappc_b64 define amdgpu_kernel void @if_call_kernel() #0 { +; SDAG-LABEL: if_call_kernel: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_i32 s12, s12, s17 +; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SDAG-NEXT: s_add_u32 s0, s0, s17 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_2 +; SDAG-NEXT: ; %bb.1: ; %call +; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-NEXT: s_getpc_b64 s[18:19] +; SDAG-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; SDAG-NEXT: v_or_b32_e32 v31, v0, v2 +; SDAG-NEXT: s_mov_b32 s12, s14 +; SDAG-NEXT: s_mov_b32 s13, s15 +; SDAG-NEXT: s_mov_b32 s14, s16 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19] +; SDAG-NEXT: .LBB3_2: ; %end +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: if_call_kernel: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_i32 s12, s12, s17 +; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: 
s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_2 +; GISEL-NEXT: ; %bb.1: ; %call +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 +; GISEL-NEXT: s_getpc_b64 s[18:19] +; GISEL-NEXT: s_add_u32 s18, s18, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v1 +; GISEL-NEXT: s_mov_b32 s12, s14 +; GISEL-NEXT: s_mov_b32 s13, s15 +; GISEL-NEXT: s_mov_b32 s14, s16 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GISEL-NEXT: .LBB3_2: ; %end +; GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 br i1 %cc, label %call, label %end diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 675acd0eedfc5..a52942cae1699 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s ; Load argument depends on waitcnt which should be skipped. define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { @@ -27,24 +28,43 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; Memory waitcnt with no register dependence on the call define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_memory_no_dep: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v0, s[6:7] -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_memory_no_dep: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: global_store_dword v0, v0, s[6:7] +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_memory_no_dep: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 
+; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_store_dword v0, v0, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm store i32 0, ptr addrspace(1) %ptr call void @func(i32 0) ret void @@ -52,46 +72,82 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; Should not wait after the call before memory define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_no_wait_after_call: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: global_store_dword v40, v40, s[34:35] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_no_wait_after_call: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: global_store_dword v40, v40, s[34:35] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_no_wait_after_call: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: global_store_dword v0, v0, s[34:35] +; GISEL-NEXT: s_endpgm call void @func(i32 0) store i32 0, ptr addrspace(1) %ptr ret void } define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_no_wait_after_call_return_val: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: 
global_store_dword v40, v0, s[34:35] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_no_wait_after_call_return_val: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[8:9] +; SDAG-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: global_store_dword v40, v0, s[34:35] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_no_wait_after_call_return_val: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[8:9] +; GISEL-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: global_store_dword v1, v0, s[34:35] +; GISEL-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) store i32 %rv, ptr addrspace(1) %ptr ret void @@ -99,22 +155,39 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) % ; Need to wait for the address dependency define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { -; GCN-LABEL: call_got_load: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GCN-NEXT: s_endpgm +; SDAG-LABEL: call_got_load: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; SDAG-NEXT: s_add_u32 s0, s0, s11 +; SDAG-NEXT: s_addc_u32 s1, s1, 0 +; SDAG-NEXT: s_getpc_b64 s[6:7] +; SDAG-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; SDAG-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b32 s32, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: call_got_load: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s11 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[6:7] +; GISEL-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; GISEL-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: s_endpgm call void @got.func(i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de173dc8c6..8d05317162e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_add_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_sub_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll new file mode 100644 index 0000000000000..d807f321c4009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @entry_fn() { +; CHECK-LABEL: entry_fn: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8 +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + call void @entry_fn() + ret void +} + +define void @caller() { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_mov_b32 s0, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: v_writelane_b32 v40, s0, 2 +; CHECK-NEXT: s_mov_b64 s[0:1], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24 +; CHECK-NEXT: v_mov_b32_e32 v0, v31 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[10:11] +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s0, v40, 2 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s33, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @entry_fn() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 03f1018c40b21..9e1444d9213e7 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -20,13 +20,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -45,13 +45,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -178,13 +178,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY $sgpr8 %1:sgpr_32 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -228,13 +228,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0 %2.sub1_sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -278,13 +278,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0 %2.sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -302,12 +302,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -326,13 +326,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -373,12 +373,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 %1.sub1:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... 
@@ -451,15 +451,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... @@ -477,14 +477,14 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0 SI_RETURN ... @@ -503,16 +503,16 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 %1.sub1:areg_64 = COPY %0 undef %2.sub0:vreg_64 = COPY %0 %2.sub1:vreg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, killed %2 SI_RETURN ... 
@@ -533,13 +533,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -558,13 +558,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -585,7 +585,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -593,7 +593,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -614,7 +614,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -622,7 +622,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -693,13 +693,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -718,13 +718,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -743,13 +743,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -768,13 +768,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -793,13 +793,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -817,12 +817,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... 
@@ -841,13 +841,13 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0.sub0 %1.sub1:areg_96 = COPY %0.sub0 %1.sub2:areg_96 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -865,12 +865,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0.sub0 %1.sub1:areg_96_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... 
@@ -943,13 +943,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 %0.sub1:vreg_64 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0.sub0 %2.sub1:areg_64 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -968,13 +968,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64_align2 = COPY $vgpr0 %0.sub1:vreg_64_align2 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -995,7 +995,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -1003,7 +1003,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1024,7 +1024,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1:vreg_96_align2 = COPY $vgpr1 @@ -1032,7 +1032,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1101,13 +1101,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:sreg_64 = COPY $sgpr8 %0.sub1:sreg_64 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0.sub0 %2.sub1:areg_64_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1126,13 +1126,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1150,13 +1150,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub2 %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1176,13 +1176,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0.sub0 %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1201,13 +1201,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1226,13 +1226,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96_align2 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -1251,13 +1251,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1 %2.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1274,11 +1274,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %2:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -1295,11 +1295,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1316,11 +1316,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1337,11 +1337,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... 
@@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1400,11 +1400,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:sreg_64 = COPY $sgpr8_sgpr9 %2:areg_64_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -1421,11 +1421,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %2:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 126cbc643accf..856d1e66fee9d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 3d315f8a12202..4cbd41c1b1965 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -647,20 +647,20 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_zext_setcc_commute: @@ -696,20 +696,20 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: 
sub_sext_setcc_commute: diff --git a/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll new file mode 100644 index 0000000000000..244c3f7c2a96a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +define i32 @known_positive(float nofpclass(nan ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_positive_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_negative(float nofpclass(nan pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +define i32 @known_negative_maybe_nan(float nofpclass(pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index f5227eed458d6..ef676ddc8070e 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_not_b32_e32 v3, 63 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 ; GFX9-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to float %y.fptosi = fptosi float %y to i32 @@ -370,4 +368,109 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ret float %pow_sign1 } +define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 { +; GFX9-LABEL: test_pow_fast_f64integral_y: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v43, s16, 14 +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_writelane_b32 v43, s34, 2 +; GFX9-NEXT: v_writelane_b32 v43, s35, 3 +; GFX9-NEXT: v_writelane_b32 v43, s36, 4 +; GFX9-NEXT: v_writelane_b32 v43, s37, 5 +; GFX9-NEXT: v_writelane_b32 v43, s38, 6 +; GFX9-NEXT: v_writelane_b32 v43, s39, 7 +; GFX9-NEXT: v_writelane_b32 v43, s48, 8 +; GFX9-NEXT: v_writelane_b32 v43, s49, 9 +; GFX9-NEXT: v_writelane_b32 v43, s50, 10 +; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s51, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v43, s52, 12 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v43, s53, 13 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_mov_b32_e32 v41, v2 +; GFX9-NEXT: s_mov_b32 s50, s15 +; GFX9-NEXT: s_mov_b32 s51, s14 +; GFX9-NEXT: s_mov_b32 s52, s13 +; GFX9-NEXT: s_mov_b32 s53, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4exp2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4exp2d@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s53 +; GFX9-NEXT: s_mov_b32 s13, s52 +; GFX9-NEXT: s_mov_b32 s14, s51 +; GFX9-NEXT: s_mov_b32 s15, s50 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v42 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_readlane_b32 s53, v43, 13 +; GFX9-NEXT: v_readlane_b32 s52, v43, 12 +; GFX9-NEXT: v_readlane_b32 s51, v43, 11 +; GFX9-NEXT: v_readlane_b32 s50, v43, 10 +; GFX9-NEXT: v_readlane_b32 s49, v43, 9 +; GFX9-NEXT: v_readlane_b32 s48, v43, 8 +; GFX9-NEXT: v_readlane_b32 s39, v43, 7 +; GFX9-NEXT: v_readlane_b32 s38, v43, 6 +; GFX9-NEXT: v_readlane_b32 s37, v43, 5 +; GFX9-NEXT: v_readlane_b32 s36, v43, 4 +; GFX9-NEXT: v_readlane_b32 s35, v43, 3 +; GFX9-NEXT: v_readlane_b32 s34, v43, 2 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v43, 14 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs = call fast double 
@llvm.fabs.f64(double %x) + %log2 = call fast double @_Z4log2d(double %fabs) + %pownI2F = sitofp i32 %y.i to double + %ylogx = fmul fast double %log2, %pownI2F + %exp2 = call fast nofpclass(nan ninf nzero nsub nnorm) double @_Z4exp2d(double %ylogx) + %ytou = zext i32 %y.i to i64 + %yeven = shl i64 %ytou, 63 + %x.i64 = bitcast double %x to i64 + %pow_sign = and i64 %yeven, %x.i64 + %pow_sign.f64 = bitcast i64 %pow_sign to double + %pow_sign1 = call fast double @llvm.copysign.f64(double %exp2, double %pow_sign.f64) + ret double %pow_sign1 +} + +declare hidden double @_Z4exp2d(double) #1 +declare hidden double @_Z4log2d(double) #1 + attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { norecurse nounwind memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll new file mode 100644 index 0000000000000..afd610f4911c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +; Negative test, don't know %x is positive +define half @copysign_known_signmask_f16(half %x, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %x, half %signmask.bitcast) + ret half %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %x, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define double @copysign_known_signmask_f64(double %x, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %x, double %signmask.bitcast) + ret double %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result 
= call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero(float nofpclass(nan ninf nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %sign.bit.known.zero, half %signmask.bitcast) + ret half %result +} + +define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %sign.bit.known.zero, double %signmask.bitcast) + ret double %result +} + +; exp always returns a positive result, excluding the unknown nan sign +; bit. 
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2aeac50 +; GFX9-NEXT: v_add_f32_e32 v2, 0x42800000, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag_through_fence(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag_through_fence: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: ;ARITH_FENCE +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %fence = call float @llvm.arithmetic.fence.f32(float %sign.bit.known.zero) + %result = call float @llvm.copysign.f32(float 
%fence, float %signmask.bitcast) + ret float %result +} diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 613fdf388c0f1..0f45e99dd76c4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -64,13 +64,11 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) { ; CHECK-LABEL: mullohi_2xu32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v1, 0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: v_mov_b32_e32 v7, v3 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..e841ec43fd064 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -193,14 +193,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_3: @@ -238,14 +237,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 3 @@ -265,14 +263,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; 
GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 6, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 6, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_6: @@ -310,14 +307,13 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 6, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 -; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3] -; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3] +; GFX1030-NEXT: v_alignbit_b32 v2, v4, v3, 2 +; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 6, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 6, v[3:4] +; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: %rem = urem i64 %i, 6 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..ddac86b3719c2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1953,68 +1953,66 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0 -; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2 -; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31 -; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0 -; SDAG-NEXT: v_mov_b32_e32 v15, 0 -; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7 -; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0 -; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6 +; SDAG-NEXT: v_mul_lo_u32 v15, v27, v2 +; SDAG-NEXT: v_mul_lo_u32 v23, v34, v31 +; SDAG-NEXT: v_mul_lo_u32 v24, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mul_lo_u32 v25, v12, v7 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v13, v6 ; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 -; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v18, v36 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; SDAG-NEXT: v_mov_b32_e32 v14, v3 -; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] -; 
SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24 -; SDAG-NEXT: v_mov_b32_e32 v14, v22 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15] -; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22] +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v25 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; SDAG-NEXT: v_mov_b32_e32 v21, v6 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v34 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3 -; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] -; SDAG-NEXT: v_mov_b32_e32 v14, v7 -; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23] -; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v17, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0 +; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v23, v11 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v17, v14, v29 ; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 -; SDAG-NEXT: v_mov_b32_e32 v14, v16 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7 -; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc -; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v24, v15 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v38, v3 +; SDAG-NEXT: v_mov_b32_e32 v21, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22] +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v12, v15 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; 
SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, v0, v28 +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7] +; SDAG-NEXT: v_xor_b32_e32 v11, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v16, v28 +; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v17, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v10, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v11, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v9, v35 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v4, v26 ; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v35, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v35, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2407,51 +2405,49 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mul_lo_u32 v26, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v27, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v36, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v37, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25] +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, 
v27, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v18, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v22 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v22, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v22, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v19, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -2814,52 +2810,50 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 -; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 -; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v16, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v17, v14 -; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; SDAG-NEXT: v_mov_b32_e32 v20, v11 -; SDAG-NEXT: 
v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v34 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v19, v28 -; SDAG-NEXT: v_mov_b32_e32 v20, v26 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v25, v35 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v26, s[4:5], v27, v11 -; SDAG-NEXT: v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v29, v19 -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21] -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v33, v19 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v36, v22 -; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v16, v12 -; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v18 -; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14 +; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13 +; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29 +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33 +; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25] +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11] +; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11 +; SDAG-NEXT: v_mov_b32_e32 v19, v14 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16] -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; 
SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc @@ -3216,36 +3210,36 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[30:31], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[28:29] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[30:31] +; GISEL-NEXT: v_mad_u64_u32 v[26:27], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[26:27] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v19, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v23, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: 
v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index ab96dcf1f6069..8532a7f716ba7 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -390,16 +390,15 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_or_b32 s4, s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_vec_i16_LH: @@ -488,13 +487,13 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index 4c3fd40d7a25a..d8f9bc1a0e054 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i8 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_add_i32 s2, s4, s5 +; GCN-NEXT: s_sext_i32_i8 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 @@ -26,15 +25,14 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; 
GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_add_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_add_i32 s4, s4, s5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload @@ -49,15 +47,14 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_add_i32 s2, s4, s5 +; GCN-NEXT: s_sext_i32_i16 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 @@ -70,15 +67,14 @@ define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %ou ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_add_i32 s0, s2, s3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_add_i32 s4, s4, s5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; 
CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll new file mode 100644 index 0000000000000..0873003899e8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s + +@lds = addrspace(3) global [512 x float] poison, align 4 + +define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { +; CHECK-LABEL: simple_write2_one_val_f32: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: .file 1 "/" "<stdin>" +; CHECK-NEXT: .loc 1 1 1 prologue_end ; <stdin>:1:1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:1 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:5 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:3 <- undef +; CHECK-NEXT: .loc 1 2 1 ; <stdin>:2:1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:4 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:2 <- undef +; CHECK-NEXT: .loc 1 3 1 ; <stdin>:3:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v0, s[0:1] +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: ;DEBUG_VALUE: simple_write2_one_val_f32:6 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr0 +; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b32 v0, v1, v1 offset1:8 +; CHECK-NEXT: .loc 1 9 1 is_stmt 1 ; <stdin>:9:1 +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .Ltmp3: + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i + %val = load float, ptr addrspace(1) %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + store float %val, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + store float 
%val, ptr addrspace(3) %arrayidx1, align 4 + ret void +} + +define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { +; CHECK-LABEL: simple_read2_f32: +; CHECK: .Lfunc_begin1: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: .loc 1 11 1 prologue_end ; <stdin>:11:1 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:8 <- $vgpr2 +; CHECK-NEXT: .loc 1 0 0 is_stmt 0 ; <stdin>:0 +; CHECK-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:9 <- undef +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:11 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr2 +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:12 <- undef +; CHECK-NEXT: .loc 1 10 1 is_stmt 1 ; <stdin>:10:1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:7 <- undef +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:10 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] undef +; CHECK-NEXT: .loc 1 16 1 ; <stdin>:16:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:13 <- $vgpr0 +; CHECK-NEXT: ;DEBUG_VALUE: simple_read2_f32:14 <- undef +; CHECK-NEXT: .loc 1 18 1 ; <stdin>:18:1 +; CHECK-NEXT: global_store_dword v2, v0, s[0:1] +; CHECK-NEXT: .loc 1 19 1 ; <stdin>:19:1 +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .Ltmp8: + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + %val0 = load float, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + %val1 = load float, ptr addrspace(3) %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i + store float %sum, ptr addrspace(1) %out.gep, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 0cae0e51107df..5cc68451d5ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 4dfdb56a69ff3..98472552d2bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -370,7 +370,7 @@ body: | ; HAZARD-LABEL: name: inline_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* 
attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -378,10 +378,10 @@ body: | ; NOHAZARD-LABEL: name: inline_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... @@ -397,17 +397,17 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
@@ -421,19 +421,19 @@ body: | ; HAZARD-LABEL: name: inline_inline_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: inline_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index c5db7a33f70e0..d19a260db3550 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10 @@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; 
GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -367,26 +361,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: ; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 -; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 @@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; 
GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff -; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 @@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 @@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1 ; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 ; GFX11-SDAG-NEXT: s_endpgm ; GFX11-SDAG-NEXT: .LBB7_6: @@ -1171,35 +1156,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1240,34 +1225,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; 
GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1850,20 +1836,20 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 ; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 @@ -1894,7 +1880,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 @@ -1912,7 +1897,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 @@ -2016,27 +2002,26 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 
%m) { ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX11-SDAG-NEXT: ; %bb.3: ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 @@ -2059,31 +2044,30 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 ; GFX11-SDAG-NEXT: ; %bb.8: ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX11-SDAG-NEXT: 
v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: @@ -2189,9 +2173,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s13, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 @@ -2201,24 +2185,24 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 ; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow @@ -2248,8 +2232,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s13 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s12 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: @@ -2321,9 +2305,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s7, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: 
s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 @@ -2333,28 +2317,28 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow ; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 @@ -2383,8 +2367,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s7 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e3c533d..71af21a11c2ce 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s7, s6, s6 -; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -88,15 +86,13 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: 
s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s2, s2 -; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_addc_u32 s0, s2, 0 +; GFX7-NEXT: s_add_u32 s1, s0, s0 +; GFX7-NEXT: s_addc_u32 s0, s0, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 6bcb086944c91..97e23fcdb2263 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -99,16 +99,15 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff -; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabs_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 5fb50d0d89530..da08f4fcf8f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 30bcdf97e26fd..4ff8bf23638f1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -5023,20 +5023,20 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s3, s3, s6 -; GFX8-NEXT: s_bfe_u32 s0, s1, 
0x10010 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s4, s5 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s3, s6 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s4, s4, s1 +; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 +; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s1, s3, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5185,29 +5185,29 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; GFX8-NEXT: s_addk_i32 s8, 0x7fff ; GFX8-NEXT: s_bitset1_b32 s5, 22 ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8-NEXT: s_bitcmp1_b32 s8, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GFX8-NEXT: s_cselect_b32 s6, s5, s8 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: s_bitcmp1_b32 s5, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_and_b64 s[8:9], s[12:13], exec ; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, -1 -; GFX8-NEXT: s_add_i32 s6, s8, s6 +; GFX8-NEXT: s_cselect_b32 s7, 1, -1 +; GFX8-NEXT: s_add_i32 s7, s5, s7 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s8, s6 +; GFX8-NEXT: s_cselect_b32 s0, s5, s7 ; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX8-NEXT: s_add_i32 s1, s1, s0 -; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff +; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff ; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 ; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX8-NEXT: s_cselect_b32 s0, s7, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s7, s5 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5421,19 +5421,19 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; GFX8-NEXT: s_addk_i32 s3, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 -; GFX8-NEXT: s_add_i32 s3, s3, s2 -; GFX8-NEXT: s_addk_i32 s3, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 
+; GFX8-NEXT: s_cselect_b32 s4, s1, s3 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10010 +; GFX8-NEXT: s_add_i32 s1, s1, s2 +; GFX8-NEXT: s_addk_i32 s1, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[6:7], s2, s2 +; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX8-NEXT: s_cselect_b32 s1, s2, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 ; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index b826e6c469d8e..4d448e64f0921 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -333,18 +333,17 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_cmp_gt_f32_e64 vcc, |s5|, v0 ; GFX67-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX67-NEXT: v_rcp_f32_e32 v1, v1 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX67-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_25ulp_f32: @@ -441,20 +440,19 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, s5 ; GFX7-NEXT: v_rcp_f32_e32 v0, v0 -; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 -; GFX7-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, s5 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, s4 ; GFX7-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 -; GFX7-NEXT: s_mov_b32 s4, s0 -; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: @@ -528,14 +526,13 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: @@ -590,14 +587,13 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_f32_fast_math: @@ -652,14 +648,13 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: @@ -877,14 +872,13 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry ; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 -; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX67-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX67-NEXT: v_rcp_f32_e32 v0, s5 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX67-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..590d69b8eb869 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,6 +1,19 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s +--- | + + @foo = addrspace(3) global i32 poison + + define void @test_overlap() { unreachable } + define void @test_dead_redef() { unreachable } + define void @test_tied() { unreachable } + define void @test_mmo_merge1() { unreachable } + define void @test_mmo_merge2() { unreachable } + define void @test_mmo_drop() { 
unreachable } + +... + --- name: test_overlap body: | @@ -34,3 +47,55 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... + +--- +name: test_mmo_merge1 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge1 + ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) +... + +--- +name: test_mmo_merge2 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge2 + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) +... + +--- +name: test_mmo_drop +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_drop + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index b8f084d5f82ad..db32135939a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,14 +4,24 @@ define amdgpu_gs i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_bitcmp1_b32 0, 0
 ; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_or_saveexec_b32 s2, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s1, v0
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_wait_alu 0xfffe
 ; CHECK-NEXT: s_bitcmp1_b32 s0, 0
 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_wait_alu 0xfffe
 ; CHECK-NEXT: s_xor_b32 s0, s0, -1
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v1
 ; CHECK-NEXT: s_wait_alu 0xf1ff
 ; CHECK-NEXT: ; return to shader part epilog
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index eefc7811d42b6..357234080235a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -263,7 +263,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
index 32888d2acf1cd..3d0e2875e91a2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
@@ -54,7 +54,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
-@ptr.in.lds = internal addrspace(3) global ptr undef
+@ptr.in.lds = internal addrspace(3) global ptr poison
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index b0e6752386285..e01cb79382c05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0
 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..870b679a84d11 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -857,13 +857,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s32
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -915,13 +915,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s32
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -929,8 +929,8 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, s32
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s32
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -2146,16 +2146,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2214,16 +2214,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -2231,11 +2231,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_small_offset_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x100
 ; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x100
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3447,16 +3447,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-NEXT: s_mov_b32 s0, s1
+; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT: scratch_store_dword v2, v3, off
+; GFX9-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3516,16 +3516,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9-PAL: ; %bb.0: ; %bb
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_mov_b32 s0, s1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
@@ -3533,11 +3533,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX942-LABEL: store_load_vindex_large_offset_foo:
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s1, s32, 0x4004
 ; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX942-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1
+; GFX942-NEXT: s_mov_b32 s0, s1
+; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0
 ; GFX942-NEXT: v_mov_b32_e32 v2, 15
 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0
 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1
@@ -3940,12 +3940,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s1, 0
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4001,15 +4001,15 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13]
 ; GFX9-PAL-NEXT: s_mov_b32 s12, s0
 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, 0
 ; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
@@ -4020,11 +4020,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX942: ; %bb.0: ; %bb
 ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24
 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 15
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 15
+; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, s1
 ; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 60ac0b943faf4..29163c111fc5e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8894,6 +8894,501 @@ define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) {
   ret double %med
 }
 
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum(float %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0
+; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan float %a, 1.0
+  %max = call float @llvm.maximum.f32(float %a.add, float 2.0)
+  %med = call float @llvm.minimum.f32(float %max, float 4.0)
+  ret float %med
+}
+
+define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half> %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v4, 2.0, v1
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v3, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v4, 4.0, v1
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-SDAG-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-SDAG-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; VI-SDAG-NEXT: v_max_f16_e32 v1, 2.0, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-SDAG-NEXT: v_min_f16_e32 v1, 4.0, v1
+; VI-SDAG-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-SDAG-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f16_e32 v1, 1.0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v1
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x4000
+; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v1, 16, v3
+; GFX9-GISEL-NEXT: v_pk_min_f16 v3, v3, 4.0 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[4:5]
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v2
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 2.0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_min_f16 v1, v0, 4.0 op_sel_hi:[1,0]
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 4.0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan <2 x half> %a, splat (half 1.0)
+  %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.add, <2 x half> splat (half 2.0))
+  %med = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max, <2 x half> splat (half 4.0))
+  ret <2 x half> %med
+}
+
+define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0
+; VI-SDAG-NEXT: v_min_f16_e32 v0, 4.0, v0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0
+; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX9-SDAG-NEXT: v_med3_f16 v0, v0, 2.0, 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX9-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 2.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_med3_f16 v0, v0, 2.0, 4.0
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_max_f16_e32 v1, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_max_f16_e32 v0.h, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.h, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan half %a, 1.0
+  %max = call half @llvm.maximum.f16(half %a.add, half 2.0)
+  %med = call half @llvm.minimum.f16(half %max, half 4.0)
+  ret half %med
+}
+
+define double @v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum(double %a) {
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; SI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; SI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; SI-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; VI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; VI-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; VI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
+; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0
+; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 2.0, v[0:1]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 4.0, v[0:1]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo
+; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+  %a.add = fadd nnan double %a, 1.0
+  %max = call double @llvm.maximum.f64(double %a.add, double 2.0)
+  %med = call double @llvm.minimum.f64(double %max, double 4.0)
+  ret double %med
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.minnum.f32(float, float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index defcffa641e64..39eefa1879870 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -75,9 +75,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
 ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
-; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
-; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
+; SI: s_mov_b64 s[[[#COPY:]]:{{[0-9]+}}], s{{\[}}[[#LOAD + 2]]:[[#LOAD + 3]]{{\]}}
+; SI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#COPY]], 1.0
+; SI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#COPY + 1]], 2.0
+; VI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
+; VI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
 
 ; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
 ; VI: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
@@ -96,8 +99,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1)
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src_fast:
 ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
-; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
-; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
+; SI: s_mov_b64 s[[[#COPY:]]:{{[0-9]+}}], s{{\[}}[[#LOAD + 2]]:[[#LOAD + 3]]{{\]}}
+; SI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#COPY]], 1.0
+; SI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#COPY + 1]], 2.0
+
+; VI-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0
+; VI-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0
 
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src_fast(ptr addrspace(1) %out, float %a, float %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index a025c36f620c7..6c2ab5fb15a20 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -121,14 +121,13 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; SICI-LABEL: fnearbyint_v2f32:
 ; SICI: ; %bb.0: ; %entry
 ; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SICI-NEXT: s_mov_b32 s7, 0xf000
-; SICI-NEXT: s_mov_b32 s6, -1
 ; SICI-NEXT: s_waitcnt lgkmcnt(0)
-; SICI-NEXT: s_mov_b32 s4, s0
-; SICI-NEXT: s_mov_b32 s5, s1
-; SICI-NEXT: v_rndne_f32_e32 v1, s3
-; SICI-NEXT: v_rndne_f32_e32 v0, s2
-; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SICI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SICI-NEXT: s_mov_b32 s3, 0xf000
+; SICI-NEXT: s_mov_b32 s2, -1
+; SICI-NEXT: v_rndne_f32_e32 v1, s5
+; SICI-NEXT: v_rndne_f32_e32 v0, s4
+; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SICI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 10c60dfc9b34c..5424ebfcffcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -409,7 +409,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
 ; CI-NEXT: v_add_f32_e64 v1, s2, 2.0
 ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0
 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -441,7 +441,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -709,16 +709,16 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0
-; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_and_b32 s3, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; CI-NEXT: s_lshl_b32 s2, s3, 16
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: flat_store_dword v[1:2], v0
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fold_user_fneg_fabs_v2bf16:
@@ -749,10 +749,10 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: flat_store_dword v[1:2], v0
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: fold_user_fneg_fabs_v2bf16:
@@ -956,17 +956,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff
+; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0
 ; CI-NEXT: s_lshl_b32 s1, s1, 16
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
 ; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0
-; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000
-; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; CI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
 ; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v2, s2
 ; CI-NEXT: v_mov_b32_e32 v3, s3
 ; CI-NEXT: flat_store_dword v[0:1], v5
 ; CI-NEXT: flat_store_dword v[2:3], v4
@@ -1000,10 +1000,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
 ; VI-NEXT: v_mov_b32_e32 v5, s0
 ; VI-NEXT: v_mov_b32_e32 v2, s2
 ; VI-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 1fa9bfa3cfa3f..214ccedd75170 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -199,16 +199,15 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; SI-LABEL: fneg_fabsf_v2f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset1_b32 s3, 31
-; SI-NEXT: s_bitset1_b32 s2, 31
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_bitset1_b32 s4, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fneg_fabsf_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index 84b904ff67151..63aadaacbeb3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -627,18 +627,18 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: v_mov_b32_e32 v0, s2
 ; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_load_dword v2, v[0:1]
+; CI-NEXT: flat_load_dword v1, v[0:1]
 ; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_mul_f32_e64 v2, -v2, v2
-; CI-NEXT: v_mul_f32_e32 v3, v3, v4
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_mul_f32_e64 v4, -v1, v1
+; CI-NEXT: v_mul_f32_e32 v1, v2, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; CI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16
+; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: flat_store_dword v[0:1], v2
 ; CI-NEXT: s_endpgm
 ;
@@ -648,34 +648,34 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
 ; GFX8-NEXT: s_add_i32 s12, s12, s17
 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
-; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_xor_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_xor_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, v1, v5
+; GFX8-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_store_dword v[0:1], v2
 ; GFX8-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index c3f4ebe30152b..02235151a83e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -52,16 +52,15 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ; SI-LABEL: s_fneg_v2f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: s_fneg_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 7ab8b30681eb1..0c5ed00b58d90 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -88,27 +88,24 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
 ; SI-LABEL: fp_to_sint_v2i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_i32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_i32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_i32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_sint_v2i32:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_cvt_i32_f32_e32 v1, s3
 ; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i32:
@@ -294,26 +291,25 @@ entry:
 define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) {
 ; SI-LABEL: fp_to_sint_v2i64:
 ; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s6, 0x2f800000
+; SI-NEXT: s_mov_b32 s7, 0xcf800000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s8, 0x2f800000
-; SI-NEXT: s_mov_b32 s9, 0xcf800000
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: v_trunc_f32_e32 v0, s7
-; SI-NEXT: v_trunc_f32_e32 v1, s6
-; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v1, s4
+; SI-NEXT: v_mul_f32_e64 v2, |v0|, s6
 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8
+; SI-NEXT: v_mul_f32_e64 v4, |v1|, s6
 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; SI-NEXT: v_floor_f32_e32 v2, v2
 ; SI-NEXT: v_floor_f32_e32 v4, v4
 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v2
-; SI-NEXT: v_fma_f32 v0, v2, s9, |v0|
+; SI-NEXT: v_fma_f32 v0, v2, s7, |v0|
 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v4
-; SI-NEXT: v_fma_f32 v1, v4, s9, |v1|
+; SI-NEXT: v_fma_f32 v1, v4, s7, |v1|
 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
 ; SI-NEXT: v_xor_b32_e32 v4, v6, v3
 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -330,36 +326,35 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
 ; VI-LABEL: fp_to_sint_v2i64:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s6, 0x2f800000
+; VI-NEXT: s_mov_b32 s7, 0xcf800000
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
-; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; VI-NEXT: v_trunc_f32_e32 v0, s5
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6
 ; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: s_mov_b32 s0, 0xcf800000
-; VI-NEXT: v_fma_f32 v2, v1, s0, |v0|
-; VI-NEXT: v_trunc_f32_e32 v4, s2
-; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
-; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_floor_f32_e32 v3, v3
-; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s0, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v2, v1
+; VI-NEXT: v_fma_f32 v1, v1, s7, |v0|
 ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
-; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
-; VI-NEXT: v_xor_b32_e32 v2, v2, v0
+; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_xor_b32_e32 v3, v2, v0
+; VI-NEXT: v_mul_f32_e64 v2, |v4|, s6
+; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
+; VI-NEXT: v_floor_f32_e32 v2, v2
+; VI-NEXT: v_cvt_u32_f32_e32 v5, v2
+; VI-NEXT: v_fma_f32 v2, v2, s7, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v6, v2
 ; VI-NEXT: v_xor_b32_e32 v1, v1, v0
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v1, v0
 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
+; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
 ; VI-NEXT: v_xor_b32_e32 v0, v6, v1
 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1
 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_sint_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5428ba88975bc..c938475ab7675 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -48,27 +48,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
 ; SI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_u32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
 ; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
@@ -241,32 +238,29 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ; SI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s8, 0xcf800000
+; SI-NEXT: s_mov_b32 s6, 0xcf800000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_trunc_f32_e32 v0, s3
-; SI-NEXT: v_trunc_f32_e32 v2, s2
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v2, s4
 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
 ; SI-NEXT: v_floor_f32_e32 v4, v1
 ; SI-NEXT: v_floor_f32_e32 v5, v3
 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
-; SI-NEXT: v_fma_f32 v0, v4, s8, v0
-; SI-NEXT: v_fma_f32 v4, v5, s8, v2
+; SI-NEXT: v_fma_f32 v0, v4, s6, v0
+; SI-NEXT: v_fma_f32 v4, v5, s6, v2
 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_trunc_f32_e32 v0, s3
 ; VI-NEXT: v_trunc_f32_e32 v4, s2
@@ -281,9 +275,9 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index e0421575c3174..0c4a15f6a9d5e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -61,34 +61,32 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v4, v1
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v4, v6
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
-; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2]
+; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
-; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10
 ; SDAG-NEXT: ; implicit-def: $vgpr10
 ; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3
-; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT: ; implicit-def: $vgpr9
 ; SDAG-NEXT: .LBB0_4: ; %Flow
 ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
 ; SDAG-NEXT: s_cbranch_execz .LBB0_6
@@ -119,10 +117,11 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SDAG-NEXT: v_mov_b32_e32 v0, v2
-; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v3, v2
+; SDAG-NEXT: v_mov_b32_e32 v0, v1
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
 ; SDAG-NEXT: ; %bb.9: ; %Flow3
 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
 ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup
@@ -234,37 +233,36 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
 ; GISEL-NEXT: s_cbranch_execz .LBB0_4
 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
-; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
-; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
-; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
-; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
+; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6
+; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2
+; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7]
 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB0_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 @@ -275,17 +273,17 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -428,34 +426,32 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -486,10 +482,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup @@ -601,37 +598,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, 
v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB1_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 @@ -642,17 +638,17 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ 
-796,30 +792,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow @@ -834,9 +830,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -845,10 +841,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, 
v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup @@ -959,37 +956,36 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -999,12 +995,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; 
GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1148,30 +1146,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow @@ -1186,9 +1184,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: 
.LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1197,10 +1195,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup @@ -1311,37 +1310,36 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: 
$vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1351,12 +1349,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1539,28 +1539,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: 
$vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow @@ -1584,10 +1584,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup @@ -1699,34 +1700,33 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: 
v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -1887,28 +1887,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow @@ -1932,10 +1932,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup @@ -2047,34 +2048,33 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ed1ee4527ed89..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,12 +18,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s1, s0, 1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_not_b32 s0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_lshr_b32 s3, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, 
s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -32,14 +35,17 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_lshr_b32 s3, s0, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +55,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,13 +86,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_lshr_b32 s3, s0, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: @@ -91,14 +105,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_lshr_b32 s3, s0, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -113,10 +131,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -124,10 +144,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,8 +158,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -158,16 +182,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -185,41 +215,51 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; 
SI-NEXT: s_lshr_b32 s12, s1, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; SI-NEXT: s_not_b32 s1, s5 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b32 s5, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; SI-NEXT: s_not_b32 s2, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_lshr_b32 s10, s1, 1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 +; VI-NEXT: s_not_b32 s1, s5 +; VI-NEXT: s_mov_b32 s9, s10 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b32 s5, s0, 1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -230,18 +270,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_lshr_b32 s10, s1, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s5, s10 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s2, s8 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -271,14 +316,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 1 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: s_lshr_b32 s11, s0, 1 +; GFX10-NEXT: s_not_b32 s6, s6 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s4, s7, 31 +; GFX10-NEXT: s_and_b32 s5, s6, 31 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s1, s10 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -288,16 +342,25 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshr_b32 s11, s0, 1 +; GFX11-NEXT: s_not_b32 s6, s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s1, s10 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -314,10 +377,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -326,11 +392,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: 
s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -341,10 +410,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -369,8 +441,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -379,10 +456,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -395,104 +477,134 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s5, s19 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s11, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_not_b32 s5, s18 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: s_not_b32 s5, s17 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s9, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: 
v_alignbit_b32 v1, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_not_b32 s5, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s8, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_lshr_b32 s18, s11, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 1 +; SI-NEXT: s_not_b32 s7, s7 +; SI-NEXT: s_mov_b32 s17, s18 +; SI-NEXT: s_and_b32 s7, s7, 31 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s7 +; SI-NEXT: s_lshr_b32 s7, s10, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; SI-NEXT: s_not_b32 s6, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_lshr_b32 s7, s9, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 +; SI-NEXT: s_not_b32 s5, s5 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s5, s5, 31 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_lshr_b32 s5, s8, 1 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_mov_b32 s4, s15 +; VI-NEXT: s_mov_b32 s5, s11 +; VI-NEXT: s_lshr_b32 s16, s11, 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s6, s11, 1 -; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_mov_b32 s5, s16 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; VI-NEXT: s_lshr_b32 s3, s10, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_lshr_b32 s3, s9, 1 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s9, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b32 s1, s8, 1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s0, s0, 31 +; 
VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_lshr_b32 s16, s11, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: s_lshr_b32 s4, s11, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_mov_b32 s5, s16 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; GFX9-NEXT: s_lshr_b32 s3, s10, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_lshr_b32 s3, s9, 1 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; GFX9-NEXT: s_lshr_b32 s1, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -530,22 +642,40 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 -; GFX10-NEXT: s_not_b32 s3, s3 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_lshr_b32 s16, s11, 1 +; GFX10-NEXT: s_not_b32 s11, s3 +; GFX10-NEXT: s_lshr_b32 s17, s10, 1 +; GFX10-NEXT: s_not_b32 s10, s2 +; GFX10-NEXT: s_lshr_b32 s18, s9, 1 
+; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_lshr_b32 s19, s8, 1 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_and_b32 s11, s11, 31 +; GFX10-NEXT: s_and_b32 s10, s10, 31 +; GFX10-NEXT: s_mov_b32 s5, s16 +; GFX10-NEXT: s_mov_b32 s9, s17 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s8, s8, 1 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_mov_b32 s11, s19 +; GFX10-NEXT: s_and_b32 s0, s0, 31 +; GFX10-NEXT: s_and_b32 s5, s1, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -555,24 +685,41 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX11-NEXT: s_lshr_b32 s6, s11, 1 -; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s7, s10, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_lshr_b32 s16, s11, 1 +; GFX11-NEXT: s_not_b32 s11, s3 +; GFX11-NEXT: s_lshr_b32 s17, s10, 1 +; GFX11-NEXT: s_not_b32 s10, s2 +; GFX11-NEXT: s_lshr_b32 s18, s9, 1 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_lshr_b32 s19, s8, 1 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_and_b32 s11, s11, 31 +; GFX11-NEXT: s_and_b32 s10, s10, 31 +; GFX11-NEXT: s_mov_b32 s7, s16 +; GFX11-NEXT: s_mov_b32 s9, s17 ; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s8, s8, 1 ; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_mov_b32 s11, s19 +; GFX11-NEXT: s_and_b32 s0, s0, 31 +; GFX11-NEXT: s_and_b32 s7, s1, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; 
GFX11-NEXT: s_endpgm entry: @@ -589,14 +736,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 25 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -605,15 +758,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -624,14 +783,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -660,10 +825,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -672,12 +847,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -691,17 +875,16 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s2, 7 -; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_cselect_b32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_lshl_b32 s6, s4, 7 +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: orxor2or1: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index ef68f44bac203..7afb2cf317869 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -30,9 +30,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: 
v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -41,11 +43,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -55,9 +59,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,62 +83,45 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x1 -; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX12-TRUE16-NEXT: global_store_b32 
v1, v0, s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s1 +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, ptr addrspace(1) %in @@ -146,10 +135,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -157,10 +148,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -169,8 +162,10 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -191,25 +186,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s5, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -218,22 +222,125 @@ entry: ret void } +define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { +; SI-LABEL: fshr_i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s9, 7 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s5, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s5, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s5, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s5, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) + store i32 %0, ptr addrspace(1) %in + ret void +} + define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_and_b32 s1, s5, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_and_b32 s0, s4, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: @@ -242,13 +349,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_and_b32 s0, s6, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -260,12 +370,15 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) 
%in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_and_b32 s0, s6, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -286,79 +399,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_and_b32 s0, s6, 31 +; GFX10-NEXT: s_and_b32 s6, s7, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v2i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v2i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v2i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; 
GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v2i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 31 +; GFX11-NEXT: s_and_b32 s6, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_and_b32 s0, s6, 31 +; GFX12-NEXT: s_and_b32 s6, s7, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, ptr addrspace(1) %in @@ -373,10 +469,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -385,11 +484,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr 
addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -400,10 +502,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -428,8 +533,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -438,10 +548,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; @@ -450,10 +565,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], 
s[4:5] ; GFX12-NEXT: s_endpgm entry: @@ -462,28 +582,173 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; SI-LABEL: fshr_v4i32: +define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { +; SI-LABEL: fshr_v2i32_imm_src1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s8, 9 +; SI-NEXT: s_mov_b32 s10, 7 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s1, s3, 31 +; SI-NEXT: s_mov_b32 s11, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v2i32_imm_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s6, 9 +; VI-NEXT: s_mov_b32 s8, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_and_b32 s1, s3, 31 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v2i32_imm_src1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s4, 9 +; GFX9-NEXT: s_mov_b32 s8, 7 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v2i32_imm_src1: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32_imm_src1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s4, 9 +; GFX10-NEXT: s_mov_b32 s8, 7 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s9, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_and_b32 s2, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v2i32_imm_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, 9 +; GFX11-NEXT: s_mov_b32 s8, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_and_b32 s2, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32_imm_src1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, 9 +; GFX12-NEXT: s_mov_b32 s8, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_mov_b32 s9, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_and_b32 s2, s3, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm +entry: + %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y) + store <2 x i32> %0, ptr addrspace(1) %in + ret void +} + +define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; SI-LABEL: fshr_v4i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_and_b32 s6, s19, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_and_b32 s5, s18, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], s5 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_and_b32 s5, s17, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_and_b32 s5, s16, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: @@ -492,19 +757,25 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; VI-NEXT: s_mov_b32 s6, s15 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 +; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -516,18 +787,24 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -552,101 +829,87 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_and_b32 s11, s3, 31 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_and_b32 s10, s2, 31 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_and_b32 s16, s1, 31 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_and_b32 s8, s0, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s11 +; 
GFX10-NEXT: s_lshr_b64 s[4:5], s[14:15], s10 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v4i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v4i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v4i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: 
v_alignbit_b32 v1, s9, s13, v1.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v4i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_and_b32 s11, s3, 31 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_and_b32 s10, s2, 31 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_and_b32 s16, s1, 31 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_and_b32 s8, s0, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s7, s11 +; GFX12-NEXT: s_and_b32 s11, s3, 31 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_and_b32 s10, s2, 31 +; GFX12-NEXT: s_mov_b32 s2, s13 +; GFX12-NEXT: s_mov_b32 s3, s9 +; GFX12-NEXT: s_and_b32 s16, s1, 31 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_and_b32 s8, s0, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, ptr addrspace(1) %in @@ -661,14 +924,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 7 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -677,15 +946,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -696,14 +971,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -730,10 +1011,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX10-NEXT: 
v_alignbit_b32 v1, s9, s13, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -742,12 +1033,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -756,12 +1056,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX12-NEXT: s_mov_b32 s2, s15 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_mov_b32 s4, s13 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -770,6 +1079,194 @@ entry: ret void } +define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { +; SI-LABEL: fshr_v4i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 33 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, s11 +; SI-NEXT: s_and_b32 s4, s15, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; SI-NEXT: s_mov_b32 s11, 9 +; SI-NEXT: s_and_b32 s5, s14, 31 +; SI-NEXT: 
s_lshr_b64 s[6:7], s[10:11], s5 +; SI-NEXT: s_mov_b32 s11, 7 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s5, s13, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s9, 1 +; SI-NEXT: s_and_b32 s5, s12, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v4i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s1, 33 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s11 +; VI-NEXT: s_and_b32 s4, s15, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; VI-NEXT: s_mov_b32 s11, 9 +; VI-NEXT: s_and_b32 s1, s14, 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; VI-NEXT: s_mov_b32 s6, s9 +; VI-NEXT: s_and_b32 s1, s13, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_mov_b32 s9, 1 +; VI-NEXT: s_and_b32 s1, s12, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v4i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s1, 33 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s11 +; GFX9-NEXT: s_and_b32 s4, s15, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_mov_b32 s11, 9 +; GFX9-NEXT: s_and_b32 s1, s14, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; GFX9-NEXT: s_mov_b32 s6, s9 +; GFX9-NEXT: s_and_b32 s1, s13, 31 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; GFX9-NEXT: s_mov_b32 s9, 1 +; GFX9-NEXT: s_and_b32 s1, s12, 31 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v4i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, +; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 33 +; GFX10-NEXT: s_mov_b32 s3, 7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s11 +; GFX10-NEXT: s_and_b32 s4, s15, 31 +; GFX10-NEXT: 
s_mov_b32 s11, 9 +; GFX10-NEXT: s_and_b32 s5, s14, 31 +; GFX10-NEXT: s_mov_b32 s2, s9 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_mov_b32 s9, 1 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v4i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s1, 33 +; GFX11-NEXT: s_mov_b32 s3, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s11 +; GFX11-NEXT: s_and_b32 s6, s15, 31 +; GFX11-NEXT: s_mov_b32 s11, 9 +; GFX11-NEXT: s_and_b32 s7, s14, 31 +; GFX11-NEXT: s_mov_b32 s2, s9 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_mov_b32 s9, 1 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s1, 33 +; GFX12-NEXT: s_mov_b32 s3, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, s11 +; GFX12-NEXT: s_and_b32 s6, s15, 31 +; GFX12-NEXT: s_mov_b32 s11, 9 +; GFX12-NEXT: s_and_b32 s7, s14, 31 +; GFX12-NEXT: s_mov_b32 s2, s9 +; GFX12-NEXT: s_and_b32 s13, s13, 31 +; GFX12-NEXT: s_mov_b32 s9, 1 +; GFX12-NEXT: s_and_b32 s12, s12, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm +entry: + %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 7, i32 9, i32 33>, <4 x i32> %x, <4 x i32> %y) + store <4 x i32> %0, ptr addrspace(1) %in + ret void +} + define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { ; GFX89-LABEL: v_fshr_i32: ; GFX89: ; %bb.0: @@ -2091,29 +2588,109 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fshr_v2i24: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 -; 
GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fshr_v2i24: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_fshr_v2i24: +; GFX12-TRUE16: ; %bb.0: +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_fshr_v2i24: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll index b5f0b2ff9ef4c..61902b5fd4661 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll @@ -18,8 +18,8 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_noflags_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; 
GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -45,8 +45,8 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72,8 +72,8 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -99,8 +99,8 @@ define void @gep_nusw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -126,8 +126,8 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -153,8 +153,8 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..ba81446a4bc09 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f +; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 
s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -3006,6 +2991,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 @@ -3138,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3151,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe +; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3199,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3341,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 +; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index f80716939f618..93d7eeb085107 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 7e1055b2a28a4..03b56cad85dac 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 
+11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 9689dda9932ed..68f9e839012c3 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; 
CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..1db476300c261 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @foo() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: flat_store_b64 v[1:2], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 6b9b77c228350..437b4e8b9b493 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1112,11 +1112,11 @@ body: | ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, 
killed $vgpr2, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 S_SETPC_B64_return undef $sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll index 8bd6c0f2652cf..d24b3a23cb9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) { } ; CHECK-LABEL: {{^}}dynamic_shared_array_1: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0xc00 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) { entry: @@ -49,7 +49,7 @@ endif: ; preds = %else, %if } ; CHECK-LABEL: {{^}}dynamic_shared_array_2: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x4000 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -64,7 +64,7 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the type ; specified. ; CHECK-LABEL: {{^}}dynamic_shared_array_3: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -80,7 +80,7 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x48 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { @@ -99,7 +99,7 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { @@ -118,7 +118,7 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { ; Honor the explicit alignment from the specified variable. 
; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x50 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 10d61deed71cc..76f204dd0c16a 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -82,9 +82,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -97,14 +97,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 ; CHECK-NEXT: v_readlane_b32 s66, v7, 14 ; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 @@ -118,13 +118,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s65, v7, 29 ; CHECK-NEXT: v_readlane_b32 s66, v7, 30 ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 ; CHECK-NEXT: v_readlane_b32 s59, v7, 23 diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; 
IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: 
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]])
+; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %cond1 = icmp ne i32 %tmp, 1 ; avoid the following BB being optimized away through domination
+ %cond1_32 = zext i1 %cond1 to i32
+ callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return]
+
+outer_loop:
+ ; %cond2 = icmp eq i32 %tmp, 2
+ ; br i1 %cond2, label %outer_loop, label %inner_loop
+ callbr void asm "", ""() to label %inner_loop []
+
+inner_loop: ; preds = %outer_loop, %inner_loop
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ %cond3 = icmp eq i32 %tmp, 3
+ %cond3_32 = zext i1 %cond3 to i32
+ callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop]
+
+return:
+ ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 1445f6b7b58be..382a8d38fd652 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -486,7 +486,7 @@ body: |
 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; CHECK-NEXT: S_ENDPGM 0
 bb.0:
 S_NOP 0, implicit-def $agpr0
@@ -516,7 +516,7 @@ body: |
 S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
 S_ENDPGM 0
...
@@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ 
-1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 82b9458d09c80..9e1d59064cb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:VReg_128_Align2 */, [[COPY]] ; 
GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8519690 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 44bd4090436ef..7cbf9aeacfe48 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1508,35 +1508,33 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b32 s1, s3, 4 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_xor_b32 s0, s2, 0x50005 -; SI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; SI-NEXT: s_and_b32 s0, s0, s1 -; SI-NEXT: s_xor_b32 s0, s0, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_lshl_b32 s5, s5, 4 +; SI-NEXT: s_xor_b32 s6, s4, 0x50005 +; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; SI-NEXT: s_and_b32 s5, s6, s5 +; SI-NEXT: s_xor_b32 s4, s5, s4 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_mov_b32 s7, 0x1100f000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b32 s1, s3, 4 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_xor_b32 s0, s2, 0x50005 -; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; VI-NEXT: s_and_b32 s0, s0, s1 -; VI-NEXT: s_xor_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshl_b32 s5, s5, 4 +; VI-NEXT: s_xor_b32 s6, s4, 0x50005 +; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 +; VI-NEXT: s_and_b32 s5, s6, s5 +; VI-NEXT: s_xor_b32 s4, s5, s4 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; 
VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 76016e46426bd..92ea83fdfb982 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -238,11 +238,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -256,11 +256,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -312,16 +312,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -334,16 +334,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; @@ -405,19 +405,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s5 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -430,19 +430,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s5 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..5e2cec504c6a9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 
v6, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5829,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ 
-5883,28 +5879,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5929,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; 
GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5989,28 +5983,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] ; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; 
GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v6, v9, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] +; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v9, v2, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v4, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v0, v[6:7] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v2, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v2, v8, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v2, v7, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6041,35 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v6, v9, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] ; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, 
null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v4, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, 0 ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v11, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v10, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6410,50 +6400,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 +; GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], v7, v16, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6515,50 +6499,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; 
GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6612,50 +6590,44 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6679,54 @@ 
define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 
v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: @@ -6805,50 +6777,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: 
v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, v16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v13, v17, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v16, v7, v[11:12] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v11, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v6, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v12, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v16, vcc_lo +; 
GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v10, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v14, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v8, v0, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v10, v1, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v16, v11, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v4, v15, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v14, v12, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v6, v18, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v7, v13, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v17, v[9:10] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,63 +6879,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: 
v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v13, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v18, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v18, v7, v[11:12] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v14, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v19, v6, v[15:16] +; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 
v[6:7], null, v10, v15, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v10, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v8, v0, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v1, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v19, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v14, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v15, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v18, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v6, v20, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v13, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v19, v[11:12] ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 3eef616ba267d..ad894ce36c55b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 4 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -163,8 +161,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: 
v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..fa0568d307907 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -262,12 +262,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: @@ -275,12 +275,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, 
v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -290,13 +290,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -306,13 +306,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -462,13 +462,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -476,13 +475,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 
v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -492,13 +490,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -508,13 +506,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -606,36 +604,33 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[1:3], v4 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v3, v0 -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; SPLIT-NEXT: v_mov_b32_e32 v0, v3 +; SPLIT-NEXT: ds_write_b96 v4, v[0:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_local_aligned_v3: ; ALIGNED-GFX10: ; %bb.0: ; %bb ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; 
ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3: ; UNALIGNED-GFX10: ; %bb.0: ; %bb ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5 +; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -644,11 +639,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3: @@ -657,11 +652,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5 +; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -683,12 +678,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v2 -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 +; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -696,12 
+691,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2: @@ -709,12 +704,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -724,13 +719,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2: @@ -740,13 +735,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -942,21 +937,19 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0 -; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 -; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0 +; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0 +; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8 +; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo ; SPLIT-NEXT: s_clause 0x1 -; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9] ; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; SPLIT-NEXT: v_mov_b32_e32 v8, v5 -; SPLIT-NEXT: v_mov_b32_e32 v9, v4 +; SPLIT-NEXT: v_mov_b32_e32 v2, v0 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v4, v7 -; SPLIT-NEXT: v_mov_b32_e32 v5, v6 -; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9] -; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; SPLIT-NEXT: v_mov_b32_e32 v5, v3 +; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2] +; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SPLIT-NEXT: s_endpgm ; ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8: diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 3c55dcb486675..447cb62643384 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -6,8 +6,8 @@ ; ELF: Relocations [ ; ELF-NEXT: Section (3) .rel.text { -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined ; ELF-NEXT: } ; ELF-NEXT: ] @@ -32,10 +32,10 @@ ; ELF-NEXT: } ; GCN-LABEL: {{^}}test_basic: -; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN: s_mov_b32 s0, lds.external@abs32@lo ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} ; -; GCN: s_add_i32 s0, s0, lds.defined@abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A] +; GCN: s_lshl2_add_u32 s0, s2, lds.defined@abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} ; ; GCN: .globl lds.external diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..a245c475638f2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -5,74 +5,71 @@ # source. 
# No more registers shall be defined --- -name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false +name: limit_coalesce +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr16, $sgpr17 + + ; CHECK-LABEL: name: limit_coalesce + ; CHECK: liveins: $sgpr16, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def %4 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: SI_RETURN + %0:sgpr_32 = COPY killed $sgpr17 + %1:sgpr_32 = COPY killed $sgpr16 + undef %2.sub0:sgpr_64 = COPY killed %1 + %2.sub1:sgpr_64 = COPY killed %0 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + INLINEASM &"; def $0", 0 /* attdialect */, 2818058 /* regdef:VReg_64 */, def %4:vreg_64 + undef %5.sub0:vreg_128 = COPY killed %4.sub1 + GLOBAL_STORE_DWORDX4_SADDR killed %3, killed %5, killed %2, 0, 0, implicit $exec :: (store (s128), addrspace 1) + SI_RETURN +... + +--- +name: allow_coalesce tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 - ; CHECK-LABEL: name: main + ; CHECK-LABEL: name: allow_coalesce ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0 - ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0 - ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 - ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - 
%5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $sgpr0 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF]] + ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 704ea37117f32..8e7389ace9c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cs
e,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<r
eg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp<O3>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 define void @empty() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
index 4719ab9090fa5..cbf697fafe683 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll
@@ -1,13 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
-; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
-; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
-; MIR-NEXT: S_WAITCNT 0
-; MIR-NEXT: }
 define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+  ; MIR-LABEL: name: gws_barrier_offset0
+  ; MIR: bb.0 (%ir-block.0):
+  ; MIR-NEXT: liveins: $sgpr8_sgpr9
+  ; MIR-NEXT: {{ $}}
+  ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
+  ; MIR-NEXT: $m0 = S_MOV_B32 0
+  ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+  ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") {
+  ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
+  ; MIR-NEXT: S_WAITCNT 0
+  ; MIR-NEXT: }
+  ; MIR-NEXT: S_ENDPGM 0
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
 }
@@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
 attributes #0 = { nounwind }
 attributes #1 = { convergent inaccessiblememonly nounwind }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
index c5f6e2b0098ae..417b8e08cf669 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -35,7 +35,7 @@
 ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
 ; MIR-LABEL: name: gws_barrier_offset0{{$}}
-; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
+; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec
 ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
 ; MIR-NEXT: S_WAITCNT 0
 ; MIR-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c6f9862..af270e5adf75c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
 ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25a99cdc..72b47693c69f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
 ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
 ; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
 ; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
 ; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1ab4cb0f00192..d82d6bcb437cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
 ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
 ; GISEL12-NEXT: s_wait_kmcnt 0x0
 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
-; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
-; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
-; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
-; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
+; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
+; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
+; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
+; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
+; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
+; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
+; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9
-; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
+; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
+; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
+; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
+; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
+; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
+; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
+; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
 ; GISEL12-NEXT: .LBB5_2: ; %tail
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
 ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
 ; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13]
-; GISEL10-NEXT: v_mov_b32_e32 v24, v0
-; GISEL10-NEXT: v_mov_b32_e32 v25, v1
-; GISEL10-NEXT: v_mov_b32_e32 v26, v2
-; GISEL10-NEXT: v_mov_b32_e32 v27, v3
-; GISEL10-NEXT: v_mov_b32_e32 v28, v4
-; GISEL10-NEXT: v_mov_b32_e32 v29, v5
-; GISEL10-NEXT: v_mov_b32_e32 v30, v6
-; GISEL10-NEXT: v_mov_b32_e32 v31, v7
-; GISEL10-NEXT: v_mov_b32_e32 v32, v8
-; GISEL10-NEXT: v_mov_b32_e32 v33, v9
-; GISEL10-NEXT: v_mov_b32_e32 v34, v10
-; GISEL10-NEXT: v_mov_b32_e32 v35, v11
-; GISEL10-NEXT: v_mov_b32_e32 v36, v12
-; GISEL10-NEXT: v_mov_b32_e32 v37, v13
-; GISEL10-NEXT: v_mov_b32_e32 v38, v14
-; GISEL10-NEXT: v_mov_b32_e32 v39, v15
+; GISEL10-NEXT: v_mov_b32_e32 v40, v0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v1
+; GISEL10-NEXT: v_mov_b32_e32 v42, v2
+; GISEL10-NEXT: v_mov_b32_e32 v43, v3
+; GISEL10-NEXT: v_mov_b32_e32 v44, v4
+; GISEL10-NEXT: v_mov_b32_e32 v45, v5
+; GISEL10-NEXT: v_mov_b32_e32 v46, v6
+; GISEL10-NEXT: v_mov_b32_e32 v47, v7
+; GISEL10-NEXT: v_mov_b32_e32 v48, v8
+; GISEL10-NEXT: v_mov_b32_e32 v49, v9
+; GISEL10-NEXT: v_mov_b32_e32 v50, v10
+; GISEL10-NEXT: v_mov_b32_e32 v51, v11
+; GISEL10-NEXT: v_mov_b32_e32 v52, v12
+; GISEL10-NEXT: v_mov_b32_e32 v53, v13
+; GISEL10-NEXT: v_mov_b32_e32 v54, v14
+; GISEL10-NEXT: v_mov_b32_e32 v55, v15
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9
-; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL10-NEXT: v_mov_b32_e32 v24, v40
+; GISEL10-NEXT: v_mov_b32_e32 v25, v41
+; GISEL10-NEXT: v_mov_b32_e32 v26, v42
+; GISEL10-NEXT: v_mov_b32_e32 v27, v43
+; GISEL10-NEXT: v_mov_b32_e32 v28, v44
+; GISEL10-NEXT: v_mov_b32_e32 v29, v45
+; GISEL10-NEXT: v_mov_b32_e32 v30, v46
+; GISEL10-NEXT: v_mov_b32_e32 v31, v47
+; GISEL10-NEXT: v_mov_b32_e32 v32, v48
+; GISEL10-NEXT: v_mov_b32_e32 v33, v49
+; GISEL10-NEXT: v_mov_b32_e32 v34, v50
+; GISEL10-NEXT: v_mov_b32_e32 v35, v51
+; GISEL10-NEXT: v_mov_b32_e32 v36, v52
+; GISEL10-NEXT: v_mov_b32_e32 v37, v53
+; GISEL10-NEXT: v_mov_b32_e32 v38, v54
+; GISEL10-NEXT: v_mov_b32_e32 v39, v55
 ; GISEL10-NEXT: .LBB5_2: ; %tail
 ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 22bc62acce15d..679b289e13969 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..e7d7f87e4fc4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -3186,13 +3186,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4538,13 +4539,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 8
+; GFX942-VGPR-NEXT: s_nop 7
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4689,15 +4691,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4908,14 +4911,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
 ; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: s_nop 0
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 02d29909c661c..d1ba892d7f7e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
 ; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
 ; CHECK-GISEL-NEXT: ;;#ASMSTART
 ; CHECK-GISEL-NEXT: ; use s[0:1]
 ; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-GISEL: ; %bb.0:
 ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT: s_endpgm
@@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-GISEL: ; %bb.0:
 ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT: s_endpgm
@@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG: ; %bb.0:
 ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT: ;;#ASMSTART
 ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT: s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG: ; %bb.0:
 ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
 ; CHECK-SDAG-NEXT: ;;#ASMSTART
 ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT: s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a0b8fb6..1d08097452ce6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
 ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
 ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
 ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
 ; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
 ; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; SDAG-NEXT: s_nop 7
 ; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
 ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
 ; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
 ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
 ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
 ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
 ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
 ; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; GCN-NEXT: s_nop 7
 ; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
 ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
 ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
 ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
 ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
 ; SDAG-NEXT: v_mov_b32_e32 v12, s8
 ; SDAG-NEXT: v_mov_b32_e32 v13, s9
 ; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; SDAG-NEXT: v_mov_b32_e32 v1, s13
 ; SDAG-NEXT: v_mov_b32_e32 v2, s14
 ; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v4, s0
 ; SDAG-NEXT: v_mov_b32_e32 v5, s1
 ; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT: v_mov_b32_e32 v24, s8
 ; SDAG-NEXT: v_mov_b32_e32 v25, s9
 ; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
 ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
 ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index d4aa2051dc28a..e421e2c8ebfc4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1612,29 +1612,27 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0
 ; SI-LABEL: v_lshr_and:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s2, s2, s3
-; SI-NEXT: s_and_b32 s2, s2, 7
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_lshr_b32 s2, s4, s5
+; SI-NEXT: s_and_b32 s4, s2, 7
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: v_lshr_and:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshr_b32 s0, s2, s3
-; VI-NEXT: s_and_b32 s0, s0, 7
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; VI-NEXT: s_lshr_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s4, s4, 7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT: s_endpgm
 %c = lshr i32 %a, %b
 %d = and i32 %c, 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index ac356fad5b2da..3897a0e028334 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -520,42 +520,41 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ;
 ; SI-SDAG-LABEL: s_exp_v2f32:
 ; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index d12ebe49814d8..3928ec2dd76d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -522,42 +522,41 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ;
 ; SI-SDAG-LABEL: s_exp10_v2f32:
 ; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index e30a58699fadb..dd44a1a35067e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -176,26 +176,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s6
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-SDAG-NEXT: s_endpgm
 ;
 ; SI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 18c462ffd0ff5..dd2cffd7bd161 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -77,17 +77,53 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10CHECK-NEXT: s_endpgm
 ;
-; GFX11CHECK-LABEL: sgpr_isnan_f16:
-; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
-; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT: s_endpgm
+; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-TRUE16: ; %bb.0:
+; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11SELDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-FAKE16: ; %bb.0:
+; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1
+; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11SELDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11GLISEL-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-TRUE16: ; %bb.0:
+; GFX11GLISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11GLISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11GLISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11GLISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11GLISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11GLISEL-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-FAKE16: ; %bb.0:
+; GFX11GLISEL-FAKE16-NEXT: s_clause 0x1
+; GFX11GLISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GLISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11GLISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11GLISEL-FAKE16-NEXT: s_endpgm
 %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
 %sext = sext i1 %result to i32
 store i32 %sext, ptr addrspace(1) %out, align 4
@@ -212,8 +248,9 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: snan_f16:
 ; GFX11SELDAG-TRUE16: ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1
-; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 1
+; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: snan_f16:
@@ -226,8 +263,9 @@ define i1 @snan_f16(half %x) nounwind {
; 
GFX11GLISEL-TRUE16-LABEL: snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: snan_f16: @@ -285,8 +323,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: qnan_f16: @@ -299,8 +338,9 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 2 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: qnan_f16: @@ -358,8 +398,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posinf_f16: @@ -372,8 +413,9 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posinf_f16: @@ -429,8 +471,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: neginf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: neginf_f16: @@ -443,8 +486,9 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: 
neginf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 4 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: neginf_f16: @@ -514,8 +558,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posnormal_f16: @@ -528,8 +573,9 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x100 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posnormal_f16: @@ -597,8 +643,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negnormal_f16: @@ -611,8 +658,9 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negnormal_f16: @@ -673,8 +721,9 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: possubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: possubnormal_f16: @@ -687,8 +736,9 @@ define i1 
@possubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: possubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x80 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: possubnormal_f16: @@ -755,8 +805,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negsubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negsubnormal_f16: @@ -769,8 +820,9 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negsubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 16 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negsubnormal_f16: @@ -824,8 +876,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: poszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: poszero_f16: @@ -838,8 +891,9 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: poszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 64 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: poszero_f16: @@ -895,8 +949,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negzero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11SELDAG-FAKE16-LABEL: negzero_f16: @@ -909,8 +964,9 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negzero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 32 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negzero_f16: @@ -968,8 +1024,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: posfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posfinite_f16: @@ -982,8 +1039,9 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: posfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: posfinite_f16: @@ -1047,8 +1105,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: negfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negfinite_f16: @@ -1061,8 +1120,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: negfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 56 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: negfinite_f16: @@ -1120,8 +1180,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; 
GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16: @@ -1134,8 +1195,9 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_f16: @@ -1195,8 +1257,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnan_f16: @@ -1209,8 +1272,9 @@ define i1 @not_isnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3fc +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnan_f16: @@ -1336,11 +1400,13 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v2f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v2f16: @@ -1499,13 +1565,17 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v3f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: 
v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v4, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v3f16: @@ -1693,16 +1763,20 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_v4f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v4, 3 :: v_dual_mov_b32 v5, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v2.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11GLISEL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.h, v3.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.l, v6.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v1.h, v7.l ; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v5 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnan_v4f16: @@ -1771,8 +1845,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnan_f16_strictfp: @@ -1785,8 +1860,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX11GLISEL-TRUE16-LABEL: isnan_f16_strictfp: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11GLISEL-FAKE16-LABEL: isnan_f16_strictfp: @@ -1846,8 +1922,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isinf_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_f16: @@ -1860,8 +1937,9 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isinf_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_f16: @@ -1921,8 +1999,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isfinite_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_f16: @@ -1935,8 +2014,9 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX11GLISEL-TRUE16-LABEL: isfinite_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_f16: @@ -1994,8 +2074,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2008,8 +2089,9 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0xf0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; 
GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_or_zero_f16: @@ -2074,8 +2156,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2088,8 +2171,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_or_zero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x30f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_or_zero_f16: @@ -2153,8 +2237,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnormal_f16: @@ -2167,8 +2252,9 @@ define i1 @isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x108 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnormal_f16: @@ -2236,8 +2322,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnormal_f16: @@ -2250,8 +2337,9 @@ define i1 @not_isnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 -; GFX11GLISEL-TRUE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnormal_f16: @@ -2330,8 +2418,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2344,8 +2433,9 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_plus_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x2ff +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_plus_normal_f16: @@ -2424,8 +2514,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2438,8 +2529,9 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_is_neg_normal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3f7 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_is_neg_normal_f16: @@ -2501,8 +2593,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: issubnormal_f16: @@ -2515,8 +2608,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; 
GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x90 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: issubnormal_f16: @@ -2586,8 +2680,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_f16: @@ -2600,8 +2695,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x36f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_f16: @@ -2659,8 +2755,9 @@ define i1 @iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_f16: @@ -2673,8 +2770,9 @@ define i1 @iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_f16: @@ -2745,8 +2843,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_f16: @@ -2759,8 +2858,9 @@ define i1 @not_iszero_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: 
not_iszero_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39f +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_f16: @@ -2818,8 +2918,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: ispositive_f16: @@ -2832,8 +2933,9 @@ define i1 @ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: ispositive_f16: @@ -2907,8 +3009,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_ispositive_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_ispositive_f16: @@ -2921,8 +3024,9 @@ define i1 @not_ispositive_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_ispositive_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_ispositive_f16: @@ -2992,8 +3096,9 @@ define i1 @isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnegative_f16: @@ -3006,8 +3111,9 @@ define i1 
@isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 60 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isnegative_f16: @@ -3074,8 +3180,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isnegative_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnegative_f16: @@ -3088,8 +3195,9 @@ define i1 @not_isnegative_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isnegative_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x3c3 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isnegative_f16: @@ -3152,8 +3260,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f16: @@ -3166,8 +3275,9 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f16: @@ -3231,8 +3341,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; 
GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3245,8 +3356,9 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -3310,8 +3422,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3324,8 +3437,9 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -3398,8 +3512,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3412,8 +3527,9 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f16: @@ -3486,8 +3602,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, 
v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3500,8 +3617,9 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -3574,8 +3692,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3588,8 +3707,9 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3653,8 +3773,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_f16: @@ -3667,8 +3788,9 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_qnan_f16: @@ 
-3737,8 +3859,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_f16: @@ -3751,8 +3874,9 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x61 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: iszero_or_snan_f16: @@ -3841,8 +3965,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3855,8 +3980,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_qnan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39d +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_qnan_f16: @@ -3942,8 +4068,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -3956,8 +4083,9 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_snan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x39e +; 
GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_snan_f16: @@ -4018,8 +4146,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isinf_or_nan_f16: @@ -4032,8 +4161,9 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x207 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isinf_or_nan_f16: @@ -4094,8 +4224,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4108,8 +4239,9 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isinf_or_nan_f16: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isinf_or_nan_f16: @@ -4170,8 +4302,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isfinite_or_nan_f: @@ -4184,8 +4317,9 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x1fb +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: isfinite_or_nan_f: @@ -4246,8 +4380,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11SELDAG-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isfinite_or_nan_f: @@ -4260,8 +4395,9 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX11GLISEL-TRUE16-LABEL: not_isfinite_or_nan_f: ; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry ; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 -; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11GLISEL-FAKE16-LABEL: not_isfinite_or_nan_f: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index b5038c8f606ab..fc6b2d95b2af8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, 
s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 7465b492d75ea..a141bceb3ce86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: 
v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 61a777f8877bb..b1407d39674ad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -221,8 +221,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec @@ -238,11 +236,11 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 8bb7274c84620..76b97e843d777 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -93,12 +92,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -113,16 +112,15 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 -; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_bfi_b32 
v2, s2, v2, v3 +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2] ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index f93e5f06beff9..83c240c17ff1c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 @@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 @@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 ; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] @@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 6f63384be90fd..2d60c5729ed52 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9775,17 +9775,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff +; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s2 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -9800,15 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16 -; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: s_and_b32 s4, s2, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s5, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_or_b32 s1, s4, s5 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -9820,15 +9820,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s5, s0, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10062,26 +10062,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 
s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s11, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5 +; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s2 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s10, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s11, s8 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10096,24 +10098,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s5, s3, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s6, s3, 0xff +; GFX7-HSA-NEXT: s_and_b32 s7, s2, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4 -; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24 +; GFX7-HSA-NEXT: s_or_b32 s5, s6, s5 +; GFX7-HSA-NEXT: s_or_b32 s6, s7, s0 +; GFX7-HSA-NEXT: s_mov_b32 s0, s3 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_mov_b32 s3, s4 +; GFX7-HSA-NEXT: s_and_b32 s7, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -10122,28 +10126,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s1 +; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s1, s3 +; GFX8-NOHSA-NEXT: s_mov_b32 s3, s0 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10500,43 +10505,48 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s7, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s13, s6, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s19, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s8, s7 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s15 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s14, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s13, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, s12 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s18, s17 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s16 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s5, s20, s5 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s21, s11 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: 
s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10549,48 +10559,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24 -; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s14, s5, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s13, s13, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX7-HSA-NEXT: s_or_b32 s13, s14, s13 +; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24 +; GFX7-HSA-NEXT: s_or_b32 s14, s14, s8 +; GFX7-HSA-NEXT: s_mov_b32 s8, s5 +; GFX7-HSA-NEXT: s_mov_b32 s5, s12 +; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00 +; 
GFX7-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GFX7-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s5, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s9, s11, 8 +; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9 +; GFX7-HSA-NEXT: s_and_b32 s9, s6, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12 -; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 -; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8 -; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s3, s7, 24 +; GFX7-HSA-NEXT: s_or_b32 s9, s9, s2 +; GFX7-HSA-NEXT: s_mov_b32 s2, s7 +; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GFX7-HSA-NEXT: s_mov_b32 s7, s10 +; GFX7-HSA-NEXT: s_and_b32 s11, s2, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16 +; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -10601,50 +10615,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 24 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4 -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 24 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 -; 
GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5 -; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7 -; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s9, s9, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff +; GFX8-NOHSA-NEXT: s_or_b32 s10, s10, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_mov_b32 s5, s2 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s11, s3 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00ff +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s7, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s7, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 24 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s2, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s6, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_mov_b32 s7, s8 +; GFX8-NOHSA-NEXT: s_or_b32 s13, s2, s3 +; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -11272,81 +11288,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s7, 24 +; GFX6-NOHSA-NEXT: 
s_and_b32 s21, s6, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s25, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s3, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s27, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s28, s3, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s1, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s30, s0, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s31, s1, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s33, s1, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s34, s0, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1 +; GFX6-NOHSA-NEXT: s_and_b32 s35, s3, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s36, s2, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s16, s3 +; GFX6-NOHSA-NEXT: s_and_b32 s37, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s38, s4, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s39, s7, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s40, s6, 0xff +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7 +; GFX6-NOHSA-NEXT: s_lshl_b32 s31, s31, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[18:19], s[18:19], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s1, s29 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s1, s28, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s27, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, s26 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s3, s25, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s24, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s5, s23 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s22, 8 +; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s21, 8 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GFX6-NOHSA-NEXT: s_mov_b32 s7, s20 +; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s33, s31 +; GFX6-NOHSA-NEXT: s_or_b32 s13, s34, s30 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s1, s35, s1 +; GFX6-NOHSA-NEXT: s_or_b32 s19, s36, s19 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s3, s37, s3 +; GFX6-NOHSA-NEXT: s_or_b32 s17, s38, s17 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff00ff +; GFX6-NOHSA-NEXT: s_or_b32 s5, s39, s5 +; GFX6-NOHSA-NEXT: s_or_b32 s15, s40, s15 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v7, s6 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -11354,99 +11381,106 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: 
v_alignbit_b32 v0, s23, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24 +; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24 +; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22 +; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24 +; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12 +; GFX7-HSA-NEXT: s_mov_b32 s12, s1 +; GFX7-HSA-NEXT: s_mov_b32 s1, s21 ; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24 -; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 +; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24 +; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24 +; GFX7-HSA-NEXT: s_mov_b32 s0, s3 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_mov_b32 s3, s18 +; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 +; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24 -; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 -; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24 +; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 s10, s5 +; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16 +; GFX7-HSA-NEXT: 
s_mov_b32 s5, s16 +; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0 -; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8 -; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 +; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24 +; GFX7-HSA-NEXT: s_mov_b32 s0, s7 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -11463,90 +11497,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NOHSA-NEXT: 
s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
 ; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s16, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s16
+; GFX8-NOHSA-NEXT: s_mov_b32 s1, s13
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s18, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s12
+; GFX8-NOHSA-NEXT: s_or_b32 s19, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s11
+; GFX8-NOHSA-NEXT: s_or_b32 s20, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s0, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s21, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..d23c49165ec70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
@@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -6011,23 +6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
@@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2cac0985..7203545ebf9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: s_nop 0
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
 ; GCN-GFX900-HSA-NEXT: s_nop 0
 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..0c399d65d01cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -9829,14 +9828,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GCN-NOHSA-SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00ff, v0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9848,18 +9847,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GCN-HSA-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-HSA-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
+; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9878,10 +9877,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[0:1]
+; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
@@ -10180,33 +10179,39 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
 ; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s0, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s0, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s10, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s11, s8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s0, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
@@ -10222,20 +10227,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
+; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_and_b32 s4, s0, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s5, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s7, s0, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-HSA-NEXT: s_or_b32 s1, s6, s5
+; GCN-HSA-NEXT: s_or_b32 s3, s7, s4
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -10253,22 +10264,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v1
 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s8, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s4, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s9, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s11
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -10764,35 +10779,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s13, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s12, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s16, s15
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s17, s14
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s18, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s19, s11
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -10806,43 +10834,55 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
+; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
+; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
+; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10859,42 +10899,50 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s9, 24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s8, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s19, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s11, s9, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s9, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s19, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s10
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s16, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s17, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s15
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s13, s14
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v16i8_to_v16i16:
@@ -11767,71 +11815,97 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s12, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s14, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s12, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s14, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s16, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s18, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s16, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s18, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s18, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s16, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s12, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s25, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s24, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s23, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s22, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s21, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s20, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s28, s27
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s29, s26
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s30, s17
+; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s31, s19
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s33, s13
+; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s34, s15
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s35, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s36, s11
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
@@ -11844,88 +11918,112 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s6, 0xff00
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s15, s10, 0xff00
 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[10:11], 16
+; GCN-HSA-NEXT: s_and_b32 s17, s8, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_and_b32 s3, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s10, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s13, 8
+; GCN-HSA-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s11, s12, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-HSA-NEXT: s_or_b32 s7, s17, s14
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s3, s3, s9
+; GCN-HSA-NEXT: s_or_b32 s9, s10, s11
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s8
+; GCN-HSA-NEXT: s_or_b32 s5, s16, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
+; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
+; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
+; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
+; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
+; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
+; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00ff
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9]
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16:
@@ -11943,79 +12041,95 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s8, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v7
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s15, 24
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s10, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s25, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s33, s12, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s36, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s17, s15, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s15, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s14, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s14, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s14, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s27, s13, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s13, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s26, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s31, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s36, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s21, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s25, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s30, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s17, s16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s19, s20
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s33, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s34, s12
+; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s26
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s18, s15
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s22, s11
+; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s23, s14
+; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s24, s21
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s27, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s28, s13
+; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s29, s25
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s14
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v32i8_to_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..b4c0b7497b95f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
 ; VI-NO-DS128: ; %bb.0:
 ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT:
s_addc_u32 s13, s13, 0 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ 
-4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4360,16 +4348,24 @@ define amdgpu_kernel void 
@local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 
offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, 
v38 @@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -5738,20 +5734,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5761,20 +5756,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5783,20 +5778,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: @@ -5850,22 +5845,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: ds_read_b64 v[1:2], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5873,22 +5867,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 6dc919988cc4f..b6eaaf1369ab4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -326,12 +326,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB2_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -347,12 +347,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -440,12 +440,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -462,12 +462,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -880,13 +880,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -913,13 +914,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -936,14 +938,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX10-NEXT: 
ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -968,13 +970,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -990,13 +992,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1012,13 +1014,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1034,13 +1036,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1063,13 +1065,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1096,13 +1099,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528 +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1119,14 +1123,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1151,13 +1155,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; 
GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,13 +1177,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1195,13 +1199,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,13 +1222,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0 -; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2032,27 +2036,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2073,28 +2077,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: 
ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2119,15 +2123,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,27 +2144,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2175,28 +2179,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2211,23 +2215,23 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt 
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2249,15 +2253,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2278,15 +2282,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2308,16 +2312,16 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2338,18 +2342,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2370,18 +2374,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2412,19 +2416,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2455,19 +2459,20 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2493,15 +2498,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2524,19 +2529,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2561,19 +2566,20 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2596,16 +2602,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -2628,15 +2634,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2658,15 +2664,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2689,16 +2695,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2720,18 +2726,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2753,18 +2759,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3086,16 +3092,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3118,16 +3124,17 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3147,13 +3154,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3168,16 +3175,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3194,16 +3201,17 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3220,15 +3228,15 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3245,13 +3253,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3267,13 +3275,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3289,14 +3297,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3312,16 +3320,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3338,16 +3346,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4297,38 +4305,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4349,37 +4357,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4405,22 +4413,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4433,38 +4441,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4479,37 +4487,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4524,28 +4532,28 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v3, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v3, v3
+; GFX10-NEXT: v_not_b32_e32 v2, v2
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -4568,20 +4576,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4603,20 +4611,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4638,22 +4646,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4674,18 +4682,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4706,18 +4714,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4748,29 +4756,30 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4801,28 +4810,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4849,22 +4859,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4888,28 +4898,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4935,27 +4946,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4978,21 +4990,21 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5016,20 +5028,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 
v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5052,20 +5064,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5088,22 +5100,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5125,18 +5137,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5158,18 +5170,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; 
GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5569,26 +5581,27 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5611,25 +5624,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5650,21 +5664,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5680,25 +5694,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5716,24 +5731,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: 
buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5750,21 +5766,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5782,20 +5798,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5812,20 +5828,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5841,21 +5857,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5871,16 +5887,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; 
GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5897,16 +5913,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6399,13 +6415,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6422,13 +6439,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6444,12 +6461,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6464,12 +6481,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6485,14 +6502,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6614,13 +6631,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6637,13 +6655,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6659,12 +6677,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6679,12 +6697,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6700,14 +6718,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7547,30 +7565,32 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7591,30 +7611,32 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7634,27 +7656,27 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7674,26 +7696,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7712,26 +7734,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7749,29 +7771,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7888,30 +7910,32 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; 
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7932,30 +7956,32 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-FAKE16-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7975,27 +8001,27 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX10-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT:    v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
@@ -8015,26 +8041,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX90A-NEXT:    v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8053,26 +8079,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX908-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX908-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT:    v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX908-NEXT:    v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8090,29 +8116,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v2
 ; GFX8-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT:    v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8849,20 +8875,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    ds_read_b32 v2, v1
 ; GFX7-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB28_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_6
 ; GFX7-NEXT:  .LBB28_7: ; %Flow21
@@ -8973,20 +8999,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX6-NEXT:  ; %bb.5:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    ds_read_b32 v3, v1
+; GFX6-NEXT:    ds_read_b32 v2, v1
 ; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB28_6: ; %atomicrmw.start2
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_6
 ; GFX6-NEXT:  .LBB28_7: ; %Flow19
@@ -9677,20 +9703,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX7-NEXT:  ; %bb.5:
 ; GFX7-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    ds_read_b32 v2, v1
 ; GFX7-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX7-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB29_6: ; %atomicrmw.start2
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_6
 ; GFX7-NEXT:  .LBB29_7: ; %Flow21
@@ -9801,20 +9827,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX6-NEXT:  ; %bb.5:
 ; GFX6-NEXT:    s_lshl_b32 s0, s3, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    ds_read_b32 v3, v1
+; GFX6-NEXT:    ds_read_b32 v2, v1
 ; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x42280000, v2
+; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x42280000, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB29_6: ; %atomicrmw.start2
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v3, v2
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v4, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_6
 ; GFX6-NEXT:  .LBB29_7: ; %Flow19
@@ -10084,12 +10110,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_f32_e32 v2, 4.0, v1
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10105,12 +10131,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v2, 4.0, v1
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v1
+; GFX6-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index d6b7d8ffaf1c5..8e094a7269a49 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT:    ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT:    ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT:    ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT:    ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX942-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX942-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX942-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1710,29 +1710,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT:    ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT:    ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1747,29 +1747,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT:    ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT:    ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 24, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1784,24 +1784,24 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT:    v_and_b32_e32 v1, -4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    ds_read_b32 v2, v1
-; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT:    ds_read_b32 v3, v1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT:    v_not_b32_e32 v3, v3
+; GFX10-NEXT:    v_not_b32_e32 v2, v2
 ; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
-; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX10-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
@@ -1823,16 +1823,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX90A-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1853,16 +1853,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX908-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX908-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1884,17 +1884,17 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1915,18 +1915,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1947,18 +1947,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1989,20 +1989,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v4.l, v4.l
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -2033,21 +2034,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -2073,16 +2074,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX942-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX942-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2105,20 +2106,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, v4.l, v4.l
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2143,21 +2145,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2180,17 +2182,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
@@ -2213,16 +2215,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX90A-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX90A-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2244,16 +2246,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX908-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX908-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2276,17 +2278,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v4, 4.0, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, 4.0, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2308,18 +2310,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2341,18 +2343,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT:    v_max_f32_e32 v4, 4.0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT:    v_max_f32_e32 v3, 4.0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2685,17 +2687,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-TRUE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -2718,18 +2721,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v2, v1, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v2, 4.0, v2
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v2, v2
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT:    s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -2749,14 +2752,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX942-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX942-NEXT:    v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX942-NEXT:    v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2771,17 +2774,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v2.l, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v2.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2798,18 +2802,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_gl0_inv
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2826,16 +2830,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX10-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX10-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
@@ -2852,14 +2856,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX90A-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX90A-NEXT:    v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX90A-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX90A-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2875,14 +2879,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_max_f16_e32 v2, v1, v1
-; GFX908-NEXT:    v_max_f16_e32 v2, 4.0, v2
-; GFX908-NEXT:    v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_max_f16_e32 v1, v2, v2
+; GFX908-NEXT:    v_max_f16_e32 v1, 4.0, v1
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v1, v2
exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3911,38 +3915,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: 
v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; 
GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ define void 
@local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 
16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 
0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, 
v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5270,21 +5280,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5300,25 +5310,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5336,24 +5347,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5370,21 +5382,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -5402,20 +5414,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5432,20 +5444,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5461,21 +5473,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5491,17 +5503,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5518,17 +5530,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6101,15 +6113,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6129,14 +6141,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6152,15 +6164,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6178,14 +6190,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6202,13 +6214,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6224,13 +6236,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6248,16 +6260,16 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6363,15 +6375,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6391,14 +6403,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6414,15 +6426,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6440,14 +6452,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -6464,13 +6476,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6486,13 +6498,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
-; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6510,16 +6522,16 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
+; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7589,31 +7601,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -7638,32 +7653,33 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -7686,27 +7702,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7724,30 +7740,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7768,30 +7786,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7811,27 +7831,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7851,26 +7871,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7889,26 +7909,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7926,29 +7946,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8047,31 +8067,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8096,32 +8119,33 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8144,27 +8168,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8182,30 +8206,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8226,30 +8252,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8269,27 +8297,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
 ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8309,26 +8337,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8347,26 +8375,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8384,29 +8412,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 11ed43d737634..0aa8d33ea7429 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT:
v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1710,29 +1710,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1747,29 +1747,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1784,24 +1784,24 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 @@ -1823,16 +1823,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX90A-NEXT: 
v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1853,16 +1853,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1884,17 +1884,17 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1915,18 +1915,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1947,18 +1947,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1989,20 +1989,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2033,21 +2034,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2073,16 +2074,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,20 +2106,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2143,21 +2145,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2180,17 +2182,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2213,16 +2215,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2244,16 +2246,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2276,17 +2278,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2308,18 +2310,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2341,18 +2343,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2685,17 +2687,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -2718,18 +2721,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -2749,14 +2752,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -2771,17 +2774,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2798,18 +2802,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -2826,16 +2830,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX10-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 @@ -2852,14 +2856,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2875,14 +2879,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2898,15 +2902,15 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2922,16 +2926,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,16 +2952,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3911,38 +3915,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3963,37 +3967,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4019,22 +4023,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4047,38 +4051,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4093,37 +4097,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4138,28 +4142,28 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; 
GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -4182,20 +4186,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4217,20 +4221,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, 
s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4252,22 +4256,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4288,19 +4292,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4321,19 +4325,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4364,29 +4368,30 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4417,28 +4422,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4465,22 +4471,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: 
v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4504,28 +4510,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4551,27 +4558,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4594,21 +4602,21 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_f32_e32 v4, 4.0, 
v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -4632,20 +4640,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4668,20 +4676,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: 
v_min_f32_e32 v4, 4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4704,22 +4712,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4741,19 +4749,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4775,19 +4783,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,26 +5197,27 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5231,25 +5240,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5270,21 +5280,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; 
GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5300,25 +5310,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5336,24 +5347,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5370,21 +5382,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 
0xffff0000, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -5402,20 +5414,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5432,20 +5444,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-5461,21 +5473,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5491,17 +5503,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5518,17 +5530,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6101,15 +6113,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6129,14 +6141,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6152,15 +6164,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6178,14 +6190,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -6202,13 +6214,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6224,13 +6236,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6248,16 +6260,16 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6363,15 +6375,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6391,14 +6403,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6414,15 +6426,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6440,14 +6452,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX10-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -6464,13 +6476,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6486,13 +6498,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6510,16 +6522,16 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7589,31 +7601,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7638,32 +7653,33 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7686,27 +7702,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; 
GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7724,30 +7740,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7768,30 +7786,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7811,27 +7831,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 
v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7851,26 +7871,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7889,26 +7909,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 
-; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7926,29 +7946,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8047,31 +8067,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8096,32 +8119,33 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: 
s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8144,27 +8168,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8182,30 +8206,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8226,30 +8252,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8269,27 +8297,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8309,26 +8337,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, 
s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8347,26 +8375,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8384,29 +8412,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index d74338caba1cd..929bb61ddabcf 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -453,13 +453,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -478,12 +479,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -498,13 +499,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -521,13 +523,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 @@ -543,12 +545,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -563,12 +565,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,12 +586,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -605,12 +607,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -626,12 +628,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -654,13 +656,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -679,12 +682,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -699,13 +702,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -722,13 +726,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -744,12 +748,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -764,12 +768,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -785,12 +789,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -806,12 +810,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -828,12 +832,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB3_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1296,13 +1300,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1321,12 +1326,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1341,13 +1346,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1364,14 +1370,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1
@@ -1387,12 +1393,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1407,13 +1413,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1429,13 +1435,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1451,13 +1457,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1473,13 +1479,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB6_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1502,13 +1508,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1527,12 +1534,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1547,13 +1554,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1570,14 +1578,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1
@@ -1593,12 +1601,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1613,13 +1621,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX908-NEXT: v_mov_b32_e32 v1, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1635,13 +1643,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1657,13 +1665,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1680,13 +1688,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
+; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2494,27 +2502,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2535,28 +2543,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2581,15 +2589,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2602,27 +2610,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2637,28 +2645,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2673,23 +2681,23 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v3, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v3, v3
+; GFX10-NEXT: v_not_b32_e32 v2, v2
 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2711,15 +2719,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2740,15 +2748,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2770,16 +2778,16 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2800,18 +2808,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2832,18 +2840,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2874,19 +2882,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2917,19 +2925,20 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2955,15 +2964,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2986,19 +2995,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3023,19 +3032,20 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3058,16 +3068,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3090,15 +3100,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3120,15 +3130,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3151,16 +3161,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3182,18 +3192,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3215,18 +3225,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
-; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3548,16 +3558,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3580,16 +3590,17 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3609,13 +3620,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3630,16 +3641,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3656,16 +3667,17 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3682,15 +3694,15 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3707,13 +3719,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3729,13 +3741,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3751,14 +3763,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3774,16 +3786,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3800,16 +3812,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4759,38 +4771,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4811,37 +4823,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4867,22 +4879,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX942-NEXT: v_bfe_u32 v5, v4, 
16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4895,38 +4907,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4941,37 +4953,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -4986,28 +4998,28 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX10-NEXT: v_and_b32_e32 v1, -4, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: ds_read_b32 v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX10-NEXT: ds_read_b32 v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff ; GFX10-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v3, v3 +; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5030,20 +5042,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, 
v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5065,20 +5077,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5100,23 +5112,23 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] @@ -5136,18 +5148,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5168,18 +5180,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5210,29 +5222,30 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5263,28 +5276,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5311,22 +5325,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5350,28 +5364,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5397,27 +5412,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5440,21 +5456,21 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5478,20 +5494,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 
v4, -4.0, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5514,20 +5530,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5550,22 +5566,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4 -; 
GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5587,18 +5603,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5620,18 +5636,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 
v1, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6031,26 +6047,27 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6073,25 +6090,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, 
v2, 0x7fff +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6112,21 +6130,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6142,25 +6160,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 
1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6178,24 +6197,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6212,21 +6232,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6244,20 +6264,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6274,20 +6294,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6303,21 +6323,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6333,16 +6353,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6359,16 +6379,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6906,13 +6926,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6931,12 +6952,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6951,13 +6972,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6974,13 +6996,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6996,12 +7018,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7016,12 +7038,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7037,14 +7059,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7149,13 +7171,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7174,12 +7197,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7194,13 +7217,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7217,13 +7241,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -7239,12 +7263,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7259,12 +7283,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7280,14 +7304,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8357,31 +8381,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8406,32 +8433,33 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8454,27 +8482,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8492,30 +8520,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8536,30 +8566,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8579,27 +8611,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -8619,26 +8651,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8657,26 +8689,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8694,29 +8726,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8815,31 +8847,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8912,27 +8948,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 ; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8950,30 +8986,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8994,30 +9032,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -9037,27 +9077,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9077,26 +9117,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: 
v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9115,26 +9155,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9152,29 +9192,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: 
v_cmp_u_f32_e64 s[4:5], v3, v3
 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v4
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9489,13 +9529,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
 ; GFX12-NEXT: s_wait_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -9514,12 +9555,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9534,13 +9575,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -9557,13 +9599,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1
@@ -9579,12 +9621,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9599,12 +9641,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9620,12 +9662,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9641,12 +9683,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9662,12 +9704,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v1
+; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT: s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
index e6a52342337f3..8ea9ec397fe06 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir
@@ -18,21 +18,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 ...
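Aside on the local_atomic_fsub hunks above: every target's diff encodes the same change to the compare-and-swap loop that expands an LDS atomicrmw fsub. The old loop computed the new value into a temporary and copied the CAS result back with a trailing v_mov_b32; the updated register assignment computes into the register the next iteration reads, so that copy at the bottom of the loop disappears. A minimal C sketch of the loop shape, using C11 atomics as a stand-in for ds_cmpst_rtn_b32 (a hypothetical illustration, not the compiler's actual expansion code):

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the CAS expansion the checks above encode. The exchange operates
 * on the 32-bit bit pattern, matching the integer v_cmp_eq_u32 compare in
 * the generated loop. */
static float local_atomic_fsub_f32(_Atomic uint32_t *p, float v) {
    uint32_t old = atomic_load(p);        /* initial load before .LBB29_1 */
    for (;;) {
        uint32_t expected = old;          /* v_mov_b32 v2, v1 in new code */
        float f;
        memcpy(&f, &expected, sizeof f);
        f -= v;                           /* v_add_f32 v1, -4.0, v2       */
        uint32_t desired;
        memcpy(&desired, &f, sizeof f);
        /* ds_cmpst_rtn_b32 returns the observed value; on failure C11
         * writes it into 'expected', which is exactly what the next
         * iteration reads, so the removed v_mov_b32 hand-off is unneeded. */
        if (atomic_compare_exchange_strong(p, &expected, desired))
            return f;                     /* v_cmp_eq_u32 succeeded       */
        old = expected;
    }
}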
@@ -53,27 +53,27 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2
     SI_RETURN
 ...
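The rest of the changes in both local-stack-alloc files are a mechanical renumbering of the flag immediate that INLINEASM attaches to each register operand: 1835017 becomes 1245193 wherever the comment reads reguse:VGPR_32, and 2424841 becomes 1835017 wherever it reads reguse:SReg_32. Based on the flag-word layout in LLVM's InlineAsm.h (an assumption; the packing can differ between releases), the low three bits carry the operand kind, the next field the register-operand count, and the bits from 16 up the target register-class ID plus one, so any reshuffle of the AMDGPU register-class enumeration forces exactly this kind of bulk test update. A small C sketch decoding the three constants in this patch:

#include <stdio.h>

/* Decode the INLINEASM operand-flag immediates appearing in these tests.
 * Assumed layout (from LLVM's InlineAsm.h; a sketch, not an API guarantee):
 * bits 0-2 = operand kind (1 == RegUse), bits 3-15 = number of register
 * operands, bits 16+ = register-class ID + 1 (0 means no class). */
int main(void) {
    const unsigned flags[] = {1835017u, 1245193u, 2424841u};
    for (unsigned i = 0; i < sizeof flags / sizeof flags[0]; ++i) {
        unsigned f = flags[i];
        printf("%7u = kind %u, %u reg(s), regclass id %d\n",
               f, f & 7u, (f & 0xffffu) >> 3, (int)(f >> 16) - 1);
    }
    /* 1835017 -> regclass 27 (VGPR_32 before this patch; SReg_32 after)
       1245193 -> regclass 18 (VGPR_32 after)
       2424841 -> regclass 36 (SReg_32 before) */
    return 0;
}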
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir index 98b7f4a5aa1c5..71c47c80ae357 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -21,9 +21,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -31,9 +31,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -41,10 +41,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -52,9 +52,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -62,15 +62,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
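A note on the gfx8 file: its adds are V_ADD_CO_U32_e32, which implicitly writes the carry bit in $vcc, so every rewritten check line tracks implicit-def dead $vcc. The *_live_vcc test that follows keeps $vcc live into the inline asm, presumably to verify that the frame-index rewrite never reuses an add whose carry output is still consumed.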
@@ -88,42 +88,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: 
name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1, implicit $vcc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1, implicit $vcc SI_RETURN ... @@ -144,9 +144,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -154,9 +154,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -164,10 +164,10 @@ body: | ; GFX942-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -175,9 +175,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -185,15 +185,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -214,9 +214,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -224,9 +224,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -234,9 +234,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -244,9 +244,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -254,14 +254,14 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -279,42 +279,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN implicit %2 ... @@ -332,42 +332,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... 
@@ -385,42 +385,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... @@ -443,9 +443,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -454,9 +454,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -465,9 +465,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -476,9 +476,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -487,17 +487,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2 %3:sreg_32 = S_ADD_I32 %1, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3 SI_RETURN ... 
@@ -520,9 +520,9 @@ body: |
     ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -531,9 +531,9 @@ body: |
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -542,9 +542,9 @@ body: |
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -553,9 +553,9 @@ body: |
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute
@@ -564,17 +564,17 @@ body: |
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5
     ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:sreg_32 = COPY $sgpr4
     %1:sreg_32 = COPY $sgpr5
     %2:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2
     %3:sreg_32 = S_ADD_I32 %stack.0, %1, implicit-def dead $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3
     SI_RETURN
 
 ...
 
@@ -592,48 +592,48 @@ body: |
   bb.0:
     ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
    ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX803-NEXT: S_NOP 0, implicit $scc
     ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX803-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX900-NEXT: S_NOP 0, implicit $scc
     ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX900-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX942-NEXT: S_NOP 0, implicit $scc
     ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX942-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX10-NEXT: S_NOP 0, implicit $scc
     ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX10-NEXT: SI_RETURN implicit $scc
     ;
     ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc
     ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]]
     ; GFX12-NEXT: S_NOP 0, implicit $scc
     ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]]
     ; GFX12-NEXT: SI_RETURN implicit $scc
     %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0
     S_NOP 0, implicit $scc
     %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1
     SI_RETURN implicit $scc
 
 ...
@@ -656,9 +656,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -667,9 +667,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -678,9 +678,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -689,9 +689,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets
@@ -700,15 +700,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -731,9 +731,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -742,9 +742,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -753,9 +753,9 @@ body: |
     ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -764,9 +764,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute
@@ -775,15 +775,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -805,9 +805,9 @@ body: |
     ; GFX803-NEXT: {{  $}}
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -815,9 +815,9 @@ body: |
     ; GFX900-NEXT: {{  $}}
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -825,9 +825,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -836,9 +836,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets
@@ -848,16 +848,16 @@ body: |
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]]
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -880,9 +880,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -891,9 +891,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -903,10 +903,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -915,9 +915,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets
@@ -926,15 +926,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -957,9 +957,9 @@ body: |
     ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX803-NEXT: SI_RETURN
     ;
     ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -968,9 +968,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -980,10 +980,10 @@ body: |
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset
     ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -992,9 +992,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute
@@ -1003,15 +1003,15 @@ body: |
     ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
    %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
index 19ca463d7ecbb..f0868ffeeb7c5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir
@@ -20,16 +20,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
@@ -37,21 +37,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -72,16 +72,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
@@ -89,21 +89,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -124,16 +124,16 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
@@ -141,21 +141,21 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets
     ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -178,9 +178,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -188,9 +188,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -199,9 +199,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets
@@ -209,15 +209,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -240,9 +240,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -250,9 +250,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -261,9 +261,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute
@@ -271,15 +271,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %vgpr_offset:vgpr_32 = COPY $vgpr0
     %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -301,9 +301,9 @@ body: |
     ; GFX900-NEXT: {{  $}}
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -311,9 +311,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -322,9 +322,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets
@@ -332,15 +332,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]]
     ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
 
@@ -363,9 +363,9 @@ body: |
     ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX900-NEXT: SI_RETURN
     ;
     ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -373,9 +373,9 @@ body: |
     ; GFX942-NEXT: {{  $}}
     ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX942-NEXT: SI_RETURN
     ;
     ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -384,9 +384,9 @@ body: |
     ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX10-NEXT: SI_RETURN
     ;
     ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets
@@ -394,15 +394,15 @@ body: |
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8
     ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
     ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
     ; GFX12-NEXT: SI_RETURN
     %sgpr_offset:sreg_32 = COPY $sgpr8
     %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0
     %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1
     SI_RETURN
 
 ...
@@ -425,9 +425,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -435,9 +435,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -446,9 +446,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -456,15 +456,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -486,16 +486,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier @@ -503,21 +503,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec 
- ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll index 4fa7c29bfde02..71005224dd1e5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll @@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) { %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ] br label %.loopexit } + +;; iree-org/iree#22551 - crash on something that reduces to the below non-canonical select. +define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0 +; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x + ret ptr addrspace(7) %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 1d1d3e4a68fee..9da7a79ba2fdf 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 @@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index fc36ed939d91d..84db54c2d537f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; 
GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; 
GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll index 69439d49e588f..de82dcdecda48 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll @@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr 
addrspace(1) %ou ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 497241cff392d..6b6658bd672de 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-SDAG-LABEL: workgroup_id_optimized: ; GFX1250-SDAG: ; %bb.0: ; %.entry -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 -; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14 -; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14 ; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc ; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3 ; GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 -; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1 ; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 68506cec96a72..9056d40ad8878 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -36,20 +36,19 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_and_b32 s0, s2, 0xffff -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s2, s3, 16 -; CI-NEXT: s_lshr_b32 s1, s1, s2 -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_lshr_b32 s0, s0, s3 -; CI-NEXT: s_or_b32 s0, s0, s1 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_and_b32 s6, s4, 0xffff +; CI-NEXT: s_lshr_b32 s4, s4, 16 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshr_b32 s4, s4, s7 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_lshr_b32 s5, s6, s5 +; CI-NEXT: s_or_b32 s4, s5, s4 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_lshr_v2i16: diff --git 
a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index c117473581746..1e8a2d3ad9163 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -6429,7 +6429,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22, 1835017 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] + ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22, 1245193 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -6478,7 +6478,7 @@ body: | %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 680942fcb4d4b..9ecd35e7ddd11 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -133,7 +133,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: s_mov_b32 s55, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 @@ -141,7 +141,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 @@ -866,8 +866,8 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 -; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s52, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 ; CHECK-NEXT: .LBB1_1: ; %.37 diff --git 
a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 08ec0c847e941..87d52684e588c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -632,12 +632,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v5, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: @@ -775,13 +775,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 -; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2] +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -863,11 +863,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v6, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1807,10 +1808,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX1100-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[4:5] +; GFX1100-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,10 +1818,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; @@ -1833,10 +1832,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,23 +2124,21 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100-LABEL: lshr_mad_i64_negative_4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[2:3] +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, v3, v3, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,12 +2148,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], 
null, v1, v0, v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll index 46b8df4b4537e..9cc0e6228a913 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -133,35 +133,33 @@ define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16 ; GCN-LABEL: i16_mad24: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dword s4, s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: s_mul_i32 s2, s4, s2 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_mul_i32 s2, s6, s2 +; GCN-NEXT: s_add_i32 s2, s2, s5 +; GCN-NEXT: s_sext_i32_i16 s4, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GFX8-LABEL: i16_mad24: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s8, s[4:5], 0x2c -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_lshr_b32 s4, s4, 16 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm entry: %0 = mul i16 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 8b2c5887c97ee..5095ad021fde3 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... 
@@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 4b5a7c207055a..8dea9e87e140f 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1620,15 +1620,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_1-NEXT: s_add_i32 s55, s55, s4 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_1-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s55, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1650,15 +1649,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_3-NEXT: s_add_i32 s55, s55, s4 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_3-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s55, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1677,15 +1675,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s1, s32, 64 ; GFX11-NEXT: v_writelane_b32 v1, s55, 0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_add_i32 s1, s32, 64 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_add_i32 s55, s32, s0 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_addk_i32 s55, 0x4040 +; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s55, scc @@ -1710,16 +1708,14 @@ define void 
@scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v1, s55, 0 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_i32 s55, s32, s0 +; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_addk_co_i32 s55, 0x4000 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s55, scc ; GFX12-NEXT: ;;#ASMEND @@ -1767,11 +1763,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 +; GFX900-NEXT: s_addk_i32 s4, 0x4040 ; GFX900-NEXT: v_writelane_b32 v1, s55, 0 -; GFX900-NEXT: s_lshl_b32 s4, s16, 2 -; GFX900-NEXT: s_lshr_b32 s55, s32, 6 -; GFX900-NEXT: s_add_i32 s55, s55, s4 -; GFX900-NEXT: s_addk_i32 s55, 0x4040 +; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1796,10 +1791,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX942-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[2:3] -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX942-NEXT: v_writelane_b32 v1, s55, 0 -; GFX942-NEXT: s_add_i32 s55, s32, s0 -; GFX942-NEXT: s_addk_i32 s55, 0x4040 +; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ae0805448d693..c48e25f36e99f 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -155,14 +155,13 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_i32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sge_i32: @@ -357,16 +356,15 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_i32 s0, s3, 9 -; SI-NEXT: s_max_i32 s1, s2, 9 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s5, s5, 
9 +; SI-NEXT: s_max_i32 s4, s4, 9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32: @@ -472,14 +470,13 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_i32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sgt_i32: @@ -582,14 +579,13 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_u32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_uge_i32: @@ -774,9 +770,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 @@ -817,14 +813,13 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_max_u32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_ugt_i32: @@ -858,16 +853,15 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_u32 s0, s3, 23 -; SI-NEXT: s_max_u32 s1, s2, 15 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s5, s5, 23 +; SI-NEXT: s_max_u32 s4, s4, 15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index ca4f5d22ca9a0..43752c22b1f3e 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -90,18 +90,18 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v6, s12 -; CHECK-NEXT: v_mov_b32_e32 v7, s13 +; CHECK-NEXT: v_mov_b32_e32 v6, s10 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v8 -; CHECK-NEXT: s_add_u32 s12, s12, 16 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s10, v8 +; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc -; CHECK-NEXT: s_addc_u32 s13, s13, 0 -; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] -; CHECK-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[10:11], v[0:1] +; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_13 ; CHECK-NEXT: .LBB0_14: ; %Flow15 ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 @@ -115,8 +115,8 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 -; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 ; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..888a458a990ec 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 
ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: 
s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: 
buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x33 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen 
offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 
0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(38) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4251,260 +4327,133 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 
4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: 
buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen 
offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill @@ -4513,46 +4462,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen 
offset:16
 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236
 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228
 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
 ; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
@@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220
 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212
 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201
@@ -4641,7 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
@@ -4656,6 +4603,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185
@@ -4672,7 +4620,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
@@ -4684,6 +4631,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169
@@ -4700,7 +4648,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
@@ -4712,11 +4659,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
@@ -5234,9 +5181,9 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9
@@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0
 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1
 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5
 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
@@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6
@@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7
@@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476
 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468
 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140
@@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132
 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141
@@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316
 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308
 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6
@@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103
 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7
@@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5
 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19
 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8
@@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-LABEL: memmove_p0_p5_sz2048:
 ; ALIGNED: ; %bb.0: ; %entry
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
 ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
 ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0
 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6
 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2
 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop
 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
@@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56
 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
@@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x30
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(55)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(54)
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(53)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(47)
 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(46)
 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(45)
 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
 ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: s_waitcnt vmcnt(37)
 ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: s_waitcnt vmcnt(35)
 ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
 ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: s_waitcnt vmcnt(31)
 ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(29)
 ; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
@@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(27)
 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(25)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(23)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(21)
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(16)
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38
 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
 ; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(61)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(62)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
@@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0xc
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(42)
 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(32)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(16)
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
+; ALIGNED-NEXT: s_waitcnt vmcnt(12)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
+; ALIGNED-NEXT: s_waitcnt vmcnt(10)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
@@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107
+; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90
+; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
+; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56
+; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
+; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: s_clause 0x5
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113
-; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116
-; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3
-; ALIGNED-NEXT: s_waitcnt vmcnt(59)
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100
-; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20
-; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87
-; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83
-; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68
-; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52
-; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38
-; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
-; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
-; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95
-; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95
 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
 ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125
 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: s_clause 0x1
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6
 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
@@ -13509,19 +13410,19 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: s_clause 0x2
 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18
 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232
 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236
 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228
 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704
 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
 ; ALIGNED-NEXT: s_waitcnt vmcnt(4)
 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109
 ; ALIGNED-NEXT: s_waitcnt vmcnt(2)
@@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212
 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201
@@ -13606,8 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
@@ -13622,6 +13523,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185
@@ -13638,7 +13540,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
@@ -13650,6 +13551,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169
@@ -13666,7 +13568,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
@@ -13678,11 +13579,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
@@ -14200,9 +14101,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
 ; ALIGNED-NEXT: s_waitcnt vmcnt(0)
 ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13
 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
@@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: s_mov_b32 s7, -1
 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop
 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: s_clause 0x39
+; ALIGNED-NEXT: s_clause 0x3e
 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20
 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21
 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22
 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19
 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29
 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30
 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
@@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67
 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68
 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69
@@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71
 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76
 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78
 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
+; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: s_clause 0x34
+; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165
+;
ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 
offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14720,302 +14705,169 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], 
s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte 
v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 
8, v6 +; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], 
s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -15027,48 +14879,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15153,8 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] @@ -15169,6 +15020,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15185,7 +15037,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte 
v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload @@ -15197,6 +15048,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15213,7 +15065,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload @@ -15225,11 +15076,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 ; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 ; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 ; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload @@ -15747,11 +15598,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 ; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 ; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 ; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload @@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; 
ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 14b0729b37302..953511db10b29 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -10,13 +10,13 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-LABEL: memmove_p0_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB0_3 @@ -33,10 +33,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop @@ -59,20 +59,20 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -82,10 +82,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execnz .LBB0_8 ; CHECK-NEXT: .LBB0_9: ; %Flow28 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB0_2 @@ -104,11 +104,11 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[4:5], v12 @@ -129,19 +129,19 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB0_15 ; CHECK-NEXT: .LBB0_16: ; %Flow32 @@ -158,13 +158,13 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-LABEL: memmove_p0_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 @@ -181,10 +181,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.4: ; 
%memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop @@ -207,20 +207,20 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -230,10 +230,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_9: ; %Flow30 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB1_2 @@ -252,11 +252,11 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[4:5], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v12 @@ -277,19 +277,19 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB1_15 ; CHECK-NEXT: .LBB1_16: ; %Flow34 @@ -423,17 +423,17 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[7:10], v2 -; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo -; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 -; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: s_or_b32 s7, s4, s7 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v3, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v4, vcc_lo +; CHECK-NEXT: v_add_co_u32 v3, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v1, v4, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[3:4], v[7:10] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB2_15 ; CHECK-NEXT: .LBB2_16: ; %Flow36 @@ -450,13 +450,13 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p0_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB3_3 @@ -473,10 +473,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: 
v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop @@ -499,20 +499,20 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB3_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v4 @@ -522,10 +522,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execnz .LBB3_8 ; CHECK-NEXT: .LBB3_9: ; %Flow29 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB3_2 @@ -544,11 +544,11 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[4:5], v12 @@ -569,19 +569,19 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB3_15 ; CHECK-NEXT: .LBB3_16: ; %Flow33 @@ -723,17 +723,17 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo -; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 -; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: s_or_b32 s7, s4, s7 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v3, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v4, vcc_lo +; CHECK-NEXT: v_add_co_u32 v3, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v1, v4, s4 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; CHECK-NEXT: flat_store_dwordx4 v[3:4], v[7:10] ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB4_15 ; CHECK-NEXT: .LBB4_16: ; %Flow36 @@ -751,13 +751,13 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-LABEL: memmove_p1_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB5_3 @@ -773,10 +773,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 
v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop @@ -799,20 +799,20 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB5_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off @@ -822,10 +822,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execnz .LBB5_8 ; CHECK-NEXT: .LBB5_9: ; %Flow30 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB5_2 @@ -844,11 +844,11 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -869,19 +869,19 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; 
CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB5_15 ; CHECK-NEXT: .LBB5_16: ; %Flow34 @@ -897,13 +897,13 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-LABEL: memmove_p1_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB6_3 @@ -919,10 +919,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop @@ -945,20 +945,20 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
global_store_byte v[0:1], v4, off @@ -968,10 +968,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execnz .LBB6_8 ; CHECK-NEXT: .LBB6_9: ; %Flow32 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB6_2 @@ -990,11 +990,11 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -1015,19 +1015,19 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_15 ; CHECK-NEXT: .LBB6_16: ; %Flow36 @@ -1109,13 +1109,13 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p1_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 -; CHECK-NEXT: v_mov_b32_e32 v9, 0 -; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 -; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_and_b32_e32 v6, 15, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v8, -16, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] -; CHECK-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[8:9] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB8_3 @@ -1131,10 +1131,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v7 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop @@ -1157,20 +1157,20 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB8_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 -; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v9, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off @@ -1180,10 +1180,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execnz .LBB8_8 ; CHECK-NEXT: .LBB8_9: ; %Flow31 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB8_2 @@ -1202,11 +1202,11 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off -; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off @@ -1227,19 +1227,19 @@ 
define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo -; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 -; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off -; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 -; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 +; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo +; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v11, vcc_lo +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[8:9] +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off +; CHECK-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB8_15 ; CHECK-NEXT: .LBB8_16: ; %Flow35 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index c6f7ce51f5ea2..9888204b997a9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -260,12 +260,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -274,12 +273,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -311,15 +309,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -328,15 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -345,15 +339,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -362,15 +354,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -379,14 +369,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -395,14 +384,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,15 +399,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -428,15 +414,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -445,14 +429,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX1250-LABEL: 
local_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -679,12 +662,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -692,12 +674,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -720,15 +701,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -736,15 +715,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -752,15 +729,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -768,15 +743,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -784,14 +757,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 
s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -799,14 +771,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -814,15 +785,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -830,15 +799,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -846,15 +813,14 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 
s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index d686e7a2d5b4c..33c516c61e42c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -208,12 +208,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -222,12 +221,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -259,14 +257,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -275,14 +272,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-CU-LABEL: local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
@@ -291,15 +287,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -308,15 +302,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -325,14 +317,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX1250-LABEL: local_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -511,12 +502,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -524,12 +514,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 
0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -552,14 +541,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -567,14 +555,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -582,15 +569,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -598,15 +583,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; 
GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -614,15 +597,14 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 89de17ecbd1e8..6c19722ad6e33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -270,12 +270,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,12 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -330,15 +328,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -349,15 +345,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -366,15 +360,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -383,15 +375,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] @@ -400,14 +390,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -416,14 +405,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -708,12 +696,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -723,12 +710,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -758,15 +744,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -776,15 +760,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -792,15 +774,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -808,15 +788,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -824,14 +802,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc @@ -839,14 +816,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7faa0621aa6d0..7c23b76cec3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -228,12 +228,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -244,12 
+243,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,14 +284,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -302,14 +299,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -578,12 +574,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -594,12 +589,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-CU-NEXT: s_mov_b32 s5, 2
-; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6
+; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5
 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
@@ -629,14 +623,13 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX11-WGP-LABEL: private_volatile_store_1:
 ; GFX11-WGP: ; %bb.0: ; %entry
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-WGP-NEXT: s_mov_b32 s1, 2
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1
 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc
@@ -645,14 +638,13 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX11-CU-LABEL: private_volatile_store_1:
 ; GFX11-CU: ; %bb.0: ; %entry
-; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff
-; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1
-; GFX11-CU-NEXT: s_mov_b32 s1, 2
-; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1
 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4bb653848cbf0..e330c72ba0fc4 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 ; Check that we do not copy agprs to vgprs and back inside the loop.
diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
index 71900a4d1c1e4..32800488f0633 100644
--- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll
@@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh
 ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40
 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0
 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null
 ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null
 ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null
+; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
-; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3
 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo
 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
new file mode 100644
index 0000000000000..b7e6ed26876c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_kernel void @lshl1_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
+; CHECK-LABEL: lshl1_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_mov_b32 s6, s3
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: s_mov_b32 s3, s4
+; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5]
+; CHECK-NEXT: s_mov_b32 s5, s2
+; CHECK-NEXT: s_mov_b32 s2, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
+; CHECK-NEXT: v_lshl_add_u32 v1, v1, 1, s0
+; CHECK-NEXT: buffer_store_b16 v0, v1, s[4:7], null offen
+; CHECK-NEXT: s_endpgm
+  %vaddr = load volatile i32, ptr addrspace(1) %in, align 4
+  %1 = sext i32 %vaddr to i64
+  %gep = getelementptr i16, ptr addrspace(7) %in2, i64 %1
+  store i16 0, ptr addrspace(7) %gep, align 2
+  ret void
+}
+
+define amdgpu_kernel void @lshl2_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) {
+; CHECK-LABEL: lshl2_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: global_load_b32 v1, v0, s[6:7]
scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; CHECK-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i32, ptr addrspace(7) %in2, i64 %1 + store i32 0, ptr addrspace(7) %gep, align 4 + ret void +} + +define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl3_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v2, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v2, v2, 3, s0 +; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i64, ptr addrspace(7) %in2, i64 %1 + store i64 0, ptr addrspace(7) %gep, align 8 + ret void +} + +define amdgpu_kernel void @lshl4_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl4_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_mov_b32 s9, s4 +; CHECK-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v3, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s8, s1 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v4, v3, 4, s0 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: buffer_store_b128 v[0:3], v4, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i128, ptr addrspace(7) %in2, i64 %1 + store i128 0, ptr addrspace(7) %gep, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 6e2d0f6503a20..7e2bfa666a19f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -144,7 +144,7 @@ define amdgpu_kernel void 
@store_private_offset_i8_max_offset_plus2() #0 { ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: global_load_dword [[VADDR:v[0-9]+]], -; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]] +; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}} ; GFX9-NOT [[ADDR]] ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d29847e40dc8b..4681d589ac217 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2989,34 +2989,33 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s12, v0, 0 ; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: s_mul_i32 s4, s13, s10 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s14, v7, v[3:4] +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s9, v5, v[1:2] +; VI-NEXT: v_mov_b32_e32 v7, s13 ; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v7, v[1:2] +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s9, v7, v[4:5] ; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -3370,67 +3369,66 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; 
VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[11:12] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 ; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] ; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 +; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, 
v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3468,37 +3466,36 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 -; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v16, v5, v2 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v17, v6, v1 +; GFX11-NEXT: v_mul_lo_u32 v18, v7, v0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v14, v14, v11, v16 +; GFX11-NEXT: v_add_co_u32 v3, s0, v12, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v6, v0, v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[3:4] +; GFX11-NEXT: 
v_add3_u32 v0, v18, v10, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v2 +; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 3d9c2a29cb9c1..10d4eb029ee35 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -10,46 +10,43 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 -; SI-NEXT: s_bfe_i32 s3, s3, 0x180000 -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s4, s5, 0x180000 +; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_bfe_i32 s0, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s1, s3, 0x180000 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -127,16 +124,15 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; 
GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: @@ -464,29 +460,26 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_bfe_i32 s0, s8, 0x180000 -; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mul_i32 s0, s1, s0 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s4, s5, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 ; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 @@ -494,10 +487,10 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i33: @@ -577,31 +570,29 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b64 s[6:7], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index e29da3a6b000f..1165401a93af8 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -10,46 +10,43 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xffffff -; SI-NEXT: s_and_b32 s3, s3, 0xffffff -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: s_mul_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffffff -; VI-NEXT: s_and_b32 s1, s3, 0xffffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s4, s4, 0xffffff +; VI-NEXT: s_and_b32 s5, s5, 0xffffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -406,16 +403,15 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry ; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -632,33 +628,31 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_and_b32 s3, s3, 0xffff -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_and_b32 s1, s3, 0xffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_lshr_b32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi16_i32: diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll new file mode 100644 index 0000000000000..74ee867959429 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Check that nofpclass attributes on call returns are used in +; selectiondag. 
+ +define internal float @func_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile float, ptr addrspace(1) %ptr + ret float %ld +} + +define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v4, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f32@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v4, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f32_e32 v0, v3, v0 +; CHECK-NEXT: v_readlane_b32 s31, v4, 1 +; CHECK-NEXT: v_readlane_b32 s30, v4, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %min = call float @llvm.minnum.f32(float %call0, float %call1) + ret float %min +} + +define internal <2 x float> @func_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile <2 x float>, ptr addrspace(1) %ptr + ret <2 x float> %ld +} + +define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v3 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f32_e32 v0, v4, v0 +; CHECK-NEXT: v_min_f32_e32 v1, v5, v1 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, 
off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %min = call <2 x float> @llvm.minnum.v2f32(<2 x float> %call0, <2 x float> %call1) + ret <2 x float> %min +} + +define internal double @func_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile double, ptr addrspace(1) %ptr + ret double %ld +} + +define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f64@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f64@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v5, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, v4 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %min = call double @llvm.minnum.f64(double %call0, double %call1) + ret double %min +} + +define float @call_nofpclass_intrinsic_f32(float %x, float %y, float %z) { +; CHECK-LABEL: call_nofpclass_intrinsic_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v1, v1 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %x) + %call1 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %y) + %lt = fcmp olt float %call0, %call1 + %min = select nsz i1 %lt, float %call0, float %call1 + ret float %min +} + +define <2 x half> @call_nofpclass_intrinsic_v2f16(float %x, float %y, float %z, float %w) { +; CHECK-LABEL: call_nofpclass_intrinsic_v2f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v1, v2, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v0, 
v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CHECK-NEXT: s_mov_b32 s4, 0x5040100 +; CHECK-NEXT: v_perm_b32 v0, v1, v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + %call1 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %z, float %w) + %lt = fcmp olt <2 x half> %call0, %call1 + %min = select nsz <2 x i1> %lt, <2 x half> %call0, <2 x half> %call1 + ret <2 x half> %min +} + +define nofpclass(nan inf) { double, double } @aggregate() { +; CHECK-LABEL: aggregate: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, aggregate@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, aggregate@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %call.i.i = call { double, double } @aggregate() + ret { double, double } %call.i.i +} + +declare hidden nofpclass(nan inf) { float, float } @aggregate_f32() + +define { float, float } @aggregate_use(float %z) { +; CHECK-LABEL: aggregate_use: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v41, s16, 2 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, aggregate_f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, aggregate_f32@rel32@hi+12 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v40, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_max_f32_e32 v2, v40, v40 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: v_min_f32_e32 v0, v0, v2 +; CHECK-NEXT: v_min_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v41, 2 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call = 
call nofpclass(nan inf) { float, float } @aggregate_f32() + %i = extractvalue { float, float } %call, 0 + %i1 = extractvalue { float, float } %call, 1 + %min0 = call float @llvm.minnum.f32(float %i, float %z) + %min1 = call float @llvm.minnum.f32(float %i1, float %z) + %insert.0 = insertvalue { float, float } poison, float %min0, 0 + %insert.1 = insertvalue { float, float } %insert.0, float %min1, 1 + ret { float, float } %insert.1 +} + +define internal <5 x double> @func_v5f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_v5f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v11, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v[10:11], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[10:11], off offset:16 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[8:9], v[10:11], off offset:32 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile <5 x double>, ptr addrspace(1) %ptr + ret <5 x double> %ld +} + +define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_v5f64_non_mvt_vector: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v24, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v24, s30, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_v5f64@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_v5f64@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v24, s31, 1 +; CHECK-NEXT: v_mov_b32_e32 v22, v1 +; CHECK-NEXT: v_mov_b32_e32 v23, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v12, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v23 +; CHECK-NEXT: v_mov_b32_e32 v1, v22 +; CHECK-NEXT: v_mov_b32_e32 v14, v2 +; CHECK-NEXT: v_mov_b32_e32 v15, v3 +; CHECK-NEXT: v_mov_b32_e32 v16, v4 +; CHECK-NEXT: v_mov_b32_e32 v17, v5 +; CHECK-NEXT: v_mov_b32_e32 v18, v6 +; CHECK-NEXT: v_mov_b32_e32 v19, v7 +; CHECK-NEXT: v_mov_b32_e32 v20, v8 +; CHECK-NEXT: v_mov_b32_e32 v21, v9 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_min_f64 v[0:1], v[12:13], v[0:1] +; CHECK-NEXT: v_min_f64 v[2:3], v[14:15], v[2:3] +; CHECK-NEXT: v_min_f64 v[4:5], v[16:17], v[4:5] +; CHECK-NEXT: v_min_f64 v[6:7], v[18:19], v[6:7] +; CHECK-NEXT: v_min_f64 v[8:9], v[20:21], v[8:9] +; CHECK-NEXT: v_readlane_b32 s31, v24, 1 +; CHECK-NEXT: v_readlane_b32 s30, v24, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr) + %min = call <5 x double> @llvm.minnum.v5f64(<5 x double> %call0, <5 x double> %call1) + ret <5 x double> %min +} diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 627f4ada95dba..c1f52173c7451 100644 --- 
a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 -; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 -; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 -; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 -; MUBUF-NEXT: s_add_i32 s4, s4, s5 +; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 @@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0 ; FLATSCR-NEXT: scratch_load_dword v2, off, s0 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fba42c494343b..fa452f3717f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2277,3 +2277,181 @@ body: | S_ENDPGM 0 ... 
+ +--- +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... +--- +# Do not delete s_or_b32 because of intervening def of scc +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +# Do not delete s_or_b32 since both operands are sub1. 
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %41:sreg_32 = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 728067edcf399..9afaab5ebcfb6 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -136,27 +136,25 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_or_b32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_or_b32 s0, s2, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX8-NEXT: s_or_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm %or = or i32 %a, %b store i32 %or, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 78207c2cf605e..1177474f5b4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 
v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x 
float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1503,6 +1507,8 @@ define amdgpu_kernel 
void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: 
global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: 
s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; 
PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: 
s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index aab035f811434..b9bf13886d366 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -1,13 +1,49 @@ -; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600 +; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +; R600-LABEL: test: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1, +; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1, +; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1, +; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1, +; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z, +; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z, +; R600-NEXT: OR_INT T0.W, PV.W, PV.Z, +; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X, +; R600-NEXT: OR_INT T0.X, PS, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: test: +; CM: ; %bb.0: ; %entry +; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1, +; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1, +; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1, +; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z, +; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z, +; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, +; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X, +; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 2aae26b9470a8..6381db7b69cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, 
undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -36,10 +36,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -60,10 +60,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %23 ; 
REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def %21 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -79,10 +79,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A: bb.0 (%ir-block.0): ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index d0d5cc11994af..025d9e63436d7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = 
IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,6 +359,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABLE: name: no_sched_barrier_within_bundle + ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index 5fea0aee72ec7..e0266b9f1a5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ -25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: 
(dereferenceable load (s128), addrspace 8) { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..b91bdd2b2fa71 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; 
GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(10) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(9) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; 
GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index f67cbe381bfad..ddb522a82880b 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -1,17 +1,17 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; -global-isel=1 SI run line skipped since store not yet implemented. ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s -; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 declare i64 @llvm.readcyclecounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000000000..f098618018839
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+  ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+  ; CHECK-NEXT: S_BRANCH %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %2:sreg_32 = S_MOV_B32 1
+    undef %3.sub0:sgpr_128 = COPY %2
+    %4:sreg_32 = S_MOV_B32 0
+    undef %5.sub0:sgpr_256 = COPY %4
+    TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    %6:sgpr_128 = COPY killed %3
+    %6.sub1:sgpr_128 = COPY killed %1
+    %7:sreg_32 = COPY $exec_lo
+    %8:sreg_32 = COPY %2
+    %9:sreg_32 = COPY %4
+
+  bb.1:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    %10:sreg_32 = COPY 
killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index a97ef058ce5fa..db8908bcbac67 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -43,17 +43,17 @@ machineFunctionInfo: body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:AGPR_32 */, implicit-def $agpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1638410 /* regdef:AGPR_32 */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39714826 /* regdef:VReg_512 */, def %7, 19136522 /* regdef:VReg_256 */, def %8, 7602186 /* regdef:VReg_128 */, def %9, 5636106 /* regdef:VReg_96 */, def %10, 5636106 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39125002 /* regdef:VReg_512 */, def %7, 18546698 /* regdef:VReg_256 */, def %8, 7012362 /* regdef:VReg_128 */, def %9, 5046282 /* regdef:VReg_96 */, def %10, 5046282 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39714825 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19136521 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39125001 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 18546697 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, killed $agpr1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, killed $agpr1 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..bbc04aa46adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,28 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] -; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] +; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] +; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 @@ -1590,25 +1589,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v5, v12, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, v[8:9] +; GFX9-NEXT: v_add3_u32 v11, v11, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 -; GFX9-NEXT: 
v_mul_lo_u32 v14, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] -; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v12, v11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll index c552f9d283597..88a51e9ccf04c 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; Rematerialization test for fp64 constants (w/ intentionally high register pressure). +; Check to make sure we have at least six constant MOVs, not necessarily consecutive, inside the loop. + ; GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 -; GCN-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN: {{^}}[[LOOP:.LBB[0-9_]+]]: +; GCN-COUNT-6: {{s_mov_b32|v_mov_b32_e32}} {{[sv]}}{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index 35be513c784bb..8ac85fa9c41a2 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -73,7 +73,7 @@ body: | # (1) %0.sub0 + %0.sub0 and (2) %0.sub1 + %0.sub1 # Check that renaming (2) does not inadvertently rename (1). 
# CHECK-LABEL: name: test2 -# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) +# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) name: test2 body: | bb.0: @@ -81,7 +81,7 @@ body: | bb.1: undef %0.sub1:vreg_64 = V_ALIGNBIT_B32_e64 %0.sub0:vreg_64, %0.sub0:vreg_64, 16, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1835018 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1245194 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) S_BRANCH %bb.1 ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 39f64185b9d57..cdd68630bf4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -43,7 +43,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, %5 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index c764bc17f0631..d7b713aa53b86 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit 
$exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index c0f5a7737afb0..3f61c3dbfaf37 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -90,7 +90,7 @@ body: | %other_use:vreg_64_align2 = COPY %4.sub0_sub1 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %6:areg_64_align2 = COPY %5 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,7 +126,7 @@ body: | undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %7:areg_64_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %7 GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store 
(s32), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -202,7 +202,7 @@ body: | %other_use1:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %8:agpr_32 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, %8:agpr_32 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, %8:agpr_32 GLOBAL_STORE_DWORD %0, %8, 0, 0, implicit $exec :: (store (s32), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -273,7 +273,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -287,7 +287,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
@@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -327,7 +327,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 8803f3ae4906f..fc799162e999a 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 52ef811875f88..a6c019bf374d7 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -18,7 +18,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -86,7 +86,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index af882c06e1b4e..b754a6b897159 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[COPY1]], 1835018 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1835017 /* reguse:VGPR_32 */, [[COPY1]], 1835017 /* reguse:VGPR_32 */, [[COPY]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY1]], 1245194 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY1]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] @@ -63,7 +63,7 @@ body: | undef %11.sub0:vreg_512 = COPY %4.sub0 %12:vgpr_32 = COPY %4.sub0 %11.sub1:vreg_512 = COPY %4.sub1 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1835017 /* reguse:VGPR_32 */, %12:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4.sub1:vreg_512 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1245193 /* reguse:VGPR_32 */, %12:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4.sub1:vreg_512 %11.sub2:vreg_512 = COPY undef %1 %11.sub3:vreg_512 = COPY %4.sub3 %11.sub5:vreg_512 = COPY undef %1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3e8e1878e0be5..5edb9669d98eb 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -40,18 +40,18 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %11 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15, 1835018 /* regdef:VGPR_32 */, def %16 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15, 1245194 /* regdef:VGPR_32 */, def %16 ; CHECK-NEXT: 
[[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21, 1835018 /* regdef:VGPR_32 */, def %22 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21, 1245194 /* regdef:VGPR_32 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) @@ -94,21 +94,21 @@ body: | %10:vgpr_32 = IMPLICIT_DEF bb.1: - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11:vgpr_32 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11:vgpr_32 GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, implicit $exec :: (store (s32), addrspace 1) %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %16:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %16:vgpr_32 %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec %18:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %19:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21:vgpr_32, 1245194 /* regdef:VGPR_32 
*/, def %22:vgpr_32 %23:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5.sub1:vreg_64 = COPY %6 %25:vgpr_32 = V_ADD_U32_e32 1, %10, implicit $exec %26:sreg_64_xexec = V_CMP_GT_U32_e64 64, %25, implicit $exec %27:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1835017 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, %18, 1835017 /* reguse:VGPR_32 */, %17, 1835017 /* reguse:VGPR_32 */, %23, 1835017 /* reguse:VGPR_32 */, %19 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1245193 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, %18, 1245193 /* reguse:VGPR_32 */, %17, 1245193 /* reguse:VGPR_32 */, %23, 1245193 /* reguse:VGPR_32 */, %19 DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
new file mode 100644
index 0000000000000..f08facb503f24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir
@@ -0,0 +1,56 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -stress-regalloc=4 -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
+# This test checks the following scenario: the unclustered high-RP-reschedule
+# stage temporarily raises the occupancy target, but no region gets rescheduled
+# because of constraints. In that case, the DAG and MFI min occupancy must not
+# be changed at the end of the unclustered reschedule stage.
+# CHECK: Retrying function scheduling without clustering. Aggressively try to reduce register pressure to achieve occupancy 5.
+# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, min occupancy stays at 4, MFI occupancy stays at 4.
+
+---
+name: no_sched_metric_due_to_spills
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy: 4
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15
+
+    %0:sgpr_32 = COPY $sgpr15
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sgpr_128 = S_LOAD_DWORDX4_IMM %1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
+    undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 84, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    %8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 216, 0 :: (dereferenceable invariant load (s64), addrspace 4)
+    %13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc
+    %14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc
+    %15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc
+    %16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc
+    %17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc
+    %18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc
+    %19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc
+    %20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc
+    %21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc
+    %22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc
+    %23:sreg_32 = nsw S_MUL_I32 %22, %17
+    %24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc
+    S_ENDPGM 0
+
+...
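The new schedule-regpressure-no-unclustered-regions.mir test above pins down a simple invariant: when the unclustered high-RP reschedule stage raises the occupancy target but ends up keeping no region, the previously committed minimum occupancy (4 here) must survive the stage. The guard amounts to something like the C++ sketch below; the names (UnclusteredStage, finalize, MFIMinOccupancy) are illustrative stand-ins, not the actual GCNSchedStrategy interface.

// Illustrative only: hypothetical names, not the real GCNSchedStrategy API.
struct UnclusteredStage {
  unsigned MinOccupancy = 4;    // occupancy committed before the stage
  unsigned TargetOccupancy = 5; // raised target the stage tries to achieve
  bool AnyRegionRescheduled = false;

  // Record whether a region's unclustered schedule was kept.
  void noteRegion(bool Kept) { AnyRegionRescheduled |= Kept; }

  // End of stage: commit the raised target only if some region was actually
  // rescheduled; otherwise neither the DAG's nor the MFI's minimum occupancy
  // may change ("min occupancy stays at 4, MFI occupancy stays at 4").
  void finalize(unsigned &MFIMinOccupancy) {
    if (!AnyRegionRescheduled)
      return;
    MinOccupancy = TargetOccupancy; // e.g. 4 -> 5
    MFIMinOccupancy = MinOccupancy; // keep the function info in sync
  }
};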
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 118c47e680709..cac1fe9605a17 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -46,7 +46,7 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b64 exec, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -75,7 +75,7 @@ bb5: bb6: %i7 = phi float [ 0.000000e+00, %bb5 ], [ %i3, %bb1 ] %i8 = phi float [ 0.000000e+00, %bb5 ], [ 1.000000e+00, %bb1 ] - %i9 = phi float [ undef, %bb5 ], [ %i4, %bb1 ] + %i9 = phi float [ poison, %bb5 ], [ %i4, %bb1 ] %i10 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %i7) %i11 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %i8, float %i9) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %i10, <2 x half> %i11, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..840916aa63949 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,15 +28,20 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -44,26 +49,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -76,19 +74,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -97,8 +84,22 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -106,17 +107,16 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 
0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -153,37 +153,35 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -193,19 +191,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; 
VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -213,24 +200,37 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -266,36 +266,33 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; 
GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -305,26 +302,30 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; 
GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -333,16 +334,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 
offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -435,6 +434,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -448,8 +448,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -457,6 +455,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -503,7 +503,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -546,6 +545,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -559,8 +559,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -568,6 +566,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -974,42 +974,43 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; 
GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -1024,8 +1025,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -1051,15 +1051,20 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -1067,26 +1072,19 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -1099,19 +1097,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -1120,8 +1107,22 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -1129,17 +1130,16 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, 
s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1176,37 +1176,35 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1216,19 +1214,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -1236,24 +1223,37 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1289,36 +1289,33 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1328,26 +1325,30 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -1356,16 +1357,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1415,7 +1415,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1458,6 +1457,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1471,8 +1471,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1526,7 +1526,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1569,6 +1568,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -1582,8 +1582,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1589,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1997,42 +1997,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2048,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,15 +2074,20 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2090,26 +2095,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -2122,19 +2120,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2143,8 +2130,22 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -2152,17 +2153,16 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2199,37 +2199,35 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2239,19 +2237,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -2259,24 +2246,37 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2312,36 +2312,33 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2351,26 +2348,30 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -2379,16 +2380,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2438,7 +2438,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2481,6 +2480,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2494,8 +2494,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2501,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2549,7 +2549,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2592,6 +2591,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -2605,8 +2605,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2612,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3020,42 +3020,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -3070,8 +3071,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -3097,15 +3097,20 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -3113,26 +3118,19 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -3145,19 +3143,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3166,8 +3153,22 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
@@ -3175,17 +3176,16 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3222,37 +3222,35 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -3262,19 +3260,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
 ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
@@ -3282,24 +3269,37 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -3334,65 +3334,66 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
-; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_nop 0
@@ -3401,16 +3402,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -3459,7 +3459,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3502,6 +3501,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3515,8 +3515,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3524,6 +3522,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -3569,7 +3569,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -3612,6 +3611,7 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
@@ -3625,8 +3625,6 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -3634,6 +3632,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword
v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4040,42 +4040,43 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; 
GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4090,8 +4091,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4117,15 +4117,20 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -4133,26 +4138,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 
0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -4165,19 +4163,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -4186,8 +4173,22 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 @@ -4195,17 +4196,16 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword 
v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4242,37 +4242,35 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4282,19 +4280,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: 
buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 @@ -4302,24 +4289,37 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4354,36 +4354,33 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 
offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4393,26 +4390,30 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 +; 
GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -4421,16 +4422,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4479,7 +4479,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4522,6 +4521,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -4535,8 +4535,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4544,6 +4542,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4589,7 +4589,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4632,6 +4631,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -4645,8 +4645,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4654,6 +4652,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -5060,42 +5060,43 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off 
offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5110,8 +5111,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5141,15 +5141,20 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -5157,26 +5162,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -5189,19 +5187,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -5210,8 +5197,22 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 @@ -5219,17 +5220,16 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 
0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5267,37 +5267,35 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5307,19 +5305,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; 
VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -5327,24 +5314,37 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5380,36 +5380,33 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: 
buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5419,26 +5416,30 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword 
v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -5447,16 +5448,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5491,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 
0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5505,8 +5505,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5549,6 +5547,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -5562,8 +5562,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5571,6 +5569,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5602,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5616,8 +5616,6 @@ define amdgpu_hs 
<{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5660,6 +5658,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -5673,8 +5673,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5682,6 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6093,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -6105,29 +6105,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 
0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -6142,8 +6144,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -6172,15 +6173,20 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -6188,26 +6194,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6220,19 +6219,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -6241,8 +6229,22 @@ define 
amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 @@ -6250,17 +6252,16 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6298,37 +6299,35 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: 
buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6338,19 +6337,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 @@ -6358,24 +6346,37 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v8, off, 
s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6411,36 +6412,33 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6450,26 +6448,30 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, 
s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 ; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_nop 0 @@ -6478,16 +6480,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6522,10 +6523,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6536,8 +6537,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6580,6 +6579,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 
offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -6593,8 +6594,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6602,6 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6633,10 +6634,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6647,8 +6648,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6691,6 +6690,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 @@ -6704,8 +6705,6 @@ define amdgpu_gs <{i32, i32, i32, 
float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6713,6 +6712,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -7124,10 +7125,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -7136,29 +7137,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: 
scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -7173,8 +7176,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94a7f245..74a6d7fe39362 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s8, s1, 31 ; GCN-NEXT: s_add_u32 s0, s0, s8 @@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GCN-NEXT: s_sub_u32 s12, 0, s10 -; GCN-NEXT: s_subb_u32 s13, 0, s11 +; GCN-NEXT: s_sub_u32 s0, 0, s10 +; GCN-NEXT: s_subb_u32 s1, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_mul_i32 s15, s13, s0 -; GCN-NEXT: s_mul_i32 s16, s12, s0 -; GCN-NEXT: s_add_i32 s1, s17, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s16 -; GCN-NEXT: s_add_i32 s1, s1, s15 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s16 -; GCN-NEXT: v_readfirstlane_b32 s15, v3 -; GCN-NEXT: s_mul_i32 s17, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s15, s15, s17 -; GCN-NEXT: v_readfirstlane_b32 s17, v0 -; 
GCN-NEXT: s_addc_u32 s17, 0, s17 -; GCN-NEXT: s_mul_i32 s16, s14, s16 -; GCN-NEXT: v_readfirstlane_b32 s18, v4 -; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_addc_u32 s15, s17, s18 -; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_add_u32 s1, s15, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s16 -; GCN-NEXT: s_add_u32 s16, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s15 -; GCN-NEXT: s_mul_i32 s0, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s1, s12, s16 -; GCN-NEXT: s_add_i32 s0, s0, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s16, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s14, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s16, v0 -; GCN-NEXT: s_mul_i32 s13, s16, s0 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_add_u32 s13, s17, s13 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s1, s15, s12 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s0, s14, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s15, s16, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s13, s0, s12 +; GCN-NEXT: v_readfirstlane_b32 s16, v2 +; GCN-NEXT: s_mul_i32 s14, s1, s2 +; GCN-NEXT: s_mul_i32 s15, s0, s2 +; GCN-NEXT: s_add_i32 s13, s16, s13 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s15 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s13 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s15 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_mul_i32 s16, s2, s13 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 +; GCN-NEXT: s_add_u32 s14, s14, s16 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s15, s12, s15 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s17, v4 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s14, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s13, s12, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_addc_u32 s14, 0, s15 +; GCN-NEXT: s_add_u32 s13, s2, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s14, s0, s12 +; GCN-NEXT: s_mul_i32 s1, s1, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s0, s0, s13 +; GCN-NEXT: s_add_i32 s1, s14, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mul_i32 s15, s13, s1 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_add_u32 s15, s17, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_add_u32 s0, 
s15, s0 +; GCN-NEXT: s_addc_u32 s0, s16, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s14 +; GCN-NEXT: s_add_u32 s14, s13, s0 +; GCN-NEXT: s_addc_u32 s15, s12, s1 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 ; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: s_addc_u32 s1, s7, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 -; GCN-NEXT: s_mul_i32 s1, s6, s14 +; GCN-NEXT: s_mul_i32 s1, s6, s15 ; GCN-NEXT: v_readfirstlane_b32 s16, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: s_add_u32 s1, s16, s1 ; GCN-NEXT: s_addc_u32 s4, 0, s4 -; GCN-NEXT: s_mul_i32 s15, s7, s15 +; GCN-NEXT: s_mul_i32 s14, s7, s14 ; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_add_u32 s1, s1, s15 +; GCN-NEXT: s_add_u32 s1, s1, s14 ; GCN-NEXT: s_addc_u32 s1, s4, s16 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 -; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_mul_i32 s14, s7, s15 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s18, s6, s10 +; GCN-NEXT: s_subb_u32 s17, s17, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s11 ; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_cmp_ge_u32 s18, s10 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s11 +; GCN-NEXT: s_cselect_b32 s17, s18, s19 +; GCN-NEXT: s_add_u32 s18, s14, 1 +; GCN-NEXT: s_addc_u32 s19, s15, 0 +; GCN-NEXT: s_add_u32 s20, s14, 2 +; GCN-NEXT: s_addc_u32 s21, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s17, s20, s18 +; GCN-NEXT: s_cselect_b32 s18, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_subb_u32 s4, s7, s16 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: 
s_cselect_b32 s5, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 @@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s4, s6, s5 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s5, s18, s15 +; GCN-NEXT: s_cselect_b32 s4, s17, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s10, 0, s7 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s8, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_mul_i32 s12, s10, s8 -; GCN-NEXT: s_mul_i32 s13, s2, s8 -; GCN-NEXT: s_add_i32 s9, s14, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: s_add_i32 s9, s9, s12 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 -; GCN-NEXT: s_add_u32 s12, s12, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s12, s15, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s9, s11, s9 -; GCN-NEXT: s_add_u32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s12, 0, s13 -; GCN-NEXT: s_add_u32 s13, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s10, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s13, v2 +; GCN-NEXT: s_mul_i32 s11, s8, s3 +; GCN-NEXT: s_mul_i32 s12, s2, s3 +; GCN-NEXT: s_add_i32 s10, s13, s10 
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 +; GCN-NEXT: s_add_i32 s10, s10, s11 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s10 +; GCN-NEXT: s_mul_i32 s14, s3, s10 +; GCN-NEXT: s_add_u32 s11, s11, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s12, s9, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v4 +; GCN-NEXT: s_add_u32 s11, s11, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s11, s14, s13 +; GCN-NEXT: s_addc_u32 s12, s15, 0 +; GCN-NEXT: s_mul_i32 s10, s9, s10 +; GCN-NEXT: s_add_u32 s10, s11, s10 +; GCN-NEXT: s_addc_u32 s11, 0, s12 +; GCN-NEXT: s_add_u32 s10, s3, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s11, s11, s12 -; GCN-NEXT: s_mul_i32 s8, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s10, s10, s13 -; GCN-NEXT: s_mul_i32 s2, s2, s13 -; GCN-NEXT: s_add_i32 s8, s8, s10 +; GCN-NEXT: s_addc_u32 s9, s9, s11 +; GCN-NEXT: s_mul_i32 s11, s2, s9 +; GCN-NEXT: s_mul_i32 s8, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_add_i32 s11, s12, s11 +; GCN-NEXT: s_mul_i32 s2, s2, s10 +; GCN-NEXT: s_add_i32 s8, s11, s8 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mul_i32 s10, s13, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_mul_i32 s12, s10, s8 ; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_add_u32 s10, s14, s10 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s2, s11, s2 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: s_add_u32 s2, s10, s2 -; GCN-NEXT: s_addc_u32 s2, s12, s9 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: s_add_u32 s12, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s2, s9, s2 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: s_add_u32 s2, s12, s2 +; GCN-NEXT: s_addc_u32 s2, s13, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s8, s9, s8 ; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: s_add_u32 s2, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s8, s11, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s11 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s8, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 ; GCN-NEXT: s_mul_i32 s8, s8, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; 
GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s14, s13, s6 +; GCN-NEXT: s_subb_u32 s12, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s7 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, s7 +; GCN-NEXT: s_cselect_b32 s12, s14, s15 +; GCN-NEXT: s_add_u32 s14, s10, 1 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_add_u32 s16, s10, 2 ; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b32 s12, s16, s14 +; GCN-NEXT: s_cselect_b32 s14, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_subb_u32 s8, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s14, 0 +; GCN-NEXT: s_cselect_b32 s6, s12, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass 
si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... +--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... 
+ --- name: valu_write_in_between body: | diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 5c90957edd9f5..bcece19ae5fdd 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -16,11 +16,11 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: s_or_saveexec_b32 s1, -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1 +; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 -; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index a0bac532454f5..e589a6341ea0e 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_sext_i32_i16 s0, s2 -; GCN-NEXT: s_add_i32 s0, s3, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %sext = sext i16 %a to i32 %res = add i32 %b, %sext diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index d8511c8f6be11..17db3799b0de5 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -22,63 +22,57 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshl_b32 s0, s1, s0 -; VI-NEXT: s_lshl_b32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: s_lshl_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s7, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: 
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: s_lshr_b32 s1, s3, 16 -; CI-NEXT: s_lshl_b32 s0, s0, s1 -; CI-NEXT: s_lshl_b32 s1, s2, s3 -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s1, s1, 0xffff -; CI-NEXT: s_or_b32 s0, s1, s0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_lshr_b32 s6, s4, 16 +; CI-NEXT: s_lshr_b32 s7, s5, 16 +; CI-NEXT: s_lshl_b32 s4, s4, s5 +; CI-NEXT: s_lshl_b32 s5, s6, s7 +; CI-NEXT: s_lshl_b32 s5, s5, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s4, s5 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll similarity index 57% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll rename to llvm/test/CodeGen/AMDGPU/shlN_add.ll index 9f4a6f2f63f15..ba8ae9554d0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll @@ -1,4 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s + ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s @@ -7,6 +12,22 @@ ; Test gfx9+ s_shl[1-4]_add_u32 pattern matching define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl1_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to 
shader part epilog +; +; GFX8-SDAG-LABEL: s_shl1_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl1_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl1_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s1 @@ -28,6 +49,22 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl2_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl2_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl2_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl2_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s1 @@ -49,6 +86,22 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl3_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl3_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl3_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl3_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s1 @@ -70,6 +123,22 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { } define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) { +; GFX9-SDAG-LABEL: s_shl4_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl4_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl4_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl4_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s1 @@ -102,6 +171,25 @@ define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) { } define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl1_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl1_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl1_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; 
GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl1_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -126,6 +214,25 @@ define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl2_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl2_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl2_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl2_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -150,6 +257,25 @@ define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl3_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl3_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl3_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl3_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -174,6 +300,25 @@ define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl4_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl4_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl4_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl4_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -198,6 +343,25 @@ define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) { } define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) { +; GFX9-SDAG-LABEL: v_shl5_add_u32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: v_shl5_add_u32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_shl5_add_u32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: v_shl5_add_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -224,6 +388,22 @@ define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) { ; FIXME: Use v_lshl_add_u32 ; shift is scalar, but add is vector. define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl1_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl1_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 @@ -248,6 +428,22 @@ define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl2_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl2_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 @@ -272,6 +468,22 @@ define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl3_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl3_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 3 @@ -296,6 +508,22 @@ define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl4_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl4_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 @@ -320,6 +548,22 @@ define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps float 
@shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { +; GFX9-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: shl5_add_u32_vgpr1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: shl5_add_u32_vgpr1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s0, s0, 5 @@ -344,6 +588,26 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { } define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl1_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl1_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s2 @@ -369,6 +633,26 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl2_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl2_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2 @@ -394,6 +678,26 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl3_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl3_add_u32_v2: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s2 @@ -419,6 +723,26 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl4_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl4_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s2 @@ -444,6 +768,26 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i } define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { +; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX9-SDAG-NEXT: ; return to shader part epilog +; +; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX8-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; ; GFX9-LABEL: s_shl_2_4_add_u32_v2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..287d1dde21403 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void 
@v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..d5998e289c09d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: ; 
GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 
+; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], 
s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 
+898,37 @@ define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> 
asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) { 
; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: 
;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2f32_v3f32__5_3() { } define void @s_shuffle_v2f32_v3f32__5_4() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 
s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %vec1 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2f32_v3f32__1_1() { } define void @s_shuffle_v2f32_v3f32__2_1() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x float> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 676a521757bd8..a86ca0a4a23c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..d46ca61cff64d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; 
GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], 
s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX900: ; %bb.0: 
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2i32_v3i32__2_5(ptr 
addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2i32_v3i32__5_3() { } define void @s_shuffle_v2i32_v3i32__5_4() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: 
s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %vec1 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2i32_v3i32__1_1() { } define void @s_shuffle_v2i32_v3i32__2_1() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x i32> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], 
s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..02fb06ef54d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 
@@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..d0f00f8363aed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: 
global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; 
GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define 
void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def 
$0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { ; 
GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND 
-; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2p3_v3p3__5_3() { } define void @s_shuffle_v2p3_v3p3__5_4() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; 
GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 5, i32 4> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2p3_v3p3__1_1() { } define void @s_shuffle_v2p3_v3p3__2_1() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 1> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 
s[8:9], s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
@@ -3663,12 +3698,13 @@ define void @s_shuffle_v2p3_v3p3__2_4() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:10]
+; GFX942-NEXT: ; def s[4:6]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s5
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:9]
; GFX942-NEXT: ;;#ASMEND
@@ -3897,11 +3933,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:10]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s6
-; GFX900-NEXT: s_mov_b32 s9, s10
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
@@ -3913,11 +3949,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:10]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s6
-; GFX90A-NEXT: s_mov_b32 s9, s10
+; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
@@ -3927,13 +3963,13 @@ define void @s_shuffle_v2p3_v3p3__2_5() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:2]
+; GFX942-NEXT: ; def s[4:6]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s3, s6
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[4:6]
+; GFX942-NEXT: ; def s[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s2
-; GFX942-NEXT: s_mov_b32 s9, s6
+; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:9]
; GFX942-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
index 430f64164d24f..35cf10f1135c9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -111,12 +110,11 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -154,15 +152,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,15 +205,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -263,10 +259,10 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -307,12 +303,12 @@ define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -350,15 +346,15 @@ define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -403,14 +399,14 @@ define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -458,11 +454,11 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,13 +499,13 @@ define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -561,26 +557,25 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -638,13 +633,13 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -696,26 +691,25 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -729,15 +723,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -784,15 +777,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -836,16 +828,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -891,16 +882,15 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1033,13 +1023,13 @@ define void @v_shuffle_v3f32_v2f32__1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1121,16 +1111,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1176,15 +1165,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1230,15 +1218,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3f32_v2f32__3_3_u() {
}

define void @s_shuffle_v3f32_v2f32__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <2 x float> asm "; def $0", "=s"()
+ %vec1 = call <2 x float> asm "; def $0", "=s"()
+ %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+ call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v3f32_v2f32__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_0:
+; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
- call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
- ret void
-}
-
-define void @s_shuffle_v3f32_v2f32__3_3_1() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s5
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s5
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
- %vec0 = call <2 x float> asm "; def $0", "=s"()
- %vec1 = call <2 x float> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+ %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
ret void
}

define void @s_shuffle_v3f32_v2f32__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() {
}

define void @s_shuffle_v3f32_v2f32__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 poison, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3f32_v2f32__0_0_0() {
}

define void @s_shuffle_v3f32_v2f32__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() {
}

define void @s_shuffle_v3f32_v2f32__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 2, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2800,13 +2631,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2823,14 +2653,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s11
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2840,14 +2669,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s11
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2857,14 +2685,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s11
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2881,14 +2708,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s6
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2898,14 +2724,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s6
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2915,14 +2740,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s2
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -3344,47 +3168,18 @@ define void @s_shuffle_v3f32_v2f32__2_2_2() {
}

define void @s_shuffle_v3f32_v2f32__3_2_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_2_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 2, i32 2>
@@ -3393,44 +3188,17 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() {
}

define void @s_shuffle_v3f32_v2f32__3_u_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_u_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 poison, i32 2>
@@ -3446,11 +3214,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s8, s11
; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s6
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -3463,11 +3230,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s8, s11
; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s6
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -3480,11 +3246,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
+; GFX942-NEXT: s_mov_b32 s8, s11
; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s2
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -3497,53 +3262,20 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() {
}

define void @s_shuffle_v3f32_v2f32__3_1_2() {
-; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_1_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=s"()
%vec1 = call <2 x float> asm "; def $0", "=s"()
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
index ef670e963bdb6..befc1126d6fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -153,12 +152,11 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -240,15 +238,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -355,9 +352,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void 
@v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: 
; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; 
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1872,16 +1851,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1931,16 +1908,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1989,16 +1965,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2100,12 +2075,12 @@ define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2184,13 +2159,12 @@ define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2232,10 +2206,10 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2274,12 +2248,12 @@ define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2317,15 +2291,14 @@ define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2374,13 +2347,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2480,16 +2453,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2591,15 +2563,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,12 +2723,11 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2895,13 +2866,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2988,16 +2958,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3103,10 +3071,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3306,9 +3272,8 @@ define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3456,10 +3421,10 @@ define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3545,13 +3510,12 @@ define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3592,15 +3556,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3705,14 +3669,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3760,13 +3723,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3807,12 +3769,12 @@ define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3913,9 +3875,9 @@ define void @v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -3963,15 +3925,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4060,13 +4022,12 @@ define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4150,15 +4111,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4262,15 +4223,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -4316,14 +4277,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
index ea4fac3b1d2b1..51d45922893b3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -111,12 +110,11 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -154,15 +152,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,15 +205,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -263,10 +259,10 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -307,12 +303,12 @@ define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -350,15 +346,15 @@ define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -403,14 +399,14 @@ define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -458,11 +454,11 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,13 +499,13 @@ define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -561,26 +557,25 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -638,13 +633,13 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -696,26 +691,25 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -729,15 +723,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -784,15 +777,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -836,16 +828,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -891,16 +882,15 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[3:4]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1033,13 +1023,13 @@ define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1121,16 +1111,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1176,15 +1165,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1230,15 +1218,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3i32_v2i32__3_3_u() {
}

define void @s_shuffle_v3i32_v2i32__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = call <2 x i32> asm "; def $0", "=s"()
+ %vec1 = call <2 x i32> asm "; def $0", "=s"()
+ %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+ call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
+ ret void
+}
+
+define void @s_shuffle_v3i32_v2i32__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_0:
+; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() {
; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%vec1 = call <2 x i32> asm "; def $0", "=s"()
- %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+ %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
ret void
}

define void @s_shuffle_v3i32_v2i32__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%vec1 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() {
}

define void @s_shuffle_v3i32_v2i32__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 poison, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3i32_v2i32__0_0_0() {
}

define void @s_shuffle_v3i32_v2i32__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 1, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() {
}

define void @s_shuffle_v3i32_v2i32__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3i32_v2i32__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=s"()
%shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> <i32 2, i32 0, i32 0>
call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:10]
; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use s[8:10]
; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() {
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:10]
; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: 
s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3i32_v2i32__2_2_2() { } define void @s_shuffle_v3i32_v2i32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 2, i32 2> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { } define void @s_shuffle_v3i32_v2i32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 poison, i32 2> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { } define void @s_shuffle_v3i32_v2i32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> <i32 3, i32 1, i32 2> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..89e6a2918a68c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: 
global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg 
%ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; 
GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; 
GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr 
addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..25e087bd922ac 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; 
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:4]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1289,13 +1276,12 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1393,12 +1379,11 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1446,13 +1431,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1493,13 +1478,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1539,30 +1523,29 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
-; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; def v[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1570,16 +1553,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
-; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: v_mov_b32_e32 v6, 0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; def v[4:5]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: ;;#ASMSTART
+; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -1595,14 +1578,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1746,15 +1729,14 @@ define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1844,12 +1826,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1890,15 +1872,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1947,15 +1928,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2001,12 +1981,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v0
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2370,7 +2351,29 @@ define void @s_shuffle_v3p3_v2p3__3_3_u() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_3_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s9
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
+  call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v3p3_v2p3__3_3_1() {
+; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
@@ -2380,13 +2383,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s10, s5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
@@ -2396,13 +2399,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s10, s5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_0:
+; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
@@ -2412,115 +2415,31 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() {
 ; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s10, s1
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 0>
-  call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
-  ret void
-}
-
-define void @s_shuffle_v3p3_v2p3__3_3_1() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s9
-; GFX900-NEXT: s_mov_b32 s10, s5
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s9
-; GFX90A-NEXT: s_mov_b32 s10, s5
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s9
-; GFX942-NEXT: s_mov_b32 s10, s1
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-  %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
-  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
+  %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 1>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
   ret void
 }
 
 define void @s_shuffle_v3p3_v2p3__3_3_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 3, i32 2>
@@ -2549,44 +2468,17 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() {
 }
 
 define void @s_shuffle_v3p3_v2p3__u_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__u_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 poison, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2613,47 +2505,18 @@ define void @s_shuffle_v3p3_v2p3__0_0_0() {
 }
 
 define void @s_shuffle_v3p3_v2p3__1_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__1_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 1, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2661,44 +2524,17 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() {
 }
 
 define void @s_shuffle_v3p3_v2p3__2_0_0() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 2, i32 0, i32 0>
   call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2800,13 +2631,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2823,14 +2653,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s11
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2840,14 +2669,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s11
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2857,14 +2685,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s11
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2881,14 +2708,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s6
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s4
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2898,14 +2724,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s6
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s4
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2915,14 +2740,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s2
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3344,47 +3168,18 @@ define void @s_shuffle_v3p3_v2p3__2_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_2_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 2, i32 2>
@@ -3393,44 +3188,17 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_u_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 poison, i32 2>
@@ -3446,11 +3214,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s8, s11
 ; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s6
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3463,11 +3230,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s8, s11
 ; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s6
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3480,11 +3246,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
+; GFX942-NEXT: s_mov_b32 s8, s11
 ; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s2
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3497,53 +3262,20 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_1_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
   %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
   %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
index cecd2a0e4b015..62b9da9fedb95 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -153,12 +152,11 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -240,15 +238,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -355,9 +352,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -403,13 +399,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -495,9 +490,8 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -540,14 +534,12 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -597,16 +589,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -656,15 +646,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -760,14 +749,13 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -813,10 +801,10 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -857,12 +845,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -994,13 +981,12 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1041,12 +1027,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1088,14 +1073,12 @@ define void
@v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1145,16 +1128,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1376,16 +1358,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1574,10 +1555,10 @@ define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1704,15 +1685,14 @@ define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1816,15 +1796,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1872,16 +1851,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1931,16 +1908,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1989,16 +1965,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2100,12 +2075,12 @@ define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2184,13 +2159,12 @@ define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2232,10 +2206,10 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2274,12 +2248,12 @@ define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2317,15 +2291,14 @@ define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2374,13 +2347,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2480,16 +2453,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2591,15 +2563,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2751,12 +2723,11 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2895,13 +2866,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2988,16 +2958,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ;;#ASMEND
v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ 
define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll index fa422e48bbce0..89ce868b03546 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll index ab297c02fe3b5..8e24d6e02f3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; 
GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll index e91433ac4c1f7..d1ff8c658c77d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; 
GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: 
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..8a9a0d1a7ef5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, 
v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index d4ee6fa20cad8..5828e40595f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; 
GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 
v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 
v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..1a7e281e7e138 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..05ebf49b997eb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: 
v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: 
v_shuffle_v4i16_v4i16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 1a669adf2b635..3a659e1753e97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def 
v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 
; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: 
global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index 8039e126590b9..f1c1e4b20f242 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -3272,9 +3272,8 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3286,7 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -3416,12 +3414,11 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -3434,12 +3431,12 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: 
s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3988,12 +3985,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4007,11 +4003,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,12 +4103,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4127,12 +4121,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6083,9 +6076,8 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6102,8 +6094,7 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6241,9 +6232,8 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6256,8 +6246,7 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6716,16 +6705,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6733,17 +6722,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6878,28 +6867,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 
s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], 
s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: 
br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label 
%cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index cb8bbde71f146..ece46b59ba49e 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -6,29 +6,27 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 @@ -78,31 +76,29 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; 
SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 @@ -218,29 +214,27 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, s3 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s4, s5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_cmp_eq_u32 s4, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 5461532184fc5..e836366fd8dbf 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1797,8 +1797,8 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: s_cbranch_scc1 .LBB15_7 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %.lr.ph ; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, 0 +; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_branch .LBB15_3 ; GFX10-WAVE32-NEXT: .LBB15_2: ; %latch ; 
GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 47998767a948c..76f8f484fc763 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -369,42 +369,41 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; CI-LABEL: s_abs_v4i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_ashr_i32 s0, s3, 16 -; CI-NEXT: s_ashr_i32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_sext_i32_i16 s10, s3 -; CI-NEXT: s_sext_i32_i16 s11, s2 -; CI-NEXT: s_sub_i32 s3, 0, s3 -; CI-NEXT: s_sub_i32 s2, 0, s2 -; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_sext_i32_i16 s2, s2 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_ashr_i32 s6, s5, 16 +; CI-NEXT: s_lshr_b32 s9, s5, 16 +; CI-NEXT: s_sext_i32_i16 s10, s5 +; CI-NEXT: s_sub_i32 s5, 0, s5 +; CI-NEXT: s_ashr_i32 s7, s4, 16 +; CI-NEXT: s_lshr_b32 s8, s4, 16 +; CI-NEXT: s_sext_i32_i16 s11, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_sub_i32 s4, 0, s4 ; CI-NEXT: s_sub_i32 s9, 0, s9 -; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_sext_i32_i16 s4, s4 ; CI-NEXT: s_sext_i32_i16 s9, s9 +; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_max_i32 s5, s10, s5 ; CI-NEXT: s_sext_i32_i16 s8, s8 -; CI-NEXT: s_max_i32 s2, s11, s2 -; CI-NEXT: s_max_i32 s3, s10, s3 -; CI-NEXT: s_max_i32 s1, s1, s8 -; CI-NEXT: s_max_i32 s0, s0, s9 -; CI-NEXT: s_add_i32 s3, s3, 2 -; CI-NEXT: s_add_i32 s2, s2, 2 -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s3, s3, 0xffff -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff -; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: s_or_b32 s1, s1, s2 -; CI-NEXT: s_add_i32 s0, s0, 0x20000 -; CI-NEXT: s_add_i32 s1, s1, 0x20000 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v1, s0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_max_i32 s6, s6, s9 +; CI-NEXT: s_max_i32 s4, s11, s4 +; CI-NEXT: s_add_i32 s5, s5, 2 +; CI-NEXT: s_max_i32 s7, s7, s8 +; CI-NEXT: s_lshl_b32 s6, s6, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_add_i32 s4, s4, 2 +; CI-NEXT: s_or_b32 s5, s6, s5 +; CI-NEXT: s_lshl_b32 s6, s7, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s6, s4 +; CI-NEXT: s_add_i32 s5, s5, 0x20000 +; CI-NEXT: s_add_i32 s4, s4, 0x20000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm %z0 = insertelement <4 x i16> poison, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 71e4755b58bf2..c90d7887f2ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,9 +3,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 - ; CHECK: global_load_dword ; 
CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword @@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa ; CHECK-NOT: v_readlane_b32 ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 + entry: %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %i2 = load i64, ptr addrspace(4) %i, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index da48af100d27b..1a0f75e048cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] -; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 50056b62b3397..b5474b8974b29 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 @@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] 
offset:112 @@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 @@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 @@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b13829e0351f6..8ecc0ad65a944 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %14.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY 
[[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:VReg_64 */, %14 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2818057 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll index ef96944abef0e..586579fcaeb93 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -20,33 +20,38 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: s_cmp_eq_u32 s6, s7 ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_mov_b64 s[10:11], exec +; CHECK-NEXT: s_mov_b64 exec, -1 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_mov_b64 exec, s[10:11] ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 2d54ac8283a3a..9686c9d30b97c 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 
-mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 @@ -35,6 +37,26 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_alu: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -126,6 +148,56 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -223,6 +295,25 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -282,6 +373,25 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_half: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: 
scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_half: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -341,6 +451,25 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16_from_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -414,13 +543,39 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -444,7 +599,7 @@ define void @spill_2xi16_from_v2i16() { ; 
GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -520,6 +675,32 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -595,6 +776,25 @@ define void @spill_v2i16() { ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 364598f7cf6c0..90304b2c730cb 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define 
amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %117:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %125:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,138 +44,139 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %112:sgpr_128 - ; CHECK-NEXT: KILL undef %87:sgpr_128 + ; CHECK-NEXT: KILL undef %89:sgpr_128 + ; CHECK-NEXT: KILL undef %118:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 - ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, 
[[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 + ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) + ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM 
undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = 
S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, [[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = 
S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], 
implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) + ; 
CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] @@ -183,30 +184,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: 
(invariant load (s64) from %ir.293, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 + ; CHECK-NEXT: KILL undef %469:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 - ; CHECK-NEXT: KILL undef %443:sreg_64 - ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4) - ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; 
CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] @@ -216,28 +217,28 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4) + ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: 
[[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -308,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec - ; 
CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec @@ -324,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = 
V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec @@ -349,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec - ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc - ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb0417dfa4..862e2dd2de051 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: 
s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: 
s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: s_sub_u32 s10, 0, s4 -; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_sub_u32 s8, 0, s4 +; GCN-NEXT: s_subb_u32 s9, 0, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s8 -; GCN-NEXT: s_mul_i32 s14, s10, s8 -; GCN-NEXT: s_add_i32 s9, s15, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s9, s9, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: s_add_u32 s13, s13, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: s_mul_i32 s14, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s9, s2 +; GCN-NEXT: s_mul_i32 s13, s8, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s14, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_add_u32 s9, s13, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s8, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s9, s10, s14 -; GCN-NEXT: s_add_i32 s8, s8, s11 -; GCN-NEXT: v_mov_b32_e32 v2, 
s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s8 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s8, s10 +; GCN-NEXT: s_mul_i32 s9, s9, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s9, s11, s9 -; GCN-NEXT: s_addc_u32 s9, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s8, s12, s8 -; GCN-NEXT: s_add_u32 s8, s9, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_add_i32 s9, s12, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s8, s13, s8 +; GCN-NEXT: s_addc_u32 s8, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s9, s10, s9 +; GCN-NEXT: s_add_u32 s8, s8, s9 +; GCN-NEXT: s_addc_u32 s9, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s8 +; GCN-NEXT: s_addc_u32 s10, s10, s9 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_addc_u32 s11, 0, s12 ; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 @@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s12, s5 ; GCN-NEXT: s_sub_u32 s16, s6, s4 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 ; GCN-NEXT: s_subb_u32 s17, s15, 0 ; GCN-NEXT: s_cmp_ge_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, -1, 0 @@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_cmp_eq_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, s19, s18 ; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: 
s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_subb_u32 s12, s15, s5 +; GCN-NEXT: s_sub_u32 s13, s16, s4 +; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s13, s13, s16 ; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 ; GCN-NEXT: s_subb_u32 s7, s7, s14 @@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_u32 s14, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s21 ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_sub_u32 s2, 0, s4 -; GCN-NEXT: s_subb_u32 s8, 0, s5 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s6, 0, s5 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s7, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s6 -; GCN-NEXT: s_mul_i32 s11, s2, s6 -; GCN-NEXT: s_add_i32 s7, s12, s7 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s7, s7, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_mul_i32 s13, s6, s7 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s7, s9, s7 -; GCN-NEXT: s_add_u32 s7, s10, s7 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s8, s2, s7 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s6, s3 +; GCN-NEXT: s_mul_i32 s10, s2, s3 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_mul_i32 
s12, s3, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_mul_i32 s8, s7, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s6, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 -; GCN-NEXT: s_add_i32 s6, s7, s6 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s2, s2, s11 -; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s2, s2, s8 +; GCN-NEXT: s_add_i32 s6, s9, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s6 ; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s2, s9, s2 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: s_add_u32 s2, s8, s2 -; GCN-NEXT: s_addc_u32 s2, s10, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: s_addc_u32 s7, s7, 0 -; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s2, s7, s2 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s2, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s6, s7, s6 ; GCN-NEXT: s_add_u32 s2, s2, s6 -; GCN-NEXT: s_addc_u32 s8, 0, s7 -; GCN-NEXT: s_add_u32 s2, s11, s2 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: s_addc_u32 s6, 0, s9 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s6, s7, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 ; GCN-NEXT: s_mul_i32 s6, s6, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_add_u32 s6, s8, s6 @@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_add_i32 s10, s8, s7 ; GCN-NEXT: s_sub_i32 s8, 0, s10 ; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 ; GCN-NEXT: s_subb_u32 s12, s8, s5 ; GCN-NEXT: s_sub_u32 s13, s11, s4 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; 
GCN-NEXT: s_cmp_ge_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s5 +; GCN-NEXT: s_sub_u32 s9, s13, s4 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 ; GCN-NEXT: s_subb_u32 s6, 0, s10 @@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s9, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..802de8037cf6b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: v_writelane_b32 v40, s34, 3 ; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 5c113d80a9c80..0a5160145fbd8 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -11,14 +11,13 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_sub_i32 s0, s2, s3 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sub_i32: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6a273e55fd9a8..82ef28f7339b8 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -223,44 +223,39 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_sub_i32 s0, s1, s0 -; VI-NEXT: s_sub_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_sub_i32 s5, s7, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir index da6b57c776796..2fd1e36c4181e 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir @@ -28,9 +28,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect 
attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -41,9 +41,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def %0.sub1 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def %0.sub1 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -82,9 +82,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0.sub1, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0.sub1, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir 
b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index 9484417e63c98..6706de13bb89b 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -48,7 +48,9 @@ body: | ; CHECK-LABEL: name: sgpr_lo16 ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 30 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]] %0:sreg_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF @@ -232,3 +234,34 @@ body: | $vgpr0 = COPY %3 S_ENDPGM 0, implicit $vgpr0 ... + +# Make sure the immediate materialized by the v_mov_b16 isn't +# incorrectly folded into the bfi as 0. + +# FIXME: %4:vgpr_32 = COPY %3 is a direct copy from v16 to v32 and +# should probably fail the verifier +--- +name: mov_v16_copy_v32_fold_b32_regression +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: mov_v16_copy_v32_fold_b32_regression + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B16_t16_e64_]] + ; CHECK-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 32767, [[COPY2]], [[COPY1]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_BFI_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %3:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = V_BFI_B32_e64 32767, %4, %1, implicit $exec + $vgpr0 = COPY %5 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll new file mode 100644 index 0000000000000..0bebb5849ed81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s + +; Make sure that the 16-bit constant 0x3c00 isn't folded as 0 into +; v_bfi_b32. 
+define i32 @mov16_bfi_fold_regression(half %arg, i32 %arg1) { +; CHECK-LABEL: mov16_bfi_fold_regression: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b16_e32 v2.l, 0x3c00 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 +; CHECK-NEXT: v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %cmp = icmp eq i32 %arg1, 0 + %call = call half @llvm.copysign.f16(half 0xH3C00, half %arg) + %select = select i1 %cmp, half 0xH3C00, half %call + %insertelement = insertelement <2 x half> zeroinitializer, half %select, i64 0 + %bitcast = bitcast <2 x half> %insertelement to i32 + ret i32 %bitcast +} + +declare half @llvm.copysign.f16(half, half) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d80ec6bd34945..8f8e2c0ba52fc 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -655,7 +655,7 @@ bb: br label %bb5 bb5: ; preds = %bb5.backedge, %bb - %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] + %tmp4.i.sroa.0.0 = phi <9 x double> [ poison, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ] %tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 store i32 0, ptr addrspace(5) null, align 4 %tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128 diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir new file mode 100644 index 0000000000000..8ae50d8e0e071 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GCN %s + +# Exercise very basic handling of BUNDLE'd instructions by the two-address-instruction pass. + +# This test is an example where it is best to keep the two-address instruction +# and resolve the tie with a COPY that is expected to be coalesced.
+--- +name: test_fmac_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit [[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec { + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec + ; GCN-NEXT: } + %10:vgpr_32 = COPY $vgpr0 + %11:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + +... + +# This test is an example where conversion to three-address form is beneficial. +--- +name: test_fmac_reuse_bundle +body: | + bb.0: + + ; GCN-LABEL: name: test_fmac_reuse_bundle + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: BUNDLE implicit-def %3, implicit [[DEF]], implicit [[DEF1]], implicit [[COPY]], implicit $mode, implicit $exec { + ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F32_e64 0, killed [[DEF]], 0, killed [[DEF1]], 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: } + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_FMA_F32_e64_]], [[COPY]], 0, implicit $exec + %2:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit %2(tied-def 0), implicit $mode, implicit $exec { + %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec + } + %4:vgpr_32 = V_ADD_U32_e64 %3, %2, 0, implicit $exec + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25e91c8..b000fae124ede 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_add_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_addc_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 063c56faf9ce4..1f93bf7a68972 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -189,67 +189,65 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_u32_e32 v0, s3 -; SI-NEXT: s_sub_i32 s4, 0, s3 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; SI-NEXT: s_sub_i32 s2, 0, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: v_mul_lo_u32 v1, s4, v0 -; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mul_lo_u32 v1, s2, v0 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; SI-NEXT: v_mul_hi_u32 v0, s2, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: s_mul_i32 s0, s0, s3 -; SI-NEXT: s_sub_i32 s0, s2, s0 -; SI-NEXT: s_sub_i32 s1, s0, s3 +; SI-NEXT: v_mul_hi_u32 v0, s4, v0 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_mul_i32 s6, s6, s5 +; SI-NEXT: s_sub_i32 s4, s4, s6 +; SI-NEXT: s_sub_i32 s6, s4, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cmp_ge_u32 s4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: s_cselect_b32 s0, s1, s0 +; SI-NEXT: s_cselect_b32 s4, s6, s4 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cmp_ge_u32 s4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 -; VI-NEXT: s_sub_i32 s4, 0, s3 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: v_cvt_f32_u32_e32 v0, s5 +; VI-NEXT: s_sub_i32 s2, 0, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: v_mul_lo_u32 
v1, s4, v0 -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: v_mul_lo_u32 v1, s2, v0 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s2, v0 -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_mul_i32 s0, s0, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_sub_i32 s1, s0, s3 +; VI-NEXT: v_mul_hi_u32 v0, s4, v0 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_mul_i32 s6, s6, s5 +; VI-NEXT: s_sub_i32 s4, s4, s6 +; VI-NEXT: s_sub_i32 s6, s4, s5 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_cselect_b32 s4, s6, s4 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: s_udiv_i32: diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac80ea55..1c50f930facba 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -720,8 +716,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff @@ -733,25 +727,23 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; 
GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff @@ -763,17 +755,17 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_short v3, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 @@ -786,12 +778,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -800,118 +791,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; 
GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, 
s4 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s11, s2 +; GCN-NEXT: s_subb_u32 s10, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s3 ; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s3 +; GCN-NEXT: s_cselect_b32 s10, s12, s13 +; GCN-NEXT: s_add_u32 s12, s8, 1 ; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_add_u32 s14, s8, 2 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s14, s12 +; GCN-NEXT: s_cselect_b32 s12, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_subb_u32 s0, 0, s9 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: s_cselect_b32 s0, s12, 0 +; GCN-NEXT: s_cselect_b32 s1, s10, s8 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -937,8 +922,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -969,8 +952,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1307,8 +1288,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: 
s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1336,8 +1315,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s3, s3, 0 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-IR-NEXT: s_or_b32 s12, s12, s13 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..22e4a24435f12 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 
v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_u32 s2, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s2, s0, s2 +; GFX11-NEXT: s_subb_u32 s3, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1] +; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 
4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1fe42294..28e6627b87413 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 
-; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: 
s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, 
s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, 
s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: s_addc_u32 s8, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 @@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s0, s2, s8 ; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 ; GCN-NEXT: s_subb_u32 s12, s9, s3 ; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s3 +; GCN-NEXT: s_sub_u32 s9, s13, s2 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_subb_u32 s0, 0, s10 @@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cselect_b32 s0, s8, s0 ; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-IR-NEXT: s_or_b32 s14, s14, s15 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; 
GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db6471b6a46..8a54ad301f48a 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_sub_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_subb_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..94448411cfd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2623,12 
+2625,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2651,12 +2652,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2677,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2701,13 +2701,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 
v[8:9], s4, v0, v3, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v7, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v2, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v7, v5, v[1:2] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2729,16 +2728,16 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v0, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v0, v3, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v2, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v7, v5, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v10, v4, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2808,18 +2807,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX7-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2845,18 +2842,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2876,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2913,19 +2908,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v0, v4, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v0, v5, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v11, v9, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v4, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v11, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v9, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2951,23 +2943,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v4, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v2, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v0, v5, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v9, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v3, v6, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v4, v[14:15] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v11, v15, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3056,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3122,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, 
v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3182,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: 
v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3268,34 +3246,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] 
+; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v0, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v15, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[19:20] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v2, v11, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v4, v13, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v7, v14, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v20, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v3, v10, v[26:27] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v12, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v20, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v1, v8, v[25:26] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v4, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v16, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v6, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3339,39 +3310,34 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v6, v14, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[18:19], null, v0, v8, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v10, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[22:23], null, v4, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[24:25], null, v6, v15, v[17:18] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[19:20] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v4, v13, v[23:24] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v7, v14, v[24:25] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v20, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v10, v[26:27] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[27:28] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v18, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v20, v28, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v1, v8, v[25:26] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v18, v9, v[3:4] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v16, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v22, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v8, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3516,52 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 
v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3650,52 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3771,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, 
v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: 
v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3958,66 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4098,66 +4029,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: 
test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v2, v19, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v0, v17, v[32:33] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v4, v21, v[36:37] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[64:65], null, v14, v30, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[66:67], null, v33, v50, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v6, v23, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v1, v16, v[70:71] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[82:83] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v10, v27, v[51:52] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v12, v29, v[53:54] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v64, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v5, v20, v[83:84] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v7, v22, v[84:85] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v24, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[86:87] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[53:54], null, v31, v48, 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | 
instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v14, v55, v[65:66] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v15, v30, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v26, v[85:86] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v53, v68, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v37, v4, v[81:82] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v33, v1, v[67:68] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v35, v7, v[69:70] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v66, v80, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v17, v64, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v31, v6, v[54:55] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v0, v50, v[9:10] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v67, v80, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v65, v69, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v66, v11, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v16, v52, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v96, v48, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v53, v0, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v80, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v5, v68, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v2, v9, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v4, v[5:6] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir index 8a70a8acd28d3..32cc398740d62 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir @@ -36,7 +36,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/ $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x8a + ; GCN-NEXT: s_set_vgpr_msb 0x458a ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/ $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode @@ -50,7 +50,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/ $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xcf + ; GCN-NEXT: s_set_vgpr_msb 0x8acf ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/ $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index f508df2292e90..7e1c28f8e7bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -22,13 +22,13 @@ body: | $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec ; Single bit change - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4101 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v1 $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode @@ -40,7 +40,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/ $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/ @@ -48,7 +48,7 @@ body: | ; VOP3 - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0x4455 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode @@ -58,32 +58,32 @@ body: | $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode ; Tuple crossing the 256 boundary - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x5511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/ $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 
$vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec ; DPP/tied operand - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x1145 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x4511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec ; DS (addr, data0, and data1 operands) - ; GCN-NEXT: s_set_vgpr_msb 20 + ; GCN-NEXT: s_set_vgpr_msb 0x1114 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1 ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec @@ -93,13 +93,13 @@ body: | ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/ $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x144 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/ $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0 $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec @@ -111,17 +111,17 @@ body: | ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1] $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0 $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec @@ -135,13 +135,13 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: 
flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr @@ -156,12 +156,12 @@ body: | ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec @@ -171,7 +171,7 @@ body: | ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec @@ -183,44 +183,44 @@ body: | ; VGPRs above 512 - ; GCN-NEXT: s_set_vgpr_msb 0xaa + ; GCN-NEXT: s_set_vgpr_msb 0x41aa ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xab + ; GCN-NEXT: s_set_vgpr_msb 0xaaab ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0xabae ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xba + ; GCN-NEXT: s_set_vgpr_msb 0xaeba ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xea + ; GCN-NEXT: s_set_vgpr_msb 0xbaea ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xff + ; GCN-NEXT: s_set_vgpr_msb 0xeaff ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/ $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; GCN-NEXT: s_set_vgpr_msb 0xff42 ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 
/*v512*/ $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4200 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3 $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode @@ -232,12 +232,12 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 11 + ; GCN-NEXT: s_set_vgpr_msb 0xa0b ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0 ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0xb55 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/ early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec @@ -247,6 +247,7 @@ body: | ... # ASM-LABEL: {{^}}vopd: + # DIS-LABEL: <vopd>: --- name: vopd @@ -262,35 +263,35 @@ body: | ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x4104 ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3 $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1 $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4005 ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/ $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/ $vgpr500, $vgpr259 = 
V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 16 + ; GCN-NEXT: s_set_vgpr_msb 0x4410 ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/ $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1000 ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3 $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec @@ -298,7 +299,7 @@ body: | ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5 $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0x40ae ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/ $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec @@ -319,31 +320,31 @@ body: | ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x4445 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode @@ -389,15 +390,15 @@ body: | ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2 $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef 
$vgpr256, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2 $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec @@ -417,7 +418,7 @@ body: | ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x5500 ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2 $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec @@ -431,7 +432,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2 $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode @@ -439,17 +440,17 @@ body: | ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_mov_b32_e32 v0, v1 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/ $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/ ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/ $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec @@ -478,16 +479,18 @@ body: | ; ASM: .LBB{{.*_1}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec - ; No mode switch on fall through + ; Reset on fallthrough block end bb.2: ; ASM-NEXT: %bb.2: - ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_branch - S_NOP 0 + $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_BRANCH %bb.3 ; Reset mode on terminator @@ -496,7 +499,7 @@ body: | ; ASM: .LBB{{.*_3}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_swap_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1 @@ -518,7 +521,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -538,7 +541,7 @@ body: | ; ASM-NEXT: %bb.7: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM-NEXT: ; return to shader part epilog $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec SI_RETURN_TO_EPILOG undef 
$vgpr0, implicit-def $exec @@ -556,7 +559,7 @@ body: | ; ASM-NEXT: %bb.9: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec @@ -574,13 +577,14 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec bb.1: ; ASM: .LBB{{[0-9]+}}_1: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_cbranch_scc0 $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_CBRANCH_SCC0 %bb.1, undef implicit $scc @@ -604,7 +608,7 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM: def v0 ; GCN-NOT: s_set_vgpr_msb ; ASM: use v0 @@ -638,7 +642,7 @@ body: | ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/ BUNDLE implicit-def $vgpr256 { $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -680,7 +684,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 { @@ -709,7 +713,7 @@ body: | ; GCN-NEXT: s_clause 0x3e ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-COUNT-60: v_mov_b32_e32 v1, v1 @@ -823,7 +827,7 @@ body: | ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec @@ -835,11 +839,11 @@ body: | ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3] V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, 
undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index 1b8e126f19ae1..fe16f0d44dd1c 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -945,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. --- name: wait_kmcnt_with_outstanding_vmem_2 tracksRegLiveness: true @@ -971,6 +970,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.0: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc @@ -985,6 +985,225 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... +--- +name: wait_kmcnt_and_wait_loadcnt +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec +... 
+ +--- +name: implicit_handling_of_pending_vmem_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... 
+ +--- +name: pending_vmem_event_between_block +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: pending_vmem_event_between_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: flushing_vmem_cnt_on_block_entry +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... 
+ --- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 19c8e842a1390..2b7e28362724b 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -157,8 +157,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-LABEL: while_break_two_chains_of_phi: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index a42c8ac706d27..75817105e74fd 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612 @@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636 @@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 @@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, 
s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612 @@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2636 @@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608 @@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632 @@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 ; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 ; GFX1250-DAGISEL-NEXT: s_clause 0x3e @@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608 @@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632 @@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37] %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620 @@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644 @@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 
/*v1023*/, s33 offset:3656 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 @@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620 @@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644 @@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call float(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent store float %ret, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 21f0c008366a9..0fdc1a83dddbd 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2029,10 +2029,10 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s2, 0 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, s2 ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index feb6ecd996516..92280b9ad8acf 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -298,14 +298,13 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_xor_b32 s0, s2, s3 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_i32: diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index c3935821c31dd..d9f5ba92e116d 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -5,15 +5,14 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0xffff -; GCN-NEXT: s_add_i32 s0, s3, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %zext = zext i16 %a to i32 %res = add i32 %b, %zext diff --git a/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll new file mode 100644 index 0000000000000..50c676c425ce7 --- /dev/null +++ b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll @@ -0,0 +1,69 @@ +; RUN: llc -mtriple thumbv7em-apple-darwin -o - < %s | FileCheck %s + +%"struct.s1" = type { [19 x i32] } + +define void @f0(ptr byval(%"struct.s1") %0, ptr %1) #1 { +; CHECK-LABEL: _f0: @ @f0 +; CHECK-NEXT: @ %bb.0: +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #76 +; CHECK-NEXT: add.w r9, sp, #84 +; CHECK-NEXT: stm.w r9, {r0, r1, r2, r3} +; CHECK-NEXT: 
mov r0, sp +; CHECK-NEXT: add r1, sp, #84 +; CHECK-NEXT: movs r2, #76 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: LBB0_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r4, [r1], #4 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: str r4, [r3], #4 +; CHECK-NEXT: bne LBB0_1 +; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: add.w r1, r0, #12 +; CHECK-NEXT: add r2, sp, #100 +; CHECK-NEXT: ldr r0, [sp, #160] +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldr r3, [r1], #4 +; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: ldm.w sp, {r1, r2, r3} +; CHECK-NEXT: add sp, #76 +; CHECK-NEXT: pop.w {r4, lr} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: b.w _f1 + tail call void @f1(ptr %1, ptr byval(%"struct.s1") %0) + ret void +} + +declare void @f1(ptr, ptr) + +attributes #1 = { nounwind "frame-pointer"="non-leaf" } diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll index cabd43edff9d6..9e243aec1128d 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,7 +32,7 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. ; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .long [[LABEL_FUNC]] +; CHECK-NEXT: .long _ZL10myCallbacki ;; Function type ID -5212364466660467813 ; CHECK-NEXT: .long 1154849691 ; CHECK-NEXT: .long 3081369122 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll index 3d3974ee6ba3b..8e8881ee722fb 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .long [[LABEL_FUNC]] +; CHECK-NEXT: .long ball ;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 0 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll index 80360041c106a..35e570bdde405 100644 --- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll @@ -29,6 +29,6 @@ declare !type !2 i32 @bar(i8 signext) ; CHECK: Hex dump of section '.llvm.callgraph': ; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154 -; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8 +; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05000000 00a150b8 ;; Verify that the type id 0x308e4b8159bc8654 is in section. ; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30 diff --git a/llvm/test/CodeGen/ARM/ldexp-fp128.ll b/llvm/test/CodeGen/ARM/ldexp-fp128.ll new file mode 100644 index 0000000000000..93fcd39e824fb --- /dev/null +++ b/llvm/test/CodeGen/ARM/ldexp-fp128.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck -check-prefix=LINUX %s + +define fp128 @testExpl(fp128 %val, i32 %a) { +; LINUX-LABEL: testExpl: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r11, lr} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: ldr r12, [sp, #16] +; LINUX-NEXT: str r12, [sp] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: pop {r11, pc} + %call = tail call fp128 @ldexpl(fp128 %val, i32 %a) + ret fp128 %call +} + +declare fp128 @ldexpl(fp128, i32) memory(none) + +define fp128 @test_ldexp_f128_i32(fp128 %val, i32 %a) { +; LINUX-LABEL: test_ldexp_f128_i32: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r11, lr} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: ldr r12, [sp, #16] +; LINUX-NEXT: str r12, [sp] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: pop {r11, pc} + %call = tail call fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a) + ret fp128 %call +} + +define <2 x fp128> @test_ldexp_v2f128_v2i32(<2 x fp128> %val, <2 x i32> %a) { +; LINUX-LABEL: test_ldexp_v2f128_v2i32: +; LINUX: @ %bb.0: +; LINUX-NEXT: push {r4, r5, r6, lr} +; LINUX-NEXT: vpush {d8} +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: mov r5, r3 +; LINUX-NEXT: add r3, sp, #40 +; LINUX-NEXT: mov r6, r2 +; LINUX-NEXT: mov r4, r0 +; LINUX-NEXT: ldm r3, {r0, r1, r2, r3} +; LINUX-NEXT: vldr d8, [sp, #56] +; LINUX-NEXT: vst1.32 {d8[1]}, [sp:32] +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: ldr r12, [sp, #32] +; LINUX-NEXT: vst1.32 {d8[0]}, [sp:32] +; LINUX-NEXT: ldr lr, [sp, #36] +; LINUX-NEXT: str r0, [r4, #16] +; LINUX-NEXT: mov r0, r6 +; LINUX-NEXT: str r1, [r4, #20] +; LINUX-NEXT: mov r1, r5 +; LINUX-NEXT: str r2, [r4, #24] +; LINUX-NEXT: mov r2, r12 +; LINUX-NEXT: str r3, [r4, #28] +; LINUX-NEXT: mov r3, lr +; LINUX-NEXT: bl ldexpl +; LINUX-NEXT: stm r4, {r0, r1, r2, r3} +; LINUX-NEXT: add sp, sp, #8 +; LINUX-NEXT: vpop {d8} +; LINUX-NEXT: pop {r4, r5, r6, pc} + %call = tail call <2 x fp128> @llvm.ldexp.v2f128.v2i32(<2 x fp128> %val, <2 x i32> %a) + ret <2 x fp128> %call +} diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,1004 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck 
-check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, 
s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; 
GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add 
r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; 
IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, 
d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, 
sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] +; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; 
WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; 
CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! 
+; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! +; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; 
GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f128(fp128 %a) ret { fp128, fp128 } %result } diff --git a/llvm/test/CodeGen/ARM/llvm.sincospi.ll b/llvm/test/CodeGen/ARM/llvm.sincospi.ll new file mode 100644 index 0000000000000..91bf0aaf1806a --- /dev/null +++ b/llvm/test/CodeGen/ARM/llvm.sincospi.ll @@ -0,0 +1,249 @@ 
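+; Tests lowering of the llvm.sincospi.* intrinsics on iOS, where they are expected to lower to the ___sincospif/___sincospi libcalls, which return sin(pi*x) and cos(pi*x) through pointer output arguments.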
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=thumbv7-apple-ios7.0.0 < %s | FileCheck %s + +define { half, half } @test_sincospi_f16(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #4] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r0, [sp] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r4, pc} + %result = call { half, half } @llvm.sincospi.f16(half %a) + ret { half, half } %result +} + +define half @test_sincospi_f16_only_use_sin(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_sin: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #4] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.0 = extractvalue { half, half } %result, 0 + ret half %result.0 +} + +define half @test_sincospi_f16_only_use_cos(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_cos: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.1 = extractvalue { half, half } %result, 1 + ret half %result.1 +} + +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f16: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl ___extendhfsf2 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #12] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: ldr r1, [sp, #4] +; CHECK-NEXT: strh.w r0, [sp, #22] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: strh.w r0, [sp, #20] +; CHECK-NEXT: add r0, sp, #20 +; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] +; CHECK-NEXT: ldr r0, [sp, #8] +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: ldr r1, [sp] +; CHECK-NEXT: strh.w r0, [sp, #18] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl ___truncsfhf2 +; CHECK-NEXT: strh.w r0, [sp, #16] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmovl.u16 q9, d8 +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov.32 r0, d18[0] +; CHECK-NEXT: vmov.32 r1, d18[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: vmov.32 r3, d16[1] +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: pop {r4, pc} + %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) + ret { <2 x half>, <2 x half> } %result +} + +define { float, float } 
@test_sincospi_f32(float %a) #0 { +; CHECK-LABEL: test_sincospi_f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldrd r1, r0, [sp], #8 +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: str lr, [sp, #-4]! +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: vldr s1, [sp, #4] +; CHECK-NEXT: vldr s3, [sp] +; CHECK-NEXT: vldr s0, [sp, #12] +; CHECK-NEXT: vldr s2, [sp, #8] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: ldr lr, [sp], #4 +; CHECK-NEXT: bx lr + %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) + ret { <2 x float>, <2 x float> } %result +} + +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v3f32: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: add r1, sp, #12 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: add r1, sp, #4 +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: ldr r0, [sp, #36] +; CHECK-NEXT: vmov d0, r7, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: add.w r2, r4, #16 +; CHECK-NEXT: vmov d1, r5, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vldr s1, [sp, #8] +; CHECK-NEXT: vldr s3, [sp, #12] +; CHECK-NEXT: vldr s2, [sp, #4] +; CHECK-NEXT: vldr s0, [sp] +; CHECK-NEXT: vst1.32 {d1}, [r1:64]! +; CHECK-NEXT: vst1.32 {d0}, [r2:64]! 
+; CHECK-NEXT: bl ___sincospif +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} + %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) + ret { <3 x float>, <3 x float> } %result +} + +define { double, double } @test_sincospi_f64(double %a) #0 { +; CHECK-LABEL: test_sincospi_f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r7, lr} +; CHECK-NEXT: add r7, sp, #4 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: bfc r4, #0, #3 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: subs r4, r7, #4 +; CHECK-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-NEXT: ldrd r2, r3, [sp] +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop {r4, r7, pc} + %result = call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} + +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f64: +; CHECK: @ %bb.0: +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: add r7, sp, #16 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: bfc r4, #0, #3 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: ldr r1, [r7, #8] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: add r2, sp, #24 +; CHECK-NEXT: add r3, sp, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: add r2, sp, #8 +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl ___sincospi +; CHECK-NEXT: vldr d19, [sp, #24] +; CHECK-NEXT: vldr d18, [sp, #8] +; CHECK-NEXT: vldr d17, [sp, #16] +; CHECK-NEXT: vldr d16, [sp] +; CHECK-NEXT: vst1.32 {d18, d19}, [r4]! +; CHECK-NEXT: vst1.32 {d16, d17}, [r4] +; CHECK-NEXT: sub.w r4, r7, #16 +; CHECK-NEXT: mov sp, r4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} + %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll index b85cb3a4f191c..6fff0d9b155ef 100644 --- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll @@ -450,7 +450,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM7-NEXT: .short 9 @ 0x9 ; ARM7-NEXT: .short 10 @ 0xa ; ARM7-NEXT: .short 10 @ 0xa -; ARM7-NEXT: .short 10 @ 0xa +; ARM7-NEXT: .short 0 @ 0x0 ; ARM7-NEXT: .LCPI4_4: ; ARM7-NEXT: .short 341 @ 0x155 ; ARM7-NEXT: .short 292 @ 0x124 @@ -502,7 +502,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM8-NEXT: .short 9 @ 0x9 ; ARM8-NEXT: .short 10 @ 0xa ; ARM8-NEXT: .short 10 @ 0xa -; ARM8-NEXT: .short 10 @ 0xa +; ARM8-NEXT: .short 0 @ 0x0 ; ARM8-NEXT: .LCPI4_4: ; ARM8-NEXT: .short 341 @ 0x155 ; ARM8-NEXT: .short 292 @ 0x124 @@ -554,7 +554,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON7-NEXT: .short 9 @ 0x9 ; NEON7-NEXT: .short 10 @ 0xa ; NEON7-NEXT: .short 10 @ 0xa -; NEON7-NEXT: .short 10 @ 0xa +; NEON7-NEXT: .short 0 @ 0x0 ; NEON7-NEXT: .LCPI4_4: ; NEON7-NEXT: .short 341 @ 0x155 ; NEON7-NEXT: .short 292 @ 0x124 @@ -606,7 +606,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON8-NEXT: .short 9 @ 0x9 ; NEON8-NEXT: .short 10 @ 0xa ; NEON8-NEXT: .short 10 @ 0xa -; NEON8-NEXT: .short 10 @ 0xa +; NEON8-NEXT: .short 0 @ 0x0 ; NEON8-NEXT: .LCPI4_4: ; NEON8-NEXT: .short 341 @ 0x155 ; 
NEON8-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/ARM/xxstructor-nodef.ll b/llvm/test/CodeGen/ARM/xxstructor-nodef.ll new file mode 100644 index 0000000000000..db17b2b1c21ab --- /dev/null +++ b/llvm/test/CodeGen/ARM/xxstructor-nodef.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=arm-unknown-linux-gnueabihf < %s | FileCheck %s + +; This test contains a llvm.global_ctors with no other definitions. Make sure we do not crash in that case. +; CHECK: .section .init_array,"aw",%init_array + +declare ccc void @ghczmbignum_GHCziNumziBackendziSelected_init__prof_init() +@llvm.global_ctors = appending global [1 x {i32, void ()*, i8* }] [{i32, void ()*, i8* }{i32 65535, void ()* @ghczmbignum_GHCziNumziBackendziSelected_init__prof_init, i8* null } ] diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad 
LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 240 diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll index 895b68b5a9145..ce40085feb0d0 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll @@ -25,8 +25,7 @@ define i32 @test(i32, i32) local_unnamed_addr #0 { %11 = sub nsw i32 %7, %9 %12 = icmp slt i32 %10, %11 br i1 %12, label %5, label %13 -; CHECK: r1 = r3 -; CHECK: if r2 s> r3 goto -10 <test+0x40> +; CHECK: if r2 s> r1 goto -10 <test+0x40> ; <label>:13: ; preds = %5, %2 %14 = phi i32 [ 0, %2 ], [ %9, %5 ] diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll new file mode 100644 index 0000000000000..22fba8c1d5f8c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. 
+; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[2].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[2].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll new file mode 100644 index 0000000000000..615fc5ea07eca --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Test for when we have indices into both the array and the vector: ie, s[1][3] + +; cbuffer CB : register(b0) { +; uint4 s[3]; // offset 0, size 16 * 3 +; } +%__cblayout_CB = type <{ [2 x <4 x i32>] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[1][3] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { 
i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: store i32 [[X]], ptr %dst + %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28 + %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4 + store i32 %i8_vecext, ptr %dst, align 4 + + ;; s[2].w + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ;; + ;; It would be nice to avoid the redundant vector creation here, but that's + ;; outside of the scope of this pass. + ;; + ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3 + ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3 + ; CHECK: store i32 [[X_EXT]], ptr %dst + %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2 + %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16 + %typed_vecext = extractelement <4 x i32> %typed_load, i32 3 + store i32 %typed_vecext, ptr %dst, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll new file mode 100644 index 0000000000000..eabc07c2fbb68 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }> + +@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ;; a1[1] + ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`. 
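+;; Each element of a1 is padded out to a full 16-byte cbuffer row, so a1[1] sits at byte offset 16, i.e. cbuffer row 1, element 0, which is what the checks below expect.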
+ ; + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll new file mode 100644 index 0000000000000..6f6166e820a6f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll @@ -0,0 +1,145 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; // offset 0, size 4 (+12) * 3 +; double3 a2[2]; // offset 48, size 24 (+8) * 2 +; float16_t a3[2][2]; // offset 112, size 2 (+14) * 4 +; uint64_t a4[3]; // offset 176, size 8 (+8) * 3 +; int4 a5[2][3][4]; // offset 224, size 16 * 24 +; uint16_t a6[1]; // offset 608, size 2 (+14) * 1 +; int64_t a7[2]; // offset 624, size 8 (+8) * 2 +; bool a8[4]; // offset 656, size 4 (+12) * 4 +; } +%__cblayout_CB = type <{ + <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12), + <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8), + <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14), + <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + [24 x <4 x i32>], + [1 x i16], target("dx.Padding", 14), + <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; a1[1] + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ;; a2[1] + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: 
[[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32 + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ;; a3[0][1] + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112) + %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16 + %a3 = load half, ptr addrspace(2) %a3_gep, align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ;; a4[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176) + %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16 + %a4 = load i64, ptr addrspace(2) %a4_gep, align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ;; a5[1][0][0] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224) + %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192 + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 
x i32> %a5, ptr %a5.i, align 4 + + ;; a6[0] + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608) + %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ;; a7[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624) + %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16 + %a7 = load i64, ptr addrspace(2) %a7_gep, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ;; a8[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656) + %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16 + %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +!0 = !{i32 0, i32 2} +!1 = !{} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll new file mode 100644 index 0000000000000..22994cfc3f48a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll @@ -0,0 +1,64 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. 
+; +; Bug https://github.com/llvm/llvm-project/issues/164517 +; XFAIL: * +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[idx].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[idx].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll new file mode 100644 index 0000000000000..7daebaed70442 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for dynamic indices into arrays in cbuffers. 
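+; Each padded uint element occupies one full 16-byte row, so s[idx] maps directly to row idx, while t starts at byte offset 160 (row 10) and t[idx] maps to row 10 + idx.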
+ +; cbuffer CB : register(b0) { +; uint s[10]; // offset 0, size 4 (+12) * 10 +; uint t[12]; // offset 160, size 4 (+12) * 12 +; } +%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[idx] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: store i32 [[X]], ptr %dst + %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx + %s_load = load i32, ptr addrspace(2) %s_gep, align 4 + store i32 %s_load, ptr %dst, align 4 + + ;; t[idx] + ; + ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]]) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160) + %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx + %t_load = load i32, ptr addrspace(2) %t_gep, align 4 + %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %t_load, ptr %t.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll new file mode 100644 index 0000000000000..65c9a3ec966e9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", 
%__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load float, ptr addrspace(2) %a1_ptr, align 4 + store float %a1, ptr %dst, align 8 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4) + %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8) + %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 4 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12) + %a4 = load half, ptr addrspace(2) %a4_ptr, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14) + %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a6 = load double, ptr addrspace(2) %a6_ptr, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ;; a7 + ; + ; CHECK: [[LOAD:%.*]] = call { 
i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24) + %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll new file mode 100644 index 0000000000000..0156a1a0472ab --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 +; }; +%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + 
; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40) + %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80) + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 
72
+  store <4 x i32> %a5, ptr %a5.i, align 4
+
+  ;; a6
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6)
+  ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]]
+  %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96)
+  %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  store <3 x i16> %a6, ptr %a6.i, align 2
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: [[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3
+  ; CHECK: ret <4 x float> [[FLOAT4_3]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0)
+  ret <4 x float> %hlsl.f16tof32
+}
diff --git a/llvm/test/CodeGen/DirectX/llvm_assume.ll b/llvm/test/CodeGen/DirectX/llvm_assume.ll
new file mode 100644
index 0000000000000..d739592b75d78
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/llvm_assume.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+define void @test_llvm_assume(i1 %0) {
+; CHECK-LABEL: test_llvm_assume
+; CHECK-NEXT: ret void
+  tail call void @llvm.assume(i1 %0)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
index a8557e47b0ea6..475935d2eb135 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
@@ -42,3 +42,68 @@ define void @alloca_2d_gep_test() {
   %3 = getelementptr inbounds nuw [2 x <2 x i32>], ptr %1, i32 0, i32 %2
   ret void
 }
+
+; CHECK-LABEL: subtype_array_test
+define void @subtype_array_test() {
+  ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+  ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+  ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+  ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4
+  ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+  ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
+  ; CHECK: ret void
+  %arr = alloca [8 x [4 x i32]], align 4
+  %i = tail call i32 @llvm.dx.thread.id(i32 0)
+  %gep = getelementptr inbounds nuw [4 x i32], ptr %arr, i32 %i
+  ret void
+}
+
+; CHECK-LABEL: subtype_vector_test
+define void @subtype_vector_test() {
+  ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4
+  ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4
+  ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+  ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; FCHECK: [[flatidx_mul:%.*]] = mul i32 
[[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x <4 x i32>], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw <4 x i32>, ptr %arr, i32 %i + ret void +} + +; CHECK-LABEL: subtype_scalar_test +define void @subtype_scalar_test() { + ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 + ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x [4 x i32]], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i32, ptr %arr, i32 %i + ret void +} + +; CHECK-LABEL: subtype_i8_test +define void @subtype_i8_test() { + ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 + ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr [[alloca_val]], i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] + ; CHECK: ret void + %arr = alloca [8 x [4 x i32]], align 4 + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i8, ptr %arr, i32 %i + ret void +} diff --git a/llvm/test/CodeGen/DirectX/scalarize-global.ll b/llvm/test/CodeGen/DirectX/scalarize-global.ll new file mode 100644 index 0000000000000..ca10f6ece5a85 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/scalarize-global.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=SCHECK,CHECK +; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=FCHECK,CHECK + +@"arrayofVecData" = local_unnamed_addr addrspace(3) global [8 x <4 x i32>] zeroinitializer, align 16 +@"vecData" = external addrspace(3) global <4 x i32>, align 4 + +; SCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [8 x [4 x i32]] zeroinitializer, align 16 +; FCHECK: [[arrayofVecData:@arrayofVecData.*]] = local_unnamed_addr addrspace(3) global [32 x i32] zeroinitializer, align 16 +; CHECK: [[vecData:@vecData.*]] = external addrspace(3) global [4 x i32], align 4 + +; CHECK-LABEL: subtype_array_test +define <4 x i32> @subtype_array_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr 
addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_vector_test +define <4 x i32> @subtype_vector_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw <4 x i32>, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_scalar_test +define <4 x i32> @subtype_scalar_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 0, i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i32, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} + +; CHECK-LABEL: subtype_i8_test +define <4 x i32> @subtype_i8_test() { + ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[arrayofVecData]], i32 [[tid]] + ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 + ; FCHECK: [[flatidx_lshr:%.*]] = lshr i32 [[flatidx_mul]], 2 + ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_lshr]] + ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] + ; CHECK: [[x:%.*]] = load <4 x i32>, ptr addrspace(3) [[gep]], align 4 + ; CHECK: ret <4 x i32> [[x]] + %i = tail call i32 @llvm.dx.thread.id(i32 0) + %gep = getelementptr inbounds nuw i8, ptr addrspace(3) @"arrayofVecData", i32 %i + %x = load <4 x i32>, ptr addrspace(3) %gep, align 4 + ret <4 x i32> %x +} diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll new file mode 100644 index 0000000000000..9016c5d7e8d44 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll @@ -0,0 +1,31 @@ +; RUN: split-file %s %t +; RUN: not opt -S --dxil-translate-metadata %t/low-sm.ll 2>&1 | FileCheck %t/low-sm.ll +; RUN: not opt -S --dxil-translate-metadata %t/low-sm-for-range.ll 2>&1 | FileCheck %t/low-sm-for-range.ll + +; Test that wavesize metadata is only allowed on applicable shader model versions + +;--- low-sm.ll + +; CHECK: Shader model 6.6 or greater is required to specify the "hlsl.wavesize" function attribute + +target triple = "dxil-unknown-shadermodel6.5-compute" + +define void @main() #0 { +entry: + ret void +} 
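+
+; The "hlsl.wavesize" value encodes "min,max,preferred"; a single required
+; size leaves max and preferred as 0, while a nonzero max requests a range,
+; which additionally needs shader model 6.8.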
+ +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- low-sm-for-range.ll + +; CHECK: Shader model 6.8 or greater is required to specify wave size range values of the "hlsl.wavesize" function attribute + +target triple = "dxil-unknown-shadermodel6.7-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll new file mode 100644 index 0000000000000..3ad6c1d034252 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll @@ -0,0 +1,96 @@ +; RUN: split-file %s %t +; RUN: opt -S --dxil-translate-metadata %t/only.ll | FileCheck %t/only.ll +; RUN: opt -S --dxil-translate-metadata %t/min.ll | FileCheck %t/min.ll +; RUN: opt -S --dxil-translate-metadata %t/max.ll | FileCheck %t/max.ll +; RUN: opt -S --dxil-translate-metadata %t/pref.ll | FileCheck %t/pref.ll + +; RUN: llc --filetype=obj %t/only.ll -o - | obj2yaml | FileCheck %t/only.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/min.ll -o - | obj2yaml | FileCheck %t/min.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/max.ll -o - | obj2yaml | FileCheck %t/max.ll --check-prefix=OBJ +; RUN: llc --filetype=obj %t/pref.ll -o - | obj2yaml | FileCheck %t/pref.ll --check-prefix=OBJ + +; Test that wave size/range metadata is correctly generated with the correct tag + +;--- only.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 11, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 16 + +target triple = "dxil-unknown-shadermodel6.6-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- min.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 0, i32 0} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 16 + +target triple = "dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- max.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 32, i32 0} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 32 + +target triple = "dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + +;--- pref.ll + +; CHECK: !dx.entryPoints = !{![[#ENTRY:]]} +; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]} +; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}} +; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 64, i32 32} + +; OBJ: - Name: PSV0 +; OBJ: PSVInfo: +; OBJ: MinimumWaveLaneCount: 16 +; OBJ: MaximumWaveLaneCount: 64 + +target triple = 
"dxil-unknown-shadermodel6.8-compute" + +define void @main() #0 { +entry: + ret void +} + +attributes #0 = { "hlsl.wavesize"="16,64,32" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} diff --git a/llvm/test/CodeGen/Generic/reloc-none.ll b/llvm/test/CodeGen/Generic/reloc-none.ll new file mode 100644 index 0000000000000..0c8b7a57aca83 --- /dev/null +++ b/llvm/test/CodeGen/Generic/reloc-none.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK: .reloc {{.*}}, BFD_RELOC_NONE, foo + +define void @test_reloc_none() { + call void @llvm.reloc.none(metadata !"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll b/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll new file mode 100644 index 0000000000000..b5c3399ce6605 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/and_mask_cmp0_sink.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; Test that 'and' mask is sunk to the cmp use block only if it is masking a single bit +; RUN: llc -march=hexagon --verify-machineinstrs < %s | FileCheck %s + +@A = global i32 zeroinitializer + +define i32 @and_sink1(i32 %a) { +; CHECK-LABEL: and_sink1: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: p0 = !tstbit(r0,#11) +; CHECK-NEXT: r0 = ##A +; CHECK-NEXT: } +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: // %bb0 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: { +; CHECK-NEXT: if (p0) jump:nt .LBB0_1 +; CHECK-NEXT: memw(r0+#0) = #0 +; CHECK-NEXT: } +; CHECK-NEXT: // %bb.2: // %bb2 +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %and = and i32 %a, 2048 + br label %bb0 +bb0: + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb0, label 
%bb2 +bb2: + ret i32 0 +} + +define i32 @and_sink2(i32 %a) { +; CHECK-LABEL: and_sink2: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r1 = and(r0,##2049) +; CHECK-NEXT: r0 = ##A +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: p0 = cmp.eq(r1,#0) +; CHECK-NEXT: } +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_1: // %bb0 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: { +; CHECK-NEXT: if (p0) jump:nt .LBB1_1 +; CHECK-NEXT: memw(r0+#0) = #0 +; CHECK-NEXT: } +; CHECK-NEXT: // %bb.2: // %bb2 +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } + %and = and i32 %a, 2049 + br label %bb0 +bb0: + %cmp = icmp eq i32 %and, 0 + store i32 0, i32* @A + br i1 %cmp, label %bb0, label %bb2 +bb2: + ret i32 0 +} diff --git a/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll new file mode 100644 index 0000000000000..9625a605910c2 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll @@ -0,0 +1,372 @@ +; REQUIRES: hexagon-registered-target, silver +; This tests correct handling of register spills and fills of +; qf operands during register allocation. + +; RUN: llc -mcpu=hexagonv79 -mattr=+hvx-length128b,+hvxv79,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V79 +; RUN: llc -mcpu=hexagonv81 -mattr=+hvx-length128b,+hvxv81,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V81 + +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81: Inserting after conv: [[VREG0:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG0]] +; V79-81-NEXT: Inserting after conv: [[VREG1:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG1]] +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81: Inserting after conv: [[VREG2:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG2]] +; V79-81-NEXT: Inserting after conv: [[VREG3:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG3]] +; V79-81: Finding uses of: renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf +; V79-81-DAG: Inserting after conv: [[VREG4:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG4]] +; V79-81-DAG: Inserting after conv: [[VREG5:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG5]] +; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vadd_sf killed renamable [[VREG2]], killed renamable [[VREG0]] +; V79-81-DAG: Inserting new instruction: $v{{[0-9]+}} = V6_vsub_sf killed renamable $v{{[0-9]+}}, killed renamable $v{{[0-9]+}} +; +; V79-81: Analyzing convert instruction: renamable [[VREG6:\$v[0-9]+]] = V6_vconv_hf_qf32 killed renamable $w{{[0-9]+}} +; V79: Inserting new instruction: [[VREG30:\$v[0-9]+]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG7]], killed [[VREG30]] +; V79: Inserting new instruction: [[VREG30]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG8]], killed [[VREG30]] +; V81: Inserting new instruction: [[VREG7:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG7]] +; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]] + +; V79-81: Analyzing convert instruction: renamable [[VREG9:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable $v{{[0-9]+}} +; V79: Inserting new instruction: [[VREG30]] = V6_vd0 +; V79-NEXT: Inserting new instruction: [[VREG10:\$v[0-9]+]] 
= V6_vadd_sf killed renamable [[VREG10]], killed [[VREG30]] +; V81: Inserting new instruction: [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]] + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +@.str.1 = private unnamed_addr constant [9 x i8] c"0x%08lx \00", align 1 +@.str.3 = private unnamed_addr constant [173 x i8] c"/prj/qct/llvm/devops/aether/hexbuild/test_trees/MASTER/test/regress/features/hexagon/arch_v68/hvx_ieee_fp/hvx_ieee_fp_test.c:126 0 && \22ERROR: Failed to acquire HVX unit.\\n\22\00", align 1 +@__func__.main = private unnamed_addr constant [5 x i8] c"main\00", align 1 +@.str.5 = private unnamed_addr constant [33 x i8] c"half -3 converted to vhf = %.2f\0A\00", align 1 +@.str.6 = private unnamed_addr constant [35 x i8] c"uhalf 32k converted to vhf = %.2f\0A\00", align 1 +@.str.7 = private unnamed_addr constant [32 x i8] c"sf 0.5 converted to vhf = %.2f\0A\00", align 1 +@.str.8 = private unnamed_addr constant [32 x i8] c"vhf 4.0 conveted to ubyte = %d\0A\00", align 1 +@.str.9 = private unnamed_addr constant [32 x i8] c"vhf 2.0 conveted to uhalf = %d\0A\00", align 1 +@.str.10 = private unnamed_addr constant [30 x i8] c"byte 4 conveted to hf = %.2f\0A\00", align 1 +@.str.11 = private unnamed_addr constant [31 x i8] c"ubyte 4 conveted to hf = %.2f\0A\00", align 1 +@.str.12 = private unnamed_addr constant [27 x i8] c"hf -3 conveted to sf = %f\0A\00", align 1 +@.str.13 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to byte = %d\0A\00", align 1 +@.str.14 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to half = %d\0A\00", align 1 +@.str.16 = private unnamed_addr constant [33 x i8] c"max of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1 +@.str.17 = private unnamed_addr constant [33 x i8] c"min of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1 +@.str.18 = private unnamed_addr constant [32 x i8] c"max of sf 0.5 and sf 0.25 = %f\0A\00", align 1 +@.str.19 = private unnamed_addr constant [32 x i8] c"min of sf 0.5 and sf 0.25 = %f\0A\00", align 1 +@.str.21 = private unnamed_addr constant [25 x i8] c"negate of hf 4.0 = %.2f\0A\00", align 1 +@.str.22 = private unnamed_addr constant [23 x i8] c"abs of hf -6.0 = %.2f\0A\00", align 1 +@.str.23 = private unnamed_addr constant [23 x i8] c"negate of sf 0.5 = %f\0A\00", align 1 +@.str.24 = private unnamed_addr constant [22 x i8] c"abs of sf -0.25 = %f\0A\00", align 1 +@.str.26 = private unnamed_addr constant [32 x i8] c"hf add of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.27 = private unnamed_addr constant [32 x i8] c"hf sub of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.28 = private unnamed_addr constant [31 x i8] c"sf add of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.29 = private unnamed_addr constant [31 x i8] c"sf sub of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.30 = private unnamed_addr constant [36 x i8] c"sf add of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.31 = private unnamed_addr constant [36 x i8] c"sf sub of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.33 = private unnamed_addr constant [32 x i8] c"hf mpy of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.34 = private unnamed_addr constant [35 x i8] c"hf accmpy of 4.0 and -6.0 = %.2f\0A\00", align 1 +@.str.35 = private unnamed_addr constant [36 x i8] c"sf mpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1 +@.str.36 = private unnamed_addr constant [39 x i8] c"sf accmpy of hf 4.0 and hf -6.0 = %f\0A\00", align 1 
+@.str.37 = private unnamed_addr constant [31 x i8] c"sf mpy of 0.5 and -0.25 = %f\0A\00", align 1 +@.str.39 = private unnamed_addr constant [25 x i8] c"w copy from sf 0.5 = %f\0A\00", align 1 +@str = private unnamed_addr constant [35 x i8] c"ERROR: Failed to acquire HVX unit.\00", align 1 +@str.40 = private unnamed_addr constant [25 x i8] c"\0AConversion intructions\0A\00", align 1 +@str.41 = private unnamed_addr constant [23 x i8] c"\0AMin/Max instructions\0A\00", align 1 +@str.42 = private unnamed_addr constant [23 x i8] c"\0Aabs/neg instructions\0A\00", align 1 +@str.43 = private unnamed_addr constant [23 x i8] c"\0Aadd/sub instructions\0A\00", align 1 +@str.44 = private unnamed_addr constant [24 x i8] c"\0Amultiply instructions\0A\00", align 1 +@str.45 = private unnamed_addr constant [19 x i8] c"\0Acopy instruction\0A\00", align 1 + +declare dso_local void @print_vector_words(<32 x i32> noundef %x) local_unnamed_addr #0 + +; Function Attrs: nofree nounwind optsize +declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #0 + +; Function Attrs: nounwind optsize +define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 { +entry: + %call = tail call i32 @acquire_vector_unit(i8 noundef zeroext 0) #6 + %tobool.not = icmp eq i32 %call, 0 + br i1 %tobool.not, label %if.then, label %if.end + +if.then: ; preds = %entry + %puts = tail call i32 @puts(ptr nonnull dereferenceable(1) @str) + tail call void @_Assert(ptr noundef nonnull @.str.3, ptr noundef nonnull @__func__.main) #7 + unreachable + +if.end: ; preds = %entry + tail call void @set_double_vector_mode() #6 + %0 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 16384) + %1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 17408) + %2 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -14848) + %3 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608) + %4 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1048576000) + %5 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 -1098907648) + %6 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -3) + %7 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32768) + %puts147 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.40) + %8 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32> %6) + %bc.i = bitcast <32 x i32> %8 to <64 x half> + %9 = extractelement <64 x half> %bc.i, i64 0 + %conv = fpext half %9 to double + %call12 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.5, double noundef %conv) #6 + %10 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32> %7) + %bc.i153 = bitcast <32 x i32> %10 to <64 x half> + %11 = extractelement <64 x half> %bc.i153, i64 0 + %conv14 = fpext half %11 to double + %call15 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.6, double noundef %conv14) #6 + %12 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32> %3, <32 x i32> %3) + %bc.i155 = bitcast <32 x i32> %12 to <64 x half> + %13 = extractelement <64 x half> %bc.i155, i64 0 + %conv17 = fpext half %13 to double + %call18 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.7, double noundef %conv17) #6 + %14 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32> %1, <32 x i32> %1) + %15 = bitcast <32 x i32> %14 to <128 x i8> + %conv.i = extractelement <128 x i8> %15, i64 0 + %conv20 = zext i8 %conv.i to i32 + %call21 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.8, i32 noundef %conv20) #6 + %16 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32> %0) + %17 = bitcast <32 x i32> %16 to <64 x i16> + %conv.i157 = extractelement <64 x i16> %17, i64 0 + %conv23 = sext i16 %conv.i157 to i32 + %call24 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.9, i32 noundef %conv23) #6 + %18 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32> %14) + %bc.i158 = bitcast <64 x i32> %18 to <128 x half> + %19 = extractelement <128 x half> %bc.i158, i64 0 + %conv26 = fpext half %19 to double + %call27 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.10, double noundef %conv26) #6 + %20 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32> %14) + %bc.i159 = bitcast <64 x i32> %20 to <128 x half> + %21 = extractelement <128 x half> %bc.i159, i64 0 + %conv29 = fpext half %21 to double + %call30 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.11, double noundef %conv29) #6 + %22 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32> %8) + %bc.i161 = bitcast <64 x i32> %22 to <64 x float> + %23 = extractelement <64 x float> %bc.i161, i64 0 + %conv32 = fpext float %23 to double + %call33 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.12, double noundef %conv32) #6 + %24 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32> %1, <32 x i32> %1) + %25 = bitcast <32 x i32> %24 to <128 x i8> + %conv.i162 = extractelement <128 x i8> %25, i64 0 + %conv35 = zext i8 %conv.i162 to i32 + %call36 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.13, i32 noundef %conv35) #6 + %26 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32> %1) + %27 = bitcast <32 x i32> %26 to <64 x i16> + %conv.i163 = extractelement <64 x i16> %27, i64 0 + %conv38 = sext i16 %conv.i163 to i32 + %call39 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.14, i32 noundef %conv38) #6 + %28 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32> %0, <32 x i32> %1) + %puts148 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.41) + %bc.i164 = bitcast <32 x i32> %28 to <64 x half> + %29 = extractelement <64 x half> %bc.i164, i64 0 + %conv42 = fpext half %29 to double + %call43 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.16, double noundef %conv42) #6 + %30 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32> %0, <32 x i32> %1) + %bc.i166 = bitcast <32 x i32> %30 to <64 x half> + %31 = extractelement <64 x half> %bc.i166, i64 0 + %conv45 = fpext half %31 to double + %call46 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.17, double noundef %conv45) #6 + %32 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32> %3, <32 x i32> %4) + %bc.i168 = bitcast <32 x i32> %32 to <32 x float> + %33 = extractelement <32 x float> %bc.i168, i64 0 + %conv48 = fpext float %33 to double + %call49 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.18, double noundef %conv48) #6 + %34 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32> %3, <32 x i32> %4) + %bc.i169 = bitcast <32 x i32> %34 to <32 x float> + %35 = extractelement <32 x float> %bc.i169, i64 0 + %conv51 = fpext float %35 to double + %call52 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.19, double noundef %conv51) #6 + %puts149 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.42) + %36 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32> %1) + %bc.i170 = bitcast <32 x i32> %36 to <64 x half> + %37 = extractelement <64 x half> %bc.i170, i64 0 + %conv55 = fpext half %37 to double + %call56 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.21, double noundef %conv55) #6 + %38 = tail call <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32> %2) + %bc.i172 = bitcast <32 x i32> %38 to <64 x half> + %39 = extractelement <64 x half> %bc.i172, i64 0 + %conv58 = fpext half %39 to double + %call59 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.22, double noundef %conv58) #6 + %40 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32> %3) + %bc.i174 = bitcast <32 x i32> %40 to <32 x float> + %41 = extractelement <32 x float> %bc.i174, i64 0 + %conv61 = fpext float %41 to double + %call62 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.23, double noundef %conv61) #6 + %42 = tail call <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32> %5) + %bc.i175 = bitcast <32 x i32> %42 to <32 x float> + %43 = extractelement <32 x float> %bc.i175, i64 0 + %conv64 = fpext float %43 to double + %call65 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.24, double noundef %conv64) #6 + %puts150 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.43) + %44 = tail call <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i176 = bitcast <32 x i32> %44 to <64 x half> + %45 = extractelement <64 x half> %bc.i176, i64 0 + %conv68 = fpext half %45 to double + %call69 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.26, double noundef %conv68) #6 + %46 = tail call <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i178 = bitcast <32 x i32> %46 to <64 x half> + %47 = extractelement <64 x half> %bc.i178, i64 0 + %conv71 = fpext half %47 to double + %call72 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.27, double noundef %conv71) #6 + %48 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i180 = bitcast <32 x i32> %48 to <32 x float> + %49 = extractelement <32 x float> %bc.i180, i64 0 + %conv74 = fpext float %49 to double + %call75 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.28, double noundef %conv74) #6 + %50 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i181 = bitcast <32 x i32> %50 to <32 x float> + %51 = extractelement <32 x float> %bc.i181, i64 0 + %conv77 = fpext float %51 to double + %call78 = tail call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str.29, double noundef %conv77) #6 + %52 = tail call <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i182 = bitcast <64 x i32> %52 to <64 x float> + %53 = extractelement <64 x float> %bc.i182, i64 0 + %conv80 = fpext float %53 to double + %call81 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.30, double noundef %conv80) #6 + %54 = tail call <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i183 = bitcast <64 x i32> %54 to <64 x float> + %55 = extractelement <64 x float> %bc.i183, i64 0 + %conv83 = fpext float %55 to double + %call84 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.31, double noundef %conv83) #6 + %puts151 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.44) + %56 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i184 = bitcast <32 x i32> %56 to <64 x half> + %57 = extractelement <64 x half> %bc.i184, i64 0 + %conv87 = fpext half %57 to double + %call88 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.33, double noundef %conv87) #6 + %58 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32> %56, <32 x i32> %1, <32 x i32> %2) + %bc.i186 = bitcast <32 x i32> %58 to <64 x half> + %59 = extractelement <64 x half> %bc.i186, i64 0 + %conv90 = fpext half %59 to double + %call91 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.34, double noundef %conv90) #6 + %60 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32> %1, <32 x i32> %2) + %bc.i188 = bitcast <64 x i32> %60 to <64 x float> + %61 = extractelement <64 x float> %bc.i188, i64 0 + %conv93 = fpext float %61 to double + %call94 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.35, double noundef %conv93) #6 + %62 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32> %60, <32 x i32> %1, <32 x i32> %2) + %bc.i189 = bitcast <64 x i32> %62 to <64 x float> + %63 = extractelement <64 x float> %bc.i189, i64 0 + %conv96 = fpext float %63 to double + %call97 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.36, double noundef %conv96) #6 + %64 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32> %3, <32 x i32> %5) + %bc.i190 = bitcast <32 x i32> %64 to <32 x float> + %65 = extractelement <32 x float> %bc.i190, i64 0 + %conv99 = fpext float %65 to double + %call100 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.37, double noundef %conv99) #6 + %puts152 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.45) + %66 = tail call <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32> %3) + %bc.i191 = bitcast <32 x i32> %66 to <32 x float> + %67 = extractelement <32 x float> %bc.i191, i64 0 + %conv103 = fpext float %67 to double + %call104 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.39, double noundef %conv103) #6 + ret i32 0 +} + +; Function Attrs: optsize +declare dso_local i32 @acquire_vector_unit(i8 noundef zeroext) local_unnamed_addr #2 + +; Function Attrs: noreturn nounwind optsize +declare dso_local void @_Assert(ptr noundef, ptr noundef) local_unnamed_addr #3 + +; Function Attrs: optsize +declare dso_local void @set_double_vector_mode(...) 
local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind 
willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32>, <32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32>, <32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32>, <32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32>) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) #4 + +; Function Attrs: nofree nounwind +declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5 + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(ptr nocapture noundef readonly) local_unnamed_addr #5 diff --git a/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll new file mode 100644 index 0000000000000..cdb779f5c4e7d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll @@ -0,0 +1,60 @@ +;; RUN: llc --mtriple=hexagon --mcpu=hexagonv81 --mattr=+hvxv81,+hvx-length128b %s -o - | FileCheck %s + +define void @mul_and_sub_1(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <32 x float>, ptr %A, align 4 + %BVec = load <32 x float>, ptr %B, align 4 + %CVec = load <32 x float>, ptr %C, align 4 + %AtBVec = fmul <32 x float> %AVec, %BVec + + %DVec = fsub <32 x float> %CVec, %AtBVec + store <32 x float> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_1 +;; CHECK: vsub(v{{[0-9]+}}.sf,v{{[0-9]+}}.qf32) + + +define void @mul_and_sub_2(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <32 x float>, ptr %A, align 4 + %BVec = load <32 x float>, ptr %B, align 4 + %CVec = load <32 x float>, ptr %C, align 4 + %AtBVec = fmul <32 x float> %AVec, %BVec + + %DVec = fsub <32 x float> %AtBVec, %CVec + store <32 x float> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_2 +;; CHECK: vsub(v{{[0-9]+}}.qf32,v{{[0-9]+}}.sf) + + +define void @mul_and_sub_3(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <64 x half>, ptr %A, align 4 + %BVec = load <64 x half>, ptr %B, align 4 + %CVec = load <64 x half>, ptr %C, align 4 + 
%AtBVec = fmul <64 x half> %AVec, %BVec + + %DVec = fsub <64 x half> %CVec, %AtBVec + store <64 x half> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_3 +;; CHECK: vsub(v{{[0-9]+}}.hf,v{{[0-9]+}}.qf16) + + +define void @mul_and_sub_4(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) { +entry: + %AVec = load <64 x half>, ptr %A, align 4 + %BVec = load <64 x half>, ptr %B, align 4 + %CVec = load <64 x half>, ptr %C, align 4 + %AtBVec = fmul <64 x half> %AVec, %BVec + + %DVec = fsub <64 x half> %AtBVec, %CVec + store <64 x half> %DVec, ptr %D, align 4 + ret void +} +;; CHECK: mul_and_sub_4 +;; CHECK: vsub(v{{[0-9]+}}.qf16,v{{[0-9]+}}.hf) diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll index 620b2acc49520..1c1965d44541f 100644 --- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll +++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s ; CHECK-LABEL: test1: ; CHECK: {{call my_instrprof_handler|r0 = #999}} @@ -14,7 +14,4 @@ entry: } ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn -declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1 - -attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" } -attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn } +declare void @llvm.hexagon.instrprof.custom(ptr, i32) diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll new file mode 100644 index 0000000000000..96b02106fa807 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll @@ -0,0 +1,86 @@ +; Tests lowering of sfclass/dfclass compares. 
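+;
+; A minimal sketch of the shape these patterns match (the @sketch function
+; below is hypothetical, not part of the test): the i1 produced by comparing
+; the sfclass result against zero feeds a select directly, so ISel can keep
+; the classification in a predicate register rather than materializing an
+; i32 and comparing it again.
+;
+;   declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg)
+;
+;   define float @sketch(float %x, float %a, float %b) {
+;     %c = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+;     %p = icmp ne i32 %c, 0
+;     %r = select i1 %p, float %a, float %b
+;     ret float %r
+;   }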
+; Sub-optimal code +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; r2 = p0 +; } +; { +; if (p0.new) r0 = ##1065353216 +; p0 = cmp.eq(r2,#0) +; jumpr r31 +; } +; With the patterns added, we should be generating +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; if (!p0) r0 = ##1065353216 +; jumpr r31 +; } + +; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s + +; CHECK: bb.0.entry1 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_sfadd +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +define float @test1(float noundef %x) { +entry1: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add + ret float %spec.select +} + +; CHECK: bb.0.entry2 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_sfadd +define float @test2(float noundef %x) { +entry2: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00 + ret float %spec.select +} + +; CHECK: bb.0.entry3 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_dfadd +define double @test3(double noundef %x) { +entry3: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add + ret double %spec.select +} + +; CHECK: bb.0.entry4 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_dfadd +define double @test4(double noundef %x) { +entry4: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00 + ret double %spec.select +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg) diff --git a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll new file mode 100644 index 0000000000000..1491729a17f30 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1-HVX.ll @@ -0,0 +1,18 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +define void @f5(<64 x i32> %a0, ptr %a1) { +; CHECK-LABEL: f5: +; CHECK: [[REG0:(r[0-9]+)]] = ##16843009 +; CHECK-DAG: q[[Q0:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK-DAG: q[[Q1:[0-9]+]] = vand(v{{[0-9]+}},[[REG0]]) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v{{[0-9]+}}.b = vpacke(v{{[0-9]+}}.h,v{{[0-9]+}}.h) +; CHECK: v[[VROR:[0-9]+]] = vror(v{{[0-9]+}},r{{[0-9]+}}) +; CHECK: v[[VOR:[0-9]+]] = vor(v[[VROR]],v{{[0-9]+}}) +; CHECK: q{{[0-9]+}} = vand(v[[VOR]],r{{[0-9]+}}) +b0: + %v0 = trunc <64 x i32> %a0 to <64 x i1> + store <64 x i1> %v0, ptr %a1, align 1 + ret void +} + diff --git a/llvm/test/CodeGen/Hexagon/late_instr.ll b/llvm/test/CodeGen/Hexagon/late_instr.ll index 93e5a7dba4b3b..6bd1261ed83d5 100644 --- a/llvm/test/CodeGen/Hexagon/late_instr.ll +++ b/llvm/test/CodeGen/Hexagon/late_instr.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -disable-hsdr < %s | FileCheck %s +; RUN: llc 
-mtriple=hexagon -disable-hsdr -terminal-rule=0 < %s | FileCheck %s ; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets. ; CHECK: or(q{{[0-3]+}},q{{[0-3]+}}) diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll index c16370c3b907d..527f27e56c334 100644 --- a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll +++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll @@ -2,7 +2,7 @@ ; type as first parameter instead of a sf type without ; any conversion instruction of type sf = qf32 -; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s +; RUN: llc -mtriple=hexagon -mattr=+hvx-length128b,+hvxv75,+v75 < %s -o - | FileCheck %s ; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]]) ; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf) @@ -17,5 +17,3 @@ entry: store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2 ret void } - -attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" } diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll index 6993bd672c01a..f2beadfbfa64b 100644 --- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that we generate the correct code when a loop carried value ; is scheduled one stage earlier than its use. The code in diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 006a8b6bfc94a..69b89a680ff5a 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the ; inner loop has 14 packets. diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll index d1b9c51c45a2d..0466b6df46142 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s | FileCheck %s ; Test that the pipeliner correctly generates the operands in the ; epilog.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll index ba479b696f16c..c6631bd9dc16d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis -terminal-rule=0 < %s | FileCheck %s ; Test epilogue generation when reading loop-carried dependency from a previous ; stage. The first epilogue should read value from iteration N-1 of the kernel. diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll index 96a38939dc50e..d90e7c4cde1ca 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 -terminal-rule=0 < %s | FileCheck %s ; For the Phis generated in the epilog, test that we generate the correct ; names for the values coming from the prolog stages. The test below diff --git a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll index 6ca8e94200b7d..2a428ff941a71 100644 --- a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct names for the phis in the kernel for the ; incoming values. In this case, the loop contains a phi and has another phi diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll index 42efe60b96d48..a0aeb80a5fa93 100644 --- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll +++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner -terminal-rule=0 < %s | FileCheck %s ; From coremark. Test that we pipeline the matrix multiplication bitextract ; function. The pipelined code should have two packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll index 1c9cc4a1cf9d8..bbaa8cd635f3e 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the instruction ordering code in the pipeliner fixes up dependences ; between post-increment register definitions and uses so that the register diff --git a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll index 5f1780fce39d2..38893de0b0829 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the pipeliner doesn't cause an assert and correctly pipelines the ; loop. diff --git a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll index 6c8b0638ae5d1..5189812d522c6 100644 --- a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll +++ b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s ; Test that the pipeliner generates correct code when attempting to reuse ; an existing phi. This test case contains a phi that references another diff --git a/llvm/test/CodeGen/Hexagon/vect-qfp.mir b/llvm/test/CodeGen/Hexagon/vect-qfp.mir new file mode 100644 index 0000000000000..6909591ffddf0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect-qfp.mir @@ -0,0 +1,202 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer -disable-qfp-opt-mul=false %s -o - | FileCheck %s --check-prefix=MUL-ENABLED +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s --check-prefix=DEFAULT +# MUL-ENABLED-LABEL: name: qfpAdd32 +# MUL-ENABLED: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vadd_qf32_mix +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vadd_qf32 +# DEFAULT-LABEL: name: qfpAdd32 +# DEFAULT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vadd_qf32_mix +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vadd_qf32 +--- +name: qfpAdd32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vadd_sf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vadd_sf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vadd_sf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +...
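+#
+# In sketch form, the rewrite exercised above: a V6_vadd_sf that consumes the
+# result of a V6_vconv_sf_qf32 is replaced by the mixed-operand form, so the
+# qf32 value is read directly and the conversion can be bypassed (operand
+# order illustrative):
+#
+#   %7 = V6_vconv_sf_qf32 %6
+#   %8 = V6_vadd_sf %5, %7      -->   %8 = V6_vadd_qf32_mix %5, %6
+#
+# A rough IR-level analogue (hypothetical, not part of this test) that lowers
+# to such vadd_sf/vconv_sf_qf32 chains when HVX qfloat is enabled:
+#
+#   define void @add_chain(ptr %a, ptr %b, ptr %c) {
+#     %x = load <32 x float>, ptr %a, align 4
+#     %y = load <32 x float>, ptr %b, align 4
+#     %s1 = fadd <32 x float> %x, %y
+#     %s2 = fadd <32 x float> %s1, %y
+#     store <32 x float> %s2, ptr %c, align 4
+#     ret void
+#   }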
+# MUL-ENABLED-LABEL: name: qfpAdd16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vadd_qf16_mix +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vadd_qf16 +# DEFAULT-LABEL: name: qfpAdd16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vadd_qf16_mix +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vadd_qf16 +--- +name: qfpAdd16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vadd_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vadd_hf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vadd_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpSub32 +# MUL-ENABLED: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vsub_qf32_mix +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vsub_qf32 +# DEFAULT-LABEL: name: qfpSub32 +# DEFAULT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vsub_qf32_mix +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vsub_qf32 +--- +name: qfpSub32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vsub_sf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vsub_sf %7:hvxvr, %5:hvxvr + %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vsub_sf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpSub16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vsub_qf16_mix +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vsub_qf16 +# DEFAULT-LABEL: name: qfpSub16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vsub_qf16_mix +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vsub_qf16 +--- +name: qfpSub16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vsub_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vsub_hf %7:hvxvr, %5:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vsub_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... 
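+#
+# The qfpMul cases below are where the two RUN configurations diverge: with
+# -disable-qfp-opt-mul=false the optimizer also folds conversions that feed
+# multiplies, turning V6_vmpy_qf32_sf of converted operands into
+# V6_vmpy_qf32, while the default run leaves the V6_vmpy_qf32_sf form alone.
+# A rough IR-level analogue of the chained multiplies (hypothetical, not
+# part of this test):
+#
+#   define void @mul_chain(ptr %a, ptr %b, ptr %c, ptr %d) {
+#     %x = load <32 x float>, ptr %a, align 4
+#     %y = load <32 x float>, ptr %b, align 4
+#     %z = load <32 x float>, ptr %c, align 4
+#     %p = fmul <32 x float> %x, %y
+#     %q = fmul <32 x float> %y, %z
+#     %r = fmul <32 x float> %p, %q
+#     store <32 x float> %r, ptr %d, align 4
+#     ret void
+#   }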
+# MUL-ENABLED-LABEL: name: qfpMul32 +# MUL-ENABLED: V6_vmpy_qf32_sf +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vmpy_qf32_sf +# MUL-ENABLED-NEXT: V6_vconv_sf_qf32 +# MUL-ENABLED-NEXT: V6_vmpy_qf32 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# DEFAULT-LABEL: name: qfpMul32 +# DEFAULT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vconv_sf_qf32 +# DEFAULT-NEXT: V6_vmpy_qf32_sf +# DEFAULT-NEXT: V6_vS32Ub_ai +--- +name: qfpMul32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %7:hvxvr = V6_vmpy_qf32_sf %4:hvxvr, %5:hvxvr + %8:hvxvr = V6_vconv_sf_qf32 %7:hvxvr + %9:hvxvr = V6_vmpy_qf32_sf %5:hvxvr, %6:hvxvr + %10:hvxvr = V6_vconv_sf_qf32 %9:hvxvr + %11:hvxvr = V6_vmpy_qf32_sf %8:hvxvr, %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr +... +# MUL-ENABLED-LABEL: name: qfpMul16 +# MUL-ENABLED: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vmpy_qf16_mix_hf +# MUL-ENABLED-NEXT: V6_vconv_hf_qf16 +# MUL-ENABLED-NEXT: V6_vS32Ub_ai +# MUL-ENABLED-NEXT: V6_vmpy_qf16 +# DEFAULT-LABEL: name: qfpMul16 +# DEFAULT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vmpy_qf16_hf +# DEFAULT-NEXT: V6_vconv_hf_qf16 +# DEFAULT-NEXT: V6_vS32Ub_ai +# DEFAULT-NEXT: V6_vmpy_qf16_hf +--- +name: qfpMul16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vmpy_qf16_hf %4:hvxvr, %5:hvxvr + %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr + %8:hvxvr = V6_vmpy_qf16_hf %5:hvxvr, %7:hvxvr + %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr + V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr + %10:hvxvr = V6_vmpy_qf16_hf %7:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr + V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir new file mode 100644 index 0000000000000..482edc8dc242b --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir @@ -0,0 +1,97 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s + + +# CHECK: name: qfp_vilog32 +# CHECK: V6_vilog2_qf32 +--- +name: qfp_vilog32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vilog2_sf $v1 + V6_vS32Ub_ai $r2, 0, $v2 +... + +# CHECK-LABEL: name: qfp_vilog16 +# CHECK: V6_vilog2_qf16 +--- +name: qfp_vilog16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vilog2_hf $v1 + V6_vS32Ub_ai $r2, 0, $v2 +... + +# CHECK: name: qfp_vneg32 +# CHECK: V6_vneg_qf32_qf32 +--- +name: qfp_vneg32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vneg_qf32_sf $v1 + $v3 = V6_vconv_sf_qf32 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... 
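+#
+# The unary cases all follow the same shape as qfp_vneg32 above: an op in
+# its *_sf (or *_hf) form that consumes a freshly converted value is
+# replaced by its *_qf32 (or *_qf16) twin reading the qf register directly,
+# e.g. (sketch only, register names as in the input):
+#
+#   $v1 = V6_vconv_sf_qf32 $v0
+#   $v2 = V6_vneg_qf32_sf $v1     -->   $v2 = V6_vneg_qf32_qf32 $v0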
+ +# CHECK-LABEL: name: qfp_vneg16 +# CHECK: V6_vneg_qf16_qf16 +--- +name: qfp_vneg16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vneg_qf16_hf $v1 + $v3 = V6_vconv_hf_qf16 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... + +# CHECK: name: qfp_vabs32 +# CHECK: V6_vabs_qf32_qf32 +--- +name: qfp_vabs32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_sf_qf32 $v0 + $v2 = V6_vabs_qf32_sf $v1 + $v3 = V6_vconv_sf_qf32 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... + +# CHECK-LABEL: name: qfp_vabs16 +# CHECK: V6_vabs_qf16_qf16 +--- +name: qfp_vabs16 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + $v0 = V6_vL32Ub_ai $r0, 0 + $v1 = V6_vconv_hf_qf16 $v0 + $v2 = V6_vabs_qf16_hf $v1 + $v3 = V6_vconv_hf_qf16 $v2 + V6_vS32Ub_ai $r2, 0, $v3 +... diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll new file mode 100644 index 0000000000000..36670fa801b36 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll @@ -0,0 +1,2239 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-UAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-UAL +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-NUAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-NUAL + +declare signext i32 @bcmp(ptr, ptr, iGRLen) nounwind readonly +declare signext i32 @memcmp(ptr, ptr, iGRLen) nounwind readonly + +define signext i32 @bcmp_size_0(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $a2, $zero +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $a2, $zero +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d 
$ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA64-UAL-NEXT: 
ld.bu $a0, $a0, 2 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte 
Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 
8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; 
LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_31: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 31 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; 
LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 32 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_63: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 63 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 64 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_127: +; LA32: 
# %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: bcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind optsize { +; LA32-LABEL: bcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %bcmp +} + +define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp eq i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: bcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, 0 + ret i1 %ret +} + +define signext i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind optsize { +; CHECK-LABEL: memcmp_size_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_1(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; 
LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: and $a1, $a1, $a2 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.h $a0, $a0, 0 +; LA64-UAL-NEXT: ld.h $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2h $a0, $a0 +; LA64-UAL-NEXT: revb.2h $a1, $a1 +; LA64-UAL-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-UAL-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: lu12i.w $a4, 15 +; LA32-UAL-NEXT: ori $a4, $a4, 3840 +; LA32-UAL-NEXT: and $a5, $a0, $a4 +; LA32-UAL-NEXT: or $a2, $a5, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: and $a2, $a1, $a4 +; LA32-UAL-NEXT: or $a2, $a2, $a3 +; 
LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a2, $a1 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 16 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 16 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 
16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: srli.w $a4, $a2, 8 +; LA32-UAL-NEXT: lu12i.w $a5, 15 +; LA32-UAL-NEXT: ori $a5, $a5, 3840 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a2, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a6, $a2, $a5 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a2, $a2, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a6 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: srli.w $a4, $a3, 8 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a5, $a3, $a5 +; LA32-UAL-NEXT: slli.w $a5, $a5, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: bne $a2, $a3, .LBB26_2 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB26_2: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a2, $a3 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 
+; LA32-UAL-NEXT: ori $a6, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a7, $a3, $a6 +; LA32-UAL-NEXT: slli.w $a7, $a7, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a7 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a6, $a4, $a6 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB27_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a3, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a3, $a0, $a2 +; LA32-UAL-NEXT: and $a4, $a1, $a2 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB27_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB27_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: 
srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB28_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB28_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB28_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a2, $a2 +; LA64-UAL-NEXT: addi.w $a4, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a3 +; LA64-UAL-NEXT: addi.w $a5, $a3, 0 +; LA64-UAL-NEXT: bne $a4, $a5, .LBB28_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: revb.2w $a2, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a3, 0 +; LA64-UAL-NEXT: bne $a0, $a1, .LBB28_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB28_3: # %res_block +; LA64-UAL-NEXT: addi.w $a0, $a3, 0 +; LA64-UAL-NEXT: addi.w $a1, $a2, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; 
LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB29_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w 
$a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB30_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB30_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB30_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB30_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, 
$a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, 
.LBB31_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB31_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB31_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB31_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB31_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_31: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 31 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; 
LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB32_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 32 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB33_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_63: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 63 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 64 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_127(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_128(ptr %s1, ptr %s2) nounwind optsize { +; LA32-LABEL: memcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind optsize { +; LA32-LABEL: memcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d 
$ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %memcmp +} + +define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: ld.bu $a2, $a1, 1 +; LA32-NUAL-NEXT: ld.bu $a3, $a1, 0 +; LA32-NUAL-NEXT: ld.bu $a4, $a1, 2 +; LA32-NUAL-NEXT: ld.bu $a1, $a1, 3 +; LA32-NUAL-NEXT: slli.w $a2, $a2, 8 +; LA32-NUAL-NEXT: or $a2, $a2, $a3 +; LA32-NUAL-NEXT: slli.w $a3, $a4, 16 +; LA32-NUAL-NEXT: slli.w $a1, $a1, 24 +; LA32-NUAL-NEXT: or $a1, $a1, $a3 +; LA32-NUAL-NEXT: or $a1, $a1, $a2 +; LA32-NUAL-NEXT: ld.bu $a2, $a0, 1 +; LA32-NUAL-NEXT: ld.bu $a3, $a0, 0 +; LA32-NUAL-NEXT: ld.bu $a4, $a0, 2 +; LA32-NUAL-NEXT: ld.bu $a0, $a0, 3 +; LA32-NUAL-NEXT: slli.w $a2, $a2, 8 +; LA32-NUAL-NEXT: or $a2, $a2, $a3 +; LA32-NUAL-NEXT: slli.w $a3, $a4, 16 +; LA32-NUAL-NEXT: slli.w $a0, $a0, 24 +; LA32-NUAL-NEXT: or $a0, $a0, $a3 +; LA32-NUAL-NEXT: or $a0, $a0, $a2 +; LA32-NUAL-NEXT: xor $a0, $a0, $a1 +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: ld.bu $a2, $a1, 1 +; LA64-NUAL-NEXT: ld.bu $a3, $a1, 0 +; LA64-NUAL-NEXT: ld.bu $a4, $a1, 2 +; LA64-NUAL-NEXT: ld.b $a1, $a1, 3 +; LA64-NUAL-NEXT: slli.d $a2, $a2, 8 +; LA64-NUAL-NEXT: or $a2, $a2, $a3 +; LA64-NUAL-NEXT: slli.d $a3, $a4, 16 +; LA64-NUAL-NEXT: slli.d $a1, $a1, 24 +; LA64-NUAL-NEXT: or $a1, $a1, $a3 +; LA64-NUAL-NEXT: or $a1, $a1, $a2 +; LA64-NUAL-NEXT: ld.bu $a2, $a0, 1 +; LA64-NUAL-NEXT: ld.bu $a3, $a0, 0 +; LA64-NUAL-NEXT: ld.bu $a4, $a0, 2 +; LA64-NUAL-NEXT: ld.b $a0, $a0, 3 +; LA64-NUAL-NEXT: slli.d $a2, $a2, 8 +; LA64-NUAL-NEXT: or $a2, $a2, $a3 +; LA64-NUAL-NEXT: slli.d $a3, $a4, 16 +; LA64-NUAL-NEXT: slli.d $a0, $a0, 24 +; LA64-NUAL-NEXT: or $a0, $a0, $a3 +; LA64-NUAL-NEXT: or $a0, $a0, $a2 +; LA64-NUAL-NEXT: xor $a0, $a0, $a1 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp eq i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and 
$a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize { +; LA32-UAL-LABEL: memcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, 0 + ret i1 %ret +} diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp.ll new file mode 100644 index 0000000000000..c1bf850baa8c3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/expandmemcmp.ll @@ -0,0 +1,3106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-UAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=+ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-UAL +; RUN: sed 's/iGRLen/i32/g' %s | llc --mtriple=loongarch32 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA32,LA32-NUAL +; RUN: sed 's/iGRLen/i64/g' %s | llc --mtriple=loongarch64 --mattr=-ual \ +; RUN: | FileCheck %s --check-prefixes=CHECK,LA64,LA64-NUAL + +declare signext i32 @bcmp(ptr, ptr, iGRLen) nounwind readonly +declare signext i32 @memcmp(ptr, ptr, iGRLen) nounwind readonly + +define signext i32 @bcmp_size_0(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $a2, $zero +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $a2, $zero +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_2(ptr %s1, ptr %s2) nounwind { +; 
LA32-UAL-LABEL: bcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_3(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 2 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 2 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_4(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_4: +; 
LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 
4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w $a1, $a1, 3 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; 
LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_31: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $t0, $a0, 12 +; LA32-UAL-NEXT: ld.w $t1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a5, $t0, $t1 +; LA32-UAL-NEXT: ld.w $a6, $a0, 16 +; LA32-UAL-NEXT: ld.w $a7, $a1, 16 +; LA32-UAL-NEXT: ld.w $t0, $a0, 20 +; LA32-UAL-NEXT: ld.w $t1, $a1, 20 +; LA32-UAL-NEXT: ld.w $t2, $a0, 24 +; LA32-UAL-NEXT: ld.w $t3, $a1, 24 +; LA32-UAL-NEXT: ld.w $a0, $a0, 27 +; LA32-UAL-NEXT: ld.w $a1, $a1, 27 +; LA32-UAL-NEXT: xor $a6, $a6, $a7 +; LA32-UAL-NEXT: xor $a7, $t0, $t1 +; LA32-UAL-NEXT: xor $t0, $t2, $t3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a2, $a4, $a5 +; LA32-UAL-NEXT: or $a3, $a6, $a7 +; LA32-UAL-NEXT: or $a0, $t0, $a0 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a0, $a3, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w 
$a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $t0, $a0, 12 +; LA32-UAL-NEXT: ld.w $t1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a5, $t0, $t1 +; LA32-UAL-NEXT: ld.w $a6, $a0, 16 +; LA32-UAL-NEXT: ld.w $a7, $a1, 16 +; LA32-UAL-NEXT: ld.w $t0, $a0, 20 +; LA32-UAL-NEXT: ld.w $t1, $a1, 20 +; LA32-UAL-NEXT: ld.w $t2, $a0, 24 +; LA32-UAL-NEXT: ld.w $t3, $a1, 24 +; LA32-UAL-NEXT: ld.w $a0, $a0, 28 +; LA32-UAL-NEXT: ld.w $a1, $a1, 28 +; LA32-UAL-NEXT: xor $a6, $a6, $a7 +; LA32-UAL-NEXT: xor $a7, $t0, $t1 +; LA32-UAL-NEXT: xor $t0, $t2, $t3 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a2, $a4, $a5 +; LA32-UAL-NEXT: or $a3, $a6, $a7 +; LA32-UAL-NEXT: or $a0, $t0, $a0 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a0, $a3, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a0, $a4, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $t0, $a0, 24 +; LA64-UAL-NEXT: ld.d $t1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a5, $t0, $t1 +; LA64-UAL-NEXT: ld.d $a6, $a0, 32 +; LA64-UAL-NEXT: ld.d $a7, 
$a1, 32 +; LA64-UAL-NEXT: ld.d $t0, $a0, 40 +; LA64-UAL-NEXT: ld.d $t1, $a1, 40 +; LA64-UAL-NEXT: ld.d $t2, $a0, 48 +; LA64-UAL-NEXT: ld.d $t3, $a1, 48 +; LA64-UAL-NEXT: ld.d $a0, $a0, 55 +; LA64-UAL-NEXT: ld.d $a1, $a1, 55 +; LA64-UAL-NEXT: xor $a6, $a6, $a7 +; LA64-UAL-NEXT: xor $a7, $t0, $t1 +; LA64-UAL-NEXT: xor $t0, $t2, $t3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a2, $a4, $a5 +; LA64-UAL-NEXT: or $a3, $a6, $a7 +; LA64-UAL-NEXT: or $a0, $t0, $a0 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: or $a0, $a3, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_63: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 63 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_size_64: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a4, $a0, 8 +; LA64-UAL-NEXT: ld.d $a5, $a1, 8 +; LA64-UAL-NEXT: ld.d $a6, $a0, 16 +; LA64-UAL-NEXT: ld.d $a7, $a1, 16 +; LA64-UAL-NEXT: ld.d $t0, $a0, 24 +; LA64-UAL-NEXT: ld.d $t1, $a1, 24 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a3, $a4, $a5 +; LA64-UAL-NEXT: xor $a4, $a6, $a7 +; LA64-UAL-NEXT: xor $a5, $t0, $t1 +; LA64-UAL-NEXT: ld.d $a6, $a0, 32 +; LA64-UAL-NEXT: ld.d $a7, $a1, 32 +; LA64-UAL-NEXT: ld.d $t0, $a0, 40 +; LA64-UAL-NEXT: ld.d $t1, $a1, 40 +; LA64-UAL-NEXT: ld.d $t2, $a0, 48 +; LA64-UAL-NEXT: ld.d $t3, $a1, 48 +; LA64-UAL-NEXT: ld.d $a0, $a0, 56 +; LA64-UAL-NEXT: ld.d $a1, $a1, 56 +; LA64-UAL-NEXT: xor $a6, $a6, $a7 +; LA64-UAL-NEXT: xor $a7, $t0, $t1 +; LA64-UAL-NEXT: xor $t0, $t2, $t3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a1, $a2, $a3 +; LA64-UAL-NEXT: or $a2, $a4, $a5 +; LA64-UAL-NEXT: or $a3, $a6, $a7 +; LA64-UAL-NEXT: or $a0, $t0, $a0 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: or $a0, $a3, $a0 +; LA64-UAL-NEXT: or $a0, $a1, $a0 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_size_64: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 64 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl bcmp +; 
LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: bcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %bcmp +} + +define signext i32 @bcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind { +; LA32-LABEL: bcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl bcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %bcmp +} + +define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 
12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 16) + %ret = icmp eq i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_gt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, 0 + ret i1 %ret +} + +define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_le_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; 
LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-NEXT: slti $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_le_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-NEXT: slti $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_le_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: slti $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_le_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %bcmp, 1 + ret i1 %ret +} + +define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: bcmp_ge_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ori $a0, $zero, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: bcmp_ge_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ori $a0, $zero, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_ge_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA32-NUAL-NEXT: slt $a0, $a1, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: bcmp_ge_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA64-NUAL-NEXT: slt $a0, $a1, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %bcmp, -1 + ret i1 %ret +} + +define signext i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind { +; CHECK-LABEL: memcmp_size_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 0) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_1(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_1: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_1: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a1, $a1, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_1: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w 
$sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 1 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_1: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 1 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 1) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_2: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: and $a1, $a1, $a2 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_2: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.h $a0, $a0, 0 +; LA64-UAL-NEXT: ld.h $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2h $a0, $a0 +; LA64-UAL-NEXT: revb.2h $a1, $a1 +; LA64-UAL-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-UAL-NEXT: bstrpick.d $a1, $a1, 15, 0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_2: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 2 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_2: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 2 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 2) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_3: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.bu $a2, $a0, 2 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA32-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA32-UAL-NEXT: lu12i.w $a4, 15 +; LA32-UAL-NEXT: ori $a4, $a4, 3840 +; LA32-UAL-NEXT: and $a5, $a0, $a4 +; LA32-UAL-NEXT: or $a2, $a5, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a2, $a0 +; LA32-UAL-NEXT: and $a2, $a1, $a4 +; LA32-UAL-NEXT: or $a2, $a2, $a3 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a2, $a1 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_3: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, 
$a0, 2 +; LA64-UAL-NEXT: ld.hu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 2 +; LA64-UAL-NEXT: ld.hu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 16 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 16 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_3: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 3 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_3: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 3 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 3) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_4: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a2, $a0, $a1 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a2 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_4: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_4: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_4: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, 
$sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_5: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: srli.w $a4, $a2, 8 +; LA32-UAL-NEXT: lu12i.w $a5, 15 +; LA32-UAL-NEXT: ori $a5, $a5, 3840 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a2, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a6, $a2, $a5 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a2, $a2, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a6 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: srli.w $a4, $a3, 8 +; LA32-UAL-NEXT: and $a4, $a4, $a5 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: and $a5, $a3, $a5 +; LA32-UAL-NEXT: slli.w $a5, $a5, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: bne $a2, $a3, .LBB28_2 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.bu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.bu $a1, $a1, 4 +; LA32-UAL-NEXT: sub.w $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB28_2: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a2, $a3 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_5: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.bu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.bu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_5: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 5 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_5: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 5 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 5) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_6: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a6, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a7, $a3, $a6 +; LA32-UAL-NEXT: slli.w $a7, $a7, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a7 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 
8 +; LA32-UAL-NEXT: and $a5, $a5, $a6 +; LA32-UAL-NEXT: srli.w $a7, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a7 +; LA32-UAL-NEXT: and $a6, $a4, $a6 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.hu $a0, $a0, 4 +; LA32-UAL-NEXT: ld.hu $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 8 +; LA32-UAL-NEXT: or $a0, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a3, $a1, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 8 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: ori $a2, $a2, 4095 +; LA32-UAL-NEXT: and $a3, $a0, $a2 +; LA32-UAL-NEXT: and $a4, $a1, $a2 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB29_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB29_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_6: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.hu $a2, $a0, 4 +; LA64-UAL-NEXT: ld.wu $a0, $a0, 0 +; LA64-UAL-NEXT: ld.hu $a3, $a1, 4 +; LA64-UAL-NEXT: ld.wu $a1, $a1, 0 +; LA64-UAL-NEXT: slli.d $a2, $a2, 32 +; LA64-UAL-NEXT: or $a0, $a0, $a2 +; LA64-UAL-NEXT: slli.d $a2, $a3, 32 +; LA64-UAL-NEXT: or $a1, $a1, $a2 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_6: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 6 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_6: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 6 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 6) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_7: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 3 +; LA32-UAL-NEXT: ld.w 
$a1, $a1, 3 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB30_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB30_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_7: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a2, $a0, 0 +; LA64-UAL-NEXT: ld.w $a3, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a2, $a2 +; LA64-UAL-NEXT: addi.w $a4, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a3 +; LA64-UAL-NEXT: addi.w $a5, $a3, 0 +; LA64-UAL-NEXT: bne $a4, $a5, .LBB30_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.w $a0, $a0, 3 +; LA64-UAL-NEXT: ld.w $a1, $a1, 3 +; LA64-UAL-NEXT: revb.2w $a2, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a2, 0 +; LA64-UAL-NEXT: revb.2w $a3, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a3, 0 +; LA64-UAL-NEXT: bne $a0, $a1, .LBB30_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB30_3: # %res_block +; LA64-UAL-NEXT: addi.w $a0, $a3, 0 +; LA64-UAL-NEXT: addi.w $a1, $a2, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_7: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 7 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_7: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 7 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 7) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_8: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 
+; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_3 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a0, $a0, 4 +; LA32-UAL-NEXT: ld.w $a1, $a1, 4 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB31_3 +; LA32-UAL-NEXT: # %bb.2: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB31_3: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_8: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a0, $a0, 0 +; LA64-UAL-NEXT: ld.d $a1, $a1, 0 +; LA64-UAL-NEXT: revb.d $a0, $a0 +; LA64-UAL-NEXT: revb.d $a1, $a1 +; LA64-UAL-NEXT: sltu $a2, $a0, $a1 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: sub.d $a0, $a0, $a2 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_8: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 8 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_8: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 8 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 8) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_15: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w 
$a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 11 +; LA32-UAL-NEXT: ld.w $a1, $a1, 11 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB32_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB32_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_15: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 7 +; LA64-UAL-NEXT: ld.d $a1, $a1, 7 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB32_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB32_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_15: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 15 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload 
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_15: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 15 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 15) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_16: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; 
LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB33_5 +; LA32-UAL-NEXT: # %bb.4: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB33_5: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_16: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_3 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB33_3 +; LA64-UAL-NEXT: # %bb.2: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB33_3: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_16: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_16: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_31: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; 
LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a3, $a0, 12 +; LA32-UAL-NEXT: ld.w $a4, $a1, 12 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.4: # %loadbb4 +; LA32-UAL-NEXT: ld.w $a3, $a0, 16 +; LA32-UAL-NEXT: ld.w $a4, $a1, 16 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.5: # %loadbb5 +; LA32-UAL-NEXT: ld.w $a3, $a0, 20 +; LA32-UAL-NEXT: ld.w $a4, $a1, 20 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; 
LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.6: # %loadbb6 +; LA32-UAL-NEXT: ld.w $a3, $a0, 24 +; LA32-UAL-NEXT: ld.w $a4, $a1, 24 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.7: # %loadbb7 +; LA32-UAL-NEXT: ld.w $a0, $a0, 27 +; LA32-UAL-NEXT: ld.w $a1, $a1, 27 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB34_9 +; LA32-UAL-NEXT: # %bb.8: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB34_9: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_31: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 23 +; LA64-UAL-NEXT: ld.d $a1, $a1, 23 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB34_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB34_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: 
memcmp_size_31: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 31 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 31) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a3, $a0, 0 +; LA32-UAL-NEXT: ld.w $a4, $a1, 0 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: lu12i.w $a2, 15 +; LA32-UAL-NEXT: ori $a2, $a2, 3840 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.1: # %loadbb1 +; LA32-UAL-NEXT: ld.w $a3, $a0, 4 +; LA32-UAL-NEXT: ld.w $a4, $a1, 4 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.2: # %loadbb2 +; LA32-UAL-NEXT: ld.w $a3, $a0, 8 +; LA32-UAL-NEXT: ld.w $a4, $a1, 8 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.3: # %loadbb3 +; LA32-UAL-NEXT: ld.w $a3, $a0, 12 +; LA32-UAL-NEXT: ld.w $a4, $a1, 12 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: 
and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.4: # %loadbb4 +; LA32-UAL-NEXT: ld.w $a3, $a0, 16 +; LA32-UAL-NEXT: ld.w $a4, $a1, 16 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.5: # %loadbb5 +; LA32-UAL-NEXT: ld.w $a3, $a0, 20 +; LA32-UAL-NEXT: ld.w $a4, $a1, 20 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.6: # %loadbb6 +; LA32-UAL-NEXT: ld.w $a3, $a0, 24 +; LA32-UAL-NEXT: ld.w $a4, $a1, 24 +; LA32-UAL-NEXT: srli.w $a5, $a3, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a3, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a3, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a3, $a3, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a6 +; LA32-UAL-NEXT: or $a3, $a3, $a5 +; LA32-UAL-NEXT: srli.w $a5, $a4, 8 +; LA32-UAL-NEXT: and $a5, $a5, $a2 +; LA32-UAL-NEXT: srli.w $a6, $a4, 24 +; LA32-UAL-NEXT: or $a5, $a5, $a6 +; LA32-UAL-NEXT: and $a6, $a4, $a2 +; LA32-UAL-NEXT: slli.w $a6, $a6, 8 +; LA32-UAL-NEXT: slli.w $a4, $a4, 24 +; LA32-UAL-NEXT: or $a4, $a4, $a6 +; LA32-UAL-NEXT: or $a4, $a4, $a5 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # %bb.7: # %loadbb7 +; LA32-UAL-NEXT: ld.w $a0, $a0, 28 +; LA32-UAL-NEXT: ld.w $a1, $a1, 28 +; LA32-UAL-NEXT: srli.w $a3, $a0, 8 +; LA32-UAL-NEXT: and $a3, $a3, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a3, $a3, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a2 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a3, $a0, $a3 +; LA32-UAL-NEXT: srli.w $a0, $a1, 8 +; LA32-UAL-NEXT: and $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: and $a2, $a1, $a2 +; LA32-UAL-NEXT: slli.w $a2, $a2, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: or $a4, $a1, $a0 +; LA32-UAL-NEXT: bne $a3, $a4, .LBB35_9 +; LA32-UAL-NEXT: # 
%bb.8: +; LA32-UAL-NEXT: move $a0, $zero +; LA32-UAL-NEXT: ret +; LA32-UAL-NEXT: .LBB35_9: # %res_block +; LA32-UAL-NEXT: sltu $a0, $a3, $a4 +; LA32-UAL-NEXT: sub.w $a0, $zero, $a0 +; LA32-UAL-NEXT: ori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_32: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a0, $a0, 24 +; LA64-UAL-NEXT: ld.d $a1, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB35_5 +; LA64-UAL-NEXT: # %bb.4: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB35_5: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_32: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 32 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 32) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_63: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 63 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a2, $a0, 24 +; LA64-UAL-NEXT: ld.d $a3, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; 
LA64-UAL-NEXT: # %bb.4: # %loadbb4 +; LA64-UAL-NEXT: ld.d $a2, $a0, 32 +; LA64-UAL-NEXT: ld.d $a3, $a1, 32 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.5: # %loadbb5 +; LA64-UAL-NEXT: ld.d $a2, $a0, 40 +; LA64-UAL-NEXT: ld.d $a3, $a1, 40 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.6: # %loadbb6 +; LA64-UAL-NEXT: ld.d $a2, $a0, 48 +; LA64-UAL-NEXT: ld.d $a3, $a1, 48 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.7: # %loadbb7 +; LA64-UAL-NEXT: ld.d $a0, $a0, 55 +; LA64-UAL-NEXT: ld.d $a1, $a1, 55 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB36_9 +; LA64-UAL-NEXT: # %bb.8: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB36_9: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_63: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 63 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 63) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 64 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_size_64: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.1: # %loadbb1 +; LA64-UAL-NEXT: ld.d $a2, $a0, 8 +; LA64-UAL-NEXT: ld.d $a3, $a1, 8 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.2: # %loadbb2 +; LA64-UAL-NEXT: ld.d $a2, $a0, 16 +; LA64-UAL-NEXT: ld.d $a3, $a1, 16 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.3: # %loadbb3 +; LA64-UAL-NEXT: ld.d $a2, $a0, 24 +; LA64-UAL-NEXT: ld.d $a3, $a1, 24 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.4: # %loadbb4 +; LA64-UAL-NEXT: ld.d $a2, $a0, 32 +; LA64-UAL-NEXT: ld.d $a3, $a1, 32 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.5: # %loadbb5 +; LA64-UAL-NEXT: ld.d $a2, $a0, 40 +; LA64-UAL-NEXT: ld.d $a3, $a1, 40 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.6: # %loadbb6 +; LA64-UAL-NEXT: ld.d $a2, $a0, 48 +; LA64-UAL-NEXT: ld.d $a3, $a1, 48 +; LA64-UAL-NEXT: revb.d $a2, $a2 +; LA64-UAL-NEXT: revb.d $a3, $a3 +; LA64-UAL-NEXT: 
bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.7: # %loadbb7 +; LA64-UAL-NEXT: ld.d $a0, $a0, 56 +; LA64-UAL-NEXT: ld.d $a1, $a1, 56 +; LA64-UAL-NEXT: revb.d $a2, $a0 +; LA64-UAL-NEXT: revb.d $a3, $a1 +; LA64-UAL-NEXT: bne $a2, $a3, .LBB37_9 +; LA64-UAL-NEXT: # %bb.8: +; LA64-UAL-NEXT: move $a0, $zero +; LA64-UAL-NEXT: ret +; LA64-UAL-NEXT: .LBB37_9: # %res_block +; LA64-UAL-NEXT: sltu $a0, $a2, $a3 +; LA64-UAL-NEXT: sub.d $a0, $zero, $a0 +; LA64-UAL-NEXT: ori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_size_64: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 64 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 64) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_127(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_127: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 127 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_127: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 127 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 127) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_128(ptr %s1, ptr %s2) nounwind { +; LA32-LABEL: memcmp_size_128: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ori $a2, $zero, 128 +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_128: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ori $a2, $zero, 128 +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 128) + ret i32 %memcmp +} + +define signext i32 @memcmp_size_runtime(ptr %s1, ptr %s2, iGRLen %len) nounwind { +; LA32-LABEL: memcmp_size_runtime: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl memcmp +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: memcmp_size_runtime: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen %len) + ret i32 %memcmp +} + +define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { 
+; LA32-UAL-LABEL: memcmp_eq_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a2, $a0, 0 +; LA32-UAL-NEXT: ld.w $a3, $a1, 0 +; LA32-UAL-NEXT: ld.w $a4, $a0, 4 +; LA32-UAL-NEXT: ld.w $a5, $a1, 4 +; LA32-UAL-NEXT: ld.w $a6, $a0, 8 +; LA32-UAL-NEXT: ld.w $a7, $a1, 8 +; LA32-UAL-NEXT: ld.w $a0, $a0, 12 +; LA32-UAL-NEXT: ld.w $a1, $a1, 12 +; LA32-UAL-NEXT: xor $a2, $a2, $a3 +; LA32-UAL-NEXT: xor $a3, $a4, $a5 +; LA32-UAL-NEXT: xor $a4, $a6, $a7 +; LA32-UAL-NEXT: xor $a0, $a0, $a1 +; LA32-UAL-NEXT: or $a1, $a2, $a3 +; LA32-UAL-NEXT: or $a0, $a4, $a0 +; LA32-UAL-NEXT: or $a0, $a1, $a0 +; LA32-UAL-NEXT: sltui $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_eq_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.d $a2, $a0, 0 +; LA64-UAL-NEXT: ld.d $a3, $a1, 0 +; LA64-UAL-NEXT: ld.d $a0, $a0, 8 +; LA64-UAL-NEXT: ld.d $a1, $a1, 8 +; LA64-UAL-NEXT: xor $a2, $a2, $a3 +; LA64-UAL-NEXT: xor $a0, $a0, $a1 +; LA64-UAL-NEXT: or $a0, $a2, $a0 +; LA64-UAL-NEXT: sltui $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_eq_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 16 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: sltui $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_eq_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 16 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: sltui $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 16) + %ret = icmp eq i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_lt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_lt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_lt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp 
+; LA32-NUAL-NEXT: srli.w $a0, $a0, 31 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_lt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_gt_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_gt_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_gt_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slt $a0, $zero, $a0 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_gt_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slt $a0, $zero, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, 0 + ret i1 %ret +} + +define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_le_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; 
LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a1, $a0 +; LA32-UAL-NEXT: xori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_le_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a1, $a0 +; LA64-UAL-NEXT: xori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_le_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: slti $a0, $a0, 1 +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_le_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: slti $a0, $a0, 1 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp slt i32 %memcmp, 1 + ret i1 %ret +} + +define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind { +; LA32-UAL-LABEL: memcmp_ge_zero: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: ld.w $a0, $a0, 0 +; LA32-UAL-NEXT: ld.w $a1, $a1, 0 +; LA32-UAL-NEXT: srli.w $a2, $a0, 8 +; LA32-UAL-NEXT: lu12i.w $a3, 15 +; LA32-UAL-NEXT: ori $a3, $a3, 3840 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a0, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a4, $a0, $a3 +; LA32-UAL-NEXT: slli.w $a4, $a4, 8 +; LA32-UAL-NEXT: slli.w $a0, $a0, 24 +; LA32-UAL-NEXT: or $a0, $a0, $a4 +; LA32-UAL-NEXT: or $a0, $a0, $a2 +; LA32-UAL-NEXT: srli.w $a2, $a1, 8 +; LA32-UAL-NEXT: and $a2, $a2, $a3 +; LA32-UAL-NEXT: srli.w $a4, $a1, 24 +; LA32-UAL-NEXT: or $a2, $a2, $a4 +; LA32-UAL-NEXT: and $a3, $a1, $a3 +; LA32-UAL-NEXT: slli.w $a3, $a3, 8 +; LA32-UAL-NEXT: slli.w $a1, $a1, 24 +; LA32-UAL-NEXT: or $a1, $a1, $a3 +; LA32-UAL-NEXT: or $a1, $a1, $a2 +; LA32-UAL-NEXT: sltu $a0, $a0, $a1 +; LA32-UAL-NEXT: xori $a0, $a0, 1 +; LA32-UAL-NEXT: ret +; +; LA64-UAL-LABEL: memcmp_ge_zero: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: ld.w $a0, $a0, 0 +; LA64-UAL-NEXT: ld.w $a1, $a1, 0 +; LA64-UAL-NEXT: revb.2w $a0, $a0 +; LA64-UAL-NEXT: addi.w $a0, $a0, 0 +; LA64-UAL-NEXT: revb.2w $a1, $a1 +; LA64-UAL-NEXT: addi.w $a1, $a1, 0 +; LA64-UAL-NEXT: sltu $a0, $a0, $a1 +; LA64-UAL-NEXT: xori $a0, $a0, 1 +; LA64-UAL-NEXT: ret +; +; LA32-NUAL-LABEL: memcmp_ge_zero: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 4 +; LA32-NUAL-NEXT: bl memcmp +; LA32-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA32-NUAL-NEXT: slt $a0, $a1, $a0 +; LA32-NUAL-NEXT: 
ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; +; LA64-NUAL-LABEL: memcmp_ge_zero: +; LA64-NUAL: # %bb.0: # %entry +; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 +; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NUAL-NEXT: ori $a2, $zero, 4 +; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(memcmp) +; LA64-NUAL-NEXT: jirl $ra, $ra, 0 +; LA64-NUAL-NEXT: addi.w $a1, $zero, -1 +; LA64-NUAL-NEXT: slt $a0, $a1, $a0 +; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NUAL-NEXT: addi.d $sp, $sp, 16 +; LA64-NUAL-NEXT: ret +entry: + %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iGRLen 4) + %ret = icmp sgt i32 %memcmp, -1 + ret i1 %ret +} diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll index 93fcd421e4bd7..e02a2e7cce9b2 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/flog2.ll @@ -12,8 +12,8 @@ define float @flog2_s(float %x) nounwind { ; ; LA64-LABEL: flog2_s: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2f) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.s $fa0, $fa0 +; LA64-NEXT: ret %y = call float @llvm.log2.f32(float %x) ret float %y } @@ -25,8 +25,8 @@ define double @flog2_d(double %x) nounwind { ; ; LA64-LABEL: flog2_d: ; LA64: # %bb.0: -; LA64-NEXT: pcaddu18i $t8, %call36(log2) -; LA64-NEXT: jr $t8 +; LA64-NEXT: flogb.d $fa0, $fa0 +; LA64-NEXT: ret %y = call double @llvm.log2.f64(double %x) ret double %y } diff --git a/llvm/test/CodeGen/LoongArch/issue163681.ll b/llvm/test/CodeGen/LoongArch/issue163681.ll new file mode 100644 index 0000000000000..f6df349253045 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/issue163681.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch64 -code-model=large --verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +@.str = external constant [1 x i8] + +define void @caller(ptr %0) { +; CHECK-LABEL: caller: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: ld.w $a2, $zero, 0 +; CHECK-NEXT: ld.d $a1, $a0, 0 +; CHECK-NEXT: beqz $a2, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(.str) +; CHECK-NEXT: addi.d $a2, $zero, %got_pc_lo12(.str) +; CHECK-NEXT: lu32i.d $a2, %got64_pc_lo20(.str) +; CHECK-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(.str) +; CHECK-NEXT: ldx.d $a2, $a2, $a0 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: jirl $ra, $zero, 0 +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(.str) +; CHECK-NEXT: addi.d $a2, $zero, %got_pc_lo12(.str) +; CHECK-NEXT: lu32i.d $a2, %got64_pc_lo20(.str) +; CHECK-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(.str) +; CHECK-NEXT: ldx.d $a2, $a2, $a0 +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: move $a3, $zero +; CHECK-NEXT: jirl $ra, $zero, 0 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: st.d $zero, $zero, 0 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ret + %2 = load i32, ptr null, align 4 + %3 = icmp eq i32 %2, 0 + %4 = load i64, ptr %0, align 8 + br i1 %3, label %6, label %5 + +5: ; preds = %1 + call void null(ptr null, i64 %4, ptr @.str) + br label %7 + +6: ; preds = %1 + tail call void null(ptr null, i64 %4, ptr @.str, i32 0) + br label %7 + +7: ; 
preds = %6, %5 + store ptr null, ptr null, align 8 + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll index ba2118fb94f63..b3155c9313a8a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvclz.b $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <32 x i8>, ptr %src + %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false) + store <32 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.h $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i16>, ptr %src + %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false) + store <16 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i32>, ptr %src + %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false) + store <8 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i64>, ptr %src + %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1> + %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false) + store <4 x i64> %res, ptr %dst + ret void +} + declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll index 48ec98c3a74bb..8e08e1ee9e094 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -5,40 +5,10 @@ define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 
-; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 -; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmin.s $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %x @@ -51,23 +21,9 @@ entry: define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmin.d $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,40 +37,10 @@ entry: define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 -; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 -; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: xvpickve.w 
$xr1, $xr1, 3 -; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmax.s $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %x @@ -127,23 +53,9 @@ entry: define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a2, 0 -; CHECK-NEXT: xvld $xr1, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvfmax.d $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll new file mode 100644 index 0000000000000..fa5f27edf615e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +;; ceilf +define void @ceil_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrp.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.ceil.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrp.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.ceil.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrm.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.floor.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrm.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.floor.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v8f32: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrz.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.trunc.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; trunc +define void @trunc_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.trunc.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrne.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %r = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %v0) + store <8 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrintrne.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %r = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %v0) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) +declare <8 x float> @llvm.floor.v8f32(<8 x float>) +declare <4 x double> @llvm.floor.v4f64(<4 x double>) +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) +declare <8 x float> @llvm.roundeven.v8f32(<8 x float>) +declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = 
load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x 
float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) 
{ +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll index 2a5a8fa05d646..5c5c19935080b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: xvavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; 
LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavg.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavg.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_d: +; LA32: # %bb.0: # 
%entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrai.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.d $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.bu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.hu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvavgr.wu $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: xvavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 -; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 -; CHECK-NEXT: xvst $xr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: xvadd.d $xr0, $xr0, $xr1 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 1 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvld $xr1, $a2, 0 +; LA64-NEXT: xvavgr.du $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..c82adcb250c64 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @xvavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + 
%va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %shr = lshr <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %shr = lshr <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %shr = lshr <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvand.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvadd.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %shr = lshr <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = sext <32 x i8> %va to <32 x i16> + %eb = sext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = 
trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = sext <16 x i16> %va to <16 x i32> + %eb = sext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = sext <8 x i32> %va to <8 x i64> + %eb = sext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = sext <4 x i64> %va to <4 x i128> + %eb = sext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} + +define void @xvavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.b $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %ea = zext <32 x i8> %va to <32 x i16> + %eb = zext <32 x i8> %vb to <32 x i16> + %add = add <32 x i16> %ea, %eb + %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <32 x i16> %shr to <32 x i8> + store <32 x i8> %r, ptr %res + ret void +} + +define void @xvavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.h $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %ea = zext <16 x i16> %va to <16 x i32> + %eb = zext <16 x i16> %vb to <16 x i32> + %add = add <16 x i32> %ea, %eb + %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <16 x i32> %shr to <16 x i16> + store <16 x i16> %r, ptr %res + ret void +} + +define void @xvavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.w $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %ea = zext <8 x i32> %va to <8 x i64> + %eb = zext <8 x i32> %vb to <8 x i64> + %add = add <8 x i64> %ea, %eb + %add1 = add <8 x i64> %add, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %shr = lshr <8 x i64> %add1, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> + %r = trunc <8 x i64> %shr to <8 x i32> + store <8 x i32> %r, ptr %res + ret void +} + +define void @xvavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvor.v $xr2, $xr0, $xr1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 +; CHECK-NEXT: xvsub.d $xr0, $xr2, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %ea = zext <4 x i64> %va to <4 x i128> + %eb = zext <4 x i64> %vb to <4 x i128> + %add = add <4 x i128> %ea, %eb + %add1 = add <4 x i128> %add, <i128 1, i128 1, i128 1, i128 1> + %shr = lshr <4 x i128> %add1, <i128 1, i128 1, i128 1, i128 1> + %r = trunc <4 x i128> %shr to <4 x i64> + store <4 x i64> %r, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll index 68f2e3ab488e1..6b5f5751e5706 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/flog2.ll @@ -1,166 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; 
RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare <8 x float> @llvm.log2.v8f32(<8 x float>) declare <4 x double> @llvm.log2.v4f64(<4 x double>) define void @flog2_v8f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v8f32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -128 -; LA32-NEXT: st.w $ra, $sp, 124 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 120 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA32-NEXT: xvst $xr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 120 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 124 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 128 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v8f32: -; 
LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -128 -; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.w $xr0, $xr0, 5 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 4 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 6 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 7 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.w $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 -; LA64-NEXT: xvst $xr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 112 # 
8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 128 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <8 x float>, ptr %a %r = call <8 x float> @llvm.log2.v8f32(<8 x float> %v) @@ -169,93 +20,12 @@ entry: } define void @flog2_v4f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -112 -; LA32-NEXT: st.w $ra, $sp, 108 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 104 # 4-byte Folded Spill -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA32-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA32-NEXT: xvst $xr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 104 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 108 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 112 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -112 -; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: xvpickve.d $xr0, $xr0, 3 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 2 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, 
%call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; LA64-NEXT: xvpickve.d $xr0, $xr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 -; LA64-NEXT: xvst $xr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 112 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvflogb.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x double>, ptr %a %r = call <4 x double> @llvm.log2.v4f64(<4 x double> %v) diff --git a/llvm/test/CodeGen/LoongArch/ldptr.ll b/llvm/test/CodeGen/LoongArch/ldptr.ll index c3656a6bdafba..9bafa10c47e3f 100644 --- a/llvm/test/CodeGen/LoongArch/ldptr.ll +++ b/llvm/test/CodeGen/LoongArch/ldptr.ll @@ -24,8 +24,7 @@ define signext i32 @ldptr_w(ptr %p) nounwind { ; LA32-LABEL: ldptr_w: ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: ldptr_w: @@ -81,10 +80,9 @@ entry: define i64 @ldptr_d(ptr %p) nounwind { ; LA32-LABEL: ldptr_d: ; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a1, $a0, 1 -; LA32-NEXT: ld.w $a0, $a1, 0 -; LA32-NEXT: ld.w $a1, $a1, 4 +; LA32-NEXT: addi.w $a1, $a0, 2047 +; LA32-NEXT: ld.w $a0, $a1, 1 +; LA32-NEXT: ld.w $a1, $a1, 5 ; LA32-NEXT: ret ; ; LA64-LABEL: ldptr_d: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll index a9a38e8f75f9c..6ac7d51de253b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vclz.b $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i8>, ptr %src + %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false) + store <16 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.h $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i16>, ptr %src + %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false) + store <8 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld 
$vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.w $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i32>, ptr %src + %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false) + store <4 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <2 x i64>, ptr %src + %neg = xor <2 x i64> %v, <i64 -1, i64 -1> + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false) + store <2 x i64> %res, ptr %dst + ret void +} + declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll index 27ecb759c2ea3..c17309230ee72 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -5,24 +5,10 @@ define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmin.s $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %x @@ -35,15 +21,9 @@ entry: define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: minnum_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmin.d $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,24 +37,10 @@ entry: define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 -; 
CHECK-NEXT: vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmax.s $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %x @@ -87,15 +53,9 @@ entry: define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { ; CHECK-LABEL: maxnum_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a2, 0 -; CHECK-NEXT: vld $vr1, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vfmax.d $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll new file mode 100644 index 0000000000000..cb01ac0358ab3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +;; ceilf +define void @ceil_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrp.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.ceil.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; ceil +define void @ceil_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: ceil_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrp.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.ceil.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; floorf +define void @floor_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrm.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.floor.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; floor +define void @floor_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: floor_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrm.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.floor.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; truncf +define void @trunc_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrz.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.trunc.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; trunc +define void 
@trunc_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: trunc_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.trunc.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +;; roundevenf +define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrne.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %r = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %v0) + store <4 x float> %r, ptr %res + ret void +} + +;; roundeven +define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: roundeven_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrintrne.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %r = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %v0) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <2 x double> @llvm.floor.v2f64(<2 x double>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) +declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll index 20b8898436cc4..334af22edee59 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avg.ll @@ -1,14 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: vavg_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -25,8 +24,7 @@ define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -43,8 +41,7 @@ define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,14 +54,22 @@ entry: } define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; 
CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -79,8 +84,7 @@ define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavg.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -97,8 +101,7 @@ define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavg.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -115,8 +118,7 @@ define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavg.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -129,14 +131,22 @@ entry: } define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavg_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavg_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavg_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavg.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -151,9 +161,7 @@ define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -171,9 +179,7 @@ define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -191,9 +197,7 @@ define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; 
CHECK-NEXT: vavgr.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -207,15 +211,23 @@ entry: } define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_d: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrai.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_d: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.d $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b @@ -231,9 +243,7 @@ define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.bu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.bu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -251,9 +261,7 @@ define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.hu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.hu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -271,9 +279,7 @@ define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.wu $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vavgr.wu $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -287,15 +293,23 @@ entry: } define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: vavgr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vaddi.du $vr0, $vr0, 1 -; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 -; CHECK-NEXT: vst $vr0, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vavgr_du: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vld $vr1, $a2, 0 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vaddi.du $vr0, $vr0, 1 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vavgr_du: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vavgr.du $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret entry: %va = load <2 x i64>, ptr %a %vb = load <2 x i64>, ptr %b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll new file mode 100644 index 0000000000000..bb4df64a48284 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/avgfloor-ceil.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vavg_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavg_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vadd.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x 
i16> %ea, %eb + %shr = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavg_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vadd.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %shr = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavg_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vadd.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %shr = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavg_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavg_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vand.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vadd.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %shr = lshr <2 x i128> %add, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = sext <16 x i8> %va to <16 x i16> + %eb = sext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 
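+;; Note: avgceil(a, b) = (a | b) - ((a ^ b) >> 1) is the usual overflow-free
+;; rounding-average identity (arithmetic shift for the signed cases, logical
+;; for the unsigned ones); it mirrors the (a & b) + ((a ^ b) >> 1) floor form
+;; checked above, so the or/xor/shift/sub sequence is the expected lowering
+;; for the vavgr tests here and below.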
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 +; CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = sext <8 x i16> %va to <8 x i32> + %eb = sext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = sext <4 x i32> %va to <4 x i64> + %eb = sext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavgr_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = sext <2 x i64> %va to <2 x i128> + %eb = sext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, <i128 1, i128 1> + %shr = lshr <2 x i128> %add1, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} + +define void @vavgr_bu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_bu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.b $vr0, $vr0, 1 +; CHECK-NEXT: vsub.b $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %ea = zext <16 x i8> %va to <16 x i16> + %eb = zext <16 x i8> %vb to <16 x i16> + %add = add <16 x i16> %ea, %eb + %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %r = trunc <16 x i16> %shr to <16 x i8> + store <16 x i8> %r, ptr %res + ret void +} + +define void @vavgr_hu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_hu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 +; CHECK-NEXT: vsub.h $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %ea = zext <8 x i16> %va 
to <8 x i32> + %eb = zext <8 x i16> %vb to <8 x i32> + %add = add <8 x i32> %ea, %eb + %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = trunc <8 x i32> %shr to <8 x i16> + store <8 x i16> %r, ptr %res + ret void +} + +define void @vavgr_wu(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_wu: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 +; CHECK-NEXT: vsub.w $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %ea = zext <4 x i32> %va to <4 x i64> + %eb = zext <4 x i32> %vb to <4 x i64> + %add = add <4 x i64> %ea, %eb + %add1 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1> + %shr = lshr <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1> + %r = trunc <4 x i64> %shr to <4 x i32> + store <4 x i32> %r, ptr %res + ret void +} + +define void @vavgr_du(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vavgr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vor.v $vr2, $vr0, $vr1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 +; CHECK-NEXT: vsub.d $vr0, $vr2, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %ea = zext <2 x i64> %va to <2 x i128> + %eb = zext <2 x i64> %vb to <2 x i128> + %add = add <2 x i128> %ea, %eb + %add1 = add <2 x i128> %add, <i128 1, i128 1> + %shr = lshr <2 x i128> %add1, <i128 1, i128 1> + %r = trunc <2 x i128> %shr to <2 x i64> + store <2 x i64> %r, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll index e5e75ec617b51..87cc7c6dbc708 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/flog2.ll @@ -1,98 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <4 x float> @llvm.log2.v4f32(<4 x float>) declare <2 x double> @llvm.log2.v2f64(<2 x double>) define void @flog2_v4f32(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v4f32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded 
Reload -; LA32-NEXT: vextrins.w $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 32 -; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA32-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA32-NEXT: bl log2f -; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vextrins.w $vr1, $vr0, 48 -; LA32-NEXT: vst $vr1, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v4f32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 32 -; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.w $vr0, $vr0, 3 -; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2f) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vextrins.w $vr1, $vr0, 48 -; LA64-NEXT: vst $vr1, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <4 x float>, ptr %a %r = call <4 x float> @llvm.log2.v4f32(<4 x float> %v) @@ -101,59 +20,12 @@ entry: } define void @flog2_v2f64(ptr %res, ptr %a) nounwind { -; LA32-LABEL: flog2_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -48 -; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill -; LA32-NEXT: vld $vr0, $a1, 0 -; 
LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA32-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA32-NEXT: bl log2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA32-NEXT: vextrins.d $vr0, $vr1, 16 -; LA32-NEXT: vst $vr0, $fp, 0 -; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 48 -; LA32-NEXT: ret -; -; LA64-LABEL: flog2_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -48 -; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload -; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 -; LA64-NEXT: pcaddu18i $ra, %call36(log2) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: vextrins.d $vr0, $vr1, 16 -; LA64-NEXT: vst $vr0, $fp, 0 -; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 48 -; LA64-NEXT: ret +; CHECK-LABEL: flog2_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vflogb.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v = load <2 x double>, ptr %a %r = call <2 x double> @llvm.log2.v2f64(<2 x double> %v) diff --git a/llvm/test/CodeGen/LoongArch/memcmp.ll b/llvm/test/CodeGen/LoongArch/memcmp.ll index c4aaf9a75a852..c3811c0357793 100644 --- a/llvm/test/CodeGen/LoongArch/memcmp.ll +++ b/llvm/test/CodeGen/LoongArch/memcmp.ll @@ -7,15 +7,24 @@ define signext i32 @test1(ptr %buffer1, ptr %buffer2) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset 1, -8 -; CHECK-NEXT: ori $a2, $zero, 16 -; CHECK-NEXT: pcaddu18i $ra, %call36(memcmp) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ld.d $a2, $a0, 0 +; CHECK-NEXT: ld.d $a3, $a1, 0 +; CHECK-NEXT: revb.d $a2, $a2 +; CHECK-NEXT: revb.d $a3, $a3 +; CHECK-NEXT: bne $a2, $a3, .LBB0_3 +; CHECK-NEXT: # %bb.1: # %loadbb1 +; CHECK-NEXT: ld.d $a0, $a0, 8 +; CHECK-NEXT: ld.d $a1, $a1, 8 +; CHECK-NEXT: revb.d $a2, $a0 +; CHECK-NEXT: revb.d $a3, $a1 +; CHECK-NEXT: bne $a2, $a3, .LBB0_3 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: # %res_block +; CHECK-NEXT: sltu $a0, $a2, $a3 +; CHECK-NEXT: sub.d $a0, $zero, $a0 +; CHECK-NEXT: ori $a0, $a0, 1 ; 
CHECK-NEXT: ret entry: %call = call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16) diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll new file mode 100644 index 0000000000000..93f73e5cd30ff --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll @@ -0,0 +1,746 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA32 %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA64 %s + +%struct.S = type { i64, i64, i8 } +%struct.F = type { float, double, float } +%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> } + +define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB0_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: move $s5, $zero +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB0_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: ld.w $a0, $s2, 12 +; LA32-NEXT: ld.w $a1, $s2, 8 +; LA32-NEXT: add.w $a0, $a0, $s6 +; LA32-NEXT: add.w $s3, $a1, $s3 +; LA32-NEXT: sltu $a1, $s3, $a1 +; LA32-NEXT: addi.w $s4, $s4, 1 +; LA32-NEXT: sltui $a2, $s4, 1 +; LA32-NEXT: add.w $s5, $s5, $a2 +; LA32-NEXT: xor $a2, $s4, $s1 +; LA32-NEXT: xor $a3, $s5, $s0 +; LA32-NEXT: or $a2, $a2, $a3 +; LA32-NEXT: add.w $s6, $a0, $a1 +; LA32-NEXT: bnez $a2, .LBB0_2 +; LA32-NEXT: b .LBB0_4 +; LA32-NEXT: .LBB0_3: +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .LBB0_4: # %for.cond.cleanup +; LA32-NEXT: st.w $s3, $s2, 8 +; LA32-NEXT: st.w $s6, $s2, 12 +; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; 
LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB0_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB0_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $a0, $s1, 8 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: add.d $s2, $a0, $s2 +; LA64-NEXT: bnez $s0, .LBB0_2 +; LA64-NEXT: b .LBB0_4 +; LA64-NEXT: .LBB0_3: +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .LBB0_4: # %for.cond.cleanup +; LA64-NEXT: st.d $s2, $s1, 8 +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load i64, ptr %y + %add = add nsw i64 %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] + store i64 %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB1_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB1_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: fld.s $fa0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: fadd.s $fs0, $fa0, $fs0 
+; LA32-NEXT: bnez $a0, .LBB1_2 +; LA32-NEXT: b .LBB1_4 +; LA32-NEXT: .LBB1_3: +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .LBB1_4: # %for.cond.cleanup +; LA32-NEXT: fst.s $fs0, $s2, 16 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB1_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB1_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: fld.s $fa0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA64-NEXT: bnez $s0, .LBB1_2 +; LA64-NEXT: b .LBB1_4 +; LA64-NEXT: .LBB1_3: +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .LBB1_4: # %for.cond.cleanup +; LA64-NEXT: fst.s $fs0, $s1, 16 +; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load float, ptr %y + %add = fadd float %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ] + store float %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: 
masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB2_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB2_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vld $vr0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB2_2 +; LA32-NEXT: b .LBB2_4 +; LA32-NEXT: .LBB2_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB2_4: # %for.cond.cleanup +; LA32-NEXT: vst $vr0, $s2, 16 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $a1, .LBB2_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB2_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vld $vr0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB2_2 +; LA64-NEXT: b .LBB2_4 +; LA64-NEXT: .LBB2_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB2_4: # %for.cond.cleanup +; LA64-NEXT: vst $vr0, $s1, 16 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <4 x i32>, ptr %y + %addv = add <4 x i32> 
%v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <4 x i32> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v16i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB3_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB3_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvld $xr0, $s2, 32 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB3_2 +; LA32-NEXT: b .LBB3_4 +; LA32-NEXT: .LBB3_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB3_4: # %for.cond.cleanup +; LA32-NEXT: xvst $xr0, $s2, 32 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v16i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $a1, .LBB3_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB3_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvld $xr0, $s1, 32 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: 
xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB3_2 +; LA64-NEXT: b .LBB3_4 +; LA64-NEXT: .LBB3_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB3_4: # %for.cond.cleanup +; LA64-NEXT: xvst $xr0, $s1, 32 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <16 x i16>, ptr %y + %addv = add <16 x i16> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <16 x i16> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extracti8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB4_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB4_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vldrepl.b $vr0, $s2, 16 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB4_2 +; LA32-NEXT: b .LBB4_4 +; LA32-NEXT: .LBB4_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB4_4: # %for.cond.cleanup +; LA32-NEXT: vstelm.b $vr0, $s2, 16, 1 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; 
LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extracti8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB4_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB4_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vldrepl.b $vr0, $s1, 16 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB4_2 +; LA64-NEXT: b .LBB4_4 +; LA64-NEXT: .LBB4_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB4_4: # %for.cond.cleanup +; LA64-NEXT: vstelm.b $vr0, $s1, 16, 1 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load i8, ptr %y + %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0 + %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer + %addv = add <16 x i8> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <16 x i8> %sum.lcssa, i32 1 + store i8 %res, ptr %y + ret void +} + +define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extractf64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or 
$a1, $a1, $a2 +; LA32-NEXT: add.w $s2, $a4, $a0 +; LA32-NEXT: bnez $a1, .LBB5_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB5_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvldrepl.d $xr0, $s2, 8 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB5_2 +; LA32-NEXT: b .LBB5_4 +; LA32-NEXT: .LBB5_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB5_4: # %for.cond.cleanup +; LA32-NEXT: xvstelm.d $xr0, $s2, 8, 1 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extractf64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $s1, $a2, $a0 +; LA64-NEXT: blez $s0, .LBB5_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB5_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvldrepl.d $xr0, $s1, 8 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB5_2 +; LA64-NEXT: b .LBB5_4 +; LA64-NEXT: .LBB5_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB5_4: # %for.cond.cleanup +; LA64-NEXT: xvstelm.d $xr0, $s1, 8, 1 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load double, ptr %y + %ins0 = insertelement <4 x 
double> poison, double %e, i32 0 + %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer + %addv = fadd <4 x double> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <4 x double> %sum.lcssa, i32 1 + store double %res, ptr %y + ret void +} + +declare void @f(ptr) diff --git a/llvm/test/CodeGen/LoongArch/stptr.ll b/llvm/test/CodeGen/LoongArch/stptr.ll index d70f9f4ba1603..23b433aa15856 100644 --- a/llvm/test/CodeGen/LoongArch/stptr.ll +++ b/llvm/test/CodeGen/LoongArch/stptr.ll @@ -23,8 +23,7 @@ define void @stptr_w(ptr %p, i32 signext %val) nounwind { ; LA32-LABEL: stptr_w: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: stptr_w: @@ -77,9 +76,8 @@ define void @stptr_d(ptr %p, i64 %val) nounwind { ; LA32-LABEL: stptr_d: ; LA32: # %bb.0: ; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 1 -; LA32-NEXT: st.w $a2, $a0, 4 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a2, $a0, 5 +; LA32-NEXT: st.w $a1, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: stptr_d: diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index d3c0da9862245..8af4277f12c65 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1543,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ] Key: RDTSC: [ 0.00 0.00 ] Key: RDTSCP: [ 0.00 0.00 ] Key: REG_SEQUENCE: [ 0.00 0.00 ] +Key: RELOC_NONE: [ 0.00 0.00 ] Key: REPNE_PREFIX: [ 0.00 0.00 ] Key: REP_MOVSB: [ 0.00 0.00 ] Key: REP_MOVSD: [ 0.00 0.00 ] @@ -1717,8 +1706,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1751,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 
0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7015,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7150,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index c6e5508248b9b..e13342641d359 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1543,6 +1531,7 @@ Key: RDSSPQ: [ 0.00 0.00 ] Key: RDTSC: [ 0.00 0.00 ] Key: RDTSCP: [ 0.00 0.00 ] Key: REG_SEQUENCE: [ 0.00 0.00 ] +Key: RELOC_NONE: [ 0.00 0.00 ] Key: REPNE_PREFIX: [ 0.00 0.00 ] Key: REP_MOVSB: [ 0.00 0.00 ] Key: REP_MOVSD: [ 0.00 0.00 ] @@ -1717,8 +1706,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1751,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7015,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7150,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 
0.00 ] diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll deleted file mode 100644 index bd8d882cda39b..0000000000000 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ /dev/null @@ -1,48 +0,0 @@ -; REQUIRES: have_tflite -; REQUIRES: x86_64-linux -; -; Check that we log the currently in development features correctly with both the default -; case and with a learned policy. -; -; RUN: llc -o /dev/null -mtriple=x86_64-linux-unknown -regalloc=greedy \ -; RUN: -regalloc-enable-advisor=development \ -; RUN: -regalloc-training-log=%t1 \ -; RUN: -regalloc-enable-development-features < %S/Inputs/input.ll -; RUN: %python %S/../../../lib/Analysis/models/log_reader.py %t1 > %t1.readable -; RUN: FileCheck --input-file %t1.readable %s - -; RUN: rm -rf %t && mkdir %t -; RUN: %python %S/../../../lib/Analysis/models/gen-regalloc-eviction-test-model.py %t_savedmodel -; RUN: %python %S/../../../lib/Analysis/models/saved-model-to-tflite.py %t_savedmodel %t -; RUN: llc -o /dev/null -mtriple=x86_64-linux-unknown -regalloc=greedy \ -; RUN: -regalloc-enable-advisor=development \ -; RUN: -regalloc-training-log=%t2 -regalloc-model=%t \ -; RUN: -regalloc-enable-development-features < %S/Inputs/input.ll -; RUN: %python %S/../../../lib/Analysis/models/log_reader.py %t2 > %t2.readable -; RUN: FileCheck --input-file %t2.readable %s - -; CHECK-NOT: nan -; Check the first five opcodes in the first eviction problem -; Also, the first eviction problem is significantly less than 300 instructions. Check -; that there is a zero value. -; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0, -; Only the candidate virtreg and the 10th LR are included in this problem. Make -; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. -; There's a limit to how many repetitions can be matched. -; CHECK: instructions_mapping: {{(((0,){27}){100})}} -; CHECK-SAME: 1 -; Indexing 300 back from where the candidate vr actual resides due to the fact -; that not all the values between the 10th LR and the candidate are zero. -; CHECK-SAME-COUNT-6600: 0, -; CHECK-SAME: 1 -; Ensure that we can still go through the mapping matrices for the rest of the -; eviction problems to make sure we haven't hit the end of the matrix above. -; There are a total of 23 eviction problems with this test. -; CHECK-LABEL: observation: 16 -; Make sure that we're exporting the mbb_frequencies. Don't actually check -; values due to all values being floating point/liable to change very easily. -; CHECK: mbb_frequencies: -; Make sure that we have the mbb_mapping feature, and that the first couple -; of values are correct. 
-; CHECK: mbb_mapping: 0,0,0,0,1,1,1 diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll index b2a3f94d11a16..3057e91e8ebe4 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-b128.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll @@ -756,24 +756,24 @@ define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: and.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: and.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB34_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw and ptr %ptr, i128 %val monotonic ret i128 %ret @@ -791,24 +791,24 @@ define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: or.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: or.b64 %rd7, %rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB35_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw or ptr %ptr, i128 %val monotonic ret i128 %ret @@ -826,24 +826,24 @@ define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4; -; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd6, %rd1, %rd4; +; CHECK-NEXT: xor.b64 %rd7, 
%rd2, %rd5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p1 bra $L__BB36_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw xor ptr %ptr, i128 %val monotonic ret i128 %ret @@ -861,29 +861,29 @@ define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB37_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw min ptr %ptr, i128 %val monotonic ret i128 %ret @@ -901,29 +901,29 @@ define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.s64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; 
CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB38_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw max ptr %ptr, i128 %val monotonic ret i128 %ret @@ -941,29 +941,29 @@ define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.lt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB39_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umin ptr %ptr, i128 %val monotonic ret i128 %ret @@ -981,29 +981,29 @@ define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) { ; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3]; ; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4; -; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5; +; CHECK-NEXT: mov.b64 %rd2, %rd12; +; CHECK-NEXT: mov.b64 %rd1, %rd11; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5; ; CHECK-NEXT: and.pred %p3, %p2, %p1; -; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5; +; CHECK-NEXT: setp.gt.u64 %p4, %rd2, %rd5; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5; -; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5; +; CHECK-NEXT: selp.b64 
%rd6, %rd2, %rd5, %p5; +; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5; ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b128 cmp, swap, dst; -; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12}; +; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2}; ; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6}; ; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap; -; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst; +; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst; ; CHECK-NEXT: } -; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12; -; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11; +; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2; +; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1; ; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8; ; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0; -; CHECK-NEXT: mov.b64 %rd11, %rd1; -; CHECK-NEXT: mov.b64 %rd12, %rd2; ; CHECK-NEXT: @%p6 bra $L__BB40_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12}; ; CHECK-NEXT: ret; %ret = atomicrmw umax ptr %ptr, i128 %val monotonic ret i128 %ret diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index e2762bac45a35..313be95c03192 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -63,32 +63,32 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX62-NEXT: mov.b32 %r4, %r46; +; CHECKPTX62-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3; ; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX62-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX62-NEXT: mov.b32 %r46, %r4; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX62-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX62-NEXT: mov.b32 %r5, %r47; +; CHECKPTX62-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25; ; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; ; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; ; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6; ; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX62-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX62-NEXT: mov.b32 %r47, %r5; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 ; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4; @@ -100,16 +100,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; 
CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX62-NEXT: mov.b32 %r9, %r48; +; CHECKPTX62-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33; ; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8; ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX62-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX62-NEXT: mov.b32 %r48, %r9; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4; @@ -121,16 +121,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX62-NEXT: mov.b32 %r13, %r49; +; CHECKPTX62-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41; ; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; ; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10; ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX62-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX62-NEXT: mov.b32 %r49, %r13; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index e6c6a73eef14d..f5eefaa57fc09 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -63,33 +63,33 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX71-NEXT: mov.b32 %r4, %r46; +; CHECKPTX71-NEXT: shr.u32 %r20, %r4, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4; ; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2; -; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX71-NEXT: and.b32 %r23, %r4, %r3; ; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; -; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46; -; CHECKPTX71-NEXT: mov.b32 %r46, %r4; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24; +; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r46, %r4; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 ; CHECKPTX71-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2; +; 
CHECKPTX71-NEXT: mov.b32 %r5, %r47; +; CHECKPTX71-NEXT: shr.u32 %r25, %r5, %r2; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25; ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; ; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7; ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX71-NEXT: and.b32 %r28, %r5, %r3; ; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; -; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47; -; CHECKPTX71-NEXT: mov.b32 %r47, %r5; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29; +; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r47, %r5; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26 ; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4; @@ -101,17 +101,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX71-NEXT: mov.b32 %r9, %r48; +; CHECKPTX71-NEXT: shr.u32 %r33, %r9, %r7; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33; ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10; ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7; -; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX71-NEXT: and.b32 %r36, %r9, %r8; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; -; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48; -; CHECKPTX71-NEXT: mov.b32 %r48, %r9; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37; +; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r48, %r9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 ; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4; @@ -123,17 +123,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX71-NEXT: mov.b32 %r13, %r49; +; CHECKPTX71-NEXT: shr.u32 %r41, %r13, %r11; ; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41; ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; ; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13; ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX71-NEXT: and.b32 %r44, %r13, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; -; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49; -; CHECKPTX71-NEXT: mov.b32 %r49, %r13; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45; +; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r49, %r13; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 6ea02f35e9626..a4b49f7136d1d 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -442,22 +442,22 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; 
CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u32 %r8, %r17, %r1; +; CHECK-NEXT: mov.b32 %r3, %r17; +; CHECK-NEXT: shr.u32 %r8, %r3, %r1; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r8; ; CHECK-NEXT: cvt.f32.f16 %r9, %rs2; ; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs3; ; CHECK-NEXT: shl.b32 %r13, %r12, %r1; -; CHECK-NEXT: and.b32 %r14, %r17, %r2; +; CHECK-NEXT: and.b32 %r14, %r3, %r2; ; CHECK-NEXT: or.b32 %r15, %r14, %r13; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15; -; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17; -; CHECK-NEXT: mov.b32 %r17, %r3; +; CHECK-NEXT: atom.cas.b32 %r17, [%rd1], %r3, %r15; +; CHECK-NEXT: setp.ne.b32 %p1, %r17, %r3; ; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: shr.u32 %r16, %r3, %r1; +; CHECK-NEXT: shr.u32 %r16, %r17, %r1; ; CHECK-NEXT: st.param.b16 [func_retval0], %r16; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, 
%rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; 
SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; 
SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: 
ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( 
; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..4d81fdc67736d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope. + +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index b5c43fd259a75..d653895efa340 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target 
triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index 57342dc9a49c5..5de1ac887b76c 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index 6296d5af8ab18..2f5c1ef4670da 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index e5ae3875a0ede..a2b2c2f27fa5e 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck 
--check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 7d04adaa774c3..e4c48ddddea18 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index b0fe77c1a83be..727bb3b3aa8fd 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if 
ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" @@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, 
%r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, 
[cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; ; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index ee79f9d6d056f..af3fe67269205 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck 
--check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
 
 target triple = "nvptx64-nvidia-cuda"
 
 declare half @llvm.nvvm.ex2.approx.f16(half)
-declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>)
+declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>)
-; CHECK-LABEL: ex2_half
 define half @ex2_half(half %0) {
 ; CHECK-FP16-LABEL: ex2_half(
 ; CHECK-FP16: {
@@ -21,7 +22,6 @@ define half @ex2_half(half %0) {
   ret half %res
 }
 
-; CHECK-LABEL: ex2_2xhalf
 define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-LABEL: ex2_2xhalf(
 ; CHECK-FP16: {
@@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) {
 ; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
 ; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-FP16-NEXT: ret;
-  %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
+  %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0)
   ret <2 x half> %res
 }
+
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-FP16-LABEL: ex2_bfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+  %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0)
+  ret bfloat %res
+}
+
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+; CHECK-FP16-LABEL: ex2_2xbfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+  %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0)
+  ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 796d80d3c2c39..97b9d35be371e 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -3,7 +3,8 @@
 ; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
 
 target triple = "nvptx-nvidia-cuda"
-declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.f32(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f32(float)
 
 ; CHECK-LABEL: ex2_float
 define float @ex2_float(float %0) {
@@ -16,7 +17,7 @@ define float @ex2_float(float %0) {
 ; CHECK-NEXT: ex2.approx.f32 %r2, %r1;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT: ret;
-  %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.f32(float %0)
   ret float %res
 }
 
@@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) {
 ; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT: ret;
-  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0)
   ret float %res
 }
diff --git a/llvm/test/CodeGen/PowerPC/annotate-metadata.ll b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
new file mode 100644
index 0000000000000..4149b56e0ea95
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/annotate-metadata.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff < \
+; RUN:   %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux < \
+; RUN:   %s | FileCheck %s
+
+@.str = private unnamed_addr constant [12 x i8] c"MY_METADATA\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [10 x i8] c"my_file.c\00", section "llvm.metadata"
+@global.annotations = appending global [3 x { ptr, ptr, ptr, i32, ptr }] [{ ptr, ptr, ptr, i32, ptr } { ptr @a, ptr @.str, ptr @.str.1, i32 100, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @b, ptr @.str, ptr @.str.1, i32 200, ptr null }, { ptr, ptr, ptr, i32, ptr } { ptr @c, ptr @.str, ptr @.str.1, i32 300, ptr null }], section "llvm.metadata"
+
+@a = global i32 1
+@b = global i32 2
+@c = global i32 3
+
+; CHECK-NOT: metadata
+; CHECK-NOT: annotations
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
index d6dd959365401..fdb01314a7d4c 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
@@ -49,15 +49,15 @@ define void @fmul_ctrloop_fp128() nounwind {
 ; PWR8-NEXT: #
 ; PWR8-NEXT: lxvd2x 0, 30, 28
 ; PWR8-NEXT: vmr 2, 31
-; PWR8-NEXT: addi 26, 30, 16
+; PWR8-NEXT: mr 26, 30
+; PWR8-NEXT: addi 30, 30, 16
 ; PWR8-NEXT: xxswapd 35, 0
 ; PWR8-NEXT: bl __mulkf3
 ; PWR8-NEXT: nop
 ; PWR8-NEXT: addi 29, 29, -1
 ; PWR8-NEXT: xxswapd 0, 34
 ; PWR8-NEXT: cmpldi 29, 0
-; PWR8-NEXT: stxvd2x 0, 30, 27
-; PWR8-NEXT: mr 30, 26
+; PWR8-NEXT: stxvd2x 0, 26, 27
 ; PWR8-NEXT: bc 12, 1, .LBB0_1
 ; PWR8-NEXT: # %bb.2: # %for.end
 ; PWR8-NEXT: li 3, 48
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
index 12078adbbc2f3..383dcdb06c331 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=ppc32-- | FileCheck %s --check-prefixes=CHECK,CHECK32,CHECK32_32
 ; RUN: llc < %s -mtriple=ppc32-- -mcpu=ppc64 | FileCheck %s --check-prefixes=CHECK,CHECK32,CHECK32_64
 ; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s --check-prefixes=CHECK,CHECK64
+; RUN: llc < %s -mcpu=future -mtriple=powerpc64le-- | FileCheck %s --check-prefix=FUTURE
 
 declare i8 @llvm.fshl.i8(i8, i8, i8)
 declare i16 @llvm.fshl.i16(i16, i16, i16)
@@ -24,6 +25,13 @@ define i8 @rotl_i8_const_shift(i8 %x) {
 ; CHECK-NEXT: rlwimi 4, 3, 3, 0, 28
 ; CHECK-NEXT: mr 3, 4
 ; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i8_const_shift:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: rotlwi 4, 3, 27
+; FUTURE-NEXT: rlwimi 4, 3, 3, 0, 28
+; FUTURE-NEXT: mr 3, 4
+; FUTURE-NEXT: blr
   %f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
 }
@@ -43,6 +51,11 @@ define i64 @rotl_i64_const_shift(i64 %x) {
 ; CHECK64: # %bb.0:
 ; CHECK64-NEXT: rotldi 3, 3, 3
 ; CHECK64-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i64_const_shift:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: rotldi 3, 3, 3
+; FUTURE-NEXT: blr
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
   ret i64 %f
 }
@@ -60,6 +73,17 @@ define i16 @rotl_i16(i16 %x, i16 %z) {
 ; CHECK-NEXT: srw 4, 5, 4
 ; CHECK-NEXT: or 3, 3, 4
 ; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: rotl_i16:
+; FUTURE: # %bb.0:
+; FUTURE-NEXT: clrlwi 6, 4, 28
+; FUTURE-NEXT: neg 4, 4
+; FUTURE-NEXT: clrlwi 5, 3, 16
+; FUTURE-NEXT: clrlwi 4, 4, 28
+; FUTURE-NEXT: slw 3, 3, 6
+; FUTURE-NEXT: srw 4, 5, 4
+; FUTURE-NEXT: or 3, 3, 
4 +; FUTURE-NEXT: blr %f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z) ret i16 %f } @@ -69,6 +93,11 @@ define i32 @rotl_i32(i32 %x, i32 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlw 3, 3, 4 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z) ret i32 %f } @@ -100,6 +129,11 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK64: # %bb.0: ; CHECK64-NEXT: rotld 3, 3, 4 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_i64: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotld 3, 3, 4 +; FUTURE-NEXT: blr %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -124,6 +158,11 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK64: # %bb.0: ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z) ret <4 x i32> %f } @@ -150,6 +189,12 @@ define <4 x i32> @rotl_v4i32_const_shift(<4 x i32> %x) { ; CHECK64-NEXT: vspltisw 3, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: vspltisw 3, 3 +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) ret <4 x i32> %f } @@ -163,6 +208,13 @@ define i8 @rotr_i8_const_shift(i8 %x) { ; CHECK-NEXT: rlwimi 4, 3, 5, 0, 26 ; CHECK-NEXT: mr 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i8_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlwi 4, 3, 29 +; FUTURE-NEXT: rlwimi 4, 3, 5, 0, 26 +; FUTURE-NEXT: mr 3, 4 +; FUTURE-NEXT: blr %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3) ret i8 %f } @@ -172,6 +224,11 @@ define i32 @rotr_i32_const_shift(i32 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: rotlwi 3, 3, 29 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: rotlwi 3, 3, 29 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3) ret i32 %f } @@ -189,6 +246,17 @@ define i16 @rotr_i16(i16 %x, i16 %z) { ; CHECK-NEXT: slw 3, 3, 4 ; CHECK-NEXT: or 3, 5, 3 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i16: +; FUTURE: # %bb.0: +; FUTURE-NEXT: clrlwi 6, 4, 28 +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: clrlwi 5, 3, 16 +; FUTURE-NEXT: clrlwi 4, 4, 28 +; FUTURE-NEXT: srw 5, 5, 6 +; FUTURE-NEXT: slw 3, 3, 4 +; FUTURE-NEXT: or 3, 5, 3 +; FUTURE-NEXT: blr %f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z) ret i16 %f } @@ -199,6 +267,12 @@ define i32 @rotr_i32(i32 %x, i32 %z) { ; CHECK-NEXT: neg 4, 4 ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: rotlw 3, 3, 4 +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %z) ret i32 %f } @@ -231,6 +305,12 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK64-NEXT: neg 4, 4 ; CHECK64-NEXT: rotld 3, 3, 4 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_i64: +; FUTURE: # %bb.0: +; FUTURE-NEXT: neg 4, 4 +; FUTURE-NEXT: rotld 3, 3, 4 +; FUTURE-NEXT: blr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z) ret i64 %f } @@ -263,6 +343,12 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK64-NEXT: vsubuwm 3, 4, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32: +; FUTURE: # %bb.0: +; FUTURE-NEXT: vnegw 3, 3 +; FUTURE-NEXT: xvrlw 34, 34, 35 +; FUTURE-NEXT: blr %f = call <4 x 
i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z) ret <4 x i32> %f } @@ -293,6 +379,12 @@ define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) { ; CHECK64-NEXT: vsubuwm 3, 4, 3 ; CHECK64-NEXT: vrlw 2, 2, 3 ; CHECK64-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32_const_shift: +; FUTURE: # %bb.0: +; FUTURE-NEXT: xxspltiw 0, 29 +; FUTURE-NEXT: xvrlw 34, 34, 0 +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) ret <4 x i32> %f } @@ -301,6 +393,10 @@ define i32 @rotl_i32_shift_by_bitwidth(i32 %x) { ; CHECK-LABEL: rotl_i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 32) ret i32 %f } @@ -309,6 +405,10 @@ define i32 @rotr_i32_shift_by_bitwidth(i32 %x) { ; CHECK-LABEL: rotr_i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 32) ret i32 %f } @@ -317,6 +417,10 @@ define <4 x i32> @rotl_v4i32_shift_by_bitwidth(<4 x i32> %x) { ; CHECK-LABEL: rotl_v4i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotl_v4i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ret <4 x i32> %f } @@ -325,6 +429,10 @@ define <4 x i32> @rotr_v4i32_shift_by_bitwidth(<4 x i32> %x) { ; CHECK-LABEL: rotr_v4i32_shift_by_bitwidth: ; CHECK: # %bb.0: ; CHECK-NEXT: blr +; +; FUTURE-LABEL: rotr_v4i32_shift_by_bitwidth: +; FUTURE: # %bb.0: +; FUTURE-NEXT: blr %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>) ret <4 x i32> %f } diff --git a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll index 55482a0c5ff2c..786988fae08c8 100644 --- a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll +++ b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll @@ -23,11 +23,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX64-NEXT: # %bb.2: # %for.body.preheader.new ; AIX64-NEXT: rlwinm 6, 5, 0, 1, 30 ; AIX64-NEXT: xxspltib 0, 6 -; AIX64-NEXT: addi 9, 4, -8 +; AIX64-NEXT: addi 11, 4, -8 ; AIX64-NEXT: addi 7, 3, -8 ; AIX64-NEXT: li 8, 8 -; AIX64-NEXT: li 10, 12 -; AIX64-NEXT: li 11, 4 +; AIX64-NEXT: li 9, 12 +; AIX64-NEXT: li 10, 4 ; AIX64-NEXT: addi 6, 6, -2 ; AIX64-NEXT: rldicl 6, 6, 63, 1 ; AIX64-NEXT: addi 6, 6, 1 @@ -36,16 +36,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX64-NEXT: .align 4 ; AIX64-NEXT: L..BB0_3: # %for.body ; AIX64-NEXT: # -; AIX64-NEXT: lxvwsx 1, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 8 ; AIX64-NEXT: addi 6, 6, 2 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xscvspdpn 1, 1 ; AIX64-NEXT: stfsu 1, 8(7) -; AIX64-NEXT: lxvwsx 1, 9, 10 -; AIX64-NEXT: addi 9, 9, 8 +; AIX64-NEXT: lxvwsx 1, 11, 9 +; AIX64-NEXT: addi 11, 11, 8 ; AIX64-NEXT: xxland 1, 1, 0 ; AIX64-NEXT: xxsldwi 1, 1, 1, 3 -; AIX64-NEXT: stfiwx 1, 7, 11 +; AIX64-NEXT: stfiwx 1, 7, 10 ; AIX64-NEXT: bdnz L..BB0_3 ; AIX64-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; AIX64-NEXT: andi. 
5, 5, 1 @@ -70,27 +70,27 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; AIX32-NEXT: # %bb.2: # %for.body.preheader.new ; AIX32-NEXT: xxspltib 0, 6 ; AIX32-NEXT: addi 12, 4, -8 -; AIX32-NEXT: addi 9, 3, -8 +; AIX32-NEXT: addi 8, 3, -8 ; AIX32-NEXT: rlwinm 7, 5, 0, 1, 30 -; AIX32-NEXT: li 8, 0 -; AIX32-NEXT: li 10, 8 -; AIX32-NEXT: li 11, 12 +; AIX32-NEXT: li 9, 8 +; AIX32-NEXT: li 10, 12 +; AIX32-NEXT: li 11, 0 ; AIX32-NEXT: .align 4 ; AIX32-NEXT: L..BB0_3: # %for.body ; AIX32-NEXT: # -; AIX32-NEXT: lxvwsx 1, 12, 10 +; AIX32-NEXT: lxvwsx 1, 12, 9 +; AIX32-NEXT: lxvwsx 2, 12, 10 ; AIX32-NEXT: addic 6, 6, 2 -; AIX32-NEXT: addze 8, 8 +; AIX32-NEXT: addi 12, 12, 8 +; AIX32-NEXT: addze 11, 11 ; AIX32-NEXT: xor 0, 6, 7 -; AIX32-NEXT: or. 0, 0, 8 +; AIX32-NEXT: or. 0, 0, 11 ; AIX32-NEXT: xxland 1, 1, 0 ; AIX32-NEXT: xscvspdpn 1, 1 -; AIX32-NEXT: stfsu 1, 8(9) -; AIX32-NEXT: lxvwsx 1, 12, 11 -; AIX32-NEXT: addi 12, 12, 8 -; AIX32-NEXT: xxland 1, 1, 0 +; AIX32-NEXT: stfsu 1, 8(8) +; AIX32-NEXT: xxland 1, 2, 0 ; AIX32-NEXT: xscvspdpn 1, 1 -; AIX32-NEXT: stfs 1, 4(9) +; AIX32-NEXT: stfs 1, 4(8) ; AIX32-NEXT: bne 0, L..BB0_3 ; AIX32-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; AIX32-NEXT: andi. 5, 5, 1 @@ -116,11 +116,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; LINUX64LE-NEXT: # %bb.2: # %for.body.preheader.new ; LINUX64LE-NEXT: rlwinm 6, 5, 0, 1, 30 ; LINUX64LE-NEXT: xxspltib 0, 6 -; LINUX64LE-NEXT: addi 8, 4, -8 +; LINUX64LE-NEXT: addi 11, 4, -8 ; LINUX64LE-NEXT: addi 7, 3, -8 -; LINUX64LE-NEXT: li 9, 8 -; LINUX64LE-NEXT: li 10, 12 -; LINUX64LE-NEXT: li 11, 4 +; LINUX64LE-NEXT: li 8, 8 +; LINUX64LE-NEXT: li 9, 12 +; LINUX64LE-NEXT: li 10, 4 ; LINUX64LE-NEXT: addi 6, 6, -2 ; LINUX64LE-NEXT: rldicl 6, 6, 63, 1 ; LINUX64LE-NEXT: addi 6, 6, 1 @@ -129,16 +129,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu ; LINUX64LE-NEXT: .p2align 4 ; LINUX64LE-NEXT: .LBB0_3: # %for.body ; LINUX64LE-NEXT: # -; LINUX64LE-NEXT: lxvwsx 1, 8, 9 +; LINUX64LE-NEXT: lxvwsx 1, 11, 8 ; LINUX64LE-NEXT: addi 6, 6, 2 ; LINUX64LE-NEXT: xxland 1, 1, 0 ; LINUX64LE-NEXT: xxsldwi 1, 1, 1, 3 ; LINUX64LE-NEXT: xscvspdpn 1, 1 ; LINUX64LE-NEXT: stfsu 1, 8(7) -; LINUX64LE-NEXT: lxvwsx 1, 8, 10 -; LINUX64LE-NEXT: addi 8, 8, 8 +; LINUX64LE-NEXT: lxvwsx 1, 11, 9 +; LINUX64LE-NEXT: addi 11, 11, 8 ; LINUX64LE-NEXT: xxland 1, 1, 0 -; LINUX64LE-NEXT: stxvrwx 1, 7, 11 +; LINUX64LE-NEXT: stxvrwx 1, 7, 10 ; LINUX64LE-NEXT: bdnz .LBB0_3 ; LINUX64LE-NEXT: .LBB0_4: # %for.cond.cleanup.loopexit.unr-lcssa ; LINUX64LE-NEXT: andi. 
5, 5, 1 diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll index aaf81ff814488..5b4e91c449522 100644 --- a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll +++ b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll @@ -26,30 +26,6 @@ define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128(ppc_fp128 %a) { ret { ppc_fp128, ppc_fp128 } %result } -define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) { -; CHECK-LABEL: test_sincospi_ppcf128: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr r0 -; CHECK-NEXT: stdu r1, -64(r1) -; CHECK-NEXT: std r0, 80(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: addi r5, r1, 48 -; CHECK-NEXT: addi r6, r1, 32 -; CHECK-NEXT: bl sincospil -; CHECK-NEXT: nop -; CHECK-NEXT: lfd f1, 48(r1) -; CHECK-NEXT: lfd f2, 56(r1) -; CHECK-NEXT: lfd f3, 32(r1) -; CHECK-NEXT: lfd f4, 40(r1) -; CHECK-NEXT: addi r1, r1, 64 -; CHECK-NEXT: ld r0, 16(r1) -; CHECK-NEXT: mtlr r0 -; CHECK-NEXT: blr - %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) - ret { ppc_fp128, ppc_fp128 } %result -} - ; FIXME: This could be made a tail call with the default expansion of llvm.sincos. define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) { ; CHECK-LABEL: test_sincos_ppcf128_void_tail_call: @@ -73,29 +49,6 @@ define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_s ret void } -; FIXME: This could be made a tail call with the default expansion of llvm.sincospi. -define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) { -; CHECK-LABEL: test_sincospi_ppcf128_void_tail_call: -; CHECK: # %bb.0: -; CHECK-NEXT: mflr r0 -; CHECK-NEXT: stdu r1, -32(r1) -; CHECK-NEXT: std r0, 48(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl sincospil -; CHECK-NEXT: nop -; CHECK-NEXT: addi r1, r1, 32 -; CHECK-NEXT: ld r0, 16(r1) -; CHECK-NEXT: mtlr r0 -; CHECK-NEXT: blr - %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) - %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0 - %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1 - store ppc_fp128 %result.0, ptr %out_sin, align 16 - store ppc_fp128 %result.1, ptr %out_cos, align 16 - ret void -} - ; NOTE: This would need a struct-return library call for llvm.sincos to become a tail call. define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) { ; CHECK-LABEL: test_sincos_ppcf128_tail_call: @@ -120,28 +73,3 @@ define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) { %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 %a) ret { ppc_fp128, ppc_fp128 } %result } - -; NOTE: This would need a struct-return library call for llvm.sincospi to become a tail call. 
-define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) {
-; CHECK-LABEL: test_sincospi_ppcf128_tail_call:
-; CHECK: # %bb.0:
-; CHECK-NEXT: mflr r0
-; CHECK-NEXT: stdu r1, -64(r1)
-; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: addi r5, r1, 48
-; CHECK-NEXT: addi r6, r1, 32
-; CHECK-NEXT: bl sincospil
-; CHECK-NEXT: nop
-; CHECK-NEXT: lfd f1, 48(r1)
-; CHECK-NEXT: lfd f2, 56(r1)
-; CHECK-NEXT: lfd f3, 32(r1)
-; CHECK-NEXT: lfd f4, 40(r1)
-; CHECK-NEXT: addi r1, r1, 64
-; CHECK-NEXT: ld r0, 16(r1)
-; CHECK-NEXT: mtlr r0
-; CHECK-NEXT: blr
-  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
-  ret { ppc_fp128, ppc_fp128 } %result
-}
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
new file mode 100644
index 0000000000000..75e7559386f16
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
@@ -0,0 +1,21 @@
+; RUN: not llc -mtriple=powerpc64le-gnu-linux -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: no libcall available for fsincospi
+define { half, half } @test_sincospi_f16(half %a) #0 {
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { float, float } @test_sincospi_f32(float %a) #0 {
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { double, double } @test_sincospi_f64(double %a) #0 {
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
new file mode 100644
index 0000000000000..c332f441e8b00
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
@@ -0,0 +1,26 @@
+; XFAIL: *
+; UNSUPPORTED: expensive_checks
+; FIXME: asserts
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null -enable-legalize-types-checking=0 \
+; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names %s
+
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) {
+  %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+  ret { ppc_fp128, ppc_fp128 } %result
+}
+
+; FIXME: This could be made a tail call with the default expansion of llvm.sincospi.
+define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) {
+  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+  %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+  store ppc_fp128 %result.0, ptr %out_sin, align 16
+  store ppc_fp128 %result.1, ptr %out_cos, align 16
+  ret void
+}
+
+; NOTE: This would need a struct-return library call for llvm.sincospi to become a tail call. 
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) { + %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a) + ret { ppc_fp128, ppc_fp128 } %result +} diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index cc38e250f183f..4e0394ee4fb8c 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -189,8 +189,8 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: cmplwi r4, 0 ; CHECK-NEXT: beq cr0, .LBB2_4 ; CHECK-NEXT: # %bb.1: # %bb3.preheader -; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: addi r10, r3, 4002 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: li r5, -1 @@ -198,7 +198,6 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: li r7, 3 ; CHECK-NEXT: li r8, 5 ; CHECK-NEXT: li r9, 9 -; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill @@ -215,7 +214,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: ldx r28, r10, r8 ; CHECK-NEXT: ld r27, 12(r10) ; CHECK-NEXT: ld r26, 8(r10) -; CHECK-NEXT: ldx r25, r10, r9 +; CHECK-NEXT: ldx r12, r10, r9 ; CHECK-NEXT: addi r10, r10, 1 ; CHECK-NEXT: mulld r11, r11, r0 ; CHECK-NEXT: mulld r11, r11, r30 @@ -223,7 +222,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: mulld r11, r11, r28 ; CHECK-NEXT: mulld r11, r11, r27 ; CHECK-NEXT: mulld r11, r11, r26 -; CHECK-NEXT: maddld r3, r11, r25, r3 +; CHECK-NEXT: maddld r3, r11, r12, r3 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -232,7 +231,6 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload ; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: addi r3, r4, 0 diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index 78d036202fe4e..b69b997254d2c 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -68,4 +68,94 @@ entry: ret i32 %call } +define i32 @strlen_test_fp_strict(ptr noundef %str) nounwind { +; CHECK-AIX-32-P9-LABEL: strlen_test_fp_strict: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -64(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 60(r1) +; CHECK-AIX-32-P9-NEXT: bl .___strlen[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 64 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: strlen_test_fp_strict: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -16(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r3, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl strlen +; CHECK-LINUX32-P9-NEXT: lwz r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 16 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; 
CHECK-LINUX32-P9-NEXT: blr +entry: + %str.addr = alloca ptr, align 4 + store ptr %str, ptr %str.addr, align 4 + %0 = load ptr, ptr %str.addr, align 4 + %call = call i32 @strlen(ptr noundef %0) #0 + ret i32 %call +} + declare i32 @strlen(ptr noundef) nounwind +attributes #0 = { strictfp } + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { +; CHECK-AIX-32-P9-LABEL: test_memmove: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -80(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 88(r1) +; CHECK-AIX-32-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill +; CHECK-AIX-32-P9-NEXT: mr r31, r3 +; CHECK-AIX-32-P9-NEXT: stw r3, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, 68(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, 64(r1) +; CHECK-AIX-32-P9-NEXT: bl .___memmove[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: mr r3, r31 +; CHECK-AIX-32-P9-NEXT: lwz r31, 76(r1) # 4-byte Folded Reload +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 80 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: test_memmove: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -32(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: .cfi_def_cfa_offset 32 +; CHECK-LINUX32-P9-NEXT: .cfi_offset lr, 4 +; CHECK-LINUX32-P9-NEXT: .cfi_offset r30, -8 +; CHECK-LINUX32-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill +; CHECK-LINUX32-P9-NEXT: mr r30, r3 +; CHECK-LINUX32-P9-NEXT: stw r3, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r4, 16(r1) +; CHECK-LINUX32-P9-NEXT: stw r5, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl memmove +; CHECK-LINUX32-P9-NEXT: mr r3, r30 +; CHECK-LINUX32-P9-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload +; CHECK-LINUX32-P9-NEXT: lwz r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 32 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 4 + %source.addr = alloca ptr, align 4 + %num.addr = alloca i32, align 4 + store ptr %destination, ptr %destination.addr, align 4 + store ptr %source, ptr %source.addr, align 4 + store i32 %num, ptr %num.addr, align 4 + %0 = load ptr, ptr %destination.addr, align 4 + %1 = load ptr, ptr %source.addr, align 4 + %2 = load i32, ptr %num.addr, align 4 + call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll index 8b87529d9a6d8..2dbf4140a0fa4 100644 --- a/llvm/test/CodeGen/PowerPC/milicode64.ll +++ b/llvm/test/CodeGen/PowerPC/milicode64.ll @@ -100,3 +100,82 @@ entry: } declare i64 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 { +; CHECK-LE-P9-LABEL: test_memmove: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mflr r0 +; CHECK-LE-P9-NEXT: .cfi_def_cfa_offset 80 +; CHECK-LE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-LE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-LE-P9-NEXT: stdu r1, -80(r1) +; CHECK-LE-P9-NEXT: std r0, 96(r1) +; CHECK-LE-P9-NEXT: mr r30, r3 +; CHECK-LE-P9-NEXT: std r3, 56(r1) +; CHECK-LE-P9-NEXT: std r4, 48(r1) +; CHECK-LE-P9-NEXT: std r5, 40(r1) +; CHECK-LE-P9-NEXT: bl memmove +; CHECK-LE-P9-NEXT: nop +; CHECK-LE-P9-NEXT: 
mr r3, r30
+; CHECK-LE-P9-NEXT: addi r1, r1, 80
+; CHECK-LE-P9-NEXT: ld r0, 16(r1)
+; CHECK-LE-P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-LE-P9-NEXT: mtlr r0
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_memmove:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: mflr r0
+; CHECK-BE-P9-NEXT: stdu r1, -160(r1)
+; CHECK-BE-P9-NEXT: std r0, 176(r1)
+; CHECK-BE-P9-NEXT: .cfi_def_cfa_offset 160
+; CHECK-BE-P9-NEXT: .cfi_offset lr, 16
+; CHECK-BE-P9-NEXT: .cfi_offset r30, -16
+; CHECK-BE-P9-NEXT: std r30, 144(r1) # 8-byte Folded Spill
+; CHECK-BE-P9-NEXT: mr r30, r3
+; CHECK-BE-P9-NEXT: std r3, 136(r1)
+; CHECK-BE-P9-NEXT: std r4, 128(r1)
+; CHECK-BE-P9-NEXT: std r5, 120(r1)
+; CHECK-BE-P9-NEXT: bl memmove
+; CHECK-BE-P9-NEXT: nop
+; CHECK-BE-P9-NEXT: mr r3, r30
+; CHECK-BE-P9-NEXT: ld r30, 144(r1) # 8-byte Folded Reload
+; CHECK-BE-P9-NEXT: addi r1, r1, 160
+; CHECK-BE-P9-NEXT: ld r0, 16(r1)
+; CHECK-BE-P9-NEXT: mtlr r0
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_memmove:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: mflr r0
+; CHECK-AIX-64-P9-NEXT: stdu r1, -144(r1)
+; CHECK-AIX-64-P9-NEXT: std r0, 160(r1)
+; CHECK-AIX-64-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill
+; CHECK-AIX-64-P9-NEXT: mr r31, r3
+; CHECK-AIX-64-P9-NEXT: std r3, 128(r1)
+; CHECK-AIX-64-P9-NEXT: std r4, 120(r1)
+; CHECK-AIX-64-P9-NEXT: std r5, 112(r1)
+; CHECK-AIX-64-P9-NEXT: bl .___memmove64[PR]
+; CHECK-AIX-64-P9-NEXT: nop
+; CHECK-AIX-64-P9-NEXT: mr r3, r31
+; CHECK-AIX-64-P9-NEXT: ld r31, 136(r1) # 8-byte Folded Reload
+; CHECK-AIX-64-P9-NEXT: addi r1, r1, 144
+; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1)
+; CHECK-AIX-64-P9-NEXT: mtlr r0
+; CHECK-AIX-64-P9-NEXT: blr
+entry:
+  %destination.addr = alloca ptr, align 8
+  %source.addr = alloca ptr, align 8
+  %num.addr = alloca i64, align 8
+  store ptr %destination, ptr %destination.addr, align 8
+  store ptr %source, ptr %source.addr, align 8
+  store i64 %num, ptr %num.addr, align 8
+  %0 = load ptr, ptr %destination.addr, align 8
+  %1 = load ptr, ptr %source.addr, align 8
+  %2 = load i64, ptr %num.addr, align 8
+  call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false)
+  ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i64(ptr writeonly captures(none), ptr readonly captures(none), i64, i1 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
index 7e2f744ac1d71..94121f09e36be 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
@@ -5,6 +5,12 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
 ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC
 
 define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 {
 ; CHECK-LABEL: testMultiply:
@@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound
 ; CHECK-BE-NEXT: ld r30, -16(r1)
 ; CHECK-BE-NEXT: mtlr r0
 ; 
CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; 
CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # 
%bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; 
LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: 
addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 
16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function 
Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 
32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 
wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> 
@llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: 
blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: 
stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 
0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # 
%entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> 
%vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; 
CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: 
stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; 
CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; 
CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; 
CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, 
wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr 
+; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # 
%bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 
48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; 
CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = 
tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; 
CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 
32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: 
stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; 
CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: 
.p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp 
wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: 
.p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll index 291cf97fd009e..929bf5f61dd90 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll +++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512" @@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 { ; CHECK-NEXT: xxswapd 0, 0 ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: blr +; +; CHECK-WACC-LABEL: baz: +; CHECK-WACC: # %bb.0: # %bb +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxmrgld 1, 34, 36 +; CHECK-WACC-NEXT: xxswapd 2, 1 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvnegdp 1, 1 +; CHECK-WACC-NEXT: xvnegdp 2, 2 +; CHECK-WACC-NEXT: xvsubdp 1, 1, 0 +; CHECK-WACC-NEXT: xvsubdp 2, 2, 37 +; CHECK-WACC-NEXT: xvmuldp 1, 1, 0 +; CHECK-WACC-NEXT: xvmuldp 2, 2, 0 +; CHECK-WACC-NEXT: xvmaddadp 1, 0, 0 +; CHECK-WACC-NEXT: xvmaddadp 2, 0, 0 +; CHECK-WACC-NEXT: stxv 1, 0(3) +; CHECK-WACC-NEXT: stxv 2, 0(3) +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 20, L..BB0_2 +; CHECK-WACC-NEXT: # %bb.1: # %bb10 +; CHECK-WACC-NEXT: xvf64gerpp 0, 34, 0 +; CHECK-WACC-NEXT: L..BB0_2: # %bb12 +; CHECK-WACC-NEXT: cmpdi 3, 0 +; CHECK-WACC-NEXT: .align 4 +; CHECK-WACC-NEXT: L..BB0_3: # %bb13 +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: bc 4, 2, L..BB0_3 +; CHECK-WACC-NEXT: # %bb.4: # %bb14 +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvsubdp 1, 0, 35 +; CHECK-WACC-NEXT: xxlxor 2, 2, 2 +; CHECK-WACC-NEXT: xvmaddadp 2, 1, 2 +; CHECK-WACC-NEXT: xvadddp 0, 2, 0 +; CHECK-WACC-NEXT: xxswapd 0, 0 +; CHECK-WACC-NEXT: stxv 0, 0(3) +; CHECK-WACC-NEXT: blr bb: %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison) %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0 diff --git a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll index 7d6117719da1d..2f7d227fa9e06 100644 --- a/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/perfect-shuffle.ll @@ -162,16 +162,16 @@ define <4 x float> @shuffle5(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, <16 x ; BE-ENABLE-NEXT: vextublx 3, 3, 2 ; BE-ENABLE-NEXT: xxmrghw 0, 1, 0 ; BE-ENABLE-NEXT: andi. 
3, 3, 255 -; BE-ENABLE-NEXT: xxlor 1, 0, 0 +; BE-ENABLE-NEXT: xxlor 35, 0, 0 ; BE-ENABLE-NEXT: beq 0, .LBB4_2 ; BE-ENABLE-NEXT: # %bb.1: # %exit -; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: xvaddsp 34, 35, 0 ; BE-ENABLE-NEXT: blr ; BE-ENABLE-NEXT: .LBB4_2: # %second -; BE-ENABLE-NEXT: xxmrglw 1, 36, 37 -; BE-ENABLE-NEXT: xxmrghw 2, 36, 37 -; BE-ENABLE-NEXT: xxmrghw 1, 2, 1 -; BE-ENABLE-NEXT: xvaddsp 34, 0, 1 +; BE-ENABLE-NEXT: xxmrglw 0, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 1, 36, 37 +; BE-ENABLE-NEXT: xxmrghw 0, 1, 0 +; BE-ENABLE-NEXT: xvaddsp 34, 35, 0 ; BE-ENABLE-NEXT: blr entry: %shuf1 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll index 516d54ba2fdbe..509457042ed68 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll @@ -26,11 +26,12 @@ define void @main() nounwind #0 { ; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: mr 6, 5 ; CHECK-NEXT: stwu 4, 4(3) -; CHECK-NEXT: mullw 4, 5, 5 ; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: mullw 4, 6, 6 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: stwu 4, 4(3) diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll index 4904d11fc8104..8b4b50239a1a0 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll @@ -5,46 +5,45 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr { ; CHECK-LABEL: phi2: ; CHECK: # %bb.0: -; CHECK-NEXT: divw 8, 3, 4 +; CHECK-NEXT: divw 7, 3, 4 ; CHECK-NEXT: li 5, 55 ; CHECK-NEXT: li 6, 48 ; CHECK-NEXT: mtctr 3 ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: divw 9, 8, 4 -; CHECK-NEXT: mullw 7, 8, 4 -; CHECK-NEXT: sub 3, 3, 7 +; CHECK-NEXT: divw 9, 7, 4 +; CHECK-NEXT: mullw 8, 7, 4 +; CHECK-NEXT: sub 3, 3, 8 ; CHECK-NEXT: cmplwi 3, 10 -; CHECK-NEXT: isellt 7, 6, 5 -; CHECK-NEXT: add 3, 7, 3 -; CHECK-NEXT: stbu 3, -1(7) -; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: isellt 8, 6, 5 +; CHECK-NEXT: add 3, 8, 3 +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: mr 3, 9 -; CHECK-NEXT: mullw 9, 9, 4 -; CHECK-NEXT: divw 10, 3, 4 -; CHECK-NEXT: sub 8, 8, 9 -; CHECK-NEXT: cmplwi 8, 10 -; CHECK-NEXT: isellt 9, 6, 5 -; CHECK-NEXT: add 8, 9, 8 -; CHECK-NEXT: mr 9, 10 -; CHECK-NEXT: stbu 8, -1(7) -; CHECK-NEXT: mr 8, 3 +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: mr 7, 9 +; CHECK-NEXT: mullw 10, 9, 4 +; CHECK-NEXT: divw 9, 9, 4 +; CHECK-NEXT: sub 3, 3, 10 +; CHECK-NEXT: cmplwi 3, 10 +; CHECK-NEXT: isellt 10, 6, 5 +; CHECK-NEXT: add 3, 10, 3 +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mr 8, 9 +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: mr 7, 9 ; CHECK-NEXT: b .LBB0_5 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: # implicit-def: $x7 +; CHECK-NEXT: # implicit-def: $x8 ; CHECK-NEXT: .LBB0_5: -; CHECK-NEXT: mullw 4, 8, 4 +; CHECK-NEXT: mullw 4, 7, 4 ; CHECK-NEXT: sub 3, 3, 4 ; CHECK-NEXT: cmplwi 3, 10 ; CHECK-NEXT: isellt 4, 6, 5 ; CHECK-NEXT: add 3, 4, 3 -; CHECK-NEXT: stbu 3, -1(7) +; CHECK-NEXT: stbu 3, -1(8) ; CHECK-NEXT: blr br label %4 diff --git a/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll b/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll new file mode 
100644 index 0000000000000..03b1456f0c036 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec_rotate_lw.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +define <4 x i32> @testVRLWMI(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: testVRLWMI: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvrlw v2, v2, v3 +; CHECK-NEXT: blr +entry: + %0 = tail call <4 x i32> @llvm.ppc.vsx.xvrlw(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} diff --git a/llvm/test/CodeGen/PowerPC/vec_rounding.ll b/llvm/test/CodeGen/PowerPC/vec_rounding.ll index 2f16a435440ff..438c8ebdc099e 100644 --- a/llvm/test/CodeGen/PowerPC/vec_rounding.ll +++ b/llvm/test/CodeGen/PowerPC/vec_rounding.ll @@ -1,172 +1,251 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s ; Check vector round to single-precision toward -infinity (vrfim) ; instruction generation using Altivec. -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - declare <2 x double> @llvm.floor.v2f64(<2 x double> %p) define <2 x double> @floor_v2f64(<2 x double> %p) +; CHECK-LABEL: floor_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frim 1, 1 +; CHECK-NEXT: frim 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: floor_v2f64: -; CHECK: frim -; CHECK: frim declare <4 x double> @llvm.floor.v4f64(<4 x double> %p) define <4 x double> @floor_v4f64(<4 x double> %p) +; CHECK-LABEL: floor_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frim 1, 1 +; CHECK-NEXT: frim 2, 2 +; CHECK-NEXT: frim 3, 3 +; CHECK-NEXT: frim 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: floor_v4f64: -; CHECK: frim -; CHECK: frim -; CHECK: frim -; CHECK: frim declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p) define <2 x double> @ceil_v2f64(<2 x double> %p) +; CHECK-LABEL: ceil_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frip 1, 1 +; CHECK-NEXT: frip 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: ceil_v2f64: -; CHECK: frip -; CHECK: frip declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p) define <4 x double> @ceil_v4f64(<4 x double> %p) +; CHECK-LABEL: ceil_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frip 1, 1 +; CHECK-NEXT: frip 2, 2 +; CHECK-NEXT: frip 3, 3 +; CHECK-NEXT: frip 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: ceil_v4f64: -; CHECK: frip -; CHECK: frip -; CHECK: frip -; CHECK: frip declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p) define <2 x double> @trunc_v2f64(<2 x double> %p) 
+; CHECK-LABEL: trunc_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: friz 1, 1 +; CHECK-NEXT: friz 2, 2 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: trunc_v2f64: -; CHECK: friz -; CHECK: friz declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p) define <4 x double> @trunc_v4f64(<4 x double> %p) +; CHECK-LABEL: trunc_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: friz 1, 1 +; CHECK-NEXT: friz 2, 2 +; CHECK-NEXT: friz 3, 3 +; CHECK-NEXT: friz 4, 4 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: trunc_v4f64: -; CHECK: friz -; CHECK: friz -; CHECK: friz -; CHECK: friz declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) -define <2 x double> @nearbyint_v2f64(<2 x double> %p) +define <2 x double> @nearbyint_v2f64(<2 x double> %p) nounwind +; CHECK-LABEL: nearbyint_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stdu 1, -128(1) +; CHECK-NEXT: std 0, 144(1) +; CHECK-NEXT: stfd 30, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 31, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 31, 2 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 30, 1 +; CHECK-NEXT: fmr 1, 31 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 2, 1 +; CHECK-NEXT: fmr 1, 30 +; CHECK-NEXT: lfd 31, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 30, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 128 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr { %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p) ret <2 x double> %t } -; CHECK-LABEL: nearbyint_v2f64: -; CHECK: bl nearbyint -; CHECK: bl nearbyint declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) -define <4 x double> @nearbyint_v4f64(<4 x double> %p) +define <4 x double> @nearbyint_v4f64(<4 x double> %p) nounwind +; CHECK-LABEL: nearbyint_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stdu 1, -144(1) +; CHECK-NEXT: std 0, 160(1) +; CHECK-NEXT: stfd 28, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 29, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 29, 2 +; CHECK-NEXT: stfd 30, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 30, 3 +; CHECK-NEXT: stfd 31, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: fmr 31, 4 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 28, 1 +; CHECK-NEXT: fmr 1, 29 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 29, 1 +; CHECK-NEXT: fmr 1, 30 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 30, 1 +; CHECK-NEXT: fmr 1, 31 +; CHECK-NEXT: bl nearbyint +; CHECK-NEXT: nop +; CHECK-NEXT: fmr 4, 1 +; CHECK-NEXT: fmr 1, 28 +; CHECK-NEXT: lfd 31, 136(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 28, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: fmr 2, 29 +; CHECK-NEXT: fmr 3, 30 +; CHECK-NEXT: lfd 30, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 29, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 144 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr { %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p) ret <4 x double> %t } -; CHECK-LABEL: nearbyint_v4f64: -; CHECK: bl nearbyint -; CHECK: bl nearbyint -; CHECK: bl nearbyint -; CHECK: bl nearbyint declare <4 x float> @llvm.floor.v4f32(<4 x float> %p) define <4 x float> @floor_v4f32(<4 x float> %p) +; CHECK-LABEL: floor_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfim 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p) ret <4 x float> %t } -; 
CHECK-LABEL: floor_v4f32: -; CHECK: vrfim declare <8 x float> @llvm.floor.v8f32(<8 x float> %p) define <8 x float> @floor_v8f32(<8 x float> %p) +; CHECK-LABEL: floor_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfim 2, 2 +; CHECK-NEXT: vrfim 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: floor_v8f32: -; CHECK: vrfim -; CHECK: vrfim declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p) define <4 x float> @ceil_v4f32(<4 x float> %p) +; CHECK-LABEL: ceil_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfip 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: ceil_v4f32: -; CHECK: vrfip declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p) define <8 x float> @ceil_v8f32(<8 x float> %p) +; CHECK-LABEL: ceil_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfip 2, 2 +; CHECK-NEXT: vrfip 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: ceil_v8f32: -; CHECK: vrfip -; CHECK: vrfip declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p) define <4 x float> @trunc_v4f32(<4 x float> %p) +; CHECK-LABEL: trunc_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfiz 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: trunc_v4f32: -; CHECK: vrfiz declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p) define <8 x float> @trunc_v8f32(<8 x float> %p) +; CHECK-LABEL: trunc_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfiz 2, 2 +; CHECK-NEXT: vrfiz 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: trunc_v8f32: -; CHECK: vrfiz -; CHECK: vrfiz declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) define <4 x float> @nearbyint_v4f32(<4 x float> %p) +; CHECK-LABEL: nearbyint_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfin 2, 2 +; CHECK-NEXT: blr { %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p) ret <4 x float> %t } -; CHECK-LABEL: nearbyint_v4f32: -; CHECK: vrfin declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) define <8 x float> @nearbyint_v8f32(<8 x float> %p) +; CHECK-LABEL: nearbyint_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vrfin 2, 2 +; CHECK-NEXT: vrfin 3, 3 +; CHECK-NEXT: blr { %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p) ret <8 x float> %t } -; CHECK-LABEL: nearbyint_v8f32: -; CHECK: vrfin -; CHECK: vrfin diff --git a/llvm/test/CodeGen/PowerPC/vector-rotates.ll b/llvm/test/CodeGen/PowerPC/vector-rotates.ll index 2de8804ba8e24..38e273634da2a 100644 --- a/llvm/test/CodeGen/PowerPC/vector-rotates.ll +++ b/llvm/test/CodeGen/PowerPC/vector-rotates.ll @@ -5,6 +5,9 @@ ; RUN: llc -O3 -mtriple=powerpc64-unknown-unknown -ppc-asm-full-reg-names \ ; RUN: -verify-machineinstrs -mcpu=pwr7 < %s | \ ; RUN: FileCheck --check-prefix=CHECK-P7 %s +; RUN: llc -O3 -mtriple=powerpc64-unknown-unknown -ppc-asm-full-reg-names \ +; RUN: -verify-machineinstrs -mcpu=future < %s | \ +; RUN: FileCheck --check-prefix=CHECK-FUTURE %s define <16 x i8> @rotl_v16i8(<16 x i8> %a) { ; CHECK-P8-LABEL: rotl_v16i8: @@ -23,6 +26,14 @@ define <16 x i8> @rotl_v16i8(<16 x i8> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlb v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v16i8: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrlb v2, 
v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <16 x i8> %a, <i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7, i8 8, i8 8> %c = lshr <16 x i8> %a, <i8 7, i8 7, i8 6, i8 6, i8 5, i8 5, i8 4, i8 4, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 0, i8 0> @@ -47,6 +58,14 @@ define <8 x i16> @rotl_v8i16(<8 x i16> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlh v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v8i16: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrlh v2, v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <8 x i16> %a, <i16 1, i16 2, i16 3, i16 5, i16 7, i16 11, i16 13, i16 16> %c = lshr <8 x i16> %a, <i16 15, i16 14, i16 13, i16 11, i16 9, i16 5, i16 3, i16 0> @@ -71,6 +90,14 @@ define <4 x i32> @rotl_v4i32_0(<4 x i32> %a) { ; CHECK-P7-NEXT: lxvw4x vs35, 0, r3 ; CHECK-P7-NEXT: vrlw v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v4i32_0: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs0, 0(r3) +; CHECK-FUTURE-NEXT: xvrlw vs34, vs34, vs0 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <4 x i32> %a, <i32 29, i32 19, i32 17, i32 11> %c = lshr <4 x i32> %a, <i32 3, i32 13, i32 15, i32 21> @@ -94,6 +121,12 @@ define <4 x i32> @rotl_v4i32_1(<4 x i32> %a) { ; CHECK-P7-NEXT: vsubuwm v3, v4, v3 ; CHECK-P7-NEXT: vrlw v2, v2, v3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v4i32_1: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: xxspltiw vs0, 23 +; CHECK-FUTURE-NEXT: xvrlw vs34, vs34, vs0 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <4 x i32> %a, <i32 23, i32 23, i32 23, i32 23> %c = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9> @@ -124,6 +157,14 @@ define <2 x i64> @rotl_v2i64(<2 x i64> %a) { ; CHECK-P7-NEXT: addi r3, r1, -16 ; CHECK-P7-NEXT: lxvd2x vs34, 0, r3 ; CHECK-P7-NEXT: blr +; +; CHECK-FUTURE-LABEL: rotl_v2i64: +; CHECK-FUTURE: # %bb.0: # %entry +; CHECK-FUTURE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-FUTURE-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-FUTURE-NEXT: lxv vs35, 0(r3) +; CHECK-FUTURE-NEXT: vrld v2, v2, v3 +; CHECK-FUTURE-NEXT: blr entry: %b = shl <2 x i64> %a, <i64 41, i64 53> %c = lshr <2 x i64> %a, <i64 23, i64 11> diff --git a/llvm/test/CodeGen/PowerPC/vp-ld-st.ll b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll new file mode 100644 index 0000000000000..f0f9943e901ec --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mcpu=pwr10 \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck -check-prefix=FUTURE %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck --check-prefix=FUTURE %s + +; Function Attrs: nounwind readnone +define void @stxvl1(<16 x i8> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 56 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl1: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: stxvrl 34, 5, 6 
+; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v16i8.p0(<16 x i8> %a, ptr %b, <16 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl2(<8 x i16> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 57 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl2: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 1 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v8i16.p0(<8 x i16> %a, ptr %b, <8 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl4(<4 x i32> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 58 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl4: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 2 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v4i32.p0(<4 x i32> %a, ptr %b, <4 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define void @stxvl8(<2 x i64> %a, ptr %b, i64 %c) { +; CHECK-LABEL: stxvl8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 3, 6, 59 +; CHECK-NEXT: stxvl 34, 5, 3 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: stxvl8: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 3, 6, 3 +; FUTURE-NEXT: stxvrl 34, 5, 3 +; FUTURE-NEXT: blr +entry: + %cconv = trunc i64 %c to i32 + tail call void @llvm.vp.store.v2i64.p0(<2 x i64> %a, ptr %b, <2 x i1> splat (i1 true), i32 %cconv) + ret void +} + +; Function Attrs: nounwind readnone +define <16 x i8> @lxvl1(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 56 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl1: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <16 x i8> @llvm.vp.load.v16i8.p0(ptr %a, <16 x i1> splat (i1 true), i32 %bconv) + ret <16 x i8> %0 +} + +; Function Attrs: nounwind readnone +define <8 x i16> @lxvl2(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 57 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl2: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 1 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <8 x i16> @llvm.vp.load.v8i16.p0(ptr %a, <8 x i1> splat (i1 true), i32 %bconv) + ret <8 x i16> %0 +} + +; Function Attrs: nounwind readnone +define <4 x i32> @lxvl4(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 58 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl4: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 2 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + %bconv = trunc i64 %b to i32 + %0 = tail call <4 x i32> @llvm.vp.load.v4i32.p0(ptr %a, <4 x i1> splat (i1 true), i32 %bconv) + ret <4 x i32> %0 +} + +; Function Attrs: nounwind readnone +define <2 x i64> @lxvl8(ptr %a, i64 %b) { +; CHECK-LABEL: lxvl8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi 4, 4, 59 +; CHECK-NEXT: lxvl 34, 3, 4 +; CHECK-NEXT: blr +; +; FUTURE-LABEL: lxvl8: +; FUTURE: # %bb.0: # %entry +; FUTURE-NEXT: sldi 4, 4, 3 +; FUTURE-NEXT: lxvrl 34, 3, 4 +; FUTURE-NEXT: blr +entry: + 
%bconv = trunc i64 %b to i32 + %0 = tail call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %a, <2 x i1> splat (i1 true), i32 %bconv) + ret <2 x i64> %0 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll index b24ea9ec1561e..3c617f9854761 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll @@ -32,9 +32,12 @@ define void @constant_fold_barrier_i128(ptr %p) { ; RV32-NEXT: mv a6, a1 ; RV32-NEXT: seqz a7, a1 ; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: mv a7, a1 ; RV32-NEXT: seqz a3, a1 ; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: mv a1, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a6, 4(a0) ; RV32-NEXT: sw a7, 8(a0) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 225ceed9627b7..5f61ee2d02d24 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -103,15 +103,18 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: add a5, a5, a6 ; RV32-NEXT: mv t0, t1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a4, a5, a6 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: sltu a6, t1, t1 ; RV32-NEXT: sltiu t1, t1, 0 ; RV32-NEXT: add t0, t0, t2 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a2, a5, a7 ; RV32-NEXT: add a6, a6, t1 ; RV32-NEXT: sltu a5, t0, t2 ; RV32-NEXT: add t0, t0, a0 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: add a2, a4, a2 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, t0, a0 @@ -155,6 +158,7 @@ define i64 @udiv64_constant_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a7, a0, a2 ; RV32-NEXT: mulhu t2, a1, a3 ; RV32-NEXT: mv t1, t2 +; RV32-NEXT: mv t1, t1 ; RV32-NEXT: mul t2, a1, a3 ; RV32-NEXT: mulhu a2, a1, a2 ; RV32-NEXT: mulhu a3, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
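+; Each intrinsic below is exercised in two forms: the unmasked form passes
+; poison as the merge operand, while the masked form carries a trailing
+; policy operand of 1 (tail agnostic, mask undisturbed), which is why the
+; masked checks expect "ta, mu" in their vsetvli and the unmasked checks
+; expect "ta, ma".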
+ +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll new file mode 100644 index 0000000000000..fafd45b7579e8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), 
v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+    <vscale x 1 x half> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+    <vscale x 2 x half> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+    <vscale x 4 x half> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i16> %a
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 32 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll
new file mode 100644
index 0000000000000..916af2556c6a8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll
@@ -0,0 +1,1341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN:   < %s | FileCheck %s
+
+; The intrinsics are not supported on RV32.
+
+declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i64>,
+  i64);
+
+define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vluxei64.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i64>,
+  <vscale x 1 x i1>,
+  i64,
+  i64);
+
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i64> %2,
+    <vscale x 1 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i64>,
+  i64);
+
+define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vluxei64.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + 
<vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x 
i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + 
<vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + 
+declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> 
@llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + 
+define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 
x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 
x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> 
@llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + 
<vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + 
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> 
@intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> 
@intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + 
+define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + 
iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> 
@intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> 
@llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + 
+declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x 
i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr 
%0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + 
<vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: 
vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 
8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen 
%4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); 
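+; NOTE (editorial aside, hedged): the cases above all follow one pattern. The masked
+; variants carry a trailing policy operand, iXLen 1 (tail agnostic, mask undisturbed),
+; which is why their vsetvli prints "ta, mu" while the unmasked forms print "ta, ma".
+; Wherever the data EEW differs from the 16-bit index EEW, the checked codegen keeps
+; the destination register group from overlapping the index group, either by loading
+; into a scratch group and copying back (vmv1r.v/vmv.v.v) or by first moving the index
+; aside (vmv1r.v/vmv2r.v/vmv4r.v); with matching EEW the load is done in place
+; (vluxei16.v v8, (a0), v8). A minimal sketch of an unmasked call site, assuming
+; hypothetical %base/%pidx/%vl values (iXLen stands in for the target's XLEN integer
+; type in these tests):
+;
+;   %idx = load <vscale x 1 x i16>, ptr %pidx
+;   %v   = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16(
+;              <vscale x 1 x i64> poison, ptr %base, <vscale x 1 x i16> %idx, iXLen %vl)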
+ +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + 
<vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 
x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x 
half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret 
<vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x 
float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, 
<vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x 
i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, 
(a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> 
%2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + 
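+; A hedged editorial aside, not produced by update_llc_test_checks.py: the
+; extra register moves in the checks above fall out of the RVV overlap
+; constraints between an indexed load's destination register group and its
+; index group when their EEWs differ. For LMUL=1 results the load is emitted
+; into v9 and copied back (vmv1r.v or vmv.v.v); for wider results the m1
+; index vector is first copied off v8 (vmv1r.v/vmv2r.v/vmv4r.v) so that
+; vluxei8.v can write the whole v8 group. A minimal sketch of the unmasked
+; m2 case, assuming the same register assignment as the tests above:
+;
+;   vsetvli   zero, a1, e16, m2, ta, ma  ; SEW=16, LMUL=2, VL from a1
+;   vmv1r.v   v10, v8                    ; move the e8 index group off v8..v9
+;   vluxei8.v v8, (a0), v10              ; e8 indices, e16 data into v8m2
+;
+; The trailing "iXLen 1" on the masked calls is the policy operand: bit 0
+; set requests a tail-agnostic result and bit 1 clear leaves masked-off
+; elements undisturbed, which is why the masked checks use "ta, mu" while
+; the unmasked poison-passthru forms use "ta, ma".
+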
+define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x 
i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> 
@llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> 
@llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define 
<vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll new file mode 100644 index 0000000000000..785d9fc6a7970 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vse.ll @@ -0,0 +1,1575 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vse.nxv1i64( + <vscale x 
1 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +define void @intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i1> splat (i1 true), + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i64( + <vscale x 8 x i64>, + 
ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void 
@llvm.riscv.vse.mask.nxv4f64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i32( + 
<vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call 
void @llvm.riscv.vse.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f32( + <vscale x 8 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e32, m4, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i16_nxv4i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i1> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1f16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2f16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4f16( + <vscale x 4 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4f16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen); + +define void 
@intrinsic_vse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8f16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16f16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32f16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i8( + <vscale x 1 x i8>, 
+ ptr, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv16i8( + <vscale x 16 x i8>, + 
ptr, + iXLen); + +define void @intrinsic_vse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vse.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen); + +define void @intrinsic_vse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2) + + ret void +} + +declare void @llvm.riscv.vse.mask.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vse.mask.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i1> %2, + iXLen %3) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll new file mode 100644 index 0000000000000..5237536c07740 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsm.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -global-isel -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -global-isel 
-verify-machineinstrs | FileCheck %s + +declare void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv1i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv2i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv2i1(<vscale x 2 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv4i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv4i1(<vscale x 4 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv8i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv8i1(<vscale x 8 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv16i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv16i1(<vscale x 16 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv32i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv32i1(<vscale x 32 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1>, ptr, iXLen); + +define void @intrinsic_vsm_v_nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsm_v_nxv64i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsm.v v0, (a0) +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsm.nxv64i1(<vscale x 64 x i1> %0, ptr %1, iXLen %2) + ret void +} + +declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16( + <vscale x 1 x i16>, + <vscale x 1 x i16>, + iXLen); + +; Make sure we can use the vsetvli from the producing instruction. 
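+; (vsm.v stores ceil(vl/8) mask bytes with an implied EEW of 8 and reads
+; only vl from the current configuration, so the vsetvli emitted for the
+; producing vmseq can be reused even though its SEW/LMUL differ.)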
+define void @test_vsetvli_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i16( + <vscale x 1 x i16> %0, + <vscale x 1 x i16> %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3) + ret void +} + +declare <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32( + <vscale x 1 x i32>, + <vscale x 1 x i32>, + iXLen); + +define void @test_vsetvli_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, ptr %2, iXLen %3) nounwind { +; CHECK-LABEL: test_vsetvli_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmseq.vv v8, v8, v9 +; CHECK-NEXT: vsm.v v8, (a0) +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i1> @llvm.riscv.vmseq.nxv1i32( + <vscale x 1 x i32> %0, + <vscale x 1 x i32> %1, + iXLen %3) + call void @llvm.riscv.vsm.nxv1i1(<vscale x 1 x i1> %a, ptr %2, iXLen %3) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll new file mode 100644 index 0000000000000..4963d91a14988 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll @@ -0,0 +1,1293 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, 
+ ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( 
+ <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, 
v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x 
i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + 
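+; For indexed stores, vsetvli's SEW/LMUL describe the data operand while the
+; offset EEW is fixed by the mnemonic (vsoxei64.v => 64-bit offsets); hence
+; the f64 tests below select e64, m1 where the f32 tests above used e32, mf2.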
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll new file mode 100644 index 0000000000000..7ea2e1734e5a2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, 
mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 
1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 
x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret 
void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + 
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, 
+ ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 
8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale 
x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr 
%1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + 
ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale 
x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, 
ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + 
<vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + 
ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll new file mode 100644 index 0000000000000..b7609ff5fd1cd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsse.ll @@ -0,0 +1,1724 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsse.nxv1i64( + <vscale x 1 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + 
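+; Editor's illustration (hypothetical, not autogenerated by
+; update_llc_test_checks.py): the strided-store intrinsic exercised in this
+; file takes the data vector, the base pointer, a byte stride, and the
+; vector length, with iXLen rewritten to i32 or i64 by the sed commands in
+; the RUN lines above. With a 16-byte stride (2 * sizeof(i64)) the elements
+; land at base+0, base+16, base+32, ...; the helper name
+; @example_vsse_stride2 is assumed for illustration only.
+define void @example_vsse_stride2(<vscale x 1 x i64> %val, ptr %base, iXLen %vl) nounwind {
+entry:
+  ; The stride operand is in bytes, so 16 stores to every other i64 slot.
+  call void @llvm.riscv.vsse.nxv1i64(
+    <vscale x 1 x i64> %val,
+    ptr %base,
+    iXLen 16,
+    iXLen %vl)
+  ret void
+}
+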
+define void @intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_allonesmask_v_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> splat (i1 true), + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i64( + <vscale x 2 x i64>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i64( + <vscale x 4 x i64>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i64( + <vscale x 8 x i64>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i64( + <vscale x 8 x i64>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f64( + <vscale x 1 x double>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f64_nxv1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f64( + <vscale x 1 x double> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f64( + <vscale x 2 x double>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f64_nxv2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f64( + <vscale x 2 x double> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f64( + <vscale x 4 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f64( + <vscale x 4 x double>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f64_nxv4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f64( + <vscale x 4 x double> %0, + ptr 
%1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f64( + <vscale x 8 x double>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f64_nxv8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f64( + <vscale x 8 x double> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i32( + <vscale x 1 x i32>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i32( + <vscale x 2 x i32>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i32_nxv4i32(<vscale x 4 x i32> 
%0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i32( + <vscale x 4 x i32>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i32( + <vscale x 8 x i32>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i32( + <vscale x 16 x i32>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f32( + <vscale x 1 x float>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f32_nxv1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f32( + <vscale x 1 x float> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f32( + <vscale x 2 x float>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f32_nxv2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f32( + <vscale x 2 x float> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f32( + <vscale x 4 x float>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f32_nxv4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f32( + <vscale x 4 x float> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f32( + <vscale x 8 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f32( + <vscale x 8 x 
float>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f32_nxv8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f32( + <vscale x 8 x float> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f32( + <vscale x 16 x float>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f32_nxv16f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f32( + <vscale x 16 x float> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i16( + <vscale x 1 x i16>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i16( + <vscale x 2 x i16>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4i16( + <vscale x 4 x i16>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i16( + <vscale x 8 x i16>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i16( + <vscale x 16 x i16>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsse.mask.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i16( + <vscale x 32 x i16>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1f16( + <vscale x 1 x half>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1f16_nxv1f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1f16( + <vscale x 1 x half> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2f16( + <vscale x 2 x half>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2f16_nxv2f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2f16( + <vscale x 2 x half> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4f16( + <vscale x 4 x half>, + ptr, + 
iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv4f16( + <vscale x 4 x half>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4f16_nxv4f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4f16( + <vscale x 4 x half> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8f16( + <vscale x 8 x half>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8f16_nxv8f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8f16( + <vscale x 8 x half> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16f16( + <vscale x 16 x half>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16f16_nxv16f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16f16( + <vscale x 16 x half> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32f16( + <vscale x 32 x half>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv32f16_nxv32f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32f16( + <vscale x 32 x half> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i8( + <vscale x 1 x i8>, + ptr, + iXLen, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv2i8( + <vscale x 2 x i8>, + ptr, + iXLen, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsse.mask.nxv4i8( + <vscale x 4 x i8>, + ptr, + iXLen, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv8i8( + <vscale x 8 x i8>, + ptr, + iXLen, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv16i8( + <vscale x 16 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv16i8( + <vscale x 16 x i8>, + ptr, + iXLen, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv32i8( + <vscale x 32 x i8>, + ptr, + iXLen, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsse_mask_v_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsse.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen, + iXLen); + +define void @intrinsic_vsse_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsse_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv64i8( + <vscale x 64 x i8>, + ptr, + iXLen, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsse_mask_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, iXLen %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsse_mask_v_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsse.mask.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + iXLen %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll new file mode 100644 index 0000000000000..9bd272a368d20 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
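+; Editor's illustration (hypothetical, not part of the upstream file):
+; vsuxei performs an unordered indexed (scatter) store; element i of %val
+; is written to %base plus the unsigned byte offset held in element i of
+; %idx, and the individual stores may complete in any order. The helper
+; name @example_vsuxei_scatter is assumed for illustration only.
+define void @example_vsuxei_scatter(<vscale x 1 x i8> %val, ptr %base, <vscale x 1 x i64> %idx, i64 %vl) nounwind {
+entry:
+  ; e8 data with i64 offsets selects the vsuxei64.v form, as in the tests below.
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64(
+    <vscale x 1 x i8> %val,
+    ptr %base,
+    <vscale x 1 x i64> %idx,
+    i64 %vl)
+  ret void
+}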
+ +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 
4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void 
@intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, 
+ ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale 
x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare 
void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll new file mode 100644 index 0000000000000..7cd15454d40b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x 
i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + 
ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 
4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsuxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  <vscale x
32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, 
<vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; 
CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x 
i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x 
i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} 
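+; Note on the pattern exercised throughout this file (a summary of the
+; tests above, not new coverage): the unmasked intrinsic has the shape
+;   call void @llvm.riscv.vsuxei.<data>.<index>(
+;       <data> %val, ptr %base, <index> %offsets, iXLen %vl)
+; and the masked variant inserts a <vscale x N x i1> mask argument before
+; %vl. In the expected assembly the data operand stays in v8, the index
+; vector begins at v8 plus the number of registers the data occupies
+; (v9, v10, v12, or v16), and the masked form passes the mask implicitly
+; in v0, visible as the trailing v0.t operand.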
+ +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index ea08061221fd4..769823d1c4216 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -75,6 +75,7 @@ ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: RISC-V Promote Constants ; CHECK-NEXT: A No-Op Barrier Pass ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Merge internal globals diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 22c2d8102b5ca..f26d4f09c92fb 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -125,6 +125,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s ; RUN: llc -mtriple=riscv32 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCNTRPMF %s ; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV32SMPMPMT %s ; RUN: llc -mtriple=riscv32 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV32SMRNMI %s ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFA %s @@ -275,6 +276,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s ; RUN: llc -mtriple=riscv64 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCNTRPMF %s ; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV64SMPMPMT %s ; RUN: llc -mtriple=riscv64 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV64SMRNMI %s ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFA %s @@ -439,6 +441,7 @@ ; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0" ; RV32SMCNTRPMF: .attribute 5, "rv32i2p1_smcntrpmf1p0" ; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" +; RV32SMPMPMT: .attribute 5, "rv32i2p1_smpmpmt0p6" ; RV32SMRNMI: .attribute 5, "rv32i2p1_smrnmi1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV32ZVFBFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" @@ -587,6 +590,7 @@ ; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0" ; RV64SMCNTRPMF: .attribute 5, "rv64i2p1_smcntrpmf1p0" ; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" 
+; RV64SMPMPMT: .attribute 5, "rv64i2p1_smpmpmt0p6" ; RV64SMRNMI: .attribute 5, "rv64i2p1_smrnmi1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV64ZVFBFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebdeb3775..2aec92eca145f 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -127,13 +127,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV32-NEXT: .LBB3_2: # %while.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: addi a4, a1, 4 +; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: mv a1, a4 -; RV32-NEXT: bne a4, a2, .LBB3_2 +; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: bne a1, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -151,13 +149,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV64-NEXT: .LBB3_2: # %while.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: lw a3, 0(a1) -; RV64-NEXT: addi a4, a1, 4 +; RV64-NEXT: addi a1, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw a3, 0(a0) -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: mv a1, a4 -; RV64-NEXT: bne a4, a2, .LBB3_2 +; RV64-NEXT: addi a0, a0, 4 +; RV64-NEXT: bne a1, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir new file mode 100644 index 0000000000000..7844589e3f93c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir @@ -0,0 +1,35 @@ +# RUN: llc %s -mtriple=riscv64 \ +# RUN: -run-pass=cfi-instr-inserter \ +# RUN: -riscv-enable-cfi-instr-inserter=true +# XFAIL: * + +# Technically, it is possible that a callee-saved register is saved in multiple different locations. +# CFIInstrInserter should handle this, but currently it does not. +--- +name: multiple_locations +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x9, $x2 + BEQ $x10, $x0, %bb.3 + PseudoBR %bb.2 + + bb.1: + liveins: $x10, $x9, $x2 + $x5 = COPY $x9 + CFI_INSTRUCTION register $x9, $x5 + $x9 = COPY $x5 + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.2: + liveins: $x10, $x9, $x2 + SD $x9, $x2, 0 :: (store (s64)) + CFI_INSTRUCTION offset $x9, 0 + $x9 = LD $x2, 0 :: (load (s64)) + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.3: + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 37e11dbb12731..3d9906fdcbeb3 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: experimental - Experimental intrinsics. ; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)). ; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK-NEXT: experimental-smpmpmt - 'Smpmpmt' (PMP-based Memory Types Extension). ; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). ; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves). ; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension). 
@@ -136,6 +137,8 @@ ; CHECK-NEXT: shgatpa - 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare). ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). +; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. ; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir index d739537b50d05..293b15bf9d25e 100644 --- a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir +++ b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir @@ -1,8 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -o - %s -mtriple=riscv64 -run-pass=machine-cp -mcp-use-is-copy-instr | FileCheck %s -## This test was added to capture a case where MachineCopyPropagation risks -## leaving a no-op register move (add, x0, reg). +## This test was added to capture a case where MachineCopyPropagation may +## leave a no-op register move (add reg, x0, reg). +## Due to the bug reported in +## <https://github.com/llvm/llvm-project/issues/166870>, we are not currently +## able to optimize this case. --- name: ham @@ -21,6 +24,7 @@ body: | ; CHECK-NEXT: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x11 = ADDI $x0, 0 + ; CHECK-NEXT: renamable $x10 = ADDI killed renamable $x10, 0 ; CHECK-NEXT: BEQ renamable $x10, $x0, %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir index 2acb1d43e01ea..78d242b5a28b9 100644 --- a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir +++ b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir @@ -3,27 +3,33 @@ # RUN: llc -mtriple=riscv64 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \ # RUN: | FileCheck -check-prefixes=OUTLINED,RV64I-MO %s -# CFIs are invisible (they can be outlined, but won't actually impact the outlining result) if there -# is no need to unwind. CFIs will be stripped when we build outlined functions. +# Combined tests for outlining with CFI instructions on RISC-V: +# 1) All CFIs present in candidate: outline as tail-call and keep CFIs. +# 2) Partial CFIs in function (extra outside candidate): do not outline. +# 3) CFIs present but candidate is not a tail-call: do not outline. --- | - define void @func1(i32 %a, i32 %b) nounwind { ret void } - - define void @func2(i32 %a, i32 %b) nounwind { ret void } - - define void @func3(i32 %a, i32 %b) nounwind { ret void } + define void @funcA(i32 %a, i32 %b) nounwind { ret void } + define void @funcB(i32 %a, i32 %b) nounwind { ret void } + define void @funcC(i32 %a, i32 %b) nounwind { ret void } + define void @funcD(i32 %a, i32 %b) nounwind { ret void } + define void @funcE(i32 %a, i32 %b) nounwind { ret void } + define void @funcF(i32 %a, i32 %b) nounwind { ret void } ... + +# Case 1: All CFIs present; expect outlining and CFIs retained in outlined body. 
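+# funcA, funcB, and funcC carry the identical ORI/ADDI/AND/SUB sequence
+# with the same four CFIs (offsets 0, -4, -8, -12) interleaved, so each
+# body is expected to collapse to a PseudoTAIL to @OUTLINED_FUNCTION_0;
+# the OUTLINED checks at the end of the file require those CFIs to
+# survive inside the outlined body.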
--- -name: func1 +name: funcA tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func1 + ; RV32I-MO-LABEL: name: funcA ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func1 + ; + ; RV64I-MO-LABEL: name: funcA ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 @@ -39,62 +45,213 @@ body: | PseudoRET ... --- -name: func2 +name: funcB tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func2 + ; RV32I-MO-LABEL: name: funcB ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func2 + ; + ; RV64I-MO-LABEL: name: funcB ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 $x10 = ORI $x10, 1023 CFI_INSTRUCTION offset $x1, 0 $x11 = ORI $x11, 1023 - CFI_INSTRUCTION offset $x1, -8 - $x12 = ADDI $x10, 17 CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 $x11 = AND $x12, $x11 CFI_INSTRUCTION offset $x1, -12 $x10 = SUB $x10, $x11 PseudoRET ... + +# Case 2: Partial CFIs (extra CFI outside candidate in funcD); expect no outlining. --- -name: func3 +name: funcC tracksRegLiveness: true body: | bb.0: liveins: $x10, $x11 - ; RV32I-MO-LABEL: name: func3 + ; RV32I-MO-LABEL: name: funcC ; RV32I-MO: liveins: $x10, $x11 ; RV32I-MO-NEXT: {{ $}} ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 - ; RV64I-MO-LABEL: name: func3 + ; + ; RV64I-MO-LABEL: name: funcC ; RV64I-MO: liveins: $x10, $x11 ; RV64I-MO-NEXT: {{ $}} ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11 $x10 = ORI $x10, 1023 - CFI_INSTRUCTION offset $x1, -12 + CFI_INSTRUCTION offset $x1, 0 $x11 = ORI $x11, 1023 + CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 CFI_INSTRUCTION offset $x1, -8 + $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + PseudoRET +... 
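+# funcD repeats the same sequence but carries one extra CFI_INSTRUCTION
+# (offset $x1, -16) ahead of it, so its CFI state differs from the other
+# candidates; the checks expect funcD's body to stay fully intact while
+# funcC above is still outlined.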
+--- +name: funcD +tracksRegLiveness: true +body: | + bb.0: + liveins: $x10, $x11 + ; RV32I-MO-LABEL: name: funcD + ; RV32I-MO: liveins: $x10, $x11 + ; RV32I-MO-NEXT: {{ $}} + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -16 + ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV32I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV32I-MO-NEXT: PseudoRET + ; + ; RV64I-MO-LABEL: name: funcD + ; RV64I-MO: liveins: $x10, $x11 + ; RV64I-MO-NEXT: {{ $}} + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -16 + ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV64I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV64I-MO-NEXT: PseudoRET + CFI_INSTRUCTION offset $x1, -16 + $x10 = ORI $x10, 1023 + CFI_INSTRUCTION offset $x1, 0 + $x11 = ORI $x11, 1023 + CFI_INSTRUCTION offset $x1, -4 $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 + $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + PseudoRET +... + +# Case 3: CFIs present but candidate is not a tail-call; expect no outlining. +--- +name: funcE +tracksRegLiveness: true +body: | + bb.0: + liveins: $x10, $x11 + ; RV32I-MO-LABEL: name: funcE + ; RV32I-MO: liveins: $x10, $x11 + ; RV32I-MO-NEXT: {{ $}} + ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV32I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV32I-MO-NEXT: $x10 = ADDI $x10, 1 + ; RV32I-MO-NEXT: PseudoRET + ; + ; RV64I-MO-LABEL: name: funcE + ; RV64I-MO: liveins: $x10, $x11 + ; RV64I-MO-NEXT: {{ $}} + ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0 + ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4 + ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8 + ; RV64I-MO-NEXT: $x11 = AND $x12, $x11 + ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12 + ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11 + ; RV64I-MO-NEXT: $x10 = ADDI $x10, 1 + ; RV64I-MO-NEXT: PseudoRET + $x10 = ORI $x10, 1023 + CFI_INSTRUCTION offset $x1, 0 + $x11 = ORI $x11, 1023 CFI_INSTRUCTION offset $x1, -4 + $x12 = ADDI $x10, 17 + CFI_INSTRUCTION offset $x1, -8 $x11 = AND $x12, $x11 + CFI_INSTRUCTION offset $x1, -12 + $x10 = SUB $x10, $x11 + $x10 = ADDI $x10, 1 + PseudoRET +... 
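+# funcE and funcF append different ADDI immediates (1 and 2) after the
+# shared sequence, so the repeated region no longer ends at the return
+# and cannot be turned into a tail call; the checks expect both bodies
+# to remain unchanged, CFIs included.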
+---
+name: funcF
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x10, $x11
+    ; RV32I-MO-LABEL: name: funcF
+    ; RV32I-MO: liveins: $x10, $x11
+    ; RV32I-MO-NEXT: {{ $}}
+    ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0
+    ; RV32I-MO-NEXT: $x11 = ORI $x11, 1023
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4
+    ; RV32I-MO-NEXT: $x12 = ADDI $x10, 17
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8
+    ; RV32I-MO-NEXT: $x11 = AND $x12, $x11
+    ; RV32I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12
+    ; RV32I-MO-NEXT: $x10 = SUB $x10, $x11
+    ; RV32I-MO-NEXT: $x10 = ADDI $x10, 2
+    ; RV32I-MO-NEXT: PseudoRET
+    ;
+    ; RV64I-MO-LABEL: name: funcF
+    ; RV64I-MO: liveins: $x10, $x11
+    ; RV64I-MO-NEXT: {{ $}}
+    ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, 0
+    ; RV64I-MO-NEXT: $x11 = ORI $x11, 1023
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -4
+    ; RV64I-MO-NEXT: $x12 = ADDI $x10, 17
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -8
+    ; RV64I-MO-NEXT: $x11 = AND $x12, $x11
+    ; RV64I-MO-NEXT: CFI_INSTRUCTION offset $x1, -12
+    ; RV64I-MO-NEXT: $x10 = SUB $x10, $x11
+    ; RV64I-MO-NEXT: $x10 = ADDI $x10, 2
+    ; RV64I-MO-NEXT: PseudoRET
+    $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
+    $x11 = ORI $x11, 1023
+    CFI_INSTRUCTION offset $x1, -4
+    $x12 = ADDI $x10, 17
+    CFI_INSTRUCTION offset $x1, -8
+    $x11 = AND $x12, $x11
+    CFI_INSTRUCTION offset $x1, -12
     $x10 = SUB $x10, $x11
+    $x10 = ADDI $x10, 2
     PseudoRET
-
+...
 # OUTLINED-LABEL: name: OUTLINED_FUNCTION_0
 # OUTLINED: liveins: $x11, $x10
 # OUTLINED-NEXT: {{ $}}
 # OUTLINED-NEXT: $x10 = ORI $x10, 1023
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, 0
 # OUTLINED-NEXT: $x11 = ORI $x11, 1023
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -4
 # OUTLINED-NEXT: $x12 = ADDI $x10, 17
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -8
 # OUTLINED-NEXT: $x11 = AND $x12, $x11
+# OUTLINED-NEXT: CFI_INSTRUCTION offset $x1, -12
 # OUTLINED-NEXT: $x10 = SUB $x10, $x11
 # OUTLINED-NEXT: PseudoRET
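To make the cases above easier to read: funcA/funcB/funcC share an identical tail, CFI directives included, so it is extracted into OUTLINED_FUNCTION_0 and reached via PseudoTAIL, while funcD (a stray CFI outside the candidate) and funcE/funcF (bodies that diverge before the return, so the candidate cannot be a tail call) stay un-outlined. A rough C-level model of the profitable case only; the helper name is hypothetical and this says nothing about the outliner's real cost model:

/* outlined_helper stands in for OUTLINED_FUNCTION_0. */
static long outlined_helper(long a, long b) {
  a |= 1023;                /* $x10 = ORI $x10, 1023        */
  b |= 1023;                /* $x11 = ORI $x11, 1023        */
  b &= a + 17;              /* $x12 = ADDI; $x11 = AND      */
  return a - b;             /* $x10 = SUB $x10, $x11        */
}
long funcA(long a, long b) { return outlined_helper(a, b); } /* tail call */
long funcB(long a, long b) { return outlined_helper(a, b); } /* tail call */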
diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
index d250098576687..a2a7da7e2d6ef 100644
--- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -54,37 +54,37 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
 ; CHECK-PIPELINED: # %bb.0: # %entry
 ; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6
 ; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-PIPELINED-NEXT: lw a4, 0(a1)
+; CHECK-PIPELINED-NEXT: lw a7, 0(a1)
 ; CHECK-PIPELINED-NEXT: addi a2, a2, -1
+; CHECK-PIPELINED-NEXT: addi a3, a0, 4
+; CHECK-PIPELINED-NEXT: addi a5, a1, 4
 ; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1
-; CHECK-PIPELINED-NEXT: addi a2, a0, 4
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT: addi a6, a6, 4
-; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5
+; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_5
 ; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body
-; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: addi a3, a2, 4
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
-; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4
+; CHECK-PIPELINED-NEXT: lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT: addi a4, a3, 4
+; CHECK-PIPELINED-NEXT: addi a5, a5, 4
+; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_4
 ; CHECK-PIPELINED-NEXT: .LBB1_3: # %for.body
 ; CHECK-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: mv a4, a5
-; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a2, a3
-; CHECK-PIPELINED-NEXT: addi a3, a3, 4
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: addi a1, a1, 4
-; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3
+; CHECK-PIPELINED-NEXT: addi a2, a7, 1
+; CHECK-PIPELINED-NEXT: mv a7, a1
+; CHECK-PIPELINED-NEXT: lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT: sw a2, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a0, a3
+; CHECK-PIPELINED-NEXT: mv a3, a4
+; CHECK-PIPELINED-NEXT: addi a4, a4, 4
+; CHECK-PIPELINED-NEXT: addi a5, a5, 4
+; CHECK-PIPELINED-NEXT: bne a5, a6, .LBB1_3
 ; CHECK-PIPELINED-NEXT: .LBB1_4:
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a4, a5
+; CHECK-PIPELINED-NEXT: addi a7, a7, 1
+; CHECK-PIPELINED-NEXT: sw a7, 0(a0)
+; CHECK-PIPELINED-NEXT: mv a0, a3
+; CHECK-PIPELINED-NEXT: mv a7, a1
 ; CHECK-PIPELINED-NEXT: .LBB1_5:
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
+; CHECK-PIPELINED-NEXT: addi a7, a7, 1
+; CHECK-PIPELINED-NEXT: sw a7, 0(a0)
 ; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end
 ; CHECK-PIPELINED-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
new file mode 100644
index 0000000000000..4e73cee30ef08
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; RV32-LABEL: mask_pair:
+; RV32: # %bb.0:
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: sll a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair:
+; RV64: # %bb.0:
+; RV64-NEXT: srlw a0, a0, a1
+; RV64-NEXT: sllw a0, a0, a1
+; RV64-NEXT: ret
+  %shl = shl nsw i32 -1, %y
+  %and = and i32 %shl, %x
+  ret i32 %and
+}
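mask_pair checks that an `and` with a mask built as `-1 << y` can be emitted as a shift pair: clearing the low y bits with the mask and with a right/left shift round-trip are the same operation on unsigned values, which is presumably the equivalence the srl/sll output relies on. A minimal self-check in C (all names mine):

#include <assert.h>
#include <stdint.h>

static uint32_t mask_form(uint32_t x, uint32_t y)  { return x & (0xFFFFFFFFu << y); }
static uint32_t shift_form(uint32_t x, uint32_t y) { return (x >> y) << y; }

int main(void) {
  for (uint32_t y = 0; y < 32; y++)
    assert(mask_form(0xDEADBEEFu, y) == shift_form(0xDEADBEEFu, y));
  return 0;
}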
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; RV32-LABEL: mask_pair_64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: addi a4, a2, -32
+; RV32-NEXT: sll a3, a3, a2
+; RV32-NEXT: bltz a4, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a3
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: lui a5, 524288
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: srl a2, a5, a2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: srai a4, a4, 31
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: and a0, a3, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_64:
+; RV64: # %bb.0:
+; RV64-NEXT: srl a0, a0, a1
+; RV64-NEXT: sll a0, a0, a1
+; RV64-NEXT: ret
+  %shl = shl nsw i64 -1, %y
+  %and = and i64 %shl, %x
+  ret i64 %and
+}
+
+define i128 @mask_pair_128(i128 %x, i128 %y) {
+; RV32-LABEL: mask_pair_128:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a3, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: lw a2, 0(a2)
+; RV32-NEXT: li a6, -1
+; RV32-NEXT: sw zero, 0(sp)
+; RV32-NEXT: sw zero, 4(sp)
+; RV32-NEXT: sw zero, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: sw a6, 16(sp)
+; RV32-NEXT: sw a6, 20(sp)
+; RV32-NEXT: sw a6, 24(sp)
+; RV32-NEXT: sw a6, 28(sp)
+; RV32-NEXT: srli a6, a2, 3
+; RV32-NEXT: andi a6, a6, 12
+; RV32-NEXT: sub a6, a7, a6
+; RV32-NEXT: lw a7, 4(a6)
+; RV32-NEXT: lw t0, 8(a6)
+; RV32-NEXT: lw t1, 12(a6)
+; RV32-NEXT: lw a6, 0(a6)
+; RV32-NEXT: andi t2, a2, 31
+; RV32-NEXT: xori t2, t2, 31
+; RV32-NEXT: sll t1, t1, a2
+; RV32-NEXT: srli t3, t0, 1
+; RV32-NEXT: sll t0, t0, a2
+; RV32-NEXT: srli t4, a7, 1
+; RV32-NEXT: sll a7, a7, a2
+; RV32-NEXT: sll a2, a6, a2
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: srl t3, t3, t2
+; RV32-NEXT: srl t4, t4, t2
+; RV32-NEXT: srl a6, a6, t2
+; RV32-NEXT: and a2, a2, a5
+; RV32-NEXT: or a5, t1, t3
+; RV32-NEXT: or t0, t0, t4
+; RV32-NEXT: or a6, a7, a6
+; RV32-NEXT: and a4, a6, a4
+; RV32-NEXT: and a3, t0, a3
+; RV32-NEXT: and a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_128:
+; RV64: # %bb.0:
+; RV64-NEXT: li a5, -1
+; RV64-NEXT: addi a4, a2, -64
+; RV64-NEXT: sll a3, a5, a2
+; RV64-NEXT: bltz a4, .LBB2_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: j .LBB2_3
+; RV64-NEXT: .LBB2_2:
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: srl a2, a5, a2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: .LBB2_3:
+; RV64-NEXT: srai a4, a4, 63
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a0, a3, a0
+; RV64-NEXT: ret
+  %shl = shl nsw i128 -1, %y
+  %and = and i128 %shl, %x
+  ret i128 %and
+}
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index ba6769b2aa3e1..0306bb18c2aed 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
   ret i64 %Q
 }
 
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic.
 
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
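For reference, the idiom that comment is about: an unsigned add overflows exactly when the wrapped sum compares below one of the operands, and the add/compare pair can only be fused into llvm.uadd.with.overflow while the two stay together. A hedged C sketch of the idiom, not the test's exact IR:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Carry out of a + b iff the wrapping sum is below an operand. */
static bool uaddo(uint64_t a, uint64_t b, uint64_t *sum) {
  *sum = a + b;     /* wrapping add */
  return *sum < a;  /* the compare that must not be sunk away */
}

int main(void) {
  uint64_t s;
  assert(uaddo(UINT64_MAX, 1, &s) && s == 0);
  assert(!uaddo(2, 3, &s) && s == 5);
  return 0;
}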
@@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT: .cfi_offset s4, -24
 ; RV32-NEXT: .cfi_offset s5, -28
 ; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: mv s5, a5
-; RV32-NEXT: mv s3, a1
+; RV32-NEXT: mv s1, a5
+; RV32-NEXT: mv s4, a1
 ; RV32-NEXT: andi a1, a5, 1
-; RV32-NEXT: beqz a1, .LBB32_8
+; RV32-NEXT: beqz a1, .LBB32_6
 ; RV32-NEXT: # %bb.1: # %t
 ; RV32-NEXT: mv s0, a4
-; RV32-NEXT: mv s2, a3
-; RV32-NEXT: mv s1, a2
-; RV32-NEXT: mv s4, a0
-; RV32-NEXT: beq s3, a3, .LBB32_3
+; RV32-NEXT: mv s3, a3
+; RV32-NEXT: mv s2, a2
+; RV32-NEXT: mv s5, a0
+; RV32-NEXT: beq s4, a3, .LBB32_3
 ; RV32-NEXT: # %bb.2: # %t
-; RV32-NEXT: sltu s6, s3, s2
+; RV32-NEXT: sltu s6, s4, s3
 ; RV32-NEXT: j .LBB32_4
 ; RV32-NEXT: .LBB32_3:
-; RV32-NEXT: sltu s6, s4, s1
+; RV32-NEXT: sltu s6, s5, s2
 ; RV32-NEXT: .LBB32_4: # %t
 ; RV32-NEXT: mv a0, s6
 ; RV32-NEXT: call call
-; RV32-NEXT: beqz s6, .LBB32_8
+; RV32-NEXT: beqz s6, .LBB32_6
 ; RV32-NEXT: # %bb.5: # %end
-; RV32-NEXT: sltu a1, s4, s1
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: beq s3, s2, .LBB32_7
-; RV32-NEXT: # %bb.6: # %end
-; RV32-NEXT: sltu a0, s3, s2
-; RV32-NEXT: .LBB32_7: # %end
-; RV32-NEXT: sub a2, s3, s2
-; RV32-NEXT: sub a3, s4, s1
-; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: sw a3, 0(s0)
-; RV32-NEXT: sw a2, 4(s0)
-; RV32-NEXT: j .LBB32_9
-; RV32-NEXT: .LBB32_8: # %f
-; RV32-NEXT: mv a0, s5
-; RV32-NEXT: .LBB32_9: # %f
+; RV32-NEXT: sltu a0, s5, s2
+; RV32-NEXT: sub a1, s4, s3
+; RV32-NEXT: sub a2, s5, s2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sw a2, 0(s0)
+; RV32-NEXT: sw a1, 4(s0)
+; RV32-NEXT: mv a0, s6
+; RV32-NEXT: j .LBB32_7
+; RV32-NEXT: .LBB32_6: # %f
+; RV32-NEXT: mv a0, s1
+; RV32-NEXT: .LBB32_7: # %f
 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/remat.ll b/llvm/test/CodeGen/RISCV/remat.ll
index 92ae85f560cd4..8490dd0877d30 100644
--- a/llvm/test/CodeGen/RISCV/remat.ll
+++ b/llvm/test/CodeGen/RISCV/remat.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O1 -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -O1 -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s
 
 @a = common global i32 0, align 4
 @l = common global i32 0, align 4
@@ -21,113 +20,113 @@
 ; situation.
 define i32 @test() nounwind {
-; RV32I-LABEL: test:
-; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lui s0, %hi(a)
-; RV32I-NEXT: lw a0, %lo(a)(s0)
-; RV32I-NEXT: beqz a0, .LBB0_11
-; RV32I-NEXT: # %bb.1: # %for.body.preheader
-; RV32I-NEXT: lui s1, %hi(l)
-; RV32I-NEXT: lui s2, %hi(k)
-; RV32I-NEXT: lui s3, %hi(j)
-; RV32I-NEXT: lui s4, %hi(i)
-; RV32I-NEXT: lui s5, %hi(d)
-; RV32I-NEXT: lui s6, %hi(e)
-; RV32I-NEXT: lui s7, %hi(f)
-; RV32I-NEXT: lui s8, %hi(g)
-; RV32I-NEXT: lui s9, %hi(h)
-; RV32I-NEXT: lui s10, %hi(c)
-; RV32I-NEXT: lui s11, %hi(b)
-; RV32I-NEXT: j .LBB0_3
-; RV32I-NEXT: .LBB0_2: # %for.inc
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(a)(s0)
-; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: sw a0, %lo(a)(s0)
-; RV32I-NEXT: beqz a0, .LBB0_11
-; RV32I-NEXT: .LBB0_3: # %for.body
-; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT: lw a1, %lo(l)(s1)
-; RV32I-NEXT: beqz a1, .LBB0_5
-; RV32I-NEXT: # %bb.4: # %if.then
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a1, %lo(b)(s11)
-; RV32I-NEXT: lw a2, %lo(c)(s10)
-; RV32I-NEXT: lw a3, %lo(d)(s5)
-; RV32I-NEXT: lw a4, %lo(e)(s6)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_5: # %if.end
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(k)(s2)
-; RV32I-NEXT: beqz a0, .LBB0_7
-; RV32I-NEXT: # %bb.6: # %if.then3
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(b)(s11)
-; RV32I-NEXT: lw a1, %lo(c)(s10)
-; RV32I-NEXT: lw a2, %lo(d)(s5)
-; RV32I-NEXT: lw a3, %lo(e)(s6)
-; RV32I-NEXT: lw a4, %lo(f)(s7)
-; RV32I-NEXT: li a5, 64
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_7: # %if.end5
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(j)(s3)
-; RV32I-NEXT: beqz a0, .LBB0_9
-; RV32I-NEXT: # %bb.8: # %if.then7
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(c)(s10)
-; RV32I-NEXT: lw a1, %lo(d)(s5)
-; RV32I-NEXT: lw a2, %lo(e)(s6)
-; RV32I-NEXT: lw a3, %lo(f)(s7)
-; RV32I-NEXT: lw a4, %lo(g)(s8)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: .LBB0_9: # %if.end9
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(i)(s4)
-; RV32I-NEXT: beqz a0, .LBB0_2
-; RV32I-NEXT: # %bb.10: # %if.then11
-; RV32I-NEXT: # in Loop: Header=BB0_3 Depth=1
-; RV32I-NEXT: lw a0, %lo(d)(s5)
-; RV32I-NEXT: lw a1, %lo(e)(s6)
-; RV32I-NEXT: lw a2, %lo(f)(s7)
-; RV32I-NEXT: lw a3, %lo(g)(s8)
-; RV32I-NEXT: lw a4, %lo(h)(s9)
-; RV32I-NEXT: li a5, 32
-; RV32I-NEXT: call foo
-; RV32I-NEXT: j .LBB0_2
-; RV32I-NEXT: .LBB0_11: # %for.end
-; RV32I-NEXT: li a0, 1
-; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
-; RV32I-NEXT: ret
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -112
+; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s5, 56(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s6, 48(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s7, 40(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s8, 32(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s9, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s10, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s11, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT: lui s0, %hi(a)
+; CHECK-NEXT: lw a0, %lo(a)(s0)
+; CHECK-NEXT: beqz a0, .LBB0_11
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: lui s1, %hi(l)
+; CHECK-NEXT: lui s2, %hi(k)
+; CHECK-NEXT: lui s3, %hi(j)
+; CHECK-NEXT: lui s4, %hi(i)
+; CHECK-NEXT: lui s5, %hi(d)
+; CHECK-NEXT: lui s6, %hi(e)
+; CHECK-NEXT: lui s7, %hi(f)
+; CHECK-NEXT: lui s8, %hi(g)
+; CHECK-NEXT: lui s9, %hi(h)
+; CHECK-NEXT: lui s10, %hi(c)
+; CHECK-NEXT: lui s11, %hi(b)
+; CHECK-NEXT: j .LBB0_3
+; CHECK-NEXT: .LBB0_2: # %for.inc
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(a)(s0)
+; CHECK-NEXT: addiw a0, a0, -1
+; CHECK-NEXT: sw a0, %lo(a)(s0)
+; CHECK-NEXT: beqz a0, .LBB0_11
+; CHECK-NEXT: .LBB0_3: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lw a1, %lo(l)(s1)
+; CHECK-NEXT: beqz a1, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %if.then
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(e)(s6)
+; CHECK-NEXT: lw a3, %lo(d)(s5)
+; CHECK-NEXT: lw a2, %lo(c)(s10)
+; CHECK-NEXT: lw a1, %lo(b)(s11)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_5: # %if.end
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(k)(s2)
+; CHECK-NEXT: beqz a0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %if.then3
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(f)(s7)
+; CHECK-NEXT: lw a3, %lo(e)(s6)
+; CHECK-NEXT: lw a2, %lo(d)(s5)
+; CHECK-NEXT: lw a1, %lo(c)(s10)
+; CHECK-NEXT: lw a0, %lo(b)(s11)
+; CHECK-NEXT: li a5, 64
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_7: # %if.end5
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(j)(s3)
+; CHECK-NEXT: beqz a0, .LBB0_9
+; CHECK-NEXT: # %bb.8: # %if.then7
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(g)(s8)
+; CHECK-NEXT: lw a3, %lo(f)(s7)
+; CHECK-NEXT: lw a2, %lo(e)(s6)
+; CHECK-NEXT: lw a1, %lo(d)(s5)
+; CHECK-NEXT: lw a0, %lo(c)(s10)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: .LBB0_9: # %if.end9
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a0, %lo(i)(s4)
+; CHECK-NEXT: beqz a0, .LBB0_2
+; CHECK-NEXT: # %bb.10: # %if.then11
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: lw a4, %lo(h)(s9)
+; CHECK-NEXT: lw a3, %lo(g)(s8)
+; CHECK-NEXT: lw a2, %lo(f)(s7)
+; CHECK-NEXT: lw a1, %lo(e)(s6)
+; CHECK-NEXT: lw a0, %lo(d)(s5)
+; CHECK-NEXT: li a5, 32
+; CHECK-NEXT: call foo
+; CHECK-NEXT: j .LBB0_2
+; CHECK-NEXT: .LBB0_11: # %for.end
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s5, 56(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s6, 48(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s7, 40(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s8, 32(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s9, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s10, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s11, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT: addi sp, sp, 112
+; CHECK-NEXT: ret
 entry:
   %.pr = load i32, ptr @a, align 4
   %tobool14 = icmp eq i32 %.pr, 0
diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
index c489bc3681876..aa63552eb4b63 100644
--- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
@@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
 ;.
diff --git a/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
new file mode 100644
index 0000000000000..2bde6013b3640
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -S -riscv-promote-const -mtriple=riscv64 -mattr=+d | FileCheck %s
+
+; No promotion should take place, as the pass skips floats.
+define float @multiple_floats(float %a, float %b) {
+; CHECK-LABEL: define float @multiple_floats(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[B]], 2.000000e+00
+; CHECK-NEXT: [[SUM_F:%.*]] = fadd float [[ADD1]], [[ADD2]]
+; CHECK-NEXT: ret float [[SUM_F]]
+;
+entry:
+  %add1 = fadd float %a, 1.0
+  %add2 = fadd float %b, 2.0
+  %sum_f = fadd float %add1, %add2
+  ret float %sum_f
+}
+
+; No promotion should take place, as cases with a single constant are skipped.
+define double @single_double(double %a) {
+; CHECK-LABEL: define double @single_double(
+; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 4.210000e+01
+; CHECK-NEXT: ret double [[ADD]]
+;
+entry:
+  %add = fadd double %a, 42.1
+  ret double %add
+}
+
+; Promotion should happen as we have at least two unique constants that would
+; otherwise go in the constant pool.
+define double @multiple_doubles(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles, i64 0, i64 1), align 8
+; CHECK-NEXT: [[ADD3:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles, align 8
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[A]], [[ADD3]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[B]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: [[SUM1:%.*]] = fadd double [[ADD4]], [[SUM]]
+; CHECK-NEXT: ret double [[SUM1]]
+;
+entry:
+  %add1 = fadd double %a, 2.718
+  %add2 = fadd double %b, 42.1
+  %add3 = fadd double %add1, 2.718
+  %sum = fadd double %add2, %add3
+  ret double %sum
+}
+
+; Promotion should not happen, as the constants will be materialised rather
+; than loaded from the constant pool.
+define double @multiple_doubles_no_promote(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles_no_promote(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[B]], 2.000000e+00
+; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ADD1]], 1.000000e+00
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: ret double [[SUM]]
+;
+entry:
+  %add1 = fadd double %a, 1.0
+  %add2 = fadd double %b, 2.0
+  %add3 = fadd double %add1, 1.0
+  %sum = fadd double %add2, %add3
+  ret double %sum
+}
+
+; The same constant shouldn't be loaded more than once per BB.
+define double @multiple_doubles_multi_bb(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_multi_bb(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_TRUE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_TRUE]]:
+; CHECK-NEXT: [[DOUBLE_VAL2:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_T:%.*]] = fadd double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: [[MUL_T:%.*]] = fmul double [[ADD_T]], [[DOUBLE_VAL2]]
+; CHECK-NEXT: [[SUB_T:%.*]] = fsub double [[MUL_T]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: [[DOUBLE_VAL3:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_F:%.*]] = fadd double [[A]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[MUL_F:%.*]] = fmul double [[ADD_F]], [[DOUBLE_VAL3]]
+; CHECK-NEXT: [[SUB_F:%.*]] = fsub double [[MUL_F]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_RES:%.*]] = phi double [ [[SUB_T]], %[[IF_TRUE]] ], [ [[SUB_F]], %[[IF_FALSE]] ]
+; CHECK-NEXT: ret double [[PHI_RES]]
+;
+entry:
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %add.t = fadd double %a, 1.23
+  %mul.t = fmul double %add.t, 4.56
+  %sub.t = fsub double %mul.t, 1.23
+  br label %if.end
+
+if.false:
+  %add.f = fadd double %a, 1.23
+  %mul.f = fmul double %add.f, 4.56
+  %sub.f = fsub double %mul.f, 1.23
+  br label %if.end
+
+if.end:
+  %phi.res = phi double [ %sub.t, %if.true ], [ %sub.f, %if.false ]
+  ret double %phi.res
+}
+
+; Check the insertion point in the case where we have a phi taking a constant C
+; and the source block also uses that same constant.
+define double @multiple_doubles_phi(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_phi(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_phi, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_VAL:%.*]] = phi double [ [[DOUBLE_VAL]], %[[IF_THEN]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_phi, i64 0, i64 1), align 8
+; CHECK-NEXT: [[RES:%.*]] = fadd double [[PHI_VAL]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: ret double [[RES]]
+;
+entry:
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  %mul = fmul double %a, 1.23
+  br label %if.end
+
+if.end:
+  %phi.val = phi double [ 1.23, %if.then ], [ %a, %entry ]
+  %res = fadd double %phi.val, 4.56
+  ret double %res
+}
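As a plain-C picture of what the @.promoted_doubles checks above assert (the array name and indexing are illustrative, not the pass's actual output): the two unique doubles become one per-function table, so a single base address serves both loads instead of two separate constant-pool accesses.

static const double promoted_doubles[2] = {2.718, 42.1};

/* Mirrors @multiple_doubles after promotion. */
double multiple_doubles_c(double a, double b) {
  double add1 = a + promoted_doubles[0];
  double add2 = b + promoted_doubles[1];
  return add2 + (add1 + promoted_doubles[0]);
}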
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
new file mode 100644
index 0000000000000..bf0a2e5e35a01
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 0
+; Num Functions
+; CHECK-NEXT: .word 1
+; Num LargeConstants
+; CHECK-NEXT: .word 0
+; Num Callsites
+; CHECK-NEXT: .word 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .quad 1
+
+; Spilled stack map values.
+;
+; Verify 25 stack map entries.
+;
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 25
+;
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) {
+entry:
+  call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29)
+  ret void
+}
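The .byte/.half/.word groups these stackmap tests match are location records in LLVM's StackMaps format (v3). If it helps to decode them, this is the record layout as I understand it from the StackMaps documentation, expressed as a C struct (field names are mine):

#include <stdint.h>

struct StackMapLocation {
  uint8_t  kind;       /* 1=Register, 2=Direct, 3=Indirect, 4=Constant, 5=ConstantIndex */
  uint8_t  reserved;   /* always 0 */
  uint16_t size;       /* location size in bytes (the ".half 8" above) */
  uint16_t dwarf_reg;  /* DWARF register number; 2 is x2/sp on RISC-V */
  uint16_t reserved2;  /* always 0 */
  int32_t  offset;     /* frame offset, or the value itself for kind 4 */
};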
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index c3183a1a3e036..320a3aa94cd7d 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 0
 ; Num Functions
-; CHECK-NEXT: .word 12
+; CHECK-NEXT: .word 13
 ; Num LargeConstants
-; CHECK-NEXT: .word 2
+; CHECK-NEXT: .word 3
 ; Num Callsites
-; CHECK-NEXT: .word 16
+; CHECK-NEXT: .word 17
 
 ; Functions and stack size
 ; CHECK-NEXT: .quad constantargs
@@ -38,8 +38,8 @@
 ; CHECK-NEXT: .quad liveConstant
 ; CHECK-NEXT: .quad 0
 ; CHECK-NEXT: .quad 1
-; CHECK-NEXT: .quad spilledValue
-; CHECK-NEXT: .quad 144
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
 ; CHECK-NEXT: .quad 1
 ; CHECK-NEXT: .quad directFrameIdx
 ; CHECK-NEXT: .quad 48
@@ -50,10 +50,14 @@
 ; CHECK-NEXT: .quad needsStackRealignment
 ; CHECK-NEXT: .quad -1
 ; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad floats
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad 1
 
 ; Num LargeConstants
 ; CHECK-NEXT: .quad 4294967295
 ; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .quad 4609434218613702656
 
 ; Constant arguments
 ;
@@ -278,19 +282,19 @@ define void @liveConstant() {
 ;
 ; Verify 28 stack map entries.
 ;
-; CHECK-LABEL: .word .L{{.*}}-spilledValue
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .half 28
 ;
-; Check that at least one is a spilled entry from RBP.
-; Location: Indirect RBP + ...
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
 ; CHECK: .byte 3
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
 ; CHECK-NEXT: .half 2
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .word
-define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
+define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
 entry:
   call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27)
   ret void
@@ -303,7 +307,7 @@ entry:
 ; CHECK-NEXT: .half 0
 ; 1 location
 ; CHECK-NEXT: .half 1
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
@@ -316,14 +320,14 @@ entry:
 ; CHECK-NEXT: .half 0
 ; 2 locations
 ; CHECK-NEXT: .half 2
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
 ; CHECK-NEXT: .half 2
 ; CHECK-NEXT: .half 0
 ; CHECK-NEXT: .word
-; Loc 1: Direct RBP - ofs
+; Loc 1: Direct SP + ofs
 ; CHECK-NEXT: .byte 2
 ; CHECK-NEXT: .byte 0
 ; CHECK-NEXT: .half 8
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
 }
 declare void @escape_values(...)
 
+; CHECK-LABEL: .word .L{{.*}}-floats
+; CHECK-NEXT: .half 0
+; Num Locations
+; CHECK-NEXT: .half 12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 4: float value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 10
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 5: double value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 11
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 6: half value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 12
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 13
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 8: float on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 9: double on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 10: half on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+  %ff = alloca float
+  %gg = alloca double
+  %hh = alloca half
+  %ii = alloca bfloat
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+    double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+  ret void
+}
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22bf5fd69..f4964288e3541 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA: # %bb.0:
+; RV64XTHEADBA-NEXT: th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT: ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
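The th.addsl and sh1add/sh2add/sh3add forms in these tests all compute rd = (rs1 << N) + rs2, which is why a multiply by a constant like 22 decomposes into a three-instruction shift-add chain (5*a, then 11*a, then 22*a + b). A small C model of the addmul22 sequence, with an assert as a sanity check (helper names are mine):

#include <assert.h>
#include <stdint.h>

static uint64_t sh1add(uint64_t rs1, uint64_t rs2) { return (rs1 << 1) + rs2; }
static uint64_t sh2add(uint64_t rs1, uint64_t rs2) { return (rs1 << 2) + rs2; }

static uint64_t addmul22(uint64_t a, uint64_t b) {
  uint64_t t = sh2add(a, a);  /* 5*a       */
  t = sh1add(t, a);           /* 11*a      */
  return sh1add(t, b);        /* 22*a + b  */
}

int main(void) {
  assert(addmul22(7, 3) == 22 * 7 + 3);
  return 0;
}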
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd76262d547a..156599fb72877 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a2, a0, 1
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 14
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul18(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul18:
 ; RV64I: # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 26
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 26
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul36(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul36:
 ; RV64I: # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 38
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 38
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 42
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 42
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul72(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul72:
 ; RV64I: # %bb.0:
@@ -747,6 +866,136 @@ define i64 @addmul72(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 74
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 74
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 82
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 82
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 146
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 146
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul49(i64 %a) {
+; RV64I-LABEL: mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 49
+  ret i64 %c
+}
+
+define i64 @zext_mul49(i32 signext %a) {
+; RV64I-LABEL: zext_mul49:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 49
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul49:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a1, a0, 4
+; RV64ZBA-NEXT: sh1add a1, a1, a1
+; RV64ZBA-NEXT: add.uw a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul49:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 32
+; RV64XANDESPERF-NEXT: srli a1, a1, 28
+; RV64XANDESPERF-NEXT: nds.lea.h a1, a1, a1
+; RV64XANDESPERF-NEXT: nds.lea.b.ze a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 49
+  ret i64 %c
+}
+
 define i64 @mul50(i64 %a) {
 ; RV64I-LABEL: mul50:
 ; RV64I: # %bb.0:
@@ -847,6 +1096,54 @@ define i64 @addmul100(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @mul145(i64 %a) {
+; RV64I-LABEL: mul145:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 145
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul145:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 4
+; RV64ZBA-NEXT: sh3add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul145:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 4
+; RV64XANDESPERF-NEXT: nds.lea.d a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 145
+  ret i64 %c
+}
+
+define i64 @mul161(i64 %a) {
+; RV64I-LABEL: mul161:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 161
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul161:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli a1, a0, 5
+; RV64ZBA-NEXT: sh2add a1, a1, a1
+; RV64ZBA-NEXT: add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: mul161:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a1, a0, 5
+; RV64XANDESPERF-NEXT: nds.lea.w a1, a1, a1
+; RV64XANDESPERF-NEXT: add a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+  %c = mul i64 %a, 161
+  ret i64 %c
+}
+
 define i64 @mul162(i64 %a) {
 ; RV64I-LABEL: mul162:
 ; RV64I: # %bb.0:
@@ -1262,6 +1559,34 @@ define i64 @mul288(i64 %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul44(i32 signext %a) {
+; RV64I-LABEL: zext_mul44:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 11
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul44:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a0, a0, 2
+; RV64ZBA-NEXT: sh2add a1, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a1, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul44:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a0, a0, 32
+; RV64XANDESPERF-NEXT: srli a0, a0, 30
+; RV64XANDESPERF-NEXT: nds.lea.w a1, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 44
+  ret i64 %c
+}
+
 define i64 @zext_mul68(i32 signext %a) {
 ; RV64I-LABEL: zext_mul68:
 ; RV64I: # %bb.0:
@@ -1314,6 +1639,34 @@ define i64 @zext_mul96(i32 signext %a) {
   ret i64 %c
 }
 
+define i64 @zext_mul100(i32 signext %a) {
+; RV64I-LABEL: zext_mul100:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 25
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: mulhu a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: zext_mul100:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: slli.uw a0, a0, 2
+; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: zext_mul100:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: slli a0, a0, 32
+; RV64XANDESPERF-NEXT: srli a0, a0, 30
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a0
+; RV64XANDESPERF-NEXT: ret
+  %b = zext i32 %a to i64
+  %c = mul i64 %b, 100
+  ret i64 %c
+}
+
 define i64 @zext_mul160(i32 signext %a) {
 ; RV64I-LABEL: zext_mul160:
 ; RV64I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
new file mode 100644
index 0000000000000..46d5e9f9a538f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+
+; Test basic add/sub operations for v2i16
+define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = add <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = sub <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test basic add/sub operations for v4i8
+define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = add <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = sub <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v2i16
+define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psadd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psaddu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v2i16
+define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssubu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v4i8
+define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psadd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psaddu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v4i8
+define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pssubu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v2i16
+define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %ext.a = sext <2 x i16> %a to <2 x i32>
+  %ext.b = sext <2 x i16> %b to <2 x i32>
+  %add = add nsw <2 x i32> %ext.a, %ext.b
+  %shift = ashr <2 x i32> %add, <i32 1, i32 1>
+  %res = trunc <2 x i32> %shift to <2 x i16>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v2i16
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %and = and <2 x i16> %a, %b
+  %xor = xor <2 x i16> %a, %b
+  %shift = lshr <2 x i16> %xor, <i16 1, i16 1>
+  %res = add <2 x i16> %and, %shift
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v4i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %ext.a = sext <4 x i8> %a to <4 x i16>
+  %ext.b = sext <4 x i8> %b to <4 x i16>
+  %add = add nsw <4 x i16> %ext.a, %ext.b
+  %shift = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <4 x i16> %shift to <4 x i8>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %and = and <4 x i8> %a, %b
+  %xor = xor <4 x i8> %a, %b
+  %shift = lshr <4 x i8> %xor, <i8 1, i8 1, i8 1, i8 1>
+  %res = add <4 x i8> %and, %shift
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
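The paaddu patterns above lean on the classic halving-add identity floor((a + b) / 2) == (a & b) + ((a ^ b) >> 1), which sidesteps widening because the intermediate value never overflows. A quick C check of the identity on a few edge cases:

#include <assert.h>
#include <stdint.h>

static uint16_t avg_floor(uint16_t a, uint16_t b) {
  return (uint16_t)((a & b) + ((a ^ b) >> 1));
}

int main(void) {
  assert(avg_floor(0xFFFF, 0xFFFF) == 0xFFFF); /* (a+b) would wrap in 16 bits */
  assert(avg_floor(0xFFFF, 0) == 0x7FFF);
  assert(avg_floor(1, 2) == 1);
  return 0;
}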
+
+; Test absolute difference signed for v2i16
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v2i16
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v2i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = sext <2 x i16> %a to <2 x i32>
+  %b_ext = sext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = ashr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v2i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = zext <2 x i16> %a to <2 x i32>
+  %b_ext = zext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = lshr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = sext <4 x i8> %a to <4 x i16>
+  %b_ext = sext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = ashr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = zext <4 x i8> %a to <4 x i16>
+  %b_ext = zext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = lshr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI (pack load immediate) for v2i16
+define void @test_pli_h(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.h a1, 42
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <2 x i16> <i16 42, i16 42>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_h_negative(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h_negative:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.h a1, -5
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <2 x i16> <i16 -5, i16 -5>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI for v4i8 with unsigned immediate
+define void @test_pli_b(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pli.b a1, 32
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %res = add <4 x i8> <i8 32, i8 32, i8 32, i8 32>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_b_negative(ptr %ret_ptr) {
+; CHECK-RV32-LABEL: test_pli_b_negative:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: pli.b a1, -2
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_pli_b_negative:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: pli.h a1, -258
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+  %res = add <4 x i8> <i8 -2, i8 -2, i8 -2, i8 -2>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: sh a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %extracted = extractelement <2 x i16> %a, i32 0
+  store i16 %extracted, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: sb a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %extracted = extractelement <4 x i8> %a, i32 0
+  store i8 %extracted, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.smin.v4i8(<4 x
i8>, <4 x i8>) +declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll new file mode 100644 index 0000000000000..000a95fb6e0f8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v4i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = add <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = sub <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v8i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = add <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = sub <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr 
%a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v8i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v8i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i16 +; avgfloors pattern: (a + b) arithmetic shift right 1 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %ext.a = sext <4 x i16> %a to <4 x i32> + %ext.b = sext <4 x i16> %b to <4 x i32> + %add = add nsw <4 x i32> %ext.a, %ext.b + %shift = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1> + %res = trunc <4 x i32> %shift to <4 x i16> + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i16 +; avgflooru pattern: (a & b) + ((a ^ b) >> 1) +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 
x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %and = and <4 x i16> %a, %b + %xor = xor <4 x i16> %a, %b + %shift = lshr <4 x i16> %xor, <i16 1, i16 1, i16 1, i16 1> + %res = add <4 x i16> %and, %shift + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v8i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %ext.a = sext <8 x i8> %a to <8 x i16> + %ext.b = sext <8 x i8> %b to <8 x i16> + %add = add nsw <8 x i16> %ext.a, %ext.b + %shift = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res = trunc <8 x i16> %shift to <8 x i8> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v8i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %and = and <8 x i8> %a, %b + %xor = xor <8 x i8> %a, %b + %shift = lshr <8 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + %res = add <8 x i8> %and, %shift + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i16 +; abds pattern: sub(smax(a,b), smin(a,b)) +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i16 +; abdu pattern: sub(umax(a,b), umin(a,b)) +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v8i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v8i8 +define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = sext <4 x i16> %a to <4 x i32> + %b_ext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = zext <4 x i16> %a to <4 x i32> + %b_ext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v8i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = sext <8 x i8> %a to <8 x i16> + %b_ext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v8i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = zext <8 x i8> %a to <8 x i16> + %b_ext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v4i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 100 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i16> <i16 100, i16 100, i16 100, i16 100>, <i16 0, i16 0, i16 0, i16 0> + store <4 x i16> %res, ptr %ret_ptr 
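+; A sketch of why the test is written this way: the add of a zero splat is
+; just a convenient way to produce the constant splat as an instruction
+; result; it folds away, and isel then has to materialize the splat.
+; Assuming pli.h keeps the draft P-extension semantics (a sign-extended
+; immediate replicated into every 16-bit lane), that materialization is the
+; single instruction checked above, e.g. on RV64:
+;   pli.h a1, 100  ; 100 = 0x0064, so a1 = 0x0064006400640064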
+ ret void +} + +; Test PLI for v8i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 64 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <8 x i8> <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v2i32 with signed immediate +define void @test_pli_w(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.w a1, -256 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i32> <i32 -256, i32 -256>, <i32 0, i32 0> + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_16: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %extracted = extractelement <4 x i16> %a, i32 0 + store i16 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %extracted = extractelement <8 x i8> %a, i32 0 + store i8 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_32(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %extracted = extractelement <2 x i32> %a, i32 0 + store i32 %extracted, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index a2fcd7962b8b0..5567310bb2a61 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,25 +8,15 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; RV32-LABEL: load_factor2_v3: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v8, v10, 0 -; RV32-NEXT: vnsrl.wx v9, v10, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: load_factor2_v3: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; 
RV64-NEXT: vle32.v v10, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v10, a0 -; RV64-NEXT: vnsrl.wi v8, v10, 0 -; RV64-NEXT: ret +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 4c35b2506d3e4..7e6f2c76e5881 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ret <4 x i32> %x } +define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) { +; RV32-LABEL: mgather_baseidx_v7i8: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 127 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vmand.mm v0, v0, v10 +; RV32-NEXT: vsext.vf4 v10, v8 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64V-LABEL: mgather_baseidx_v7i8: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 127 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vmv.s.x v10, a1 +; RV64V-NEXT: vmand.mm v0, v0, v10 +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v7i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: .cfi_remember_state +; RV64ZVE32F-NEXT: li a1, 64 +; RV64ZVE32F-NEXT: addi a2, sp, 8 +; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64ZVE32F-NEXT: vsm.v v0, (a2) +; RV64ZVE32F-NEXT: ld a1, 8(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_2: # %else +; 
RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.v.x v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_4: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 3 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 4 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v11, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: 
vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_13 +; RV64ZVE32F-NEXT: # %bb.9: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_14 +; RV64ZVE32F-NEXT: .LBB132_10: # %else14 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: beqz a1, .LBB132_12 +; RV64ZVE32F-NEXT: .LBB132_11: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: add a0, a0, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: .LBB132_12: # %else17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB132_13: # %cond.load10 +; RV64ZVE32F-NEXT: .cfi_restore_state +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_10 +; RV64ZVE32F-NEXT: .LBB132_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: 
vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: bnez a1, .LBB132_11 +; RV64ZVE32F-NEXT: j .LBB132_12 + %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs + %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru) + ret <7 x i8> %v +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32V-ZVFH: {{.*}} ; RV32V-ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 9c6d77dde1b5c..c3fe6b335d3da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -44,9 +44,8 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_with_tail: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res @@ -99,9 +98,8 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_identity: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3> ret <4 x i64> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll new file mode 100644 index 0000000000000..9cfed6a659c64 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-sdnode.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfa \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfa \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <2 x bfloat> @copysign_v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) { +; CHECK-LABEL: copysign_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %vm, <2 x bfloat> %vs) + ret <2 x bfloat> %r +} + +define <4 x bfloat> @copysign_v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) { +; CHECK-LABEL: copysign_v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <4 x bfloat> 
@llvm.copysign.v4bf16(<4 x bfloat> %vm, <4 x bfloat> %vs) + ret <4 x bfloat> %r +} + +define <8 x bfloat> @copysign_v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) { +; CHECK-LABEL: copysign_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %vm, <8 x bfloat> %vs) + ret <8 x bfloat> %r +} + +define <16 x bfloat> @copysign_v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) { +; CHECK-LABEL: copysign_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v10 +; CHECK-NEXT: ret + %r = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> %vm, <16 x bfloat> %vs) + ret <16 x bfloat> %r +} + +define <32 x bfloat> @copysign_v32bf16(<32 x bfloat> %vm, <32 x bfloat> %vs) { +; CHECK-LABEL: copysign_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v12 +; CHECK-NEXT: ret + %r = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> %vm, <32 x bfloat> %vs) + ret <32 x bfloat> %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index a2178e1c571da..2455d872ae7f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -1,8 +1,172 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFH %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFH %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFBFA %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,ZVFBFA %s + +define <2 x bfloat> @vfsgnj_vv_v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.copysign.v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfsgnj_vv_v2bf16_unmasked(<2 x bfloat> %va, <2 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +;
ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.copysign.v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfsgnj_vv_v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.copysign.v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfsgnj_vv_v4bf16_unmasked(<4 x bfloat> %va, <4 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.copysign.v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfsgnj_vv_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.copysign.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfsgnj_vv_v8bf16_unmasked(<8 x bfloat> %va, <8 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.copysign.v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfsgnj_vv_v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v10, 
v0.t +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.copysign.v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfsgnj_vv_v16bf16_unmasked(<16 x bfloat> %va, <16 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v10 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.copysign.v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.copysign.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) @@ -311,10 +475,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: bltu a2, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 @@ -346,10 +510,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: vle64.v v0, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB27_2 +; CHECK-NEXT: bltu a2, a1, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfsgnj.vv v8, v8, v0 ; CHECK-NEXT: addi a0, a2, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll new file mode 100644 index 0000000000000..27c00de3c3487 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-sdnode.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <1 x bfloat> @v1bf16(<1 x bfloat> %v) { +; CHECK-LABEL: v1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <1 x bfloat> @llvm.fabs.v1bf16(<1 x bfloat> %v) + ret <1 x bfloat> %r +} + +define <2 x bfloat> @v2bf16(<2 x bfloat> %v) { +; CHECK-LABEL: v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %v) + ret <2 x bfloat> %r +} + +define <4 x bfloat> @v4bf16(<4 x bfloat> %v) { +; CHECK-LABEL: v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %v) 
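+; For reference, bf16 fabs is a pure sign-bit clear, so both lowerings agree
+; bit-for-bit; e.g. 0xc1a0 (-20.0 as bf16) -> 0x41a0 (20.0). Without zvfbfa
+; the same result comes from masking with 0x7fff (see the vand.vx sequences
+; in the -vp test below); the experimental e16alt element type lets vfabs.v
+; do it in one instruction on bf16 lanes directly.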
+ ret <4 x bfloat> %r +} + +define <8 x bfloat> @v8bf16(<8 x bfloat> %v) { +; CHECK-LABEL: v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %v) + ret <8 x bfloat> %r +} + +define <16 x bfloat> @v16bf16(<16 x bfloat> %v) { +; CHECK-LABEL: v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> %v) + ret <16 x bfloat> %r +} + +define <32 x bfloat> @v32bf16(<32 x bfloat> %v) { +; CHECK-LABEL: v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfabs.v v8, v8 +; CHECK-NEXT: ret + %r = call <32 x bfloat> @llvm.fabs.v32bf16(<32 x bfloat> %v) + ret <32 x bfloat> %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index 08f486b601328..01bd706ed31f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -1,12 +1,224 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <2 x bfloat> @vfabs_vv_v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fabs.v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfabs_vv_v2bf16_unmasked(<2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, 
a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fabs.v2bf16(<2 x bfloat> %va, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfabs_vv_v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fabs.v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfabs_vv_v4bf16_unmasked(<4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fabs.v4bf16(<4 x bfloat> %va, <4 x i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfabs_vv_v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fabs.v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfabs_vv_v8bf16_unmasked(<8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli 
zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fabs.v8bf16(<8 x bfloat> %va, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfabs_vv_v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fabs.v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfabs_vv_v16bf16_unmasked(<16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfabs.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fabs.v16bf16(<16 x bfloat> %va, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.fabs.v2f16(<2 x half>, <2 x i1>, i32) @@ -24,6 +236,14 @@ define <2 x half> @vfabs_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v } @@ -42,6 +262,14 @@ define <2 x half> @vfabs_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v } @@ -62,6 +290,14 @@ define <4 x half> @vfabs_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, 
a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v } @@ -80,6 +316,14 @@ define <4 x half> @vfabs_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v } @@ -100,6 +344,14 @@ define <8 x half> @vfabs_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v } @@ -118,6 +370,14 @@ define <8 x half> @vfabs_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v } @@ -138,6 +398,14 @@ define <16 x half> @vfabs_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v } @@ -156,6 +424,14 @@ define <16 x half> @vfabs_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfabs_vv_v16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: addi a1, a1, -1 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v } @@ -367,10 +643,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 +; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 @@ -390,10 +666,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 16 
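+; The check-line changes in these hunks differ only in label numbers: the
+; bf16 tests added above shift the basic-block numbering, so .LBB26/.LBB27
+; become .LBB34/.LBB35. The code itself is the usual two-part VP split for
+; <32 x double>: roughly, the first half runs at min(%evl, 16) and the tail
+; at %evl - 16, so e.g. %evl = 20 executes with VL = 16 and then VL = 4.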
; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 +; CHECK-NEXT: bltu a0, a2, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v8 ; CHECK-NEXT: addi a1, a0, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll new file mode 100644 index 0000000000000..b3b9a62600f46 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-sdnode.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define <1 x bfloat> @v1bf16(<1 x bfloat> %va) { +; CHECK-LABEL: v1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <1 x bfloat> %va + ret <1 x bfloat> %vb +} + +define <2 x bfloat> @v2bf16(<2 x bfloat> %va) { +; CHECK-LABEL: v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16alt, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <2 x bfloat> %va + ret <2 x bfloat> %vb +} + +define <4 x bfloat> @v4bf16(<4 x bfloat> %va) { +; CHECK-LABEL: v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16alt, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <4 x bfloat> %va + ret <4 x bfloat> %vb +} + +define <8 x bfloat> @v8bf16(<8 x bfloat> %va) { +; CHECK-LABEL: v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16alt, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <8 x bfloat> %va + ret <8 x bfloat> %vb +} + +define <16 x bfloat> @v16bf16(<16 x bfloat> %va) { +; CHECK-LABEL: v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16alt, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <16 x bfloat> %va + ret <16 x bfloat> %vb +} + +define <32 x bfloat> @v32bf16(<32 x bfloat> %va) { +; CHECK-LABEL: v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: ret + %vb = fneg <32 x bfloat> %va + ret <32 x bfloat> %vb +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index 968fd9f9bab80..dede0e707d929 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -1,12 +1,208 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <2 x bfloat> @vfneg_vv_v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> %va, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %v +} + +define <2 x bfloat> @vfneg_vv_v2bf16_unmasked(<2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> %va, <2 x i1> splat (i1 true), i32 %evl) + ret <2 x bfloat> %v +} + +define <4 x bfloat> @vfneg_vv_v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> %va, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %v +} + +define <4 x bfloat> @vfneg_vv_v4bf16_unmasked(<4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> %va, <4 x 
i1> splat (i1 true), i32 %evl) + ret <4 x bfloat> %v +} + +define <8 x bfloat> @vfneg_vv_v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> %va, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %v +} + +define <8 x bfloat> @vfneg_vv_v8bf16_unmasked(<8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> %va, <8 x i1> splat (i1 true), i32 %evl) + ret <8 x bfloat> %v +} + +define <16 x bfloat> @vfneg_vv_v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> %va, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %v +} + +define <16 x bfloat> @vfneg_vv_v16bf16_unmasked(<16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> %va, <16 x i1> splat (i1 true), i32 %evl) + ret <16 x bfloat> %v +} declare <2 x half> @llvm.vp.fneg.v2f16(<2 x half>, <2 x i1>, i32) @@ -23,6 +219,13 @@ define <2 x half> @vfneg_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, 
a1, v0.t +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v } @@ -40,6 +243,13 @@ define <2 x half> @vfneg_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v } @@ -59,6 +269,13 @@ define <4 x half> @vfneg_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v } @@ -76,6 +293,13 @@ define <4 x half> @vfneg_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v } @@ -95,6 +319,13 @@ define <8 x half> @vfneg_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v } @@ -112,6 +343,13 @@ define <8 x half> @vfneg_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v } @@ -131,6 +369,13 @@ define <16 x half> @vfneg_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v } @@ -148,6 +393,13 @@ define <16 x half> @vfneg_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_v16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; 
ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v } @@ -359,10 +611,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 +; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 @@ -382,10 +634,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 +; CHECK-NEXT: bltu a0, a2, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8 ; CHECK-NEXT: addi a1, a0, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir new file mode 100644 index 0000000000000..76dfd4e746bea --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure the verifier doesn't fail due to dropping subregister +# uses. + +--- +name: machine_combiner_subreg_verifier_error +tracksRegLiveness: true +isSSA: true +body: | + bb.0: + liveins: $v8m4, $v12m4 + + ; CHECK-LABEL: name: machine_combiner_subreg_verifier_error + ; CHECK: liveins: $v8m4, $v12m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gprnox0 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[PseudoVSLIDEDOWN_VI_M8_:%[0-9]+]]:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, [[DEF2]], 26, 2, 5 /* e32 */, 3 /* ta, ma */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[DEF2]].sub_vrm1_0, killed [[DEF3]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_1:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[PseudoVSLIDEDOWN_VI_M8_]].sub_vrm1_0, killed [[PseudoVADD_VV_MF2_]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:vrm4 = IMPLICIT_DEF + %1:gprnox0 = IMPLICIT_DEF + %2:vrm8 = IMPLICIT_DEF + %3:vr = IMPLICIT_DEF + %4:vrm2 = IMPLICIT_DEF + %5:vr = IMPLICIT_DEF + %6:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, %2, 26, 2, 5 /* e32 */, 3 /* ta, ma */ + %7:vr = PseudoVADD_VV_MF2 $noreg, %6.sub_vrm1_0, %2.sub_vrm1_0, 2, 5 /* e32 */, 1 /* ta, mu */ + %8:vr = PseudoVADD_VV_MF2 $noreg, killed %7, killed %3, 2, 5 /* e32 */, 1 /* ta, mu */ + PseudoRET implicit $v8 + +... 
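Note on the mask constants recurring in the vfabs/vfneg/vfsgnj check lines above and below: without native bf16/f16 vector FP support (ZVFH/ZVFHMIN prefixes), these ops are lowered as integer bit manipulation. `lui a1, 8` materializes 0x8000 (8 << 12, the sign bit of a 16-bit element), and `addi a1, a1, -1` turns it into the magnitude mask 0x7fff. A minimal scalar C sketch of the same arithmetic, for illustration only (the helper names are hypothetical and not part of this patch):

#include <stdint.h>

/* Scalar equivalents of the masked-integer lowering the ZVFH/ZVFHMIN
 * check lines assert when no native 16-bit vector FP op is available. */
static inline uint16_t f16_fneg(uint16_t x) { return x ^ 0x8000; } /* vxor.vx v8, v8, a1 */
static inline uint16_t f16_fabs(uint16_t x) { return x & 0x7fff; } /* vand.vx v8, v8, a1 */
static inline uint16_t f16_copysign(uint16_t mag, uint16_t sgn) {
  /* vand.vx + vand.vx + vor.vv, as in the vfsgnj tests */
  return (uint16_t)((mag & 0x7fff) | (sgn & 0x8000));
}

With +experimental-zvfbfa the same operations instead select native e16alt instructions (vfneg.v, vfabs.v, vfsgnj.vv), which is what the ZVFBFA check prefixes verify.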
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll new file mode 100644 index 0000000000000..bef53c6a5ae62 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: mv t1, t0 +; CHECK-NEXT: slli t0, t0, 1 +; CHECK-NEXT: add t0, t0, t1 +; CHECK-NEXT: sub sp, sp, t0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t0, 56(a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t1, 48(a1) +; CHECK-NEXT: vsetvli t2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t2, 40(a1) +; CHECK-NEXT: # kill: def $v10 killed $v9 killed $vtype +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t3, 32(a1) +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t4, 16(a1) +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld 
t5, 24(a1) +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v22, 0 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: sd zero, 0(a0) +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v15, v9 +; CHECK-NEXT: vmv1r.v v18, v9 +; CHECK-NEXT: li t6, 1023 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmv1r.v v19, v9 +; CHECK-NEXT: slli t6, t6, 52 +; CHECK-NEXT: vmv.v.i v28, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs2r.v v22, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v24, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: ld a2, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: vs2r.v v28, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: sd t6, 0(t5) +; CHECK-NEXT: vmv2r.v v16, v14 +; CHECK-NEXT: vmv2r.v v14, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv1r.v v21, v9 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vs2r.v v20, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v19, 0 +; CHECK-NEXT: vmclr.m v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.i v6, 0 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v20, v19 +; CHECK-NEXT: vmv1r.v v3, v19 +; CHECK-NEXT: vmv1r.v v5, v19 +; CHECK-NEXT: vmv1r.v v2, v19 +; CHECK-NEXT: vmv1r.v v31, v19 +; CHECK-NEXT: vmv1r.v v30, v19 +; CHECK-NEXT: vmv1r.v v4, v19 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv4r.v v24, v12 +; CHECK-NEXT: vmv2r.v v28, v16 +; CHECK-NEXT: vmv2r.v v8, v6 +; CHECK-NEXT: vmv1r.v v18, v19 +; CHECK-NEXT: vmv1r.v v21, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vle32.v v20, (t4) +; CHECK-NEXT: vle32.v v3, (t1) +; CHECK-NEXT: vle32.v v30, (a7) +; CHECK-NEXT: vle64.v v8, (a4) +; CHECK-NEXT: vle32.v v5, (t2) +; CHECK-NEXT: vle32.v v2, (t3) +; CHECK-NEXT: vle32.v v31, (a6) +; CHECK-NEXT: vmv1r.v v24, v30 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmflt.vv v21, v8, v6, v0.t +; CHECK-NEXT: vmv1r.v v8, v19 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; CHECK-NEXT: vle32.v v18, (a2) +; CHECK-NEXT: vle32.v v8, (a3) +; CHECK-NEXT: vle32.v v4, (a5) +; CHECK-NEXT: vmv1r.v v22, v20 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl1r.v v1, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl2r.v v2, (t5) # vscale x 16-byte Folded Reload +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl1r.v v4, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsseg4e32.v v1, (zero) +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vmv1r.v v0, v21 +; CHECK-NEXT: vssub.vv v8, v19, v18, v0.t +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 2 +; CHECK-NEXT: mv t6, t5 +; CHECK-NEXT: slli t5, t5, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi 
t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, t0, e64, m2, ta, ma +; CHECK-NEXT: vsseg2e64.v v20, (zero) +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: addi t5, sp, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetivli zero, 0, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 4 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero) +; CHECK-NEXT: j .LBB0_1 +entry: + store double 0.000000e+00, ptr %var_117, align 8 + store double 1.000000e+00, ptr %arrayinit.element3061, align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %2 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3059, i64 0) + %3 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3067, i64 0) + %4 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3065, i64 0) + %5 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3063, i64 0) + %6 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3055, i64 0) + %7 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3057, i64 0) + %8 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3053, i64 0) + %9 = call <vscale x 2 x double> @llvm.riscv.vle.nxv2f64.p0.i64(<vscale x 2 x double> zeroinitializer, ptr %arrayinit.element3051, i64 0) + %10 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3047, i64 0) + %11 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3049, i64 0) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) zeroinitializer, ptr null, i64 0, i64 5) + %12 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) zeroinitializer, <vscale x 2 x float> %8, i32 0) + %13 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %12, <vscale x 2 x float> %7, i32 2) + %14 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %13, <vscale x 2 x float> %6, i32 0) + %15 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %14, <vscale x 2 x float> %5, i32 0) + %16 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %15, <vscale x 2 x float> %4, i32 0) + %17 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %16, <vscale x 2 x float> %3, i32 0) + %18 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %17, <vscale x 2 x float> %2, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %18, ptr %arrayinit.element3045, i64 0, i64 5) + %19 = tail call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %9, <vscale x 2 x i1> zeroinitializer, i64 0) + %20 = tail call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> %11, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %10, <vscale x 2 x i1> %19, i64 0, i64 0) + call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, ptr null, i64 %var_94_i.07698, i64 6) + call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) zeroinitializer, ptr null, <vscale x 2 x i1> zeroinitializer, i64 0, i64 6) + %21 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 2 x i32> %20, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %21, ptr %var_117, i64 0, i64 5) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, ptr null, i64 0, i64 6) + br label %for.body +} diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab9849631663c..a4c793b49d54a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -36,7 +36,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: .cfi_offset s10, -96 ; CHECK-NEXT: .cfi_offset s11, -104 ; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: li s2, 8 +; CHECK-NEXT: li a7, 8 ; CHECK-NEXT: li t0, 12 ; CHECK-NEXT: li s0, 4 ; CHECK-NEXT: li t1, 20 @@ -45,7 +45,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: andi t3, a4, 1 -; CHECK-NEXT: li t2, 4 +; CHECK-NEXT: li s2, 4 ; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader.i ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_2 Depth 2 @@ -53,9 +53,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv t4, t1 -; CHECK-NEXT: mv t5, t2 +; CHECK-NEXT: mv t2, s2 ; CHECK-NEXT: mv t6, t0 -; CHECK-NEXT: mv a7, s2 +; 
CHECK-NEXT: mv s3, a7 ; CHECK-NEXT: mv s4, a6 ; CHECK-NEXT: .LBB0_2: # %for.cond5.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -64,9 +64,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s5, t4 -; CHECK-NEXT: mv s6, t5 +; CHECK-NEXT: mv t5, t2 ; CHECK-NEXT: mv s7, t6 -; CHECK-NEXT: mv s3, a7 +; CHECK-NEXT: mv s8, s3 ; CHECK-NEXT: mv s9, s4 ; CHECK-NEXT: .LBB0_3: # %for.cond9.preheader.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -75,9 +75,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Child Loop BB0_4 Depth 4 ; CHECK-NEXT: # Child Loop BB0_5 Depth 5 ; CHECK-NEXT: mv s11, s5 -; CHECK-NEXT: mv a3, s6 +; CHECK-NEXT: mv s6, t5 ; CHECK-NEXT: mv ra, s7 -; CHECK-NEXT: mv s8, s3 +; CHECK-NEXT: mv a5, s8 ; CHECK-NEXT: mv s1, s9 ; CHECK-NEXT: .LBB0_4: # %vector.ph.i ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 @@ -92,45 +92,44 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal ; CHECK-NEXT: # Parent Loop BB0_3 Depth=3 ; CHECK-NEXT: # Parent Loop BB0_4 Depth=4 ; CHECK-NEXT: # => This Inner Loop Header: Depth=5 -; CHECK-NEXT: addi a5, a1, 4 -; CHECK-NEXT: add a4, s8, a1 -; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: add a3, s6, a1 +; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: vse32.v v8, (a4), v0.t -; CHECK-NEXT: vse32.v v8, (a1), v0.t -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: bne a5, s0, .LBB0_5 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: bne a1, s0, .LBB0_5 ; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=4 ; CHECK-NEXT: addi s1, s1, 4 -; CHECK-NEXT: addi s8, s8, 4 +; CHECK-NEXT: addi a5, a5, 4 ; CHECK-NEXT: addi ra, ra, 4 -; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi s6, s6, 4 ; CHECK-NEXT: andi s10, a0, 1 ; CHECK-NEXT: addi s11, s11, 4 ; CHECK-NEXT: beqz s10, .LBB0_4 ; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=3 ; CHECK-NEXT: addi s9, s9, 4 -; CHECK-NEXT: addi s3, s3, 4 +; CHECK-NEXT: addi s8, s8, 4 ; CHECK-NEXT: addi s7, s7, 4 -; CHECK-NEXT: addi s6, s6, 4 +; CHECK-NEXT: addi t5, t5, 4 ; CHECK-NEXT: andi a1, a2, 1 ; CHECK-NEXT: addi s5, s5, 4 ; CHECK-NEXT: beqz a1, .LBB0_3 ; CHECK-NEXT: # %bb.8: # %for.cond.cleanup7.i ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=2 ; CHECK-NEXT: addi s4, s4, 4 -; CHECK-NEXT: addi a7, a7, 4 +; CHECK-NEXT: addi s3, s3, 4 ; CHECK-NEXT: addi t6, t6, 4 -; CHECK-NEXT: addi t5, t5, 4 +; CHECK-NEXT: addi t2, t2, 4 ; CHECK-NEXT: addi t4, t4, 4 ; CHECK-NEXT: beqz t3, .LBB0_2 ; CHECK-NEXT: # %bb.9: # %for.cond.cleanup3.i ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: addi a6, a6, 4 -; CHECK-NEXT: addi s2, s2, 4 +; CHECK-NEXT: addi a7, a7, 4 ; CHECK-NEXT: addi t0, t0, 4 -; CHECK-NEXT: addi t2, t2, 4 +; CHECK-NEXT: addi s2, s2, 4 ; CHECK-NEXT: addi t1, t1, 4 ; CHECK-NEXT: beqz a1, .LBB0_1 ; CHECK-NEXT: # %bb.10: # %l.exit diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll index d666832cf6e0b..c79fb0f91b21f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll @@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 { ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: sub sp, sp, a0 ; RV64I-NEXT: sd zero, 0(sp) -; RV64I-NEXT: .cfi_def_cfa_offset 4096 +; RV64I-NEXT: 
.cfi_def_cfa_offset 6128 ; RV64I-NEXT: addi sp, sp, -48 -; RV64I-NEXT: .cfi_def_cfa_offset 4144 +; RV64I-NEXT: .cfi_def_cfa_offset 6176 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: add a0, sp, a0 ; RV64I-NEXT: call callee_stack_args @@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 { ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: sub sp, sp, a0 ; RV32I-NEXT: sw zero, 0(sp) -; RV32I-NEXT: .cfi_def_cfa_offset 4096 +; RV32I-NEXT: .cfi_def_cfa_offset 6128 ; RV32I-NEXT: addi sp, sp, -80 -; RV32I-NEXT: .cfi_def_cfa_offset 4176 +; RV32I-NEXT: .cfi_def_cfa_offset 6208 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a0, a0, 36 ; RV32I-NEXT: add a0, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index f295bd8d74df3..386c736128794 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2258,18 +2258,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: .LBB98_3: # %vector.body ; CHECK-RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-RV32-NEXT: slli a7, a6, 2 -; CHECK-RV32-NEXT: add t0, a6, a4 -; CHECK-RV32-NEXT: add a7, a0, a7 -; CHECK-RV32-NEXT: vl2re32.v v8, (a7) -; CHECK-RV32-NEXT: sltu a6, t0, a6 -; CHECK-RV32-NEXT: add a5, a5, a6 -; CHECK-RV32-NEXT: xor a6, t0, a3 +; CHECK-RV32-NEXT: mv a7, a6 +; CHECK-RV32-NEXT: slli t0, a6, 2 +; CHECK-RV32-NEXT: add a6, a6, a4 +; CHECK-RV32-NEXT: add t0, a0, t0 +; CHECK-RV32-NEXT: vl2re32.v v8, (t0) +; CHECK-RV32-NEXT: sltu a7, a6, a7 +; CHECK-RV32-NEXT: add a5, a5, a7 +; CHECK-RV32-NEXT: xor a7, a6, a3 ; CHECK-RV32-NEXT: vand.vx v8, v8, a1 -; CHECK-RV32-NEXT: or t1, a6, a5 -; CHECK-RV32-NEXT: vs2r.v v8, (a7) -; CHECK-RV32-NEXT: mv a6, t0 -; CHECK-RV32-NEXT: bnez t1, .LBB98_3 +; CHECK-RV32-NEXT: or a7, a7, a5 +; CHECK-RV32-NEXT: vs2r.v v8, (t0) +; CHECK-RV32-NEXT: bnez a7, .LBB98_3 ; CHECK-RV32-NEXT: # %bb.4: # %middle.block ; CHECK-RV32-NEXT: bnez a3, .LBB98_6 ; CHECK-RV32-NEXT: .LBB98_5: # %for.body @@ -2350,18 +2350,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-NOZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-NOZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-NOZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-NOZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-NOZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-NOZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-NOZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-NOZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-NOZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-NOZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-NOZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-NOZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-NOZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-NOZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-NOZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-NOZBB32-NEXT: bnez a3, .LBB98_7 ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_5: # %for.body.preheader @@ -2444,18 +2444,18 @@ define void 
@vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-ZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-ZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-ZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-ZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-ZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-ZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-ZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-ZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-ZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-ZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-ZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-ZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-ZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-ZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-ZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-ZBB32-NEXT: bnez a3, .LBB98_6 ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_5: # %for.body diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll index ccf82b93d6b75..2f5fde3bb3b20 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll @@ -1,12 +1,376 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <vscale x 1 x bfloat> @vfsgnj_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma 
+; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.copysign.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> %m, i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 1 x bfloat> @vfsgnj_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv1bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.copysign.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfsgnj_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.copysign.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> %m, i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfsgnj_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; 
ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.copysign.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x bfloat> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfsgnj_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.copysign.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> %m, i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfsgnj_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vand.vx v9, v9, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v9 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9 +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.copysign.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x bfloat> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfsgnj_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v10, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v10, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.copysign.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x 
bfloat> %vb, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfsgnj_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vand.vx v10, v10, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v10 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10 +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.copysign.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x bfloat> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfsgnj_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vand.vx v12, v12, a1, v0.t +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFH-NEXT: vor.vv v8, v8, v12, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1, v0.t +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vor.vv v8, v8, v12, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.copysign.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> %m, i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfsgnj_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vand.vx v12, v12, a1 +; ZVFH-NEXT: addi a1, a1, -1 +; ZVFH-NEXT: vand.vx v8, v8, a1 +; ZVFH-NEXT: vor.vv v8, v8, v12 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfsgnj_vv_nxv16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12 +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.copysign.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfsgnj_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale 
x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.copysign.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfsgnj_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a1
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: vor.vv v8, v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a1
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.copysign.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
 declare <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
@@ -26,6 +390,16 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -46,6 +420,16 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -68,6 +452,16 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -88,6 +482,16 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -110,6 +514,16 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v9, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -130,6 +544,16 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -152,6 +576,16 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v10, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -172,6 +606,16 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -194,6 +638,16 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v12, v12, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v12, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -214,6 +668,16 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v12, v12, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -236,6 +700,16 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v16, v16, a1, v0.t
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: vor.vv v8, v8, v16, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -256,6 +730,16 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfsgnj_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v16, v16, a1
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 32 x half> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
index ed6b7f1e6efb8..10440089cff10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -25,24 +25,24 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
 ; RV32-NEXT: li a6, 0
 ; RV32-NEXT: .LBB0_4: # %vector.body
 ; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: slli t0, a7, 2
-; RV32-NEXT: addi t1, a7, 8
-; RV32-NEXT: add t0, a1, t0
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli t1, a7, 2
+; RV32-NEXT: addi a7, a7, 8
+; RV32-NEXT: add t1, a1, t1
 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (t0)
-; RV32-NEXT: sltu a7, t1, a7
-; RV32-NEXT: xor t0, t1, a5
-; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: vle32.v v8, (t1)
+; RV32-NEXT: sltu t0, a7, t0
+; RV32-NEXT: xor t1, a7, a5
+; RV32-NEXT: add a6, a6, t0
 ; RV32-NEXT: vmslt.vx v12, v8, a2
 ; RV32-NEXT: vcompress.vm v10, v8, v12
-; RV32-NEXT: vcpop.m a7, v12
-; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma
+; RV32-NEXT: vcpop.m t0, v12
+; RV32-NEXT: vsetvli zero, t0, e32, m2, ta, ma
 ; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: slli a7, a7, 2
-; RV32-NEXT: or t0, t0, a6
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: mv a7, t1
-; RV32-NEXT: bnez t0, .LBB0_4
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: or t1, t1, a6
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: bnez t1, .LBB0_4
 ; RV32-NEXT: # %bb.5: # %middle.block
 ; RV32-NEXT: bne a5, a3, .LBB0_9
 ; RV32-NEXT: .LBB0_6: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
index 1d8638844af7f..28426ad018b83 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll
@@ -11,75 +11,165 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %v) {
-; CHECK-LABEL: nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x bfloat> @llvm.fabs.nxv1bf16(<vscale x 1 x bfloat> %v)
 ret <vscale x 1 x bfloat> %r
 }
 define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %v) {
-; CHECK-LABEL: nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> %v)
 ret <vscale x 2 x bfloat> %r
 }
 define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %v) {
-; CHECK-LABEL: nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> %v)
 ret <vscale x 4 x bfloat> %r
 }
 define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %v) {
-; CHECK-LABEL: nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> %v)
 ret <vscale x 8 x bfloat> %r
 }
 define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %v) {
-; CHECK-LABEL: nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> %v)
 ret <vscale x 16 x bfloat> %r
 }
 define <vscale x 32 x bfloat> @nxv32bf16(<vscale x 32 x bfloat> %v) {
-; CHECK-LABEL: nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x bfloat> @llvm.fabs.nxv32bf16(<vscale x 32 x bfloat> %v)
 ret <vscale x 32 x bfloat> %r
 }
@@ -100,6 +190,14 @@ define <vscale x 1 x half> @vfabs_nxv1f16(<vscale x 1 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x half> @llvm.fabs.nxv1f16(<vscale x 1 x half> %v)
 ret <vscale x 1 x half> %r
 }
@@ -120,6 +218,14 @@ define <vscale x 2 x half> @vfabs_nxv2f16(<vscale x 2 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> %v)
 ret <vscale x 2 x half> %r
 }
@@ -140,6 +246,14 @@ define <vscale x 4 x half> @vfabs_nxv4f16(<vscale x 4 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> %v)
 ret <vscale x 4 x half> %r
 }
@@ -160,6 +274,14 @@ define <vscale x 8 x half> @vfabs_nxv8f16(<vscale x 8 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> %v)
 ret <vscale x 8 x half> %r
 }
@@ -180,6 +302,14 @@ define <vscale x 16 x half> @vfabs_nxv16f16(<vscale x 16 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> %v)
 ret <vscale x 16 x half> %r
 }
@@ -200,6 +330,14 @@ define <vscale x 32 x half> @vfabs_nxv32f16(<vscale x 32 x half> %v) {
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x half> @llvm.fabs.nxv32f16(<vscale x 32 x half> %v)
 ret <vscale x 32 x half> %r
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index 8f9f9c4256c8f..c6888c0bcae0f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -1,12 +1,328 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v,+experimental-zvfbfa -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v,+experimental-zvfbfa -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA
+
+define <vscale x 1 x bfloat> @vfabs_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fabs.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 1 x bfloat> @vfabs_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 1 x bfloat> @llvm.vp.fabs.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 1 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfabs_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fabs.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 2 x bfloat> @vfabs_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 2 x bfloat> @llvm.vp.fabs.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 2 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfabs_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fabs.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 4 x bfloat> @vfabs_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 4 x bfloat> @llvm.vp.fabs.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 4 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfabs_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fabs.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 8 x bfloat> @vfabs_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 8 x bfloat> @llvm.vp.fabs.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 8 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfabs_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fabs.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 16 x bfloat> @vfabs_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 16 x bfloat> @llvm.vp.fabs.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 16 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfabs_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv32bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv32bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8, v0.t
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fabs.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
+
+define <vscale x 32 x bfloat> @vfabs_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
+; ZVFH-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a1, 8
+; ZVFH-NEXT: addi a1, a1, -1
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v8, v8, a1
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a1, 8
+; ZVFHMIN-NEXT: addi a1, a1, -1
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v8, v8, a1
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32bf16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfabs.v v8, v8
+; ZVFBFA-NEXT: ret
+ %v = call <vscale x 32 x bfloat> @llvm.vp.fabs.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x bfloat> %v
+}
 declare <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32)
@@ -24,6 +340,14 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -42,6 +366,14 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv1f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 1 x half> %v
 }
@@ -62,6 +394,14 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -80,6 +420,14 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv2f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 2 x half> %v
 }
@@ -100,6 +448,14 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -118,6 +474,14 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv4f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 4 x half> %v
 }
@@ -138,6 +502,14 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -156,6 +528,14 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv8f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 8 x half> %v
 }
@@ -176,6 +556,14 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -194,6 +582,14 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv16f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 16 x half> %v
 }
@@ -214,6 +610,14 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1, v0.t
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -232,6 +636,14 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfabs_vv_nxv32f16_unmasked:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: addi a1, a1, -1
+; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: ret
 %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
 ret <vscale x 32 x half> %v
 }
@@ -473,10 +885,10 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT: and a2, a2, a3
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB32_2
+; CHECK-NEXT: bltu a0, a1, .LBB44_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: .LBB44_2:
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v8, v0.t
@@ -495,10 +907,10 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT: and a2, a3, a2
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v16, v16
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
+; CHECK-NEXT: bltu a0, a1, .LBB45_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: .LBB45_2:
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT: vfabs.v v8, v8
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
index 83f588ce5027d..bef2e8d3b57fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
@@ -11,87 +11,189 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \
 ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
 ; RUN: --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \
+; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \
+; RUN: --check-prefixes=CHECK,ZVFBFA
 define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %vm, <vscale x 1 x bfloat> %vs) {
-; CHECK-LABEL: nxv1bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv1bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv1bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv1bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> %vm, <vscale x 1 x bfloat> %vs)
 ret <vscale x 1 x bfloat> %r
 }
 define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %vm, <vscale x 2 x bfloat> %vs) {
-; CHECK-LABEL: nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv2bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv2bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv2bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %vm, <vscale x 2 x bfloat> %vs)
 ret <vscale x 2 x bfloat> %r
 }
 define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %vm, <vscale x 4 x bfloat> %vs) {
-; CHECK-LABEL: nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vand.vx v9, v9, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv4bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFH-NEXT: vand.vx v9, v9, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv4bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vand.vx v9, v9, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v9
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv4bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %vm, <vscale x 4 x bfloat> %vs)
 ret <vscale x 4 x bfloat> %r
 }
 define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %vm, <vscale x 8 x bfloat> %vs) {
-; CHECK-LABEL: nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vand.vx v10, v10, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v10
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv8bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFH-NEXT: vand.vx v10, v10, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v10
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv8bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vand.vx v10, v10, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v10
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv8bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %vm, <vscale x 8 x bfloat> %vs)
 ret <vscale x 8 x bfloat> %r
 }
 define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %vm, <vscale x 16 x bfloat> %vs) {
-; CHECK-LABEL: nxv16bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT: vand.vx v12, v12, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v12
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv16bf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT: vand.vx v12, v12, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v12
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv16bf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vand.vx v12, v12, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v12
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv16bf16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v12
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> %vm, <vscale x 16 x bfloat> %vs)
 ret <vscale x 16 x bfloat> %r
 }
 define <vscale x 32 x bfloat> @nxv32bf32(<vscale x 32 x bfloat> %vm, <vscale x 32 x bfloat> %vs) {
-; CHECK-LABEL: nxv32bf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vand.vx v16, v16, a0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: vand.vx v8, v8, a0
-; CHECK-NEXT: vor.vv v8, v8, v16
-; CHECK-NEXT: ret
+; ZVFH-LABEL: nxv32bf32:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: lui a0, 8
+; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT: vand.vx v16, v16, a0
+; ZVFH-NEXT: addi a0, a0, -1
+; ZVFH-NEXT: vand.vx v8, v8, a0
+; ZVFH-NEXT: vor.vv v8, v8, v16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: nxv32bf32:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: lui a0, 8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vand.vx v16, v16, a0
+; ZVFHMIN-NEXT: addi a0, a0, -1
+; ZVFHMIN-NEXT: vand.vx v8, v8, a0
+; ZVFHMIN-NEXT: vor.vv v8, v8, v16
+; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: nxv32bf32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma
+; ZVFBFA-NEXT: vfsgnj.vv v8, v8, v16
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 32 x bfloat> @llvm.copysign.nxv32bf32(<vscale x 32 x bfloat> %vm, <vscale x 32 x bfloat> %vs)
 ret <vscale x 32 x bfloat> %r
 }
@@ -114,6 +216,16 @@ define <vscale x 1 x half> @vfcopysign_vv_nxv1f16(<vscale x 1 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %vs)
 ret <vscale x 1 x half> %r
 }
@@ -136,6 +248,18 @@ define <vscale x 1 x half> @vfcopysign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %splat)
@@ -159,6 +283,17 @@ define <vscale x 1 x half> @vfcopynsign_vv_nxv1f16(<vscale x 1 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x half> %vs
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %n)
 ret <vscale x 1 x half> %r
@@ -183,6 +318,19 @@ define <vscale x 1 x half> @vfcopynsign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv1f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x half> %splat
@@ -208,6 +356,17 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vv_nxv1f16_nxv1f32(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vand.vx v9, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 1 x float> %vs to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %e)
 ret <vscale x 1 x half> %r
@@ -235,6 +394,19 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v10, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 1 x float> %head, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 1 x float> %splat to <vscale x 1 x half>
@@ -261,6 +433,18 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vxor.vx v9, v10, a0
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x float> %vs
 %eneg = fptrunc <vscale x 1 x float> %n to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %eneg)
@@ -290,6 +474,20 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v9
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vxor.vx v9, v10, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 1 x float> %head, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x float> %splat
@@ -320,6 +518,19 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 1 x double> %vs to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %e)
 ret <vscale x 1 x half> %r
@@ -351,6 +562,21 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x double> poison, double %s, i32 0
 %splat = shufflevector <vscale x 1 x double> %head, <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 1 x double> %splat to <vscale x 1 x half>
@@ -381,6 +607,20 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 1 x double> %vs
 %eneg = fptrunc <vscale x 1 x double> %n to <vscale x 1 x half>
 %r = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> %vm, <vscale x 1 x half> %eneg)
@@ -414,6 +654,22 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v9, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFA-NEXT: vfncvt.rod.f.f.w v10, v9
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vfncvt.f.f.w v9, v10
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 1 x double> poison, double %s, i32 0
 %splat = shufflevector <vscale x 1 x double> %head, <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer
 %n = fneg <vscale x 1 x double> %splat
@@ -440,6 +696,16 @@ define <vscale x 2 x half> @vfcopysign_vv_nxv2f16(<vscale x 2 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %vs)
 ret <vscale x 2 x half> %r
 }
@@ -462,6 +728,18 @@ define <vscale x 2 x half> @vfcopysign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %splat)
@@ -485,6 +763,17 @@ define <vscale x 2 x half> @vfcopynsign_vv_nxv2f16(<vscale x 2 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 2 x half> %vs
 %r = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %vm, <vscale x 2 x half> %n)
 ret <vscale x 2 x half> %r
@@ -509,6 +798,19 @@ define <vscale x 2 x half> @vfcopynsign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv2f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
 %n = fneg <vscale x 2 x half> %splat
@@ -534,6 +836,16 @@ define <vscale x 4 x half> @vfcopysign_vv_nxv4f16(<vscale x 4 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %vs)
 ret <vscale x 4 x half> %r
 }
@@ -556,6 +868,18 @@ define <vscale x 4 x half> @vfcopysign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %splat)
@@ -579,6 +903,17 @@ define <vscale x 4 x half> @vfcopynsign_vv_nxv4f16(<vscale x 4 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vxor.vx v9, v9, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 4 x half> %vs
 %r = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %vm, <vscale x 4 x half> %n)
 ret <vscale x 4 x half> %r
@@ -603,6 +938,19 @@ define <vscale x 4 x half> @vfcopynsign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv4f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v9, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v9, v9, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v9, v9, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v9
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
 %n = fneg <vscale x 4 x half> %splat
@@ -628,6 +976,16 @@ define <vscale x 8 x half> @vfcopysign_vv_nxv8f16(<vscale x 8 x half> %vm, <vsca
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %vs)
 ret <vscale x 8 x half> %r
 }
@@ -650,6 +1008,18 @@ define <vscale x 8 x half> @vfcopysign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v10, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %splat)
@@ -673,6 +1043,17 @@ define <vscale x 8 x half> @vfcopynsign_vv_nxv8f16(<vscale x 8 x half> %vm, <vsc
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vv_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vxor.vx v10, v10, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %n = fneg <vscale x 8 x half> %vs
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %n)
 ret <vscale x 8 x half> %r
@@ -697,6 +1078,19 @@ define <vscale x 8 x half> @vfcopynsign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_vf_nxv8f16:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: fmv.x.h a0, fa0
+; ZVFBFA-NEXT: lui a1, 8
+; ZVFBFA-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vmv.v.x v10, a0
+; ZVFBFA-NEXT: addi a0, a1, -1
+; ZVFBFA-NEXT: vxor.vx v10, v10, a1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vand.vx v10, v10, a1
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
 %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
 %n = fneg <vscale x 8 x half> %splat
@@ -722,6 +1116,17 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vv_nxv8f16_nxv8f32(<vscale x 8 x
 ; ZVFHMIN-NEXT: vand.vx v8, v8, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: addi a0, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %e = fptrunc <vscale x 8 x float> %vs to <vscale x 8 x half>
 %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %e)
 ret <vscale x 8 x half> %r
@@ -749,6 +1154,19 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8 x
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; ZVFBFA-NEXT: vfmv.v.f v12, fa0
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12
+; ZVFBFA-NEXT: addi a1, a0, -1
+; ZVFBFA-NEXT: vand.vx v8, v8, a1
+; ZVFBFA-NEXT: vand.vx v10, v10, a0
+; ZVFBFA-NEXT: vor.vv v8, v8, v10
+; ZVFBFA-NEXT: ret
 %head = insertelement <vscale x 8 x float> poison, float %s, i32 0
 %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
 %esplat = fptrunc <vscale x 8 x float> %splat to <vscale x 8 x half>
@@ -775,6 +1193,18 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32(<vscale x 8
 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT: ret
+;
+; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32:
+; ZVFBFA: # %bb.0:
+; ZVFBFA-NEXT: lui a0, 8
+; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2,
ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 8 x float> %vs %eneg = fptrunc <vscale x 8 x float> %n to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %eneg) @@ -804,6 +1234,20 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v12, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x float> poison, float %s, i32 0 %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer %n = fneg <vscale x 8 x float> %splat @@ -834,6 +1278,19 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8 x ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_exttrunc_vv_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %e = fptrunc <vscale x 8 x double> %vs to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %e) ret <vscale x 8 x half> %r @@ -865,6 +1322,21 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8 x ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_exttrunc_vf_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x double> poison, double %s, i32 0 %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer %esplat = fptrunc <vscale x 8 x double> %splat to <vscale x 8 x half> @@ -895,6 +1367,20 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; 
ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 8 x double> %vs %eneg = fptrunc <vscale x 8 x double> %n to <vscale x 8 x half> %r = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %vm, <vscale x 8 x half> %eneg) @@ -928,6 +1414,22 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8 ; ZVFHMIN-NEXT: vand.vx v10, v10, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: addi a1, a0, -1 +; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFA-NEXT: vfncvt.rod.f.f.w v12, v16 +; ZVFBFA-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vand.vx v8, v8, a1 +; ZVFBFA-NEXT: vfncvt.f.f.w v10, v12 +; ZVFBFA-NEXT: vxor.vx v10, v10, a0 +; ZVFBFA-NEXT: vand.vx v10, v10, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v10 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 8 x double> poison, double %s, i32 0 %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer %n = fneg <vscale x 8 x double> %splat @@ -954,6 +1456,16 @@ define <vscale x 16 x half> @vfcopysign_vv_nxv16f16(<vscale x 16 x half> %vm, <v ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vand.vx v12, v12, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %r = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %vs) ret <vscale x 16 x half> %r } @@ -976,6 +1488,18 @@ define <vscale x 16 x half> @vfcopysign_vf_nxv16f16(<vscale x 16 x half> %vm, ha ; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vf_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 16 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %r = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %splat) @@ -999,6 +1523,17 @@ define <vscale x 16 x half> @vfcopynsign_vv_nxv16f16(<vscale x 16 x half> %vm, < ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v12, v12, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 16 x half> %vs %r = call <vscale x 16 x half> 
@llvm.copysign.nxv16f16(<vscale x 16 x half> %vm, <vscale x 16 x half> %n) ret <vscale x 16 x half> %r @@ -1023,6 +1558,19 @@ define <vscale x 16 x half> @vfcopynsign_vf_nxv16f16(<vscale x 16 x half> %vm, h ; ZVFHMIN-NEXT: vand.vx v12, v12, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vf_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v12, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vxor.vx v12, v12, a1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v12, v12, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v12 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 16 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer %n = fneg <vscale x 16 x half> %splat @@ -1048,6 +1596,16 @@ define <vscale x 32 x half> @vfcopysign_vv_nxv32f16(<vscale x 32 x half> %vm, <v ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vand.vx v16, v16, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %vs) ret <vscale x 32 x half> %r } @@ -1070,6 +1628,18 @@ define <vscale x 32 x half> @vfcopysign_vf_nxv32f16(<vscale x 32 x half> %vm, ha ; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopysign_vf_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 32 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %splat) @@ -1093,6 +1663,17 @@ define <vscale x 32 x half> @vfcopynsign_vv_nxv32f16(<vscale x 32 x half> %vm, < ; ZVFHMIN-NEXT: vand.vx v8, v8, a0 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v16, v16, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a0 +; ZVFBFA-NEXT: addi a0, a0, -1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %n = fneg <vscale x 32 x half> %vs %r = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> %vm, <vscale x 32 x half> %n) ret <vscale x 32 x half> %r @@ -1117,6 +1698,19 @@ define <vscale x 32 x half> @vfcopynsign_vf_nxv32f16(<vscale x 32 x half> %vm, h ; ZVFHMIN-NEXT: vand.vx v16, v16, a1 ; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfcopynsign_vf_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv.v.x v16, a0 +; ZVFBFA-NEXT: addi a0, a1, -1 +; ZVFBFA-NEXT: 
vxor.vx v16, v16, a1 +; ZVFBFA-NEXT: vand.vx v8, v8, a0 +; ZVFBFA-NEXT: vand.vx v16, v16, a1 +; ZVFBFA-NEXT: vor.vv v8, v8, v16 +; ZVFBFA-NEXT: ret %head = insertelement <vscale x 32 x half> poison, half %s, i32 0 %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer %n = fneg <vscale x 32 x half> %splat diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll index 9f456e97be11d..c0b4916a54e51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll @@ -11,69 +11,153 @@ ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \ +; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ +; RUN: --check-prefixes=CHECK,ZVFBFA define <vscale x 1 x bfloat> @nxv1bf16(<vscale x 1 x bfloat> %va) { -; CHECK-LABEL: nxv1bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 1 x bfloat> %va ret <vscale x 1 x bfloat> %vb } define <vscale x 2 x bfloat> @nxv2bf16(<vscale x 2 x bfloat> %va) { -; CHECK-LABEL: nxv2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 2 x bfloat> %va ret <vscale x 2 x bfloat> %vb } define <vscale x 4 x bfloat> @nxv4bf16(<vscale x 4 x bfloat> %va) { -; CHECK-LABEL: nxv4bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v 
v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 4 x bfloat> %va ret <vscale x 4 x bfloat> %vb } define <vscale x 8 x bfloat> @nxv8bf16(<vscale x 8 x bfloat> %va) { -; CHECK-LABEL: nxv8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 8 x bfloat> %va ret <vscale x 8 x bfloat> %vb } define <vscale x 16 x bfloat> @nxv16bf16(<vscale x 16 x bfloat> %va) { -; CHECK-LABEL: nxv16bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 16 x bfloat> %va ret <vscale x 16 x bfloat> %vb } define <vscale x 32 x bfloat> @nxv32bf16(<vscale x 32 x bfloat> %va) { -; CHECK-LABEL: nxv32bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a0, 8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 32 x bfloat> %va ret <vscale x 32 x bfloat> %vb } @@ -91,6 +175,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16(<vscale x 1 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 1 x half> %va ret <vscale x 1 x half> %vb } @@ -108,6 +199,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16(<vscale x 2 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 2 x half> %va ret <vscale x 2 x half> %vb } @@ -125,6 +223,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16(<vscale x 4 x half> 
%va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 4 x half> %va ret <vscale x 4 x half> %vb } @@ -142,6 +247,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16(<vscale x 8 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 8 x half> %va ret <vscale x 8 x half> %vb } @@ -159,6 +271,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16(<vscale x 16 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 16 x half> %va ret <vscale x 16 x half> %vb } @@ -176,6 +295,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va) { ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 8 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a0 +; ZVFBFA-NEXT: ret %vb = fneg <vscale x 32 x half> %va ret <vscale x 32 x half> %vb } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index bbab056f0ff46..9bd24c44b1b90 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -1,12 +1,304 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA + +define <vscale x 1 x bfloat> @vfneg_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: 
lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv1bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 1 x bfloat> @vfneg_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) + ret <vscale x 1 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfneg_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv2bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 2 x bfloat> @vfneg_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) + ret <vscale x 2 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfneg_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv4bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, 
a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 4 x bfloat> @vfneg_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfneg_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv8bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 8 x bfloat> @vfneg_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfneg_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv16bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; 
ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 16 x bfloat> @vfneg_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) + ret <vscale x 16 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfneg_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv32bf16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8, v0.t +; ZVFBFA-NEXT: ret + %v = call <vscale x 32 x bfloat> @llvm.vp.fneg.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl) + ret <vscale x 32 x bfloat> %v +} + +define <vscale x 32 x bfloat> @vfneg_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) { +; ZVFH-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: lui a1, 8 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vxor.vx v8, v8, a1 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32bf16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfneg.v v8, v8 +; ZVFBFA-NEXT: ret + %v = call <vscale x 32 x bfloat> @llvm.vp.fneg.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) + ret <vscale x 32 x bfloat> %v +} declare <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half>, <vscale x 1 x i1>, i32) @@ -23,6 +315,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v } @@ -40,6 +339,13 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; 
ZVFBFA-LABEL: vfneg_vv_nxv1f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v } @@ -59,6 +365,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v } @@ -76,6 +389,13 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv2f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v } @@ -95,6 +415,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v } @@ -112,6 +439,13 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv4f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v } @@ -131,6 +465,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v } @@ -148,6 +489,13 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv8f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 8 x half> 
@llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v } @@ -167,6 +515,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16(<vscale x 16 x half> %va, <vscale ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v } @@ -184,6 +539,13 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16_unmasked(<vscale x 16 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv16f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v } @@ -203,6 +565,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va, <vscale ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v } @@ -220,6 +589,13 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16_unmasked(<vscale x 32 x half> %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: vfneg_vv_nxv32f16_unmasked: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a1, 8 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFA-NEXT: vxor.vx v8, v8, a1 +; ZVFBFA-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v } @@ -461,10 +837,10 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs ; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v16, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8, v0.t @@ -483,10 +859,10 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double> ; CHECK-NEXT: and a2, a3, a2 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v16, v16 -; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: bltu a0, a1, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfneg.v v8, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index ad2ed47e67e64..034186210513c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -570,7 +570,82 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
 ret <vscale x 2 x i32> %i
 }
 
+define <vscale x 4 x i32> @mismatched_extend_sub_add(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+; FIXME: this should remove the vsext
+define <vscale x 4 x i32> @mismatched_extend_sub_add_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_sub_add_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwsub.wv v12, v10, v9
+; FOLDING-NEXT: vwadd.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = sub <vscale x 4 x i32> %a, %b
+ %d = add <vscale x 4 x i32> %b, %a
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+define <vscale x 4 x i32> @mismatched_extend_add_sub(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
+
+define <vscale x 4 x i32> @mismatched_extend_add_sub_commuted(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; FOLDING-LABEL: mismatched_extend_add_sub_commuted:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vzext.vf2 v10, v8
+; FOLDING-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; FOLDING-NEXT: vwadd.wv v12, v10, v9
+; FOLDING-NEXT: vwsub.wv v10, v10, v9
+; FOLDING-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; FOLDING-NEXT: vmul.vv v8, v12, v10
+; FOLDING-NEXT: ret
+ %a = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %b = sext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %c = add <vscale x 4 x i32> %a, %b
+ %d = sub <vscale x 4 x i32> %a, %b
+ %e = mul <vscale x 4 x i32> %c, %d
+ ret <vscale x 4 x i32> %e
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 20034b638c06f..b6e29cf76cd48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -863,3 +863,19 @@ entry:
 i64 2)
 ret <vscale x 1 x double> %2
 }
+
+; The two vsetvlis will be coalesced so the add will be made dead and
+; removed. Make sure we shrink the live interval of %x.
+define void @non_li_addi(i64 %x, ptr %p) {
+; CHECK-LABEL: non_li_addi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: ret
+entry:
+ %add = add i64 %x, 1
+ %0 = tail call i64 @llvm.riscv.vsetvli(i64 %add, i64 3, i64 0)
+ %1 = call <vscale x 8 x i8> @llvm.riscv.vle(<vscale x 8 x i8> poison, ptr %p, i64 %0)
+ %2 = tail call i64 @llvm.riscv.vsetvli(i64 1, i64 3, i64 0)
+ %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index fdd30c9a2c772..f9929c9caf712 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -104,6 +104,10 @@
 ret void
 }
 
+ define void @non_li_addi() {
+ ret void
+ }
+
 declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
 declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -664,3 +668,23 @@ body: |
 bb.2:
 $x10 = COPY %vl
 PseudoRET implicit killed $x10
+...
+---
+# The two vsetvlis will be coalesced so the ADDI will be made dead and removed.
+# Make sure we shrink the live interval of %0.
+name: non_li_addi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x10
+ ; CHECK-LABEL: name: non_li_addi
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoVSETIVLI:%[0-9]+]]:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gpr = COPY $x10
+ %1:gprnox0 = ADDI %0, 1
+ %2:gprnox0 = PseudoVSETVLI %1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ %3:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+ PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index ead79fcf53d8b..af3b0852a6461 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -102,20 +102,20 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT: .LBB0_13: # %vector.body
 ; RV32-NEXT: # Parent Loop BB0_10 Depth=1
 ; RV32-NEXT: # => This Inner Loop Header: Depth=2
-; RV32-NEXT: add s0, a2, t6
-; RV32-NEXT: add s1, a4, t6
-; RV32-NEXT: vl2r.v v8, (s0)
-; RV32-NEXT: add s0, a0, t6
+; RV32-NEXT: mv s0, t6
+; RV32-NEXT: add t6, a2, t6
+; RV32-NEXT: add s1, a4, s0
+; RV32-NEXT: vl2r.v v8, (t6)
+; RV32-NEXT: add s2, a0, s0
 ; RV32-NEXT: vl2r.v v10, (s1)
-; RV32-NEXT: add s1, t6, t2
-; RV32-NEXT: sltu t6, s1, t6
-; RV32-NEXT: add t5, t5, t6
-; RV32-NEXT: xor t6, s1, t4
+; RV32-NEXT: add t6, s0, t2
+; RV32-NEXT: sltu s0, t6, s0
+; RV32-NEXT: add t5, t5, s0
+; RV32-NEXT: xor s0, t6, t4
 ; RV32-NEXT: vaaddu.vv v8, v8, v10
-; RV32-NEXT: or s2, t6, t5
-; RV32-NEXT: vs2r.v v8, (s0)
-; RV32-NEXT: mv t6, s1
-; RV32-NEXT: bnez s2, .LBB0_13
+; RV32-NEXT: or s0, s0, t5
+; RV32-NEXT: vs2r.v v8, (s2)
+; RV32-NEXT: bnez s0, .LBB0_13
 ; RV32-NEXT: # %bb.14: # %middle.block
 ; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
 ; RV32-NEXT: beq t4, a6, .LBB0_9
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
index dd9960d17af43..9c2fa9d0009a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
@@ -32,10 +32,10 @@ body: |
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
 ; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s256>) into %stack.0, align 8)
 ; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 2
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
 ; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s128>) into %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x12 = SRLI killed $x12, 1
 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
 ; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s64>) into %stack.0)
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
@@ -93,10 +93,10 @@ body: |
 ; CHECK-NEXT: $x11 = ADDI $x2, 16
 ; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load (<vscale x 1 x s128>) from %stack.0, align 8)
 ; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 1
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
 ; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load (<vscale x 1 x s256>) from %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
 ; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load (<vscale x 1 x s64>) from %stack.0)
 ; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10
diff --git a/llvm/test/CodeGen/RISCV/select-const.ll b/llvm/test/CodeGen/RISCV/select-const.ll
index dfac6e1630d25..f2924bb364adb 100644
--- a/llvm/test/CodeGen/RISCV/select-const.ll
+++ b/llvm/test/CodeGen/RISCV/select-const.ll
@@ -177,9 +177,11 @@ define float @select_const_fp(i1 zeroext %a) nounwind {
 ;
 ; RV32IXQCI-LABEL: select_const_fp:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: lui a2, 263168
 ; RV32IXQCI-NEXT: lui a1, 264192
-; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2
+; RV32IXQCI-NEXT: beqz a0, .LBB4_2
+; RV32IXQCI-NEXT: # %bb.1:
+; RV32IXQCI-NEXT: lui a1, 263168
+; RV32IXQCI-NEXT: .LBB4_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
@@ -653,9 +655,11 @@ define i32 @select_nonnegative_lui_addi(i32 signext %x) {
 ;
 ; RV32IXQCI-LABEL: select_nonnegative_lui_addi:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: lui a2, 4
 ; RV32IXQCI-NEXT: li a1, 25
-; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2
+; RV32IXQCI-NEXT: bltz a0, .LBB21_2
+; RV32IXQCI-NEXT: # %bb.1:
+; RV32IXQCI-NEXT: lui a1, 4
+; RV32IXQCI-NEXT: .LBB21_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
@@ -724,9 +728,11 @@ define i32 @select_nonnegative_lui_addi_swapped(i32 signext %x) {
 ;
 ; RV32IXQCI-LABEL: select_nonnegative_lui_addi_swapped:
 ; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: li a2, 25
+; RV32IXQCI-NEXT: li a1, 25
+; RV32IXQCI-NEXT: bgez a0, .LBB22_2
+; RV32IXQCI-NEXT: # %bb.1:
 ; RV32IXQCI-NEXT: lui a1, 4
-; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2
+; RV32IXQCI-NEXT: .LBB22_2:
 ; RV32IXQCI-NEXT: mv a0, a1
 ; RV32IXQCI-NEXT: ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index b155feab9b4d9..9f326280885b5 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1352,6 +1352,7 @@ define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 sign
 ; NOREMOVAL-LABEL: sextw_sh2add:
 ; NOREMOVAL: # %bb.0:
 ; NOREMOVAL-NEXT: sh2add a2, a2, a3
+; NOREMOVAL-NEXT: mv a2, a2
 ; NOREMOVAL-NEXT: beqz a0, .LBB22_2
 ; NOREMOVAL-NEXT: # %bb.1:
 ; NOREMOVAL-NEXT: sw a2, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll
new file mode 100644
index 0000000000000..6aae6cd0e82ee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-load-imm.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+experimental-xqcili | FileCheck %s --check-prefixes=RV32I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 | FileCheck %s --check-prefixes=RV64I
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv32 -mattr=+experimental-xqcili,+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -verify-machineinstrs -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV64I-SFB
+
+define i32 @select_example_1(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-LABEL: select_example_1:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, 16
+; RV32I-NEXT: bnez a2, .LBB0_2
+; RV32I-NEXT: # %bb.1: # %entry
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: .LBB0_2: # %entry
+; RV32I-NEXT: ret
+;
+; 
RV64I-LABEL: select_example_1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: bnez a2, .LBB0_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB0_2: # %entry +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_1: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB0_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: lui a0, 16 +; RV32I-SFB-NEXT: .LBB0_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_1: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: beqz a2, .LBB0_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: lui a0, 16 +; RV64I-SFB-NEXT: .LBB0_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 65536, i32 %b + ret i32 %sel +} + +define i32 @select_example_2(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-LABEL: select_example_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bnez a2, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: qc.li a0, 65543 +; RV32I-NEXT: ret +; +; RV64I-LABEL: select_example_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: bnez a2, .LBB1_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB1_2: +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: addi a0, a0, 7 +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_2: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB1_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: qc.li a0, 65543 +; RV32I-SFB-NEXT: .LBB1_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_2: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: lui a1, 16 +; RV64I-SFB-NEXT: beqz a2, .LBB1_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: addi a0, a1, 7 +; RV64I-SFB-NEXT: .LBB1_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 65543, i32 %b + ret i32 %sel +} + +define i32 @select_example_3(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-LABEL: select_example_3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bnez a2, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: qc.e.li a0, 4198928 +; RV32I-NEXT: ret +; +; RV64I-LABEL: select_example_3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: bnez a2, .LBB2_2 +; RV64I-NEXT: # %bb.1: # %entry +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: lui a0, 1025 +; RV64I-NEXT: addi a0, a0, 528 +; RV64I-NEXT: ret +; +; RV32I-SFB-LABEL: select_example_3: +; RV32I-SFB: # %bb.0: # %entry +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: beqz a2, .LBB2_2 +; RV32I-SFB-NEXT: # %bb.1: # %entry +; RV32I-SFB-NEXT: qc.e.li a0, 4198928 +; RV32I-SFB-NEXT: .LBB2_2: # %entry +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: select_example_3: +; RV64I-SFB: # %bb.0: # %entry +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: lui a1, 1025 +; RV64I-SFB-NEXT: beqz a2, .LBB2_2 +; RV64I-SFB-NEXT: # %bb.1: # %entry +; RV64I-SFB-NEXT: addi a0, a1, 528 +; RV64I-SFB-NEXT: .LBB2_2: # %entry +; RV64I-SFB-NEXT: ret +entry: + %sel = select i1 %x, i32 4198928, i32 %b + ret i32 %sel +} + diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll new file mode 100644 index 0000000000000..05e06cea9967a --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-min-max.ll @@ -0,0 +1,703 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb | FileCheck %s --check-prefixes=RV32I-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=RV64I-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-ZBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMinMax-ZBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb,+short-forward-branch-i-minmax | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMinMax-ZBB + +define i32 @select_example_smax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: max a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB0_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_smin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_smin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: min a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin: +; 
RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.smin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_umax(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umax: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB2_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: 
mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB2_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB2_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umax.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i32 @select_example_umin(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-ZBB-LABEL: select_example_umin: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-ZBB-NEXT: # %bb.1: +; RV32I-ZBB-NEXT: minu a1, a0, a3 +; RV32I-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-ZBB-NEXT: mv a0, a1 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: sext.w a3, a3 +; RV64I-ZBB-NEXT: sext.w a0, a0 +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV32I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a1 +; RV32I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFB-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB3_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a3, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: sext.w a0, a0 +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB3_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB3_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i32 @llvm.umin.i32(i32 %a, i32 %y) + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_smax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB4_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB4_3 +; RV32I-ZBB-NEXT: j .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB4_4 +; RV32I-ZBB-NEXT: .LBB4_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB4_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; 
RV32I-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: max a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: max a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB4_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB4_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB4_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB4_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB4_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB4_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: max a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB4_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.smax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_smin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_smin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB5_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: slt a7, a1, a6 +; 
RV32I-ZBB-NEXT: beqz a7, .LBB5_3 +; RV32I-ZBB-NEXT: j .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB5_4 +; RV32I-ZBB-NEXT: .LBB5_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB5_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_smin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: min a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_smin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_smin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: min a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB5_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: slt t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB5_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB5_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB5_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB5_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_smin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB5_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: min a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB5_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret 
+entry: + %res = call i64 @llvm.smin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umax_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umax_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB6_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a6, a1 +; RV32I-ZBB-NEXT: beqz a7, .LBB6_3 +; RV32I-ZBB-NEXT: j .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_2: +; RV32I-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-ZBB-NEXT: bnez a7, .LBB6_4 +; RV32I-ZBB-NEXT: .LBB6_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB6_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umax_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umax_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFB-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB6_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB6_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umax_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: maxu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB6_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a5, a0 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a6, a1 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB6_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB6_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB6_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB6_10: # %entry +; 
RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umax_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB6_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: maxu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB6_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.umax.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + +define i64 @select_example_umin_1(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-ZBB-LABEL: select_example_umin_1: +; RV32I-ZBB: # %bb.0: # %entry +; RV32I-ZBB-NEXT: beq a1, a6, .LBB7_2 +; RV32I-ZBB-NEXT: # %bb.1: # %entry +; RV32I-ZBB-NEXT: sltu a7, a1, a6 +; RV32I-ZBB-NEXT: beqz a7, .LBB7_3 +; RV32I-ZBB-NEXT: j .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_2: +; RV32I-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-ZBB-NEXT: bnez a7, .LBB7_4 +; RV32I-ZBB-NEXT: .LBB7_3: # %entry +; RV32I-ZBB-NEXT: mv a1, a6 +; RV32I-ZBB-NEXT: mv a0, a5 +; RV32I-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-ZBB-NEXT: beqz a4, .LBB7_6 +; RV32I-ZBB-NEXT: # %bb.5: # %entry +; RV32I-ZBB-NEXT: ret +; RV32I-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-ZBB-NEXT: mv a0, a2 +; RV32I-ZBB-NEXT: mv a1, a3 +; RV32I-ZBB-NEXT: ret +; +; RV64I-ZBB-LABEL: select_example_umin_1: +; RV64I-ZBB: # %bb.0: # %entry +; RV64I-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-ZBB-NEXT: # %bb.1: +; RV64I-ZBB-NEXT: minu a1, a0, a3 +; RV64I-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-ZBB-NEXT: mv a0, a1 +; RV64I-ZBB-NEXT: ret +; +; RV32I-SFB-ZBB-LABEL: select_example_umin_1: +; RV32I-SFB-ZBB: # %bb.0: # %entry +; RV32I-SFB-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFB-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFB-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFB-ZBB-NEXT: mv t0, a7 +; RV32I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFB-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a6 +; RV32I-SFB-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFB-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFB-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a5 +; RV32I-SFB-ZBB-NEXT: .LBB7_6: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFB-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFB-ZBB-NEXT: mv a0, a2 +; RV32I-SFB-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFB-ZBB-NEXT: bnez a4, .LBB7_10 +; RV32I-SFB-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFB-ZBB-NEXT: mv a1, a3 +; RV32I-SFB-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFB-ZBB-NEXT: ret +; +; RV64I-SFB-ZBB-LABEL: select_example_umin_1: +; RV64I-SFB-ZBB: # %bb.0: # %entry +; RV64I-SFB-ZBB-NEXT: minu a0, a0, a3 +; RV64I-SFB-ZBB-NEXT: bnez a2, .LBB7_2 +; RV64I-SFB-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFB-ZBB-NEXT: mv a0, a1 +; RV64I-SFB-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFB-ZBB-NEXT: ret +; +; RV32I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV32I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: sltu a7, a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: sltu t0, a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: bne a1, a6, .LBB7_2 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv t0, a7 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_4 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.3: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a6 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_4: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez t0, .LBB7_6 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.5: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a5 +; RV32I-SFBIMinMax-ZBB-NEXT: 
.LBB7_6: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_8 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.7: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a0, a2 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_8: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: bnez a4, .LBB7_10 +; RV32I-SFBIMinMax-ZBB-NEXT: # %bb.9: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: mv a1, a3 +; RV32I-SFBIMinMax-ZBB-NEXT: .LBB7_10: # %entry +; RV32I-SFBIMinMax-ZBB-NEXT: ret +; +; RV64I-SFBIMinMax-ZBB-LABEL: select_example_umin_1: +; RV64I-SFBIMinMax-ZBB: # %bb.0: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: beqz a2, .LBB7_2 +; RV64I-SFBIMinMax-ZBB-NEXT: # %bb.1: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: minu a1, a0, a3 +; RV64I-SFBIMinMax-ZBB-NEXT: .LBB7_2: # %entry +; RV64I-SFBIMinMax-ZBB-NEXT: mv a0, a1 +; RV64I-SFBIMinMax-ZBB-NEXT: ret +entry: + %res = call i64 @llvm.umin.i64(i64 %a, i64 %y) + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: 
mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + diff --git a/llvm/test/CodeGen/RISCV/sra-xor-sra.ll b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll new file mode 100644 index 0000000000000..b04f0a29d07f3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s + +; Test folding of: (sra (xor (sra x, c1), -1), c2) -> (sra (xor x, -1), c3) +; Original motivating 
example: should merge sra+sra across xor +define i16 @not_invert_signbit_splat_mask(i8 %x, i16 %y) { +; CHECK-LABEL: not_invert_signbit_splat_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 56 +; CHECK-NEXT: srai a0, a0, 62 +; CHECK-NEXT: not a0, a0 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret + %a = ashr i8 %x, 6 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} + +; Edge case +define i16 @sra_xor_sra_overflow(i8 %x, i16 %y) { +; CHECK-LABEL: sra_xor_sra_overflow: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret + %a = ashr i8 %x, 10 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 636fdfae68438..ba9c926c57152 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -579,7 +579,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vmv.v.x v10, a3 ; RV32MV-NEXT: srli a3, a1, 22 ; RV32MV-NEXT: or a2, a3, a2 -; RV32MV-NEXT: lui a3, 41121 +; RV32MV-NEXT: lui a3, 161 ; RV32MV-NEXT: slli a1, a1, 10 ; RV32MV-NEXT: srli a1, a1, 21 ; RV32MV-NEXT: vslide1down.vx v10, v10, a1 @@ -636,7 +636,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: lui a3, %hi(.LCPI4_0) ; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_0) ; RV64MV-NEXT: vle16.v v9, (a3) -; RV64MV-NEXT: lui a3, 41121 +; RV64MV-NEXT: lui a3, 161 ; RV64MV-NEXT: slli a2, a2, 32 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: andi a2, a1, 2047 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index bf6802deeffdc..93b68b0a95b48 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1834,13 +1834,12 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND-NEXT: mul a5, a3, a0 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhu a7, a0, a2 -; RV32ZICOND-NEXT: snez t0, a3 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: snez a6, a3 ; RV32ZICOND-NEXT: mulhu a3, a3, a0 -; RV32ZICOND-NEXT: mul t1, a0, a2 +; RV32ZICOND-NEXT: mul t0, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: and a1, a1, t0 +; RV32ZICOND-NEXT: czero.eqz a1, a6, a1 ; RV32ZICOND-NEXT: snez a0, a0 ; RV32ZICOND-NEXT: snez a2, a3 ; RV32ZICOND-NEXT: add a5, a7, a5 @@ -1848,7 +1847,7 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32ZICOND-NEXT: sltu a1, a5, a7 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: sw t1, 0(a4) +; RV32ZICOND-NEXT: sw t0, 0(a4) ; RV32ZICOND-NEXT: sw a5, 4(a4) ; RV32ZICOND-NEXT: ret ; @@ -3690,11 +3689,10 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: snez a6, a3 ; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: snez a5, a1 -; RV32ZICOND-NEXT: and a5, a5, a6 -; RV32ZICOND-NEXT: mulhu a6, a1, a2 -; RV32ZICOND-NEXT: snez a6, a6 -; RV32ZICOND-NEXT: or a5, a5, a6 +; RV32ZICOND-NEXT: mulhu a5, a1, a2 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a1 +; RV32ZICOND-NEXT: snez a5, a5 +; RV32ZICOND-NEXT: or a5, a6, a5 ; RV32ZICOND-NEXT: mulhu a6, a0, a2 ; RV32ZICOND-NEXT: add a4, a6, a4 ; RV32ZICOND-NEXT: sltu a4, a4, a6 @@ -3783,18 +3781,17 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; 
RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 ; RV32ZICOND-NEXT: snez a3, a3 ; RV32ZICOND-NEXT: mulhu a2, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 ; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: add a4, a5, a4 ; RV32ZICOND-NEXT: or a1, a1, a2 -; RV32ZICOND-NEXT: sltu a2, a4, a6 +; RV32ZICOND-NEXT: sltu a2, a4, a5 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: xori a0, a0, 1 @@ -5156,18 +5153,17 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mulhu a0, a3, a0 ; RV32ZICOND-NEXT: snez a3, a3 ; RV32ZICOND-NEXT: mulhu a2, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: and a1, a1, a3 +; RV32ZICOND-NEXT: czero.eqz a1, a3, a1 ; RV32ZICOND-NEXT: snez a2, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: add a4, a6, a4 +; RV32ZICOND-NEXT: add a4, a5, a4 ; RV32ZICOND-NEXT: or a1, a1, a2 -; RV32ZICOND-NEXT: sltu a2, a4, a6 +; RV32ZICOND-NEXT: sltu a2, a4, a5 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: beqz a0, .LBB64_2 diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll index d8e2b2c2bf58d..c6d72981eff32 100644 --- a/llvm/test/CodeGen/RISCV/zicond-opts.ll +++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll @@ -7,22 +7,132 @@ define i32 @icmp_and(i64 %x, i64 %y) { ; RV32ZICOND-LABEL: icmp_and: ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: snez a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + ret i32 %6 +} + +; Make sure we choose to replace the single-use icmp +define i32 @icmp_and_x_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_x_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: czero.eqz a1, a0, a2 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_x_multiple_uses: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: snez a0, a0 +; RV64ZICOND-NEXT: czero.eqz a1, a0, a1 +; RV64ZICOND-NEXT: add a0, a1, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + %7 = zext i1 %4 to i32 + %8 = add i32 %6, %7 + ret i32 %8 +} + +; Make sure we choose to replace the single-use icmp +define i32 @icmp_and_y_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_y_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: add a0, a0, a2 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_y_multiple_uses: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT:
snez a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: add a0, a0, a1 +; RV64ZICOND-NEXT: ret + %3 = icmp ne i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = zext i1 %5 to i32 + %7 = zext i1 %3 to i32 + %8 = add i32 %6, %7 + ret i32 %8 +} + +; Both icmps have multiple uses, don't optimize +define i32 @icmp_and_xy_multiple_uses(i64 %x, i64 %y) { +; RV32ZICOND-LABEL: icmp_and_xy_multiple_uses: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: or a2, a2, a3 ; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: snez a1, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: and a0, a0, a1 +; RV32ZICOND-NEXT: and a2, a0, a1 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a2, a0 ; RV32ZICOND-NEXT: ret ; -; RV64ZICOND-LABEL: icmp_and: +; RV64ZICOND-LABEL: icmp_and_xy_multiple_uses: ; RV64ZICOND: # %bb.0: ; RV64ZICOND-NEXT: snez a1, a1 ; RV64ZICOND-NEXT: snez a0, a0 -; RV64ZICOND-NEXT: and a0, a0, a1 +; RV64ZICOND-NEXT: and a2, a0, a1 +; RV64ZICOND-NEXT: add a0, a1, a0 +; RV64ZICOND-NEXT: add a0, a2, a0 ; RV64ZICOND-NEXT: ret %3 = icmp ne i64 %y, 0 %4 = icmp ne i64 %x, 0 %5 = and i1 %4, %3 %6 = zext i1 %5 to i32 + %7 = zext i1 %3 to i32 + %8 = zext i1 %4 to i32 + %9 = add i32 %6, %7 + %10 = add i32 %9, %8 + ret i32 %10 +} + + +; (and (icmp x, 0, ne), (icmp y, 0, ne)) -> (czero.eqz (icmp x, 0, ne), y) +define i32 @icmp_and_select(i64 %x, i64 %y, i32 %z) { +; RV32ZICOND-LABEL: icmp_and_select: +; RV32ZICOND: # %bb.0: +; RV32ZICOND-NEXT: sgtz a5, a3 +; RV32ZICOND-NEXT: snez a2, a2 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a3 +; RV32ZICOND-NEXT: czero.nez a2, a2, a3 +; RV32ZICOND-NEXT: or a2, a2, a5 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV32ZICOND-NEXT: czero.eqz a0, a4, a0 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: icmp_and_select: +; RV64ZICOND: # %bb.0: +; RV64ZICOND-NEXT: sgtz a1, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a1, a0 +; RV64ZICOND-NEXT: czero.eqz a0, a2, a0 +; RV64ZICOND-NEXT: ret + %3 = icmp sgt i64 %y, 0 + %4 = icmp ne i64 %x, 0 + %5 = and i1 %4, %3 + %6 = select i1 %5, i32 %z, i32 0 ret i32 %6 } @@ -32,21 +142,17 @@ define i32 @icmp_and_and(i64 %x, i64 %y, i64 %z) { ; RV32ZICOND: # %bb.0: ; RV32ZICOND-NEXT: or a2, a2, a3 ; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: or a4, a4, a5 -; RV32ZICOND-NEXT: snez a1, a2 ; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: and a0, a1, a0 -; RV32ZICOND-NEXT: snez a1, a4 -; RV32ZICOND-NEXT: and a0, a1, a0 +; RV32ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV32ZICOND-NEXT: or a4, a4, a5 +; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: icmp_and_and: ; RV64ZICOND: # %bb.0: -; RV64ZICOND-NEXT: snez a1, a1 ; RV64ZICOND-NEXT: snez a0, a0 -; RV64ZICOND-NEXT: and a0, a1, a0 -; RV64ZICOND-NEXT: snez a1, a2 -; RV64ZICOND-NEXT: and a0, a1, a0 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a1 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 ; RV64ZICOND-NEXT: ret %4 = icmp ne i64 %y, 0 %5 = icmp ne i64 %x, 0 @@ -263,3 +369,35 @@ define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) { %7 = and i64 %6, %f ret i64 %7 } + +define i32 @pr166596(i32 %conv.i, i1 %iszero) #0 { +; RV32ZICOND-LABEL: pr166596: +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: andi a1, a1, 1 +; RV32ZICOND-NEXT: xori a0, a0, 1 +; RV32ZICOND-NEXT: zext.h a0, a0 +; RV32ZICOND-NEXT: clz a0, a0 +; RV32ZICOND-NEXT: addi a0, a0, 41 +; RV32ZICOND-NEXT: czero.nez a0, a0, a1 +; RV32ZICOND-NEXT: addi a0, a0, -9 +; RV32ZICOND-NEXT: ret +; +; RV64ZICOND-LABEL: pr166596: +; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: andi a1, a1, 1 +; RV64ZICOND-NEXT: xori a0, a0, 1 +; RV64ZICOND-NEXT: zext.h a0, a0 +; RV64ZICOND-NEXT: clz a0, a0 +; RV64ZICOND-NEXT: addi a0, a0, 9 +; RV64ZICOND-NEXT: czero.nez a0, a0, a1 +; RV64ZICOND-NEXT: addi a0, a0, -9 +; RV64ZICOND-NEXT: ret +entry: + %not.i = xor i32 %conv.i, 1 + %conv2.i = trunc i32 %not.i to i16 + %conv22 = zext i16 %conv2.i to i64 + %0 = call i64 @llvm.ctlz.i64(i64 %conv22, i1 false) + %cast = trunc i64 %0 to i32 + %clzg = select i1 %iszero, i32 -9, i32 %cast + ret i32 %clzg +} diff --git a/llvm/test/CodeGen/SPARC/predictable-select.ll b/llvm/test/CodeGen/SPARC/predictable-select.ll new file mode 100644 index 0000000000000..cf200a121d0f1 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/predictable-select.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparc -mcpu=v9 | FileCheck --check-prefix=SPARC %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparc -mcpu=v9 -mattr=+no-predictor | FileCheck --check-prefix=SPARC-NO-PREDICTOR %s +; RUN: llc -O3 < %s -relocation-model=pic -mtriple=sparcv9 -mattr=+no-predictor | FileCheck --check-prefix=SPARC64-NO-PREDICTOR %s + +;; Normally, highly predictable selects should be turned into branches. +;; On the other hand, early Niagara processors should prefer conditional moves +;; over branches even when the select is predictable. + +define i32 @cdiv(i32 %cond, i32 %num) #0 { +; SPARC-LABEL: cdiv: +; SPARC: ! %bb.0: ! %entry +; SPARC-NEXT: cmp %o0, 0 +; SPARC-NEXT: be %icc, .LBB0_2 +; SPARC-NEXT: mov %o1, %o0 +; SPARC-NEXT: ! %bb.1: ! %select.end +; SPARC-NEXT: retl +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_2: ! %select.true.sink +; SPARC-NEXT: sethi 1398101, %o1 +; SPARC-NEXT: or %o1, 342, %o1 +; SPARC-NEXT: smul %o0, %o1, %o0 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: srl %o0, 31, %o1 +; SPARC-NEXT: retl +; SPARC-NEXT: add %o0, %o1, %o0 +; +; SPARC64-LABEL: cdiv: +; SPARC64: ! %bb.0: ! %entry +; SPARC64-NEXT: cmp %o0, 0 +; SPARC64-NEXT: be %icc, .LBB0_2 +; SPARC64-NEXT: mov %o1, %o0 +; SPARC64-NEXT: ! %bb.1: ! %select.end +; SPARC64-NEXT: retl +; SPARC64-NEXT: nop +; SPARC64-NEXT: .LBB0_2: ! %select.true.sink +; SPARC64-NEXT: sra %o0, 0, %o0 +; SPARC64-NEXT: sethi 1398101, %o1 +; SPARC64-NEXT: or %o1, 342, %o1 +; SPARC64-NEXT: mulx %o0, %o1, %o0 +; SPARC64-NEXT: srlx %o0, 63, %o1 +; SPARC64-NEXT: srlx %o0, 32, %o0 +; SPARC64-NEXT: retl +; SPARC64-NEXT: add %o0, %o1, %o0 +; +; SPARC-NO-PREDICTOR-LABEL: cdiv: +; SPARC-NO-PREDICTOR: ! %bb.0: ! %entry +; SPARC-NO-PREDICTOR-NEXT: sethi 1398101, %o2 +; SPARC-NO-PREDICTOR-NEXT: or %o2, 342, %o2 +; SPARC-NO-PREDICTOR-NEXT: smul %o1, %o2, %o2 +; SPARC-NO-PREDICTOR-NEXT: rd %y, %o2 +; SPARC-NO-PREDICTOR-NEXT: srl %o2, 31, %o3 +; SPARC-NO-PREDICTOR-NEXT: add %o2, %o3, %o2 +; SPARC-NO-PREDICTOR-NEXT: cmp %o0, 0 +; SPARC-NO-PREDICTOR-NEXT: move %icc, %o2, %o1 +; SPARC-NO-PREDICTOR-NEXT: retl +; SPARC-NO-PREDICTOR-NEXT: mov %o1, %o0 +; +; SPARC64-NO-PREDICTOR-LABEL: cdiv: +; SPARC64-NO-PREDICTOR: ! %bb.0: !
%entry +; SPARC64-NO-PREDICTOR-NEXT: sra %o1, 0, %o2 +; SPARC64-NO-PREDICTOR-NEXT: sethi 1398101, %o3 +; SPARC64-NO-PREDICTOR-NEXT: or %o3, 342, %o3 +; SPARC64-NO-PREDICTOR-NEXT: mulx %o2, %o3, %o2 +; SPARC64-NO-PREDICTOR-NEXT: srlx %o2, 63, %o3 +; SPARC64-NO-PREDICTOR-NEXT: srlx %o2, 32, %o2 +; SPARC64-NO-PREDICTOR-NEXT: add %o2, %o3, %o2 +; SPARC64-NO-PREDICTOR-NEXT: cmp %o0, 0 +; SPARC64-NO-PREDICTOR-NEXT: move %icc, %o2, %o1 +; SPARC64-NO-PREDICTOR-NEXT: retl +; SPARC64-NO-PREDICTOR-NEXT: mov %o1, %o0 +entry: + %div = sdiv i32 %num, 3 + %cmp = icmp eq i32 %cond, 0 + %ret = select i1 %cmp, i32 %div, i32 %num + ret i32 %ret +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll index 408b95579502e..bc1514e145cb5 100644 --- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll +++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll @@ -12,7 +12,7 @@ ;; return; ;; } -; CHECK-SPIRV: OpConvertPtrToU +; CHECK-SPIRV: OpSpecConstantOp %[[#]] ConvertPtrToU ; CHECK-SPIRV: OpConvertPtrToU ; CHECK-SPIRV: OpINotEqual ; CHECK-SPIRV: OpConvertPtrToU diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll new file mode 100644 index 0000000000000..677291a322900 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll @@ -0,0 +1,36 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; The test checks the command-line option that allows representing unknown +; intrinsics as external function calls in SPIR-V.
+ +; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo) + +; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter" +; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic" +; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import +; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import +; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64 +; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]] +; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]] +; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]] +; CHECK: OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]] + +define spir_func void @foo() { +entry: +; TODO: if and when the SPIR-V backend learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic + %0 = call i64 @llvm.readcyclecounter() + %1 = call i64 @llvm.some.custom.intrinsic() + ret void +} + +declare i64 @llvm.readcyclecounter() +declare i64 @llvm.some.custom.intrinsic() diff --git a/llvm/test/CodeGen/SPIRV/complex-constexpr.ll b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll index e2c1d00ba4c0e..a97a124ad2c65 100644 --- a/llvm/test/CodeGen/SPIRV/complex-constexpr.ll +++ b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll @@ -6,7 +6,7 @@ define linkonce_odr hidden spir_func void @test() { entry: ; CHECK: %[[#MinusOne:]] = OpConstant %[[#]] 18446744073709551615 -; CHECK: %[[#Ptr:]] = OpConvertUToPtr %[[#]] %[[#MinusOne]] +; CHECK: %[[#Ptr:]] = OpSpecConstantOp %[[#]] ConvertUToPtr %[[#MinusOne]] ; CHECK: %[[#PtrCast:]] = OpPtrCastToGeneric %[[#]] %[[#]] ; CHECK: %[[#]] = OpFunctionCall %[[#]] %[[#]] %[[#PtrCast]] %[[#Ptr]] diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When the type is void *, spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer.
; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll new file mode 100644 index 0000000000000..f6b61153cf19e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll @@ -0,0 +1,98 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %} + +%opencl.pipe_ro_t = type opaque +%opencl.pipe_wo_t = type opaque + +; CHECK-SPIRV: OpCapability BlockingPipesALTERA +; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes" +; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly +; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly +; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]] %[[#]] %[[#]] +; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8 +; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]] + +define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 0), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) + +define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 
+ %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) { +entry: + %p.addr = alloca target("spirv.Pipe", 1), align 8 + %ptr.addr = alloca ptr addrspace(1), align 8 + store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8 + store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8 + %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8 + %1 = load ptr addrspace(1), ptr %ptr.addr, align 8 + %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4) + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + +; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) +; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32) + +define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) { +entry: + %_Data.addr = alloca ptr addrspace(4), align 8 + %_WPipe = alloca target("spirv.Pipe", 1), align 8 + %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4) + %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)* + store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8 + %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr + %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8 + %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8 + call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2) + ret void +} + +declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) + \ No newline at end of file diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; 
CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll new file mode 100644 index 0000000000000..a189b2a655589 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_faddfsub_bfloat16.ll @@ -0,0 +1,34 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR1 +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add,+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR2 + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add,+SPV_INTEL_16bit_atomics,+SPV_KHR_bfloat16,+SPV_INTEL_bfloat16_arithmetic %s -o - | FileCheck %s + +; CHECK-ERROR1: LLVM ERROR: The atomic float instruction requires the following SPIR-V extension: SPV_EXT_shader_atomic_float_add +; CHECK-ERROR2: LLVM ERROR: The atomic bfloat16 instruction requires the following SPIR-V extension: SPV_INTEL_16bit_atomics + +; CHECK: Capability BFloat16TypeKHR +; CHECK: Capability AtomicBFloat16AddINTEL +; CHECK: Extension "SPV_KHR_bfloat16" +; CHECK: Extension "SPV_EXT_shader_atomic_float_add" +; CHECK: Extension "SPV_INTEL_16bit_atomics" +; CHECK-DAG: %[[TyBF16:[0-9]+]] = OpTypeFloat 16 0 +; CHECK-DAG: %[[TyBF16Ptr:[0-9]+]] = OpTypePointer {{[a-zA-Z]+}} %[[TyBF16]] +; CHECK-DAG: %[[TyInt32:[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: %[[ConstBF16:[0-9]+]] = OpConstant %[[TyBF16]] 16936{{$}} +; CHECK-DAG: %[[Const0:[0-9]+]] = OpConstantNull %[[TyBF16]] +; CHECK-DAG: %[[BF16Ptr:[0-9]+]] = OpVariable %[[TyBF16Ptr]] CrossWorkgroup %[[Const0]] +; CHECK-DAG: %[[ScopeAllSvmDevices:[0-9]+]] = OpConstantNull %[[TyInt32]] +; CHECK-DAG: %[[MemSeqCst:[0-9]+]] = OpConstant %[[TyInt32]] 16{{$}} +; CHECK: OpAtomicFAddEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] +; CHECK: %[[NegatedConstBF16:[0-9]+]] = OpFNegate %[[TyBF16]] %[[ConstBF16]] +; CHECK: OpAtomicFAddEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[NegatedConstBF16]] + + +@f = common dso_local local_unnamed_addr addrspace(1) global bfloat 0.000000e+00, align 8 + +define dso_local spir_func void @test1() local_unnamed_addr { +entry: + %addval = atomicrmw fadd ptr addrspace(1) @f, bfloat 42.000000e+00 seq_cst + %subval = atomicrmw fsub ptr addrspace(1) @f, bfloat 42.000000e+00 seq_cst + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll new file mode 100644 index 0000000000000..dd8448039ec62 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_16bit_atomics/atomicrmw_fminfmax_bfloat16.ll @@ -0,0 +1,28 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_min_max,+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_min_max,+SPV_INTEL_16bit_atomics,+SPV_KHR_bfloat16 %s -o - | FileCheck %s + +; CHECK-ERROR: LLVM ERROR: The atomic bfloat16 instruction requires the following SPIR-V extension: SPV_INTEL_16bit_atomics + +; 
CHECK: Capability AtomicBFloat16MinMaxINTEL +; CHECK: Extension "SPV_KHR_bfloat16" +; CHECK: Extension "SPV_EXT_shader_atomic_float_min_max" +; CHECK: Extension "SPV_INTEL_16bit_atomics" +; CHECK-DAG: %[[TyBF16:[0-9]+]] = OpTypeFloat 16 0 +; CHECK-DAG: %[[TyBF16Ptr:[0-9]+]] = OpTypePointer {{[a-zA-Z]+}} %[[TyBF16]] +; CHECK-DAG: %[[TyInt32:[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: %[[ConstBF16:[0-9]+]] = OpConstant %[[TyBF16]] 16936{{$}} +; CHECK-DAG: %[[Const0:[0-9]+]] = OpConstantNull %[[TyBF16]] +; CHECK-DAG: %[[BF16Ptr:[0-9]+]] = OpVariable %[[TyBF16Ptr]] CrossWorkgroup %[[Const0]] +; CHECK-DAG: %[[ScopeAllSvmDevices:[0-9]+]] = OpConstantNull %[[TyInt32]] +; CHECK-DAG: %[[MemSeqCst:[0-9]+]] = OpConstant %[[TyInt32]] 16{{$}} +; CHECK: OpAtomicFMinEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] +; CHECK: OpAtomicFMaxEXT %[[TyBF16]] %[[BF16Ptr]] %[[ScopeAllSvmDevices]] %[[MemSeqCst]] %[[ConstBF16]] + +@f = common dso_local local_unnamed_addr addrspace(1) global bfloat 0.000000e+00, align 8 + +define spir_func void @test1() { +entry: + %minval = atomicrmw fmin ptr addrspace(1) @f, bfloat 42.0e+00 seq_cst + %maxval = atomicrmw fmax ptr addrspace(1) @f, bfloat 42.0e+00 seq_cst + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = 
OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 
0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = 
OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: 
OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: %[[DIM2]] = OpFunction +; CHECK: %[[DIM3]] = OpFunction + +define spir_kernel void @Dim1() !max_work_group_size !0 { + ret void +} + +define spir_kernel void @Dim2() !max_work_group_size !1 { + ret void +} + +define spir_kernel void @Dim3() !max_work_group_size !2 { + ret void +} + +!0 = !{i32 4} +!1 = !{i32 8, i32 4} +!2 = !{i32 16, i32 8, i32 4} diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll b/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll new file mode 100644 index 0000000000000..4ffdb9b7f3c7a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/fembed-bitcode-marker.ll @@ -0,0 +1,24 @@ +; Expanding the bitcode marker works only for AMD at the moment. 
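+; Judging by the CHECK lines below, the zero-length marker array is widened to a one-element array before emission, roughly as if it were declared as +; @llvm.embedded.module = private addrspace(1) constant [1 x i8] zeroinitializer, section ".llvmbc", align 1 +; so it can be lowered to an OpVariable with an OpConstantNull initializer.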
+; RUN: not llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} +; +; Verify that we lower the embedded bitcode + +@llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1 +@llvm.compiler.used = appending addrspace(1) global [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4))], section "llvm.metadata" + +; CHECK: OpName %[[#LLVM_EMBEDDED_MODULE:]] "llvm.embedded.module" +; CHECK: OpDecorate %[[#LLVM_EMBEDDED_MODULE]] Constant +; CHECK: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#UINT]] 1 +; CHECK: %[[#UCHAR_ARR_1:]] = OpTypeArray %[[#UCHAR]] %[[#ONE]] +; CHECK: %[[#UCHAR_ARR_1_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_1]] +; CHECK: %[[#CONST_UCHAR_ARR_1:]] = OpConstantNull %[[#UCHAR_ARR_1]] +; CHECK: %[[#LLVM_EMBEDDED_MODULE]] = OpVariable %[[#UCHAR_ARR_1_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_1]] + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll b/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll new file mode 100644 index 0000000000000..a75b44925a1ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/fembed-bitcode.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} +; +; Verify that we can lower the embedded module and cmdline. 
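+; For reference, c"BC\C0\DE" is the LLVM bitcode magic and c"-cc1\00" is a NUL-terminated command line; .llvmbc and .llvmcmd are the sections clang's -fembed-bitcode uses for the embedded module and its command line.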
+ +@llvm.embedded.module = private addrspace(1) constant [4 x i8] c"BC\C0\DE", section ".llvmbc", align 1 +@llvm.cmdline = private addrspace(1) constant [5 x i8] c"-cc1\00", section ".llvmcmd", align 1 +@llvm.compiler.used = appending addrspace(1) global [2 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata" + +; CHECK: OpName %[[#LLVM_EMBEDDED_MODULE:]] "llvm.embedded.module" +; CHECK: OpName %[[#LLVM_CMDLINE:]] "llvm.cmdline" +; CHECK: OpDecorate %[[#LLVM_EMBEDDED_MODULE]] Constant +; CHECK: OpDecorate %[[#LLVM_CMDLINE]] Constant +; CHECK: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#FIVE:]] = OpConstant %[[#UINT]] 5 +; CHECK: %[[#UCHAR_ARR_5:]] = OpTypeArray %[[#UCHAR]] %[[#FIVE]] +; CHECK: %[[#FOUR:]] = OpConstant %[[#UINT]] 4 +; CHECK: %[[#UCHAR_ARR_4:]] = OpTypeArray %[[#UCHAR]] %[[#FOUR]] +; CHECK: %[[#UCHAR_ARR_5_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_5]] +; CHECK: %[[#UCHAR_ARR_4_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR_ARR_4]] +; CHECK: %[[#CONST_UCHAR_ARR_4:]] = OpConstantComposite %[[#UCHAR_ARR_4]] +; CHECK: %[[#LLVM_EMBEDDED_MODULE]] = OpVariable %[[#UCHAR_ARR_4_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_4]] +; CHECK: %[[#CONST_UCHAR_ARR_5:]] = OpConstantComposite %[[#UCHAR_ARR_5]] +; CHECK: %[[#LLVM_CMDLINE]] = OpVariable %[[#UCHAR_ARR_5_PTR]] CrossWorkgroup %[[#CONST_UCHAR_ARR_5]] + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll b/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll new file mode 100644 index 0000000000000..f0acfdfdede9d --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hip_dyn_lds.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpName %[[#LDS:]] "lds" +; CHECK: OpDecorate %[[#LDS]] LinkageAttributes "lds" Import +; CHECK: %[[#UINT:]] = OpTypeInt 32 0 +; CHECK: %[[#UINT_MAX:]] = OpConstant %[[#UINT]] 4294967295 +; CHECK: %[[#LDS_ARR_TY:]] = OpTypeArray %[[#UINT]] %[[#UINT_MAX]] +; CHECK: %[[#LDS_ARR_PTR_WG:]] = OpTypePointer Workgroup %[[#LDS_ARR_TY]] +; CHECK: %[[#LDS]] = OpVariable %[[#LDS_ARR_PTR_WG]] Workgroup + +@lds = external addrspace(3) global [0 x i32] + +define spir_kernel void @foo(ptr addrspace(4) %in, ptr addrspace(4) %out) { +entry: + %val = load i32, ptr addrspace(4) %in + %add = add i32 %val, 1 + store i32 %add, ptr addrspace(4) %out + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll index ed67344842b11..4817e7450ac2e 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll @@ -16,7 +16,6 @@ define void @case1() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) 
@llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) @@ -29,8 +28,7 @@ define void @case2() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 - ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2 + ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll index 3fff2a8a24a73..6db375445e4a3 100644 --- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll +++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll @@ -31,6 +31,7 @@ ; SPIRV-O0-NEXT: Expand reduction intrinsics ; SPIRV-O0-NEXT: SPIR-V Regularizer ; SPIRV-O0-NEXT: SPIRV prepare functions +; SPIRV-O0-NEXT: SPIRV prepare global variables ; SPIRV-O0-NEXT: FunctionPass Manager ; SPIRV-O0-NEXT: Lower invoke and unwind, for unwindless code generators ; SPIRV-O0-NEXT: Remove unreachable blocks from the CFG @@ -130,6 +131,7 @@ ; SPIRV-Opt-NEXT: Expand reduction intrinsics ; SPIRV-Opt-NEXT: SPIR-V Regularizer ; SPIRV-Opt-NEXT: SPIRV prepare functions +; SPIRV-Opt-NEXT: SPIRV prepare global variables ; SPIRV-Opt-NEXT: FunctionPass Manager ; SPIRV-Opt-NEXT: Dominator Tree Construction ; SPIRV-Opt-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll new file mode 100644 index 0000000000000..0ba016aaa30aa --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - -filetype=obj | spirv-val %} + +@A = addrspace(1) constant [1 x i8] zeroinitializer + +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#A:]] "A" +; CHECK: OpDecorate %[[#A]] Constant +; CHECK: OpDecorate %[[#A]] LinkageAttributes "A" Export +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#INT32:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#INT32]] 1 +; CHECK: %[[#ARR_INT8:]] = OpTypeArray %[[#INT8]] %[[#ONE]] +; CHECK: %[[#ARR_INT8_PTR:]] = OpTypePointer CrossWorkgroup %[[#ARR_INT8]] +; CHECK: %[[#ARR_INT8_ZERO:]] = OpConstantNull %[[#ARR_INT8]] +; CHECK: %[[#A]] = OpVariable %[[#ARR_INT8_PTR]] CrossWorkgroup %[[#ARR_INT8_ZERO]] +; CHECK: %[[#FOO]] = OpFunction +; CHECK: = OpLabel +; CHECK: OpReturn +; CHECK: 
OpFunctionEnd + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll new file mode 100644 index 0000000000000..1d3ba2a38e55b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll @@ -0,0 +1,11 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set. + +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 -mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll index 73c46b18bfa78..c9b2968a4aed7 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll @@ -10,6 +10,7 @@ ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]] ; CHECK-DAG: %[[#Void:]] = OpTypeVoid ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]] @@ -17,12 +18,20 @@ ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]] ; CHECK-DAG: 
%[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]] +; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#FooType]] ; CHECK: OpFunctionParameter %[[#PtrInt8]] ; CHECK: OpFunctionParameter %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#BarType]] ; CHECK: OpFunctionParameter %[[#PtrInt64]] ; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunction %[[#Void]] None %[[#BazType]] +; CHECK: OpFunctionParameter %[[#PtrInt8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Int8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Float]] +; CHECK: OpFunctionParameter %[[#Struct]] %t_half = type { half } @@ -38,4 +47,9 @@ entry: ret void } +define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) { +entry: + ret void +} + declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half) diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll new file mode 100644 index 0000000000000..917bb27afad00 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/load-store-vec-from-array.ll @@ -0,0 +1,54 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[FLOAT:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[VEC4FLOAT:%[0-9]+]] = OpTypeVector [[FLOAT]] 4 +; CHECK-DAG: [[UINT_TYPE:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[UINT4:%[0-9]+]] = OpConstant [[UINT_TYPE]] 4 +; CHECK-DAG: [[ARRAY4FLOAT:%[0-9]+]] = OpTypeArray [[FLOAT]] [[UINT4]] +; CHECK-DAG: [[PTR_ARRAY4FLOAT:%[0-9]+]] = OpTypePointer Private [[ARRAY4FLOAT]] +; CHECK-DAG: [[G_IN:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[G_OUT:%[0-9]+]] = OpVariable [[PTR_ARRAY4FLOAT]] Private +; CHECK-DAG: [[UINT0:%[0-9]+]] = OpConstant [[UINT_TYPE]] 0 +; CHECK-DAG: [[UINT1:%[0-9]+]] = OpConstant [[UINT_TYPE]] 1 +; CHECK-DAG: [[UINT2:%[0-9]+]] = OpConstant [[UINT_TYPE]] 2 +; CHECK-DAG: [[UINT3:%[0-9]+]] = OpConstant [[UINT_TYPE]] 3 +; CHECK-DAG: [[PTR_FLOAT:%[0-9]+]] = OpTypePointer Private [[FLOAT]] +; CHECK-DAG: [[UNDEF_VEC:%[0-9]+]] = OpUndef [[VEC4FLOAT]] + +@G_in = internal addrspace(10) global [4 x float] zeroinitializer +@G_out = internal addrspace(10) global [4 x float] zeroinitializer + +define spir_func void @main() { +entry: +; CHECK: [[GEP0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT0]] +; CHECK-NEXT: [[LOAD0:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP0]] +; CHECK-NEXT: [[GEP1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT1]] +; CHECK-NEXT: [[LOAD1:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP1]] +; CHECK-NEXT: [[GEP2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT2]] +; CHECK-NEXT: [[LOAD2:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP2]] +; CHECK-NEXT: [[GEP3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_IN]] [[UINT3]] +; CHECK-NEXT: [[LOAD3:%[0-9]+]] = OpLoad [[FLOAT]] [[GEP3]] +; CHECK-NEXT: [[VEC_INSERT0:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD0]] [[UNDEF_VEC]] 0 +; CHECK-NEXT: [[VEC_INSERT1:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD1]] [[VEC_INSERT0]] 1 +; CHECK-NEXT: [[VEC_INSERT2:%[0-9]+]] = OpCompositeInsert [[VEC4FLOAT]] [[LOAD2]] [[VEC_INSERT1]] 2 +; CHECK-NEXT: [[VEC:%[0-9]+]] = OpCompositeInsert 
[[VEC4FLOAT]] [[LOAD3]] [[VEC_INSERT2]] 3 + %0 = load <4 x float>, ptr addrspace(10) @G_in, align 64 + +; CHECK-NEXT: [[GEP_OUT0:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT0]] +; CHECK-NEXT: [[VEC_EXTRACT0:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 0 +; CHECK-NEXT: OpStore [[GEP_OUT0]] [[VEC_EXTRACT0]] +; CHECK-NEXT: [[GEP_OUT1:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT1]] +; CHECK-NEXT: [[VEC_EXTRACT1:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 1 +; CHECK-NEXT: OpStore [[GEP_OUT1]] [[VEC_EXTRACT1]] +; CHECK-NEXT: [[GEP_OUT2:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT2]] +; CHECK-NEXT: [[VEC_EXTRACT2:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 2 +; CHECK-NEXT: OpStore [[GEP_OUT2]] [[VEC_EXTRACT2]] +; CHECK-NEXT: [[GEP_OUT3:%[0-9]+]] = OpAccessChain [[PTR_FLOAT]] [[G_OUT]] [[UINT3]] +; CHECK-NEXT: [[VEC_EXTRACT3:%[0-9]+]] = OpCompositeExtract [[FLOAT]] [[VEC]] 3 +; CHECK-NEXT: OpStore [[GEP_OUT3]] [[VEC_EXTRACT3]] + store <4 x float> %0, ptr addrspace(10) @G_out, align 64 + +; CHECK-NEXT: OpReturn + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll index 84913283f6868..a1ec2cd1cfdd2 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll @@ -26,3 +26,25 @@ entry: store <4 x i32> %6, ptr addrspace(11) %7, align 16 ret void } + +; This tests a load through a pointer that has been bitcast between vector types +; that share the same total bit-width but have different numbers of elements. +; It checks that legalize-pointer-casts handles this correctly by moving the +; bitcast onto the loaded value. + +define void @main2() local_unnamed_addr #0 { +entry: +; CHECK: %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}} +; CHECK: %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]] +; CHECK: %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]] +; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}} + + %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2) + %1 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0) + %2 = load <4 x i32>, ptr addrspace(11) %1 + %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1) + store <4 x i32> %2, ptr addrspace(11) %3 + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll index 260394b658348..fb550bb01a3a2 100644 --- a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll +++ b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll @@ -7,9 +7,11 @@ entry: ; CHECK-SPIRV: OpDecorate %[[#PId:]] Volatile ; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoAlias +; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoWrite ; CHECK-SPIRV: %[[#PId]] = OpFunctionParameter %[[#]] !7 = !{!"volatile"} !8 = !{i32 38, i32 4} ; FuncParamAttr NoAlias -!9 = !{!8} +!11 = !{i32 38, i32 6} ; FuncParamAttr NoWrite +!9 = !{!8, !11} !10 = !{!9} diff --git 
a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll new file mode 100644 index 0000000000000..f397030c7bdb1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtrInGlobalInit.ll @@ -0,0 +1,49 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: %[[Int8Ty:[0-9]+]] = OpTypeInt 8 0 +; CHECK: %[[Int8PtrTy:[0-9]+]] = OpTypePointer Generic %[[Int8Ty]] +; CHECK-DAG: %[[GlobInt8PtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[Int8Ty]] +; CHECK: %[[GlobInt8PtrPtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[GlobInt8PtrTy]] +; CHECK: %[[Int8PtrGlobPtrPtrTy:[0-9]+]] = OpTypePointer Generic %[[GlobInt8PtrPtrTy]] +; CHECK: %[[Int32Ty:[0-9]+]] = OpTypeInt 32 0 +; CHECK: %[[Const5:[0-9]+]] = OpConstant %[[Int32Ty]] 5 +; CHECK: %[[ArrTy:[0-9]+]] = OpTypeArray %[[GlobInt8PtrTy]] %[[Const5]] +; CHECK: %[[VtblTy:[0-9]+]] = OpTypeStruct %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] %[[ArrTy]] +; CHECK: %[[Int64Ty:[0-9]+]] = OpTypeInt 64 0 +; CHECK: %[[GlobVtblPtrTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[VtblTy]] +; CHECK: %[[ConstMinus184:[0-9]+]] = OpConstant %[[Int64Ty]] 18446744073709551432 +; CHECK: %[[ConstMinus16:[0-9]+]] = OpConstant %[[Int64Ty]] 18446744073709551600 +; CHECK: %[[Const168:[0-9]+]] = OpConstant %[[Int64Ty]] 168 +; CHECK: %[[Nullptr:[0-9]+]] = OpConstantNull %[[GlobInt8PtrTy]] +; CHECK: %[[Const184:[0-9]+]] = OpConstant %[[Int64Ty]] 184 +; CHECK: %[[Const184toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[Const184]] +; CHECK: %[[Const168toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[Const168]] +; CHECK: %[[ConstMinus16toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[ConstMinus16]] +; CHECK: %[[ConstMinus184toPtr:[0-9]+]] = OpSpecConstantOp %[[GlobInt8PtrTy]] ConvertUToPtr %[[ConstMinus184]] +; CHECK: %[[Vtbl012:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[Const184toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl3:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[Const168toPtr]] %[[ConstMinus16toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl4:[0-9]+]] = OpConstantComposite %[[ArrTy]] %[[ConstMinus184toPtr]] %[[ConstMinus184toPtr]] %[[Nullptr]] %[[Nullptr]] %[[Nullptr]] +; CHECK: %[[Vtbl:[0-9]+]] = OpConstantComposite %[[VtblTy]] %[[Vtbl012]] %[[Vtbl012]] %[[Vtbl012]] %[[Vtbl3]] %[[Vtbl4]] +; CHECK: %[[#]] = OpVariable %[[GlobVtblPtrTy]] CrossWorkgroup %[[Vtbl]] + +@vtable = linkonce_odr unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] } + { [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 168 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) 
null], + [5 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -184 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -184 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) null] } + +define linkonce_odr spir_func void @foo(ptr addrspace(4) %this) { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(4) %this, i64 184 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 0, i32 3), ptr addrspace(4) %this + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 1, i32 3), ptr addrspace(4) %this + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 2, i32 3), ptr addrspace(4) %this + %add.ptr = getelementptr inbounds i8, ptr addrspace(4) %this, i64 184 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 4, i32 3), ptr addrspace(4) %add.ptr + %add.ptr2 = getelementptr inbounds i8, ptr addrspace(4) %this, i64 16 + store ptr addrspace(1) getelementptr inbounds inrange(-24, 16) ({ [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)], [5 x ptr addrspace(1)] }, ptr addrspace(1) @vtable, i32 0, i32 3, i32 3), ptr addrspace(4) %add.ptr2 + + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll new file mode 100644 index 0000000000000..6a9ce4515f5c0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0 +; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32 +; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2 + +; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]] +; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]] +; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0 +; CHECK: OpReturnValue [[UNPACK]] +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/zero-length-array.ll b/llvm/test/CodeGen/SPIRV/zero-length-array.ll index 666176c87adb6..5fd94d25dfd87 100644 --- a/llvm/test/CodeGen/SPIRV/zero-length-array.ll +++ b/llvm/test/CodeGen/SPIRV/zero-length-array.ll @@ -1,10 +1,17 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} -; CHECK: %[[#type:]] = OpTypeInt 32 0 -; CHECK: %[[#ext:]] = OpConstant %[[#type]] 0 +; 
Nothing is generated, but compilation doesn't crash. +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#RTM:]] "reg2mem alloca point" +; CHECK: %[[#INT:]] = OpTypeInt 32 0 +; CHECK: %[[#RTM]] = OpConstant %[[#INT]] 0 +; CHECK: %[[#FOO]] = OpFunction +; CHECK-NEXT: = OpLabel +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd -define spir_func void @_Z3foov() { +define spir_func void @foo() { entry: %i = alloca [0 x i32], align 4 ret void diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll index 1bfa055781c98..f77abd95f8e0f 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fadd-01.ll @@ -6,14 +6,15 @@ define float @f1(ptr %src, float %b) { ; CHECK-LABEL: f1: ; CHECK: le [[F:%f[0-9]+]], 0(%r2) ; CHECK: [[L:\.L.+]]: -; CHECK: lgdr [[RI:%r[0-9]+]], [[F]] -; CHECK: aebr [[F]], %f0 -; CHECK: lgdr [[RO:%r[0-9]+]], [[F]] +; CHECK: ler [[COPY_F:%f[0-9]+]], [[F]] +; CHECK-NEXT: aebr [[F]], %f0 +; CHECK-NEXT: lgdr [[RO:%r[0-9]+]], [[F]] ; CHECK: srlg [[RO]], [[RO]], 32 +; CHECK: lgdr [[RI:%r[0-9]+]], [[COPY_F]] ; CHECK: srlg [[RI]], [[RI]], 32 ; CHECK: cs [[RI]], [[RO]], 0(%r2) -; CHECK: sllg [[RI]], [[RI]], 32 -; CHECK: ldgr [[F]], [[RI]] +; CHECK: sllg [[RO]], [[RI]], 32 +; CHECK: ldgr [[F]], [[RO]] ; CHECK: jl [[L]] ; CHECK: ler %f0, [[F]] ; CHECK: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll index 3f4ad31762753..ffe25694885a9 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fsub-01.ll @@ -6,14 +6,15 @@ define float @f1(ptr %src, float %b) { ; CHECK-LABEL: f1: ; CHECK: le [[F:%f[0-9]+]], 0(%r2) ; CHECK: [[L:\.L.+]]: -; CHECK: lgdr [[RI:%r[0-9]+]], [[F]] -; CHECK: sebr [[F]], %f0 -; CHECK: lgdr [[RO:%r[0-9]+]], [[F]] +; CHECK: ler [[COPY_F:%f[0-9]+]], [[F]] +; CHECK-NEXT: sebr [[F]], %f0 +; CHECK-NEXT: lgdr [[RO:%r[0-9]+]], [[F]] ; CHECK: srlg [[RO]], [[RO]], 32 +; CHECK: lgdr [[RI:%r[0-9]+]], [[COPY_F]] ; CHECK: srlg [[RI]], [[RI]], 32 ; CHECK: cs [[RI]], [[RO]], 0(%r2) -; CHECK: sllg [[RI]], [[RI]], 32 -; CHECK: ldgr [[F]], [[RI]] +; CHECK: sllg [[RO]], [[RI]], 32 +; CHECK: ldgr [[F]], [[RO]] ; CHECK: jl [[L]] ; CHECK: ler %f0, [[F]] ; CHECK: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll index 678d9a9073155..ff9b6a34c1d53 100644 --- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll +++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll @@ -22,10 +22,10 @@ define void @main(i16 %in) { ; CHECK-NEXT: locghile %r3, 1 ; CHECK-NEXT: o %r0, 0(%r1) ; CHECK-NEXT: larl %r1, g_222 -; CHECK-NEXT: lghi %r5, 0 ; CHECK-NEXT: dsgfr %r2, %r0 +; CHECK-NEXT: lghi %r3, 0 ; CHECK-NEXT: stgrl %r2, g_39 -; CHECK-NEXT: stc %r5, 19(%r1) +; CHECK-NEXT: stc %r3, 19(%r1) ; CHECK-NEXT: br %r14 %tmp = load i32, ptr @g_151, align 4 %tmp3 = or i32 %tmp, 1 diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll index 05b8de756c032..f414ea33a6e80 100644 --- a/llvm/test/CodeGen/SystemZ/stackmap.ll +++ b/llvm/test/CodeGen/SystemZ/stackmap.ll @@ -84,14 +84,14 @@ ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long 
-1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll b/llvm/test/CodeGen/SystemZ/vec-load-element.ll index 2baaed19546df..9bef279d7c0fa 100644 --- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll +++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: .LBB0_1: ; CHECK-NOT: l %r ; CHECK-NOT: vlvgf -; CHECK: pfd -; CHECK: vlef +; CHECK-DAG: pfd +; CHECK-DAG: vlef %type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } @Mem = external global [150 x %type0], align 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 79665af17ef58..9632469261f4d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: adr r1, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov.w r1, #500 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vqadd.u32 q2, q0, r2 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vaddvat.u32 r12, q2 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index ec257bcf123f3..bcedcd40ba112 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r11, [r0, #16]! 
-; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldrd r5, r6, [r0, #-12] ; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: csinc r6, r10, r8, le -; CHECK-NEXT: cmp r5, r7 +; CHECK-NEXT: csinc r7, r10, r8, le +; CHECK-NEXT: cmp r5, r6 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #2 -; CHECK-NEXT: csel r7, r7, r5, gt -; CHECK-NEXT: cmp r7, r4 +; CHECK-NEXT: addgt.w r7, r8, #2 +; CHECK-NEXT: csel r6, r6, r5, gt +; CHECK-NEXT: cmp r6, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #3 -; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: addgt.w r7, r8, #3 +; CHECK-NEXT: csel r6, r4, r6, gt ; CHECK-NEXT: add.w r8, r8, #4 -; CHECK-NEXT: cmp r7, r11 -; CHECK-NEXT: csel r10, r8, r6, gt -; CHECK-NEXT: csel r12, r11, r7, gt +; CHECK-NEXT: cmp r6, r11 +; CHECK-NEXT: csel r10, r8, r7, gt +; CHECK-NEXT: csel r12, r11, r6, gt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit ; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1769c5d2fd385..98e082be4cad1 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: movs r7, #1 -; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dls lr, r0 +; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, 
lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: movs r7, #1 -; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8..435acc29f076e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: add.w r9, r3, #4 +; CHECK-NEXT: add.w r10, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr.w r1, [r10] ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: ldrd r5, r4, [r8] +; CHECK-NEXT: adc r2, r1, #0 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] +; CHECK-NEXT: smull r4, r5, r4, r6 +; CHECK-NEXT: asrs r1, r2, #31 +; CHECK-NEXT: str r2, 
[sp, #12] @ 4-byte Spill +; CHECK-NEXT: subs r4, r2, r4 +; CHECK-NEXT: sbcs r1, r5 +; CHECK-NEXT: adds.w r6, r4, #-2147483648 +; CHECK-NEXT: ldr r4, [r10, #-4] +; CHECK-NEXT: adc r11, r1, #0 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: add.w r10, r10, #4 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r4, [r9] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 -; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: smull r2, r9, r4, r12 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: lsll r2, r9, #30 +; CHECK-NEXT: asr.w r5, r9, #31 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r2, r5, r4 +; CHECK-NEXT: lsrl r2, r5, #2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: add.w r0, r2, #-2147483648 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: asrl r6, r11, r0 +; CHECK-NEXT: movs r0, #2 +; CHECK-NEXT: lsrl r6, r11, #2 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r0, [r8], #-4 +; CHECK-NEXT: mls r0, r0, r4, r1 +; CHECK-NEXT: adds.w r4, r0, #-2147483648 +; CHECK-NEXT: asr.w r1, r0, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: str r0, [r2] +; CHECK-NEXT: str r0, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: add.w r0, r12, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index f7b4548f127bf..b6657d607ce6d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrd r7, r9, [r0] -; CHECK-NEXT: and r6, r3, #3 -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldm.w r0, {r7, r9, r11} +; CHECK-NEXT: and r0, r3, #3 +; CHECK-NEXT: @ implicit-def: $r5 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: lsrs r0, 
r3, #2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r12, r10 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: strd r2, r4, [r9] -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: strd r3, r8, [r9, #8] -; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: add.w r11, r11, #128 +; CHECK-NEXT: strd r8, r0, [r9] ; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: strd r3, r12, [r9, #8] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: subs r7, #1 ; CHECK-NEXT: beq.w .LBB19_13 ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: ldrd r5, r11, [r9] +; CHECK-NEXT: ldr.w r10, [r9, #12] ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: ldrd r8, r10, [r9, #8] -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldm.w r9, {r3, r4, r12} +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: wls lr, r2, .LBB19_6 +; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r5, [r1, #12] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: ldm.w r1, {r2, r7, r11} -; CHECK-NEXT: vmul.f32 q2, q2, r5 -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vfma.f32 q2, q6, r11 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: ldrd r4, r3, [r1, #8] +; CHECK-NEXT: vldrw.u32 q2, [r11] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: ldrd r0, r7, [r1] +; CHECK-NEXT: vmul.f32 q2, q2, r3 +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vfma.f32 q2, q6, r4 +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vfma.f32 q2, q7, r7 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q2, q4, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q2, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vfma.f32 q2, q3, r4 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vfma.f32 q2, q1, r8 +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q2, q4, r0 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q2, q5, r5 +; CHECK-NEXT: vldrw.u32 q1, [r11, #96] +; CHECK-NEXT: vfma.f32 q2, q3, r8 +; CHECK-NEXT: vldrw.u32 q0, [r11, #112] +; CHECK-NEXT: vfma.f32 q2, q1, r12 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vfma.f32 q2, q0, r10 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vmov r10, r8, d5 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r10, r12, d5 ; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; 
CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldrd lr, r4, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: ldrd lr, r0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: ldrd r8, r1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q0, q6, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q0, q7, r4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q0, q6, r8 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q0, q7, r0 +; CHECK-NEXT: vldrw.u32 q2, [r11, #96] ; CHECK-NEXT: vfma.f32 q0, q4, lr -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vfma.f32 q0, q5, r5 -; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vldrw.u32 q1, [r11, #112] +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: vfma.f32 q0, q3, r4 +; CHECK-NEXT: vfma.f32 q0, q2, r12 ; CHECK-NEXT: vfma.f32 q0, q1, r10 -; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: str r5, [r6] -; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [r6] +; CHECK-NEXT: mov r8, lr +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r8, s1 -; CHECK-NEXT: cmp r3, #2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstr s1, [r6, #4] -; CHECK-NEXT: str r5, [r6] +; CHECK-NEXT: str r4, [r6] ; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, lr +; CHECK-NEXT: mov r12, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 @@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vstr s2, [r6, #8] ; CHECK-NEXT: .LBB19_12: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end ; CHECK-NEXT: add sp, #16 @@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r6, r12, [r0, #4] ; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: ldrb r0, [r0] @@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 
Depth=1 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vstr s12, [r6] +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r6] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s14, [r6, #4] +; CHECK-NEXT: vstr s6, [r6, #4] ; CHECK-NEXT: add.w r12, r12, #20 ; CHECK-NEXT: adds r6, #8 ; CHECK-NEXT: subs r0, #1 @@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: .LBB20_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2 -; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vshlc q4, r5, #32 -; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vshlc q5, r5, #32 -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: vmov.f32 s15, s0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrd r7, r4, [r1], #8 -; CHECK-NEXT: vfma.f32 q6, q2, r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov q3, q6 -; CHECK-NEXT: vfma.f32 q3, q1, r7 -; CHECK-NEXT: vstr s24, [r5] -; CHECK-NEXT: vmov.f32 s15, s0 -; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vstr s13, [r5, #4] -; CHECK-NEXT: vfma.f32 q3, q5, r4 +; CHECK-NEXT: vfma.f32 q1, q3, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vfma.f32 q1, q2, r7 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vfma.f32 q1, q4, r4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vstr s5, [r5, #4] +; CHECK-NEXT: vfma.f32 q1, q5, r4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vstr s2, [r5] ; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vmov.f32 s12, s14 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 @@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vfma.f32 q3, q1, r1 -; CHECK-NEXT: vstr s13, [r6] +; CHECK-NEXT: vfma.f32 q1, q3, r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vstr s4, [r5] +; CHECK-NEXT: vfma.f32 q1, q2, r1 +; CHECK-NEXT: vstr s5, [r6] ; CHECK-NEXT: b .LBB20_2 ; CHECK-NEXT: .LBB20_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.9: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 0d86f22a321e0..b60ee7c6d406b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1313,27 +1313,29 @@ 
define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vmov r7, r3, d13 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov r1, r7, d3 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov r4, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2cd4a5d3..c0b2da7eff41b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vadd.i32 q3, q1, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush 
{d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q1, lr -; CHECK-NEXT: vmul.i32 q1, q1, r12 -; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmla.i32 q3, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmul.i32 q2, q2, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vstrw.32 q2, [r3] ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q3, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q3, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmlas.i32 q1, q0, r7 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! 
+; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q5, q0, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] +; CHECK-NEXT: vldrh.s32 q6, [r3], #8 +; CHECK-NEXT: vmul.i32 q6, q7, q6 +; CHECK-NEXT: vadd.i32 q4, q6, q4 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrb.s32 q2, [r0, q7] +; CHECK-NEXT: vldrb.s32 q7, [r1, q6] ; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlava.u32 r12, q2, q7 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..d6c5cde30ed73 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; 
CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 94d5490cead2f..6f2a0b2debc47 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s21, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index ab41069bfa258..ecb169898f9f0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s10, s14 ; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vins.f16 s21, 
s10 ; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i16 q0, q2, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll index 04be18e3dd873..6656d44eec81e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll @@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmaxnma.f32 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmaxnma.f32 q0, q1 ; CHECK-NEXT: letp lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr s0, .LCPI19_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f32 r0, q1 +; CHECK-NEXT: vldr s4, .LCPI19_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f32 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: pop {r7, pc} @@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #8 -; CHECK-NEXT: vmaxnma.f16 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0], #8 +; CHECK-NEXT: vmaxnma.f16 q0, q1 ; CHECK-NEXT: letp lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr.16 s0, .LCPI23_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f16 r0, q1 +; CHECK-NEXT: vldr.16 s4, .LCPI23_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f16 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir index ee2e58f2a6cc1..a1771f9356014 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir @@ -98,28 +98,29 @@ body: | ; CHECK-LABEL: name: foo ; CHECK: liveins: $q0, $r0, $r1, $r2, $lr - ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) - ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) - ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) - ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { - ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into 
%ir.dest, align 4) - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) - ; CHECK: } - ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) - ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK: } - ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) { + ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) { + ; CHECK-NEXT: MVE_VPST 4, implicit $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 
26ab555c2c593..fb5f543fd0d3a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll index e6fcf56af6e8d..2929a04cc0637 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll @@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 { ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: sub.w r3, r4, #16 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r5, [r3, #16]! diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll index a0247c29f257f..e5350409cd6ba 100644 --- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll @@ -117,7 +117,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-NEXT: .short 9 @ 0x9 ; CHECK-NEXT: .short 10 @ 0xa ; CHECK-NEXT: .short 10 @ 0xa -; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .LCPI4_4: ; CHECK-NEXT: .short 341 @ 0x155 ; CHECK-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll index 2676000b968c3..1eb50d5f9564a 100644 --- a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=0 -verify-machineinstrs | FileCheck %s target triple = "wasm32-unknown-unknown" @@ -13,3 +14,5 @@ define void @call_trunc_i64_to_i48(i64 %x) { call void @extern48(i48 %x48) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll new file mode 100644 index 0000000000000..df14e1054d91b --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -fast-isel -fast-isel-abort=0 -mattr=+simd128 -verify-machineinstrs | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @pr165438(<4 x i32> %0) { +; CHECK-LABEL: pr165438: +; CHECK: .functype pr165438 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: # fallthrough-return +entry: + %conv = trunc <4 x i32> %0 to <4 x i8> + br label %cond.true + + +cond.true: ; preds = %entry + %vecext = extractelement <4 x i8> %conv, i32 0 + ret i8 %vecext +} diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index a8d37be404cf2..c44b3bb5a9968 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -2808,6 +2808,348 @@ entry: ret <4 x i32> %spec.store.select7 } +define <2 x i8> @fptosi_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i8: +; CHECK: .functype fptosi_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i8> @fptoui_v2f32_v2i8(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i8: +; CHECK: .functype fptoui_v2f32_v2i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i8> + ret <2 x i8> %conv +} + +define <2 x i16> @fptosi_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptosi_v2f32_v2i16: +; CHECK: .functype fptosi_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <2 x float> %x to <2 x i16> + ret <2 x i16> %conv +} + +define <2 x i16> @fptoui_v2f32_v2i16(<2 x float> %x) { +; CHECK-LABEL: fptoui_v2f32_v2i16: +; CHECK: .functype fptoui_v2f32_v2i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <2 x float> %x to <2 x i16> 
+ ret <2 x i16> %conv +} + +define <4 x i8> @fptosi_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i8: +; CHECK: .functype fptosi_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i8> @fptoui_v4f32_v4i8(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i8: +; CHECK: .functype fptoui_v4f32_v4i8 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i8> + ret <4 x i8> %conv +} + +define <4 x i16> @fptosi_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptosi_v4f32_v4i16: +; CHECK: .functype fptosi_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <4 x i16> @fptoui_v4f32_v4i16(<4 x float> %x) { +; CHECK-LABEL: fptoui_v4f32_v4i16: +; CHECK: .functype fptoui_v4f32_v4i16 (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: # fallthrough-return + %conv = fptoui <4 x float> %x to <4 x i16> + ret <4 x i16> %conv +} + +define <8 x i8> @fptosi_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptosi_v8f32_v8i8: +; CHECK: .functype fptosi_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # fallthrough-return + %conv = fptosi <8 x float> %x to <8 x i8> + ret <8 x i8> %conv +} + +define <8 x i8> @fptoui_v8f32_v8i8(<8 x float> %x) { +; CHECK-LABEL: fptoui_v8f32_v8i8: +; CHECK: .functype fptoui_v8f32_v8i8 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: v128.const 255, 255, 255, 255 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.trunc_sat_f32x4_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: # 
fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i8>
+ ret <8 x i8> %conv
+}
+
+define <8 x i16> @fptosi_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i16:
+; CHECK: .functype fptosi_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <8 x i16> @fptoui_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i16:
+; CHECK: .functype fptoui_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <16 x i8> @fptosi_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i8:
+; CHECK: .functype fptosi_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i8> @fptoui_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i8:
+; CHECK: .functype fptoui_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i16> @fptosi_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i16:
+; CHECK: .functype fptosi_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
+define <16 x i16> @fptoui_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i16:
+; CHECK: .functype fptoui_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
 declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 5eb49fda9f014..5d58ae223da6f 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 
@@ -20,17 +20,17 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -64,17 +64,17 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -208,27 +208,27 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -276,27 +276,27 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -343,27 +343,27 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK-LABEL: four_shorts_interleave_op:
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -483,19 +483,19 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -529,18 +529,18 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
+; CHECK: i8x16.shuffle 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
+; CHECK: i8x16.shuffle 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -672,27 +672,27 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -740,25 +740,25 @@
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}}, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}}, 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
+; CHECK: i8x16.shuffle 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
 ; CHECK: v128.store
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -806,27 +806,27 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -1272,45 +1272,45 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extmul_low_i16x8_u
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
 %5 = icmp eq i32 %3, 0
@@ -1365,7 +1365,7 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1396,35 +1396,35 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1492,13 +1492,13 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
 %5 = icmp sgt i32 %3, 0
@@ -1605,28 +1605,28 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_bytes_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1663,28 +1663,28 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_bytes_two_floats_vary_op:
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1720,42 +1720,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp22.not = icmp eq i32 %N, 0
@@ -1788,42 +1753,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp21.not = icmp eq i32 %N, 0
@@ -1858,24 +1788,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1913,24 +1843,24 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1969,38 +1899,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2037,38 +1951,22 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2195,58 +2093,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2302,58 +2200,58 @@ for.body: ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2407,92 +2305,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp48.not = icmp eq i32 %N, 0
@@ -2541,92 +2354,7 @@ for.body: ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
 %cmp45.not = icmp eq i32 %N, 0
@@ -2678,51 +2406,51 @@ for.body: ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK:
i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2779,47 +2507,47 @@ for.body: ; preds = %entry, %for.body ; CHECK: loop ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: v128.load ; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.mul -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.add -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.div -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s -; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1 ; CHECK: i32x4.extend_low_i16x8_s ; CHECK: f32x4.convert_i32x4_s ; CHECK: f32x4.sub -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 
0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 +; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 +; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 ; CHECK: v128.store define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2873,93 +2601,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 
29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -3008,93 +2650,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 
10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.splat -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: f32x4.extract_lane -; CHECK: i32.trunc_sat_f32_s -; CHECK: i16x8.replace_lane -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 324a0c49fb413..d698fad745dfb 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1992,6 +1992,389 @@ define <8 x i16> @avgr_u_v8i16_zext(<8 x i16> %x, <8 x i16> %y) { %c.trunc = trunc <8 x i32> %c to <8 x i16> ret <8 x i16> %c.trunc } +define void @avgr_undef_shuffle_lanes(ptr %res, <8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; SIMD128-LABEL: avgr_undef_shuffle_lanes: +; SIMD128: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 255, 255 +; SIMD128-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push2=, 
$2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-NEXT: return +; +; SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-FAST-NEXT: # %bb.0: +; SIMD128-FAST-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-FAST-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-FAST-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-FAST-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-FAST-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-FAST-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 255, 255 +; SIMD128-FAST-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-FAST-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-FAST-NEXT: i8x16.shuffle $push2=, $2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-FAST-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-FAST-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-FAST-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-FAST-NEXT: return +; +; NO-SIMD128-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.const $push0=, 255 +; NO-SIMD128-NEXT: i32.and $push2=, $24, $pop0 +; NO-SIMD128-NEXT: i32.const $push143=, 255 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop143 +; NO-SIMD128-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.const $push4=, 1 +; NO-SIMD128-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-NEXT: i32.const $push142=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push141=, 255 +; NO-SIMD128-NEXT: i32.and $push8=, $8, $pop141 +; NO-SIMD128-NEXT: i32.const $push140=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $16, $pop140 +; NO-SIMD128-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.const $push139=, 1 +; NO-SIMD128-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-NEXT: i32.const $push138=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push137=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $23, $pop137 +; NO-SIMD128-NEXT: i32.const $push136=, 255 +; NO-SIMD128-NEXT: i32.and $push12=, $31, $pop136 +; NO-SIMD128-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-NEXT: i32.const $push135=, 1 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop135 +; NO-SIMD128-NEXT: i32.const $push134=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push133=, 255 +; NO-SIMD128-NEXT: i32.and $push18=, $7, $pop133 +; NO-SIMD128-NEXT: i32.const $push132=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $15, $pop132 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.const $push131=, 1 +; NO-SIMD128-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-NEXT: 
i32.const $push130=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push129=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop129 +; NO-SIMD128-NEXT: i32.const $push128=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $30, $pop128 +; NO-SIMD128-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.const $push127=, 1 +; NO-SIMD128-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-NEXT: i32.const $push126=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push125=, 255 +; NO-SIMD128-NEXT: i32.and $push28=, $6, $pop125 +; NO-SIMD128-NEXT: i32.const $push124=, 255 +; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop124 +; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-NEXT: i32.const $push123=, 1 +; NO-SIMD128-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-NEXT: i32.const $push122=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop31 +; NO-SIMD128-NEXT: i32.const $push121=, 255 +; NO-SIMD128-NEXT: i32.and $push33=, $21, $pop121 +; NO-SIMD128-NEXT: i32.const $push120=, 255 +; NO-SIMD128-NEXT: i32.and $push32=, $29, $pop120 +; NO-SIMD128-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-NEXT: i32.const $push119=, 1 +; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-NEXT: i32.const $push118=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push117=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $5, $pop117 +; NO-SIMD128-NEXT: i32.const $push116=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $13, $pop116 +; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.const $push115=, 1 +; NO-SIMD128-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-NEXT: i32.const $push114=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop41 +; NO-SIMD128-NEXT: i32.const $push113=, 255 +; NO-SIMD128-NEXT: i32.and $push43=, $20, $pop113 +; NO-SIMD128-NEXT: i32.const $push112=, 255 +; NO-SIMD128-NEXT: i32.and $push42=, $28, $pop112 +; NO-SIMD128-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-NEXT: i32.const $push111=, 1 +; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-NEXT: i32.const $push110=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop46 +; NO-SIMD128-NEXT: i32.const $push109=, 255 +; NO-SIMD128-NEXT: i32.and $push48=, $4, $pop109 +; NO-SIMD128-NEXT: i32.const $push108=, 255 +; NO-SIMD128-NEXT: i32.and $push47=, $12, $pop108 +; NO-SIMD128-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-NEXT: i32.const $push107=, 1 +; NO-SIMD128-NEXT: i32.add $push50=, $pop49, $pop107 +; NO-SIMD128-NEXT: i32.const $push106=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop51 +; NO-SIMD128-NEXT: i32.const $push105=, 255 +; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop105 +; NO-SIMD128-NEXT: i32.const $push104=, 255 +; NO-SIMD128-NEXT: i32.and $push52=, $27, $pop104 +; NO-SIMD128-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-NEXT: i32.const $push103=, 1 +; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-NEXT: i32.const $push102=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push56=, $pop55, $pop102 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop56 +; NO-SIMD128-NEXT: 
i32.const $push101=, 255 +; NO-SIMD128-NEXT: i32.and $push58=, $3, $pop101 +; NO-SIMD128-NEXT: i32.const $push100=, 255 +; NO-SIMD128-NEXT: i32.and $push57=, $11, $pop100 +; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.const $push99=, 1 +; NO-SIMD128-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-NEXT: i32.const $push98=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop61 +; NO-SIMD128-NEXT: i32.const $push97=, 255 +; NO-SIMD128-NEXT: i32.and $push63=, $18, $pop97 +; NO-SIMD128-NEXT: i32.const $push96=, 255 +; NO-SIMD128-NEXT: i32.and $push62=, $26, $pop96 +; NO-SIMD128-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-NEXT: i32.const $push95=, 1 +; NO-SIMD128-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-NEXT: i32.const $push94=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop66 +; NO-SIMD128-NEXT: i32.const $push93=, 255 +; NO-SIMD128-NEXT: i32.and $push68=, $2, $pop93 +; NO-SIMD128-NEXT: i32.const $push92=, 255 +; NO-SIMD128-NEXT: i32.and $push67=, $10, $pop92 +; NO-SIMD128-NEXT: i32.add $push69=, $pop68, $pop67 +; NO-SIMD128-NEXT: i32.const $push91=, 1 +; NO-SIMD128-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-NEXT: i32.const $push90=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop71 +; NO-SIMD128-NEXT: i32.const $push89=, 255 +; NO-SIMD128-NEXT: i32.and $push73=, $17, $pop89 +; NO-SIMD128-NEXT: i32.const $push88=, 255 +; NO-SIMD128-NEXT: i32.and $push72=, $25, $pop88 +; NO-SIMD128-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-NEXT: i32.const $push87=, 1 +; NO-SIMD128-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-NEXT: i32.const $push86=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push76=, $pop75, $pop86 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop76 +; NO-SIMD128-NEXT: i32.const $push85=, 255 +; NO-SIMD128-NEXT: i32.and $push78=, $1, $pop85 +; NO-SIMD128-NEXT: i32.const $push84=, 255 +; NO-SIMD128-NEXT: i32.and $push77=, $9, $pop84 +; NO-SIMD128-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-NEXT: i32.const $push83=, 1 +; NO-SIMD128-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-NEXT: i32.const $push82=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop81 +; NO-SIMD128-NEXT: return +; +; NO-SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-FAST-NEXT: # %bb.0: +; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push2=, $17, $pop0 +; NO-SIMD128-FAST-NEXT: i32.const $push143=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop143 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-FAST-NEXT: i32.const $push4=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push142=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push141=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $1, $pop141 +; NO-SIMD128-FAST-NEXT: i32.const $push140=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $9, $pop140 +; NO-SIMD128-FAST-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push139=, 1 +; 
NO-SIMD128-FAST-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-FAST-NEXT: i32.const $push138=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push137=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $18, $pop137 +; NO-SIMD128-FAST-NEXT: i32.const $push136=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $26, $pop136 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push135=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop135 +; NO-SIMD128-FAST-NEXT: i32.const $push134=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push133=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $2, $pop133 +; NO-SIMD128-FAST-NEXT: i32.const $push132=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $10, $pop132 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.const $push131=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push129=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $19, $pop129 +; NO-SIMD128-FAST-NEXT: i32.const $push128=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $27, $pop128 +; NO-SIMD128-FAST-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push125=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $3, $pop125 +; NO-SIMD128-FAST-NEXT: i32.const $push124=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $11, $pop124 +; NO-SIMD128-FAST-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-FAST-NEXT: i32.const $push122=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push121=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $20, $pop121 +; NO-SIMD128-FAST-NEXT: i32.const $push120=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $28, $pop120 +; NO-SIMD128-FAST-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push119=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $4, $pop117 +; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $12, $pop116 +; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $21, $pop113 +; 
NO-SIMD128-FAST-NEXT: i32.const $push112=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push42=, $29, $pop112 +; NO-SIMD128-FAST-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push48=, $5, $pop109 +; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $13, $pop108 +; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push50=, $pop49, $pop107 +; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $22, $pop105 +; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push52=, $30, $pop104 +; NO-SIMD128-FAST-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push56=, $pop55, $pop102 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push58=, $6, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop100 +; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop61 +; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push63=, $23, $pop97 +; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push62=, $31, $pop96 +; NO-SIMD128-FAST-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push68=, $7, $pop93 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push67=, $15, $pop92 +; NO-SIMD128-FAST-NEXT: i32.add $push69=, $pop68, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop71 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push73=, $24, $pop89 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push72=, $32, $pop88 +; NO-SIMD128-FAST-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push76=, 
$pop75, $pop86 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push78=, $8, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push77=, $16, $pop84 +; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop81 +; NO-SIMD128-FAST-NEXT: return + %zext.0 = zext <8 x i8> %a to <8 x i16> + %zext.1 = zext <8 x i8> %b to <8 x i16> + %add.0 = add nuw nsw <8 x i16> %zext.0, splat (i16 1) + %add.1 = add nuw nsw <8 x i16> %add.0, %zext.1 + %shift.0 = lshr <8 x i16> %add.1, splat (i16 1) + %zext.2 = zext <8 x i8> %c to <8 x i16> + %zext.3 = zext <8 x i8> %d to <8 x i16> + %add.2 = add nuw nsw <8 x i16> %zext.2, splat (i16 1) + %add.3 = add nuw nsw <8 x i16> %add.2, %zext.3 + %shift.1 = lshr <8 x i16> %add.3, splat (i16 1) + %shuffle = shufflevector <8 x i16> %shift.0, <8 x i16> %shift.1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %trunc = trunc nuw <16 x i16> %shuffle to <16 x i8> + store <16 x i8> %trunc, ptr %res, align 1 + ret void +} define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ; SIMD128-LABEL: avgr_u_v16i8_wrap: ; SIMD128: .functype avgr_u_v16i8_wrap (v128, v128) -> (v128) diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll index 75612ba645ca4..9e4faa96dbf26 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll @@ -15,16 +15,15 @@ define void @shl_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label0: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl ; CHECK-NEXT: v128.store 0 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add @@ -64,10 +63,11 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: loop # label1: ; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 3 ; CHECK-NEXT: i32.const 16 ; CHECK-NEXT: i32.add -; CHECK-NEXT: local.tee 3 -; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.tee 0 +; CHECK-NEXT: local.get 3 ; CHECK-NEXT: v128.load 0:p2align=0 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i8x16.shl @@ -76,8 +76,6 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: i32.and ; CHECK-NEXT: local.set 1 -; CHECK-NEXT: local.get 3 -; CHECK-NEXT: local.set 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.const -1 ; CHECK-NEXT: i32.add diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 
@has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962ddebc2115..f2b4c49b1dbcd 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: ## implicit-def: $ebx ; CHECK-NEXT: calll __Znam -; CHECK-NEXT: Ltmp1: +; CHECK-NEXT: Ltmp1: ## EH_LABEL ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movb $1, %al @@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: Ltmp2: +; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, (%esp) ; CHECK-NEXT: calll _Pjii -; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: Ltmp3: ## EH_LABEL ; CHECK-NEXT: ## %bb.11: ## %bb42 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: Ltmp5: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp6: +; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: +; CHECK-NEXT: Ltmp7: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx 
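; The Ltmp markers above and below are EH_LABELs: they bound the invoke
; ranges recorded in the exception table, and the asm printer now annotates
; each one with "## EH_LABEL". A minimal IR sketch that produces such a
; labelled range (block names here are illustrative, not from this test):
;
;   %p = invoke ptr @_Znam(i32 %n)
;           to label %cont unwind label %lpad
; lpad:
;   %lp = landingpad { ptr, i32 } cleanup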
; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp8: +; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: LBB0_3: ## %bb30 ; CHECK-NEXT: ud2 ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit -; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: LBB0_9: ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: LBB0_6: ## %bb23 @@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp -; CHECK-NEXT: Ltmp9: +; CHECK-NEXT: Ltmp9: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: bb: diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index c9390d91d59c2..2b692bff0461e 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test1: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: incl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: incl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB0_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB0_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test1: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: incl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB0_2 @@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test2: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: decl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB1_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: decl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB1_2 @@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl $2, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl $2, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB2_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test3: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl $2, %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB2_2 @@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl %edi, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB3_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test4: @@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll new file mode 100644 index 0000000000000..841c9a6d62d9e --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK + +define void @test_reloc_none() { +; CHECK-LABEL: test_reloc_none: +; CHECK: # %bb.0: +; CHECK-NEXT: .Lreloc_none0: +; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo +; CHECK-NEXT: retq + call void @llvm.reloc.none(metadata 
!"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... --- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... 
--- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... --- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... 
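# test_copy5 below exercises G_ANYEXT rather than G_ZEXT: the generic MIR is
#   %1(s8) = G_TRUNC %0(s32)
#   %2(s32) = G_ANYEXT %1(s8)
# and although any-extend places no requirement on the upper bits, the
# selector still emits MOVZX32rr8, a safe if slightly stronger choice.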
--- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... --- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir index 348a2901ff6a4..24453066f2583 100644 --- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir +++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir @@ -55,7 +55,7 @@ !9 = !DILocalVariable(name: "4", scope: !5, 
file: !1, line: 4, type: !10) !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 4, column: 1, scope: !5) - !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) ... --- diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll index 6d0f3c57c08d8..caf7a1cb7bd2d 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ -; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tilezero %tmm1 ; CHECK-NEXT: tilezero %tmm2 ; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1) ret void } @@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll index af1a7ae102975..642c1b7317f81 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_tmmultf32ps() { ; CHECK-LABEL: test_tmmultf32ps: @@ -11,13 +11,3 @@ define void @test_tmmultf32ps() { } declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) -define void @test_ttmmultf32ps() { -; CHECK-LABEL: test_ttmmultf32ps: -; CHECK: # %bb.0: -; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: retq - call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - ret void -} -declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) - diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll 
deleted file mode 100755 index 1f5758c804b2b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i64 %stride, i8* %addr1) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] -; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) - ret void -} -declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) - -define void @test_amx2(i8* %base, i64 %stride) #0 { -; O0-LABEL: test_amx2: -; O0: # %bb.0: -; O0-NEXT: xorps %xmm0, %xmm0 -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: tilerelease -; O0-NEXT: retq -; -; O2-LABEL: test_amx2: -; O2: # %bb.0: -; O2-NEXT: xorps %xmm0, %xmm0 -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, 
-{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, %ax -; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: tilerelease -; O2-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: retq # encoding: [0xc3] - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - ret void -} -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll deleted file mode 100644 index 4f41410010302..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll +++ /dev/null @@ -1,136 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s - -@buf = dso_local global [2048 x i8] zeroinitializer, 
align 16 -@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: test_tile_2rpntlvwz0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %si, %cx -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %esi -; CHECK-NEXT: movl $32, %edi -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: movl $buf2, %edx -; CHECK-NEXT: movl $32, %esi -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa 
%rsp, 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 - ret void -} - -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - -attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } -attributes #1 = { argmemonly nofree nounwind readonly } -attributes #2 = { nofree nosync nounwind readnone } -attributes #3 = { nounwind } -attributes #4 = { argmemonly nounwind writeonly } - -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 2} -!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir deleted file mode 100644 index ab12ab3a4f13d..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir +++ /dev/null @@ -1,165 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - 
cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - ; CHECK-NEXT: renamable $cx = MOV16ri 64 - ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: renamable $r8w = MOV16ri 16 - ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 
20, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: renamable $r9 = COPY $rsi - ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-NEXT: renamable $r8 = COPY $rdi - ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - ; CHECK-NEXT: renamable $r10 = COPY $rax - ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 - ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $zmm0 = AVX512_512_SET0 - VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 
(s512) into %stack.4, align 4) - MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - renamable $rcx = MOV32ri64 64 - MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - renamable $cx = MOV16ri 64 - MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - renamable $cx = MOV16ri 16 - renamable $r8w = MOV16ri 16 - MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - renamable $r9 = COPY $rsi - $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - renamable $r8 = COPY $rdi - $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - renamable $r10 = COPY $rax - $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir deleted file mode 100644 index c7d241f8a98b6..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir +++ /dev/null @@ -1,153 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s - ---- | - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = extractvalue { x86_amx, x86_amx } %0, 1 - %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 - ret void - } - - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 - - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } - -... 
---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } - - { id: 14, class: vr512, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def 
dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf - ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 - ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %14:vr512 = AVX512_512_SET0 - VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) - MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - %6:gr64 = MOV32ri64 @buf - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg - %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit - %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 - %13:gr64 = MOV32ri64 @buf2 - PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir deleted file mode 100644 index 66b15aa5b3cde..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir +++ /dev/null @@ -1,97 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } - - { reg: '$cx', virtual-reg: '' } - - { reg: '$r9', virtual-reg: '' } - - { reg: '$r10', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) - ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) - ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) - ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) - ; CHECK-NEXT: renamable $di = MOV16ri 64 - ; 
CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - renamable $r8 = MOV32ri64 64 - MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) - renamable $di = MOV16ri 64 - renamable $cx = MOV16ri 16 - PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll deleted file mode 100644 index 3549875e858a9..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: noinline nounwind optnone uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr 
[[M]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) -; CHECK-NEXT: ret void -; - entry: - - %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 - store <256 x i32> %2, ptr %m, align 1024 - - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 - store <256 x i32> %4, ptr %m, align 1024 - - %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 - %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 - store <256 x i32> %6, ptr %m, align 64 - - %7 = load <256 x i32>, ptr %m, align 64 - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 - %9 = load <256 x i32>, ptr %m, align 64 - %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 - %11 = load <256 x i32>, ptr %m, align 64 - %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 - - %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 - %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 - store <256 x i32> %14, ptr %m, align 64 - - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 - - attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { 
argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll deleted file mode 100644 index 96966264e0515..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: nounwind uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - ; Function Attrs: nounwind - declare 
x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir deleted file mode 100644 index 1e3b242bca96c..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr64_nosp, preferred-register: '' } - - { id: 1, class: gr16, preferred-register: '' } - - { id: 2, class: gr16, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr64, preferred-register: '' } - - { id: 5, class: gr64, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 181, class: tile, preferred-register: '' } - - { id: 183, class: tile, preferred-register: '' } - - { id: 185, class: tile, preferred-register: '' } - - { id: 186, class: tile, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - 
restorePoint: [] -fixedStack: [] -stack: - - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 21, name: '', type: default, offset: 0, size: 8, - alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 - ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 - ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] - ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], 
killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - %0:gr64_nosp = MOV32ri64 64 - %1:gr16 = MOV16ri 64 - %2:gr16 = MOV16ri 16 - %3:gr16 = MOV16ri 16 - %4:gr64 = COPY $rsi - %5:gr64 = COPY $rdi - %6:gr64 = COPY $rdx - %7:gr64_nosp = COPY $rax - %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 - PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 - %11:tile = PTILEZEROV %1, %2 - PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 - %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg - %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg - %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg - %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 - PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir deleted file mode 100644 index ac2cdb4a50568..0000000000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir +++ /dev/null @@ -1,113 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $rax, $rbx - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: 
VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx - ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %3:gr16 = COPY %2.sub_16bit - %4:gr16 = COPY %1.sub_16bit - %5:gr16 = COPY %0.sub_16bit - %6:gr64 = COPY $rax - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - %11:tile = PTILEZEROV %5, %4 - %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 - %13:gr64 = COPY $rbx - PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 - RET 0 - -... 
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll deleted file mode 100644 index 4cfd97afe721b..0000000000000 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: ttransposed %tmm3, %tmm1 -; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 -; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] -; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] -; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] -; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] -; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] -; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] -; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] -; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.ttransposed(i8 1, i8 3) - call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtfp16(i8 1, i8 2) - ret void -} - -declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) -declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 
%tile1, i8 %tile2) -declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B) - -define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: tilezero %tmm1 -; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: movabsq $64, %rbp -; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 -; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) -; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: pushq %rbp # encoding: [0x55] -; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] -; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] -; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] -; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] -; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # 
encoding: [0xc4,0xe2,0x73,0x6c,0xd0] -; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] -; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] -; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] -; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] -; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] -; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: popq %rbp # encoding: [0x5d] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b) - %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b) - %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b) - %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5) - - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4) - ret void -} - -define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx3: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movw $8, %cx -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: ttransposed %tmm4, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx3: -; EGPR: # %bb.0: -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] -; EGPR-NEXT: 
movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] -; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] -; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] -; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %5 = extractvalue { x86_amx, x86_amx } %4, 0 - %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) - ret void -} - -define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill 
-; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) -; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx_spill: -; EGPR: # %bb.0: -; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] -; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: 
[0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] -; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] -; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] -; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] -; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b5 = call { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 - %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 - %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 - %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 - %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 - %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 - %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 - %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 - %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 - %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) - ret void -} - -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) -declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) -declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx) - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll index 805fc7ccaab76..2b34739fa80e3 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll @@ -1,76 +1,80 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) { - ; SSE-LABEL: name: map0 - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $rdi, $rsi - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi - ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; SSE-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr) - ; SSE-NEXT: $eax = COPY [[MOV32rm]] - ; SSE-NEXT: RET 0, $eax - ; AVX-LABEL: name: map0 - ; AVX: bb.0.entry: - ; AVX-NEXT: liveins: $rdi, $rsi - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi - ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; AVX-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr) - ; AVX-NEXT: $eax = COPY [[MOV32rm]] - ; AVX-NEXT: RET 0, $eax +; CHECK-LABEL: map0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %r16 +; CHECK-NEXT: movq %rdi, %r17 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl (%r17,%r16,4), %eax +; CHECK-NEXT: retq entry: %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %0 = load i32, ptr %add.ptr ret i32 %0 } -define i32 @map1_or_vex(<2 x double> noundef %a) { - ; SSE-LABEL: name: map1_or_vex - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $xmm0 - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 - ; SSE-NEXT: [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr - ; SSE-NEXT: $eax = COPY [[CVTSD2SIrr_Int]] - ; SSE-NEXT: RET 0, $eax - ; AVX-LABEL: name: map1_or_vex - ; AVX: bb.0.entry: - ; AVX-NEXT: liveins: $xmm0 - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0 - ; AVX-NEXT: [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr - ; AVX-NEXT: $eax = COPY [[VCVTSD2SIrr_Int]] - ; AVX-NEXT: RET 0, $eax +define i32 @map1_or_vex(<2 x double> noundef %a) nounwind { +; SSE-LABEL: map1_or_vex: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsd2si %xmm0, %r16d +; SSE-NEXT: #APP +; SSE-NEXT: nop +; SSE-NEXT: #NO_APP +; SSE-NEXT: movl %r16d, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: map1_or_vex: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vcvtsd2si %xmm0, %ebx +; AVX-NEXT: #APP +; AVX-NEXT: nop +; AVX-NEXT: #NO_APP +; AVX-NEXT: movl %ebx, %eax +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq entry: %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a) + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() ret i32 %0 } -define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) { - ; SSE-LABEL: name: map2_or_vex - ; SSE: bb.0.entry: - ; SSE-NEXT: liveins: $rdi, $rsi - ; SSE-NEXT: {{ $}} - ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi - ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; SSE-NEXT: [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr) - ; SSE-NEXT: $xmm0 = COPY [[PABSBrm]] - ; SSE-NEXT: RET 0, $xmm0 - ; AVX-LABEL: name: map2_or_vex - ; AVX: bb.0.entry: - ; AVX-NEXT: 
liveins: $rdi, $rsi - ; AVX-NEXT: {{ $}} - ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi - ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; AVX-NEXT: [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr) - ; AVX-NEXT: $xmm0 = COPY [[VPABSBrm]] - ; AVX-NEXT: RET 0, $xmm0 +define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind { +; SSE-LABEL: map2_or_vex: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rsi, %rbx +; SSE-NEXT: movq %rdi, %r14 +; SSE-NEXT: #APP +; SSE-NEXT: nop +; SSE-NEXT: #NO_APP +; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX-LABEL: map2_or_vex: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rsi, %rbx +; AVX-NEXT: movq %rdi, %r14 +; AVX-NEXT: #APP +; AVX-NEXT: nop +; AVX-NEXT: #NO_APP +; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: retq entry: + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c %a = load <2 x i64>, ptr %add.ptr %0 = bitcast <2 x i64> %a to <16 x i8> diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll index 5fa4cb4c8826b..c193680607f76 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll @@ -1,17 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s -define dso_local void @amx(ptr noundef %data) { - ; CHECK-LABEL: name: amx - ; CHECK: bb.0.entry: - ; CHECK-NEXT: liveins: $rdi - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8 - ; CHECK-NEXT: PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: RET 0 - entry: +define dso_local void @amx(ptr noundef %data) nounwind { +; CHECK-LABEL: amx: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl $8, %eax +; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +entry: + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8) ret void } diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll index a9ca591a156c2..4692a58d095a6 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll @@ -1,17 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s -define void @x87(ptr %0, ptr %1) { - ; CHECK-LABEL: name: x87 - ; CHECK: bb.0 (%ir-block.2): - ; CHECK-NEXT: liveins: $rdi, $rsi - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0) - ; CHECK-NEXT: nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1) - ; CHECK-NEXT: RET 0 +define void @x87(ptr %0, ptr %1) nounwind { +; CHECK-LABEL: x87: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: flds (%r14) +; CHECK-NEXT: fstps (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %3 = load float, ptr %0 store float %3, ptr %1 ret void diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll index 86534427a9eae..f2025b5c8cbf8 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll @@ -1,70 +1,81 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s -; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr | FileCheck %s -define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xsave - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xsave: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xsave (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xsave(ptr, i32, i32) -define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xsave64 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx 
= COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xsave64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xsave64 (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xsave64(ptr, i32, i32) -define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xrstor - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xrstor: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xrstor (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo) ret void; } declare void @llvm.x86.xrstor(ptr, i32, i32) -define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) { - ; CHECK-LABEL: name: test_xrstor64 - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $rdi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi - ; CHECK-NEXT: $edx = COPY [[COPY1]] - ; CHECK-NEXT: $eax = COPY [[COPY]] - ; CHECK-NEXT: XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax - ; CHECK-NEXT: RET 0 +define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind { +; CHECK-LABEL: test_xrstor64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movl %edx, %r16d +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %r16d, %eax +; CHECK-NEXT: xrstor64 (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo) ret void; } diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index b4d40fee01e41..71887e369bd18 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx 
-; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB34_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl %edx, %ecx +; X64-NEXT: xorl %esi, %ecx ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: lock cmpxchgw %cx, (%rdi) ; X64-NEXT: # kill: def $ax killed $ax def $eax @@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %ecx ; X64-NEXT: movw $123, %ax -; X64-NEXT: testl %ecx, %edx +; X64-NEXT: testl %ecx, %esi ; X64-NEXT: je .LBB34_3 ; X64-NEXT: # %bb.4: # %return ; X64-NEXT: retq ; X64-NEXT: .LBB34_3: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq entry: @@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movl $-2, %r8d +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %r8d ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 @@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-NEXT: jne .LBB52_1 ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: testl %eax, %edx +; X64-NEXT: testl %eax, %esi ; X64-NEXT: je .LBB52_3 ; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq ; X64-NEXT: .LBB52_3: diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll index 105ee7f82ee79..e118f5dbc1534 100644 --- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll +++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll @@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx) ; CHECK-NEXT: setne %cl -; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: pinsrw $0, %edx, %xmm0 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1 ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB0_1 diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 79849a7153c91..d9b4635042256 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, 
%zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 0f2c75b15d5b4..01b7618753a23 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll index 77053e2c1bc98..4dd883a24f623 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) ; CHECK-LABEL: gather_qps: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: 
kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index df71e3c3afa5e..5ed91ea1eb872 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) { ; CHECK-LABEL: gather_qps: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 @@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) { ; CHECK-LABEL: gather_global: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, 
%xmm1, %xmm1 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll new file mode 100644 index 0000000000000..ca5f3192d7b97 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) + +; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ) +define <8 x float> @mask_v8i1_allones(ptr %ptr) { +; AVX512F-LABEL: mask_v8i1_allones: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v8i1_allones: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v8i1_allones: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v8i1_allones: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQBW-NEXT: retq + %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer) + ret <8 x float> %res +} + +; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ) +define <16 x float> @mask_v16i1_lower8(ptr %ptr) { +; AVX512F-LABEL: mask_v16i1_lower8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v16i1_lower8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v16i1_lower8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v16i1_lower8: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: retq + %res = call <16 x float> 
@llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer) + ret <16 x float> %res +} + +; Test case 3: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512-LABEL: gather_all: +; AVX512: # %bb.0: +; AVX512-NEXT: kxnorw %k0, %k0, %k1 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison) + ret <16 x float> %res +} + +; Test case 4: v8i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets) +define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_lower: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_lower: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_lower: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_lower: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison) + ret <16 x float> %res +} + +; Test case 5: v32i1 mask via bitconvert combined with dynamic condition. +; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle. 
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { +; AVX512F-LABEL: mask_v32i1_lower16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v32i1_lower16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v32i1_lower16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: kord %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v32i1_lower16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: kord %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i32 65535 to <32 x i1> + %mask1 = icmp sgt <32 x i16> %c, %d + %mask = or <32 x i1> %mask0, %mask1 + %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +; Test case 6: v64i1 mask via bitconvert combined with dynamic condition. +; Verifies the KSET1D submask pattern survives past SelectionDAG combines. 
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { +; AVX512F-LABEL: mask_v64i1_lower32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v64i1_lower32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v64i1_lower32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; AVX512BW-NEXT: kmovq %rax, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: korq %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v64i1_lower32: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: korq %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i64 4294967295 to <64 x i1> + %mask1 = icmp sgt <64 x i8> %c, %d + %mask = or <64 x i1> %mask0, %mask1 + %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b + ret <64 x i8> %res +} + diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll new file mode 100644 index 0000000000000..293b48d7dc5dd --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll @@ -0,0 +1,39 @@ +;; BB section test with basic block hashes. 
+ +;; basic block sections Profile with bb hashes +; RUN: echo 'v1' > %t +; RUN: echo 'f foo' >> %t +; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t +; RUN: echo 'c 0 2 3' >> %t +; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s +; +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK: callq baz +; CHECK: retq +; CHECK: .section .text.split.foo,"ax",@progbits +; CHECK: callq bar diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll index 751ab76722c07..eb0a14b2820b4 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll @@ -69,6 +69,20 @@ ; RUN: echo 'g 0:4,1:2:3' >> %t15 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15 ; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3' +; RUN: echo 'v1' > %t16 +; RUN: echo 'f dummy1' >> %t16 +; RUN: echo 'c 0 1' >> %t16 +; RUN: echo 'g 0:4,1:2' >> %t16 +; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16 +; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a' +; RUN: echo 'v1' > %t17 +; RUN: echo 'f dummy1' >> %t17 +; RUN: echo 'c 0 1' >> %t17 +; RUN: echo 'g 0:4,1:2' >> %t17 +; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17 +; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g' define i32 @dummy1(i32 %x, i32 %y, i32 %z) { diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll index 45ef452f4f5c1..d652a540f3e9c 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll @@ -1,17 +1,13 @@ -;; Check the basic block sections list option. -;; version 0 profile: -; RUN: echo '!_Z3foob' > %t1 +;; Check that specifying the function in the basic block sections profile +;; without any other directives is a noop. 
;; -;; version 1 profile: -; RUN: echo 'v1' > %t2 -; RUN: echo 'f _Z3foob' >> %t2 +;; Specify the bb sections profile: +; RUN: echo 'v1' > %t +; RUN: echo 'f _Z3foob' >> %t ;; -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %bbsections +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig +; RUN: diff -u %orig %bbsections define i32 @_Z3foob(i1 zeroext %0) nounwind { %2 = alloca i32, align 4 @@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind { declare i32 @_Z3barv() #1 declare i32 @_Z3bazv() #1 - -define i32 @_Z3zipb(i1 zeroext %0) nounwind { - %2 = alloca i32, align 4 - %3 = alloca i8, align 1 - %4 = zext i1 %0 to i8 - store i8 %4, ptr %3, align 1 - %5 = load i8, ptr %3, align 1 - %6 = trunc i8 %5 to i1 - %7 = zext i1 %6 to i32 - %8 = icmp sgt i32 %7, 0 - br i1 %8, label %9, label %11 - -9: ; preds = %1 - %10 = call i32 @_Z3barv() - store i32 %10, ptr %2, align 4 - br label %13 - -11: ; preds = %1 - %12 = call i32 @_Z3bazv() - store i32 %12, ptr %2, align 4 - br label %13 - -13: ; preds = %11, %9 - %14 = load i32, ptr %2, align 4 - ret i32 %14 -} - -; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits -; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits -; LINUX-SECTIONS: _Z3foob: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.1: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.2: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.3: - -; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits -; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits -; LINUX-SECTIONS: _Z3zipb: -; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits -; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}: diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll index d481b147662dc..6e0db20ca0492 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll @@ -1,6 +1,8 @@ -; RUN: echo "!foo" > %t.order.txt -; RUN: 
llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s +; RUN: echo "v1" > %t +; RUN: echo "f foo" >> %t +; RUN: echo "c 0" >> %t +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 { br i1 %0, label %5, label %3 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..fae1ff90dd8d5 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: bitcast_v16i8_to_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: movl %eax, %ecx ; AVX12-NEXT: shrl $8, %eax ; AVX12-NEXT: addb %cl, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: bitcast_v16i16_to_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: 
vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 13149d78b16fb..330c978d2a9f7 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512POPCNT ; ; CTPOP @@ -712,23 +712,15 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq 24(%rdi), %rsi -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: lzcntq %rdx, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: lzcntq %rcx, %r9 -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vplzcntq %ymm0, %ymm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) @@ -845,47 +837,28 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; ; AVX512-LABEL: test_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: lzcntq %r11, %rax -; AVX512-NEXT: lzcntq %r10, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %r9, %rax -; AVX512-NEXT: 
lzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %rcx, %rax -; AVX512-NEXT: lzcntq %rdx, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rsi, %r15 -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r15d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 @@ -1010,50 +983,16 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: movq 24(%rdi), %r10 -; AVX512-NEXT: movq 32(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %rdx -; AVX512-NEXT: movq 48(%rdi), %rsi -; AVX512-NEXT: movq 56(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rax -; AVX512-NEXT: lzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: lzcntq %rcx, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: lzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: lzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rcx 
-; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) @@ -1807,182 +1746,190 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { } ; -; CTTZ +; CTLZ_ZERO_UNDEF ; -define i32 @test_cttz_i128(i128 %a0) nounwind { -; SSE-LABEL: test_cttz_i128: +define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i128: ; SSE: # %bb.0: -; SSE-NEXT: rep bsfq %rdi, %rcx -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq %rsi, %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i128: +; AVX2-LABEL: test_ctlz_undef_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: tzcntq %rdi, %rcx -; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: lzcntq %rdi, %rax ; AVX2-NEXT: addl $64, %eax -; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: cmovnel %ecx, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i128: +; AVX512-LABEL: test_ctlz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: tzcntq %rdi, %rcx -; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: lzcntq %rsi, %rcx +; AVX512-NEXT: lzcntq %rdi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: testq %rsi, %rsi ; AVX512-NEXT: cmovnel %ecx, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq - %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) %res = trunc i128 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i128(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i128: +define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i128: ; SSE: # %bb.0: -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: rep bsfq %rcx, %rdx -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq 8(%rdi), %rax -; SSE-NEXT: addl $64, %eax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax ; SSE-NEXT: testq %rcx, %rcx ; SSE-NEXT: cmovnel %edx, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i128: +; AVX2-LABEL: load_ctlz_undef_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: tzcntq %rcx, %rdx -; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq (%rdi), %rax ; AVX2-NEXT: addl $64, %eax ; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: cmovnel %edx, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i128: +; 
AVX512-LABEL: load_ctlz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: tzcntq %rcx, %rdx -; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %rcx +; AVX512-NEXT: lzcntq %rcx, %rdx +; AVX512-NEXT: lzcntq (%rdi), %rax ; AVX512-NEXT: addl $64, %eax ; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: cmovnel %edx, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq %a0 = load i128, ptr %p0 - %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) %res = trunc i128 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i256(i256 %a0) nounwind { -; SSE-LABEL: test_cttz_i256: +define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i256: ; SSE: # %bb.0: -; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %rsi, %r8 -; SSE-NEXT: addl $64, %r8d -; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rcx, %rcx ; SSE-NEXT: cmovnel %eax, %r8d -; SSE-NEXT: rep bsfq %rdx, %r9 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq %rcx, %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: bsrq %rsi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %r9d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: cmovnel %r8d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i256: +; AVX2-LABEL: test_ctlz_undef_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: tzcntq %rdi, %rax -; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: lzcntq %rdx, %r8 ; AVX2-NEXT: addl $64, %r8d -; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: testq %rcx, %rcx ; AVX2-NEXT: cmovnel %eax, %r8d -; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: lzcntq %rsi, %r9 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: lzcntq %rdi, %rax ; AVX2-NEXT: addl $64, %eax -; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: testq %rsi, %rsi ; AVX2-NEXT: cmovnel %r9d, %eax ; AVX2-NEXT: subl $-128, %eax -; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: cmovnel %r8d, %eax ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i256: +; AVX512-LABEL: test_ctlz_undef_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: lzcntq %rcx, %rax +; AVX512-NEXT: lzcntq %rdx, %r8 ; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rdx, %r9 -; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: lzcntq %rsi, %r9 +; AVX512-NEXT: lzcntq %rdi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: testq %rsi, %rsi ; AVX512-NEXT: cmovnel %r9d, %eax ; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: orq %rcx, %rdx ; AVX512-NEXT: cmovnel %r8d, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: retq - %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) %res = trunc i256 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i256(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i256: +define i32 @load_ctlz_undef_i256(ptr 
%p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i256: ; SSE: # %bb.0: +; SSE-NEXT: movq 8(%rdi), %rdx ; SSE-NEXT: movq 16(%rdi), %rcx -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq 8(%rdi), %rsi -; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %rsi, %r8 -; SSE-NEXT: addl $64, %r8d -; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: movq 24(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi ; SSE-NEXT: cmovnel %eax, %r8d -; SSE-NEXT: rep bsfq %rcx, %r9 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq 24(%rdi), %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: bsrq %rdx, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %r9d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: orq %rsi, %rcx ; SSE-NEXT: cmovnel %r8d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i256: +; AVX2-LABEL: load_ctlz_undef_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: movq 8(%rdi), %rdx -; AVX2-NEXT: tzcntq %rcx, %rax -; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: movq 16(%rdi), %rcx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: lzcntq %rcx, %rsi ; AVX2-NEXT: addl $64, %esi -; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: testq %rdx, %rdx ; AVX2-NEXT: cmovnel %eax, %esi -; AVX2-NEXT: movq 16(%rdi), %r8 -; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %r9 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: lzcntq (%rdi), %rax ; AVX2-NEXT: addl $64, %eax ; AVX2-NEXT: testq %r8, %r8 ; AVX2-NEXT: cmovnel %r9d, %eax @@ -1992,77 +1939,2522 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i256: +; AVX512-LABEL: load_ctlz_undef_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: movq 16(%rdi), %rcx -; AVX512-NEXT: movq (%rdi), %rdx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rcx, %r9 -; AVX512-NEXT: tzcntq 24(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 - %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) %res = trunc i256 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i512(i512 %a0) nounwind { -; SSE-LABEL: test_cttz_i512: +define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i512: ; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %rsi, %r11 -; SSE-NEXT: 
addl $64, %r11d -; SSE-NEXT: testq %rdi, %rdi -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %rcx, %r10 -; SSE-NEXT: addl $64, %r10d -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %eax, %r10d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; SSE-NEXT: subl $-128, %r10d -; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: cmovnel %r11d, %r10d -; SSE-NEXT: rep bsfq %r8, %rax -; SSE-NEXT: rep bsfq %r9, %r11 -; SSE-NEXT: addl $64, %r11d -; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: rep bsfq %rbx, %r14 -; SSE-NEXT: movl $64, %eax -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %rbx, %rbx -; SSE-NEXT: cmovnel %r14d, %eax -; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: cmovnel %r11d, %eax -; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rdx, %rdi -; SSE-NEXT: orq %rsi, %rdi -; SSE-NEXT: cmovnel %r10d, %eax -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_cttz_i512: +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r8, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: orq %r11, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rsi, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: lzcntq %r11, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r10, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %r8, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: subl $-128, %ebx +; AVX2-NEXT: movq %r10, %rax +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: cmovnel %r14d, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %rdx, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d 
+; AVX2-NEXT: lzcntq %rsi, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovq %r8, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: movq 16(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r10 +; SSE-NEXT: movq 32(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %rdx +; SSE-NEXT: movq 48(%rdi), %rsi +; SSE-NEXT: movq 56(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %r10, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r9, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r11, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; 
AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 8(%rdi), %r10 +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: movq 32(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 48(%rdi), %rsi +; AVX2-NEXT: movq 56(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: lzcntq %rcx, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 24(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: lzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_ctlz_undef_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r12 +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %rdx, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq %rsi, %rbx +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: cmovnel %ecx, %eax +; 
SSE-NEXT: bsrq %r15, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %r13, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %r15, %r15 +; SSE-NEXT: cmovnel %ecx, %esi +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: bsrq %r9, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: orq %r15, %rcx +; SSE-NEXT: cmovnel %esi, %ebp +; SSE-NEXT: addl $256, %ebp # imm = 0x100 +; SSE-NEXT: orq %r11, %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: bsrq %rdx, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r12, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r12, %r12 +; SSE-NEXT: cmovnel %esi, %ecx +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: movq %r8, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: bsrq %r8, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE-NEXT: bsrq %rdi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rbx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r10 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: orq %r15, %r14 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: orq %r13, %r9 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r14, %r9 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl 
%ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %r8, %r9 +; AVX2-NEXT: addl $64, %r9d +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %r9d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r10, %rsi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rax, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %r8, %rsi +; AVX2-NEXT: orq %r12, %rsi +; AVX2-NEXT: cmovnel %r9d, %ecx +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rbx, %rdi +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r15, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %edi, %esi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %r9, %rdi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %edi, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r15, %rdi +; AVX2-NEXT: orq %rbx, %rdi +; AVX2-NEXT: cmovnel %esi, %ebp +; AVX2-NEXT: addl $256, %ebp # imm = 0x100 +; AVX2-NEXT: movq %r10, %rdi +; AVX2-NEXT: orq %r12, %rdi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r11, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r14, %rsi +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: orq %r12, %rsi +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: lzcntq %rdx, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r10, %rax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: lzcntq %rsi, %r8 +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: orq %r12, %r14 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r11 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq %r15, %r13 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: 
pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq %r9, %r14 +; AVX512-NEXT: movq %r8, %r11 +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: lzcntq %r12, %rcx +; AVX512-NEXT: lzcntq %r8, %r9 +; AVX512-NEXT: addl $64, %r9d +; AVX512-NEXT: testq %r12, %r12 +; AVX512-NEXT: cmovnel %ecx, %r9d +; AVX512-NEXT: lzcntq %r10, %rsi +; AVX512-NEXT: lzcntq %rax, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %esi, %ecx +; AVX512-NEXT: subl $-128, %ecx +; AVX512-NEXT: movq %r8, %rsi +; AVX512-NEXT: orq %r12, %rsi +; AVX512-NEXT: cmovnel %r9d, %ecx +; AVX512-NEXT: lzcntq %rbx, %rdi +; AVX512-NEXT: lzcntq %r15, %rsi +; AVX512-NEXT: addl $64, %esi +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %edi, %esi +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: lzcntq %r13, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: lzcntq %r9, %rdi +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: cmovnel %edi, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %r15, %rdi +; AVX512-NEXT: orq %rbx, %rdi +; AVX512-NEXT: cmovnel %esi, %ebp +; AVX512-NEXT: addl $256, %ebp # imm = 0x100 +; AVX512-NEXT: movq %r10, %rdi +; AVX512-NEXT: orq %r12, %rdi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rdi, %rsi +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: lzcntq %rdi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: lzcntq %r12, %rcx +; AVX512-NEXT: testq %r12, %r12 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %r11, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: lzcntq %r14, %rsi +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovnel %esi, %ecx +; AVX512-NEXT: subl $-128, %ecx +; AVX512-NEXT: movq %rdi, %rsi +; AVX512-NEXT: orq %r12, %rsi +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq %rdx, %rdi +; AVX512-NEXT: lzcntq %rdx, %rdx +; AVX512-NEXT: addl $64, %edx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: lzcntq %r10, %rax +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %eax, %edx +; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: lzcntq %rsi, %r8 +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r10, %rdi +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: orq %r12, %r14 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r14, %r11 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512-NEXT: orq %rbx, %r9 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: orq %r15, %r13 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r9, 
%r13 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 40(%rdi), %rbp +; SSE-NEXT: movq 64(%rdi), %rbx +; SSE-NEXT: movq 72(%rdi), %r11 +; SSE-NEXT: movq 80(%rdi), %r12 +; SSE-NEXT: movq 88(%rdi), %r14 +; SSE-NEXT: movq 96(%rdi), %r13 +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: movq 112(%rdi), %r10 +; SSE-NEXT: movq 120(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %r9, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r10, %rdx +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq %r12, %rsi +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %edx +; SSE-NEXT: bsrq %r11, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rbx, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: orl $64, %r15d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %ecx, %r15d +; SSE-NEXT: movq 48(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rcx +; SSE-NEXT: orq %r14, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r9, %rcx +; SSE-NEXT: orq %r8, %rcx +; SSE-NEXT: movq %r13, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: movq 56(%rdi), %r13 +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq %rbp, %r10 +; SSE-NEXT: bsrq %rbp, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 32(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r12, %rax +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: cmovnel %edx, %ebp +; SSE-NEXT: movq 24(%rdi), %r9 +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 16(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: bsrq %rdx, %rdi 
+; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rsi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq %r13, %r10 +; SSE-NEXT: orq %r12, %r8 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %rbx +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %rbx +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r9 +; AVX2-NEXT: movq 56(%rdi), %rbp +; AVX2-NEXT: movq 64(%rdi), %r11 +; AVX2-NEXT: movq 72(%rdi), %r10 +; AVX2-NEXT: movq 80(%rdi), %r14 +; AVX2-NEXT: movq 88(%rdi), %rbx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: movq 104(%rdi), %r8 +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: movq 120(%rdi), %r15 +; AVX2-NEXT: lzcntq %r15, %rax +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r8, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rsi, %r12 +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: orq %r15, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbx, %rcx +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: lzcntq %r14, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r10, %rcx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r11, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %r14, %rcx +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r8, %rcx +; AVX2-NEXT: orq %r15, %rcx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: movq %rbp, %r14 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbp, %rcx +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbp, %rbp +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: movq 32(%rdi), %r13 +; AVX2-NEXT: xorl %ebp, %ebp 
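+; NOTE (editor's aside): the xorl zero-idioms ahead of lzcntq in these AVX2
+; checks do not feed the count; they break the false dependency that
+; lzcnt/tzcnt/popcnt carry on their destination register on several Intel
+; cores. The AVX512 run line does not schedule them, presumably because its
+; tuning does not model that dependency.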
+; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq 40(%rdi), %r8 +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: lzcntq %r8, %rdx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %edx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r9, %rdx +; AVX2-NEXT: orq %r14, %rdx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %r9 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %r14, %r8 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq %r15, %rbx +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r10, %r11 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_ctlz_undef_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq 32(%rdi), %r14 +; AVX512-NEXT: movq 48(%rdi), %rbp +; AVX512-NEXT: movq 64(%rdi), %r11 +; AVX512-NEXT: movq 72(%rdi), %r10 +; AVX512-NEXT: movq 80(%rdi), %rdx +; AVX512-NEXT: movq 88(%rdi), %rbx +; AVX512-NEXT: movq 96(%rdi), %rsi +; AVX512-NEXT: movq 104(%rdi), %r9 +; AVX512-NEXT: movq 112(%rdi), %r8 +; AVX512-NEXT: movq 120(%rdi), %r15 +; AVX512-NEXT: lzcntq %r15, %rax +; AVX512-NEXT: lzcntq %r8, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: testq %r15, %r15 +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: lzcntq %r9, %r12 +; AVX512-NEXT: lzcntq %rsi, %rax +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %r8, %r12 +; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: orq %r15, %r12 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %rbx, %rcx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: lzcntq %rdx, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %ecx, %r13d +; AVX512-NEXT: lzcntq %r10, %rcx +; AVX512-NEXT: lzcntq %r11, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %ecx, %r12d +; AVX512-NEXT: subl $-128, %r12d +; AVX512-NEXT: movq %rdx, %rcx +; 
AVX512-NEXT: orq %rbx, %rcx +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r9, %rcx +; AVX512-NEXT: orq %r15, %rcx +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rcx, %rsi +; AVX512-NEXT: movq 56(%rdi), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: lzcntq %r13, %rcx +; AVX512-NEXT: movq %rbp, %rsi +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: lzcntq %rbp, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: lzcntq %r14, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: lzcntq %r8, %rdx +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %edx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: orq %r13, %rdx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq 16(%rdi), %r9 +; AVX512-NEXT: lzcntq %r9, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq 24(%rdi), %rdx +; AVX512-NEXT: lzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq 8(%rdi), %rsi +; AVX512-NEXT: lzcntq (%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: lzcntq %rsi, %rdi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rdx, %r9 +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq %r13, %r8 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: orq %r15, %rbx +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX512-NEXT: orq %rbx, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r10, %r11 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTTZ +; + +define i32 @test_cttz_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rcx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rsi, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rcx +; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rcx +; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to 
i32 + ret i32 %res +} + +define i32 @load_cttz_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 8(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rdx, %r9 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: tzcntq %rdx, %r9 +; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rcx, %r9 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 24(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r9d, %eax 
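+; NOTE (editor's aside, not generated checks): "rep bsfq" is the TZCNT
+; encoding (an F3-prefixed BSF); without BMI it executes as plain BSF, which
+; matches TZCNT for any nonzero source. The "movl $64, %eax" preload covers
+; the zero case: TZCNT writes 64 itself, while BSF on the CPUs targeted here
+; leaves the destination unchanged, so %eax keeps the preloaded 64 either
+; way. A hedged IR sketch of one i128 cttz combine step (hypothetical
+; %lo/%hi limbs, not names from this test):
+;   %loz  = icmp eq i64 %lo, 0
+;   %ttzl = call i64 @llvm.cttz.i64(i64 %lo, i1 true)
+;   %ttzh = call i64 @llvm.cttz.i64(i64 %hi, i1 true)
+;   %hi64 = add i64 %ttzh, 64
+;   %cnt  = select i1 %loz, i64 %hi64, i64 %ttzl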
+; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rdx +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i512(i512 %a0) nounwind { +; SSE-LABEL: test_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rcx, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: subl $-128, %r10d +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rbx, %r14 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rdx, %rdi +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r10d, 
%eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %rcx, %r10 +; AVX2-NEXT: addl $64, %r10d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r10d +; AVX2-NEXT: subl $-128, %r10d +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %r11d, %r10d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r11, %r14 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r10d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: 
vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 48(%rdi), %r10 +; SSE-NEXT: movq 40(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdx +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: rep bsfq %rsi, %rbx +; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %r8, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: subl $-128, %r11d +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %ebx, %r11d +; SSE-NEXT: rep bsfq %r14, %rax +; SSE-NEXT: rep bsfq %r9, %rbx +; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: rep bsfq %r10, %r15 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 56(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r14 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r10 +; AVX2-NEXT: movq 40(%rdi), %r9 +; AVX2-NEXT: movq 24(%rdi), %r8 +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 32(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 56(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rbx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq 
%r8, %rsi +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_cttz_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: movq %rcx, %rbx +; SSE-NEXT: movq %rdx, %r10 +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: rep bsfq %r10, %r12 +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rdi, %r12 +; SSE-NEXT: orq %r9, %r12 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: rep bsfq %r8, %r15 +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r13, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r15d, %r13d +; SSE-NEXT: rep bsfq %rdx, %r12 +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r12d, %r15d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %r8, %rbp +; SSE-NEXT: orq %rcx, %rbp +; SSE-NEXT: cmovnel %r13d, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: orq %rbx, %r13 +; SSE-NEXT: movq %rdi, %rbp +; SSE-NEXT: orq %r10, %rbp +; SSE-NEXT: orq %r13, %rbp +; SSE-NEXT: cmovnel %eax, 
%r15d +; SSE-NEXT: rep bsfq %r11, %r13 +; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: rep bsfq %rsi, %rcx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %ecx, %r13d +; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: movq %r11, %rcx +; SSE-NEXT: orq %r12, %rcx +; SSE-NEXT: cmovnel %eax, %r13d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE-NEXT: rep bsfq %rbp, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: rep bsfq %r8, %rsi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rbp, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r11 +; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %r9 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: orq %r14, %rdi +; SSE-NEXT: orq %r10, %rdi +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r9, %rdi +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %rbx +; AVX2-NEXT: movq %r8, %r14 +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rsi, %r9 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r9, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r10, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r11, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rdi, %r12 +; AVX2-NEXT: orq %r9, %r12 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r14, %r15 +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %r15d, %r12d +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rcx, %r13 +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %rdx, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r15d +; AVX2-NEXT: subl $-128, %r15d +; AVX2-NEXT: movq %r14, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: cmovnel %r12d, %r15d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %r15d # imm = 0x100 +; AVX2-NEXT: movq %r9, %r13 +; 
AVX2-NEXT: orq %r11, %r13 +; AVX2-NEXT: movq %rdi, %rbp +; AVX2-NEXT: orq %r10, %rbp +; AVX2-NEXT: orq %r13, %rbp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: tzcntq %r12, %rbp +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r13, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: tzcntq %r8, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rsi, %rcx +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r12, %rcx +; AVX2-NEXT: orq %r13, %rcx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rbx, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: tzcntq %r8, %rsi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r13, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq %r14, %rdi +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %rdi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq %r9, %r14 +; AVX512-NEXT: movq %r8, %r15 +; AVX512-NEXT: movq %rcx, %r11 +; AVX512-NEXT: movq %rdx, %r10 +; AVX512-NEXT: movq %rsi, %r9 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %r9, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %rdx, %r13 +; AVX512-NEXT: tzcntq %r11, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r13d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %rdi, %r13 +; AVX512-NEXT: orq %r9, %r13 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: tzcntq %r8, %r12 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: tzcntq %r14, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %r12d, %r13d +; AVX512-NEXT: tzcntq %rcx, %rbp +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %ebp, %r12d +; AVX512-NEXT: subl $-128, %r12d +; 
AVX512-NEXT: movq %r8, %rbp +; AVX512-NEXT: orq %r14, %rbp +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r9, %r13 +; AVX512-NEXT: orq %r11, %r13 +; AVX512-NEXT: movq %rdi, %rbp +; AVX512-NEXT: orq %rdx, %rbp +; AVX512-NEXT: orq %r13, %rbp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %rbx, %rbp +; AVX512-NEXT: tzcntq %r13, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: tzcntq %rsi, %rcx +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %rbx, %rcx +; AVX512-NEXT: orq %r13, %rcx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: tzcntq %r14, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: tzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512-NEXT: tzcntq %r8, %rsi +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %esi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r13, %rbx +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: orq %r11, %r9 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: orq %r15, %rdi +; AVX512-NEXT: orq %r10, %rdi +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r9, %rdi +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 88(%rdi), %r10 +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 40(%rdi), %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 24(%rdi), %r9 +; SSE-NEXT: movq 16(%rdi), %r15 +; SSE-NEXT: movq (%rdi), %r8 +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r11, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: rep bsfq %r15, %rbx +; SSE-NEXT: rep bsfq %r9, %rax +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r15, %r15 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: orq %r11, %r14 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: rep bsfq %rbx, %rdx +; 
SSE-NEXT: rep bsfq %rsi, %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %edx, %r12d +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r13, %rdx +; SSE-NEXT: rep bsfq %rcx, %r14 +; SSE-NEXT: addl $64, %r14d +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %edx, %r14d +; SSE-NEXT: subl $-128, %r14d +; SSE-NEXT: movq %rbx, %rdx +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r12d, %r14d +; SSE-NEXT: movq 72(%rdi), %r12 +; SSE-NEXT: addl $256, %r14d # imm = 0x100 +; SSE-NEXT: movq %r11, %rdx +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: movq %r8, %r13 +; SSE-NEXT: orq %r15, %r13 +; SSE-NEXT: orq %rdx, %r13 +; SSE-NEXT: movq 64(%rdi), %r13 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: rep bsfq %r13, %rdx +; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: rep bsfq %r10, %rbp +; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: movq 80(%rdi), %r10 +; SSE-NEXT: rep bsfq %r10, %rcx +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: orq %r12, %rcx +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: rep bsfq %r9, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq 96(%rdi), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq 120(%rdi), %rax +; SSE-NEXT: movq 112(%rdi), %rdi +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: rep bsfq %rdi, %rsi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; SSE-NEXT: orq %r10, %r13 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %r11 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %r8 +; SSE-NEXT: orq %r15, %r8 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 72(%rdi), %r14 +; AVX2-NEXT: movq 64(%rdi), %r15 +; AVX2-NEXT: movq 56(%rdi), %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 32(%rdi), %rsi +; AVX2-NEXT: movq 24(%rdi), %rbp +; AVX2-NEXT: movq 16(%rdi), %rbx +; AVX2-NEXT: movq (%rdi), %r8 +; AVX2-NEXT: movq 8(%rdi), %r11 +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: tzcntq %r11, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: 
xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbp, %rax +; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %r8, %r12 +; AVX2-NEXT: orq %r11, %r12 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rsi, %rdx +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %r10, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %edx, %r13d +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r9, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %rsi, %rdx +; AVX2-NEXT: orq %r10, %rdx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r11, %rdx +; AVX2-NEXT: orq %rbp, %rdx +; AVX2-NEXT: movq %r8, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: orq %rdx, %r13 +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %r15, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r14, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: movq 88(%rdi), %rbp +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rbp, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: movq 80(%rdi), %r10 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r10, %rcx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: subl $-128, %r13d +; AVX2-NEXT: movq %r15, %rcx +; AVX2-NEXT: orq %r14, %rcx +; AVX2-NEXT: cmovnel %eax, %r13d +; AVX2-NEXT: movq 104(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 120(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %rbp, %r14 +; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r15 +; AVX2-NEXT: cmovnel %r13d, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r8 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_i1024: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; 
AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq 88(%rdi), %rbp +; AVX512-NEXT: movq 72(%rdi), %r15 +; AVX512-NEXT: movq 56(%rdi), %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %rcx +; AVX512-NEXT: movq 40(%rdi), %r10 +; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 32(%rdi), %rsi +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq 16(%rdi), %rbx +; AVX512-NEXT: movq (%rdi), %r8 +; AVX512-NEXT: movq 8(%rdi), %r11 +; AVX512-NEXT: tzcntq %r8, %rax +; AVX512-NEXT: tzcntq %r11, %rdx +; AVX512-NEXT: addl $64, %edx +; AVX512-NEXT: testq %r8, %r8 +; AVX512-NEXT: cmovnel %eax, %edx +; AVX512-NEXT: tzcntq %rbx, %r12 +; AVX512-NEXT: tzcntq %r14, %rax +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rbx, %rbx +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: movq %r8, %r12 +; AVX512-NEXT: orq %r11, %r12 +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: tzcntq %rsi, %rdx +; AVX512-NEXT: tzcntq %r10, %r13 +; AVX512-NEXT: addl $64, %r13d +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovnel %edx, %r13d +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %r9, %r12 +; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %r12d +; AVX512-NEXT: subl $-128, %r12d +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: orq %r10, %rdx +; AVX512-NEXT: cmovnel %r13d, %r12d +; AVX512-NEXT: addl $256, %r12d # imm = 0x100 +; AVX512-NEXT: movq %r11, %rdx +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: movq %r8, %r13 +; AVX512-NEXT: orq %rbx, %r13 +; AVX512-NEXT: orq %rdx, %r13 +; AVX512-NEXT: movq 64(%rdi), %r13 +; AVX512-NEXT: cmovnel %eax, %r12d +; AVX512-NEXT: tzcntq %r13, %rdx +; AVX512-NEXT: tzcntq %r15, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: movq %rbp, %r14 +; AVX512-NEXT: tzcntq %rbp, %rbp +; AVX512-NEXT: addl $64, %ebp +; AVX512-NEXT: movq 80(%rdi), %r10 +; AVX512-NEXT: tzcntq %r10, %rcx +; AVX512-NEXT: testq %r10, %r10 +; AVX512-NEXT: cmovnel %ecx, %ebp +; AVX512-NEXT: subl $-128, %ebp +; AVX512-NEXT: movq %r13, %rcx +; AVX512-NEXT: orq %r15, %rcx +; AVX512-NEXT: cmovnel %eax, %ebp +; AVX512-NEXT: movq 104(%rdi), %r9 +; AVX512-NEXT: tzcntq %r9, %rcx +; AVX512-NEXT: addl $64, %ecx +; AVX512-NEXT: movq 96(%rdi), %rdx +; AVX512-NEXT: tzcntq %rdx, %rax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %eax, %ecx +; AVX512-NEXT: movq 112(%rdi), %rsi +; AVX512-NEXT: tzcntq 120(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: tzcntq %rsi, %rdi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %r9, %rdx +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: orq %r14, %r15 +; AVX512-NEXT: orq %r10, %r13 +; AVX512-NEXT: addl $256, %eax # imm = 0x100 +; AVX512-NEXT: orq %r15, %r13 +; AVX512-NEXT: cmovnel %ebp, %eax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: orq %rbx, %r8 +; AVX512-NEXT: addl $512, %eax # imm = 0x200 +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTTZ_ZERO_UNDEF +; + +define i32 @test_cttz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rcx +; SSE-NEXT: rep bsfq %rsi, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rcx +; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rcx +; AVX512-NEXT: tzcntq %rsi, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq 8(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rdx, %r9 +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdi, 
%rdi +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: tzcntq %rdx, %r9 +; AVX512-NEXT: tzcntq %rcx, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: subl $-128, %eax +; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: cmovnel %r8d, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rcx, %r9 +; SSE-NEXT: rep bsfq 24(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rdx +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq %rdx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, 
%rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rcx, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: subl $-128, %r10d +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: rep bsfq %rbx, %r14 +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rdx, %rdi +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx @@ -2109,105 +4501,113 @@ define i32 @test_cttz_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rcx, %r10 -; AVX512-NEXT: addl $64, %r10d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r10d -; AVX512-NEXT: subl $-128, %r10d -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %ebx, %r10d -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r9, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: tzcntq %r11, %r14 -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: orq %rdx, %rdi -; AVX512-NEXT: orq %rsi, %rdi -; AVX512-NEXT: cmovnel %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq - %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) +; AVX512F-LABEL: test_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; 
AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i512(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i512: +define i32 @load_cttz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %r15 ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq 48(%rdi), %r10 ; SSE-NEXT: movq 40(%rdi), %r9 ; SSE-NEXT: movq 24(%rdi), %r8 ; SSE-NEXT: movq 16(%rdi), %rdx ; SSE-NEXT: movq (%rdi), %rcx ; SSE-NEXT: movq 8(%rdi), %rsi ; SSE-NEXT: rep bsfq %rcx, %rax -; SSE-NEXT: rep bsfq %rsi, %rbx -; SSE-NEXT: addl $64, %ebx +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d ; SSE-NEXT: testq %rcx, %rcx -; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: cmovnel %eax, %r11d ; SSE-NEXT: rep bsfq %rdx, %rax -; SSE-NEXT: rep bsfq %r8, %r11 -; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: rep bsfq %r8, %r10 +; SSE-NEXT: addl $64, %r10d ; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %eax, %r11d -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: subl $-128, %r11d +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: subl $-128, %r10d ; SSE-NEXT: movq %rcx, %rax ; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: cmovnel %ebx, %r11d -; SSE-NEXT: rep bsfq %r14, %rax -; SSE-NEXT: rep bsfq %r9, %rbx -; SSE-NEXT: addl $64, %ebx -; SSE-NEXT: testq %r14, %r14 -; SSE-NEXT: cmovnel %eax, %ebx -; SSE-NEXT: rep bsfq %r10, %r15 -; SSE-NEXT: movl $64, %eax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %rbx, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: 
testq %rbx, %rbx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq 48(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 ; SSE-NEXT: rep bsfq 56(%rdi), %rax ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: testq %r14, %r14 ; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: orq %r9, %rbx +; SSE-NEXT: cmovnel %r11d, %eax ; SSE-NEXT: addl $256, %eax # imm = 0x100 ; SSE-NEXT: orq %r8, %rsi ; SSE-NEXT: orq %rdx, %rcx ; SSE-NEXT: orq %rsi, %rcx -; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: cmovnel %r10d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r14 ; SSE-NEXT: popq %r15 ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i512: +; AVX2-LABEL: load_cttz_undef_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 @@ -2263,61 +4663,44 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 48(%rdi), %r11 -; AVX512-NEXT: movq 40(%rdi), %r9 -; AVX512-NEXT: movq 32(%rdi), %r10 -; AVX512-NEXT: movq 24(%rdi), %r8 -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rcx, %rax -; AVX512-NEXT: tzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: tzcntq %r10, %rax -; AVX512-NEXT: tzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq 56(%rdi), %rax -; AVX512-NEXT: tzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 - %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) %res = trunc i512 %cnt to i32 ret i32 %res } -define i32 @test_cttz_i1024(i1024 %a0) nounwind { -; SSE-LABEL: test_cttz_i1024: +define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -2325,74 +4708,72 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq %r9, %r13 -; SSE-NEXT: movq %r8, %r14 +; SSE-NEXT: movq %r9, %r14 ; SSE-NEXT: movq %rcx, %rbx ; SSE-NEXT: movq %rdx, %r10 ; SSE-NEXT: movq %rsi, %r9 -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: rep bsfq %rdi, %rax -; SSE-NEXT: rep bsfq %r9, %r15 -; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: rep bsfq %rsi, %r12 +; SSE-NEXT: addl $64, %r12d ; SSE-NEXT: testq %rdi, %rdi -; SSE-NEXT: cmovnel %eax, %r15d -; SSE-NEXT: rep bsfq %r10, %r12 -; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r10, %r15 +; SSE-NEXT: rep bsfq %rbx, %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: testq %r10, %r10 -; SSE-NEXT: cmovnel %r12d, %eax -; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: movq %rdi, %r12 -; SSE-NEXT: orq %r9, %r12 ; SSE-NEXT: cmovnel %r15d, %eax -; SSE-NEXT: rep bsfq %r8, %r15 -; SSE-NEXT: movq %r13, %rcx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: rep bsfq %r13, %r13 +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rdi, %r13 +; SSE-NEXT: orq %rsi, %r13 +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: rep bsfq %r8, %r12 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r14, %r13 ; SSE-NEXT: addl $64, %r13d ; SSE-NEXT: testq %r8, %r8 -; SSE-NEXT: cmovnel %r15d, %r13d -; SSE-NEXT: rep bsfq %rdx, %r12 -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r15 -; SSE-NEXT: addl $64, %r15d -; SSE-NEXT: testq %rdx, %rdx -; SSE-NEXT: cmovnel %r12d, %r15d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: cmovnel %r12d, %r13d +; SSE-NEXT: rep bsfq %rcx, %rbp +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %ebp, %r12d +; SSE-NEXT: subl $-128, %r12d ; SSE-NEXT: movq %r8, %rbp -; SSE-NEXT: orq %rcx, %rbp -; SSE-NEXT: cmovnel %r13d, %r15d -; SSE-NEXT: addl $256, %r15d # imm = 0x100 -; SSE-NEXT: movq %r9, %r13 +; SSE-NEXT: orq %r14, %rbp +; SSE-NEXT: cmovnel %r13d, %r12d +; SSE-NEXT: addl $256, %r12d # imm = 0x100 +; SSE-NEXT: movq %rsi, %r13 ; SSE-NEXT: orq %rbx, %r13 ; SSE-NEXT: movq %rdi, %rbp ; SSE-NEXT: orq %r10, %rbp ; SSE-NEXT: orq %r13, %rbp -; SSE-NEXT: cmovnel %eax, %r15d -; SSE-NEXT: rep bsfq %r11, %r13 -; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r11, %rbp +; SSE-NEXT: rep bsfq %r13, %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: testq %r11, %r11 -; SSE-NEXT: cmovnel 
%r13d, %eax -; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r13 -; SSE-NEXT: addl $64, %r13d -; SSE-NEXT: rep bsfq %rsi, %rcx -; SSE-NEXT: testq %rsi, %rsi -; SSE-NEXT: cmovnel %ecx, %r13d -; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rbp +; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: rep bsfq %rdx, %rcx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: subl $-128, %ebp ; SSE-NEXT: movq %r11, %rcx -; SSE-NEXT: orq %r12, %rcx -; SSE-NEXT: cmovnel %eax, %r13d -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE-NEXT: rep bsfq %rbp, %rcx +; SSE-NEXT: orq %r13, %rcx +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: rep bsfq %r14, %rcx ; SSE-NEXT: addl $64, %ecx ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: rep bsfq %rdx, %rax ; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %eax, %ecx -; SSE-NEXT: movl $64, %eax ; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: addl $64, %eax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 @@ -2400,22 +4781,22 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: testq %r8, %r8 ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: orq %rbp, %rdx +; SSE-NEXT: orq %r14, %rdx ; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %r12, %r11 -; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: orq %r13, %r11 +; SSE-NEXT: cmovnel %ebp, %eax ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; SSE-NEXT: orq %rbx, %r9 ; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: orq %r14, %rdi +; SSE-NEXT: orq %r15, %rdi ; SSE-NEXT: orq %r10, %rdi ; SSE-NEXT: addl $512, %eax # imm = 0x200 ; SSE-NEXT: orq %r9, %rdi -; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: cmovnel %r12d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 @@ -2425,7 +4806,7 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: test_cttz_i1024: +; AVX2-LABEL: test_cttz_undef_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -2547,7 +4928,7 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i1024: +; AVX512-LABEL: test_cttz_undef_i1024: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -2652,13 +5033,13 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX512-NEXT: popq %r15 ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq - %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } -define i32 @load_cttz_i1024(ptr %p0) nounwind { -; SSE-LABEL: load_cttz_i1024: +define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i1024: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -2666,14 +5047,14 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq 88(%rdi), %r10 +; SSE-NEXT: movq 72(%rdi), %rbx +; SSE-NEXT: movq 56(%rdi), %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %r10 ; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), 
%rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 40(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rdi), %r9 -; SSE-NEXT: movq 16(%rdi), %r15 +; SSE-NEXT: movq 32(%rdi), %rsi +; SSE-NEXT: movq 24(%rdi), %rbp ; SSE-NEXT: movq (%rdi), %r8 ; SSE-NEXT: movq 8(%rdi), %r11 ; SSE-NEXT: rep bsfq %r8, %rax @@ -2681,57 +5062,57 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: addl $64, %edx ; SSE-NEXT: testq %r8, %r8 ; SSE-NEXT: cmovnel %eax, %edx -; SSE-NEXT: rep bsfq %r15, %rbx -; SSE-NEXT: rep bsfq %r9, %rax -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 +; SSE-NEXT: rep bsfq %rbp, %rax +; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r15, %r15 -; SSE-NEXT: cmovnel %ebx, %eax -; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: subl $-128, %eax -; SSE-NEXT: movq %r8, %r14 -; SSE-NEXT: orq %r11, %r14 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: orq %r11, %r15 ; SSE-NEXT: cmovnel %edx, %eax -; SSE-NEXT: rep bsfq %rbx, %rdx -; SSE-NEXT: rep bsfq %rsi, %r12 -; SSE-NEXT: addl $64, %r12d -; SSE-NEXT: testq %rbx, %rbx -; SSE-NEXT: cmovnel %edx, %r12d -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: rep bsfq %r13, %rdx -; SSE-NEXT: rep bsfq %rcx, %r14 -; SSE-NEXT: addl $64, %r14d -; SSE-NEXT: testq %r13, %r13 -; SSE-NEXT: cmovnel %edx, %r14d -; SSE-NEXT: subl $-128, %r14d -; SSE-NEXT: movq %rbx, %rdx -; SSE-NEXT: orq %rsi, %rdx -; SSE-NEXT: cmovnel %r12d, %r14d -; SSE-NEXT: movq 72(%rdi), %r12 -; SSE-NEXT: addl $256, %r14d # imm = 0x100 +; SSE-NEXT: rep bsfq %rsi, %rdx +; SSE-NEXT: rep bsfq %r10, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %r13d +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: movq 64(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: cmovnel %r13d, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 ; SSE-NEXT: movq %r11, %rdx -; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: orq %rbp, %rdx ; SSE-NEXT: movq %r8, %r13 -; SSE-NEXT: orq %r15, %r13 +; SSE-NEXT: orq %r14, %r13 ; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq 64(%rdi), %r13 -; SSE-NEXT: cmovnel %eax, %r14d -; SSE-NEXT: rep bsfq %r13, %rdx -; SSE-NEXT: rep bsfq %r12, %rax +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: rep bsfq %r12, %rdx +; SSE-NEXT: rep bsfq %rbx, %rax ; SSE-NEXT: addl $64, %eax -; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: testq %r12, %r12 ; SSE-NEXT: cmovnel %edx, %eax -; SSE-NEXT: rep bsfq %r10, %rbp -; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: movq 88(%rdi), %rbp +; SSE-NEXT: rep bsfq %rbp, %r13 +; SSE-NEXT: addl $64, %r13d ; SSE-NEXT: movq 80(%rdi), %r10 ; SSE-NEXT: rep bsfq %r10, %rcx ; SSE-NEXT: testq %r10, %r10 -; SSE-NEXT: cmovnel %ecx, %ebp -; SSE-NEXT: subl $-128, %ebp -; SSE-NEXT: movq %r13, %rcx -; SSE-NEXT: orq %r12, %rcx -; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: cmovnel %ecx, %r13d +; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: movq %r12, %rcx +; SSE-NEXT: orq %rbx, %rcx +; SSE-NEXT: cmovnel %eax, %r13d ; SSE-NEXT: 
movq 104(%rdi), %r9 ; SSE-NEXT: rep bsfq %r9, %rcx ; SSE-NEXT: addl $64, %ecx @@ -2739,7 +5120,6 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: rep bsfq %rdx, %rax ; SSE-NEXT: testq %rdx, %rdx ; SSE-NEXT: cmovnel %eax, %ecx -; SSE-NEXT: movl $64, %eax ; SSE-NEXT: rep bsfq 120(%rdi), %rax ; SSE-NEXT: movq 112(%rdi), %rdi ; SSE-NEXT: addl $64, %eax @@ -2749,21 +5129,21 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: subl $-128, %eax ; SSE-NEXT: orq %r9, %rdx ; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %r10, %r13 +; SSE-NEXT: orq %rbp, %rbx +; SSE-NEXT: orq %r10, %r12 ; SSE-NEXT: addl $256, %eax # imm = 0x100 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq %rbx, %r12 +; SSE-NEXT: cmovnel %r13d, %eax ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; SSE-NEXT: orq %rcx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %r8 -; SSE-NEXT: orq %r15, %r8 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r8 ; SSE-NEXT: addl $512, %eax # imm = 0x200 ; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: cmovnel %r15d, %eax ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 @@ -2773,7 +5153,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: load_cttz_i1024: +; AVX2-LABEL: load_cttz_undef_i1024: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -2900,7 +5280,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i1024: +; AVX512-LABEL: load_cttz_undef_i1024: ; AVX512: # %bb.0: ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r15 @@ -3015,7 +5395,7 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %a0 = load i1024, ptr %p0 - %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index cc3dcf32ac0eb..9b7569ff8b29f 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %esi, 4(%ebx) -; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,201 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 72(%esp,%edi), %edx -; X86-NEXT: movl 76(%esp,%edi), %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%edi), %ebx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: notl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 40(%esp,%eax), %edi -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: 
movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 12(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 36(%esp,%esi), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 8(%edx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: andl 4(%edi), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edi), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%edi,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: movl %edx, (%edi) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq 
%cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -970,665 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), 
%esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 
%eax, 48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 160(%rsp,%r12), %rax -; SSE-NEXT: movq 168(%rsp,%r12), %r10 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 152(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 144(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 136(%rsp,%r12), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: movq 128(%rsp,%r12), %r14 -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: movq 120(%rsp,%r12), %r15 -; SSE-NEXT: shldq %cl, %r15, %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %r13 -; SSE-NEXT: shldq %cl, %r13, %r15 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps 
%xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 32(%rsp,%r12), %rax -; SSE-NEXT: movq 40(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: shldq %cl, %r8, %rsi -; SSE-NEXT: movq (%rsp,%r12), %rbp -; SSE-NEXT: shldq %cl, %rbp, %r8 -; SSE-NEXT: movq -8(%rsp,%r12), %r9 -; SSE-NEXT: shldq %cl, %r9, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 48(%rdi), %r10 -; SSE-NEXT: orq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq 32(%rdi), %r11 -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: notq %rbx -; SSE-NEXT: andq 24(%rdi), %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq 16(%rdi), %r14 -; SSE-NEXT: orq %rbp, %r14 -; SSE-NEXT: notq %r15 -; SSE-NEXT: movq -16(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: andq 8(%rdi), %r15 -; SSE-NEXT: orq %r9, %r15 -; SSE-NEXT: notq %r13 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %r10, 48(%rdi) -; SSE-NEXT: movq %rdx, 40(%rdi) -; SSE-NEXT: movq %r11, 32(%rdi) -; SSE-NEXT: movq %rbx, 24(%rdi) -; SSE-NEXT: movq %r14, 16(%rdi) -; SSE-NEXT: movq %r15, 8(%rdi) -; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: addq $184, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %r11d -; AVX2-NEXT: shrl $3, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: andl $56, %eax 
-; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r10 -; AVX2-NEXT: movq 104(%rsp,%r10), %r15 -; AVX2-NEXT: movq 112(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r8 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 128(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shldq %cl, %rsi, %rbx -; AVX2-NEXT: movq 136(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r14 -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 144(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r12 -; AVX2-NEXT: movq 96(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 152(%rsp,%r10), %r13 -; AVX2-NEXT: shldq %cl, %rax, %r13 -; AVX2-NEXT: shldq %cl, %rsi, %r15 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rsp,%r10), %rbp -; AVX2-NEXT: movq 24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq 8(%rsp,%r10), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: movq (%rsp,%r10), %rax -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq -8(%rsp,%r10), %r8 -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%r10), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %r8 -; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX2-NEXT: orq %r9, %r13 -; AVX2-NEXT: movq -24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %r9, %rsi -; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: orq %rdx, %r14 -; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: movq -32(%rsp,%r10), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %rbx -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %rax -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: andl $60, %r11d -; AVX2-NEXT: movl (%rdi,%r11), %r8d -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %r8d -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r12, 48(%rdi) -; AVX2-NEXT: movq %r14, 40(%rdi) -; AVX2-NEXT: movq %rdx, 32(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $168, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq 
%r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: shrl $3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: andl $56, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %r9 -; AVX512-NEXT: movq 88(%rsp,%r9), %r10 -; AVX512-NEXT: movq 96(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r11 -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 112(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: shldq %cl, %rsi, %rbx -; AVX512-NEXT: movq 120(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r14 -; AVX512-NEXT: shldq %cl, %rax, %r14 -; AVX512-NEXT: movq 128(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %rsi, %r12 -; AVX512-NEXT: movq 136(%rsp,%r9), %r13 -; AVX512-NEXT: shldq %cl, %rax, %r13 -; AVX512-NEXT: movq 80(%rsp,%r9), %r15 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rsp,%r9), %rbp -; AVX512-NEXT: movq 8(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rbp, %rsi -; AVX512-NEXT: movq -8(%rsp,%r9), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: movq -16(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rsi, %r13 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX512-NEXT: orq %rdx, %r14 -; AVX512-NEXT: movq -24(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: movq -32(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r15, %rbx -; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 -; AVX512-NEXT: orq %rsi, %r11 -; AVX512-NEXT: movq -48(%rsp,%r9), %rsi -; AVX512-NEXT: movq -40(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r9 -; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: andnq (%rdi), %rbx, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: andl $60, %r8d -; AVX512-NEXT: movl (%rdi,%r8), %eax -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; AVX512-NEXT: btl %r8d, %eax -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r12, 48(%rdi) -; AVX512-NEXT: movq %r14, 40(%rdi) -; AVX512-NEXT: movq %rdx, 32(%rdi) -; 
AVX512-NEXT: movq %r11, 24(%rdi) -; AVX512-NEXT: movq %r15, 16(%rdi) -; AVX512-NEXT: movq %rcx, 8(%rdi) -; AVX512-NEXT: movq %rsi, (%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $152, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -1676,3 +899,592 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { %cmp = icmp ne i4096 %test, 0 ret i1 %cmp } + +; Special Cases + +; Multiple uses of the stored value +define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_cmpz_i128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; SSE-LABEL: complement_cmpz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + %cmp = icmp ne i128 %res, 0 + ret i1 %cmp +} + +; Load hidden behind bitcast +define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i128_bitcast: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 12(%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 14(%eax), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll $16, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movzwl 2(%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 4(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 6(%eax), %esi +; X86-NEXT: movzwl 8(%eax), %ecx +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 10(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $16, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: andb $96, %bl +; X86-NEXT: shrb $3, %bl +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: movl 32(%esp,%edi), %edi +; X86-NEXT: btcl %eax, %edi +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %edi, (%ecx,%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movw %dx, 14(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movw %dx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 10(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movw %si, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; SSE2-LABEL: complement_ne_i128_bitcast: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: andb $32, %cl +; SSE2-NEXT: shrdq %cl, %rax, %rdx +; SSE2-NEXT: shrq %cl, %rax +; SSE2-NEXT: testb $64, %sil +; SSE2-NEXT: cmoveq %rdx, %rax +; SSE2-NEXT: btcl %esi, %eax +; SSE2-NEXT: andl $96, %esi +; SSE2-NEXT: shrl $3, %esi +; SSE2-NEXT: movl %eax, (%rdi,%rsi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: complement_ne_i128_bitcast: +; SSE4: # %bb.0: +; SSE4-NEXT: # kill: def $esi killed $esi def $rsi +; SSE4-NEXT: movdqa (%rdi), %xmm0 +; SSE4-NEXT: pextrq $1, %xmm0, %rax +; SSE4-NEXT: movq %xmm0, %rdx +; SSE4-NEXT: movl %esi, %ecx +; SSE4-NEXT: andb $32, %cl +; SSE4-NEXT: shrdq %cl, %rax, %rdx +; SSE4-NEXT: shrq %cl, %rax +; SSE4-NEXT: testb $64, %sil +; SSE4-NEXT: cmoveq %rdx, %rax +; SSE4-NEXT: btcl %esi, %eax +; SSE4-NEXT: andl $96, %esi +; SSE4-NEXT: shrl $3, %esi +; SSE4-NEXT: movl %eax, (%rdi,%rsi) +; SSE4-NEXT: retq +; +; AVX-LABEL: complement_ne_i128_bitcast: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rdx +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andb $32, %cl +; AVX-NEXT: shrdq %cl, %rax, %rdx +; AVX-NEXT: shrxq %rcx, %rax, %rax +; AVX-NEXT: testb $64, %sil +; AVX-NEXT: cmoveq %rdx, %rax +; AVX-NEXT: btcl %esi, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl 
$3, %esi +; AVX-NEXT: movl %eax, (%rdi,%rsi) +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ldv = load <8 x i16>, ptr %word + %ld = bitcast <8 x i16> %ldv to i128 + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + ret <8 x i16> %ldv +} + +; Multiple loads in store chain +define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { +; X86-LABEL: reset_multiload_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB23_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB23_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB24_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB24_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: chain_reset_i256: +; SSE: # %bb.0: +; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE-NEXT: movl $-2, %eax +; SSE-NEXT: roll %cl, %eax +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: andl $28, %ecx +; SSE-NEXT: andl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r8 +; 
SSE-NEXT: orq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdi +; SSE-NEXT: orq %rcx, %rdi +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %ecx, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: chain_reset_i256: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX-NEXT: movl $-2, %eax +; AVX-NEXT: roll %cl, %eax +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $28, %ecx +; AVX-NEXT: andl %eax, (%rdi,%rcx) +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: movl (%rdi), %ecx +; AVX-NEXT: movl (%rsi), %eax +; AVX-NEXT: movl %ecx, (%rsi) +; AVX-NEXT: movl (%rdx), %ecx +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: sequence_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl 
%edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 + %ld = load i128, ptr %word + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 +} diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll index e2db8d4241420..b8bb417e1860c 100644 --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -410,6 +410,472 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret <16 x i8> %ins15 } + +; build vectors where integer operands are split (typically via legalization) + +define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind { +; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movl %edi, %eax +; SSE2-64-NEXT: movl %esi, %ecx +; SSE2-64-NEXT: shrq $32, %rdi +; SSE2-64-NEXT: shrq $32, %rsi +; SSE2-64-NEXT: movd %ecx, %xmm1 +; SSE2-64-NEXT: movd %esi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %eax, %xmm0 +; SSE2-64-NEXT: movd %edi, %xmm2 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movl %edi, %eax +; SSE41-64-NEXT: movl %esi, %ecx +; SSE41-64-NEXT: shrq $32, %rdi +; SSE41-64-NEXT: shrq $32, %rsi +; SSE41-64-NEXT: movd %eax, %xmm0 +; SSE41-64-NEXT: pinsrd $1, 
%edi, %xmm0 +; SSE41-64-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: movl %edi, %eax +; AVX-64-NEXT: movl %esi, %ecx +; AVX-64-NEXT: shrq $32, %rdi +; AVX-64-NEXT: shrq $32, %rsi +; AVX-64-NEXT: vmovd %eax, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i64 %a0 to i32 + %a1.lo = trunc i64 %a1 to i32 + %a0.shr = lshr i64 %a0, 32 + %a1.shr = lshr i64 %a1, 32 + %a0.hi = trunc i64 %a0.shr to i32 + %a1.hi = trunc i64 %a1.shr to i32 + %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0 + %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1 + %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2 + %v3 = insertelement <4 x i32> %v2, i32 %a1.hi, i64 3 + ret <4 x i32> %v3 +} + +define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +; SSE2-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %esi, %xmm2 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: pushl %esi +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE41-32-NEXT: movd %esi, %xmm0 +; SSE41-32-NEXT: shrl $16, %esi +; SSE41-32-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-32-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-32-NEXT: shrl $16, %edx +; SSE41-32-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-32-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE41-32-NEXT: shrl $16, %ecx +; SSE41-32-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-32-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $16, %eax +; SSE41-32-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-32-NEXT: popl %esi +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $16, %edi +; SSE41-64-NEXT: pinsrw $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrw $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $16, %esi +; SSE41-64-NEXT: pinsrw $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $16, %edx +; SSE41-64-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $16, %ecx +; SSE41-64-NEXT: pinsrw $7, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %esi +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: 
vmovd %esi, %xmm0 +; AVX-32-NEXT: shrl $16, %esi +; AVX-32-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %edx +; AVX-32-NEXT: vpinsrw $3, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %ecx +; AVX-32-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %eax +; AVX-32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $16, %edi +; AVX-64-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %esi +; AVX-64-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %edx +; AVX-64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %ecx +; AVX-64-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i32 %a0 to i16 + %a1.lo = trunc i32 %a1 to i16 + %a2.lo = trunc i32 %a2 to i16 + %a3.lo = trunc i32 %a3 to i16 + %a0.shr = lshr i32 %a0, 16 + %a1.shr = lshr i32 %a1, 16 + %a2.shr = lshr i32 %a2, 16 + %a3.shr = lshr i32 %a3, 16 + %a0.hi = trunc i32 %a0.shr to i16 + %a1.hi = trunc i32 %a1.shr to i16 + %a2.hi = trunc i32 %a2.shr to i16 + %a3.hi = trunc i32 %a3.shr to i16 + %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0 + %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1 + %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2 + %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, i64 3 + %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4 + %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5 + %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6 + %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7 + ret <8 x i16> %v7 +} + +define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { +; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm1 +; SSE2-32-NEXT: psrld $8, %xmm1 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm3 +; SSE2-32-NEXT: movdqa %xmm3, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm4 +; SSE2-32-NEXT: psrld $8, %xmm4 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: pushq %rbp +; SSE2-64-NEXT: pushq %r15 +; SSE2-64-NEXT: pushq %r14 +; SSE2-64-NEXT: pushq %rbx +; SSE2-64-NEXT: movzwl %di, %eax +; SSE2-64-NEXT: movzwl %si, %r10d +; SSE2-64-NEXT: movzwl %dx, %r11d +; SSE2-64-NEXT: movzwl %cx, %ebx +; SSE2-64-NEXT: movzwl %r8w, %ebp +; SSE2-64-NEXT: movzwl %r9w, %r14d +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm0 +; SSE2-64-NEXT: movdqa %xmm0, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm2 +; SSE2-64-NEXT: movdqa %xmm2, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r14d, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movd %r8d, %xmm1 +; SSE2-64-NEXT: movd %ebp, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %ebx, %xmm2 +; SSE2-64-NEXT: psrld $8, %xmm2 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-64-NEXT: movd %edx, %xmm2 +; SSE2-64-NEXT: movd %r11d, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %esi, %xmm3 +; SSE2-64-NEXT: movd %r10d, %xmm0 +; SSE2-64-NEXT: psrld $8, %xmm0 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: movd %eax, %xmm4 +; SSE2-64-NEXT: psrld $8, %xmm4 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: popq %rbx +; SSE2-64-NEXT: popq %r14 +; SSE2-64-NEXT: popq %r15 +; SSE2-64-NEXT: popq %rbp +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movd %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $1, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $2, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $8, %edi +; SSE41-64-NEXT: pinsrb $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $8, %esi +; 
SSE41-64-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $8, %edx +; SSE41-64-NEXT: pinsrb $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrb $8, %r8d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r8d +; SSE41-64-NEXT: pinsrb $9, %r8d, %xmm0 +; SSE41-64-NEXT: pinsrb $10, %r9d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r9d +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-64-NEXT: pinsrb $11, %r9d, %xmm0 +; SSE41-64-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-64-NEXT: shrl $8, %eax +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SSE41-64-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-64-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $15, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vmovd %eax, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $8, %edi +; AVX-64-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %esi +; AVX-64-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %edx +; AVX-64-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r8d +; AVX-64-NEXT: vpinsrb $9, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r9d +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-64-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %eax +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX-64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i16 %a0 to i8 + %a1.lo = trunc i16 %a1 to i8 + %a2.lo = trunc i16 
%a2 to i8 + %a3.lo = trunc i16 %a3 to i8 + %a4.lo = trunc i16 %a4 to i8 + %a5.lo = trunc i16 %a5 to i8 + %a6.lo = trunc i16 %a6 to i8 + %a7.lo = trunc i16 %a7 to i8 + %a0.shr = lshr i16 %a0, 8 + %a1.shr = lshr i16 %a1, 8 + %a2.shr = lshr i16 %a2, 8 + %a3.shr = lshr i16 %a3, 8 + %a4.shr = lshr i16 %a4, 8 + %a5.shr = lshr i16 %a5, 8 + %a6.shr = lshr i16 %a6, 8 + %a7.shr = lshr i16 %a7, 8 + %a0.hi = trunc i16 %a0.shr to i8 + %a1.hi = trunc i16 %a1.shr to i8 + %a2.hi = trunc i16 %a2.shr to i8 + %a3.hi = trunc i16 %a3.shr to i8 + %a4.hi = trunc i16 %a4.shr to i8 + %a5.hi = trunc i16 %a5.shr to i8 + %a6.hi = trunc i16 %a6.shr to i8 + %a7.hi = trunc i16 %a7.shr to i8 + %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0 + %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1 + %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2 + %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3 + %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4 + %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5 + %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6 + %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7 + %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8 + %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9 + %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10 + %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11 + %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12 + %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13 + %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14 + %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15 + ret <16 x i8> %v15 +} + ; build vectors of repeated elements define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) { diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index f36baba402421..ab8498d8d3451 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,6 +32,6 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. ; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad _ZL10myCallbacki ;; Function type ID ; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index cdbad668aec54..02d71073b65c5 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad ball ;; Function type ID -- set to 0 as no type metadata attached to function. ; CHECK-NEXT: .quad 0 ;; Number of unique direct callees. 
diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c8406f6e0..a42a715bdc6ab 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 81529aff39ff1..19c84d42a7ea6 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1141,8 +1141,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -1171,8 +1171,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..c9c88f7258435 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; 
CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..46b2571e196bb 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) 
-; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; 
GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..477a0dce5c81c 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner 
Loop Header: Depth=1 -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; 
X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %ebp, %esi ; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: 
adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) ; X86-BMI-NEXT: addl $1, %ebx -; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b..c98889b7d5cb3 100644 --- 
a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index e73ff791dc423..f270f8fc741aa 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 
$cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d 
$r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
 call void @bar1()
 call void @bar2()
 ret void
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f91457b..8576f8f149e9a 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
+
 ; TODO: The below RUN line will fail GISel selection and fall back to DAG selection due to the lack of support for loads/stores in i686 mode. Support is expected soon; for this reason, the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now to cover i686 in GlobalISel.
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 @@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind { ; X64-NEXT: popq %rax ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: popq %rax +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f32: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $28, %esp @@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind { ; X64-NEXT: addq $24, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f64: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $44, %esp @@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind { ; X64-NEXT: addq $56, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-SINCOS-STRET-NEXT: fld %st(0) +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _cosl +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _sinl +; 
MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fxch %st(1) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-NOSINCOS-STRET-NEXT: fld %st(0) +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _cosl +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _sinl +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1) +; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f80: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $60, %esp @@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias % ; SDAG-X64-NEXT: popq %r14 ; SDAG-X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-SINCOS-STRET: ## %bb.0: ## %entry +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-SINCOS-STRET-NEXT: callq _foo +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-NOSINCOS-STRET-NEXT: callq _foo +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload 
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: can_fold_with_call_in_chain: ; GISEL-X86: # %bb.0: # %entry ; GISEL-X86-NEXT: pushl %ebx diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index ea93a911a1ad0..21491bc2cc8f5 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -47,6 +47,187 @@ entry: } declare fp128 @ldexpl(fp128, i32) memory(none) +define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_8xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; 
AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $88, %rsp +; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, 
%edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512VL-NEXT: addq $88, %rsp +; AVX512VL-NEXT: retq + %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) + ret <8 x half> %r +} +declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) + define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_4xfloat: ; CHECK: # %bb.0: @@ -109,6 +290,381 @@ define <2 x 
double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi } declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) +define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_16xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $168, %rsp +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: 
vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512-NEXT: addq $168, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} 
xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # 
xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq + %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) + ret <16 x half> %r +} +declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) + define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_8xfloat: ; CHECK: # %bb.0: @@ -230,6 +786,735 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi } declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) +define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_32xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: 
movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: 
vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps 
%xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; 
AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_32xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax 
+; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; 
AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; 
AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: retq + %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp) + ret <32 x half> %r +} +declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>) + define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_16xfloat: ; CHECK: # %bb.0: @@ -462,6 +1747,3 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi } declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX512: {{.*}} -; AVX512VL: {{.*}} diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll index 834dd788ff7fb..9b02438952035 100644 --- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll +++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll @@ -1,59 +1,213 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5 -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v4f32: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: flds 76(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 64(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 72(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 68(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 40(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 4(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 44(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 8(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 36(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 48(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: addl $12, %edi -; CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: flds 36(%esp) -; CHECK-NEXT: flds 40(%esp) -; CHECK-NEXT: flds 44(%esp) -; CHECK-NEXT: flds 48(%esp) -; CHECK-NEXT: fstps 12(%esi) -; CHECK-NEXT: fstps 8(%esi) -; CHECK-NEXT: fstps 4(%esi) -; CHECK-NEXT: fstps (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v4f32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: flds 76(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 64(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 72(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 68(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 40(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 4(%edi), %eax +; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 44(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 8(%edi), %eax 
+; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 36(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 48(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: addl $12, %edi +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: flds 36(%esp) +; X86-NEXT: flds 40(%esp) +; X86-NEXT: flds 44(%esp) +; X86-NEXT: flds 48(%esp) +; X86-NEXT: fstps 12(%esi) +; X86-NEXT: fstps 8(%esi) +; X86-NEXT: fstps 4(%esi) +; X86-NEXT: fstps (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v4f32: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: leaq 4(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 12(%rsp), %rdi +; X64-NEXT: leaq 8(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: leaq 28(%rsp), %rdi +; X64-NEXT: leaq 24(%rsp), %rsi +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: leaq 20(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] 
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps 
(%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $104, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x) %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0 %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1 @@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias } define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: fldl 72(%esp) -; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill -; CHECK-NEXT: fldl 64(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 24(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll sincos -; CHECK-NEXT: leal 32(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: addl $8, %edi -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll 
sincos -; CHECK-NEXT: fldl 24(%esp) -; CHECK-NEXT: fldl 32(%esp) -; CHECK-NEXT: fstpl 8(%esi) -; CHECK-NEXT: fstpl (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v2f64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: fldl 72(%esp) +; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill +; X86-NEXT: fldl 64(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 24(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: leal 32(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: addl $8, %edi +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: fldl 24(%esp) +; X86-NEXT: fldl 32(%esp) +; X86-NEXT: fstpl 8(%esi) +; X86-NEXT: fstpl (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v2f64: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: leaq 24(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 8(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; 
MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x) %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0 %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1 diff --git a/llvm/test/CodeGen/X86/llvm.sincospi.ll b/llvm/test/CodeGen/X86/llvm.sincospi.ll new file mode 100644 index 0000000000000..5546c66deba30 --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.sincospi.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-apple-macosx10.9 < %s | FileCheck %s + +define { half, half } @test_sincospi_f16(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + ret { half, half } %result +} + +define half @test_sincospi_f16_only_use_sin(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_sin: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: movq %rsp, %rdi +; 
CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.0 = extractvalue { half, half } %result, 0 + ret half %result.0 +} + +define half @test_sincospi_f16_only_use_cos(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_cos: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.1 = extractvalue { half, half } %result, 1 + ret half %result.1 +} + +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %ebx +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: addq $64, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) + ret { <2 x half>, <2 x half> } %result +} + +define { float, float } @test_sincospi_f32(float %a) #0 { +; CHECK-LABEL: test_sincospi_f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: leaq 
{{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) + ret { <2 x float>, <2 x float> } %result +} + +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v3f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) + ret { <3 x float>, <3 x float> } %result +} + +define { double, double } @test_sincospi_f64(double %a) #0 { +; CHECK-LABEL: test_sincospi_f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %result 
= call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} + +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..58adbb767ed87 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , < ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { -; X64-LABEL: test6: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} -; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; X64-NEXT: vmovdqa %ymm2, %ymm0 -; X64-NEXT: retq +; X64-KNL-LABEL: test6: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0 +; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test6: ; X86-KNL: # %bb.0: @@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { ; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; +; X64-SKX-LABEL: test6: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0 +; X64-SKX-NEXT: retq +; ; X86-SKX-LABEL: test6: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k2 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k2 ; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} ; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} ; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0 @@ -255,9 +265,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd 
%ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +281,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: @@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; 
X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) define <16 x ptr> @test31(<16 x ptr> %ptrs) { -; X64-LABEL: test31: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} -; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 -; X64-NEXT: vmovdqa64 %zmm2, %zmm1 -; X64-NEXT: retq +; X64-KNL-LABEL: test31: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-KNL-NEXT: retq ; ; X86-LABEL: test31: ; X86: # %bb.0: @@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) { ; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: test31: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-SKX-NEXT: retq %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef) ret <16 x ptr>%res } @@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: 
vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array_zeroinitializer_index: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array_zeroinitializer_index: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array_zeroinitializer_index: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: sext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: zext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { } define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { -; X64-LABEL: pr163023_zext: -; X64: # %bb.0: -; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} -; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} -; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr163023_zext: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-KNL-NEXT: retq ; ; X86-LABEL: pr163023_zext: ; X86: # %bb.0: @@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { ; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: pr163023_zext: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-SKX-NEXT: retq %addr.p = ptrtoint ptr %a0 to i64 %addr.v = 
insertelement <1 x i64> poison, i64 %addr.p, i64 0 %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer @@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { %struct.foo = type { ptr, i64, i16, i16, i32 } define <8 x i64> @pr45906(<8 x ptr> %ptr) { -; X64-LABEL: pr45906: -; X64: # %bb.0: # %bb -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr45906: +; X64-KNL: # %bb.0: # %bb +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-KNL-NEXT: retq ; -; X86-LABEL: pr45906: -; X86: # %bb.0: # %bb -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 -; X86-NEXT: retl +; X86-KNL-LABEL: pr45906: +; X86-KNL: # %bb.0: # %bb +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: pr45906: +; X64-SKX: # %bb.0: # %bb +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: pr45906: +; X86-SKX: # %bb.0: # %bb +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-SKX-NEXT: retl bb: %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef) diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42ed0c50f..c058e37e0ce11 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 
@scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..8f97d2652bc53 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: je .LBB3_1 ; X86-NEXT: # %bb.2: # %bb26.preheader ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_3: # %bb26 @@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: jb .LBB3_3 ; X86-NEXT: jmp .LBB3_4 ; X86-NEXT: .LBB3_1: -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB3_4: # %bb31 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..a663f6a1dd376 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 66 @@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $9, %eax -; X64-NEXT: leal (%rax,%rdi,8), %eax +; X64-NEXT: shll $9, %edi +; X64-NEXT: leal (%rdi,%rax,8), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 520 diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..4129b44ed3ddc 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $6, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax +; X64-HSW-NEXT: shll $6, %edi +; 
X64-HSW-NEXT: leal (%rdi,%rax,2), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_66: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $6, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax +; X64-JAG-NEXT: shll $6, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $9, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax +; X64-HSW-NEXT: shll $9, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_520: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $9, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax +; X64-JAG-NEXT: shll $9, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_520: diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..b488653655728 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll new file mode 100644 index 0000000000000..a7a54fd57413b --- /dev/null +++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +define i64 @test_add_i64_i16_const(i16 %a) nounwind { +; X86-LABEL: test_add_i64_i16_const: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_add_i64_i16_const: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: addq $42, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %sum = add nuw nsw i64 %zext_a, 42 + ret i64 %sum +} + +; TODO: First 48 bits are all zeros, so we can safely truncate to 32 bit addition +define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind { +; X86-LABEL: test_add_i64_i16_zext: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_add_i64_i16_zext: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %zext_b = zext i16 %b to i64 + %sum = add nuw nsw i64 %zext_a, %zext_b + ret i64 %sum +} + +; Negative: Set the 32nd bit of a to force 64 bit addition; we do not truncate to 32 bit addition in this case +define i64 @negative_test_add_i64_i16(i16 %a) nounwind { +; X86-LABEL: negative_test_add_i64_i16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax
+; X86-NEXT: movl $1, %edx +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %or_a = or i64 %zext_a, 4294967296 + %sum = add nuw nsw i64 %or_a, 42 + ret i64 %sum +} + +; Negative: We don't truncate to 32 bit addition in case of sign extension +define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind { +; X86-LABEL: negative_test_add_i64_i16_sext: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16_sext: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %sext_a = sext i16 %a to i64 + %sext_b = sext i16 %b to i64 + %sum = add nuw nsw i64 %sext_a, %sext_b + ret i64 %sum +} diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 4b0f75df83a76..ac4554176c3e7 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -679,39 +679,39 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE2-NEXT: packuswb %xmm5, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: movq %xmm4, (%rsi) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE2-NEXT: pand 
%xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: movq %xmm4, (%rdx) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm4, (%rsi) -; SSE2-NEXT: movq %xmm5, (%rdx) ; SSE2-NEXT: movq %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -724,16 +724,16 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rsi) ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rdx) ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm1, %xmm0 -; SSE42-NEXT: movq %xmm3, (%rsi) -; SSE42-NEXT: movq %xmm4, (%rdx) ; SSE42-NEXT: movq %xmm0, (%rcx) ; SSE42-NEXT: retq ; @@ -744,14 +744,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm2, (%rsi) +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm2, (%rdx) ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm2, (%rsi) -; AVX1-NEXT: vmovq %xmm3, (%rdx) ; AVX1-NEXT: vmovq %xmm0, (%rcx) ; AVX1-NEXT: retq ; @@ -762,14 +762,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -778,10 +778,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovdqu (%rdi), %xmm1 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm2, (%rsi) -; XOP-NEXT: vmovq %xmm3, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vmovq %xmm2, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, (%rcx) ; XOP-NEXT: retq %wide.vec = load <24 x i8>, ptr %p, align 4 diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..b6af7e1641a9c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, 
%edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi 
## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return @@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: andl $1, %ebx ; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx +; 
CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%esi) -; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%esi) +; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..31a7f1125150b 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64_trunc: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) { ; ; X64-NOPOPCNT-LABEL: parity_64_shift: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al diff --git a/llvm/test/CodeGen/X86/pr165755.ll 
b/llvm/test/CodeGen/X86/pr165755.ll new file mode 100644 index 0000000000000..3ab484f676c45 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr165755.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64 + +define i32 @PR165755(ptr %p0) { +; X86-LABEL: PR165755: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movb $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: PR165755: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movb $0, (%rdi) +; X64-NEXT: retq + %ld64 = load i64, ptr %p0, align 8 + store i8 0, ptr %p0, align 1 + %ld32 = load i32, ptr %p0, align 8 + %mask = and i32 %ld32, 32 + %zext = zext i32 %mask to i64 + %srl = lshr i64 %ld64, %zext + %res = trunc i64 %srl to i32 + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll new file mode 100644 index 0000000000000..162a0c93bfcf4 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { +; SSE2-LABEL: pr166534: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %esi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: sete %al +; SSE2-NEXT: orq %rax, (%rdx) +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: jne .LBB0_2 +; SSE2-NEXT: # %bb.1: # %if.then +; SSE2-NEXT: orq %rax, (%rcx) +; SSE2-NEXT: .LBB0_2: # %if.end +; SSE2-NEXT: retq +; +; SSE4-LABEL: pr166534: +; SSE4: # %bb.0: # %entry +; SSE4-NEXT: movdqu (%rdi), %xmm0 +; SSE4-NEXT: movdqu (%rsi), %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: xorl %eax, %eax +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: sete %al +; SSE4-NEXT: orq %rax, (%rdx) +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: jne .LBB0_2 +; SSE4-NEXT: # %bb.1: # %if.then +; SSE4-NEXT: orq %rax, (%rcx) +; SSE4-NEXT: .LBB0_2: # %if.end +; SSE4-NEXT: retq +; +; AVX2-LABEL: pr166534: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: orq %rax, (%rdx) +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: jne .LBB0_2 +; AVX2-NEXT: # %bb.1: # %if.then +; AVX2-NEXT: orq %rax, (%rcx) +; AVX2-NEXT: .LBB0_2: # %if.end +; AVX2-NEXT: retq +; +; AVX512-LABEL: pr166534: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: orq %rax, (%rdx) +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: jne .LBB0_2 +; AVX512-NEXT: # %bb.1: # %if.then +; AVX512-NEXT: orq %rax, (%rcx) +; AVX512-NEXT: .LBB0_2: # %if.end +; AVX512-NEXT: retq +entry: + %a = load i128, ptr %pa, align 
8 + %b = load i128, ptr %pb, align 8 + %cmp = icmp eq i128 %a, %b + %conv1 = zext i1 %cmp to i128 + %c = load i128, ptr %pc, align 8 + %or = or i128 %c, %conv1 + store i128 %or, ptr %pc, align 8 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %d = load i128, ptr %pd, align 8 + %or7 = or i128 %d, %conv1 + store i128 %or7, ptr %pd, align 8 + br label %if.end + +if.end: + ret void +} diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll new file mode 100644 index 0000000000000..ffdb68c7a6c01 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA + +; Ensure reloads are after narrowed i512 -> i32 store +define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { +; POSTRA-LABEL: PR166744: +; POSTRA: # %bb.0: +; POSTRA-NEXT: movl $1029, %eax # imm = 0x405 +; POSTRA-NEXT: shlxl %esi, %edx, %edx +; POSTRA-NEXT: bextrl %eax, %esi, %eax +; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx +; POSTRA-NEXT: btrl %esi, %ecx +; POSTRA-NEXT: orl %ecx, %edx +; POSTRA-NEXT: movl %edx, (%rdi,%rax,4) +; POSTRA-NEXT: movq 16(%rdi), %rax +; POSTRA-NEXT: movq (%rdi), %rcx +; POSTRA-NEXT: movq 24(%rdi), %rdx +; POSTRA-NEXT: movq 8(%rdi), %rsi +; POSTRA-NEXT: orq 56(%rdi), %rdx +; POSTRA-NEXT: orq 40(%rdi), %rsi +; POSTRA-NEXT: orq 48(%rdi), %rax +; POSTRA-NEXT: orq 32(%rdi), %rcx +; POSTRA-NEXT: orq %rdx, %rsi +; POSTRA-NEXT: orq %rax, %rcx +; POSTRA-NEXT: orq %rsi, %rcx +; POSTRA-NEXT: setne %al +; POSTRA-NEXT: retq +; +; NOPOSTRA-LABEL: PR166744: +; NOPOSTRA: # %bb.0: +; NOPOSTRA-NEXT: movl %esi, %eax +; NOPOSTRA-NEXT: shrl $3, %esi +; NOPOSTRA-NEXT: andl $60, %esi +; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx +; NOPOSTRA-NEXT: btrl %eax, %ecx +; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax +; NOPOSTRA-NEXT: orl %ecx, %eax +; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi) +; NOPOSTRA-NEXT: movq 16(%rdi), %rax +; NOPOSTRA-NEXT: movq (%rdi), %rcx +; NOPOSTRA-NEXT: movq 8(%rdi), %rdx +; NOPOSTRA-NEXT: movq 24(%rdi), %rsi +; NOPOSTRA-NEXT: orq 56(%rdi), %rsi +; NOPOSTRA-NEXT: orq 40(%rdi), %rdx +; NOPOSTRA-NEXT: orq 48(%rdi), %rax +; NOPOSTRA-NEXT: orq 32(%rdi), %rcx +; NOPOSTRA-NEXT: orq %rsi, %rdx +; NOPOSTRA-NEXT: orq %rax, %rcx +; NOPOSTRA-NEXT: orq %rdx, %rcx +; NOPOSTRA-NEXT: setne %al +; NOPOSTRA-NEXT: retq + %rem = and i64 %idx, 511 + %sh_prom = zext nneg i64 %rem to i512 + %shl = shl nuw i512 1, %sh_prom + %not = xor i512 %shl, -1 + %load = load i512, ptr %v, align 8 + %and = and i512 %load, %not + %conv2 = zext i1 %b to i512 + %shl4 = shl nuw i512 %conv2, %sh_prom + %or = or i512 %and, %shl4 + store i512 %or, ptr %v, align 8 + %cmp = icmp ne i512 %or, 0 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll new file mode 100644 index 0000000000000..e89e5ab1d6b59 --- /dev/null +++ b/llvm/test/CodeGen/X86/regalloc-fp.ll @@ -0,0 +1,775 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Context: +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +define i32 @check_none() "frame-pointer"="none" { +; CHECK-LABEL: check_none: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: 
pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 
2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 
4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" { +; CHECK-LABEL: test_non_leaf_no_reserve: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: 
.cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf() "frame-pointer"="non-leaf" { +; CHECK-LABEL: test_non_leaf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_reserved() "frame-pointer"="reserved" { +; CHECK-LABEL: test_reserved: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + 
%asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_all() "frame-pointer"="all" { +; CHECK-LABEL: test_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl $0, -96(%rbp) +; CHECK-NEXT: movl $1, -92(%rbp) +; CHECK-NEXT: movl $2, -88(%rbp) +; CHECK-NEXT: movl $3, -84(%rbp) +; CHECK-NEXT: movl $4, -80(%rbp) +; CHECK-NEXT: movl $5, -76(%rbp) +; CHECK-NEXT: movl $6, -72(%rbp) +; CHECK-NEXT: movl $7, -68(%rbp) +; CHECK-NEXT: movl $8, -64(%rbp) +; CHECK-NEXT: movl $9, -60(%rbp) +; CHECK-NEXT: movl $16, -56(%rbp) +; CHECK-NEXT: movl $17, -52(%rbp) +; CHECK-NEXT: movl $18, -48(%rbp) +; CHECK-NEXT: movl $19, -44(%rbp) +; CHECK-NEXT: movl -96(%rbp), %eax +; CHECK-NEXT: movl -92(%rbp), %ecx +; CHECK-NEXT: movl -88(%rbp), %edx +; CHECK-NEXT: movl -84(%rbp), %esi +; CHECK-NEXT: movl -80(%rbp), %edi +; CHECK-NEXT: movl -76(%rbp), %r8d +; CHECK-NEXT: movl -72(%rbp), %r9d +; CHECK-NEXT: movl -68(%rbp), %r10d +; CHECK-NEXT: movl -64(%rbp), %r11d +; CHECK-NEXT: movl -60(%rbp), %ebx +; CHECK-NEXT: movl -56(%rbp), %r14d +; CHECK-NEXT: movl -52(%rbp), %r15d +; CHECK-NEXT: movl -48(%rbp), %r12d +; CHECK-NEXT: movl -44(%rbp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -96(%rbp) +; CHECK-NEXT: movl %ecx, -92(%rbp) +; CHECK-NEXT: movl %edx, -88(%rbp) +; CHECK-NEXT: movl %esi, -84(%rbp) +; CHECK-NEXT: movl %edi, -80(%rbp) +; CHECK-NEXT: movl %r8d, -76(%rbp) +; CHECK-NEXT: movl %r9d, -72(%rbp) +; CHECK-NEXT: movl %r10d, -68(%rbp) +; CHECK-NEXT: movl %r11d, -64(%rbp) +; CHECK-NEXT: movl %ebx, -60(%rbp) +; CHECK-NEXT: 
movl %r14d, -56(%rbp) +; CHECK-NEXT: movl %r15d, -52(%rbp) +; CHECK-NEXT: movl %r12d, -48(%rbp) +; CHECK-NEXT: movl %r13d, -44(%rbp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..26e68861cf45c 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index 762a050247a87..36bf31395d6d5 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..8cb032776114b 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..a93be22bf5861 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; 
X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..ff76707bdbb69 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; 
X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aaa4efa8..9bf88cb8bdf81 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. +; FIXME: The test should be fixed to produce the correctly sized spill once the +; -terminal-rule=0 flag is removed + ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: ; Header @@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) { ret void } -; A stack frame which needs to be realigned at runtime (to meet alignment -; criteria for values on the stack) does not have a fixed frame size. +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. 
; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment ; CHECK-NEXT: .short 0 ; 0 locations diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..01fbafb18eb9f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY: # %bb.0: ; SSE2-ONLY-NEXT: movl (%rdi), %eax ; SSE2-ONLY-NEXT: notl %eax -; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: movl %eax, %ecx -; SSE2-ONLY-NEXT: shrl $16, %ecx -; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) -; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) -; SSE2-ONLY-NEXT: movw %ax, (%rdx) -; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: 
movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 
58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777531c5a..3ad3e9a0e7655 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 +65,10 @@ entry: define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB3_2 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000000000..e9756b411eb2c --- /dev/null +++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,156 @@ +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s + +; GitHub issue #161036 + +; Positive test: umin(sub(a,b),a) with scalar types should be folded +define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test: umin(a,sub(a,b)) with scalar types should be folded +define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) { +; CHECK-LABEL: underflow_compare_fold_i64_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +; Positive test: multi-use is OK since the sub instruction still runs once +define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i64_multi_use +; CHECK-LABEL: %bb.0 +;
CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: movq %rax, (%rdx) +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: retq + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32 +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) { +; CHECK-LABEL: underflow_compare_fold_i32_commute +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +; Positive test: i32 +define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i32_multi_use +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: movl %eax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: retq + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16 +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) { +; CHECK-LABEL: underflow_compare_fold_i16_commute +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub) + ret i16 %cond +} + +; Positive test: i16 +define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) { +; CHECK-LABEL: underflow_compare_fold_i16_multi_use +; CHECK-LABEL: %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subw %si, %ax +; CHECK-NEXT: movw %ax, (%rdx) +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %sub = sub i16 %a, %b + store i16 %sub, ptr addrspace(1) %ptr + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + + +; Negative test, vector types: umin(sub(a,b),a) but with vectors +define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: underflow_compare_dontfold_vectors +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psubb %xmm1, %xmm2 +; CHECK-NEXT: pminub %xmm2, %xmm0 +; CHECK-NEXT: retq + %sub = sub <16 x i8> %a, %b + %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a) + ret <16 x i8> %cond +} + +; Negative test, pattern mismatch: umin(add(a,b),a) +define i64 @umin_add(i64 %a, i64 %b) { +; CHECK-LABEL: umin_add +; CHECK-LABEL: %bb.0 +; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: cmpq
%rdi, %rax +; CHECK-NEXT: cmovaeq %rdi, %rax +; CHECK-NEXT: retq + %add = add i64 %a, %b + %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a) + ret i64 %cond +} diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..5a68484596a2f 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 759055d284d12..1a92365638814 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pslld $10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: orps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..9768e4761f47a 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel 
%edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..762088cfb2935 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl 
%cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 304daab6d17a9..2e85a4e60a253 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: 
pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index ae5dd18d4b663..8db54147b2fb7 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: psrld $27, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $4, %xmm0 +; SSE2-NEXT: pslld $5, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE41-NEXT: psrld $27, %xmm2 ; 
SSE41-NEXT: psrld $28, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $5, %xmm1 +; SSE41-NEXT: pslld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpslld $5, %xmm0, %xmm2 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; X86-SSE2-NEXT: psrld $28, %xmm1 ; X86-SSE2-NEXT: psrld $27, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pslld $4, %xmm0 +; X86-SSE2-NEXT: pslld $5, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index 4b42b189538ac..17bbfa1208c01 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) 
nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 2d8670a6d3f23..144e77b87f44c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -497,42 +497,35 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE2-NEXT: psrld $4, %xmm1 ; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $4, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: pslld $28, %xmm0 -; SSE2-NEXT: pslld $27, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pslld $27, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrld $5, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld $4, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: psrld $4, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $27, %xmm1 ; SSE41-NEXT: pslld $28, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # 
%bb.0: ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpslld $27, %xmm0, %xmm2 ; AVX1-NEXT: vpslld $28, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] @@ -606,17 +599,15 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: psrld $4, %xmm1 ; X86-SSE2-NEXT: psrld $5, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrld $4, %xmm3 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: pslld $28, %xmm0 -; X86-SSE2-NEXT: pslld $27, %xmm1 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pslld $27, %xmm2 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>) ret <2 x i32> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index dbb4b9f64f4b7..e0410ae0cc5cb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -84,11 +84,11 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -96,8 +96,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -105,8 +105,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq 
%xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -114,8 +114,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -123,17 +123,17 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride2_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq %wide.vec = load <8 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index da902b3aed5ab..c932482f7af9d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} 
xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; 
AVX2-FCP-NEXT: retq ; @@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 01aacc1e06258..d4e5d4c16a9ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -220,20 +220,20 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movq %xmm5, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rcx) ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm4, (%rcx) ; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; @@ -246,23 +246,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride4_vf4: @@ -274,23 +274,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vmovq 
%xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride4_vf4: @@ -302,22 +302,22 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride4_vf4: @@ -329,125 +329,125 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm1[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride4_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride4_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride4_vf4: ; AVX512BW: # %bb.0: ; 
AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..8fb622228a26e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -288,55 +288,55 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; 
SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movq %xmm3, (%r8) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride5_vf4: @@ -349,30 +349,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%r8) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride5_vf4: @@ -385,22 +385,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -412,22 +412,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -439,58 +439,64 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride5_vf4: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512-NEXT: vmovd %r10d, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512-NEXT: vmovd %xmm2, %r11d +; AVX512-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovd %r14d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: 
vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf4: @@ -498,65 +504,71 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovd %xmm2, %eax +; AVX512-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf4: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax -; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512DQ-NEXT: vmovd %r10d, %xmm4 -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovd %xmm2, %eax +; AVX512DQ-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512DQ-NEXT: vmovd %xmm2, %r11d +; AVX512DQ-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512DQ-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512DQ-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovd %r14d, %xmm1 +; AVX512DQ-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4: @@ -564,29 +576,29 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax +; AVX512DQ-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, 
%r11d +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -600,19 +612,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: movl 32(%rdi), %edi ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ 
-626,19 +639,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -652,19 +666,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -678,19 +693,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} 
xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b21d5c8d..dc8a9ed4a4ccc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -382,57 +382,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movq %xmm2, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; 
SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -448,32 +448,32 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX-NEXT: 
vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -486,24 +486,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -516,23 +516,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -545,23 +545,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] 
-; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -574,26 +574,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) -; AVX512-NEXT: vmovq %xmm2, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%r8) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r9) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -606,25 +606,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -637,26 +637,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -669,25 +669,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = 
[8,5,2,11] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -697,22 +697,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -722,22 +722,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: 
vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -747,22 +747,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -772,22 +772,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 
= [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73bd9fed2..e89248a5474c7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -418,77 +418,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = 
xmm8[2,1,2,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movq %xmm7, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrlq $16, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrlq $16, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm0, (%rdx) -; SSE-NEXT: movq %xmm7, 
(%rcx) -; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm10, (%rdi) -; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride7_vf4: @@ -497,54 +497,54 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpunpcklwd 
{{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vpslld $16, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm10 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpslld $16, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%r10) ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm7, 
(%rcx) -; AVX-NEXT: vmovq %xmm8, (%r8) -; AVX-NEXT: vmovq %xmm9, (%r9) -; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -552,51 +552,51 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vmovq %xmm1, (%rsi) -; AVX2-NEXT: vmovq %xmm6, (%rdx) -; AVX2-NEXT: vmovq %xmm3, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) -; AVX2-NEXT: vmovq %xmm5, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -605,8 +605,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -615,37 +615,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; 
AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -654,8 +654,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -664,37 +664,37 @@ define 
void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -708,47 +708,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vmovq %xmm5, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vmovq %xmm5, (%rdx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; 
AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) ; AVX512-NEXT: vmovq %xmm2, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm3, (%rax) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r10) +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -756,48 +756,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] 
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -810,47 +810,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovq %xmm5, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm5, (%rdx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: 
vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-NEXT: vmovq %xmm3, (%rax) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -858,48 +858,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -910,25 +910,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -939,25 +939,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -968,25 +968,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; 
AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -997,25 +997,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index fff21f9aad1bb..b249950eb8694 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll 
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -296,41 +296,41 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] +; SSE-NEXT: movq %xmm6, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movq %xmm6, (%rsi) -; SSE-NEXT: movq %xmm8, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm7, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) -; SSE-NEXT: movq %xmm4, (%r11) +; SSE-NEXT: movq %xmm3, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movq %xmm0, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movq %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: 
load_i16_stride8_vf4: @@ -345,28 +345,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm6, (%rdx) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rcx) ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm6, (%rsi) -; AVX-NEXT: vmovq %xmm7, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vmovq %xmm1, (%r9) -; AVX-NEXT: vmovq %xmm3, (%r11) -; AVX-NEXT: vmovq %xmm5, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vmovq %xmm1, (%r11) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vmovq %xmm1, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -382,28 +382,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-NEXT: vmovq %xmm6, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rcx) ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vpunpckhwd {{.*#+}} 
xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm6, (%rsi) -; AVX2-NEXT: vmovq %xmm7, (%rdx) -; AVX2-NEXT: vmovq %xmm8, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) -; AVX2-NEXT: vmovq %xmm3, (%r11) -; AVX2-NEXT: vmovq %xmm5, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vmovq %xmm1, (%r11) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vmovq %xmm1, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -419,28 +419,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} 
xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -456,28 +456,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -493,25 +493,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512-NEXT: 
vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512-NEXT: vmovq %xmm6, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512-NEXT: vmovq %xmm6, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vmovq %xmm6, (%rcx) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512-NEXT: vmovq %xmm6, (%rsi) -; AVX512-NEXT: vmovq %xmm7, (%rdx) -; AVX512-NEXT: vmovq %xmm8, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vmovq %xmm1, (%r9) -; AVX512-NEXT: vmovq %xmm3, (%r11) -; AVX512-NEXT: vmovq %xmm4, (%r10) +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512-NEXT: vmovq %xmm1, (%r11) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-NEXT: vmovq %xmm1, (%r10) +; AVX512-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: retq ; @@ -527,25 +527,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r11) -; 
AVX512-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; @@ -561,25 +561,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm7, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm8, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-NEXT: vmovq %xmm3, (%r11) -; AVX512DQ-NEXT: vmovq %xmm4, (%r10) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r11) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; @@ -595,25 +595,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; @@ -625,28 +625,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-NEXT: vmovq 
{{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -658,28 +658,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -691,28 +691,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, 
%zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -724,28 +724,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index f2c5a91d2cca3..995d641644dfa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -20,8 +20,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -29,8 +29,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm1, (%rsi) +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -38,8 +38,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -47,8 +47,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -56,8 +56,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -65,8 +65,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -74,8 +74,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -83,8 +83,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -92,8 +92,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; @@ -101,8 +101,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; @@ -110,8 +110,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; @@ -119,8 +119,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; @@ -128,8 +128,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <4 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 34f23213500c1..8af9594f81480 
100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -21,13 +21,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -36,12 +36,12 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rdx) ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vmovlps %xmm2, (%rsi) -; AVX-NEXT: vmovlps %xmm3, (%rdx) ; AVX-NEXT: vmovlps %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -50,13 +50,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride3_vf2: @@ -64,13 +64,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride3_vf2: @@ -78,13 +78,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; 
AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: retq
 ;
 ; AVX512-LABEL: load_i32_stride3_vf2:
@@ -92,13 +92,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: load_i32_stride3_vf2:
@@ -119,13 +119,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
@@ -146,13 +146,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
@@ -173,13 +173,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
 ; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 822d31eb45139..f7ddcfcc625b5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -22,13 +22,13 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE-NEXT: movq %xmm2, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm1, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride4_vf2:
@@ -36,11 +36,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT: vmovq %xmm0, (%rcx)
 ; AVX-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX-NEXT: retq
@@ -50,11 +50,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-NEXT: retq
@@ -64,11 +64,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-FP-NEXT: retq
@@ -78,11 +78,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX2-FCP-NEXT: retq
@@ -92,11 +92,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512-NEXT: retq
@@ -108,9 +108,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512-FCP-NEXT: vzeroupper
@@ -121,11 +121,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-NEXT: retq
@@ -137,9 +137,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
@@ -150,11 +150,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512BW-NEXT: retq
@@ -166,9 +166,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512BW-FCP-NEXT: vzeroupper
@@ -179,11 +179,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-BW-NEXT: retq
@@ -195,9 +195,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
 ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index 4f80140bc6c1b..fea8ebdf116fa 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -24,19 +24,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
 ; SSE-NEXT: movdqa %xmm0, %xmm4
 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rsi)
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rdx)
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
 ; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT: movq %xmm1, (%r9)
 ; SSE-NEXT: retq
 ;
@@ -46,16 +46,16 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
 ; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vpextrq $1, %xmm5, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX-NEXT: vmovq %xmm0, (%r8)
-; AVX-NEXT: vmovq %xmm1, (%r9)
+; AVX-NEXT: vmovq %xmm4, (%r9)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: load_i32_stride5_vf2:
@@ -64,17 +64,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-NEXT: vmovq %xmm0, (%r8)
-; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -84,17 +84,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
 ;
@@ -104,17 +104,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
 ; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
@@ -123,21 +123,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm4, (%rdx)
+; AVX512-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%r9)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -146,19 +146,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -167,21 +167,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -190,19 +190,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -211,21 +211,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -234,19 +234,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -255,21 +255,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -278,19 +278,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <10 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 85ed61811af53..49b131827c447 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -18,31 +18,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
 ; SSE-LABEL: load_i32_stride6_vf2:
 ; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT: movdqa (%rdi), %xmm1
 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
 ; SSE-NEXT: movq %xmm1, (%rsi)
-; SSE-NEXT: movq %xmm4, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; SSE-NEXT: movq %xmm5, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT: movq %xmm0, (%r9)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movq %xmm3, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride6_vf2:
@@ -53,22 +53,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovaps 32(%rdi), %xmm2
 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rdx)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rcx)
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; AVX-NEXT: vmovlps %xmm3, (%rsi)
-; AVX-NEXT: vmovlps %xmm4, (%rdx)
-; AVX-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX-NEXT: vmovlps %xmm0, (%r8)
-; AVX-NEXT: vmovlps %xmm6, (%r9)
-; AVX-NEXT: vmovlps %xmm1, (%rax)
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,2,3,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovlps %xmm0, (%r9)
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rax)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: load_i32_stride6_vf2:
@@ -80,22 +80,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -109,22 +109,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
@@ -138,54 +138,56 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
 ; AVX512-LABEL: load_i32_stride6_vf2:
 ; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbx
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512-NEXT: vmovd %xmm2, %ebx
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vmovd %xmm2, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm2, (%r9)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
+; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -195,56 +197,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbx
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-NEXT: popq %rbx
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -254,56 +258,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride6_vf2:
 ; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbx
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512BW-NEXT: vmovd %xmm2, %ebx
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vmovd %xmm2, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512BW-NEXT: popq %rbx
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -313,56 +319,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride6_vf2:
 ; AVX512DQ-BW: # %bb.0:
+; AVX512DQ-BW-NEXT: pushq %rbx
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-BW-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-BW-NEXT: popq %rbx
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -372,25 +380,25 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <12 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 7948141f6becd..64ddca71898b3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -18,35 +18,35 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
 ; SSE-LABEL: load_i32_stride7_vf2:
 ; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movq %xmm1, (%rsi)
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; SSE-NEXT: movq %xmm0, (%rsi)
 ; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
 ; SSE-NEXT: movq %xmm5, (%rcx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm3, (%r10)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movq %xmm4, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_i32_stride7_vf2:
@@ -60,26 +60,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rdx)
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rcx)
 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r8)
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r10)
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovlps %xmm5, (%rsi)
-; AVX-NEXT: vmovlps %xmm6, (%rdx)
-; AVX-NEXT: vmovlps %xmm7, (%rcx)
-; AVX-NEXT: vmovlps %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm4, (%r10)
 ; AVX-NEXT: vmovlps %xmm0, (%rax)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
@@ -94,27 +94,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -129,27 +129,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
@@ -164,27 +164,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10)
 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
@@ -195,31 +195,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512-NEXT: vmovd %xmm1, %r11d -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512-NEXT: vmovaps (%rdi), %ymm6 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rcx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512-NEXT: vmovaps (%rdi), %ymm5 -; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm1, (%r9) -; AVX512-NEXT: vmovlps %xmm7, (%r10) -; AVX512-NEXT: vmovlps %xmm5, (%rax) +; AVX512-NEXT: vmovlps %xmm4, (%r9) +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -231,24 +231,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; 
AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -259,31 +259,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512DQ-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -295,24 +295,24 @@ 
define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -323,31 +323,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512BW-NEXT: vmovd %xmm1, %r11d -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -359,24 +359,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -387,31 +387,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vpinsrd $1, 
28(%rdi), %xmm0, %xmm2 ; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -423,24 +423,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 13410fb5cc4b8..a118b4056b3d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -27,22 +27,22 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm7, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r11) +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movq %xmm1, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride8_vf2: @@ -55,26 +55,26 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpextrq $1, %xmm2, (%r8) +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r9) +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%r11) +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r10) ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm2, (%rcx) -; AVX-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX-NEXT: vmovlps %xmm3, (%r9) -; AVX-NEXT: vmovlps %xmm6, (%r11) -; AVX-NEXT: vmovlps %xmm7, (%r10) ; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -84,30 +84,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vmovq %xmm2, (%rcx) ; AVX2-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-NEXT: vmovlps %xmm3, (%r9) -; AVX2-NEXT: vmovlps %xmm6, (%r11) +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r9) +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r11) +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovlps %xmm1, (%r10) +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -117,30 +117,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -150,30 +150,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -186,28 +186,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-NEXT: 
vextractf128 $1, %ymm1, %xmm4 -; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm5, (%r9) -; AVX512-NEXT: vmovlps %xmm6, (%r11) -; AVX512-NEXT: vmovlps %xmm4, (%r10) -; AVX512-NEXT: vmovlps %xmm1, (%rax) +; AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r9) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r11) +; AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovlps %xmm1, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -219,27 +219,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm3, 
(%r11) +; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -251,28 +251,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -284,27 +284,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovaps (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -316,28 +316,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 
= xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -349,27 +349,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; 
AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -381,28 +381,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -414,27 +414,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 81fe19c4d8b56..b609299e5f757 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -280,9 +280,9 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -290,8 +290,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -299,8 +299,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -308,8 +308,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -317,8 +317,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -326,8 +326,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -335,8 +335,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -344,8 +344,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -353,41 +353,41 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride2_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpmovwb %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..a238371f0acbf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -378,39 +378,39 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: movq 
%xmm4, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -421,14 +421,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -439,14 +439,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -457,14 +457,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -475,14 +475,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; @@ -493,14 +493,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: 
vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -511,14 +511,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -529,14 +529,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -547,14 +547,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -565,14 +565,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: retq ; @@ -583,14 +583,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: retq ; @@ -601,14 +601,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: retq ; @@ -619,14 +619,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980277ece..1dff9f4b8fa2d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -409,62 +409,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: 
movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movq %xmm3, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm6, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride4_vf8: @@ -475,22 +475,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: 
vpshufb %xmm5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride4_vf8: @@ -501,22 +501,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rcx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf8: @@ -527,22 +527,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = 
[2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf8: @@ -553,125 +553,125 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: 
# %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride4_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride4_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride4_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: 
vpsrld $8, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..5db006e5dadb3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -583,133 +583,133 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = 
[0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movq %xmm8, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pslld $24, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = 
xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm8[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn 
%xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm9, (%r8) +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm2, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm10, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movq %xmm5, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride5_vf8: @@ -722,30 +722,30 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) ; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; @@ -758,26 +758,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] 
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -790,26 +790,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -822,26 +822,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; @@ -854,26 +854,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vmovq %xmm3, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%r8) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: retq ; @@ -886,26 +886,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; @@ -918,26 +918,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: retq ; @@ -950,26 +950,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -982,26 +982,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: retq ; @@ -1014,26 +1014,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: retq ; @@ -1046,26 +1046,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr 
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: retq ; @@ -1078,26 +1078,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <40 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index f87126a98eea4..763b8a67edaf7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -755,146 +755,146 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; 
SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movq %xmm9, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movq %xmm12, (%rcx) 
; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movq %xmm10, (%r8) +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) -; SSE-NEXT: movq %xmm10, (%rcx) -; SSE-NEXT: movq %xmm12, (%r8) -; SSE-NEXT: movq %xmm3, (%r9) -; SSE-NEXT: movq %xmm9, (%rax) +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf8: @@ -910,42 +910,42 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -959,30 +959,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rsi) ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm2, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm3, (%r8) -; 
AVX2-NEXT: vmovq %xmm5, (%r9) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -997,30 +997,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1035,30 +1035,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1073,30 +1073,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq 
%xmm4, (%rsi) ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq %xmm4, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm4, (%rsi) -; AVX512-NEXT: vmovq %xmm2, (%rdx) -; AVX512-NEXT: vmovq %xmm6, (%rcx) -; AVX512-NEXT: vmovq %xmm3, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1111,30 +1111,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1149,30 +1149,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; 
AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1187,30 +1187,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1225,30 +1225,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1263,30 +1263,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1301,30 +1301,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1339,30 +1339,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 82481269022b0..09d00795e4cc9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -932,106 +932,100 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 ; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm13[0],xmm8[1,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm13 ; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm7, %xmm15 +; SSE-NEXT: movq %xmm15, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm1, 
%xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movq %xmm9, (%rdx) ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -1040,107 +1034,104 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movq %xmm8, (%rcx) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movq %xmm4, (%r8) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; 
SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rcx) +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] @@ -1148,12 +1139,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movq %xmm13, (%rsi) -; SSE-NEXT: movq %xmm9, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm10, (%r9) -; SSE-NEXT: movq %xmm11, (%rdi) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -1174,52 +1159,52 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] ; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; 
AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1235,45 +1220,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: 
vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) -; AVX2-NEXT: vmovq %xmm6, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1290,45 +1275,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; 
AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1345,45 +1330,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FCP-NEXT: 
vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1400,44 +1385,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rcx) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r10) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1454,44 +1439,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vextracti128 
$1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1508,44 +1493,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1562,44 +1547,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; 
AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1617,48 +1602,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: 
vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1676,48 +1661,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1735,48 +1720,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw 
%ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1794,48 +1779,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, 
%ymm0, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 6770fb6660606..deb74d2b4651f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -878,212 +878,205 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn 
%xmm8, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,1,3] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm6, (%rax) +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm15, (%r8) -; SSE-NEXT: movq %xmm11, (%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm9, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm3, (%rax) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride8_vf8: @@ -1104,76 +1097,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX-NEXT: 
vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r11) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: 
vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm6, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) -; AVX-NEXT: vmovq %xmm9, (%r11) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1195,76 +1188,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rsi) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-NEXT: 
vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r9) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r11) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r10) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm7, (%r8) -; AVX2-NEXT: vmovq %xmm8, (%r9) -; AVX2-NEXT: vmovq %xmm9, (%r11) -; AVX2-NEXT: vmovq %xmm10, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -1286,76 +1279,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r11) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r10) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FP-NEXT: 
vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FP-NEXT: vmovq %xmm10, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -1364,54 +1357,54 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm9 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovq %xmm9, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r11) +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1421,21 +1414,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-NEXT: vpsrlq 
$48, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1445,21 +1438,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-FCP-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1469,21 +1462,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r11) +; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1493,21 +1486,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpsrlq $32, 
%zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -1517,21 +1510,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -1541,21 +1534,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -1565,21 +1558,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -1589,21 +1582,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <64 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index b233855029c58..324fe12de9400 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: movswl %dx, %edx
 ; CHECK-NEXT: leal (,%rdx,4), %esi
 ; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: shldw $1, %di, %si
 ; CHECK-NEXT: sarl $14, %edx
 ; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000
-; CHECK-NEXT: cmovgel %eax, %edi
+; CHECK-NEXT: cmovgel %eax, %esi
 ; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %edi
-; CHECK-NEXT: pinsrw $3, %edi, %xmm1
+; CHECK-NEXT: cmovll %ecx, %esi
+; CHECK-NEXT: pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: pextrw $2, %xmm0, %eax
 ; CHECK-NEXT: leal (%rax,%rax,2), %eax
 ; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shldw $1, %ax, %cx
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shldw $1, %dx, %cx
+; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000
 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT: cmovael %eax, %ecx
 ; CHECK-NEXT: pextrw $1, %xmm0, %edx
 ; CHECK-NEXT: addl %edx, %edx
 ; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldw $1, %dx, %di
-; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
 ; CHECK-NEXT: cmovael %eax, %edi
 ; CHECK-NEXT: movd %xmm0, %edx
 ; CHECK-NEXT: xorl %esi, %esi
@@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT: pextrw $3, %xmm0, %ecx
 ; CHECK-NEXT: shll $2, %ecx
 ; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shldw $1, %cx, %si
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: shldw $1, %dx, %si
+; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000
 ; CHECK-NEXT: cmovael %eax, %esi
 ; CHECK-NEXT: pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 320dce840ea57..6cb43234d713b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT: vpmovw2m %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
 %a = icmp eq <16 x i8> %0, zeroinitializer
@@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: retq
 %a = icmp eq <16 x i8> %0, %1
@@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
 ; AVX512F-NEXT: setnp %al
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT: kmovd %k0, %eax
 ; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
 ; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
 ; AVX512BW-NEXT: setnp %al
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
 ; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
 ; AVX512VL-NEXT: setnp %al
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index a768baae97add..466fa6ba098b3 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512DQ-SLOW: # %bb.0:
 ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
-; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: movw $255, %ax
-; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
 ; AVX512DQ-SLOW-NEXT: vzeroupper
@@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512DQ-FAST: # %bb.0:
 ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
-; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: movw $255, %ax
-; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
 ; AVX512DQ-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420bb2609..aff2228c258b5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index c5d3297e334c7..7c1a531628eab 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1931,31 +1931,28 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
 define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v8i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1977,7 +1974,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2003,14 +2001,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v8i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
 ret <8 x i8> %shift
@@ -2019,31 +2015,28 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v4i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v4i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2065,7 +2058,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2091,14 +2084,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v4i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
 ret <4 x i8> %shift
@@ -2107,31 +2098,28 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; SSE-LABEL: constant_shift_v2i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
 ; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2153,7 +2141,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2179,14 +2167,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; X86-SSE-LABEL: constant_shift_v2i8:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = ashr <2 x i8> %a, <i8 2, i8 3>
 ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a..4450d07e01cca 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 %shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index eb39b6a0d2227..e6eb4d70d22c9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1617,39 +1617,34 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v8i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v8i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1671,7 +1666,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1698,12 +1694,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v8i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
 ret <8 x i8> %shift
@@ -1713,39 +1707,34 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v4i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v4i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v4i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1767,7 +1756,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1794,12 +1783,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v4i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
 ret <4 x i8> %shift
@@ -1809,39 +1796,34 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
 ; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: constant_shift_v2i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1863,7 +1845,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1890,12 +1872,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v2i8:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
 ; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
 %shift = lshr <2 x i8> %a, <i8 2, i8 3>
 ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 %shift = shl <64 x i8> %a, %b
 ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index d245bdca6ee29..ec7db86e5e05e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1478,7 +1478,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1567,7 +1568,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1656,7 +1657,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX512BW-LABEL: constant_shift_v2i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index ee9d8a55aeb3e..35e1c5a559a95 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3575,21 +3575,17 @@ define void @SpinningCube() {
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSE2-NEXT: addps %xmm0, %xmm3
-; SSE2-NEXT: movaps %xmm3, (%rax)
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: addps %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, (%rax)
+; SSE2-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE2-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: SpinningCube:
@@ -3598,54 +3594,43 @@ define void @SpinningCube() {
 ; SSSE3-NEXT: xorps %xmm0, %xmm0
 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSSE3-NEXT: addps %xmm0, %xmm3
-; SSSE3-NEXT: movaps %xmm3, (%rax)
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
-; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: addps %xmm2, %xmm0
-; SSSE3-NEXT: movaps %xmm0, (%rax)
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSSE3-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: addps %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, (%rax)
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: SpinningCube:
 ; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0]
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
-; SSE41-NEXT: addps %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, (%rax)
-; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
-; SSE41-NEXT: mulps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE41-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE41-NEXT: movaps %xmm1, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSE41-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: addps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, (%rax)
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: SpinningCube:
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovaps %xmm2, (%rax)
-; AVX-NEXT: vbroadcastss (%rax), %xmm2
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vmovaps %xmm0, (%rax)
 ; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll
similarity index 59%
rename from llvm/test/CodeGen/X86/issue163738.ll
rename to llvm/test/CodeGen/X86/vpternlog.ll
index 61fe043a970dd..bd7478d3a82d5 100644
--- a/llvm/test/CodeGen/X86/issue163738.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 %and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1)
 ret <8 x i64> %and3
 }
+
+define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
+; CHECK-LABEL: xorbitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: retq
+ %or1 = or <64 x i8> %a, %b
+ %or2 = or <64 x i8> %or1, %c
+ %cast = bitcast <64 x i8> %or2 to <8 x i64>
+ %xor = xor <8 x i64> %cast, splat (i64 -1)
+ ret <8 x i64> %xor
+}
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index f70145d6b21c2..0fe3edab4ac38 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -641,10 +641,10 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
 ; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c98eba69ae5b..1c3d27fac4203 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT: movl %edx, (%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
 ; FALLBACK18-NEXT: andb $12, %bl
-; FALLBACK18-NEXT: movzbl %bl, %esi
-; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
-; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx
-; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %ebp, %ecx
-; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT: orl %ebp, %edi
-; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx
-; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
-; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
-; FALLBACK18-NEXT: orl %ebx, %edx
+; FALLBACK18-NEXT: movzbl %bl, %edi
+; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx
+; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK18-NEXT: notb %al
+; FALLBACK18-NEXT: leal (%esi,%esi), %edx
+; FALLBACK18-NEXT: shlxl %eax, %edx, %edx
+; FALLBACK18-NEXT: orl %ebp, %edx
+; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp
+; FALLBACK18-NEXT: addl %ebx, %ebx
+; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT: orl %ebp, %ebx
+; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx
 ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT: movl %eax, 12(%esi)
-; FALLBACK18-NEXT: movl %edx, 8(%esi)
-; FALLBACK18-NEXT: movl %edi, (%esi)
-; FALLBACK18-NEXT: movl %ecx, 4(%esi)
+; FALLBACK18-NEXT: movl %ecx, 12(%esi)
+; FALLBACK18-NEXT: movl %eax, 8(%esi)
+; FALLBACK18-NEXT: movl %ebx, (%esi)
+; FALLBACK18-NEXT: movl %edx, 4(%esi)
 ; FALLBACK18-NEXT: addl $44, %esp
 ; FALLBACK18-NEXT: popl %esi
 ; FALLBACK18-NEXT: popl %edi
@@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT: pushl %ebx
 ; FALLBACK22-NEXT: pushl %edi
 ; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: subl $60, %esp
 ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: movzbl (%eax), %edx
+; FALLBACK22-NEXT: movl %edx, %eax
 ; FALLBACK22-NEXT: shlb $3, %al
 ; FALLBACK22-NEXT: xorps %xmm1, %xmm1
 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, (%esp)
-; FALLBACK22-NEXT: andb $12, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: notb %cl
-; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
-; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
-; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
-; FALLBACK22-NEXT: orl %ebx, %edx
-; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
-; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
-; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT: orl %ebx, %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: andb $12, %dl
+; FALLBACK22-NEXT: movzbl %dl, %edi
+; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp
+; FALLBACK22-NEXT: notb %al
+; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx
+; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK22-NEXT: addl %edx, %edx
+; FALLBACK22-NEXT:
shlxl %eax, %edx, %edx +; FALLBACK22-NEXT: orl %ebp, %edx +; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ecx, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: addl $60, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx @@ -1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: subl $60, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: movzbl %dl, %edi +; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edx +; FALLBACK26-NEXT: orl %ebp, %edx +; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ecx, 
12(%esi) +; FALLBACK26-NEXT: movl %eax, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: addl $60, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx @@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: subl $60, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: movzbl %dl, %edi +; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edx +; FALLBACK30-NEXT: orl %ebp, %edx +; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ecx, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: addl $60, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx @@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: 
movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %esi, %edx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: shrxl %eax, %edx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: movl %esi, 12(%ecx) +; FALLBACK18-NEXT: movl %ebx, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %ecx -; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %esi, %edx -; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi +; 
FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp ; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, (%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: orl %ebx, %ebp +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %eax, %edx, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, (%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebp, 8(%edx) +; FALLBACK22-NEXT: movl %esi, 12(%edx) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %ecx -; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %esi, %edx -; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp ; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, (%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: 
orl %ebx, %ebp +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %eax, %edx, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, (%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebp, 8(%edx) +; FALLBACK26-NEXT: movl %esi, 12(%edx) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %ecx -; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %esi, %edx -; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp ; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, (%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %ebx, %ebp +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %eax, %edx, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, (%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebp, 8(%edx) +; FALLBACK30-NEXT: movl %esi, 12(%edx) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: 
shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: @@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: @@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, 
-{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $24, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 
(%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $24, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; 
FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ebx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %eax, %edi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx +; 
FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: movl %edi, %edx +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK22-NEXT: shrxl %edi, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebx, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ecx +; 
FALLBACK26-NEXT: shlb $3, %cl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ebx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %eax, %edi +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK26-NEXT: movl %ebp, %ecx -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: movl %edi, %edx +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK26-NEXT: shrxl %edi, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: 
shlxl %ecx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebx, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ecx +; FALLBACK30-NEXT: shlb $3, %cl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl 
%ecx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ebx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %eax, %edi +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK30-NEXT: movl %ebp, %ecx -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: movl %edi, %edx +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK30-NEXT: shrxl %edi, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: orl %eax, %ebx ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, 
%eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebx, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: @@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: 
shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: @@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $5, %cl ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $6, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; 
FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $5, %cl ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $6, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $6, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: 
def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: @@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; 
FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: @@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; 
FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: movl %eax, %ebp ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, %esi +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebx, %ecx 
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx +; FALLBACK18-NEXT: movl %ecx, %ebx ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx +; FALLBACK18-NEXT: movl %esi, %eax ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: movl %ecx, (%edx) +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl %esi, 28(%edx) +; FALLBACK18-NEXT: movl %edi, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 
FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: orl %esi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi +; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movl %ebp, (%ecx) +; FALLBACK22-NEXT: movl %eax, 28(%ecx) +; FALLBACK22-NEXT: movl %esi, 24(%ecx) +; FALLBACK22-NEXT: movl %edi, 4(%ecx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 12(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl %eax, 16(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: movl %eax, 20(%ecx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %esi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %ebp, (%ecx) +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl %esi, 24(%ecx) +; FALLBACK26-NEXT: movl %edi, 4(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 
24(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; 
FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %esi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, (%ecx) +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl %esi, 24(%ecx) +; FALLBACK30-NEXT: movl %edi, 4(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq 
-24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: @@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: shlb $2, %sil +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; 
FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: @@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $2, %sil +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq 
%rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $2, %sil +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; 
FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: @@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: @@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; 
FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: @@ -9387,30 +9376,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: @@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -9680,22 
+9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 8(%esi), %ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movzbl (%edx), %edx +; FALLBACK18-NEXT: movl 20(%esi), %ecx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %cl -; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: andb $28, %dl +; FALLBACK18-NEXT: movzbl %dl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 
32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebx -; FALLBACK22-NEXT: movl 28(%ecx), %edx -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl 24(%ecx), %ebp +; FALLBACK22-NEXT: movl 28(%ecx), %ecx +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shlb $3, %bl +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %edx -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %eax, %edx -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; 
FALLBACK22-NEXT: movzbl %dl, %ecx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: sarxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: movl %ebp, %edx +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %esi, 24(%edx) +; FALLBACK22-NEXT: movl 
%edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebx -; FALLBACK26-NEXT: movl 28(%ecx), %edx -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl 24(%ecx), %ebp +; FALLBACK26-NEXT: movl 28(%ecx), %ecx +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shlb $3, %bl +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %edx -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %eax, %edx -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, 
%eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ecx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: sarxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: movl %ebp, %edx +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) 
+; FALLBACK26-NEXT: movl %esi, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl 24(%ecx), %ebp +; FALLBACK30-NEXT: movl 28(%ecx), %ecx +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shlb $3, %bl +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ecx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: movl %ebp, %edx +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl 
%ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %esi, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: @@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; 
FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: @@ -11431,30 +11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: @@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), 
%rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: @@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq 
%rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: @@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; 
FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; 
FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: @@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups 
%ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 +; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 +; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq 
%rcx, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; 
FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx 
+; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 
+; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx @@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 40(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 44(%eax), %ebp ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl (%eax), %ebx ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: andl $60, %ebx +; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %edi -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp 
+; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; 
FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %ebx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: leal (,%ebx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: andl $60, %ebx +; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, 
%edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, 
%edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %edi -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl 
%ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; 
FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %edx, 60(%ecx) -; FALLBACK26-NEXT: movl %ebx, 56(%ecx) -; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %edi, 60(%ecx) +; FALLBACK26-NEXT: movl %edx, 56(%ecx) +; FALLBACK26-NEXT: movl %eax, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl %ebp, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: movl (%eax), %ecx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: leal (,%ecx,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, 
%edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; 
FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %eax, %ebp +; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: shlxl %edx, %eax, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edi, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %eax, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl %ebp, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movl %eax, %ecx ; 
FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 ; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK2-NEXT: orq %r8, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %rbx, %r8 ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r14, %rdi +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx +; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 +; FALLBACK2-NEXT: shrq %r15 +; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 +; FALLBACK2-NEXT: orq %r14, %r15 +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r14, %rsi +; FALLBACK2-NEXT: shrq %rbx +; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r15, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r8, 24(%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; 
FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: @@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movl (%rsi), %esi ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: negl %esi +; FALLBACK6-NEXT: movslq %esi, %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %r9, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK6-NEXT: orq %r10, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK6-NEXT: shrq %r10 +; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %r11, %r10 +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %r14, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK6-NEXT: shrq %rbx +; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK6-NEXT: orq %r15, %rbx +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; 
FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK6-NEXT: shrq %r15 +; FALLBACK6-NEXT: shrxq %rax, %r15, %rax +; FALLBACK6-NEXT: orq %rcx, %rax +; FALLBACK6-NEXT: movq %r14, (%rdx) +; FALLBACK6-NEXT: movq %rax, 56(%rdx) +; FALLBACK6-NEXT: movq %rdi, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 8(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, 24(%rdx) +; FALLBACK6-NEXT: movq %r9, 32(%rdx) +; FALLBACK6-NEXT: movq %r8, 40(%rdx) +; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: @@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, 
%esi +; FALLBACK10-NEXT: negl %esi +; FALLBACK10-NEXT: movslq %esi, %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %r9, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK10-NEXT: orq %r10, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK10-NEXT: shrq %r10 +; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %r11, %r10 +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK10-NEXT: shrq %r11 +; FALLBACK10-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %r14, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK10-NEXT: shrq %rbx +; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r15, %rbx +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK10-NEXT: shrq %r15 +; FALLBACK10-NEXT: shrxq %rax, %r15, %rax +; FALLBACK10-NEXT: orq %rcx, %rax +; FALLBACK10-NEXT: movq %r14, (%rdx) +; FALLBACK10-NEXT: movq %rax, 56(%rdx) +; FALLBACK10-NEXT: movq %rdi, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, 24(%rdx) +; FALLBACK10-NEXT: movq %r9, 32(%rdx) +; FALLBACK10-NEXT: movq %r8, 40(%rdx) +; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # 
%bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: negl %esi +; FALLBACK14-NEXT: movslq %esi, %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %r9, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK14-NEXT: orq %r10, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrq %r10 +; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %r11, %r10 +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %r14, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK14-NEXT: shrq %rbx +; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r15, %rbx +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax 
-; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK14-NEXT: shrq %r15 +; FALLBACK14-NEXT: shrxq %rax, %r15, %rax +; FALLBACK14-NEXT: orq %rcx, %rax +; FALLBACK14-NEXT: movq %r14, (%rdx) +; FALLBACK14-NEXT: movq %rax, 56(%rdx) +; FALLBACK14-NEXT: movq %rdi, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, 24(%rdx) +; FALLBACK14-NEXT: movq %r9, 32(%rdx) +; FALLBACK14-NEXT: movq %r8, 40(%rdx) +; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: leal (,%ebp,8), %ebx +; FALLBACK18-NEXT: andl $24, %ebx +; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi -; FALLBACK18-NEXT: movl (%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: subl %ebp, %edx +; FALLBACK18-NEXT: movl (%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edx), %ecx ; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edx), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edi), %esi -; FALLBACK18-NEXT: movl %esi, %ecx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 12(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK18-NEXT: movl 16(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 20(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 12(%edx), %esi +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%edi), %ecx +; FALLBACK18-NEXT: movl 16(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 28(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 20(%edx), %ecx +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 36(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: movl 24(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 28(%edx), %esi +; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 44(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 36(%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edx), %edi +; FALLBACK18-NEXT: movl %edi, %esi ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 52(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 44(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %edi, %edi +; FALLBACK18-NEXT: movl %eax, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK18-NEXT: orl %eax, %ebp -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 48(%edx), %ebp +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 52(%edx), %ecx +; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK18-NEXT: movl %esi, %ebp ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: negl %eax -; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK18-NEXT: movl 56(%edi), %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %edx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, (%eax) -; FALLBACK18-NEXT: movl %esi, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 60(%eax) -; FALLBACK18-NEXT: movl %ebp, 48(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl 56(%edx), %edi +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK18-NEXT: negl %ebx +; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK18-NEXT: orl %ecx, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %edi, (%edx) +; FALLBACK18-NEXT: movl %eax, 56(%edx) +; FALLBACK18-NEXT: movl %ebx, 60(%edx) +; FALLBACK18-NEXT: movl %esi, 48(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 52(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 40(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 44(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 32(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 36(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: leal (,%eax,8), %ebx +; FALLBACK22-NEXT: andl $24, %ebx +; FALLBACK22-NEXT: movl %ebx, %ecx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK22-NEXT: subl %eax, %edi -; FALLBACK22-NEXT: movl (%edi), %ecx -; 
FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: subl %eax, %edx +; FALLBACK22-NEXT: movl (%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 4(%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edx), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 8(%edi), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 12(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 16(%edi), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 20(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 12(%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %edi +; FALLBACK22-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%edi), %ecx +; FALLBACK22-NEXT: movl 16(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 28(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 20(%edx), %ecx +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 32(%edi), %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 36(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: movl 24(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 28(%edx), %esi +; FALLBACK22-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 40(%edi), %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 32(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 44(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 36(%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, %eax ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%edi), %esi +; FALLBACK22-NEXT: orl %ebp, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 40(%edx), %edi +; FALLBACK22-NEXT: movl %edi, %esi ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 52(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: movl 44(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: movl %eax, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK22-NEXT: orl %eax, %ebp -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 48(%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 52(%edx), %ecx +; FALLBACK22-NEXT: 
shlxl %esi, %ecx, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK22-NEXT: movl %esi, %ebp ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: negl %eax -; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK22-NEXT: movl 56(%edi), %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %edx -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edx, %esi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK22-NEXT: movl %edx, (%eax) -; FALLBACK22-NEXT: movl %esi, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 60(%eax) -; FALLBACK22-NEXT: movl %ebp, 48(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 56(%edx), %edi +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK22-NEXT: negl %ebx +; FALLBACK22-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK22-NEXT: orl %ecx, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %edi, (%edx) +; FALLBACK22-NEXT: movl %eax, 56(%edx) +; FALLBACK22-NEXT: movl %ebx, 60(%edx) +; FALLBACK22-NEXT: movl %esi, 48(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 52(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 40(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 44(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 32(%edx) +; 
FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 36(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 24(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 4(%edx) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: leal (,%eax,8), %ebx +; FALLBACK26-NEXT: andl $24, %ebx +; FALLBACK26-NEXT: movl %ebx, %ecx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK26-NEXT: subl %eax, %edi -; FALLBACK26-NEXT: movl (%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edi), %eax +; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: subl %eax, %edx +; FALLBACK26-NEXT: movl (%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 4(%edx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 8(%edx), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edi), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 12(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 20(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: shrl %esi +; 
FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 12(%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %ecx, %edi +; FALLBACK26-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edi), %ecx +; FALLBACK26-NEXT: movl 16(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 28(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 20(%edx), %ecx +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edi), %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 36(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: movl 24(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 28(%edx), %esi +; FALLBACK26-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edi), %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 32(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 44(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 36(%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, %eax ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: 
shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edi), %esi +; FALLBACK26-NEXT: orl %ebp, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 40(%edx), %edi +; FALLBACK26-NEXT: movl %edi, %esi ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 52(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK26-NEXT: movl 44(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: movl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 48(%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 52(%edx), %ecx +; FALLBACK26-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK26-NEXT: movl %esi, %ebp ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: negl %eax -; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK26-NEXT: movl 56(%edi), %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %edx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edx, %esi ; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK26-NEXT: movl %edx, (%eax) -; FALLBACK26-NEXT: movl %esi, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 60(%eax) -; FALLBACK26-NEXT: movl %ebp, 48(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 56(%edx), %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK26-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK26-NEXT: negl %ebx +; FALLBACK26-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK26-NEXT: orl %ecx, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %edi, (%edx) +; FALLBACK26-NEXT: movl %eax, 56(%edx) +; FALLBACK26-NEXT: movl %ebx, 60(%edx) +; FALLBACK26-NEXT: movl %esi, 48(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 52(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 40(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 44(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 32(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 36(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 24(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 4(%edx) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: leal (,%eax,8), %ebx +; FALLBACK30-NEXT: andl $24, %ebx +; FALLBACK30-NEXT: movl %ebx, %ecx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK30-NEXT: subl %eax, %edi -; FALLBACK30-NEXT: movl (%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: 
notb %bl -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: subl %eax, %edx +; FALLBACK30-NEXT: movl (%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edi), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 12(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 20(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 28(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl 4(%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 8(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 36(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 12(%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %ecx, %edi +; FALLBACK30-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK30-NEXT: movl 40(%edi), %ecx +; FALLBACK30-NEXT: movl 16(%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 44(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: movl 20(%edx), %ecx +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edi), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 52(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: movl 28(%edx), %esi +; FALLBACK30-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: negl %eax -; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK30-NEXT: movl 56(%edi), %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %edx +; FALLBACK30-NEXT: movl 32(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 36(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, %eax ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edx, %esi +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 40(%edx), %edi +; FALLBACK30-NEXT: movl %edi, %esi +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK30-NEXT: movl 44(%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: movl %eax, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK30-NEXT: movl %edx, (%eax) -; FALLBACK30-NEXT: movl %esi, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 60(%eax) -; FALLBACK30-NEXT: movl %ebp, 48(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 52(%edx), %ecx +; FALLBACK30-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 56(%edx), %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK30-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK30-NEXT: negl %ebx +; FALLBACK30-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK30-NEXT: orl %ecx, %ebx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %edi, (%edx) +; FALLBACK30-NEXT: movl %eax, 56(%edx) +; FALLBACK30-NEXT: movl %ebx, 60(%edx) +; FALLBACK30-NEXT: movl %esi, 48(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 52(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 40(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 44(%edx) +; FALLBACK30-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 32(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 36(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 24(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 4(%edx) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: ashr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, 
%r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_64bytes: @@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: ashr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; 
FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_64bytes: @@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: ashr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: 
vmovups 32(%rdi), %xmm1 ; FALLBACK10-NEXT: movq 48(%rdi), %rcx @@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rax,8), %ecx +; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: movl %ecx, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK10-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %r10, %rcx +; FALLBACK10-NEXT: sarxq %rsi, %rax, 
%rax +; FALLBACK10-NEXT: movq %rax, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: ashr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK14-NEXT: movq 48(%rdi), %rcx @@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %esi -; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: leal (,%rax,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: movl %ecx, %esi ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK14-NEXT: movl %esi, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK14-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rcx, 
%r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %r13, %rax -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK14-NEXT: orq %rbp, %rcx -; FALLBACK14-NEXT: movq %rsi, 56(%rdx) +; FALLBACK14-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %r10, %rcx +; FALLBACK14-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; 
FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp 
-; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebp +; FALLBACK22-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %ebp +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK26-NEXT: leal (%edi,%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx 
-; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl %edi, 60(%eax) +; FALLBACK26-NEXT: movl %edx, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 48(%eax) +; FALLBACK26-NEXT: movl %esi, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl 
%edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %ebp +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; 
FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK30-NEXT: leal (%edi,%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK30-NEXT: addl %ebp, %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx +; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edx, 60(%eax) -; FALLBACK30-NEXT: movl %ebx, 56(%eax) -; FALLBACK30-NEXT: movl %edi, 48(%eax) -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl %edi, 60(%eax) +; FALLBACK30-NEXT: movl %edx, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 48(%eax) +; FALLBACK30-NEXT: movl %esi, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..221a51ed44696 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl 
%ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
@@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
@@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
@@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
@@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
@@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
@@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi @@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index c3054a365c466..6b5c6049f025b 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -1635,22 +1635,22 
@@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2070,13 +2073,13 @@ define void 
@load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movw %cx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movl %ecx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> @@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq @@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp @@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl 
killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), 
%edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) @@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 84c2cc6d5ec31..bed8e5806380c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movb %al, (%rdx) @@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl @@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr 
%dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movw %ax, (%rdx) @@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl %eax, (%rdx) @@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; 
X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ 
-1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1908,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2084,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..9fbbba2ed3b47 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..59fbf7183abc6 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: 
.LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; 
X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq diff --git a/llvm/test/CodeGen/Xtensa/s32c1i.ll b/llvm/test/CodeGen/Xtensa/s32c1i.ll new file mode 100644 index 0000000000000..aad738abe6a4c --- 
/dev/null +++ b/llvm/test/CodeGen/Xtensa/s32c1i.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=xtensa -mattr=+s32c1i -filetype=obj %s -o - | llvm-objdump --arch=xtensa --mattr=s32c1i -d - | FileCheck %s -check-prefix=XTENSA + +define i32 @constraint_i(i32 %a) { +; XTENSA: 0: 22 e2 01 s32c1i a2, a2, 4 + %res = tail call i32 asm "s32c1i $0, $1, $2", "=r,r,i"(i32 %a, i32 4) + ret i32 %res +} diff --git a/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll new file mode 100644 index 0000000000000..aafeb5ceb0db3 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll @@ -0,0 +1,22 @@ +; RUN: opt -passes='print<module-debuginfo>' -disable-output 2>&1 < %s \ +; RUN: | FileCheck %s + +; CHECK: Compile unit: DW_LANG_C99 from /tmp/test1.c +; CHECK: Compile unit: DW_LNAME_C from /tmp/test2.c +; CHECK: Compile unit: unknown-language(0) from /tmp/test3.c + +!llvm.dbg.cu = !{!0, !6, !10} +!llvm.module.flags = !{!8, !9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "test1.c", directory: "/tmp") +!2 = !{} +!3 = !DIFile(filename: "test1.c", directory: "/tmp") +!4 = !DISubroutineType(types: !7) +!5 = !{null} +!6 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !7, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!7 = !DIFile(filename: "test2.c", directory: "/tmp") +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = distinct !DICompileUnit(sourceLanguageName: 0, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!11 = !DIFile(filename: "test3.c", directory: "/tmp") diff --git a/llvm/test/DebugInfo/Generic/objc-property.ll b/llvm/test/DebugInfo/Generic/objc-property.ll index 007d1fe698b30..1ee792941bcbb 100644 --- a/llvm/test/DebugInfo/Generic/objc-property.ll +++ b/llvm/test/DebugInfo/Generic/objc-property.ll @@ -5,33 +5,33 @@ ; CHECK: DW_TAG_structure_type ; CHECK: DW_AT_name ("Foo") ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[AUTO_SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("autoSynthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SYNTH:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("synthProp") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[GET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customGetterProp") ; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[SET:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customSetterProp") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") ; 
CHECK: DW_AT_APPLE_property_attribute ; CHECK-SAME: DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; CHECK: DW_TAG_APPLE_property +; CHECK: 0x[[ACCESSORS:[0-9a-f]+]]: DW_TAG_APPLE_property ; CHECK: DW_AT_APPLE_property_name ("customAccessorsProp") ; CHECK: DW_AT_APPLE_property_getter ("customGetter") ; CHECK: DW_AT_APPLE_property_setter ("customSetter:") @@ -39,15 +39,21 @@ ; CHECK-SAME: DW_APPLE_PROPERTY_getter, DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, ; CHECK-SAME: DW_APPLE_PROPERTY_setter, DW_APPLE_PROPERTY_atomic, DW_APPLE_PROPERTY_unsafe_unretained ; -; FIXME: missing link between DW_TAG_member and the associated DW_TAG_APPLE_property ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("someBackingIvar") +; CHECK: DW_AT_APPLE_property (0x[[SYNTH]] "synthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_autoSynthProp") +; CHECK: DW_AT_APPLE_property (0x[[AUTO_SYNTH]] "autoSynthProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customGetterProp") +; CHECK: DW_AT_APPLE_property (0x[[GET]] "customGetterProp") +; ; CHECK: DW_TAG_member -; CHECK-NOT: DW_AT_APPLE_property +; CHECK: DW_AT_name ("_customSetterProp") +; CHECK: DW_AT_APPLE_property (0x[[SET]] "customSetterProp") !llvm.module.flags = !{!0, !1} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/DebugInfo/Hexagon/lit.local.cfg b/llvm/test/DebugInfo/Hexagon/lit.local.cfg new file mode 100644 index 0000000000000..3bed54b1a88d2 --- /dev/null +++ b/llvm/test/DebugInfo/Hexagon/lit.local.cfg @@ -0,0 +1,2 @@ +if not "Hexagon" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/DebugInfo/Hexagon/packet-debug.mir b/llvm/test/DebugInfo/Hexagon/packet-debug.mir new file mode 100644 index 0000000000000..485b543b6e176 --- /dev/null +++ b/llvm/test/DebugInfo/Hexagon/packet-debug.mir @@ -0,0 +1,48 @@ +# RUN: llc -mtriple=hexagon -run-pass hexagon-packetizer %s -o - | FileCheck %s + +# CHECK-LABEL: name: factorial + +# The first bundle in bb.0 should have debug-location !19 (line 9), +# not !18 (line 0) from the DBG_VALUE instructions. +# CHECK: bb.0: +# CHECK: BUNDLE {{.*}}line: 9 + +--- | + define void @factorial() { ret void } + + !llvm.dbg.cu = !{!2} + !llvm.module.flags = !{!6, !7} + + !2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "test", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !3 = !DIFile(filename: "fact.c", directory: "/test") + !5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !6 = !{i32 2, !"Debug Info Version", i32 3} + !7 = !{i32 1, !"wchar_size", i32 4} + !12 = distinct !DISubprogram(name: "factorial", scope: !3, file: !3, line: 6, type: !13, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) + !13 = !DISubroutineType(types: !14) + !14 = !{!5, !5} + !16 = !DILocalVariable(name: "i", arg: 1, scope: !12, file: !3, line: 6, type: !5) + !18 = !DILocation(line: 0, scope: !12) + !19 = !DILocation(line: 9, column: 9, scope: !12) + !21 = !DILocation(line: 9, column: 7, scope: !12) + +... 
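A note on the rule the packet-debug test above encodes, as a hedged C sketch (the names bundle_loc and Instr are hypothetical, not the packetizer's API): when instructions are bundled, the bundle's debug location should come from the first instruction carrying a real line number, so the line-0 locations attached to the DBG_VALUEs do not win.

#include <stdio.h>

struct Loc { unsigned line; };
struct Instr { struct Loc loc; };

/* Return the location for a newly formed bundle: the first bundled
 * instruction with a nonzero line wins; line-0 (artificial) locations
 * are skipped. */
static struct Loc bundle_loc(const struct Instr *instrs, int n) {
  for (int i = 0; i < n; ++i)
    if (instrs[i].loc.line != 0)
      return instrs[i].loc;
  return (struct Loc){0}; /* every location was artificial */
}

int main(void) {
  /* Two line-0 DBG_VALUE-style entries, then real line-9 instructions. */
  struct Instr bundle[] = {{{0}}, {{0}}, {{9}}, {{9}}};
  printf("bundle line: %u\n", bundle_loc(bundle, 4).line); /* prints 9 */
  return 0;
}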
+--- +name: factorial +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0 + + DBG_VALUE $r0, $noreg, !16, !DIExpression(), debug-location !18 + $r2 = A2_tfr $r0 + DBG_VALUE $r2, $noreg, !16, !DIExpression(), debug-location !18 + renamable $p0 = C2_cmpeqi killed $r0, 1, debug-location !19 + renamable $r0 = A2_tfrsi 1 + J2_jumpt killed $p0, %bb.1, implicit-def $pc, debug-location !21 + + bb.1: + PS_jmpret $r31, implicit-def dead $pc + +... diff --git a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir index a334e99b9cade..ea01835cae1e5 100644 --- a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir @@ -85,10 +85,11 @@ !15 = !DISubrange(count: 3) !16 = !DILocation(line: 8, scope: !8) !17 = !DILocation(line: 9, scope: !8) - !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !11) + !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !22) !19 = !DILocalVariable(name: "local", scope: !18, file: !2, line: 8, type: !13) !20 = !DILocation(line: 15, scope: !18) !21 = !DILocation(line: 16, scope: !18) + !22 = !{!19} ... --- diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir index c38c0a1a79f75..63dc44fb705fe 100644 --- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir +++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir @@ -73,13 +73,14 @@ !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug) !2 = !DIFile(filename: "bees.cpp", directory: "") - !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) - !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) + !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) + !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) !4 = !DILocalVariable(name: "bees", scope: !3, type: !5) !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !7 = !DILocation(line: 0, scope: !3) !8 = !{!4} + !9 = !{} ; CHECK: ![[METAVAR:[0-9]+]] = !DILocalVariable(name: "bees", diff --git a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir index 06ce18d8edaa7..28fc044e606b5 100644 --- a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir +++ b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir @@ -139,15 +139,15 @@ !23 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !24, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) !24 = !DISubroutineType(types: !25) !25 = !{null, !11} - !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !26 = distinct 
!DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !27 = !DILocation(line: 0, scope: !26) - !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 0, scope: !28) - !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !31 = !DILocation(line: 0, scope: !30) - !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !33 = !DILocation(line: 0, scope: !32) - !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !35 = !DILocation(line: 0, scope: !34) ... diff --git a/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test new file mode 100755 index 0000000000000..aa3f6dcb9632a --- /dev/null +++ b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test @@ -0,0 +1,13 @@ +; Test that the native PDB reader isn't crashed by index value bigger than +; number of types in TPI or IPI stream +; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --type-index=20000000\ +; RUN: | FileCheck -check-prefixes=TYPES,NOT_FOUND %s +; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --id-index=20000000\ +; RUN: | FileCheck -check-prefixes=IDS,NOT_FOUND %s + +TYPES: Types (TPI Stream) +IDS: Types (IPI Stream) +NOT_FOUND:============================================================ +NOT_FOUND: Showing 1 records. 
+NOT_FOUND: Type 0x1312D00 doesn't exist in TPI stream + diff --git a/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll new file mode 100644 index 0000000000000..51435b10fdc2a --- /dev/null +++ b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s + +; CHECK: .file "<stdin>" +; CHECK-NEXT: .section .debug$S,"dr" +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .long 4 # Debug section magic +; CHECK-NEXT: .long 241 +; CHECK-NEXT: .long .Ltmp1-.Ltmp0 # Subsection size +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .short .Ltmp3-.Ltmp2 # Record length +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .short 4353 # Record kind: S_OBJNAME +; CHECK-NEXT: .long 0 # Signature +; CHECK-NEXT: .byte 0 # Object name +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .short .Ltmp5-.Ltmp4 # Record length +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .short 4412 # Record kind: S_COMPILE3 +; CHECK-NEXT: .long 3 # Flags and language +; CHECK-NEXT: .short 208 # CPUType +; CHECK-NEXT: .short 0 # Frontend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 22000 # Backend version +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .asciz "0" # Null-terminated compiler version string +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .p2align 2, 0x0 + +!llvm.dbg.cu = !{} +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll index dbbef2b39587d..594607c6e95d8 100644 --- a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll +++ b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll @@ -281,15 +281,19 @@ lala: !11 = !{!13} !13 = !DILocalVariable(name: "baz", scope: !7, file: !1, line: 6, type: !10) !14 = !DILocation(line: 1, scope: !7) -!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !23) !21 = !DILocalVariable(name: "xyzzy", scope: !20, file: !1, line: 6, type: !10) !22 = !DILocation(line: 1, scope: !20) -!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!23 = !{!21} +!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !33) !31 = !DILocalVariable(name: "xyzzy", scope: !30, file: !1, line: 6, type: !10) !32 = !DILocation(line: 1, scope: !30) -!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!33 = !{!31} +!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !46) !41 = !DILocalVariable(name: "socks", scope: !40, 
file: !1, line: 6, type: !10) !42 = !DILocation(line: 1, scope: !40) -!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !47) !44 = !DILocation(line: 0, scope: !43, inlinedAt: !42) !45 = !DILocalVariable(name: "knees", scope: !43, file: !1, line: 6, type: !10) +!46 = !{!41} +!47 = !{!45} diff --git a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir index 8a0537658c9c0..2900f0bdcf864 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir +++ b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir @@ -82,15 +82,18 @@ !14 = !DISubroutineType(types: !15) !15 = !{!16} !16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) - !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !43, type: !14, isDefinition: true) !41 = !DILocalVariable(name: "towel", scope: !40, file: !2, line: 1, type: !16) !42 = !DILocation(line: 40, scope: !40) - !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !43 = !{!41} + !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !83, type: !14, isDefinition: true) !81 = !DILocalVariable(name: "socks", scope: !80, file: !2, line: 1, type: !16) !82 = !DILocation(line: 40, scope: !80) - !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !83 = !{!81} + !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !123, type: !14, isDefinition: true) !121 = !DILocalVariable(name: "shoes", scope: !120, file: !2, line: 1, type: !16) !122 = !DILocation(line: 40, scope: !120) + !123 = !{!121} ... 
--- diff --git a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll index e656c6237c068..145b5045687cf 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll +++ b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll @@ -108,6 +108,6 @@ exit: !106 = !DILocation(line: 1, scope: !104) !113 = !{!103} !203 = !DILocalVariable(name: "teacake", scope: !204, file: !2, line: 1, type: !16) -!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !113, type: !14, isDefinition: true) +!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !213, type: !14, isDefinition: true) !206 = !DILocation(line: 1, scope: !204) !213 = !{!203} diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir index 3beaf8996e4f0..ab57a9612702f 100644 --- a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir +++ b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir @@ -91,7 +91,7 @@ !10 = !{!11} !11 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !1, line: 3, type: !9) !12 = !DILocation(line: 3, column: 12, scope: !6) - !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10) + !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !{!14}) !14 = !DILocalVariable(name: "x", arg: 1, scope: !13, file: !1, line: 21, type: !9) !15 = !DILocation(line: 23, column: 12, scope: !13) diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/test/DebugInfo/extradata-node-reference.ll b/llvm/test/DebugInfo/extradata-node-reference.ll new file mode 100644 index 0000000000000..188175617b244 --- /dev/null +++ 
b/llvm/test/DebugInfo/extradata-node-reference.ll @@ -0,0 +1,101 @@ +;; Test verifies that node references in the extraData field are handled correctly +;; when used with tags like DW_TAG_member, DW_TAG_inheritance, etc. + +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s -check-prefix=CHECK-IR +; RUN: verify-uselistorder %s + +; Example 1: BitField with storage offset (extraData: i64 0) +%struct.BitField = type { i8 } +@bf = global %struct.BitField zeroinitializer, !dbg !9 + +; Example 2: Static member with constant value (extraData: i32 42) +%struct.Static = type { i32 } +@st = global %struct.Static zeroinitializer, !dbg !16 + +; Example 3: Discriminant value for variant (extraData: i32 100) +%union.Variant = type { [8 x i8] } +@var = global %union.Variant zeroinitializer, !dbg !24 + +; Example 4: Inheritance VBPtr offset (extraData: i32 0) +%class.Derived = type { i32 } +@der = global %class.Derived zeroinitializer, !dbg !35 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !8) +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{!9, !16, !24, !35} + +; extraData node definitions +!15 = !{i64 0} ; BitField storage offset +!22 = !{i32 42} ; Static member constant value +!33 = !{i32 100} ; Discriminant value +!41 = !{i32 0} ; VBPtr offset + +; CHECK-IR: !9 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !7, file: !3, line: 11, baseType: !10, flags: DIFlagStaticMember, extraData: !12) +; CHECK-IR: !12 = !{i32 42} +; CHECK-IR: !20 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !17, file: !3, baseType: !11, size: 32, extraData: !21) +; CHECK-IR: !21 = !{i32 100} +; CHECK-IR: !27 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !25, baseType: !28, extraData: !29) +; CHECK-IR: !29 = !{i32 0} +; CHECK-IR: !32 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !30, file: !3, line: 6, baseType: !11, size: 3, flags: DIFlagBitField, extraData: !33) +; CHECK-IR: !33 = !{i64 0} + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("bf") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("field") +; === BitField: extraData holds storage offset === +!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression()) +!10 = distinct !DIGlobalVariable(name: "bf", scope: !0, file: !1, line: 5, type: !11, isLocal: false, isDefinition: true) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BitField", file: !1, line: 5, size: 8, elements: !12) +!12 = !{!13} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "field", scope: !11, file: !1, line: 6, baseType: !14, size: 3, flags: DIFlagBitField, extraData: !15) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("st") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("const_val") +; CHECK: {{.*}} DW_AT_const_value (42) +; === Static Member: extraData holds constant value === +!16 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression()) +!17 = distinct !DIGlobalVariable(name: "st", scope: !0, file: !1, line: 10, type: !18, isLocal: false, isDefinition: true) +!18
= distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Static", file: !1, line: 10, size: 32, elements: !19) +!19 = !{!20} +!20 = !DIDerivedType(tag: DW_TAG_member, name: "const_val", scope: !18, file: !1, line: 11, baseType: !21, flags: DIFlagStaticMember, extraData: !22) +!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("var") +; CHECK: {{.*}} DW_TAG_member +; CHECK: {{.*}} DW_AT_name ("variant_none") +; CHECK: {{.*}} DW_AT_discr_value (0x64) +; === Discriminant: extraData holds discriminant value === +!24 = !DIGlobalVariableExpression(var: !25, expr: !DIExpression()) +!25 = distinct !DIGlobalVariable(name: "var", scope: !0, file: !1, line: 15, type: !26, isLocal: false, isDefinition: true) +!26 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Variant", file: !1, line: 15, size: 128, elements: !27) +!27 = !{!28} +!28 = !DICompositeType(tag: DW_TAG_variant_part, scope: !26, file: !1, size: 128, elements: !29, discriminator: !30) +!29 = !{!31, !32} +!30 = !DIDerivedType(tag: DW_TAG_member, scope: !28, file: !1, baseType: !14, size: 32, align: 32, flags: DIFlagArtificial) +!31 = !DIDerivedType(tag: DW_TAG_member, name: "variant_none", scope: !28, file: !1, baseType: !14, size: 32) +!32 = !DIDerivedType(tag: DW_TAG_member, name: "variant_some", scope: !28, file: !1, baseType: !14, size: 32, extraData: !33) + +; CHECK: {{.*}} DW_TAG_variable +; CHECK: {{.*}} DW_AT_name ("der") +; CHECK: {{.*}} DW_TAG_inheritance +; CHECK: {{.*}} DW_AT_type ({{.*}} "Base") +; === Inheritance: extraData holds VBPtr offset === +!35 = !DIGlobalVariableExpression(var: !36, expr: !DIExpression()) +!36 = distinct !DIGlobalVariable(name: "der", scope: !0, file: !1, line: 20, type: !37, isLocal: false, isDefinition: true) +!37 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Derived", file: !1, line: 20, size: 32, elements: !38) +!38 = !{!39} +!39 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !37, baseType: !40, extraData: !41) +!40 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Base", file: !1, line: 19, size: 32) diff --git a/llvm/test/Demangle/ms-operators.test b/llvm/test/Demangle/ms-operators.test index b940488786631..cafa1ae3c0663 100644 --- a/llvm/test/Demangle/ms-operators.test +++ b/llvm/test/Demangle/ms-operators.test @@ -143,9 +143,24 @@ ??_7A@B@@6BC@D@@@ ; CHECK: const B::A::`vftable'{for `D::C'} +??_7A@B@@6BC@D@@E@F@@@ +; CHECK: const B::A::`vftable'{for `D::C's `F::E'} + +??_7A@B@@6BC@D@@E@F@@G@H@@@ +; CHECK: const B::A::`vftable'{for `D::C's `F::E's `H::G'} + ??_8Middle2@@7B@ ; CHECK: const Middle2::`vbtable' +??_7A@@6BB@@@ +; CHECK: const A::`vftable'{for `B'} + +??_7A@@6BB@@C@@@ +; CHECK: const A::`vftable'{for `B's `C'} + +??_7A@@6BB@@C@@D@@@ +; CHECK: const A::`vftable'{for `B's `C's `D'} + ??_9Base@@$B7AA ; CHECK: [thunk]: __cdecl Base::`vcall'{8, {flat}} diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s new file mode 100644 index 0000000000000..fca8345ff207c --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s @@ -0,0 +1,58 @@ +# REQUIRES: asserts +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux-gnu -filetype=obj -o %t %s +# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \ +# RUN: FileCheck %s +# +# Check that splitting of eh-frame sections works. 
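As background for the log lines this test checks, a minimal C sketch of the record walk follows (assumptions: 32-bit DWARF length prefixes, host endianness, and illustrative function names; this is not JITLink's actual code). A DWARFRecordSectionSplitter-style pass advances by each record's 4-byte length field and classifies a record as CIE or FDE by the ID word that follows, where zero means CIE:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walk .eh_frame record by record: each record starts with a 4-byte
 * length (excluding the length field itself); the next word is 0 for a
 * CIE and a CIE back-pointer for an FDE. A zero length terminates. */
static void split_eh_frame(const uint8_t *sec, size_t size) {
  size_t off = 0;
  while (off + 8 <= size) {
    uint32_t len, id;
    memcpy(&len, sec + off, 4);
    if (len == 0)
      break;
    memcpy(&id, sec + off + 4, 4);
    printf("Processing CFI record at +0x%zx: %s\n", off,
           id == 0 ? "CIE" : "FDE");
    off += 4 + (size_t)len;
  }
}

int main(void) {
  /* A fake CIE (8 payload bytes) followed by a fake FDE. */
  uint8_t buf[24] = {0};
  uint32_t v;
  v = 8;  memcpy(buf + 0,  &v, 4); /* CIE record length */
  v = 0;  memcpy(buf + 4,  &v, 4); /* id 0 => CIE */
  v = 8;  memcpy(buf + 12, &v, 4); /* FDE record length */
  v = 16; memcpy(buf + 16, &v, 4); /* nonzero => FDE */
  split_eh_frame(buf, sizeof buf);
  return 0;
}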
+#
+# CHECK: DWARFRecordSectionSplitter: Processing .eh_frame...
+# CHECK: Processing block at
+# CHECK: Processing CFI record at
+# CHECK: Processing CFI record at
+# CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"...
+# CHECK: Processing block at
+# CHECK: Record is CIE
+# CHECK: Processing block at
+# CHECK: Record is FDE
+# CHECK: Adding edge at {{.*}} to CIE at: {{.*}}
+# CHECK: Processing PC-begin at
+# CHECK: Existing edge at {{.*}} to PC begin at {{.*}}
+# CHECK: Adding keep-alive edge from target at {{.*}} to FDE at {{.*}}
+
+ .text
+ .file "exceptions.cpp"
+ # Start of file scope inline assembly
+ .globl _ZSt21ios_base_library_initv
+
+ # End of file scope inline assembly
+ .globl main # -- Begin function main
+ .p2align 4
+ .type main,@function
+main: # @main
+ .cfi_startproc
+# %bb.0: # %entry
+ stmg %r11, %r15, 88(%r15)
+ .cfi_offset %r11, -72
+ .cfi_offset %r14, -48
+ .cfi_offset %r15, -40
+ aghi %r15, -168
+ .cfi_def_cfa_offset 328
+ lgr %r11, %r15
+ .cfi_def_cfa_register %r11
+ mvhi 164(%r11), 0
+ lghi %r2, 4
+ brasl %r14, __cxa_allocate_exception@PLT
+ mvhi 0(%r2), 1
+ lgrl %r3, _ZTIi@GOT
+ lghi %r4, 0
+ brasl %r14, __cxa_throw@PLT
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .addrsig_sym __cxa_allocate_exception
+ .addrsig_sym __cxa_throw
+ .addrsig_sym _ZTIi
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s
new file mode 100644
index 0000000000000..04e828685c040
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s
@@ -0,0 +1,35 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xFFFF -check=%s %t.o
+
+# RUN: not llvm-jitlink -noexec -abs X=0x10000 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_16 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer16 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 1
+P:
+ .short 0
+ .short 0
+ .short 0
+ .short X # Using short here generates R_390_16.
+ .size P, 8
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s
new file mode 100644
index 0000000000000..1a63acdb63d57
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s
@@ -0,0 +1,32 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0x12345678 -check=%s %t.o
+#
+# RUN: not llvm-jitlink -noexec -abs X=0x123456789 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_32 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer32 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 2
+P:
+ .long 0
+ .long X # Using long here generates R_390_32.
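The abs8/abs16/abs32/abs64 tests in this group all probe the same rule: an absolute PointerNN fixup succeeds only when the symbol's value is representable in NN bits, and otherwise the link fails with an out-of-range error. A minimal C++ sketch of that check (applyAbsFixup is an illustrative name, not JITLink's API):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>

// Absolute fixup of width Bits: reject values that do not fit, then store
// the truncated value at the fixup location.
template <unsigned Bits, typename StoreT>
void applyAbsFixup(uint8_t *FixupPtr, uint64_t Value, const std::string &Sym) {
  static_assert(Bits <= 64 && sizeof(StoreT) * 8 == Bits, "width mismatch");
  if constexpr (Bits < 64)
    if (Value > ((uint64_t(1) << Bits) - 1))
      throw std::runtime_error("relocation target (" + Sym +
                               ") is out of range of Pointer" +
                               std::to_string(Bits) + " fixup");
  StoreT Truncated = static_cast<StoreT>(Value);
  std::memcpy(FixupPtr, &Truncated, sizeof(StoreT));
}

int main() {
  uint8_t Slot[8] = {};
  applyAbsFixup<16, uint16_t>(Slot, 0xFFFF, "X");  // fits, like X=0xFFFF above
  try {
    applyAbsFixup<16, uint16_t>(Slot, 0x10000, "X"); // fails, like X=0x10000
  } catch (const std::exception &E) {
    (void)E; // mirrors the CHECK-ERROR expectation in the tests
  }
}
```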
+ .size P, 8
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s
new file mode 100644
index 0000000000000..63d2a1a539aeb
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s
@@ -0,0 +1,27 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xffffffffffffffff -check=%s %t.o
+#
+# Check R_390_64 handling. A 64-bit absolute fixup can never be out of
+# range, so only the success case is exercised.
+
+# jitlink-check: *{8}P = X
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+ .p2align 4
+P:
+ .quad X # Using quad here generates R_390_64.
+ .size P, 8
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s
new file mode 100644
index 0000000000000..5f23f289140a6
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s
@@ -0,0 +1,38 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -noexec -abs X=0xFF -check=%s %t.o
+
+# RUN: not llvm-jitlink -noexec -abs X=0x100 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_8 handling.
+
+# jitlink-check: *{8}P = X
+
+# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer8 fixup
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+
+ .type P,@object
+ .data
+ .globl P
+P:
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte X # Using byte here generates R_390_8.
+ .size P, 8
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s
new file mode 100644
index 0000000000000..743181655a5cc
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s
@@ -0,0 +1,96 @@
+# REQUIRES: system-linux
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t/elf_pic_reloc.o %s
+#
+# RUN: llvm-jitlink -noexec \
+# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \
+# RUN: -abs external_data=0x1 \
+# RUN: -abs extern_out_of_range32=0x7fff00000000 \
+# RUN: -abs extern_in_range32=0xffe00000 \
+# RUN: -check %s %t/elf_pic_reloc.o

+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+ .size main, .-main
+
+ .globl named_func
+ .p2align 4
+ .type named_func,@function
+named_func:
+ br %r14
+ .size named_func, .-named_func
+
+# Check R_390_PC32DBL handling with a call to a local function in the text
+# section. This produces a Delta32dbl edge that is resolved like a regular
+# direct relative branch (no PLT entry is created).
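For brasl, the R_390_PC32DBL/R_390_PLT32DBL fixups used in the tests below store a signed, halved 32-bit delta, which is why an in-range target can be reached directly while extern_out_of_range32 has to go through a PLT stub. A sketch of that arithmetic under assumed semantics (applyPC32DBL is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>

// PC32DBL-style fixup: the instruction field holds (Target - FixupAddr) >> 1
// as a signed 32-bit value, so the delta must be even and within +/-4GiB.
bool applyPC32DBL(uint64_t FixupAddr, uint64_t Target, int64_t Addend,
                  uint32_t &FieldOut) {
  int64_t Delta = static_cast<int64_t>(Target + Addend - FixupAddr);
  if (Delta % 2 != 0)
    return false; // halved encodings require an even displacement
  int64_t Halved = Delta >> 1;
  if (Halved < INT32_MIN || Halved > INT32_MAX)
    return false; // out of range: the call must be routed via a PLT stub
  FieldOut = static_cast<uint32_t>(Halved);
  return true;
}

int main() {
  uint32_t Field = 0;
  // A nearby callee: a delta of 0x1000 bytes encodes as 0x800 half-words.
  assert(applyPC32DBL(0x10000, 0x11000, 0, Field) && Field == 0x800);
  // A target far outside the signed 32-bit half-word range is rejected.
  assert(!applyPC32DBL(0x10000, 0x400000000ULL, 0, Field));
}
```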
+#
+# jitlink-check: decode_operand(test_call_local, 1) = \
+# jitlink-check: named_func - test_call_local
+ .globl test_call_local
+ .p2align 4
+ .type test_call_local,@function
+test_call_local:
+ brasl %r14, named_func@PLT
+
+ .size test_call_local, .-test_call_local
+
+# Check R_390_PLT32DBL (DeltaPLT32dbl) handling with a call to an
+# external symbol via the PLT. This produces a Delta32dbl edge, because
+# externals are not defined locally. As the target is out of range of the
+# call site, the edge keeps using its PLT entry.
+#
+# jitlink-check: decode_operand(test_call_extern_plt, 1) = \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32) - \
+# jitlink-check: test_call_extern_plt
+# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_out_of_range32)) = \
+# jitlink-check: extern_out_of_range32
+ .globl test_call_extern_plt
+ .p2align 4
+ .type test_call_extern_plt,@function
+test_call_extern_plt:
+ brasl %r14, extern_out_of_range32@plt
+
+ .size test_call_extern_plt, .-test_call_extern_plt
+
+# Check PLT stub relocation for lgrl (Delta32dbl).
+#
+# jitlink-check: *{4}(stub_addr(elf_pic_reloc.o, extern_out_of_range32) + 2) = \
+# jitlink-check: ((got_addr(elf_pic_reloc.o, extern_out_of_range32) - \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1) \
+# jitlink-check: & 0xffffffff
+ .globl test_call_extern_plt_stub
+ .p2align 4
+ .type test_call_extern_plt_stub,@function
+test_call_extern_plt_stub:
+ brasl %r14, extern_out_of_range32@plt
+
+ .size test_call_extern_plt_stub, .-test_call_extern_plt_stub
+
+# Check R_390_PLT32DBL (DeltaPLT32dbl) handling with a call to an external.
+# This produces a Delta32dbl edge, because externals are not defined
+# locally. During resolution, the target turns out to be in range of the
+# call site.
+### TODO: the edge can be relaxed in a post-allocation optimization; it will
+### then require:
+### jitlink-check: decode_operand(test_call_extern, 1) = \
+### jitlink-check: extern_in_range32 - test_call_extern
+#
+# Same as test_call_extern_plt (no optimization):
+# jitlink-check: decode_operand(test_call_extern, 1) = \
+# jitlink-check: stub_addr(elf_pic_reloc.o, extern_in_range32) - \
+# jitlink-check: test_call_extern
+# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_in_range32)) = \
+# jitlink-check: extern_in_range32
+ .globl test_call_extern
+ .p2align 4
+ .type test_call_extern,@function
+test_call_extern:
+ brasl %r14, extern_in_range32@plt
+ .size test_call_extern, .-test_call_extern
+
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s
new file mode 100644
index 0000000000000..cf12cdc987ce3
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s
@@ -0,0 +1,28 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs DISP=0xFFF -check=%s %t.o

+# RUN: not llvm-jitlink -noexec -abs DISP=0x1000 %t.o 2>&1 | \
+# RUN: FileCheck -check-prefix=CHECK-ERROR %s
+#
+# Check success and failure cases of R_390_12 handling.
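This disp12 test and the disp20 test that follows pin down the valid displacement ranges: a D12 field is unsigned 12-bit (0..0xFFF), while a D20 field is signed 20-bit, so 0x7FFFF passes and both 0x80000 and 0xFFFFF are rejected. A hedged C++ sketch of those checks (encodeDisp12/encodeDisp20 are illustrative names, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// D12: unsigned 12-bit displacement.
std::optional<uint32_t> encodeDisp12(uint64_t Disp) {
  if (Disp > 0xFFF)
    return std::nullopt; // out of range of the Pointer12 fixup
  return static_cast<uint32_t>(Disp);
}

// D20: signed 20-bit field, so only 0..0x7FFFF is accepted for a
// non-negative input. In the instruction the 20 bits are split into a
// low-12 (DL) and high-8 (DH) part; this sketch returns the combined value.
std::optional<uint32_t> encodeDisp20(uint64_t Disp) {
  if (Disp > 0x7FFFF)
    return std::nullopt;
  return static_cast<uint32_t>(Disp);
}

int main() {
  assert(encodeDisp12(0xFFF) && !encodeDisp12(0x1000));    // as in this test
  assert(encodeDisp20(0x7FFFF) && !encodeDisp20(0x80000)); // as in disp20
  assert(!encodeDisp20(0xFFFFF));
}
```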
+ +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer12 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_12, DISP + l %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s new file mode 100644 index 0000000000000..5c7de535cf8b4 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s @@ -0,0 +1,31 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs DISP=0x7FFFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs DISP=0x80000 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s + +# RUN: not llvm-jitlink -noexec -abs DISP=0xFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_20 handling. + +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer20 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_20, DISP + lg %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s new file mode 100644 index 0000000000000..7da48cfa704e2 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s @@ -0,0 +1,244 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: %t/elf_reloc.o -check %s + +# Verifying GOT related relocations. 
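The battery of GOT12/16/20/32/64 and GOTPLT checks below reduces to two quantities: the offset of a symbol's GOT slot from _GLOBAL_OFFSET_TABLE_, and, for the GOTENT forms, the PC-relative distance from the fixup site to the slot (stored halved in the larl field, which decode_operand reads back un-halved). A small C++ model of that bookkeeping; GOTBuilder and its members are illustrative names, not JITLink's API:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct GOTBuilder {
  uint64_t GOTBase;                      // address of _GLOBAL_OFFSET_TABLE_
  std::map<std::string, uint64_t> Slots; // symbol -> 8-byte slot address

  // Each referenced symbol gets exactly one slot, allocated on first use.
  uint64_t slotFor(const std::string &Sym) {
    auto [It, New] = Slots.try_emplace(Sym, GOTBase + 8 * Slots.size());
    (void)New;
    return It->second;
  }
  // R_390_GOT12/16/20/32/64: the slot's offset within the GOT.
  uint64_t gotOffset(const std::string &Sym) { return slotFor(Sym) - GOTBase; }
  // R_390_GOTENT: halved delta from the fixup site to the slot.
  int64_t gotEntField(const std::string &Sym, uint64_t FixupAddr) {
    return (static_cast<int64_t>(slotFor(Sym)) -
            static_cast<int64_t>(FixupAddr)) >> 1;
  }
};

int main() {
  GOTBuilder GOT{0x6ff10000, {}};
  assert(GOT.gotOffset("foo") == 0); // first slot
  assert(GOT.gotOffset("bar") == 8); // second slot
  assert(GOT.gotOffset("foo") == 0); // slots are reused, never duplicated
}
```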
+ + .text + .globl main + .type main,@function +main: +# jitlink-check: decode_operand(main, 1) = _GLOBAL_OFFSET_TABLE_ - main + larl %r12, _GLOBAL_OFFSET_TABLE_ + .globl test_gotent_foo +test_gotent_foo: +# jitlink-check: decode_operand(test_gotent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotent_foo) + .reloc .+2, R_390_GOTENT, foo+2 + larl %r1, 0 + .size test_gotent_foo, .-test_gotent_foo + + .globl test_gotent_bar +test_gotent_bar: +# jitlink-check: decode_operand(test_gotent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotent_bar) + .reloc .+2, R_390_GOTENT, bar+2 + larl %r1, 0 + .size test_gotent_bar, .-test_gotent_bar + + .globl test_gotpltent_foo +test_gotpltent_foo: +# jitlink-check: decode_operand(test_gotpltent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotpltent_foo) + .reloc .+2, R_390_GOTPLTENT, foo+2 + larl %r1, 0 + .size test_gotpltent_foo, .-test_gotpltent_foo + + .globl test_gotpltent_bar +test_gotpltent_bar: +# jitlink-check: decode_operand(test_gotpltent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotpltent_bar) + .reloc .+2, R_390_GOTPLTENT, bar+2 + larl %r1, 0 + .size test_gotpltent_bar, .-test_gotpltent_bar + + .globl test_got12_foo +test_got12_foo: +# jitlink-check: decode_operand(test_got12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, foo + l %r1, 0(%r12) + .size test_got12_foo, .-test_got12_foo + + .globl test_got12_bar +test_got12_bar: +# jitlink-check: decode_operand(test_got12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, bar + l %r1, 0(%r12) + .size test_got12_bar, .-test_got12_bar + + .globl test_gotplt12_foo +test_gotplt12_foo: +# jitlink-check: decode_operand(test_gotplt12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, foo + l %r1, 0(%r12) + .size test_gotplt12_foo, .-test_gotplt12_foo + + .globl test_gotplt12_bar +test_gotplt12_bar: +# jitlink-check: decode_operand(test_gotplt12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, bar + l %r1, 0(%r12) + .size test_gotplt12_bar, .-test_gotplt12_bar + + .globl test_got20_foo +test_got20_foo: +# jitlink-check: decode_operand(test_got20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, foo + lg %r1, 0(%r12) + .size test_got20_foo, .-test_got20_foo + + .globl test_got20_bar +test_got20_bar: +# jitlink-check: decode_operand(test_got20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, bar + lg %r1, 0(%r12) + .size test_got20_bar, .-test_got20_bar + + .globl test_gotplt20_foo +test_gotplt20_foo: +# jitlink-check: decode_operand(test_gotplt20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, foo + lg %r1, 0(%r12) + .size test_gotplt20_foo, .-test_gotplt20_foo + + .globl test_gotplt20_bar +test_gotplt20_bar: +# jitlink-check: decode_operand(test_gotplt20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, bar + lg %r1, 0(%r12) + .size test_gotplt20_bar, .-test_gotplt20_bar + br %r14 + .size main, .-main + + .data + .globl test_got16_foo +# jitlink-check: *{2}test_got16_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - 
_GLOBAL_OFFSET_TABLE_) +test_got16_foo: + .reloc ., R_390_GOT16, foo + .space 2 + .size test_got16_foo, .-test_got16_foo + + .globl test_got16_bar +# jitlink-check: *{2}test_got16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got16_bar: + .reloc ., R_390_GOT16, bar + .space 2 + .size test_got16_bar, .-test_got16_bar + + .globl test_gotplt16_foo +# jitlink-check: *{2}test_gotplt16_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_foo: + .reloc ., R_390_GOTPLT16, foo + .space 2 + .size test_gotplt16_foo, .-test_gotplt16_foo + + .globl test_gotplt16_bar +# jitlink-check: *{2}test_gotplt16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_bar: + .reloc ., R_390_GOTPLT16, bar + .space 2 + .size test_gotplt16_bar, .-test_gotplt16_bar + + .globl test_got32_foo +# jitlink-check: *{4}test_got32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got32_foo: + .reloc ., R_390_GOT32, foo + .space 4 + .size test_got32_foo, .-test_got32_foo + + .globl test_got32_bar +# jitlink-check: *{4}test_got32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got32_bar: + .reloc ., R_390_GOT32, bar + .space 4 + .size test_got32_bar, .-test_got32_bar + + .globl test_gotplt32_foo +# jitlink-check: *{4}test_gotplt32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_foo: + .reloc ., R_390_GOTPLT32, foo + .space 4 + .size test_gotplt32_foo, .-test_gotplt32_foo + + .globl test_gotplt32_bar +# jitlink-check: *{4}test_gotplt32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_bar: + .reloc ., R_390_GOTPLT32, bar + .space 4 + .size test_gotplt32_bar, .-test_gotplt32_bar + + .globl test_got64_foo +# jitlink-check: *{8}test_got64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got64_foo: + .reloc ., R_390_GOT64, foo + .space 8 + .size test_got64_foo, .-test_got64_foo + + .globl test_got64_bar +# jitlink-check: *{8}test_got64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got64_bar: + .reloc ., R_390_GOT64, bar + .space 8 + .size test_got64_bar, .-test_got64_bar + + .globl test_gotplt64_foo +# jitlink-check: *{8}test_gotplt64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt64_foo: + .reloc ., R_390_GOTPLT64, foo + .space 8 + .size test_gotplt64_foo, .-test_gotplt64_foo + + .globl test_gotplt64_bar +# jitlink-check: *{8}test_gotplt64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt64_bar: + .reloc ., R_390_GOTPLT64, bar + .space 8 + .size test_gotplt64_bar, .-test_gotplt64_bar + + .globl test_gotpc_foo +# jitlink-check: *{4}test_gotpc_foo = _GLOBAL_OFFSET_TABLE_ - test_gotpc_foo +test_gotpc_foo: + .reloc ., R_390_GOTPC, foo + .space 4 + .size test_gotpc_foo, .-test_gotpc_foo + + .globl test_gotpc_bar +# jitlink-check: *{4}test_gotpc_bar = _GLOBAL_OFFSET_TABLE_ - test_gotpc_bar +test_gotpc_bar: + .reloc ., R_390_GOTPC, bar + .space 4 + .size test_gotpc_bar, .-test_gotpc_bar + + .globl test_gotpcdbl_foo +# jitlink-check: *{4}test_gotpcdbl_foo = \ +# jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_foo) >> 1 +test_gotpcdbl_foo: + .reloc ., R_390_GOTPCDBL, foo + .space 4 + .size test_gotpcdbl_foo, .-test_gotpcdbl_foo + + .globl test_gotpcdbl_bar +# jitlink-check: *{4}test_gotpcdbl_bar = \ +# 
jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_bar) >> 1 +test_gotpcdbl_bar: + .reloc ., R_390_GOTPCDBL, bar + .space 4 + .size test_gotpcdbl_bar, .-test_gotpcdbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s new file mode 100644 index 0000000000000..af37b3f75ca42 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s @@ -0,0 +1,68 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04080 \ +# RUN: -abs bar=0x6ff04040 \ +# RUN: %t/elf_reloc.o -check %s + + .text + .globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .data + .globl test_gotoff16_bar +# jitlink-check: *{2}test_gotoff16_bar = (bar - _GLOBAL_OFFSET_TABLE_) & 0xffff +test_gotoff16_bar: + .reloc ., R_390_GOTOFF16, bar + .space 2 + .size test_gotoff16_bar, .-test_gotoff16_bar + + .globl test_pltoff16_foo +# jitlink-check: *{2}test_pltoff16_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffff +test_pltoff16_foo: + .reloc ., R_390_PLTOFF16, foo + .space 2 + .size test_pltoff16_foo, .-test_pltoff16_foo + + + .globl test_gotoff32_bar +# jitlink-check: *{4}test_gotoff32_bar = (bar - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_gotoff32_bar: + .reloc ., R_390_GOTOFF, bar + .space 4 + .size test_gotoff32_bar, .-test_gotoff32_bar + + .globl test_pltoff32_foo +# jitlink-check: *{4}test_pltoff32_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_pltoff32_foo: + .reloc ., R_390_PLTOFF32, foo + .space 4 + .size test_pltoff32_foo, .-test_pltoff32_foo + + .globl test_gotoff64_bar +# jitlink-check: *{8}test_gotoff64_bar = bar - _GLOBAL_OFFSET_TABLE_ +test_gotoff64_bar: + .reloc ., R_390_GOTOFF64, bar + .space 8 + .size test_gotoff64_bar, .-test_gotoff64_bar + + .globl test_pltoff64_foo +# jitlink-check: *{8}test_pltoff64_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_pltoff64_foo: + .reloc ., R_390_PLTOFF64, foo + .space 8 + .size test_pltoff64_foo, .-test_pltoff64_foo + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s new file mode 100644 index 0000000000000..4b3a65e53ab93 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s @@ -0,0 +1,20 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec %t.o +# +# Check R_390_PC* handling. + + .text + .globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .rodata + .short main-. # Generate R_390_PC16 relocation. + .long main-. # Generate R_390_PC32 relocation. + .quad main-. # Generate R_390_PC64 relocation. 
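The pc16 and pc32 tests that follow use `.reloc ..., R_390_PC16, .-OFFSET`, so the computed delta is -OFFSET: -0x8000 is the most negative value a signed 16-bit field can hold, and its two's-complement bit pattern is 0x8000, which is why the success case's jitlink-check can read the stored field back as OFFSET while -0x8001 and -0xFFFF are rejected. A sketch of that DeltaNN logic (applyDelta is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// Signed PC-relative fixup of width Bits: store Target - FixupAddr if it is
// representable, returning the raw (two's-complement) field bits.
template <unsigned Bits>
std::optional<uint64_t> applyDelta(uint64_t FixupAddr, uint64_t Target) {
  static_assert(Bits >= 8 && Bits <= 64, "unsupported width");
  int64_t Delta = static_cast<int64_t>(Target - FixupAddr);
  if constexpr (Bits < 64) {
    int64_t Min = -(int64_t(1) << (Bits - 1));
    int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
    if (Delta < Min || Delta > Max)
      return std::nullopt; // "out of range of Delta16/Delta32 fixup"
    return static_cast<uint64_t>(Delta) & ((1ULL << Bits) - 1);
  }
  return static_cast<uint64_t>(Delta); // 64-bit deltas always fit
}

int main() {
  // A delta of -0x8000 fits in 16 bits and is stored as 0x8000...
  assert(applyDelta<16>(0x8000, 0).value() == 0x8000);
  // ...but -0x8001 and -0xFFFF do not fit.
  assert(!applyDelta<16>(0x8001, 0));
  assert(!applyDelta<16>(0xFFFF, 0));
}
```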
+ diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s new file mode 100644 index 0000000000000..0da54b2a58972 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x8000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x8001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{2}test_pc16 = OFFSET +# jitlink-check: *{2}test_pc16dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta16 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc16 +test_pc16: + .reloc test_pc16, R_390_PC16, .-OFFSET + .space 2 + .size test_pc16, .-test_pc16 + + .globl test_pc16dbl +test_pc16dbl: + .reloc test_pc16dbl, R_390_PC16DBL, .-(OFFSET + OFFSET) + .space 2 + .size test_pc16dbl, .-test_pc16dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s new file mode 100644 index 0000000000000..503fd2d0a5d49 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x80000000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFFFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x80000001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{4}test_pc32 = OFFSET +# jitlink-check: *{4}test_pc32dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta32 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc32 +test_pc32: + .reloc test_pc32, R_390_PC32, .-OFFSET + .space 4 + .size test_pc32, .-test_pc32 + + .globl test_pc32dbl +test_pc32dbl: + .reloc test_pc32dbl, R_390_PC32DBL, .-(OFFSET + OFFSET) + .space 4 + .size test_pc32dbl, .-test_pc32dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s new file mode 100644 index 0000000000000..0d33ae2976de5 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s @@ -0,0 +1,34 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj 
+ -o %t/elf_reloc.o %s
+#
+# RUN: llvm-jitlink -noexec \
+# RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \
+# RUN: -abs external_data=0x1 \
+# RUN: -abs foo=0x6ff04040 \
+# RUN: -abs bar=0x6ff04048 \
+# RUN: -check %s %t/elf_reloc.o
+
+ .text
+ .section .text.main
+ .globl main
+ .p2align 4
+ .type main,@function
+main:
+ br %r14
+ .size main, .-main
+
+ .globl test_pc64_foo
+# jitlink-check: *{8}test_pc64_foo = foo - test_pc64_foo
+test_pc64_foo:
+ .reloc ., R_390_PC64, foo
+ .space 8
+ .size test_pc64_foo, .-test_pc64_foo
+
+ .globl test_pc64_bar
+# jitlink-check: *{8}test_pc64_bar = bar - test_pc64_bar
+test_pc64_bar:
+ .reloc ., R_390_PC64, bar
+ .space 8
+ .size test_pc64_bar, .-test_pc64_bar
diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s
new file mode 100644
index 0000000000000..6a7ca8bd6e2a6
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s
@@ -0,0 +1,74 @@
+# REQUIRES: system-linux
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=0xffe -defsym OFF16=4 -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=0xffe -abs OFF16=4 -abs OFF24=6 \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=0xfffe -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=0xfffe -abs OFF24=6 \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=0xfffffe \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=0xfffffe \
+# RUN: -check=%s %t.o
+#
+# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \
+# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=6 \
+# RUN: -filetype=obj -o %t.o %s
+#
+# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=6 \
+# RUN: -check=%s %t.o

+# Check R_390_PC*DBL relocations. The R_390_PC32DBL test lives in
+# ELF_systemz_reloc_pc32.s because it needs a large offset.
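In the pcdbl test below, each jitlink-check masks the loaded instruction word and compares it against OFF >> 1: the 12-, 16- and 24-bit branch fields of bprp/je all store half-word counts at different offsets inside the instruction. A minimal sketch of that field math (halvedField is an illustrative name, not JITLink's API):

```cpp
#include <cassert>
#include <cstdint>

// Halved PC-relative branch field: (Target - FixupAddr) >> 1, masked to the
// width of the instruction's displacement field.
uint32_t halvedField(uint64_t FixupAddr, uint64_t Target, unsigned Bits) {
  int64_t Delta = static_cast<int64_t>(Target - FixupAddr);
  return static_cast<uint32_t>(Delta >> 1) & ((1u << Bits) - 1);
}

int main() {
  // je/jne: 16-bit halved delta (PC16DBL).
  assert(halvedField(0x1000, 0x1000 + 0x4, 16) == (0x4 >> 1));
  // bprp: 12-bit field (PC12DBL); a 0xffe-byte delta is 0x7ff half-words.
  assert(halvedField(0x1000, 0x1000 + 0xffe, 12) == 0x7ff);
  // bprp: 24-bit field (PC24DBL); 0xfffffe bytes -> 0x7fffff half-words.
  assert(halvedField(0x1000, 0x1000 + 0xfffffe, 24) == 0x7fffff);
}
```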
+ + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PC16DBL +# jitlink-check: *{2}(test_pc16dbl + 2) = (OFF16 >> 1) + .globl test_pc16dbl + .p2align 3 +test_pc16dbl: + je .Lpc16dbl + .space OFF16 - 4 +.Lpc16dbl: + jne test_pc16dbl + .size test_pc16dbl,.-test_pc16dbl + +# R_390_PC12DBL +# jitlink-check: ((*{2} (test_pc12dbl + 1)) & 0x0fff) = (OFF12 >> 1) + .globl test_pc12dbl + .p2align 4 +test_pc12dbl: + bprp 0, .Lpc12dbl, 0 + .space OFF12 - 6 +.Lpc12dbl: + bprp 0, test_pc12dbl, 0 + .size test_pc12dbl,.-test_pc12dbl + +# R_390_PC24DBL +# jitlink-check: ((*{4} (test_pc24dbl + 2)) & 0x0ffffff) = (OFF24 >> 1) + .globl test_pc24dbl + .p2align 4 +test_pc24dbl: + bprp 0, 0, .Lpc24dbl + .space OFF24 - 6 +.Lpc24dbl: + bprp 0, 0, test_pc24dbl + .size test_pc24dbl,.-test_pc24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s new file mode 100644 index 0000000000000..47f064b45816a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s @@ -0,0 +1,71 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \ +# RUN: -abs external_data=0x1 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: -check %s %t/elf_reloc.o + +# Check R_390_PLT32/64 relocations. + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_plt32_foo +# jitlink-check: *{4}test_plt32_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt32_foo +test_plt32_foo: + .reloc ., R_390_PLT32, foo + .space 4 + .size test_plt32_foo, .-test_plt32_foo + + .globl test_plt32_bar +# jitlink-check: *{4}test_plt32_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt32_bar +test_plt32_bar: + .reloc ., R_390_PLT32, bar + .space 4 + .size test_plt32_bar, .-test_plt32_bar + + .globl test_plt64_foo +# jitlink-check: *{8}test_plt64_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt64_foo +test_plt64_foo: + .reloc ., R_390_PLT64, foo + .space 8 + .size test_plt64_foo, .-test_plt64_foo + + .globl test_plt64_bar +# jitlink-check: *{8}test_plt64_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt64_bar +test_plt64_bar: + .reloc ., R_390_PLT64, bar + .space 8 + .size test_plt64_bar, .-test_plt64_bar + + .globl test_plt32dbl_foo +# jitlink-check: *{4}test_plt32dbl_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - test_plt32dbl_foo) >> 1 +test_plt32dbl_foo: + .reloc ., R_390_PLT32DBL, foo + .space 4 + .size test_plt32dbl_foo, .-test_plt32dbl_foo + + .globl test_plt32dbl_bar +# jitlink-check: *{4}test_plt32dbl_bar = \ +# jitlink-check: (stub_addr(elf_reloc.o, bar) - test_plt32dbl_bar) >> 1 +test_plt32dbl_bar: + .reloc ., R_390_PLT32DBL, bar + .space 4 + .size test_plt32dbl_bar, .-test_plt32dbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s new file mode 100644 index 0000000000000..c36a77684008a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s @@ -0,0 +1,51 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc 
-triple=systemz-unknown-linux -position-independent \ +# RUN: -mcpu=z16 -filetype=obj -o %t/elf_reloc.o %s + +# RUN: llvm-jitlink -noexec \ +# RUN: -abs external_addr12=0xffe \ +# RUN: -abs external_addr16=0xfffe \ +# RUN: -abs external_addr24=0xffffe \ +# RUN: %t/elf_reloc.o -check %s + + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PLT16DBL +# jitlink-check: *{2}(test_plt16dbl + 4) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr16) - \ +# jitlink-check: test_plt16dbl) >> 1 + .globl test_plt16dbl + .p2align 4 +test_plt16dbl: + bpp 0, external_addr16@plt, 0 + .size test_plt16dbl,.-test_plt16dbl + +# R_390_PLT12DBL +# jitlink-check: ((*{2}(test_plt12dbl + 1)) & 0x0fff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr12) - \ +# jitlink-check: test_plt12dbl) >> 1 + .globl test_plt12dbl + .p2align 4 +test_plt12dbl: + bprp 0, external_addr12@plt, 0 + .size test_plt12dbl,.-test_plt12dbl + +# R_390_PLT24DBL +# jitlink-check: ((*{4}(test_plt24dbl + 2)) & 0x0ffffff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr24) - \ +# jitlink-check: test_plt24dbl) >> 1 + .globl test_plt24dbl + .p2align 4 +test_plt24dbl: + bprp 0, 0, external_addr24@plt + .size test_plt24dbl,.-test_plt24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg new file mode 100644 index 0000000000000..caf81b69c06fd --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg @@ -0,0 +1,2 @@ +if not "SystemZ" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/ExecutionEngine/lit.local.cfg b/llvm/test/ExecutionEngine/lit.local.cfg index 1951f140ea889..bbffee852e10e 100644 --- a/llvm/test/ExecutionEngine/lit.local.cfg +++ b/llvm/test/ExecutionEngine/lit.local.cfg @@ -1,4 +1,4 @@ -if config.root.native_target in ['Sparc', 'SystemZ', 'Hexagon']: +if config.root.native_target in ['Sparc', 'Hexagon']: config.unsupported = True # ExecutionEngine tests are not expected to pass in a cross-compilation setup. diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. 
-; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: 
unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> 
[[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: 
[[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; 
CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> 
[[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = 
tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; 
+entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: 
[[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; 
CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> 
zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> 
@llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, 
i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll new file mode 100644 index 0000000000000..1d118560f7580 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_outlined.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. 
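+; Note on what these checks demonstrate: with -tysan-outline-instrumentation the
+; pass does not expand the shadow-memory check inline; each instrumented access
+; becomes a single call to __tysan_instrument_with_shadow_update carrying the
+; accessed pointer and its type descriptor, and whose final i32 argument
+; distinguishes reads (i32 1, @test_load below) from writes (i32 2, @test_store).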
+define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %tmp1 = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %tmp1 +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: ret void +; + +entry: + store i32 42, ptr %a, align 4, !tbaa !6 + ret void +} + +!0 = !{!"Simple C++ TBAA"} +!1 = !{!"omnipotent char", !0, i64 0} +!2 = !{!"int", !1, i64 0} +!3 = !{!2, !2, i64 0} +!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4} +!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16} +!6 = !{!5, !2, i64 12} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META1]], i64 12} +; CHECK: [[META5]] = !{!"_ZTS1v", [[META1]], i64 8, [[META1]], i64 12, [[META6:![0-9]+]], i64 16} +; CHECK: [[META6]] = !{!"_ZTS1x", [[META1]], i64 0, [[META1]], i64 4} +;. diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll new file mode 100644 index 0000000000000..187a41ea8a825 --- /dev/null +++ b/llvm/test/Instrumentation/TypeSanitizer/basic_verify_outlined.ll @@ -0,0 +1,736 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; Test basic type sanitizer instrumentation. +; +; RUN: opt -passes='tysan' -S -tysan-outline-instrumentation -tysan-verify-outlined-instrumentation -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +;. 
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +; CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat +; CHECK: @__tysan_v1_omnipotent_20char = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat +; CHECK: @__tysan_v1_int = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat +; CHECK: @__tysan_v1_int_o_0 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat +; CHECK: @__tysan_shadow_memory_address = external global i64 +; CHECK: @__tysan_app_memory_mask = external global i64 +; CHECK: @__tysan_v1___ZTS1x = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat +; CHECK: @__tysan_v1___ZTS1v_o_12 = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat +; CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata" +;. 
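+; With -tysan-verify-outlined-instrumentation, the pass appears to emit the
+; full inline shadow sequence alongside the outlined call (compare @test_load
+; here with the one in basic_outlined.ll). In the checks below, the inline
+; sequence computes the shadow slot for an application pointer p roughly as
+;   shadow = inttoptr(((ptrtoint p & app_mem_mask) << 3) + shadow_base)
+; i.e. one pointer-sized descriptor slot per application byte (hence the
+; shift by 3 on this 64-bit layout), then loads the descriptor and calls
+; __tysan_check on a mismatch; that path is marked cold by the 1:100000
+; branch_weights in PROF0.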
+define i32 @test_load(ptr %a) sanitize_type { +; CHECK-LABEL: @test_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: 
[[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; 
CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 [[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: 
[[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8 +; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 
[[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1_int_o_0, i1 true, i64 4, i32 1) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1_int_o_0 +; CHECK-NEXT: br i1 [[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 
[[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr [[TMP213]] to i64 +; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0 +; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]] +; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]] +; CHECK: 217: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr 
@__tysan_v1_int_o_0, i32 1) +; CHECK-NEXT: br label [[TMP218]] +; CHECK: 218: +; CHECK-NEXT: br label [[TMP219]] +; CHECK: 219: +; CHECK-NEXT: [[WAA:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: ret i32 [[WAA]] +; +entry: + %WAA = load i32, ptr %a, align 4, !tbaa !3 + ret i32 %WAA +} + +define void @test_store(ptr %a) sanitize_type { +; CHECK-LABEL: @test_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[APP_MEM_MASK2:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[SHADOW_BASE1:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @__tysan_app_memory_mask to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr +; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP40:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null +; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne ptr [[TMP20]], null +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP17]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr +; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne ptr [[TMP25]], null +; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne ptr [[TMP30]], null +; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP27]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP34]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne ptr [[TMP35]], null +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP32]], [[TMP36]] +; CHECK-NEXT: br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF0]] +; CHECK: 38: +; 
CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP39]] +; CHECK: 39: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8 +; CHECK-NEXT: br label [[TMP41:%.*]] +; CHECK: 40: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP41]] +; CHECK: 41: +; CHECK-NEXT: br label [[TMP87:%.*]] +; CHECK: 42: +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[SHADOW_PTR_INT]], 8 +; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP44]], align 8 +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = icmp sge i64 [[TMP46]], 0 +; CHECK-NEXT: [[TMP48:%.*]] = or i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[SHADOW_PTR_INT]], 16 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr [[TMP51]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = icmp sge i64 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = or i1 [[TMP48]], [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = add i64 [[SHADOW_PTR_INT]], 24 +; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-NEXT: [[TMP57:%.*]] = load ptr, ptr [[TMP56]], align 8 +; CHECK-NEXT: [[TMP58:%.*]] = ptrtoint ptr [[TMP57]] to i64 +; CHECK-NEXT: [[TMP59:%.*]] = icmp sge i64 [[TMP58]], 0 +; CHECK-NEXT: [[TMP60:%.*]] = or i1 [[TMP54]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[SHADOW_PTR_INT]], 32 +; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr +; CHECK-NEXT: [[TMP63:%.*]] = load ptr, ptr [[TMP62]], align 8 +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[TMP63]] to i64 +; CHECK-NEXT: 
[[TMP65:%.*]] = icmp sge i64 [[TMP64]], 0 +; CHECK-NEXT: [[TMP66:%.*]] = or i1 [[TMP60]], [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT]], 40 +; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr +; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8 +; CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0 +; CHECK-NEXT: [[TMP72:%.*]] = or i1 [[TMP66]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT]], 48 +; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr +; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8 +; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64 +; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0 +; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT]], 56 +; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr +; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8 +; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64 +; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0 +; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]] +; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]] +; CHECK: 85: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_app_memory_mask, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP86]] +; CHECK: 86: +; CHECK-NEXT: br label [[TMP87]] +; CHECK: 87: +; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8 +; CHECK-NEXT: [[APP_PTR_MASKED3:%.*]] = and i64 ptrtoint (ptr @__tysan_shadow_memory_address to i64), [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED4:%.*]] = shl i64 [[APP_PTR_MASKED3]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT5:%.*]] = add i64 [[APP_PTR_SHIFTED4]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR6:%.*]] = inttoptr i64 [[SHADOW_PTR_INT5]] to ptr +; CHECK-NEXT: [[SHADOW_DESC7:%.*]] = load ptr, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[BAD_DESC8:%.*]] = icmp ne ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[BAD_DESC8]], label [[TMP88:%.*]], label [[TMP130:%.*]], !prof [[PROF0]] +; CHECK: 88: +; CHECK-NEXT: [[TMP89:%.*]] = icmp eq ptr [[SHADOW_DESC7]], null +; CHECK-NEXT: br i1 [[TMP89]], label [[TMP90:%.*]], label [[TMP128:%.*]] +; CHECK: 90: +; CHECK-NEXT: [[TMP91:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP92:%.*]] = inttoptr i64 [[TMP91]] to ptr +; CHECK-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP92]], align 8 +; CHECK-NEXT: [[TMP94:%.*]] = icmp ne ptr [[TMP93]], null +; CHECK-NEXT: [[TMP95:%.*]] = or i1 false, [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP97:%.*]] = inttoptr i64 [[TMP96]] to ptr +; CHECK-NEXT: [[TMP98:%.*]] = load ptr, ptr [[TMP97]], align 8 +; CHECK-NEXT: [[TMP99:%.*]] = icmp ne ptr [[TMP98]], null +; CHECK-NEXT: [[TMP100:%.*]] = or i1 [[TMP95]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP102:%.*]] = inttoptr i64 [[TMP101]] to ptr +; CHECK-NEXT: [[TMP103:%.*]] = load ptr, ptr [[TMP102]], align 8 +; CHECK-NEXT: [[TMP104:%.*]] = icmp ne ptr [[TMP103]], null +; CHECK-NEXT: [[TMP105:%.*]] = or i1 [[TMP100]], [[TMP104]] +; CHECK-NEXT: [[TMP106:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP107:%.*]] = inttoptr i64 [[TMP106]] to ptr +; CHECK-NEXT: [[TMP108:%.*]] = load ptr, ptr [[TMP107]], align 8 +; CHECK-NEXT: [[TMP109:%.*]] = icmp ne ptr [[TMP108]], null +; CHECK-NEXT: [[TMP110:%.*]] = or i1 
[[TMP105]], [[TMP109]] +; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP112:%.*]] = inttoptr i64 [[TMP111]] to ptr +; CHECK-NEXT: [[TMP113:%.*]] = load ptr, ptr [[TMP112]], align 8 +; CHECK-NEXT: [[TMP114:%.*]] = icmp ne ptr [[TMP113]], null +; CHECK-NEXT: [[TMP115:%.*]] = or i1 [[TMP110]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP117:%.*]] = inttoptr i64 [[TMP116]] to ptr +; CHECK-NEXT: [[TMP118:%.*]] = load ptr, ptr [[TMP117]], align 8 +; CHECK-NEXT: [[TMP119:%.*]] = icmp ne ptr [[TMP118]], null +; CHECK-NEXT: [[TMP120:%.*]] = or i1 [[TMP115]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP122:%.*]] = inttoptr i64 [[TMP121]] to ptr +; CHECK-NEXT: [[TMP123:%.*]] = load ptr, ptr [[TMP122]], align 8 +; CHECK-NEXT: [[TMP124:%.*]] = icmp ne ptr [[TMP123]], null +; CHECK-NEXT: [[TMP125:%.*]] = or i1 [[TMP120]], [[TMP124]] +; CHECK-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127:%.*]], !prof [[PROF0]] +; CHECK: 126: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP127]] +; CHECK: 127: +; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR6]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET9]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR10]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET11:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR12:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET11]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR12]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET13:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR14:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET13]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR14]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET15:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[SHADOW_BYTE_4_PTR16:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET15]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR16]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET17:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[SHADOW_BYTE_5_PTR18:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET17]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR18]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET19:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[SHADOW_BYTE_6_PTR20:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET19]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR20]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET21:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[SHADOW_BYTE_7_PTR22:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET21]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR22]], align 8 +; CHECK-NEXT: br label [[TMP129:%.*]] +; CHECK: 128: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP129]] +; CHECK: 129: +; CHECK-NEXT: br label [[TMP175:%.*]] +; CHECK: 130: +; CHECK-NEXT: [[TMP131:%.*]] = add i64 [[SHADOW_PTR_INT5]], 8 +; CHECK-NEXT: [[TMP132:%.*]] = inttoptr i64 [[TMP131]] to ptr +; CHECK-NEXT: [[TMP133:%.*]] = load ptr, ptr [[TMP132]], align 8 
+; CHECK-NEXT: [[TMP134:%.*]] = ptrtoint ptr [[TMP133]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp sge i64 [[TMP134]], 0 +; CHECK-NEXT: [[TMP136:%.*]] = or i1 false, [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = add i64 [[SHADOW_PTR_INT5]], 16 +; CHECK-NEXT: [[TMP138:%.*]] = inttoptr i64 [[TMP137]] to ptr +; CHECK-NEXT: [[TMP139:%.*]] = load ptr, ptr [[TMP138]], align 8 +; CHECK-NEXT: [[TMP140:%.*]] = ptrtoint ptr [[TMP139]] to i64 +; CHECK-NEXT: [[TMP141:%.*]] = icmp sge i64 [[TMP140]], 0 +; CHECK-NEXT: [[TMP142:%.*]] = or i1 [[TMP136]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add i64 [[SHADOW_PTR_INT5]], 24 +; CHECK-NEXT: [[TMP144:%.*]] = inttoptr i64 [[TMP143]] to ptr +; CHECK-NEXT: [[TMP145:%.*]] = load ptr, ptr [[TMP144]], align 8 +; CHECK-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[TMP145]] to i64 +; CHECK-NEXT: [[TMP147:%.*]] = icmp sge i64 [[TMP146]], 0 +; CHECK-NEXT: [[TMP148:%.*]] = or i1 [[TMP142]], [[TMP147]] +; CHECK-NEXT: [[TMP149:%.*]] = add i64 [[SHADOW_PTR_INT5]], 32 +; CHECK-NEXT: [[TMP150:%.*]] = inttoptr i64 [[TMP149]] to ptr +; CHECK-NEXT: [[TMP151:%.*]] = load ptr, ptr [[TMP150]], align 8 +; CHECK-NEXT: [[TMP152:%.*]] = ptrtoint ptr [[TMP151]] to i64 +; CHECK-NEXT: [[TMP153:%.*]] = icmp sge i64 [[TMP152]], 0 +; CHECK-NEXT: [[TMP154:%.*]] = or i1 [[TMP148]], [[TMP153]] +; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[SHADOW_PTR_INT5]], 40 +; CHECK-NEXT: [[TMP156:%.*]] = inttoptr i64 [[TMP155]] to ptr +; CHECK-NEXT: [[TMP157:%.*]] = load ptr, ptr [[TMP156]], align 8 +; CHECK-NEXT: [[TMP158:%.*]] = ptrtoint ptr [[TMP157]] to i64 +; CHECK-NEXT: [[TMP159:%.*]] = icmp sge i64 [[TMP158]], 0 +; CHECK-NEXT: [[TMP160:%.*]] = or i1 [[TMP154]], [[TMP159]] +; CHECK-NEXT: [[TMP161:%.*]] = add i64 [[SHADOW_PTR_INT5]], 48 +; CHECK-NEXT: [[TMP162:%.*]] = inttoptr i64 [[TMP161]] to ptr +; CHECK-NEXT: [[TMP163:%.*]] = load ptr, ptr [[TMP162]], align 8 +; CHECK-NEXT: [[TMP164:%.*]] = ptrtoint ptr [[TMP163]] to i64 +; CHECK-NEXT: [[TMP165:%.*]] = icmp sge i64 [[TMP164]], 0 +; CHECK-NEXT: [[TMP166:%.*]] = or i1 [[TMP160]], [[TMP165]] +; CHECK-NEXT: [[TMP167:%.*]] = add i64 [[SHADOW_PTR_INT5]], 56 +; CHECK-NEXT: [[TMP168:%.*]] = inttoptr i64 [[TMP167]] to ptr +; CHECK-NEXT: [[TMP169:%.*]] = load ptr, ptr [[TMP168]], align 8 +; CHECK-NEXT: [[TMP170:%.*]] = ptrtoint ptr [[TMP169]] to i64 +; CHECK-NEXT: [[TMP171:%.*]] = icmp sge i64 [[TMP170]], 0 +; CHECK-NEXT: [[TMP172:%.*]] = or i1 [[TMP166]], [[TMP171]] +; CHECK-NEXT: br i1 [[TMP172]], label [[TMP173:%.*]], label [[TMP174:%.*]], !prof [[PROF0]] +; CHECK: 173: +; CHECK-NEXT: call void @__tysan_check(ptr @__tysan_shadow_memory_address, i32 8, ptr null, i32 1) +; CHECK-NEXT: br label [[TMP174]] +; CHECK: 174: +; CHECK-NEXT: br label [[TMP175]] +; CHECK: 175: +; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8 +; CHECK-NEXT: call void @__tysan_instrument_with_shadow_update(ptr [[A:%.*]], ptr @__tysan_v1___ZTS1v_o_12, i1 true, i64 4, i32 2) +; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[APP_PTR_MASKED23:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK2]] +; CHECK-NEXT: [[APP_PTR_SHIFTED24:%.*]] = shl i64 [[APP_PTR_MASKED23]], 3 +; CHECK-NEXT: [[SHADOW_PTR_INT25:%.*]] = add i64 [[APP_PTR_SHIFTED24]], [[SHADOW_BASE1]] +; CHECK-NEXT: [[SHADOW_PTR26:%.*]] = inttoptr i64 [[SHADOW_PTR_INT25]] to ptr +; CHECK-NEXT: [[SHADOW_DESC27:%.*]] = load ptr, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[BAD_DESC28:%.*]] = icmp ne ptr [[SHADOW_DESC27]], @__tysan_v1___ZTS1v_o_12 +; CHECK-NEXT: br i1 
[[BAD_DESC28]], label [[TMP176:%.*]], label [[TMP198:%.*]], !prof [[PROF0]] +; CHECK: 176: +; CHECK-NEXT: [[TMP177:%.*]] = icmp eq ptr [[SHADOW_DESC27]], null +; CHECK-NEXT: br i1 [[TMP177]], label [[TMP178:%.*]], label [[TMP196:%.*]] +; CHECK: 178: +; CHECK-NEXT: [[TMP179:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP180:%.*]] = inttoptr i64 [[TMP179]] to ptr +; CHECK-NEXT: [[TMP181:%.*]] = load ptr, ptr [[TMP180]], align 8 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ne ptr [[TMP181]], null +; CHECK-NEXT: [[TMP183:%.*]] = or i1 false, [[TMP182]] +; CHECK-NEXT: [[TMP184:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP185:%.*]] = inttoptr i64 [[TMP184]] to ptr +; CHECK-NEXT: [[TMP186:%.*]] = load ptr, ptr [[TMP185]], align 8 +; CHECK-NEXT: [[TMP187:%.*]] = icmp ne ptr [[TMP186]], null +; CHECK-NEXT: [[TMP188:%.*]] = or i1 [[TMP183]], [[TMP187]] +; CHECK-NEXT: [[TMP189:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP190:%.*]] = inttoptr i64 [[TMP189]] to ptr +; CHECK-NEXT: [[TMP191:%.*]] = load ptr, ptr [[TMP190]], align 8 +; CHECK-NEXT: [[TMP192:%.*]] = icmp ne ptr [[TMP191]], null +; CHECK-NEXT: [[TMP193:%.*]] = or i1 [[TMP188]], [[TMP192]] +; CHECK-NEXT: br i1 [[TMP193]], label [[TMP194:%.*]], label [[TMP195:%.*]], !prof [[PROF0]] +; CHECK: 194: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP195]] +; CHECK: 195: +; CHECK-NEXT: store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR26]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET29:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[SHADOW_BYTE_1_PTR30:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET29]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR30]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET31:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[SHADOW_BYTE_2_PTR32:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET31]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR32]], align 8 +; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET33:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[SHADOW_BYTE_3_PTR34:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET33]] to ptr +; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR34]], align 8 +; CHECK-NEXT: br label [[TMP197:%.*]] +; CHECK: 196: +; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2) +; CHECK-NEXT: br label [[TMP197]] +; CHECK: 197: +; CHECK-NEXT: br label [[TMP219:%.*]] +; CHECK: 198: +; CHECK-NEXT: [[TMP199:%.*]] = add i64 [[SHADOW_PTR_INT25]], 8 +; CHECK-NEXT: [[TMP200:%.*]] = inttoptr i64 [[TMP199]] to ptr +; CHECK-NEXT: [[TMP201:%.*]] = load ptr, ptr [[TMP200]], align 8 +; CHECK-NEXT: [[TMP202:%.*]] = ptrtoint ptr [[TMP201]] to i64 +; CHECK-NEXT: [[TMP203:%.*]] = icmp sge i64 [[TMP202]], 0 +; CHECK-NEXT: [[TMP204:%.*]] = or i1 false, [[TMP203]] +; CHECK-NEXT: [[TMP205:%.*]] = add i64 [[SHADOW_PTR_INT25]], 16 +; CHECK-NEXT: [[TMP206:%.*]] = inttoptr i64 [[TMP205]] to ptr +; CHECK-NEXT: [[TMP207:%.*]] = load ptr, ptr [[TMP206]], align 8 +; CHECK-NEXT: [[TMP208:%.*]] = ptrtoint ptr [[TMP207]] to i64 +; CHECK-NEXT: [[TMP209:%.*]] = icmp sge i64 [[TMP208]], 0 +; CHECK-NEXT: [[TMP210:%.*]] = or i1 [[TMP204]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = add i64 [[SHADOW_PTR_INT25]], 24 +; CHECK-NEXT: [[TMP212:%.*]] = inttoptr i64 [[TMP211]] to ptr +; CHECK-NEXT: [[TMP213:%.*]] = load ptr, ptr [[TMP212]], align 8 +; CHECK-NEXT: [[TMP214:%.*]] = ptrtoint ptr 
[[TMP213]] to i64
+; CHECK-NEXT: [[TMP215:%.*]] = icmp sge i64 [[TMP214]], 0
+; CHECK-NEXT: [[TMP216:%.*]] = or i1 [[TMP210]], [[TMP215]]
+; CHECK-NEXT: br i1 [[TMP216]], label [[TMP217:%.*]], label [[TMP218:%.*]], !prof [[PROF0]]
+; CHECK: 217:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT: br label [[TMP218]]
+; CHECK: 218:
+; CHECK-NEXT: br label [[TMP219]]
+; CHECK: 219:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !6
+ ret void
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META2]], i64 12}
+; CHECK: [[META6]] = !{!"_ZTS1v", [[META2]], i64 8, [[META2]], i64 12, [[META7:![0-9]+]], i64 16}
+; CHECK: [[META7]] = !{!"_ZTS1x", [[META2]], i64 0, [[META2]], i64 4}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
new file mode 100644
index 0000000000000..0bd7940467415
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals_outlined.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; RUN: opt -passes='tysan' -tysan-outline-instrumentation -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@global1 = global i32 0, align 4
+@global2 = global i32 0, align 4
+
+
+; CHECK-LABEL: define internal void @__tysan_set_globals_types(
+; CHECK-NEXT: %app.mem.mask = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: %shadow.base = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global1, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT: call void @__tysan_set_shadow_type(ptr @global2, ptr @__tysan_v1_int, i64 4)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+!llvm.tysan.globals = !{!13, !14}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!13 = !{ptr @global1, !2}
+!14 = !{ptr @global2, !2}
diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll
index 9be95a89109b4..3d4c13c2ffc75 100644
--- a/llvm/test/Linker/drop-attribute.ll
+++ b/llvm/test/Linker/drop-attribute.ll
@@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped.
-; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/Linker/thinlto_funcimport_debug.ll b/llvm/test/Linker/thinlto_funcimport_debug.ll index 294b3a773ef51..4454a56c40ef7 100644 --- a/llvm/test/Linker/thinlto_funcimport_debug.ll +++ b/llvm/test/Linker/thinlto_funcimport_debug.ll @@ -80,8 +80,8 @@ attributes #1 = { nounwind readnone } !26 = !DILocation(line: 9, column: 3, scope: !4) !27 = distinct !DISubprogram(name: "func3", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !28) !28 = !{!29} -!29 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) +!29 = !DILocalVariable(name: "n", arg: 1, scope: !27, file: !1, line: 8, type: !33) !30 = distinct !DISubprogram(name: "func4", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !31) !31 = !{!32} !32 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) - +!33 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !30, file: !1, line: 13, baseType: !7) diff --git a/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s b/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s new file mode 100644 index 0000000000000..c22331b9f18e4 --- /dev/null +++ b/llvm/test/MC/AArch64/arm-mops-go-diagnostics.s @@ -0,0 +1,36 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mops-go,+mte < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +// Operands must be different from each other + +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same +setgop [x0]!, x0! +setgom [x0]!, x0! +setgoe [x0]!, x0! + +// SP cannot be used as argument at any position + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgop [sp]!, x1! +setgop [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgom [sp]!, x1! +setgom [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: invalid operand for instruction +setgoe [sp]!, x1! +setgoe [x0]!, sp! + +// CHECK-ERROR: error: invalid operand for instruction +setgop [xzr]!, x1! + +// CHECK-ERROR: error: invalid operand for instruction +setgom [xzr]!, x1! + +// CHECK-ERROR: error: invalid operand for instruction +setgoe [xzr]!, x1! 
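+
+// For contrast, a minimal well-formed sequence (assumed from the positive
+// tests in arm-mops-go.s) keeps the destination and size operands in
+// distinct general-purpose registers, following the usual MOPS
+// prologue/main/epilogue split (the p/m/e suffixes):
+//   setgop [x0]!, x1!
+//   setgom [x0]!, x1!
+//   setgoe [x0]!, x1!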
diff --git a/llvm/test/MC/AArch64/arm-mops-go.s b/llvm/test/MC/AArch64/arm-mops-go.s new file mode 100644 index 0000000000000..0b7809c252b86 --- /dev/null +++ b/llvm/test/MC/AArch64/arm-mops-go.s @@ -0,0 +1,89 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops-go,+mte < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops-go,+mte < %s \ +// RUN: | llvm-objdump -d --mattr=+mops-go,+mte --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops-go,+mte < %s \ +// RUN: | llvm-objdump -d --mattr=-mops-go,-mte --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops-go,+mte < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+mops-go,+mte -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +//------------------------------------------------------------------------------ +// FEAT_MOPS_GO Extension instructions +//------------------------------------------------------------------------------ + +setgop [x3]!, x2! +// CHECK-INST: setgop [x3]!, x2! +// CHECK-ENCODING: [0x43,0x00,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf0043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgom [x3]!, x2! +// CHECK-INST: setgom [x3]!, x2! +// CHECK-ENCODING: [0x43,0x40,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf4043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoe [x3]!, x2! +// CHECK-INST: setgoe [x3]!, x2! +// CHECK-ENCODING: [0x43,0x80,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf8043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgopn [x3]!, x2! +// CHECK-INST: setgopn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x20,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf2043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomn [x3]!, x2! +// CHECK-INST: setgomn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x60,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf6043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoen [x3]!, x2! +// CHECK-INST: setgoen [x3]!, x2! +// CHECK-ENCODING: [0x43,0xa0,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddfa043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgopt [x3]!, x2! +// CHECK-INST: setgopt [x3]!, x2! +// CHECK-ENCODING: [0x43,0x10,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf1043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomt [x3]!, x2! +// CHECK-INST: setgomt [x3]!, x2! +// CHECK-ENCODING: [0x43,0x50,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf5043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoet [x3]!, x2! +// CHECK-INST: setgoet [x3]!, x2! +// CHECK-ENCODING: [0x43,0x90,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf9043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoptn [x3]!, x2! +// CHECK-INST: setgoptn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x30,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf3043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgomtn [x3]!, x2! +// CHECK-INST: setgomtn [x3]!, x2! +// CHECK-ENCODING: [0x43,0x70,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddf7043 +// CHECK-ERROR: instruction requires: mops-go mte + +setgoetn [x3]!, x2! +// CHECK-INST: setgoetn [x3]!, x2! 
+// CHECK-ENCODING: [0x43,0xb0,0xdf,0x1d] +// CHECK-UNKNOWN: 1ddfb043 +// CHECK-ERROR: instruction requires: mops-go mte diff --git a/llvm/test/MC/AArch64/prfum.s b/llvm/test/MC/AArch64/prfum.s new file mode 100644 index 0000000000000..81a864a694325 --- /dev/null +++ b/llvm/test/MC/AArch64/prfum.s @@ -0,0 +1,44 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding --print-imm-hex=false < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \ +// RUN: | llvm-objdump -d --print-imm-hex=false - | FileCheck %s --check-prefix=CHECK-INST +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -disassemble -show-encoding --print-imm-hex=false \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). + +prfm pldl1keep, [x0, #-256] +// CHECK-INST: prfum pldl1keep, [x0, #-256] +// CHECK-ENCODING: [0x00,0x00,0x90,0xf8] + +prfm pldl1keep, [x0, #-8] +// CHECK-INST: prfum pldl1keep, [x0, #-8] +// CHECK-ENCODING: [0x00,0x80,0x9f,0xf8] + +prfm pldl1keep, [x0, #-1] +// CHECK-INST: prfum pldl1keep, [x0, #-1] +// CHECK-ENCODING: [0x00,0xf0,0x9f,0xf8] + +prfm pldl1keep, [x0, #0] +// CHECK-INST: prfm pldl1keep, [x0] +// CHECK-ENCODING: [0x00,0x00,0x80,0xf9] + +prfm pldl1keep, [x0, #1] +// CHECK-INST: prfum pldl1keep, [x0, #1] +// CHECK-ENCODING: [0x00,0x10,0x80,0xf8] + +prfm pldl1keep, [x0, #8] +// CHECK-INST: prfm pldl1keep, [x0, #8] +// CHECK-ENCODING: [0x00,0x04,0x80,0xf9] + +prfm pldl1keep, [x0, #255] +// CHECK-INST: prfum pldl1keep, [x0, #255] +// CHECK-ENCODING: [0x00,0xf0,0x8f,0xf8] + +prfm pldl1keep, [x0, #256] +// CHECK-INST: prfm pldl1keep, [x0, #256] +// CHECK-ENCODING: [0x00,0x80,0x80,0xf9] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s index fec8ba19f93fe..0a480a73cde5b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s @@ -2,33 +2,33 @@ ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s tensor_load_to_lds s[0:3], s[4:11] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10] +// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s index c393d3e819880..3f6d8feb45df0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0931523bbf40c..37ad6eb249da4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null v_tanh_bf16_e64 v5, -1 // GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] @@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null v_rcp_bf16_e64 v5, -1 // GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16_e64 v5, v1 // GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] @@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null v_sqrt_bf16_e64 v5, -1 // GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16_e64 v5, v1 // GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] @@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null v_rsq_bf16_e64 v5, -1 // GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16_e64 v5, v1 // GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] @@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null v_log_bf16_e64 v5, -1 // GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: 
v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16_e64 v5, v1 // GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] @@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null v_exp_bf16_e64 v5, -1 // GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16_e64 v5, v1 // GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] @@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null v_sin_bf16_e64 v5, -1 // GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5, v1 // GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] @@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null v_cos_bf16_e64 v5, -1 // GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 5ac9eb47381d6..52f9ba3a99483 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null v_tanh_bf16_e64 v5.l, -1 // GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4036,15 +4027,6 @@ 
v_rcp_bf16_e64 v5.l, null v_rcp_bf16_e64 v5.l, -1 // GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] @@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null v_sqrt_bf16_e64 v5.l, -1 // GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] @@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null v_rsq_bf16_e64 v5.l, -1 // GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] @@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null v_log_bf16_e64 v5.l, -1 // GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] @@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null v_exp_bf16_e64 v5.l, -1 // GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] @@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null v_sin_bf16_e64 v5.l, -1 // GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] @@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null v_cos_bf16_e64 v5.l, -1 // GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5.h, v128.h // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index b21fca654590a..21077fe4f9f05 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index d1638565a386a..646acf5219d7e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // 
GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on 
this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 78afa10b984cb..1907a939b488b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - 
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
index 6ec4d5f48f8b1..35a51dbe9f922 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
@@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index 8185b77beb935..fcfff9ac5b63d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -4,1906 +4,1906 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse ; encoding: [0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; encoding: [0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15],
v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: 
error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction 
requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: 
instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: 
[0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse ; encoding: [0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; encoding: [0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s1, s2
-// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 2, -4
-// GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 2, -4
-// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] 
matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse -// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: 
[0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// 
GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: 
error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: 
[0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: 
instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: 
[0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// 
GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; 
encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 
v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; 
encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: 
[0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c] -// 
WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] 
matrix_b_reuse ; encoding: [0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: 
v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX12-ERR: 
:[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_fp8 
v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: 
instruction requires wavesize=32 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// 
GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], 
s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: 
[0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, 
v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: 
v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0 -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse -// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] -// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 -// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU +// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU +// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c] +// WAVESIZE-ERR: 
:[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32

v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
index 9afaa075ea838..800579391d8eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
@@ -1,25 +1,25 @@
# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10

-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
index 67747a65ee52a..0b393973b7875 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
@@ -4123,18 +4123,10 @@
# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
@@ -4159,10 +4151,6 @@
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
@@ -4223,18 +4211,10 @@
0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00
# GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]

-0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
@@ -4259,10 +4239,6 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
@@ -4287,18 +4263,10 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
@@ -4323,10 +4291,6 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
@@ -4351,18 +4315,10 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
@@ -4387,10 +4343,6 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
@@ -4415,18 +4367,10 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
@@ -4451,10 +4395,6 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
@@ -4479,18 +4419,10 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
@@ -4515,10 +4447,6 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
@@ -4543,18 +4471,10 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
@@ -4579,10 +4499,6 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
@@ -4607,18 +4523,10 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
@@ -4643,10 +4551,6 @@
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
index 7c29f8ab01a1b..8b26d2a8696e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
@@ -104,18 +104,6 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -197,18 +185,6 @@
0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]

-0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -257,18 +233,6 @@
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -317,18 +281,6 @@
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -377,18 +329,6 @@
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -437,18 +377,6 @@
# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -497,18 +425,6 @@
# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -557,18 +473,6 @@
# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
index d26bc46a1f272..15f76c54a1c65 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
@@ -34,22 +34,10 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
@@ -57,142 +45,58 @@
0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

-0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

-0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: 
v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt index a409dac321f83..5d73cbd512edb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt @@ -586,233 +586,233 @@ 0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c # GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c] -0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00] +0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4 ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04] -0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00] +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04] -0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08] +0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: 
[0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c] -0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08] +0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c] -0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00] +0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04] -0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28] +0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c] -0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28] +0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c] -0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00] +0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04 +# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04] -0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00 -# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00] 
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
+0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 2, -4 ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
 0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b
 # GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b]
@@ -1000,92 +1000,92 @@
 0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c
 # GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]

-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]

-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index f5cb4b72959f9..b27a50d93f5b9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -82,12 +82,18 @@
 #CHECK: lxvprll 6, 2, 1
 0x7c 0xc2 0x0c 0xda
 
+#CHECK: lxvpb32x 2, 15, 16
+0x7c,0x4f,0x86,0xda
+
 #CHECK: stxvprl 0, 1, 2
 0x7c 0x01 0x15 0x9a
 
 #CHECK: stxvprll 6, 0, 1
 0x7c 0xc0 0x0d 0xda
 
+#CHECK: stxvpb32x 2, 15, 16
+0x7c,0x4f,0x87,0xda
+
 #CHECK: dmxvi8gerx4 1, 2, 4
 0xec,0x82,0x20,0x58
 
@@ -244,6 +250,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x10,0x23,0x31,0x03
 
+#CHECK: xvrlw 34, 15, 16
+0xf0,0x4f,0x85,0xc1
+
 #CHECK: xxaes192encp 8, 10, 14
 0xf1,0x0b,0x76,0x10
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index f0df8ce39021b..72662d9736740 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -76,12 +76,18 @@
 #CHECK: lxvprll 6, 2, 1
 0xda 0x0c 0xc2 0x7c
 
+#CHECK: lxvpb32x 2, 15, 16
+0xda,0x86,0x4f,0x7c
+
 #CHECK: stxvprl 0, 1, 2
 0x9a 0x15 0x01 0x7c
 
 #CHECK: stxvprll 6, 0, 1
 0xda 0x0d 0xc0 0x7c
 
+#CHECK: stxvpb32x 2, 15, 16
+0xda,0x87,0x4f,0x7c
+
 #CHECK: dmxvi8gerx4 1, 2, 4
 0x58,0x20,0x82,0xec
 
@@ -238,6 +244,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x03,0x31,0x23,0x10
 
+#CHECK: xvrlw 34, 15, 16
+0xc1,0x85,0x4f,0xf0
+
 #CHECK: xxaes192encp 8, 10, 14
 0x10,0x76,0x0b,0xf1
 
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
index 57e3153da401b..5c2927afbda4c 100755
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -1,70 +1,6 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
 # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
 
-# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
-0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
-0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
-
-# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
 0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
@@ -97,70 +33,6 @@
 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
 
-# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rs 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456]
-0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6
-# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64]
-0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40
-
-# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
-0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
-
 # ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6
 # INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456]
 0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
index f372c42982b1b..347e61cdfc4b8 100644
--- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt
@@ -9,11 +9,3 @@
 # INTEL: tmmultf32ps tmm3, tmm2, tmm1
 0xc4,0xe2,0x71,0x48,0xda
 
-# ATT: ttmmultf32ps %tmm4, %tmm5, %tmm6
-# INTEL: ttmmultf32ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x48,0xf5
-
-# ATT: ttmmultf32ps %tmm1, %tmm2, %tmm3
-# INTEL: ttmmultf32ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x48,0xda
-
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
deleted file mode 100644
index d768630ac1475..0000000000000
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
+++ /dev/null
@@ -1,154 +0,0 @@
-# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
-# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
-
-# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456]
-0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
-0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4
-# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456]
-0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10
-
-# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291]
-0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00
-
-# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
-# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
-0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
-
-# ATT: ttransposed %tmm1, %tmm2
-# INTEL: ttransposed tmm2, tmm1
-0xc4,0xe2,0x7a,0x5f,0xd1
-
-# ATT: ttransposed %tmm2, %tmm3
-# INTEL: ttransposed tmm3, tmm2
-0xc4,0xe2,0x7a,0x5f,0xda
-
-# ATT: ttdpbf16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpbf16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x42,0x6c,0xee
-
-# ATT: ttdpbf16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpbf16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6c,0xda
-
-# ATT: ttdpfp16ps %tmm7, %tmm6, %tmm5
-# INTEL: ttdpfp16ps tmm5, tmm6, tmm7
-0xc4,0xe2,0x43,0x6c,0xee
-
-# ATT: ttdpfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttdpfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6c,0xda
-
-# ATT: ttcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5b,0x6b,0xf5
-
-# ATT: ttcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x73,0x6b,0xda
-
-# ATT: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: ttcmmrlfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x5a,0x6b,0xf5
-
-# ATT: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: ttcmmrlfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x72,0x6b,0xda
-
-# ATT: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
-# INTEL: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-0xc4,0xe2,0x58,0x6b,0xf5
-
-# ATT: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
-# INTEL: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-0xc4,0xe2,0x70,0x6b,0xda
-
-# ATT: tconjtfp16 %tmm5, %tmm6
-# INTEL: tconjtfp16 tmm6, tmm5
-0xc4,0xe2,0x79,0x6b,0xf5
-
-# ATT: tconjtfp16 %tmm2, %tmm3
-# INTEL: tconjtfp16 tmm3, tmm2
-0xc4,0xe2,0x79,0x6b,0xda
diff --git a/llvm/test/MC/MachO/invalid-section-index.s b/llvm/test/MC/MachO/invalid-section-index.s
new file mode 100644
index 
0000000000000..104e8a82e43af --- /dev/null +++ b/llvm/test/MC/MachO/invalid-section-index.s @@ -0,0 +1,1573 @@ +// REQUIRES: aarch64-registered-target + +/// Test that when there are more than 255 sections, an error is emitted reporting that there are too many sections. Mach-O symbol table entries store the section index in a single byte (n_sect), so at most 255 sections are addressable. + +// RUN: not llvm-mc -filetype=obj -triple arm64-apple-darwin %s -o - 2>&1 | FileCheck %s --check-prefix=MACHOERROR + +// MACHOERROR: error: Too many sections! +// MACHOERROR-NEXT: error: Invalid section index! +// MACHOERROR-NEXT: error: Invalid section index! + + .section __TEXT,__text,regular,pure_instructions + .globl _main ; -- Begin function main + .p2align 2 +_main: ; @main + .cfi_startproc +; %bb.0: ; %entry + sub sp, sp, #16 + .cfi_def_cfa_offset 16 + mov w0, #0 ; =0x0 + str wzr, [sp, #12] + add sp, sp, #16 + ret + .cfi_endproc + ; -- End function + .section seg,sect0 + .globl _var0 ; @var0 + .p2align 2, 0x0 +_var0: + .long 0 ; 0x0 + + .section seg,sect1 + .globl _var1 ; @var1 + .p2align 2, 0x0 +_var1: + .long 1 ; 0x1 + + .section seg,sect2 + .globl _var2 ; @var2 + .p2align 2, 0x0 +_var2: + .long 2 ; 0x2 + + .section seg,sect3 + .globl _var3 ; @var3 + .p2align 2, 0x0 +_var3: + .long 3 ; 0x3 + + .section seg,sect4 + .globl _var4 ; @var4 + .p2align 2, 0x0 +_var4: + .long 4 ; 0x4 + + .section seg,sect5 + .globl _var5 ; @var5 + .p2align 2, 0x0 +_var5: + .long 5 ; 0x5 + + .section seg,sect6 + .globl _var6 ; @var6 + .p2align 2, 0x0 +_var6: + .long 6 ; 0x6 + + .section seg,sect7 + .globl _var7 ; @var7 + .p2align 2, 0x0 +_var7: + .long 7 ; 0x7 + + .section seg,sect8 + .globl _var8 ; @var8 + .p2align 2, 0x0 +_var8: + .long 8 ; 0x8 + + .section seg,sect9 + .globl _var9 ; @var9 + .p2align 2, 0x0 +_var9: + .long 9 ; 0x9 + + .section seg,sect10 + .globl _var10 ; @var10 + .p2align 2, 0x0 +_var10: + .long 10 ; 0xa + + .section seg,sect11 + .globl _var11 ; @var11 + .p2align 2, 0x0 +_var11: + .long 11 ; 0xb + + .section seg,sect12 + .globl _var12 ; @var12 + .p2align 2, 0x0 +_var12: + .long 12 ; 0xc + + .section seg,sect13 + .globl _var13 ; @var13 + .p2align 2, 0x0 +_var13: + .long 13 ; 0xd + + .section seg,sect14 + .globl _var14 ; @var14 + .p2align 2, 0x0 +_var14: + .long 14 ; 0xe + + .section seg,sect15 + .globl _var15 ; @var15 + .p2align 2, 0x0 +_var15: + .long 15 ; 0xf + + .section seg,sect16 + .globl _var16 ; @var16 + .p2align 2, 0x0 +_var16: + .long 16 ; 0x10 + + .section seg,sect17 + .globl _var17 ; @var17 + .p2align 2, 0x0 +_var17: + .long 17 ; 0x11 + + .section seg,sect18 + .globl _var18 ; @var18 + .p2align 2, 0x0 +_var18: + .long 18 ; 0x12 + + .section seg,sect19 + .globl _var19 ; @var19 + .p2align 2, 0x0 +_var19: + .long 19 ; 0x13 + + .section seg,sect20 + .globl _var20 ; @var20 + .p2align 2, 0x0 +_var20: + .long 20 ; 0x14 + + .section seg,sect21 + .globl _var21 ; @var21 + .p2align 2, 0x0 +_var21: + .long 21 ; 0x15 + + .section seg,sect22 + .globl _var22 ; @var22 + .p2align 2, 0x0 +_var22: + .long 22 ; 0x16 + + .section seg,sect23 + .globl _var23 ; @var23 + .p2align 2, 0x0 +_var23: + .long 23 ; 0x17 + + .section seg,sect24 + .globl _var24 ; @var24 + .p2align 2, 0x0 +_var24: + .long 24 ; 0x18 + + .section seg,sect25 + .globl _var25 ; @var25 + .p2align 2, 0x0 +_var25: + .long 25 ; 0x19 + + .section seg,sect26 + .globl _var26 ; @var26 + .p2align 2, 0x0 +_var26: + .long 26 ; 0x1a + + .section seg,sect27 + .globl _var27 ; @var27 + .p2align 2, 0x0 +_var27: + .long 27 ; 0x1b + + .section seg,sect28 + .globl _var28 ; @var28 + .p2align 2, 0x0 +_var28: + .long 28 ; 0x1c + + .section seg,sect29 + .globl _var29 ; @var29 + .p2align 2, 0x0 +_var29: + 
.long 29 ; 0x1d + + .section seg,sect30 + .globl _var30 ; @var30 + .p2align 2, 0x0 +_var30: + .long 30 ; 0x1e + + .section seg,sect31 + .globl _var31 ; @var31 + .p2align 2, 0x0 +_var31: + .long 31 ; 0x1f + + .section seg,sect32 + .globl _var32 ; @var32 + .p2align 2, 0x0 +_var32: + .long 32 ; 0x20 + + .section seg,sect33 + .globl _var33 ; @var33 + .p2align 2, 0x0 +_var33: + .long 33 ; 0x21 + + .section seg,sect34 + .globl _var34 ; @var34 + .p2align 2, 0x0 +_var34: + .long 34 ; 0x22 + + .section seg,sect35 + .globl _var35 ; @var35 + .p2align 2, 0x0 +_var35: + .long 35 ; 0x23 + + .section seg,sect36 + .globl _var36 ; @var36 + .p2align 2, 0x0 +_var36: + .long 36 ; 0x24 + + .section seg,sect37 + .globl _var37 ; @var37 + .p2align 2, 0x0 +_var37: + .long 37 ; 0x25 + + .section seg,sect38 + .globl _var38 ; @var38 + .p2align 2, 0x0 +_var38: + .long 38 ; 0x26 + + .section seg,sect39 + .globl _var39 ; @var39 + .p2align 2, 0x0 +_var39: + .long 39 ; 0x27 + + .section seg,sect40 + .globl _var40 ; @var40 + .p2align 2, 0x0 +_var40: + .long 40 ; 0x28 + + .section seg,sect41 + .globl _var41 ; @var41 + .p2align 2, 0x0 +_var41: + .long 41 ; 0x29 + + .section seg,sect42 + .globl _var42 ; @var42 + .p2align 2, 0x0 +_var42: + .long 42 ; 0x2a + + .section seg,sect43 + .globl _var43 ; @var43 + .p2align 2, 0x0 +_var43: + .long 43 ; 0x2b + + .section seg,sect44 + .globl _var44 ; @var44 + .p2align 2, 0x0 +_var44: + .long 44 ; 0x2c + + .section seg,sect45 + .globl _var45 ; @var45 + .p2align 2, 0x0 +_var45: + .long 45 ; 0x2d + + .section seg,sect46 + .globl _var46 ; @var46 + .p2align 2, 0x0 +_var46: + .long 46 ; 0x2e + + .section seg,sect47 + .globl _var47 ; @var47 + .p2align 2, 0x0 +_var47: + .long 47 ; 0x2f + + .section seg,sect48 + .globl _var48 ; @var48 + .p2align 2, 0x0 +_var48: + .long 48 ; 0x30 + + .section seg,sect49 + .globl _var49 ; @var49 + .p2align 2, 0x0 +_var49: + .long 49 ; 0x31 + + .section seg,sect50 + .globl _var50 ; @var50 + .p2align 2, 0x0 +_var50: + .long 50 ; 0x32 + + .section seg,sect51 + .globl _var51 ; @var51 + .p2align 2, 0x0 +_var51: + .long 51 ; 0x33 + + .section seg,sect52 + .globl _var52 ; @var52 + .p2align 2, 0x0 +_var52: + .long 52 ; 0x34 + + .section seg,sect53 + .globl _var53 ; @var53 + .p2align 2, 0x0 +_var53: + .long 53 ; 0x35 + + .section seg,sect54 + .globl _var54 ; @var54 + .p2align 2, 0x0 +_var54: + .long 54 ; 0x36 + + .section seg,sect55 + .globl _var55 ; @var55 + .p2align 2, 0x0 +_var55: + .long 55 ; 0x37 + + .section seg,sect56 + .globl _var56 ; @var56 + .p2align 2, 0x0 +_var56: + .long 56 ; 0x38 + + .section seg,sect57 + .globl _var57 ; @var57 + .p2align 2, 0x0 +_var57: + .long 57 ; 0x39 + + .section seg,sect58 + .globl _var58 ; @var58 + .p2align 2, 0x0 +_var58: + .long 58 ; 0x3a + + .section seg,sect59 + .globl _var59 ; @var59 + .p2align 2, 0x0 +_var59: + .long 59 ; 0x3b + + .section seg,sect60 + .globl _var60 ; @var60 + .p2align 2, 0x0 +_var60: + .long 60 ; 0x3c + + .section seg,sect61 + .globl _var61 ; @var61 + .p2align 2, 0x0 +_var61: + .long 61 ; 0x3d + + .section seg,sect62 + .globl _var62 ; @var62 + .p2align 2, 0x0 +_var62: + .long 62 ; 0x3e + + .section seg,sect63 + .globl _var63 ; @var63 + .p2align 2, 0x0 +_var63: + .long 63 ; 0x3f + + .section seg,sect64 + .globl _var64 ; @var64 + .p2align 2, 0x0 +_var64: + .long 64 ; 0x40 + + .section seg,sect65 + .globl _var65 ; @var65 + .p2align 2, 0x0 +_var65: + .long 65 ; 0x41 + + .section seg,sect66 + .globl _var66 ; @var66 + .p2align 2, 0x0 +_var66: + .long 66 ; 0x42 + + .section seg,sect67 + .globl _var67 ; @var67 + .p2align 
2, 0x0 +_var67: + .long 67 ; 0x43 + + .section seg,sect68 + .globl _var68 ; @var68 + .p2align 2, 0x0 +_var68: + .long 68 ; 0x44 + + .section seg,sect69 + .globl _var69 ; @var69 + .p2align 2, 0x0 +_var69: + .long 69 ; 0x45 + + .section seg,sect70 + .globl _var70 ; @var70 + .p2align 2, 0x0 +_var70: + .long 70 ; 0x46 + + .section seg,sect71 + .globl _var71 ; @var71 + .p2align 2, 0x0 +_var71: + .long 71 ; 0x47 + + .section seg,sect72 + .globl _var72 ; @var72 + .p2align 2, 0x0 +_var72: + .long 72 ; 0x48 + + .section seg,sect73 + .globl _var73 ; @var73 + .p2align 2, 0x0 +_var73: + .long 73 ; 0x49 + + .section seg,sect74 + .globl _var74 ; @var74 + .p2align 2, 0x0 +_var74: + .long 74 ; 0x4a + + .section seg,sect75 + .globl _var75 ; @var75 + .p2align 2, 0x0 +_var75: + .long 75 ; 0x4b + + .section seg,sect76 + .globl _var76 ; @var76 + .p2align 2, 0x0 +_var76: + .long 76 ; 0x4c + + .section seg,sect77 + .globl _var77 ; @var77 + .p2align 2, 0x0 +_var77: + .long 77 ; 0x4d + + .section seg,sect78 + .globl _var78 ; @var78 + .p2align 2, 0x0 +_var78: + .long 78 ; 0x4e + + .section seg,sect79 + .globl _var79 ; @var79 + .p2align 2, 0x0 +_var79: + .long 79 ; 0x4f + + .section seg,sect80 + .globl _var80 ; @var80 + .p2align 2, 0x0 +_var80: + .long 80 ; 0x50 + + .section seg,sect81 + .globl _var81 ; @var81 + .p2align 2, 0x0 +_var81: + .long 81 ; 0x51 + + .section seg,sect82 + .globl _var82 ; @var82 + .p2align 2, 0x0 +_var82: + .long 82 ; 0x52 + + .section seg,sect83 + .globl _var83 ; @var83 + .p2align 2, 0x0 +_var83: + .long 83 ; 0x53 + + .section seg,sect84 + .globl _var84 ; @var84 + .p2align 2, 0x0 +_var84: + .long 84 ; 0x54 + + .section seg,sect85 + .globl _var85 ; @var85 + .p2align 2, 0x0 +_var85: + .long 85 ; 0x55 + + .section seg,sect86 + .globl _var86 ; @var86 + .p2align 2, 0x0 +_var86: + .long 86 ; 0x56 + + .section seg,sect87 + .globl _var87 ; @var87 + .p2align 2, 0x0 +_var87: + .long 87 ; 0x57 + + .section seg,sect88 + .globl _var88 ; @var88 + .p2align 2, 0x0 +_var88: + .long 88 ; 0x58 + + .section seg,sect89 + .globl _var89 ; @var89 + .p2align 2, 0x0 +_var89: + .long 89 ; 0x59 + + .section seg,sect90 + .globl _var90 ; @var90 + .p2align 2, 0x0 +_var90: + .long 90 ; 0x5a + + .section seg,sect91 + .globl _var91 ; @var91 + .p2align 2, 0x0 +_var91: + .long 91 ; 0x5b + + .section seg,sect92 + .globl _var92 ; @var92 + .p2align 2, 0x0 +_var92: + .long 92 ; 0x5c + + .section seg,sect93 + .globl _var93 ; @var93 + .p2align 2, 0x0 +_var93: + .long 93 ; 0x5d + + .section seg,sect94 + .globl _var94 ; @var94 + .p2align 2, 0x0 +_var94: + .long 94 ; 0x5e + + .section seg,sect95 + .globl _var95 ; @var95 + .p2align 2, 0x0 +_var95: + .long 95 ; 0x5f + + .section seg,sect96 + .globl _var96 ; @var96 + .p2align 2, 0x0 +_var96: + .long 96 ; 0x60 + + .section seg,sect97 + .globl _var97 ; @var97 + .p2align 2, 0x0 +_var97: + .long 97 ; 0x61 + + .section seg,sect98 + .globl _var98 ; @var98 + .p2align 2, 0x0 +_var98: + .long 98 ; 0x62 + + .section seg,sect99 + .globl _var99 ; @var99 + .p2align 2, 0x0 +_var99: + .long 99 ; 0x63 + + .section seg,sect100 + .globl _var100 ; @var100 + .p2align 2, 0x0 +_var100: + .long 100 ; 0x64 + + .section seg,sect101 + .globl _var101 ; @var101 + .p2align 2, 0x0 +_var101: + .long 101 ; 0x65 + + .section seg,sect102 + .globl _var102 ; @var102 + .p2align 2, 0x0 +_var102: + .long 102 ; 0x66 + + .section seg,sect103 + .globl _var103 ; @var103 + .p2align 2, 0x0 +_var103: + .long 103 ; 0x67 + + .section seg,sect104 + .globl _var104 ; @var104 + .p2align 2, 0x0 +_var104: + .long 104 ; 0x68 + + .section 
seg,sect105 + .globl _var105 ; @var105 + .p2align 2, 0x0 +_var105: + .long 105 ; 0x69 + + .section seg,sect106 + .globl _var106 ; @var106 + .p2align 2, 0x0 +_var106: + .long 106 ; 0x6a + + .section seg,sect107 + .globl _var107 ; @var107 + .p2align 2, 0x0 +_var107: + .long 107 ; 0x6b + + .section seg,sect108 + .globl _var108 ; @var108 + .p2align 2, 0x0 +_var108: + .long 108 ; 0x6c + + .section seg,sect109 + .globl _var109 ; @var109 + .p2align 2, 0x0 +_var109: + .long 109 ; 0x6d + + .section seg,sect110 + .globl _var110 ; @var110 + .p2align 2, 0x0 +_var110: + .long 110 ; 0x6e + + .section seg,sect111 + .globl _var111 ; @var111 + .p2align 2, 0x0 +_var111: + .long 111 ; 0x6f + + .section seg,sect112 + .globl _var112 ; @var112 + .p2align 2, 0x0 +_var112: + .long 112 ; 0x70 + + .section seg,sect113 + .globl _var113 ; @var113 + .p2align 2, 0x0 +_var113: + .long 113 ; 0x71 + + .section seg,sect114 + .globl _var114 ; @var114 + .p2align 2, 0x0 +_var114: + .long 114 ; 0x72 + + .section seg,sect115 + .globl _var115 ; @var115 + .p2align 2, 0x0 +_var115: + .long 115 ; 0x73 + + .section seg,sect116 + .globl _var116 ; @var116 + .p2align 2, 0x0 +_var116: + .long 116 ; 0x74 + + .section seg,sect117 + .globl _var117 ; @var117 + .p2align 2, 0x0 +_var117: + .long 117 ; 0x75 + + .section seg,sect118 + .globl _var118 ; @var118 + .p2align 2, 0x0 +_var118: + .long 118 ; 0x76 + + .section seg,sect119 + .globl _var119 ; @var119 + .p2align 2, 0x0 +_var119: + .long 119 ; 0x77 + + .section seg,sect120 + .globl _var120 ; @var120 + .p2align 2, 0x0 +_var120: + .long 120 ; 0x78 + + .section seg,sect121 + .globl _var121 ; @var121 + .p2align 2, 0x0 +_var121: + .long 121 ; 0x79 + + .section seg,sect122 + .globl _var122 ; @var122 + .p2align 2, 0x0 +_var122: + .long 122 ; 0x7a + + .section seg,sect123 + .globl _var123 ; @var123 + .p2align 2, 0x0 +_var123: + .long 123 ; 0x7b + + .section seg,sect124 + .globl _var124 ; @var124 + .p2align 2, 0x0 +_var124: + .long 124 ; 0x7c + + .section seg,sect125 + .globl _var125 ; @var125 + .p2align 2, 0x0 +_var125: + .long 125 ; 0x7d + + .section seg,sect126 + .globl _var126 ; @var126 + .p2align 2, 0x0 +_var126: + .long 126 ; 0x7e + + .section seg,sect127 + .globl _var127 ; @var127 + .p2align 2, 0x0 +_var127: + .long 127 ; 0x7f + + .section seg,sect128 + .globl _var128 ; @var128 + .p2align 2, 0x0 +_var128: + .long 128 ; 0x80 + + .section seg,sect129 + .globl _var129 ; @var129 + .p2align 2, 0x0 +_var129: + .long 129 ; 0x81 + + .section seg,sect130 + .globl _var130 ; @var130 + .p2align 2, 0x0 +_var130: + .long 130 ; 0x82 + + .section seg,sect131 + .globl _var131 ; @var131 + .p2align 2, 0x0 +_var131: + .long 131 ; 0x83 + + .section seg,sect132 + .globl _var132 ; @var132 + .p2align 2, 0x0 +_var132: + .long 132 ; 0x84 + + .section seg,sect133 + .globl _var133 ; @var133 + .p2align 2, 0x0 +_var133: + .long 133 ; 0x85 + + .section seg,sect134 + .globl _var134 ; @var134 + .p2align 2, 0x0 +_var134: + .long 134 ; 0x86 + + .section seg,sect135 + .globl _var135 ; @var135 + .p2align 2, 0x0 +_var135: + .long 135 ; 0x87 + + .section seg,sect136 + .globl _var136 ; @var136 + .p2align 2, 0x0 +_var136: + .long 136 ; 0x88 + + .section seg,sect137 + .globl _var137 ; @var137 + .p2align 2, 0x0 +_var137: + .long 137 ; 0x89 + + .section seg,sect138 + .globl _var138 ; @var138 + .p2align 2, 0x0 +_var138: + .long 138 ; 0x8a + + .section seg,sect139 + .globl _var139 ; @var139 + .p2align 2, 0x0 +_var139: + .long 139 ; 0x8b + + .section seg,sect140 + .globl _var140 ; @var140 + .p2align 2, 0x0 +_var140: + .long 140 ; 0x8c + + 
.section seg,sect141 + .globl _var141 ; @var141 + .p2align 2, 0x0 +_var141: + .long 141 ; 0x8d + + .section seg,sect142 + .globl _var142 ; @var142 + .p2align 2, 0x0 +_var142: + .long 142 ; 0x8e + + .section seg,sect143 + .globl _var143 ; @var143 + .p2align 2, 0x0 +_var143: + .long 143 ; 0x8f + + .section seg,sect144 + .globl _var144 ; @var144 + .p2align 2, 0x0 +_var144: + .long 144 ; 0x90 + + .section seg,sect145 + .globl _var145 ; @var145 + .p2align 2, 0x0 +_var145: + .long 145 ; 0x91 + + .section seg,sect146 + .globl _var146 ; @var146 + .p2align 2, 0x0 +_var146: + .long 146 ; 0x92 + + .section seg,sect147 + .globl _var147 ; @var147 + .p2align 2, 0x0 +_var147: + .long 147 ; 0x93 + + .section seg,sect148 + .globl _var148 ; @var148 + .p2align 2, 0x0 +_var148: + .long 148 ; 0x94 + + .section seg,sect149 + .globl _var149 ; @var149 + .p2align 2, 0x0 +_var149: + .long 149 ; 0x95 + + .section seg,sect150 + .globl _var150 ; @var150 + .p2align 2, 0x0 +_var150: + .long 150 ; 0x96 + + .section seg,sect151 + .globl _var151 ; @var151 + .p2align 2, 0x0 +_var151: + .long 151 ; 0x97 + + .section seg,sect152 + .globl _var152 ; @var152 + .p2align 2, 0x0 +_var152: + .long 152 ; 0x98 + + .section seg,sect153 + .globl _var153 ; @var153 + .p2align 2, 0x0 +_var153: + .long 153 ; 0x99 + + .section seg,sect154 + .globl _var154 ; @var154 + .p2align 2, 0x0 +_var154: + .long 154 ; 0x9a + + .section seg,sect155 + .globl _var155 ; @var155 + .p2align 2, 0x0 +_var155: + .long 155 ; 0x9b + + .section seg,sect156 + .globl _var156 ; @var156 + .p2align 2, 0x0 +_var156: + .long 156 ; 0x9c + + .section seg,sect157 + .globl _var157 ; @var157 + .p2align 2, 0x0 +_var157: + .long 157 ; 0x9d + + .section seg,sect158 + .globl _var158 ; @var158 + .p2align 2, 0x0 +_var158: + .long 158 ; 0x9e + + .section seg,sect159 + .globl _var159 ; @var159 + .p2align 2, 0x0 +_var159: + .long 159 ; 0x9f + + .section seg,sect160 + .globl _var160 ; @var160 + .p2align 2, 0x0 +_var160: + .long 160 ; 0xa0 + + .section seg,sect161 + .globl _var161 ; @var161 + .p2align 2, 0x0 +_var161: + .long 161 ; 0xa1 + + .section seg,sect162 + .globl _var162 ; @var162 + .p2align 2, 0x0 +_var162: + .long 162 ; 0xa2 + + .section seg,sect163 + .globl _var163 ; @var163 + .p2align 2, 0x0 +_var163: + .long 163 ; 0xa3 + + .section seg,sect164 + .globl _var164 ; @var164 + .p2align 2, 0x0 +_var164: + .long 164 ; 0xa4 + + .section seg,sect165 + .globl _var165 ; @var165 + .p2align 2, 0x0 +_var165: + .long 165 ; 0xa5 + + .section seg,sect166 + .globl _var166 ; @var166 + .p2align 2, 0x0 +_var166: + .long 166 ; 0xa6 + + .section seg,sect167 + .globl _var167 ; @var167 + .p2align 2, 0x0 +_var167: + .long 167 ; 0xa7 + + .section seg,sect168 + .globl _var168 ; @var168 + .p2align 2, 0x0 +_var168: + .long 168 ; 0xa8 + + .section seg,sect169 + .globl _var169 ; @var169 + .p2align 2, 0x0 +_var169: + .long 169 ; 0xa9 + + .section seg,sect170 + .globl _var170 ; @var170 + .p2align 2, 0x0 +_var170: + .long 170 ; 0xaa + + .section seg,sect171 + .globl _var171 ; @var171 + .p2align 2, 0x0 +_var171: + .long 171 ; 0xab + + .section seg,sect172 + .globl _var172 ; @var172 + .p2align 2, 0x0 +_var172: + .long 172 ; 0xac + + .section seg,sect173 + .globl _var173 ; @var173 + .p2align 2, 0x0 +_var173: + .long 173 ; 0xad + + .section seg,sect174 + .globl _var174 ; @var174 + .p2align 2, 0x0 +_var174: + .long 174 ; 0xae + + .section seg,sect175 + .globl _var175 ; @var175 + .p2align 2, 0x0 +_var175: + .long 175 ; 0xaf + + .section seg,sect176 + .globl _var176 ; @var176 + .p2align 2, 0x0 +_var176: + .long 176 ; 
0xb0 + + .section seg,sect177 + .globl _var177 ; @var177 + .p2align 2, 0x0 +_var177: + .long 177 ; 0xb1 + + .section seg,sect178 + .globl _var178 ; @var178 + .p2align 2, 0x0 +_var178: + .long 178 ; 0xb2 + + .section seg,sect179 + .globl _var179 ; @var179 + .p2align 2, 0x0 +_var179: + .long 179 ; 0xb3 + + .section seg,sect180 + .globl _var180 ; @var180 + .p2align 2, 0x0 +_var180: + .long 180 ; 0xb4 + + .section seg,sect181 + .globl _var181 ; @var181 + .p2align 2, 0x0 +_var181: + .long 181 ; 0xb5 + + .section seg,sect182 + .globl _var182 ; @var182 + .p2align 2, 0x0 +_var182: + .long 182 ; 0xb6 + + .section seg,sect183 + .globl _var183 ; @var183 + .p2align 2, 0x0 +_var183: + .long 183 ; 0xb7 + + .section seg,sect184 + .globl _var184 ; @var184 + .p2align 2, 0x0 +_var184: + .long 184 ; 0xb8 + + .section seg,sect185 + .globl _var185 ; @var185 + .p2align 2, 0x0 +_var185: + .long 185 ; 0xb9 + + .section seg,sect186 + .globl _var186 ; @var186 + .p2align 2, 0x0 +_var186: + .long 186 ; 0xba + + .section seg,sect187 + .globl _var187 ; @var187 + .p2align 2, 0x0 +_var187: + .long 187 ; 0xbb + + .section seg,sect188 + .globl _var188 ; @var188 + .p2align 2, 0x0 +_var188: + .long 188 ; 0xbc + + .section seg,sect189 + .globl _var189 ; @var189 + .p2align 2, 0x0 +_var189: + .long 189 ; 0xbd + + .section seg,sect190 + .globl _var190 ; @var190 + .p2align 2, 0x0 +_var190: + .long 190 ; 0xbe + + .section seg,sect191 + .globl _var191 ; @var191 + .p2align 2, 0x0 +_var191: + .long 191 ; 0xbf + + .section seg,sect192 + .globl _var192 ; @var192 + .p2align 2, 0x0 +_var192: + .long 192 ; 0xc0 + + .section seg,sect193 + .globl _var193 ; @var193 + .p2align 2, 0x0 +_var193: + .long 193 ; 0xc1 + + .section seg,sect194 + .globl _var194 ; @var194 + .p2align 2, 0x0 +_var194: + .long 194 ; 0xc2 + + .section seg,sect195 + .globl _var195 ; @var195 + .p2align 2, 0x0 +_var195: + .long 195 ; 0xc3 + + .section seg,sect196 + .globl _var196 ; @var196 + .p2align 2, 0x0 +_var196: + .long 196 ; 0xc4 + + .section seg,sect197 + .globl _var197 ; @var197 + .p2align 2, 0x0 +_var197: + .long 197 ; 0xc5 + + .section seg,sect198 + .globl _var198 ; @var198 + .p2align 2, 0x0 +_var198: + .long 198 ; 0xc6 + + .section seg,sect199 + .globl _var199 ; @var199 + .p2align 2, 0x0 +_var199: + .long 199 ; 0xc7 + + .section seg,sect200 + .globl _var200 ; @var200 + .p2align 2, 0x0 +_var200: + .long 200 ; 0xc8 + + .section seg,sect201 + .globl _var201 ; @var201 + .p2align 2, 0x0 +_var201: + .long 201 ; 0xc9 + + .section seg,sect202 + .globl _var202 ; @var202 + .p2align 2, 0x0 +_var202: + .long 202 ; 0xca + + .section seg,sect203 + .globl _var203 ; @var203 + .p2align 2, 0x0 +_var203: + .long 203 ; 0xcb + + .section seg,sect204 + .globl _var204 ; @var204 + .p2align 2, 0x0 +_var204: + .long 204 ; 0xcc + + .section seg,sect205 + .globl _var205 ; @var205 + .p2align 2, 0x0 +_var205: + .long 205 ; 0xcd + + .section seg,sect206 + .globl _var206 ; @var206 + .p2align 2, 0x0 +_var206: + .long 206 ; 0xce + + .section seg,sect207 + .globl _var207 ; @var207 + .p2align 2, 0x0 +_var207: + .long 207 ; 0xcf + + .section seg,sect208 + .globl _var208 ; @var208 + .p2align 2, 0x0 +_var208: + .long 208 ; 0xd0 + + .section seg,sect209 + .globl _var209 ; @var209 + .p2align 2, 0x0 +_var209: + .long 209 ; 0xd1 + + .section seg,sect210 + .globl _var210 ; @var210 + .p2align 2, 0x0 +_var210: + .long 210 ; 0xd2 + + .section seg,sect211 + .globl _var211 ; @var211 + .p2align 2, 0x0 +_var211: + .long 211 ; 0xd3 + + .section seg,sect212 + .globl _var212 ; @var212 + .p2align 2, 0x0 +_var212: + 
.long 212 ; 0xd4 + + .section seg,sect213 + .globl _var213 ; @var213 + .p2align 2, 0x0 +_var213: + .long 213 ; 0xd5 + + .section seg,sect214 + .globl _var214 ; @var214 + .p2align 2, 0x0 +_var214: + .long 214 ; 0xd6 + + .section seg,sect215 + .globl _var215 ; @var215 + .p2align 2, 0x0 +_var215: + .long 215 ; 0xd7 + + .section seg,sect216 + .globl _var216 ; @var216 + .p2align 2, 0x0 +_var216: + .long 216 ; 0xd8 + + .section seg,sect217 + .globl _var217 ; @var217 + .p2align 2, 0x0 +_var217: + .long 217 ; 0xd9 + + .section seg,sect218 + .globl _var218 ; @var218 + .p2align 2, 0x0 +_var218: + .long 218 ; 0xda + + .section seg,sect219 + .globl _var219 ; @var219 + .p2align 2, 0x0 +_var219: + .long 219 ; 0xdb + + .section seg,sect220 + .globl _var220 ; @var220 + .p2align 2, 0x0 +_var220: + .long 220 ; 0xdc + + .section seg,sect221 + .globl _var221 ; @var221 + .p2align 2, 0x0 +_var221: + .long 221 ; 0xdd + + .section seg,sect222 + .globl _var222 ; @var222 + .p2align 2, 0x0 +_var222: + .long 222 ; 0xde + + .section seg,sect223 + .globl _var223 ; @var223 + .p2align 2, 0x0 +_var223: + .long 223 ; 0xdf + + .section seg,sect224 + .globl _var224 ; @var224 + .p2align 2, 0x0 +_var224: + .long 224 ; 0xe0 + + .section seg,sect225 + .globl _var225 ; @var225 + .p2align 2, 0x0 +_var225: + .long 225 ; 0xe1 + + .section seg,sect226 + .globl _var226 ; @var226 + .p2align 2, 0x0 +_var226: + .long 226 ; 0xe2 + + .section seg,sect227 + .globl _var227 ; @var227 + .p2align 2, 0x0 +_var227: + .long 227 ; 0xe3 + + .section seg,sect228 + .globl _var228 ; @var228 + .p2align 2, 0x0 +_var228: + .long 228 ; 0xe4 + + .section seg,sect229 + .globl _var229 ; @var229 + .p2align 2, 0x0 +_var229: + .long 229 ; 0xe5 + + .section seg,sect230 + .globl _var230 ; @var230 + .p2align 2, 0x0 +_var230: + .long 230 ; 0xe6 + + .section seg,sect231 + .globl _var231 ; @var231 + .p2align 2, 0x0 +_var231: + .long 231 ; 0xe7 + + .section seg,sect232 + .globl _var232 ; @var232 + .p2align 2, 0x0 +_var232: + .long 232 ; 0xe8 + + .section seg,sect233 + .globl _var233 ; @var233 + .p2align 2, 0x0 +_var233: + .long 233 ; 0xe9 + + .section seg,sect234 + .globl _var234 ; @var234 + .p2align 2, 0x0 +_var234: + .long 234 ; 0xea + + .section seg,sect235 + .globl _var235 ; @var235 + .p2align 2, 0x0 +_var235: + .long 235 ; 0xeb + + .section seg,sect236 + .globl _var236 ; @var236 + .p2align 2, 0x0 +_var236: + .long 236 ; 0xec + + .section seg,sect237 + .globl _var237 ; @var237 + .p2align 2, 0x0 +_var237: + .long 237 ; 0xed + + .section seg,sect238 + .globl _var238 ; @var238 + .p2align 2, 0x0 +_var238: + .long 238 ; 0xee + + .section seg,sect239 + .globl _var239 ; @var239 + .p2align 2, 0x0 +_var239: + .long 239 ; 0xef + + .section seg,sect240 + .globl _var240 ; @var240 + .p2align 2, 0x0 +_var240: + .long 240 ; 0xf0 + + .section seg,sect241 + .globl _var241 ; @var241 + .p2align 2, 0x0 +_var241: + .long 241 ; 0xf1 + + .section seg,sect242 + .globl _var242 ; @var242 + .p2align 2, 0x0 +_var242: + .long 242 ; 0xf2 + + .section seg,sect243 + .globl _var243 ; @var243 + .p2align 2, 0x0 +_var243: + .long 243 ; 0xf3 + + .section seg,sect244 + .globl _var244 ; @var244 + .p2align 2, 0x0 +_var244: + .long 244 ; 0xf4 + + .section seg,sect245 + .globl _var245 ; @var245 + .p2align 2, 0x0 +_var245: + .long 245 ; 0xf5 + + .section seg,sect246 + .globl _var246 ; @var246 + .p2align 2, 0x0 +_var246: + .long 246 ; 0xf6 + + .section seg,sect247 + .globl _var247 ; @var247 + .p2align 2, 0x0 +_var247: + .long 247 ; 0xf7 + + .section seg,sect248 + .globl _var248 ; @var248 + .p2align 2, 0x0 
+_var248: + .long 248 ; 0xf8 + + .section seg,sect249 + .globl _var249 ; @var249 + .p2align 2, 0x0 +_var249: + .long 249 ; 0xf9 + + .section seg,sect250 + .globl _var250 ; @var250 + .p2align 2, 0x0 +_var250: + .long 250 ; 0xfa + + .section seg,sect251 + .globl _var251 ; @var251 + .p2align 2, 0x0 +_var251: + .long 251 ; 0xfb + + .section seg,sect252 + .globl _var252 ; @var252 + .p2align 2, 0x0 +_var252: + .long 252 ; 0xfc + + .section seg,sect253 + .globl _var253 ; @var253 + .p2align 2, 0x0 +_var253: + .long 253 ; 0xfd + + .section seg,sect254 + .globl _var254 ; @var254 + .p2align 2, 0x0 +_var254: + .long 254 ; 0xfe + + .section seg,sect255 + .globl _var255 ; @var255 + .p2align 2, 0x0 +_var255: + .long 255 ; 0xff + + .section seg,sect256 + .globl _var256 ; @var256 + .p2align 2, 0x0 +_var256: + .long 256 ; 0x100 + + .section seg,sect257 + .globl _var257 ; @var257 + .p2align 2, 0x0 +_var257: + .long 257 ; 0x101 + +.subsections_via_symbols diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..ab72649fc3404 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] @@ -347,6 +355,10 @@ #CHECK-BE: vucmprhh 1, 3, 6 # encoding: [0x10,0x23,0x31,0x03] #CHECK-LE: vucmprhh 1, 3, 6 # encoding: [0x03,0x31,0x23,0x10] + xvrlw 34, 15, 16 +#CHECK-BE: xvrlw 34, 15, 16 # encoding: [0xf0,0x4f,0x85,0xc1] +#CHECK-LE: xvrlw 34, 15, 16 # encoding: [0xc1,0x85,0x4f,0xf0] + xxaes192encp 8, 10, 14 #CHECK-BE: xxaes192encp 8, 10, 14 # encoding: [0xf1,0x0b,0x76,0x10] #CHECK-LE: xxaes192encp 8, 10, 14 # encoding: [0x10,0x76,0x0b,0xf1] diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 111616df254d3..e41c9eac982a7 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -348,6 +348,9 @@ .attribute arch, "rv32i_smepmp1p0" # CHECK: attribute 5, "rv32i2p1_smepmp1p0" +.attribute arch, "rv32i_smpmpmt0p6" +# CHECK: attribute 5, "rv32i2p1_smpmpmt0p6" + .attribute arch, "rv32i_smrnmi1p0" # CHECK: attribute 5, "rv32i2p1_smrnmi1p0" diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s index 7a838fc519493..a694abf25826b 100644 --- a/llvm/test/MC/WebAssembly/reference-types.s +++ b/llvm/test/MC/WebAssembly/reference-types.s @@ -105,3 +105,12 @@ ref_block_test: end_block drop end_function + +# CHECK-LABEL: ref_func_test: +# CHECK-NEXT: .functype ref_func_test () -> (funcref) +# CHECK-NEXT: ref.func ref_func_test # encoding: [0xd2,0x80'A',0x80'A',0x80'A',0x80'A',A] +# CHECK-NEXT: # fixup A - offset: 1, value: ref_func_test, kind: fixup_uleb128_i32 +ref_func_test: + .functype ref_func_test () -> (funcref) 
+ ref.func ref_func_test + end_function diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index 92db672e1c82d..497a1c6b7bad5 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 64(%rbx), %tmm6 - -// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 - // CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs 268435456(%rbp,%r14,8), %tmm6 @@ -88,70 +24,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 -32(,%rbp,2), %tmm3 -// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs 291(%r8,%r17,4), 
%tmm2 - -// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 - -// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 - -// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 64(%r18), %tmm6 - -// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 - // CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 // CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] tileloaddrs 291(%r16,%rax,4), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index 140d1aa6b198e..0e030ca415a16 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -1,69 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] - t2rpntlvwz0rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 
tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] - t2rpntlvwz0rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0rst1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] - t2rpntlvwz1rs tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] - t2rpntlvwz1rst1 tmm6, [rbx + 64] - -// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] // CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [rbp + 8*r14 + 268435456] @@ -96,70 +32,6 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] -// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz0rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz0rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 
32] - -// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] - t2rpntlvwz1rs tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] -// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] - t2rpntlvwz1rst1 tmm6, [r18 + 64] - -// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] - // CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] // CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] tileloaddrs tmm6, [r16 + 8*r14 + 268435456] diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s index b413597cd9da7..d1d0997b7eec0 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: ttmmultf32ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttmmultf32ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s index 98f55275716eb..b6c0947ee750c 100644 --- a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s @@ -8,10 +8,3 @@ // CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] tmmultf32ps tmm3, tmm2, tmm1 -// CHECK: ttmmultf32ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] - ttmmultf32ps tmm6, tmm5, tmm4 - -// CHECK: ttmmultf32ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] - ttmmultf32ps tmm3, tmm2, tmm1 diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s deleted file mode 100644 index 5158470f8c905..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), 
%tmm4 -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5 - -// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2 -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3 - -// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 - -// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 - -// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 - -// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 - -// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 - -// CHECK: ttransposed %tmm1, %tmm5 -// CHECK: encoding: 
[0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed %tmm1, %tmm5 - -// CHECK: ttransposed %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed %tmm2, %tmm3 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xea] - ttdpbf16ps %tmm1, %tmm2, %tmm5 - -// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttdpfp16ps %tmm3, %tmm4, %tmm5 -// CHECK: encoding: [0xc4,0xe2,0x63,0x6c,0xec] - ttdpfp16ps %tmm3, %tmm4, %tmm5 - -// CHECK: ttdpfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5] - ttcmmrlfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda] - ttcmmrlfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5] - tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6 - -// CHECK: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda] - tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3 - -// CHECK: tconjtfp16 %tmm5, %tmm6 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5] - tconjtfp16 %tmm5, %tmm6 - -// CHECK: tconjtfp16 %tmm2, %tmm3 -// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda] - tconjtfp16 %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s deleted file mode 100644 index 0d2c22f67a173..0000000000000 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ /dev/null @@ -1,153 +0,0 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s - -// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 
tmm6, [rbp + 8*r14 + 268435456] -// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] -// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] - -// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] - -// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] -// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] - t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] - -// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] -// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] - t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] - -// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] -// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] - {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] - -// CHECK: ttransposed tmm5, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] - ttransposed tmm5, tmm1 - -// CHECK: ttransposed tmm3, tmm2 -// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] - ttransposed tmm3, tmm2 - -// CHECK: ttdpbf16ps tmm5, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5a,0x6c,0xe8] - ttdpbf16ps tmm5, tmm0, tmm4 - -// CHECK: ttdpbf16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda] - ttdpbf16ps tmm3, tmm2, tmm1 - -// CHECK: ttdpfp16ps tmm1, tmm0, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6c,0xc8] - ttdpfp16ps tmm1, tmm0, tmm4 - -// CHECK: ttdpfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda] - ttdpfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmimfp16ps tmm6, tmm5, tmm4 -// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5] - ttcmmimfp16ps tmm6, tmm5, tmm4 - -// CHECK: ttcmmimfp16ps tmm3, tmm2, tmm1 -// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda] - ttcmmimfp16ps tmm3, tmm2, tmm1 - -// CHECK: ttcmmrlfp16ps tmm6, tmm5, tmm4 
-// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
- ttcmmrlfp16ps tmm6, tmm5, tmm4
-
-// CHECK: ttcmmrlfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
- ttcmmrlfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtcmmimfp16ps tmm6, tmm5, tmm4
-// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
- tconjtcmmimfp16ps tmm6, tmm5, tmm4
-
-// CHECK: tconjtcmmimfp16ps tmm3, tmm2, tmm1
-// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
- tconjtcmmimfp16ps tmm3, tmm2, tmm1
-
-// CHECK: tconjtfp16 tmm6, tmm5
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
- tconjtfp16 tmm6, tmm5
-
-// CHECK: tconjtfp16 tmm3, tmm2
-// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
- tconjtfp16 tmm3, tmm2
diff --git a/llvm/test/MC/Xtensa/s32c1i.s b/llvm/test/MC/Xtensa/s32c1i.s
new file mode 100644
index 0000000000000..218a86dd56752
--- /dev/null
+++ b/llvm/test/MC/Xtensa/s32c1i.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+s32c1i \
+# RUN:   | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align 4
+LBL0:
+
+# CHECK-INST: xsr a3, atomctl
+# CHECK: # encoding: [0x30,0x63,0x61]
+xsr a3, atomctl
+
+# CHECK-INST: xsr a3, scompare1
+# CHECK: # encoding: [0x30,0x0c,0x61]
+xsr a3, scompare1
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 65b96c8b8ef5d..1f437a662cc96 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -260,6 +260,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-DEFAULT-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index f595dfe1d6845..c865d77c86d77 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -129,6 +129,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: DropUnnecessaryAssumesPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
 ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 3a0fffe426da1..2d8b8f1b22091 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -179,6 +179,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 4623edcaf6656..7cacc17c7ab9a 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -164,6 +164,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 590afd925e841..ef6cd8354ae3d 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -173,6 +173,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass
 ; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/llvm/test/Other/offload-wrapper.ll b/llvm/test/Other/offload-wrapper.ll
deleted file mode 100644
index 9107a141ad201..0000000000000
--- a/llvm/test/Other/offload-wrapper.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: llvm-offload-wrapper --triple=x86-64 -kind=hip %s -o %t.bc
-; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=HIP
-
-; HIP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OA"
-; HIP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OZ"
-; HIP-NEXT: @.fatbin_image = internal constant {{.*}}, section ".hip_fatbin"
-; HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
-; HIP-NEXT: @.hip.binary_handle = internal global ptr null
-; HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
-
-; HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
-; HIP-NEXT: entry:
-; HIP-NEXT: %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper)
-; HIP-NEXT: store ptr %0, ptr @.hip.binary_handle, align 8
-; HIP-NEXT: call void @.hip.globals_reg(ptr %0)
-; HIP-NEXT: %1 = call i32 @atexit(ptr @.hip.fatbin_unreg)
-; HIP-NEXT: ret void
-; HIP-NEXT: }
-
-; HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" {
-; HIP-NEXT: entry:
-; HIP-NEXT: %0 = load ptr, ptr @.hip.binary_handle, align 8
-; HIP-NEXT: call void @__hipUnregisterFatBinary(ptr %0)
-; HIP-NEXT: ret void
-; HIP-NEXT: }
-
-; RUN: llvm-offload-wrapper --triple=x86-64 -kind=cuda %s -o %t.bc
-; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=CUDA
-
-; CUDA: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OA"
-; CUDA-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry], section "llvm_offload_entries$OZ"
-; CUDA-NEXT: @.fatbin_image = internal constant {{.*}}, section ".nv_fatbin"
-; CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
-; CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
-; CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
-
-; CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
-; CUDA-NEXT: entry:
-; CUDA-NEXT: %0 = call ptr @__cudaRegisterFatBinary(ptr @.fatbin_wrapper)
-; CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8
-; CUDA-NEXT: call void @.cuda.globals_reg(ptr %0)
-; CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(ptr %0)
-; CUDA-NEXT: %1 = call i32 @atexit(ptr @.cuda.fatbin_unreg)
-; CUDA-NEXT: ret void
-; CUDA-NEXT: }
-
-; CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" {
-; CUDA-NEXT: entry:
-; CUDA-NEXT: %0 = load ptr, ptr @.cuda.binary_handle, align 8
-; CUDA-NEXT: call void @__cudaUnregisterFatBinary(ptr %0)
-; CUDA-NEXT: ret void
-; CUDA-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
index 18960b43ab97d..3170f2c06c00b 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td
@@ -96,7 +96,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const {
 // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = {
-// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(211), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
+// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(100), GIMT_Encode2(212), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524),
 // CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
 // CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0),
 // CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0),
diff --git a/llvm/test/TableGen/RegClassByHwMode.td b/llvm/test/TableGen/RegClassByHwMode.td
index ca72cfbd403bf..a21a396f7fd52 100644
--- a/llvm/test/TableGen/RegClassByHwMode.td
+++ b/llvm/test/TableGen/RegClassByHwMode.td
@@ -6,18 +6,21 @@
 include "llvm/Target/Target.td"
-// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
+// INSTRINFO: #ifdef GET_INSTRINFO_ENUM
 // INSTRINFO-NEXT: #undef GET_INSTRINFO_ENUM
+// INSTRINFO-EMPTY:
 // INSTRINFO-NEXT: namespace llvm::MyTarget {
+// INSTRINFO-EMPTY:
 // INSTRINFO-NEXT: enum {
-// INSTRINFO-NEXT: PHI
-// INSTRINFO: };
-// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
+// INSTRINFO-NEXT: PHI
+// INSTRINFO: };
+// INSTRINFO: enum RegClassByHwModeUses : uint16_t {
 // INSTRINFO-NEXT: MyPtrRC,
 // INSTRINFO-NEXT: XRegs_EvenIfRequired,
 // INSTRINFO-NEXT: YRegs_EvenIfRequired,
 // INSTRINFO-NEXT: };
-// INSTRINFO-NEXT: }
+// INSTRINFO-EMPTY:
+// INSTRINFO-NEXT: } // namespace llvm::MyTarget
 
 // INSTRINFO: { MyTarget::XRegsRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
 // INSTRINFO: { MyTarget::XRegs_EvenRegClassID, 0, MCOI::OPERAND_REGISTER, 0 },
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index 2904474f6110b..e4a7126d79fbd 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -53,21 +53,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
@@ -80,21 +80,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
@@ -107,33 +107,33 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_malloc, // malloc
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if ( isFoo() ) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___divmodqi4, // __divmodqi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_anonymous_3_AVR_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_3_AVR_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if ( isBar() ) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
-// CHECK-NEXT: {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
+// CHECK-NEXT: RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: setLibcallImplCallingConv(Impl, CallingConv::MSP430_BUILTIN);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
index f9a148a183806..82206ce6ba254 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
@@ -31,12 +31,12 @@ def dup1 : RuntimeLibcallImpl<ANOTHER_DUP>;
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_func_b, // func_b
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -53,13 +53,13 @@ def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_other_func, // other_func
+// CHECK-NEXT: RTLIB::impl_func_a, // func_a
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -76,14 +76,14 @@ def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_dup1, // dup1
+// CHECK-NEXT: RTLIB::impl_other_func, // other_func
+// CHECK-NEXT: RTLIB::impl_func_a, // func_a
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 7aaf3a0e8e1cf..2a1cc72efcd4b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -200,10 +200,6 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: }
 
 // CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
-// CHECK-NEXT: struct LibcallImplPair {
-// CHECK-NEXT: RTLIB::Libcall Func;
-// CHECK-NEXT: RTLIB::LibcallImpl Impl;
-// CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getArch() == Triple::blah) {
 // CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
@@ -211,35 +207,35 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_bzero, // bzero
+// CHECK-NEXT: RTLIB::impl_calloc, // calloc
+// CHECK-NEXT: RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.hasCompilerRT()) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_hasCompilerRT[] = {
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_hasCompilerRT[] = {
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_hasCompilerRT) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_hasCompilerRT) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT: {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT: RTLIB::impl____memset, // ___memset
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
@@ -253,14 +249,14 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
@@ -272,22 +268,22 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_bzero, // bzero
+// CHECK-NEXT: RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT: {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT: RTLIB::impl____memset, // ___memset
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: }
@@ -301,15 +297,15 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT: {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT: {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT: static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT: RTLIB::impl_calloc, // calloc
+// CHECK-NEXT: RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT: RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT: RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT: };
 // CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT: setAvailable(Impl);
 // CHECK-NEXT: }
 // CHECK-EMPTY:
 // CHECK-NEXT: return;
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
index 0c5c63db4c95b..cc0f87755cdc2 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
@@ -20,6 +20,7 @@ def MyTarget : Target;
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
 // CHECK-EMPTY:
+// CHECK-EMPTY:
 // CHECK-NEXT: #ifdef __GNUC__
 // CHECK-NEXT: #pragma GCC diagnostic push
 // CHECK-NEXT: #pragma GCC diagnostic ignored "-Woverlength-strings"
diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 475faf9254157..5bd7890e0ddd1 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: #include <utility>
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace tdl {
 // CHECK-EMPTY:
 // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
@@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
 // CHECK-NEXT: static constexpr bool is_iterable = true;
 // CHECK-NEXT: };
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace llvm
 // CHECK-EMPTY:
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
@@ -184,8 +186,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL: #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT: // Sets for dira
 // IMPL-EMPTY:
@@ -202,8 +203,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT: static requiredClauses_TDLD_dira {
 // IMPL-NEXT: };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index ccc09446b4465..eaaf82ddaaf41 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: #include <utility>
 // CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm {
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace tdl {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Association {
@@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D);
 // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D);
 // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace tdl
 // CHECK-EMPTY:
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> {
@@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
 // CHECK-NEXT: static constexpr bool is_iterable = true;
 // CHECK-NEXT: };
+// CHECK-EMPTY:
 // CHECK-NEXT: } // namespace llvm
 // CHECK-EMPTY:
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
@@ -156,8 +159,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL: #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT: // Sets for dira
 // IMPL-EMPTY:
@@ -174,8 +176,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT: static requiredClauses_TDLD_dira {
 // IMPL-NEXT: };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/get-named-operand-idx.td b/llvm/test/TableGen/get-named-operand-idx.td
index e6f6331cd9c48..8bb4f2f68b5fe 100644
--- a/llvm/test/TableGen/get-named-operand-idx.td
+++ b/llvm/test/TableGen/get-named-operand-idx.td
@@ -50,7 +50,9 @@ def InstD : InstBase {
 // CHECK-LABEL: #ifdef GET_INSTRINFO_OPERAND_ENUM
 // CHECK-NEXT: #undef GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
 // CHECK-NEXT: enum class OpName : uint8_t {
 // CHECK-NEXT: a = 0,
 // CHECK-NEXT: b = 1,
@@ -62,12 +64,16 @@ def InstD : InstBase {
 // CHECK-EMPTY:
 // CHECK-NEXT: LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name);
 // CHECK-NEXT: LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t Idx);
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_OPERAND_ENUM
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_OPERAND_ENUM
 
 // CHECK-LABEL: #ifdef GET_INSTRINFO_NAMED_OPS
 // CHECK-NEXT: #undef GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
 // CHECK-NEXT: namespace llvm::MyNamespace {
+// CHECK-EMPTY:
 // CHECK-NEXT: LLVM_READONLY static uint8_t getInstructionIndexForOpLookup(uint16_t Opcode) {
 // CHECK-NEXT: static constexpr uint8_t InstructionIndex[] = {
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -89,7 +95,8 @@ def InstD : InstBase {
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 // CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0,
+// CHECK-NEXT: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2,
+// CHECK-NEXT: 0,
 // CHECK-NEXT: };
 // CHECK-NEXT: return InstructionIndex[Opcode];
 // CHECK-NEXT: }
@@ -113,5 +120,7 @@ def InstD : InstBase {
 // CHECK-NEXT: unsigned InstrIdx = getInstructionIndexForOpLookup(Opcode);
 // CHECK-NEXT: return OperandMap[InstrIdx][(unsigned)Idx];
 // CHECK-NEXT: }
-// CHECK-NEXT: } // end namespace llvm::MyNamespace
-// CHECK-NEXT: #endif //GET_INSTRINFO_NAMED_OPS
+// CHECK-EMPTY:
+// CHECK-NEXT: } // namespace llvm::MyNamespace
+// CHECK-EMPTY:
+// CHECK-NEXT: #endif // GET_INSTRINFO_NAMED_OPS
diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc
index f621979b2af95..6d2873ed4e749 100644
--- a/llvm/test/TableGen/x86-instr-mapping.inc
+++ b/llvm/test/TableGen/x86-instr-mapping.inc
@@ -167,14 +167,6 @@ static const X86TableEntry X86CompressEVEXTable[] = {
 { X86::SHRX64rm_EVEX, X86::SHRX64rm },
 { X86::SHRX64rr_EVEX, X86::SHRX64rr },
 { X86::STTILECFG_EVEX, X86::STTILECFG },
-{ X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 },
-{ X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS },
-{ X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 },
-{ X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 },
-{ X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 },
-{ X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS },
-{ X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 },
-{ X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 },
 { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 },
 { X86::TILELOADDRS_EVEX, X86::TILELOADDRS },
 { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 },
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index 8ffacb9bdd5f6..1b728f56ab2ea 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -1,7 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s
 
-define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
+define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) !prof !0 {
 ; CHECK-LABEL: @test_atomicrmw_fadd_f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: ret float [[TMP5]]
 ;
@@ -336,3 +336,11 @@ define <2 x half> @atomicrmw_fminimum_2_x_half(ptr %ptr, <2 x half> %val) {
 %res = atomicrmw fminimum ptr %ptr, <2 x half> %val seq_cst
 ret <2 x half> %res
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
index 95a52aa0f7f52..b509b2469cfdc 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s
 ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
 
-define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
+define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f16(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half
 ; CHECK-NEXT: ret void
@@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 ret void
 }
 
-define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
+define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float
 ; CHECK-NEXT: ret void
@@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 ret void
 }
 
-define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
+define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 {
 ; CHECK-LABEL: @atomic_swap_f64(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]])
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]])
 ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK: atomicrmw.end:
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double
 ; CHECK-NEXT: ret void
@@ -74,3 +74,17 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
 %t1 = atomicrmw xchg ptr %ptr, double %val acquire
 ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn }
+;.
+; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" }
+; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
index 649e9467c0318..fffe50fde1e50 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -66,15 +76,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -123,15 +143,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -180,15 +210,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -237,13 +277,21 @@ bb:
 
 ; This should not promote
 define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -290,13 +338,21 @@ bb:
 
 ; This should not promote
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -343,15 +399,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -400,15 +466,25 @@ bb:
 
 ; This should promote
 define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 {
 ;
-; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
-; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
-; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
-; CHECK-NEXT: ret void
+; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
+; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
+; TUNIT-NEXT: bb:
+; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64
+; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; TUNIT-NEXT: ret void
+;
+; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
+; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] {
+; CGSCC-NEXT: bb:
+; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
+; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64
+; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]]
+; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64
+; CGSCC-NEXT: ret void
 ;
 bb:
 %tmp = load <8 x i64>, ptr %arg1
@@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2
 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
 attributes #5 = { argmemonly nounwind }
 ;.
+; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" }
+; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" }
+; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) }
+; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn }
+;.
; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. -; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 
%sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 
%cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create an AAAlign for %ptr1, +; but the attribute does not propagate through extractelement yet; that propagation needs to be implemented. +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +}
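; NOTE (editor's illustrative sketch, not part of the patch): the rule these
; align-ptrmask.ll tests exercise is that the alignment deduced for a ptrmask
; result is at least max(align(%x), 1 << countr_zero(mask)), capped at LLVM's
; maximum alignment of 1 << 32 (which is what the _max_plus_one test above
; demonstrates), because masking an address can only clear additional low
; bits. A minimal standalone reproducer under this file's RUN line; the
; function name is hypothetical:
define ptr @align_ptrmask_sketch(ptr align 4 %x) {
  ; -16 clears the low four address bits, so the result is a multiple of 16;
  ; combined with "align 4" on %x this gives
  ; max(4, 1 << countr_zero(-16)) = max(4, 16) = 16,
  ; so the Attributor should be able to deduce "align 16" for %p here.
  %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -16)
  ret ptr %p
}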
+ +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync 
nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 
+1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll index eb2fb4f4774d8..ab01bbf20de71 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll @@ -96,11 +96,11 @@ entry: !13 = !DILocalVariable(name: "v", arg: 1, scope: !8, file: !1, line: 3, type: !11) !14 = !DILocation(line: 5, column: 10, scope: !8) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 9, column: 7) -!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !17 = !DILocation(line: 10, column: 7, scope: !15) -!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !19 = distinct !DILexicalBlock(scope: !18, file: !1, line: 100, column: 1) !20 = !DILocation(line: 110, column: 17, scope: !19) -!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 110, column: 17, scope: !21) !23 = !DILocation(line: 15, column: 7, scope: !15) diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll index 8a6f60ba7a204..87aed77d06ef8 100644 --- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll @@ -184,6 +184,18 @@ define void @type_test(ptr %x) { ret void } +define void @public_type_test(ptr %x) { +; CHECK-LABEL: define void @public_type_test( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: [[TEST:%.*]] = call i1 
@llvm.public.type.test(ptr [[X]], metadata !"typeid") +; CHECK-NEXT: call void @llvm.assume(i1 [[TEST]]) +; CHECK-NEXT: ret void +; + %test = call i1 @llvm.public.type.test(ptr %x, metadata !"typeid") + call void @llvm.assume(i1 %test) + ret void +} + define void @multiple_dead_conds(i32 %x) { ; CHECK-LABEL: define void @multiple_dead_conds( ; CHECK-SAME: i32 [[X:%.*]]) { diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll new file mode 100644 index 0000000000000..43fa08c070828 --- /dev/null +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='drop-unnecessary-assumes' -S %s | FileCheck %s +; RUN: opt -passes='drop-unnecessary-assumes<drop-deref>' -S %s | FileCheck --check-prefix=DROP-DEREF %s + +declare void @use(ptr) + +define i8 @test_dereferenceable_assume_ptr_not_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: ret i8 0 +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: ret i8 0 +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + ret i8 0 +} + +define i8 @test_dereferenceable_assume_ptr_used_variable_size(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + %val = load i8, ptr %p + ret i8 %val +} + +define i8 @test_dereferenceable_with_align_ptr_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size), "align"(ptr %p, i64 8) ] + %val = load i8, ptr %p + ret i8 %val +} diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll index da6c19d604c7c..76406ddea6b9f 100644 --- a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll +++ b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll @@ -66,7 +66,7 @@ define void @inline_me() !dbg !13 { !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) !12 = !DILabel(scope: !6, name: "bye", file: !1, line: 28) -!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !14 = !DILabel(scope: !13, name: "label_in_@inline_me", file: !1, line: 29) !15 = !DILocation(line: 2, column: 2, scope: !13, inlinedAt: !11) !16 = !DILabel(scope: !17, name: "scoped_label_in_foo", file: !1, line: 30) diff --git a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll index 3f69f0c200dad..f9dd9eaf01422 100644 --- a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll +++ b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll @@ -106,7 +106,7 @@ define void @inline_me() !dbg !12{ !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10) !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) -!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !13 = !DILocation(line: 2, column: 2, scope: !12, inlinedAt: !14) !14 = !DILocation(line: 3, column: 3, scope: !15) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 4) diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll index b1ef50382c070..c4933678d0391 100644 --- a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll +++ b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll @@ -417,3 +417,140 @@ loop: exit: ret void } + +define void @test_fp_to_int_irrealizable_initval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_initval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 1.000000e+08, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], 2.500000e+01 
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 1.000000e+08, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, 2.500000e+01 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.700000e+01 + %cmp = fcmp ugt float %iv.next, 1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_negative_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_negative_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ -2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], -1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ -2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, -1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_pow_2_24() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_pow_2_24( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.000000e+00 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 0x4170000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 0.000000e+00, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.000000e+00 + %cmp = fcmp ugt float %iv.next, 0x4170000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_int64_min() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_int64_min( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi double [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd double [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult double [[IV_NEXT]], 0xC3E0000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + 
%iv = phi double [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd double %iv, 1.700000e+01 + %cmp = fcmp ult double %iv.next, 0xC3E0000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +declare void @opaque() diff --git a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll index 14ee00d77197c..2763860e79875 100644 --- a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll @@ -114,7 +114,7 @@ define i32 @urem_order1(i32 %n) { ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: call void @foo() -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 3 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 3 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT_LOOPEXIT]]: @@ -205,13 +205,12 @@ define i64 @test_loop_with_div_order_1(i64 %n) { ; CHECK-NEXT: [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0 ; CHECK-NEXT: br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] ; CHECK: [[LOOP_PREHEADER]]: -; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[UPPER_BOUND]], i64 1) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: [[DUMMY:%.*]] = load volatile i64, ptr null, align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UPPER_BOUND]] ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT]] diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll index e0c80c0389541..32dca860a7ded 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/builtin-assumed-addrspace.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces -o - %s | FileCheck %s -define float @f0(ptr %p) { -; CHECK-LABEL: define float @f0( +define float @assume_is_shared_gep(ptr %p) { +; CHECK-LABEL: define float @assume_is_shared_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) @@ -24,8 +24,8 @@ entry: ret float %load } -define float @f1(ptr %p) { -; CHECK-LABEL: define float @f1( +define float @assume_is_private_gep(ptr %p) { +; CHECK-LABEL: define float @assume_is_private_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) @@ -47,8 +47,8 @@ entry: ret float %load } -define float @f2(ptr %p) { -; CHECK-LABEL: define float @f2( +define float @assume_not_private_and_not_shared_gep(ptr %p) { +; CHECK-LABEL: define float @assume_not_private_and_not_shared_gep( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) @@ -78,8 +78,8 @@ entry: ret float %load } -define float @g0(i32 
%c, ptr %p) { -; CHECK-LABEL: define float @g0( +define float @conditionally_assume_is_shared_gep(i32 %c, ptr %p) { +; CHECK-LABEL: define float @conditionally_assume_is_shared_gep( ; CHECK-SAME: i32 [[C:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0 @@ -127,6 +127,198 @@ if.end: ret float %add2 } +define float @conditionally_assume_is_shared_else_assume_private(i32 %c, ptr %p) { +; CHECK-LABEL: define float @conditionally_assume_is_shared_else_assume_private( +; CHECK-SAME: i32 [[C:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[IF_THEN_SHARED:.*]], label %[[IF_THEN_PRIVATE:.*]] +; CHECK: [[IF_THEN_SHARED]]: +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[WORKITEM_ID_X_0:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X_0]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: [[LOAD0:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX0]], align 4 +; CHECK-NEXT: [[ADD0:%.*]] = fadd float [[LOAD0]], 4.000000e+00 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_THEN_PRIVATE]]: +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[WORKITEM_ID_X_1:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[WORKITEM_ID_X_1]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(5) +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(5) [[TMP1]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(5) [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[LOAD1]], 4.000000e+00 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[PHI:%.*]] = phi float [ [[ADD0]], %[[IF_THEN_SHARED]] ], [ [[ADD1]], %[[IF_THEN_PRIVATE]] ] +; CHECK-NEXT: ret float [[PHI]] +; +entry: + %tobool.not = icmp eq i32 %c, 0 + br i1 %tobool.not, label %if.then.shared, label %if.then.private + +if.then.shared: + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + %workitem.id.x.0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x.0 to i64 + %arrayidx0 = getelementptr inbounds float, ptr %p, i64 %idxprom + %load0 = load float, ptr %arrayidx0, align 4 + %add0 = fadd float %load0, 4.0 + br label %if.end + +if.then.private: + %is.private = call i1 @llvm.amdgcn.is.private(ptr %p) + tail call void @llvm.assume(i1 %is.private) + %workitem.id.x.1 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom1 = zext i32 %workitem.id.x.1 to i64 + %arrayidx1 = getelementptr inbounds float, ptr %p, i64 %idxprom1 + %load1 = load float, ptr %arrayidx1, align 4 + %add1 = fadd float %load1, 4.0 + br label %if.end + +if.end: + %phi = phi float [ %add0, %if.then.shared ], [ %add1, %if.then.private ] + ret float %phi +} + +define float @assume_func_arg_is_shared_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_shared_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr 
[[FLAT_PTR]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %flat.ptr) + tail call void @llvm.assume(i1 %is.shared) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_private_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_private_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + tail call void @llvm.assume(i1 %is.private) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_not_shared_not_private(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_not_shared_not_private( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_SHARED:%.*]] = xor i1 [[IS_SHARED]], true +; CHECK-NEXT: [[NOT_PRIVATE_AND_NOT_SHARED:%.*]] = and i1 [[NOT_PRIVATE]], [[NOT_SHARED]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_PRIVATE_AND_NOT_SHARED]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.private = xor i1 %is.private, true + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %flat.ptr) + %not.shared = xor i1 %is.shared, true + %not.private.and.not.shared = and i1 %not.private, %not.shared + tail call void @llvm.assume(i1 %not.private.and.not.shared) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define float @assume_func_arg_is_not_private_load(ptr %flat.ptr) { +; CHECK-LABEL: define float @assume_func_arg_is_not_private_load( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_IS_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_IS_PRIVATE]]) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[FLAT_PTR]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.is.private = xor i1 %is.private, true + tail call void @llvm.assume(i1 %not.is.private) + %load = load float, ptr %flat.ptr, align 4 + ret float %load +} + +define i64 @assume_func_arg_is_not_private_atomicrmw(ptr %flat.ptr, i64 %val) { +; CHECK-LABEL: define i64 @assume_func_arg_is_not_private_atomicrmw( +; CHECK-SAME: ptr [[FLAT_PTR:%.*]], i64 [[VAL:%.*]]) { +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[FLAT_PTR]]) +; CHECK-NEXT: [[NOT_IS_PRIVATE:%.*]] = xor i1 [[IS_PRIVATE]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT_IS_PRIVATE]]) +; CHECK-NEXT: [[RMW:%.*]] = atomicrmw sub ptr [[FLAT_PTR]], i64 [[VAL]] seq_cst, align 4 +; CHECK-NEXT: ret i64 [[RMW]] +; + %is.private = call i1 @llvm.amdgcn.is.private(ptr %flat.ptr) + %not.is.private = xor i1 %is.private, true + tail call void @llvm.assume(i1 %not.is.private) + %rmw = 
atomicrmw sub ptr %flat.ptr, i64 %val seq_cst, align 4 + ret i64 %rmw +} + +define float @contradictory_assume_after_gep_same_block(ptr %p) { +; CHECK-LABEL: define float @contradictory_assume_after_gep_same_block( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: [[WORKITEM_ID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[IDXPROM]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP2]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + %workitem.id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x to i64 + %gep = getelementptr inbounds float, ptr %p, i64 %idxprom + %is.private = call i1 @llvm.amdgcn.is.private(ptr %gep) + tail call void @llvm.assume(i1 %is.private) + %load = load float, ptr %gep, align 4 + ret float %load +} + +define float @contradictory_assume_argument_same_block(ptr %p) { +; CHECK-LABEL: define float @contradictory_assume_argument_same_block( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[P]]) +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[P]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_SHARED]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[IS_PRIVATE]]) +; CHECK-NEXT: [[WORKITEM_ID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[WORKITEM_ID_X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(3) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret float [[LOAD]] +; + %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p) + %is.private = call i1 @llvm.amdgcn.is.private(ptr %p) + tail call void @llvm.assume(i1 %is.shared) + tail call void @llvm.assume(i1 %is.private) + %workitem.id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %workitem.id.x to i64 + %gep = getelementptr inbounds float, ptr %p, i64 %idxprom + %load = load float, ptr %gep, align 4 + ret float %load +} + declare void @llvm.assume(i1) declare i1 @llvm.amdgcn.is.shared(ptr nocapture) declare i1 @llvm.amdgcn.is.private(ptr nocapture) diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 51e22bb86f331..25a70a026a0b7 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -762,6 +762,24 @@ declare float @nearbyintf(float) ; CHECK: declare x86_fp80 @nearbyintl(x86_fp80) [[MEMNONE_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] declare x86_fp80 @nearbyintl(x86_fp80) +; CHECK: declare double @nextafter(double, double) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare double @nextafter(double, double) + +; CHECK: declare 
float @nextafterf(float, float) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare float @nextafterf(float, float) + +; CHECK: declare x86_fp80 @nextafterl(x86_fp80, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare x86_fp80 @nextafterl(x86_fp80, x86_fp80) + +; CHECK: declare double @nexttoward(double, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare double @nexttoward(double, x86_fp80) + +; CHECK: declare float @nexttowardf(float, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare float @nexttowardf(float, x86_fp80) + +; CHECK: declare x86_fp80 @nexttowardl(x86_fp80, x86_fp80) [[ERRNOMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]] +declare x86_fp80 @nexttowardl(x86_fp80, x86_fp80) + ; CHECK-LINUX: declare noundef i32 @open(ptr noundef readonly captures(none), i32 noundef, ...) [[NOFREE]] ; CHECK-OPEN: declare noundef i32 @open(ptr noundef readonly captures(none), i32 noundef, ...) [[NOFREE:#[0-9]+]] declare i32 @open(ptr, i32, ...) diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..9e336ad104599 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,171 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 24 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 21, i8 58 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 
to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 24 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V2:%.*]] = select i1 [[ARG0:%.*]], i8 8, i8 1 +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 16, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 20, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git 
a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll index c1d7c30e936f2..ec90779d0acce 100644 --- a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll +++ b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll @@ -125,15 +125,15 @@ attributes #1 = { nounwind readnone } !19 = !DILocation(line: 6, column: 17, scope: !14) !20 = !DIExpression(DW_OP_plus_uconst, 0) !21 = !DILocation(line: 11, column: 1, scope: !14) -!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !23 = !DILocation(line: 6, column: 17, scope: !22) !24 = !DILocalVariable(name: "entry", scope: !22, file: !1, line: 6, type: !4) -!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 6, column: 17, scope: !25) !27 = !DILocalVariable(name: "entry", scope: !25, file: !1, line: 6, type: !4) -!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 6, column: 17, scope: !28) !30 = !DILocalVariable(name: "entry", scope: !28, file: !1, line: 6, type: !4) -!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 6, column: 17, scope: !31) !33 = !DILocalVariable(name: "entry", scope: !31, file: !1, line: 6, type: !4) diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = 
and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll b/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll new file mode 100644 index 0000000000000..5a33d35aa1cf1 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/known-bits-lerp-pattern.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Test known bits refinements for pattern: a 
* (b - c) + c * d +; where a > 0, c > 0, b > 0, d > 0, and b > c. +; This pattern is a generalization of lerp and it appears frequently in graphics operations. + +define i32 @test_clamp(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i32 @test_clamp( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %cmp = icmp ugt i32 %add, 65535 + %result = select i1 %cmp, i32 65535, i32 %add + ret i32 %result +} + +define i1 @test_trunc_cmp(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_xor(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_xor( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 255 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = xor i32 255, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_arbitrary_b(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_arbitrary_b( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 
[[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub nsw nuw i32 %b32, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + + +define i1 @test_trunc_cmp_no_a(i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_no_a( +; CHECK-SAME: i8 [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[MUL1:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = sub nuw i32 %b32, %c32 + %mul2 = mul i32 %c32, %d32 + %add = add i32 %sub, %mul2 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_no_d(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: define i1 @test_trunc_cmp_no_d( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[B32:%.*]] = zext i8 [[B]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[B32]], [[C32]] +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[C32]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ADD]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %b32 = zext i8 %b to i32 + %c32 = zext i8 %c to i32 + %sub = sub nsw nuw i32 %b32, %c32 + %mul1 = mul i32 %a32, %sub + %add = add i32 %mul1, %c32 + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} + +define i1 @test_trunc_cmp_xor_negative(i8 %a, i8 %c, i8 %d) { +; CHECK-LABEL: define i1 @test_trunc_cmp_xor_negative( +; CHECK-SAME: i8 [[A:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: [[A32:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[C32:%.*]] = zext i8 [[C]] to i32 +; CHECK-NEXT: [[D32:%.*]] = zext i8 [[D]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[C32]], 234 +; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i32 [[SUB]], [[A32]] +; CHECK-NEXT: [[MUL2:%.*]] = mul nuw nsw i32 [[C32]], [[D32]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[MUL1]], [[MUL2]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADD]] to i16 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[TRUNC]], 1234 +; CHECK-NEXT: ret i1 [[CMP]] +; + %a32 = zext i8 %a to i32 + %c32 = zext i8 %c to i32 + %d32 = zext i8 %d to i32 + %sub = xor i32 234, %c32 + %mul1 = mul i32 %a32, %sub + %mul2 = mul i32 %c32, %d32 + %add = add i32 %mul1, %mul2 + ; We should keep the trunc in this case + %trunc = trunc i32 %add to i16 + %cmp = icmp eq i16 %trunc, 1234 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/modular-format.ll b/llvm/test/Transforms/InstCombine/modular-format.ll new file mode 100644 index 0000000000000..d9b7b6f056f59 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/modular-format.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Test that the modular format string library call simplifier works correctly. 
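+;
+; A non-normative reading of the attribute layout these tests exercise,
+; inferred from this file alone:
+;   "modular-format"="printf,<fmt-idx>,<first-arg-idx>,<modular-fn>,<impl>[,<aspect>...]"
+; As a hypothetical example (names invented here), a declaration such as
+;   declare void @demo(ptr, ...) "modular-format"="printf,1,2,demo_mod,demo_impl,float"
+; would let a call be retargeted at @demo_mod once no checked argument needs
+; the "float" aspect; an aspect that is still needed either blocks the
+; rewrite or, as in test_partial_aspects below, is kept alive via
+;   call void @llvm.reloc.none(metadata !"<impl>_<aspect>")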
+; +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@.str.int = constant [3 x i8] c"%d\00" +@.str.float = constant [3 x i8] c"%f\00" +@.str.multi = constant [6 x i8] c"%f %d\00" +@.str.noargs = constant [1 x i8] c"\00" + +;; No aspects are specified, so no transformation occurs. +define void @test_basic(i32 %arg) { +; CHECK-LABEL: @test_basic( +; CHECK-NEXT: call void (ptr, ...) @basic(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @basic(ptr @.str.int, i32 %arg) + ret void +} + +declare void @basic(ptr, ...) #0 + +;; The "float" aspect is present and needed, so no transformation occurs. +define void @test_float_present(double %arg) { +; CHECK-LABEL: @test_float_present( +; CHECK-NEXT: call void (ptr, ...) @float_present(ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_present(ptr @.str.float, double %arg) + ret void +} + +declare void @float_present(ptr, ...) #1 + +;; The "float" aspect is present but not needed, so the call is transformed. +define void @test_float_absent(i32 %arg) { +; CHECK-LABEL: @test_float_absent( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @float_absent(ptr @.str.int, i32 %arg) + ret void +} + +declare void @float_absent(ptr, ...) #1 + +;; Unknown aspects are always considered needed, so no transformation occurs. +define void @test_unknown_aspects(i32 %arg) { +; CHECK-LABEL: @test_unknown_aspects( +; CHECK-NEXT: call void (ptr, ...) @unknown_aspects(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @unknown_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @unknown_aspects(ptr, ...) #2 + +;; The call has no arguments to check, so the "float" aspect is not needed and +;; the call is transformed. +define void @test_no_args_to_check() { +; CHECK-LABEL: @test_no_args_to_check( +; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.noargs) +; CHECK-NEXT: ret void +; + call void (ptr, ...) @no_args_to_check(ptr @.str.noargs) + ret void +} + +declare void @no_args_to_check(ptr, ...) #1 + +;; The first argument index is not 2. The "float" aspect is needed, so no +;; transformation occurs. +define void @test_first_arg_idx(i32 %ignored, double %arg) { +; CHECK-LABEL: @test_first_arg_idx( +; CHECK-NEXT: call void (i32, ptr, ...) @first_arg_idx(i32 [[IGNORED:%.*]], ptr nonnull @.str.float, double [[ARG:%.*]]) +; CHECK-NEXT: ret void +; + call void (i32, ptr, ...) @first_arg_idx(i32 %ignored, ptr @.str.float, double %arg) + ret void +} + +declare void @first_arg_idx(i32, ptr, ...) #3 + +;; One aspect ("unknown") is needed, but one ("float") is not. The call is +;; transformed, and a reference to the needed aspect is emitted. +define void @test_partial_aspects(i32 %arg) { +; CHECK-LABEL: @test_partial_aspects( +; CHECK-NEXT: call void (ptr, ...) @multiple_aspects_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]]) +; CHECK-NEXT: call void @llvm.reloc.none(metadata !"basic_impl_unknown") +; CHECK-NEXT: ret void +; + call void (ptr, ...) @partial_aspects(ptr @.str.int, i32 %arg) + ret void +} + +declare void @partial_aspects(ptr, ...) 
#4 + +attributes #0 = { "modular-format"="printf,1,2,basic_mod,basic_impl" } +attributes #1 = { "modular-format"="printf,1,2,float_present_mod,basic_impl,float" } +attributes #2 = { "modular-format"="printf,1,2,unknown_aspects_mod,basic_impl,unknown1,unknown2" } +attributes #3 = { "modular-format"="printf,2,3,first_arg_idx_mod,basic_impl,float" } +attributes #4 = { "modular-format"="printf,1,2,multiple_aspects_mod,basic_impl,float,unknown" } diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 @negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; 
CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 6b090e982af0a..f61a1970d3aa4 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -2113,3 +2113,98 @@ define <4 x i32> @or_zext_nneg_minus_constant_splat(<4 x i8> %a) { %or = or <4 x i32> %zext, splat (i32 -9) ret <4 x i32> %or } + +define i8 @or_positive_minus_non_positive_to_abs(i8 %a){ +; CHECK-LABEL: @or_positive_minus_non_positive_to_abs( +; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false) +; CHECK-NEXT: ret i8 [[TMP2]] +; + %b = icmp sgt i8 %a, 0 + %mask = sext i1 %b to i8 + %neg = sub i8 0, %a + %mask_inv = xor i8 %mask, -1 + %c = and i8 %neg, %mask_inv + %d = and i8 %a, %mask + %or = or i8 %c, %d + ret i8 %or +} + +; TODO: Fold to smax https://alive2.llvm.org/ce/z/wDiDh2 +define i8 
@or_select_smax_neg_to_abs(i8 %a){ +; CHECK-LABEL: @or_select_smax_neg_to_abs( +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[A:%.*]], 0 +; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[SGT0]], i8 0, i8 [[NEG]] +; CHECK-NEXT: ret i8 [[OR]] +; + %sgt0 = icmp sgt i8 %a, 0 + %neg = sub nsw i8 0, %a + %sel = select i1 %sgt0, i8 0, i8 %neg + ret i8 %sel +} + +; TODO: Fold to abs https://alive2.llvm.org/ce/z/DybfHG +define i8 @or_select_smax_smax_to_abs(i8 %a){ +; CHECK-LABEL: @or_select_smax_smax_to_abs( +; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = call i8 @llvm.smax.i8(i8 [[NEG]], i8 0) +; CHECK-NEXT: [[MAX:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 0) +; CHECK-NEXT: [[OR:%.*]] = or i8 [[SEL]], [[MAX]] +; CHECK-NEXT: ret i8 [[OR]] +; + %neg = sub nsw i8 0, %a + %sel = call i8 @llvm.smax.i8(i8 %neg, i8 0) + %max = call i8 @llvm.smax.i8(i8 %a, i8 0) + %or = or i8 %sel, %max + ret i8 %or +} + +declare i8 @llvm.abs.i8(i8, i1) +declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1) + +define <2 x i8> @or_sgt_select_smax_to_abs(<2 x i8> %a){ +; CHECK-LABEL: @or_sgt_select_smax_to_abs( +; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + ret <2 x i8> %or +} + +define <2 x i8> @or_slt_select_smax_to_abs(<2 x i8> %a){ +; CHECK-LABEL: @or_slt_select_smax_to_abs( +; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %slt0 = icmp slt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %slt0, <2 x i8> %neg, <2 x i8> zeroinitializer + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + ret <2 x i8> %or +} + +; negative test - %max has multiple uses, so %or is not folded to abs.
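+; (Informal rationale, inferred from these tests rather than from the pass
+; itself: the vector folds above are sound because when a lane of %a is
+; positive the select contributes zero and the smax contributes the lane,
+; while otherwise the select contributes the negated lane and the smax
+; contributes zero, so each lane of the or is abs(a). In the next test the
+; smax has a second use, so rewriting %or would not remove %max; a one-use
+; restriction presumably keeps the expanded form.)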
+ +define <2 x i8> @or_select_smax_multi_uses(<2 x i8> %a){ +; CHECK-LABEL: @or_select_smax_multi_uses( +; CHECK-NEXT: [[B:%.*]] = icmp sgt <2 x i8> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[A]] +; CHECK-NEXT: [[C:%.*]] = select <2 x i1> [[B]], <2 x i8> zeroinitializer, <2 x i8> [[NEG]] +; CHECK-NEXT: [[D:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[A]], <2 x i8> zeroinitializer) +; CHECK-NEXT: [[OR1:%.*]] = or <2 x i8> [[C]], [[D]] +; CHECK-NEXT: [[OR:%.*]] = add <2 x i8> [[OR1]], [[D]] +; CHECK-NEXT: ret <2 x i8> [[OR]] +; + %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer + %neg = sub <2 x i8> zeroinitializer, %a + %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg + %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer) + %or = or <2 x i8> %sel, %max + %add = add <2 x i8> %or, %max + ret <2 x i8> %add +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = 
select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
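A note on the regenerated PROF lines above, read off this test rather than any spec: when the folded select can take its branch weights from one existing !prof tuple, instcombine reuses it ([[PROF1]]) or commutes it ([[PROF2]] = !{!"branch_weights", i32 3, i32 2}, i.e. !1 with its operands swapped); when the merged weights cannot be derived, as in the *_and1_and2 variants, it attaches the placeholder !{!"unknown", !"instcombine"} instead of fabricating counts.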
diff --git a/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll new file mode 100644 index 0000000000000..8ceb310fc8f71 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p instcombine -S %s | FileCheck %s + +define i64 @test_dereferenceable_assume(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_dereferenceable_assume( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: ret i64 [[DIFF]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br i1 %c.0, label %then, label %else + +then: + ret i64 %diff + +else: + ret i64 0 +} + +define i64 @test_sink_with_dereferenceable_assume_same_block_as_user(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_sink_with_dereferenceable_assume_same_block_as_user( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: ret i64 [[DIFF]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %then, label %else + +then: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + ret i64 %diff + +else: + ret i64 0 +} + +define i64 @test_sink_with_multiple_users_dominated_by_deref(ptr %p, ptr %q, i1 %c.0, i1 %c.1) { +; CHECK-LABEL: define i64 @test_sink_with_multiple_users_dominated_by_deref( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br i1 [[C_1]], label %[[THEN_2:.*]], label %[[ELSE]] +; CHECK: [[THEN_2]]: +; CHECK-NEXT: [[DOUBLED:%.*]] = shl i64 [[DIFF]], 1 +; CHECK-NEXT: ret i64 [[DOUBLED]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %then, label %else + +then: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br i1 %c.1, label %then.2, label %else + +then.2: + %doubled = mul i64 %diff, 2 + ret i64 
%doubled + +else: + ret i64 0 +} + +define i64 @test_deref_user_does_not_dominate_other_user(ptr %p, ptr %q, i1 %c.0) { +; CHECK-LABEL: define i64 @test_deref_user_does_not_dominate_other_user( +; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[C_0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64 +; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]] +; CHECK-NEXT: br i1 [[C_0]], label %[[MIDDLE:.*]], label %[[EXIT:.*]] +; CHECK: [[MIDDLE]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[DIFF]]) ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 [[DIFF]] +; +entry: + %p_int = ptrtoint ptr %p to i64 + %q_int = ptrtoint ptr %q to i64 + %diff = sub i64 %q_int, %p_int + br i1 %c.0, label %middle, label %exit + +middle: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ] + br label %exit + +exit: + ret i64 %diff +} + +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll deleted file mode 100644 index 9fcac802378f6..0000000000000 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p - %2 = bitcast <8 x float> %1 to <8 x i32> - %3 = bitcast <8 x i32> %2 to <8 x float> - %a = fptosi <8 x float> %3 to <8 x i32> - %4 = fptosi float %b to i32 - %5 = add i32 %4, -2 - %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> poison, i32 %6, i32 7 - %8 = sitofp <8 x i32> %7 to <8 x float> - store <8 x float> %8, ptr %p - ret void -} - -; PR18600 -define i32 @test2(i32 %i) { - %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i - ret i32 %e - -; CHECK-LABEL: @test2 -; CHECK: extractelement -} diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll index 32bf4da12c497..205b4b88c473a 100644 --- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll +++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll @@ -1,26 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -define void @test (float %b, ptr %p) { -; CHECK: extractelement -; CHECK: fptosi - %1 = load <8 x float> , ptr %p +define void @test_poison(float %b, ptr %p) { +; CHECK-LABEL: define 
void @test_poison( +; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[B]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fptosi float [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float> +; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %1 = load <8 x float>, ptr %p %2 = bitcast <8 x float> %1 to <8 x i32> %3 = bitcast <8 x i32> %2 to <8 x float> %a = fptosi <8 x float> %3 to <8 x i32> %4 = fptosi float %b to i32 %5 = add i32 %4, -2 %6 = extractelement <8 x i32> %a, i32 %5 - %7 = insertelement <8 x i32> undef, i32 %6, i32 7 + %7 = insertelement <8 x i32> poison, i32 %6, i32 7 %8 = sitofp <8 x i32> %7 to <8 x float> store <8 x float> %8, ptr %p - ret void + ret void } ; PR18600 -define i32 @test2(i32 %i) { +define i32 @test_bitcast(i32 %i) { +; CHECK-LABEL: define i32 @test_bitcast( +; CHECK-SAME: i32 [[I:%.*]]) { +; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> <i32 1, i32 0, i32 2, i32 0>, i32 [[I]] +; CHECK-NEXT: ret i32 [[E]] +; %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i ret i32 %e +} + +declare void @use(i32) -; CHECK-LABEL: @test2 -; CHECK: extractelement +define void @test_loop(<4 x float> %in) { +; CHECK-LABEL: define void @test_loop( +; CHECK-SAME: <4 x float> [[IN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]] +; CHECK: [[BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]] +; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32 +; CHECK-NEXT: call void @use(i32 [[ELEM]]) +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[NEXT]] = add nuw nsw i32 [[I]], 1 +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[DONE]]: +; CHECK-NEXT: ret void +; +entry: + %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9) + %vi = fptosi <4 x float> %r to <4 x i32> + br label %loop +loop: + %i = phi i32 [ 0, %entry ], [ %next, %latch ] + %cond = icmp ult i32 %i, 4 + br i1 %cond, label %body, label %done +body: + %elem = extractelement <4 x i32> %vi, i32 %i + call void @use(i32 %elem) + br label %latch +latch: + %next = add i32 %i, 1 + br label %loop +done: + ret void } diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll index 77a7f0d4e4acf..479b3f8ea4128 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -12,8 +12,7 @@ define i32 @add_0() { define i32 @add_0_scalable_vector() { ; CHECK-LABEL: @add_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -89,8 +88,7 @@ define i32 
@add_poison() { define i32 @add_poison_scalable_vector() { ; CHECK-LABEL: @add_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -123,8 +121,7 @@ define i32 @mul_0() { define i32 @mul_0_scalable_vector() { ; CHECK-LABEL: @mul_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -140,13 +137,29 @@ define i32 @mul_1() { define i32 @mul_1_scalable_vector() { ; CHECK-LABEL: @mul_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x } +define i32 @mul_2() { +; CHECK-LABEL: @mul_2( +; CHECK-NEXT: ret i32 256 +; + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>) + ret i32 %x +} + +define i32 @mul_2_scalable_vector() { +; CHECK-LABEL: @mul_2_scalable_vector( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> splat (i32 2)) + ret i32 %x +} + define i32 @mul_inc() { ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 @@ -200,8 +213,7 @@ define i32 @mul_poison() { define i32 @mul_poison_scalable_vector() { ; CHECK-LABEL: @mul_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.mul.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -225,8 +237,7 @@ define i32 @and_0() { define i32 @and_0_scalable_vector() { ; CHECK-LABEL: @and_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -242,8 +253,7 @@ define i32 @and_1() { define i32 @and_1_scalable_vector() { ; CHECK-LABEL: @and_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -302,8 +312,7 @@ define i32 @and_poison() { define i32 @and_poison_scalable_vector() { ; CHECK-LABEL: @and_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -327,8 +336,7 @@ define i32 @or_0() { define i32 @or_0_scalable_vector() { ; CHECK-LABEL: @or_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -344,8 
+352,7 @@ define i32 @or_1() { define i32 @or_1_scalable_vector() { ; CHECK-LABEL: @or_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -404,8 +411,7 @@ define i32 @or_poison() { define i32 @or_poison_scalable_vector() { ; CHECK-LABEL: @or_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -429,8 +435,7 @@ define i32 @xor_0() { define i32 @xor_0_scalable_vector() { ; CHECK-LABEL: @xor_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -446,13 +451,21 @@ define i32 @xor_1() { define i32 @xor_1_scalable_vector() { ; CHECK-LABEL: @xor_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x } +define i32 @xor_1_scalable_vector_lane_count_not_known_even() { +; CHECK-LABEL: @xor_1_scalable_vector_lane_count_not_known_even( +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> splat (i32 1)) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> splat(i32 1)) + ret i32 %x +} + define i32 @xor_inc() { ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 @@ -506,8 +519,7 @@ define i32 @xor_poison() { define i32 @xor_poison_scalable_vector() { ; CHECK-LABEL: @xor_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -531,8 +543,7 @@ define i32 @smin_0() { define i32 @smin_0_scalable_vector() { ; CHECK-LABEL: @smin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -548,8 +559,7 @@ define i32 @smin_1() { define i32 @smin_1_scalable_vector() { ; CHECK-LABEL: @smin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -608,8 +618,7 @@ define i32 @smin_poison() { define i32 @smin_poison_scalable_vector() { ; CHECK-LABEL: @smin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -633,8 +642,7 @@ define i32 @smax_0() { define i32 @smax_0_scalable_vector() { ; CHECK-LABEL: @smax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] =
call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -650,8 +658,7 @@ define i32 @smax_1() { define i32 @smax_1_scalable_vector() { ; CHECK-LABEL: @smax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -710,8 +717,7 @@ define i32 @smax_poison() { define i32 @smax_poison_scalable_vector() { ; CHECK-LABEL: @smax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -735,8 +741,7 @@ define i32 @umin_0() { define i32 @umin_0_scalable_vector() { ; CHECK-LABEL: @umin_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -752,8 +757,7 @@ define i32 @umin_1() { define i32 @umin_1_scalable_vector() { ; CHECK-LABEL: @umin_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> splat (i32 1)) ret i32 %x @@ -812,8 +816,7 @@ define i32 @umin_poison() { define i32 @umin_poison_scalable_vector() { ; CHECK-LABEL: @umin_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umin.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x @@ -837,8 +840,7 @@ define i32 @umax_0() { define i32 @umax_0_scalable_vector() { ; CHECK-LABEL: @umax_0_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 0 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> zeroinitializer) ret i32 %x @@ -854,8 +856,7 @@ define i32 @umax_1() { define i32 @umax_1_scalable_vector() { ; CHECK-LABEL: @umax_1_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat (i32 1)) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> splat(i32 1)) ret i32 %x @@ -914,8 +915,7 @@ define i32 @umax_poison() { define i32 @umax_poison_scalable_vector() { ; CHECK-LABEL: @umax_poison_scalable_vector( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 poison ; %x = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> poison) ret i32 %x diff --git a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll index 437e56665d53b..fa8357505e7e9 100644 --- a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll +++ b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll @@ -131,7 +131,8 
@@ declare void @llvm.dbg.value(metadata, metadata, metadata) !10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0) !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !12 = !DILocation(line: 0, scope: !10) -!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) !14 = !DILocation(line: 0, scope: !15) !15 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 0) !16 = !DILocalVariable(name: "sum2", scope: !15, file: !1, line: 11, type: !11) +!17 = !{!16} diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll index 97ea2c6708dad..2828882afe779 100644 --- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll +++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll @@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0" @E = common global ptr null, align 8 ; CHECK-LABEL: @f( -define void @f() { +define void @f() !prof !{!"function_entry_count", i32 10} { entry: %a = load ptr, ptr @A, align 8 %b = load ptr, ptr @B, align 8 @@ -55,7 +55,7 @@ entry: ; CHECK: = icmp ; CHECK-NOT: = icmp -; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1 +; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1, !prof ![[PROF1:[0-9]]] ; The non-distributed loop that the memchecks fall back on. @@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent } !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.distribute.enable", i1 true} +; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"} diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll new file mode 100644 index 0000000000000..652557cef48f8 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/pr164082.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 1 loop-fusion - Loops fused + +; C Code +; +;; for (int i = 0; i < 100; ++i) +;; Array[i][i] = -i; +;; for (int row = 0; row < 100; ++row) +;; for (int col = 0; col < 100; ++col) +;; if (col != row) +;; Array[row][col] = row + col; +; +; Loop fusion should no longer crash, as forgetBlockAndLoopDispositions() is +; now triggered after mergeLatch() during the fusion.
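+;
+; (Context, inferred rather than quoted from the pass: mergeLatch() rewires
+; the fused loop's latch blocks, which can leave SCEV's cached block and loop
+; dispositions describing the old CFG; forgetBlockAndLoopDispositions() drops
+; that cache so later queries recompute it instead of tripping an assertion
+; in asserts builds.)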
+ +define i32 @forget_dispositions() nounwind { +entry: + %Array = alloca [100 x [100 x i32]], align 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ] + %0 = trunc i64 %indvars.iv33 to i32 + %sub = sub i32 0, %0 + %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33 + store i32 %sub, ptr %arrayidx2, align 4 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, 100 + br i1 %exitcond36, label %for.cond6.preheader, label %for.body + +for.cond6.preheader: ; preds = %for.body, %for.inc17 + %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ] + br label %for.body8 + +for.body8: ; preds = %for.inc14, %for.cond6.preheader + %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ] + %1 = trunc i64 %indvars.iv to i32 + %2 = trunc i64 %indvars.iv29 to i32 + %cmp9 = icmp eq i32 %1, %2 + br i1 %cmp9, label %for.inc14, label %if.then + +if.then: ; preds = %for.body8 + %3 = add i64 %indvars.iv, %indvars.iv29 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv + %4 = trunc i64 %3 to i32 + store i32 %4, ptr %arrayidx13, align 4 + br label %for.inc14 + +for.inc14: ; preds = %for.body8, %if.then + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32 + %exitcond28 = icmp eq i32 %lftr.wideiv27, 100 + br i1 %exitcond28, label %for.inc17, label %for.body8 + +for.inc17: ; preds = %for.inc14 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, 100 + br i1 %exitcond32, label %for.exit, label %for.cond6.preheader + +for.exit: ; preds = %for.inc17 + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll new file mode 100644 index 0000000000000..d01bb748d9422 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll @@ -0,0 +1,70 @@ +; RUN: opt -passes="module(print<block-freq>),function(loop(loop-idiom)),module(print<block-freq>)" -mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck --check-prefix=PROFILE %s + +declare void @escape_inner(i8, i8, i8, i1, i8) +declare void @escape_outer(i8, i8, i8, i1, i8) + +declare i8 @gen.i8() + +; Most basic pattern; Note that iff the shift amount is offset, said offsetting +; must not cause an overflow, but `add nsw` is fine. 
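+;
+; (A reading of what this file checks, assuming the usual shift-until-zero
+; idiom recognition fires here: loop-idiom can rewrite the search loops below
+; into closed forms built on ctlz, which is why the RUN line pins an AVX2 CPU
+; where ctlz is cheap. Block frequencies are printed both before and after the
+; pass, and the PROFILE blocks at the bottom demand identical numbers, e.g.
+; loop = 1001.0 for @p0's 1:1000 weights and 501.0 for @p1, so the rewrite
+; must carry the !prof data over.)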
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress { +entry: + br label %loop + +loop: + %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ] + %nbits = add nsw i8 %iv, %extraoffset + %val.shifted = ashr i8 %val, %nbits + %val.shifted.iszero = icmp eq i8 %val.shifted, 0 + %iv.next = add i8 %iv, 1 + + call void @escape_inner(i8 %iv, i8 %nbits, i8 %val.shifted, i1 %val.shifted.iszero, i8 %iv.next) + + br i1 %val.shifted.iszero, label %end, label %loop, !prof !{!"branch_weights", i32 1, i32 1000 } + +end: + %iv.res = phi i8 [ %iv, %loop ] + %nbits.res = phi i8 [ %nbits, %loop ] + %val.shifted.res = phi i8 [ %val.shifted, %loop ] + %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] + %iv.next.res = phi i8 [ %iv.next, %loop ] + + call void @escape_outer(i8 %iv.res, i8 %nbits.res, i8 %val.shifted.res, i1 %val.shifted.iszero.res, i8 %iv.next.res) + + ret i8 %iv.res +} + +define i32 @p1(i32 %x, i32 %bit) { +entry: + %bitmask = shl i32 1, %bit + br label %loop + +loop: + %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] + %x.curr.bitmasked = and i32 %x.curr, %bitmask + %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 + %x.next = shl i32 %x.curr, 1 + br i1 %x.curr.isbitunset, label %loop, label %end, !prof !{!"branch_weights", i32 500, i32 1 } + +end: + ret i32 %x.curr +} + +; +; PROFILE: Printing analysis results of BFI for function 'p0': +; PROFILE: block-frequency-info: p0 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 1001.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p1 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 501.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p0 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 1001.0, +; PROFILE: - end: float = 1.0, +; PROFILE: block-frequency-info: p1 +; PROFILE: - entry: float = 1.0, +; PROFILE: - loop: float = 501.0, +; PROFILE: - end: float = 1.0, diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
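The constant-fold-branch.ll diff that follows relies on a folding trick worth spelling out (a sketch of the observable output, not of the pass internals): to delete a constant conditional branch while keeping its statically dead successor reachable, loop-simplifycfg replaces

  br i1 true, label %live, label %dead

with a degenerate switch whose default edge is the live target,

  switch i32 0, label %live [
    i32 1, label %dead
  ]

so %dead keeps a CFG predecessor and -verify-loop-info/-verify-dom-info still pass, while the dead edge is never taken. The indentation-only hunks below just reflow these switch cases, alongside new !prof propagation checks.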
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll index 1ec212f0bb5ea..46b6209986fed 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; REQUIRES: asserts ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s @@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) { ; CHECK: dead_backedge: ; CHECK-NEXT: [[I_2]] = add i32 [[I_1]], 10 ; CHECK-NEXT: switch i32 1, label [[EXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: exit: ; CHECK-NEXT: [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ] @@ -233,12 +233,12 @@ exit: ; Check that we preserve static reachibility of a dead exit block while deleting ; a branch. -define i32 @dead_exit_test_branch_loop(i32 %end) { +define i32 @dead_exit_test_branch_loop(i32 %end) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @dead_exit_test_branch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[DEAD:%.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: i32 1, label [[DEAD:%.*]] +; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] ; CHECK: header: @@ -262,7 +262,7 @@ preheader: header: %i = phi i32 [0, %preheader], [%i.inc, %backedge] - br i1 true, label %backedge, label %dead + br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10, i32 1} dead: br label %dummy @@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) { ; CHECK-LABEL: @dead_exit_test_switch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[DEAD:%.*]] +; CHECK-NEXT: i32 1, label [[DEAD:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) { ; CHECK: header: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 1, label [[DEAD:%.*]] [ -; CHECK-NEXT: i32 0, label [[DEAD]] -; CHECK-NEXT: i32 1, label [[BACKEDGE]] -; CHECK-NEXT: i32 2, label [[DEAD]] +; CHECK-NEXT: i32 0, label [[DEAD]] +; CHECK-NEXT: i32 1, label [[BACKEDGE]] +; CHECK-NEXT: i32 2, label [[DEAD]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -552,7 +552,7 @@ define i32 @inf_loop_test_branch_loop(i32 %end) { ; CHECK-LABEL: @inf_loop_test_branch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[EXIT:%.*]] +; CHECK-NEXT: i32 1, label [[EXIT:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -592,7 +592,7 @@ define i32 @inf_loop_test_switch_loop(i32 %end) { ; CHECK-LABEL: @inf_loop_test_switch_loop( ; CHECK-NEXT: preheader: ; CHECK-NEXT: switch i32 0, label 
[[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[EXIT:%.*]] +; CHECK-NEXT: i32 1, label [[EXIT:%.*]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1001,7 +1001,7 @@ define i32 @full_sub_loop_test_switch_loop(i32 %end) { ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[I]] ; CHECK-NEXT: switch i32 1, label [[DEAD:%.*]] [ -; CHECK-NEXT: i32 0, label [[BACKEDGE]] +; CHECK-NEXT: i32 0, label [[BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -1010,7 +1010,7 @@ define i32 @full_sub_loop_test_switch_loop(i32 %end) { ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ] ; CHECK-NEXT: [[I_INC]] = add i32 [[I_1]], 1 ; CHECK-NEXT: switch i32 1, label [[OUTER_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[HEADER]] +; CHECK-NEXT: i32 0, label [[HEADER]] ; CHECK-NEXT: ] ; CHECK: outer_backedge: ; CHECK-NEXT: [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ] @@ -1132,7 +1132,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) { ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[I]] ; CHECK-NEXT: switch i32 1, label [[BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[DEAD:%.*]] +; CHECK-NEXT: i32 0, label [[DEAD:%.*]] ; CHECK-NEXT: ] ; CHECK: dead: ; CHECK-NEXT: [[I_2:%.*]] = add i32 [[I]], 1 @@ -1141,7 +1141,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) { ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ] ; CHECK-NEXT: [[I_INC]] = add i32 [[I_1]], 1 ; CHECK-NEXT: switch i32 1, label [[OUTER_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[HEADER]] +; CHECK-NEXT: i32 0, label [[HEADER]] ; CHECK-NEXT: ] ; CHECK: outer_backedge: ; CHECK-NEXT: [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ] @@ -1195,7 +1195,7 @@ define i32 @full_sub_loop_test_branch_loop_inverse_2(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1256,7 +1256,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_2(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1318,7 +1318,7 @@ define i32 @full_sub_loop_test_branch_loop_inverse_3(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1378,7 +1378,7 @@ define i32 @full_sub_loop_test_switch_loop_inverse_3(i32 %end) { ; CHECK: outer_header: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[PREHEADER_SPLIT:%.*]] 
[ -; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[OUTER_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: preheader.split: ; CHECK-NEXT: br label [[HEADER:%.*]] @@ -1441,7 +1441,7 @@ define i32 @exit_branch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) ; CHECK: loop_2: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[LOOP_2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_2.split: ; CHECK-NEXT: br label [[LOOP_3:%.*]] @@ -1510,7 +1510,7 @@ define i32 @exit_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, i32 %N) ; CHECK: loop_2: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[LOOP_1]] ], [ [[J_NEXT:%.*]], [[LOOP_2_BACKEDGE:%.*]] ] ; CHECK-NEXT: switch i32 0, label [[LOOP_2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[LOOP_2_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_2.split: ; CHECK-NEXT: br label [[LOOP_3:%.*]] @@ -1654,7 +1654,7 @@ define i32 @intermediate_switch_from_inner_to_grandparent(i1 %cond1, i1 %cond2, ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]] ; CHECK: intermediate: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -1792,7 +1792,7 @@ define i32 @intermediate_switch_from_inner_to_parent(i1 %cond1, i1 %cond2, i32 % ; CHECK-NEXT: br i1 [[COND1:%.*]], label [[LOOP_3_BACKEDGE]], label [[INTERMEDIATE:%.*]] ; CHECK: intermediate: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -1944,7 +1944,7 @@ define i32 @intermediate_subloop_switch_from_inner_to_grandparent(i1 %cond1, i1 ; CHECK-NEXT: br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2102,7 +2102,7 @@ define i32 @intermediate_subloop_switch_from_inner_to_parent(i1 %cond1, i1 %cond ; CHECK-NEXT: br i1 [[COND3:%.*]], label [[INTERMEDIATE_LOOP]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2267,7 +2267,7 @@ define i32 @intermediate_complex_subloop_switch_from_inner_to_parent(i1 %cond1, ; CHECK-NEXT: br i1 [[COND2:%.*]], label [[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_2_BACKEDGE]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2440,7 +2440,7 @@ define i32 @intermediate_complex_subloop_switch_from_inner_to_grandparent(i1 %co ; CHECK-NEXT: br i1 [[COND2:%.*]], label 
[[INTERMEDIATE_LOOP_BACKEDGE]], label [[INTERMEDIATE_EXIT:%.*]] ; CHECK: intermediate_exit: ; CHECK-NEXT: switch i32 1, label [[LOOP_1_BACKEDGE_LOOPEXIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] +; CHECK-NEXT: i32 0, label [[LOOP_3_BACKEDGE]] ; CHECK-NEXT: ] ; CHECK: loop_3_backedge: ; CHECK-NEXT: [[K_NEXT]] = add i32 [[K]], 1 @@ -2585,38 +2585,38 @@ define void @test_crash_01(i1 %arg, i32 %arg2) { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: br i1 %arg, label [[BB17:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB17:%.*]], label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: switch i32 0, label [[BB2_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[BB19:%.*]] +; CHECK-NEXT: i32 1, label [[BB19:%.*]] ; CHECK-NEXT: ] ; CHECK: bb2.split: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: switch i32 %arg2, label [[BB16:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB15:%.*]] -; CHECK-NEXT: i32 1, label [[BB14:%.*]] -; CHECK-NEXT: i32 2, label [[BB13:%.*]] -; CHECK-NEXT: i32 3, label [[BB12:%.*]] -; CHECK-NEXT: i32 4, label [[BB11:%.*]] -; CHECK-NEXT: i32 5, label [[BB8:%.*]] -; CHECK-NEXT: i32 6, label [[BB10:%.*]] -; CHECK-NEXT: i32 7, label [[BB9:%.*]] -; CHECK-NEXT: i32 8, label [[BB7:%.*]] +; CHECK-NEXT: switch i32 [[ARG2:%.*]], label [[BB16:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB15:%.*]] +; CHECK-NEXT: i32 1, label [[BB14:%.*]] +; CHECK-NEXT: i32 2, label [[BB13:%.*]] +; CHECK-NEXT: i32 3, label [[BB12:%.*]] +; CHECK-NEXT: i32 4, label [[BB11:%.*]] +; CHECK-NEXT: i32 5, label [[BB8:%.*]] +; CHECK-NEXT: i32 6, label [[BB10:%.*]] +; CHECK-NEXT: i32 7, label [[BB9:%.*]] +; CHECK-NEXT: i32 8, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb7: ; CHECK-NEXT: unreachable ; CHECK: bb8: -; CHECK-NEXT: switch i32 %arg2, label [[BB28:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB27:%.*]] -; CHECK-NEXT: i32 1, label [[BB26:%.*]] -; CHECK-NEXT: i32 2, label [[BB23:%.*]] -; CHECK-NEXT: i32 3, label [[BB24:%.*]] -; CHECK-NEXT: i32 4, label [[BB25:%.*]] -; CHECK-NEXT: i32 5, label [[BB29:%.*]] -; CHECK-NEXT: i32 6, label [[BB22:%.*]] -; CHECK-NEXT: i32 7, label [[BB20:%.*]] -; CHECK-NEXT: i32 8, label [[BB21:%.*]] +; CHECK-NEXT: switch i32 [[ARG2]], label [[BB28:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB27:%.*]] +; CHECK-NEXT: i32 1, label [[BB26:%.*]] +; CHECK-NEXT: i32 2, label [[BB23:%.*]] +; CHECK-NEXT: i32 3, label [[BB24:%.*]] +; CHECK-NEXT: i32 4, label [[BB25:%.*]] +; CHECK-NEXT: i32 5, label [[BB29:%.*]] +; CHECK-NEXT: i32 6, label [[BB22:%.*]] +; CHECK-NEXT: i32 7, label [[BB20:%.*]] +; CHECK-NEXT: i32 8, label [[BB21:%.*]] ; CHECK-NEXT: ] ; CHECK: bb9: ; CHECK-NEXT: unreachable @@ -2772,3 +2772,7 @@ bb28: ; preds = %bb8 bb29: ; preds = %bb8 br label %bb6 } +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 0} +;. 
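The constant-fold-branch.ll update above also illustrates the branch_weights convention: operands map to successors in order (for a switch, the default destination comes first). A small sketch of the profile data involved (Python, illustration only; the helper is hypothetical, not LoopSimplifyCFG's API):

    # branch_weights operands correspond to successors in order, so the
    # probability of taking successor i is weights[i] / sum(weights).
    def edge_probs(weights: list[int]) -> list[float]:
        total = sum(weights)
        return [w / total for w in weights]

    # Before folding: br i1 true, label %backedge, label %dead, !prof !{10, 1}
    print(edge_probs([10, 1]))  # [0.909..., 0.0909...]

    # After folding, [[PROF1]] = !{1, 0} on the synthesized preheader switch
    # records that %dead, kept only for static reachability, is never taken.
    print(edge_probs([1, 0]))   # [1.0, 0.0]

The function_entry_count metadata, checked as [[META0]], is carried through unchanged.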
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll index 9003072f5fcdf..dd347a7a6519d 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -19,9 +19,8 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 { ; COMMON-NEXT: ldr z3, [x0, #3, mul vl] ; COMMON-NEXT: addvl x0, x0, #5 ; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b -; COMMON-NEXT: movprfx z1, z2 -; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b -; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b +; COMMON-NEXT: umax z2.b, p0/m, z2.b, z3.b +; COMMON-NEXT: umax z0.b, p0/m, z0.b, z2.b ; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8] ; COMMON-NEXT: incb x8 ; COMMON-NEXT: cmp x8, x2 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index c12d8135e5eba..082b876b542e5 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r ; X32-NEXT: .p2align 4 ; X32-NEXT: .LBB2_2: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl (%ebx,%esi), %ebp -; X32-NEXT: addl (%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: movl %ebp, (%edx) -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl (%ebx,%esi), %ebx +; X32-NEXT: addl (%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: movl %ebx, (%edx) +; X32-NEXT: leal (%ebp,%esi), %ebx ; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: decl %eax diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll similarity index 100% rename from llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll rename to llvm/test/Transforms/LoopUnroll/branch-weights-freq/peel.ll diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll new file mode 100644 index 0000000000000..96b31d801c2f9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll @@ -0,0 +1,160 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after loop unrolling with an epilogue. + +; ------------------------------------------------------------------------------ +; Define substitutions. +; +; Check original loop body frequency. +; DEFINE: %{bf-fc} = opt %s -S -passes='print<block-freq>' 2>&1 | \ +; DEFINE: FileCheck %s -check-prefixes +; +; Unroll loops and then check block frequency. The -implicit-check-not options +; make sure that no additional labels or @f calls show up. 
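+; (The DEFINE: lines are lit substitutions: %{ur-bf} expands to the
+; unroll-plus-BFI invocation and %{fc} to the shared FileCheck options, so
+; each RUN line only needs to vary the unroll count and check prefixes.)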
+; DEFINE: %{ur-bf} = opt %s -S -passes='loop-unroll,print<block-freq>' 2>&1 +; DEFINE: %{fc} = FileCheck %s \ +; DEFINE: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; DEFINE: -implicit-check-not='call void @f' -check-prefixes + +; ------------------------------------------------------------------------------ +; Check various interesting unroll count values relative to the original loop's +; estimated trip count of 11 (e.g., minimum and boundary values). +; +; RUN: %{bf-fc} ALL,ORIG +; RUN: %{ur-bf} -unroll-count=2 -unroll-runtime | %{fc} ALL,UR,UR2 +; RUN: %{ur-bf} -unroll-count=4 -unroll-runtime | %{fc} ALL,UR,UR4 +; RUN: %{ur-bf} -unroll-count=10 -unroll-runtime | %{fc} ALL,UR,UR10 +; RUN: %{ur-bf} -unroll-count=11 -unroll-runtime | %{fc} ALL,UR,UR11 +; RUN: %{ur-bf} -unroll-count=12 -unroll-runtime | %{fc} ALL,UR,UR12 + +; ------------------------------------------------------------------------------ +; Check the iteration frequencies, which, when each is multiplied by the number +; of original loop bodies that execute within it, should sum to almost exactly +; the original loop body frequency. +; +; ALL-LABEL: block-frequency-info: test +; +; ORIG: - [[ENTRY:.*]]: +; ORIG: - [[DO_BODY:.*]]: float = 11.0, +; ORIG: - [[DO_END:.*]]: +; +; UR: - [[ENTRY:.*]]: +; UR: - [[ENTRY_NEW:.*]]: +; UR2: - [[DO_BODY:.*]]: float = 5.2381, +; UR4: - [[DO_BODY:.*]]: float = 2.3702, +; UR10: - [[DO_BODY:.*]]: float = 0.6902, +; UR11: - [[DO_BODY:.*]]: float = 0.59359, +; UR12: - [[DO_BODY:.*]]: float = 0.5144, +; UR: - [[DO_END_UNR_LCSSA:.*]]: +; UR: - [[DO_BODY_EPIL_PREHEADER:.*]]: +; UR2: - [[DO_BODY_EPIL:.*]]: float = 0.52381, +; UR4: - [[DO_BODY_EPIL:.*]]: float = 1.5193, +; UR10: - [[DO_BODY_EPIL:.*]]: float = 4.098, +; UR11: - [[DO_BODY_EPIL:.*]]: float = 4.4705, +; UR12: - [[DO_BODY_EPIL:.*]]: float = 4.8272, +; UR4: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR10: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR11: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR12: - [[DO_END_EPILOG_LCSSA:.*]]: +; UR: - [[DO_END:.*]]: + +; ------------------------------------------------------------------------------ +; Check the CFGs, including the number of original loop bodies that appear +; within each unrolled iteration. 
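+; (The COUNT directives further below pin the duplication factor: each
+; unrolled iteration executes <count> original loop bodies and each epilogue
+; iteration exactly one, which ties the frequencies above back to ORIG:
+; e.g. for UR2, 2 * 5.2381 + 0.52381 = 11.0.)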
+; +; UR-LABEL: define void @test(i32 %{{.*}}) { +; UR: [[ENTRY]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[ENTRY_NEW]], !prof ![[#PROF_UR_GUARD:]]{{$}} +; UR: [[ENTRY_NEW]]: +; UR: br label %[[DO_BODY]] +; UR: [[DO_BODY]]: +; UR2-COUNT-2: call void @f +; UR4-COUNT-4: call void @f +; UR10-COUNT-10: call void @f +; UR11-COUNT-11: call void @f +; UR12-COUNT-12: call void @f +; UR: br i1 %{{.*}}, label %[[DO_END_UNR_LCSSA]], label %[[DO_BODY]], !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]]{{$}} +; UR: [[DO_END_UNR_LCSSA]]: +; UR: br i1 %{{.*}}, label %[[DO_BODY_EPIL_PREHEADER]], label %[[DO_END:.*]], !prof ![[#PROF_RM_GUARD:]]{{$}} +; UR: [[DO_BODY_EPIL_PREHEADER]]: +; UR: br label %[[DO_BODY_EPIL]] +; UR: [[DO_BODY_EPIL]]: +; UR: call void @f +; UR4: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR10: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR11: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR12: br i1 %{{.*}}, label %[[DO_BODY_EPIL]], label %[[DO_END_EPILOG_LCSSA]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]]{{$}} +; UR4: [[DO_END_EPILOG_LCSSA]]: +; UR10: [[DO_END_EPILOG_LCSSA]]: +; UR11: [[DO_END_EPILOG_LCSSA]]: +; UR12: [[DO_END_EPILOG_LCSSA]]: +; UR: br label %[[DO_END]] +; UR: [[DO_END]]: +; UR: ret void + +declare void @f(i32) + +define void @test(i32 %n) { +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 10} + +; ------------------------------------------------------------------------------ +; Check branch weight metadata and estimated trip count metadata. 
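+; (All weight pairs here are scaled so the two operands sum to 2^31. With
+; p = 10/11 the original backedge probability, PROF_UR_GUARD encodes
+; p^(count-1) for entering the unrolled loop and PROF_UR_LATCH p^count for
+; staying in it: e.g. for UR2, 1952257862 / 2^31 = 0.90909 = p.)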
+; +; UR2: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 195225786, i32 1952257862} +; UR4: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 534047398, i32 1613436250} +; UR10: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1236740947, i32 910742701} +; UR11: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR12: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; +; UR2: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 372703773, i32 1774779875} +; UR4: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 680723421, i32 1466760227} +; UR10: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1319535738, i32 827947910} +; UR11: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1394803730, i32 752679918} +; UR12: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 1463229177, i32 684254471} +; +; UR2: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR4: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; +; UR2: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 5} +; UR4: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2} +; UR10: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR11: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} +; UR12: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} +; +; UR2: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1022611260, i32 1124872388} +; UR4: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1531603292, i32 615880356} +; UR10: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1829762672, i32 317720976} +; UR11: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1846907894, i32 300575754} +; UR12: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1860963812, i32 286519836} +; +; UR4: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1038564635, i32 1108919013} +; UR10: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1656332913, i32 491150735} +; UR11: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1693034047, i32 454449601} +; UR12: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1723419551, i32 424064097} + +; UR4: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR10: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; UR11: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; UR12: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; +; UR4: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; For UR10, llvm.loop.estimated_trip_count is the same for both loops. 
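+; (The estimated_trip_count values follow 11 // count for the unrolled loop
+; and 11 % count for the remainder loop; 11 % 10 = 1 equals 11 // 10, hence
+; the shared node for UR10, while 11 % 11 = 0 and 11 % 12 = 11 below.)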
+; UR11: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} +; UR12: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll new file mode 100644 index 0000000000000..cde9d46ee8421 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll @@ -0,0 +1,68 @@ +; Test branch weight metadata, estimated trip count metadata, and block +; frequencies after partial loop unrolling without -unroll-runtime. + +; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \ +; RUN: FileCheck -check-prefix=CHECK %s + +; The -implicit-check-not options make sure that no additional labels or calls +; to @f show up. +; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \ +; RUN: -unroll-count=4 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-UR \ +; RUN: -implicit-check-not='{{^( *- )?[^ ;]*:}}' \ +; RUN: -implicit-check-not='call void @f' + +; CHECK: block-frequency-info: test +; CHECK: do.body: float = 10.0, + +; The sum should still be ~10. +; +; CHECK-UR: block-frequency-info: test +; CHECK-UR: - [[ENTRY:.*]]: +; CHECK-UR: - [[DO_BODY:.*]]: float = 2.9078, +; CHECK-UR: - [[DO_BODY_1:.*]]: float = 2.617, +; CHECK-UR: - [[DO_BODY_2:.*]]: float = 2.3553, +; CHECK-UR: - [[DO_BODY_3:.*]]: float = 2.1198, +; CHECK-UR: - [[DO_END:.*]]: + +declare void @f(i32) + +define void @test(i32 %n) { +; CHECK-UR-LABEL: define void @test(i32 %{{.*}}) { +; CHECK-UR: [[ENTRY]]: +; CHECK-UR: br label %[[DO_BODY]] +; CHECK-UR: [[DO_BODY]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_1]], !prof ![[#PROF:]] +; CHECK-UR: [[DO_BODY_1]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_2]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_2]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_3]], !prof ![[#PROF]] +; CHECK-UR: [[DO_BODY_3]]: +; CHECK-UR: call void @f +; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK-UR: [[DO_END]]: +; CHECK-UR: ret void + +entry: + br label %do.body + +do.body: + %i = phi i32 [ 0, %entry ], [ %inc, %do.body ] + %inc = add i32 %i, 1 + call void @f(i32 %i) + %c = icmp sge i32 %inc, %n + br i1 %c, label %do.end, label %do.body, !prof !0 + +do.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 9} + +; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9} +; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} +; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; 
LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5! + +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. + +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. 
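+; The reused !0 puts weight 1 on the backedge and 0 on the exit, so every
+; unrolled copy of the branch still describes a loop that never exits.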
+; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll index 0c52b5a0edef8..047360178aa06 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-exit-phi-scev-invalidation.ll @@ -188,7 +188,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: [[L_1_LCSSA_UNR:%.*]] = phi i32 [ poison, [[OUTER_HEADER]] ], [ [[L_1_LCSSA_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[INNER_1_IV_UNR:%.*]] = phi i64 [ [[X]], [[OUTER_HEADER]] ], [ [[INNER_1_IV_UNR_PH]], [[INNER_1_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 7 -; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF3]] +; CHECK-NEXT: br i1 [[TMP4]], label [[OUTER_MIDDLE:%.*]], label [[OUTER_HEADER_NEW:%.*]], !prof [[PROF6:![0-9]+]] ; CHECK: outer.header.new: ; CHECK-NEXT: br label [[INNER_1_HEADER:%.*]] ; CHECK: inner.1.header: @@ -232,7 +232,7 @@ define void @pr56286(i64 %x, ptr %src, ptr %dst, ptr %ptr.src) !prof !0 { ; CHECK-NEXT: store i32 [[L_1_7]], ptr [[DST]], align 8 ; CHECK-NEXT: [[INNER_1_IV_NEXT_7]] = add i64 [[INNER_1_IV]], 8 ; CHECK-NEXT: [[CMP_2_7:%.*]] = icmp sgt i64 [[INNER_1_IV_NEXT_6]], 0 -; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2_7]], label [[OUTER_MIDDLE_UNR_LCSSA:%.*]], label [[INNER_1_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: outer.middle.unr-lcssa: ; CHECK-NEXT: [[L_1_LCSSA_PH:%.*]] = phi i32 [ [[L_1_7]], [[INNER_1_LATCH_7]] ] ; CHECK-NEXT: br label [[OUTER_MIDDLE]] diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll index 26171990a2592..2f8f98d40e86f 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-branchweight.ll @@ -2,12 +2,24 @@ ;; Check that the remainder loop is properly assigned a branch weight for its latch branch. 
; CHECK-LABEL: @test( -; CHECK-LABEL: for.body: -; CHECK: br i1 [[COND1:%.*]], label %for.end.loopexit.unr-lcssa, label %for.body, !prof ![[#PROF:]], !llvm.loop ![[#LOOP:]] -; CHECK-LABEL: for.body.epil: -; CHECK: br i1 [[COND2:%.*]], label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !prof ![[#PROF2:]], !llvm.loop ![[#LOOP2:]] -; CHECK: ![[#PROF]] = !{!"branch_weights", i32 1, i32 2499} -; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 1, i32 1} +; CHECK-LABEL: entry: +; CHECK: [[FOR_BODY_PREHEADER:.*]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[FOR_BODY_PREHEADER_NEW]]: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: %add = add +; CHECK: %add.1 = add +; CHECK: %add.2 = add +; CHECK: %add.3 = add +; CHECK-NOT: %add.4 = add +; CHECK: br i1 %{{.*}}, label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]], label %for.body, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[FOR_BODY_EPIL_PREHEADER]], label %[[FOR_END_LOOPEXIT:.*]], !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[FOR_BODY_EPIL_PREHEADER]]: +; CHECK: br label %[[FOR_BODY_EPIL:.*]] +; CHECK: [[FOR_BODY_EPIL]]: +; CHECK: br i1 {{.*}}, label %[[FOR_BODY_EPIL]], label %[[FOR_END_LOOPEXIT_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i3 @test(ptr %a, i3 %n) { entry: @@ -31,3 +43,37 @@ for.end: } !0 = !{!"branch_weights", i32 1, i32 9999} + +; Original loop probability: p = 9999/(1+9999) = 0.9999 +; Original estimated trip count: (1+9999)/1 = 10000 +; Unroll count: 4 + +; Probability of >=3 iterations after first: p^3 = 0.9970003 =~ +; 2146839468 / (644180 + 2146839468). +; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 644180, i32 2146839468} + +; Probability of >=4 more iterations: p^4 = 0.99960006 =~ +; 2146624784 / (858864 + 2146624784). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 858864, i32 2146624784} + +; 10000//4 = 2500 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 2500} +; +; CHECK: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} + +; Probability of 1 to 3 more of 3 more remainder iterations: +; (p-p^4)/(1-p^4) = 0.749962497 =~ 1610532724 / (1610532724 + 536950924). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1610532724, i32 536950924} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^3)/(1-p^3) = 0.666633331 +; Frequency of third remainder iter: r3 = r2*(p-p^2)/(1-p^2) = 0.333299999 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3) = 0.499983332 =~ +; 1073706403 / (1073706403 + 1073777245). 
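+; (As with every !prof pair emitted here, the two weights are scaled to sum
+; to 2^31: 1073706403 + 1073777245 = 2^31, and 1073706403 / 2^31 =~ p'.)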
+; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1073706403, i32 1073777245} + +; 10000%4 = 0 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll index 492de063573be..ec7aba432b484 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll @@ -295,11 +295,12 @@ exit2.loopexit: ; COMMON-LABEL: {{^}}!0 = ; EPILOG: [[EPILOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} -; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 1, i32 127} -; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1, i32 7} -; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 3, i32 1} +; EPILOG: [[EPILOG_PROF_1]] = !{!"branch_weights", i32 326124004, i32 1821359644} +; EPILOG: [[EPILOG_PROF_2]] = !{!"branch_weights", i32 1856428066, i32 291055582} +; EPILOG: [[EPILOG_PROF_3]] = !{!"branch_weights", i32 1597681585, i32 549802063} -; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_LOOP]] = distinct !{[[EPILOG_LOOP]], [[EPILOG_TC:![0-9]+]], [[EPILOG_LOOP_1:![0-9]+]]} +; EPILOG: [[EPILOG_TC]] = !{!"llvm.loop.estimated_trip_count", i32 3} ; EPILOG: [[EPILOG_LOOP_1]] = !{!"llvm.loop.unroll.disable"} ; PROLOG: [[PROLOG_PROF_0]] = !{!"branch_weights", i32 1, i32 11} diff --git a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index 611ee5fb5807e..02f5bf932132e 100644 --- a/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/llvm/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -3,14 +3,27 @@ @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 ; CHECK-LABEL: @bar_prof -; CHECK: loop: -; CHECK: %mul = mul -; CHECK: %mul.1 = mul -; CHECK: %mul.2 = mul -; CHECK: %mul.3 = mul -; CHECK: br i1 %niter.ncmp.7, label %loop.end.unr-lcssa, label %loop, !prof [[PROF0:![0-9]+]] -; CHECK: loop.epil: -; CHECK: br i1 %epil.iter.cmp, label %loop.epil, label %loop.end.epilog-lcssa, !prof [[PROF1:![0-9]+]], !llvm.loop {{![0-9]+}} +; CHECK: entry: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]], !prof ![[#PROF_UR_GUARD:]] +; CHECK: [[ENTRY_NEW]]: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %mul = mul +; CHECK: %mul.1 = mul +; CHECK: %mul.2 = mul +; CHECK: %mul.3 = mul +; CHECK: %mul.4 = mul +; CHECK: %mul.5 = mul +; CHECK: %mul.6 = mul +; CHECK: %mul.7 = mul +; CHECK-NOT: %mul.8 = mul +; CHECK: br i1 %{{.*}}, label %[[LOOP_END_UNR_LCSSA:.*]], label %loop, !prof ![[#PROF_UR_LATCH:]], !llvm.loop ![[#LOOP_UR_LATCH:]] +; CHECK: [[LOOP_END_UNR_LCSSA]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL_PREHEADER]], label %loop.end, !prof ![[#PROF_RM_GUARD:]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK: br i1 %{{.*}}, label %[[LOOP_EPIL]], label %[[LOOP_END_EPILOG_LCSSA:.*]], !prof ![[#PROF_RM_LATCH:]], !llvm.loop ![[#LOOP_RM_LATCH:]] define i32 @bar_prof(ptr noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -60,5 +73,38 @@ loop.end: !1 = !{!"function_entry_count", i64 1} !2 = !{!"branch_weights", i32 1, i32 1000} -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 124} -; CHECK: [[PROF1]] = !{!"branch_weights", 
i32 3, i32 1} +; Original loop probability: p = 1000/(1+1000) = 0.999000999 +; Original estimated trip count: (1+1000)/1 = 1001 +; Unroll count: 8 + +; Probability of >=7 iterations after first: p^7 = 0.993027916 =~ +; 2132511214 / (14972434 + 2132511214). +; CHECK: ![[#PROF_UR_GUARD]] = !{!"branch_weights", i32 14972434, i32 2132511214} + +; Probability of >=8 more iterations: p^8 = 0.99203588 =~ +; 2130380833 / (17102815 + 2130380833). +; CHECK: ![[#PROF_UR_LATCH]] = !{!"branch_weights", i32 17102815, i32 2130380833} + +; 1001//8 = 125 +; CHECK: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_TC:]]} +; CHECK: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 125} + +; Probability of 1 to 7 more of 7 more remainder iterations: +; (p-p^8)/(1-p^8) = 0.874562282 =~ 1878108210 / (1878108210 + 269375438). +; CHECK: ![[#PROF_RM_GUARD]] = !{!"branch_weights", i32 1878108210, i32 269375438} + +; Frequency of first remainder iter: r1 = 1 +; Frequency of second remainder iter: r2 = r1*(p-p^7)/(1-p^7) = 0.856714143 +; Frequency of third remainder iter: r3 = r2*(p-p^6)/(1-p^6) = 0.713571429 +; Frequency of fourth remainder iter: r4 = r2*(p-p^5)/(1-p^5) = 0.570571715 +; Frequency of fifth remainder iter: r5 = r2*(p-p^4)/(1-p^4) = 0.427714858 +; Frequency of sixth remainder iter: r6 = r2*(p-p^3)/(1-p^3) = 0.285000715 +; Frequency of seventh remainder iter: r7 = r2*(p-p^2)/(1-p^2) = 0.142429143 +; Solve for loop probability that produces that frequency: f = 1/(1-p') => +; p' = 1-1/f = 1-1/(r1+r2+r3+r4+r5+r6+r7) = 0.749749875 =~ +; 1610075606 / (1610075606 + 537408042). +; CHECK: ![[#PROF_RM_LATCH]] = !{!"branch_weights", i32 1610075606, i32 537408042} + +; Remainder estimated trip count: 1001%8 = 1 +; CHECK: ![[#LOOP_RM_LATCH]] = distinct !{![[#LOOP_RM_LATCH]], ![[#LOOP_RM_TC:]], ![[#DISABLE:]]} +; CHECK: ![[#LOOP_RM_TC]] = !{!"llvm.loop.estimated_trip_count", i32 1} diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. 
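+; With both operands zero there is no probability to derive, so the checks
+; below verify that !0 simply survives unchanged on each unrolled copy of
+; the branch.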
+ +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index f16351720b20f..2f7e3568d5654 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -429,48 +429,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ] -; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]] -; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]] +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] +; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] +; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] +; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]] -; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label 
%[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; DEFAULT: [[PRED_STORE_IF]]: -; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] ; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] +; DEFAULT: [[PRED_STORE_IF28]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]] +; DEFAULT: [[PRED_STORE_CONTINUE29]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] +; DEFAULT: [[PRED_STORE_IF30]]: +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]] +; DEFAULT: [[PRED_STORE_CONTINUE31]]: +; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_IF32]]: -; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]] ; DEFAULT: [[PRED_STORE_CONTINUE33]]: -; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]] -; DEFAULT: [[PRED_STORE_IF34]]: -; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_IF36]]: -; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]] -; DEFAULT: [[PRED_STORE_CONTINUE37]]: -; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] +; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -613,63 +601,17 @@ exit: define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; 
COMMON-SAME: ptr [[DST:%.*]]) { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: br label %[[VECTOR_PH:.*]] -; COMMON: [[VECTOR_PH]]: -; COMMON-NEXT: br label %[[VECTOR_BODY:.*]] -; COMMON: [[VECTOR_BODY]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; COMMON: [[PRED_STORE_IF]]: -; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 -; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]] -; COMMON: [[PRED_STORE_CONTINUE]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; COMMON: [[PRED_STORE_IF1]]: -; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1 -; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; COMMON: [[PRED_STORE_CONTINUE2]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; COMMON: [[PRED_STORE_IF3]]: -; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2 -; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; COMMON: [[PRED_STORE_CONTINUE4]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] -; COMMON: [[PRED_STORE_IF5]]: -; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3 -; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; COMMON: [[PRED_STORE_CONTINUE6]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; COMMON: [[PRED_STORE_IF7]]: -; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4 -; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; COMMON: [[PRED_STORE_CONTINUE8]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; COMMON: [[PRED_STORE_IF9]]: -; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5 -; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; COMMON: [[PRED_STORE_CONTINUE10]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; COMMON: [[PRED_STORE_IF11]]: -; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6 -; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] -; COMMON: [[PRED_STORE_IF13]]: -; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 -; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] +; COMMON-NEXT: [[ENTRY:.*]]: +; COMMON-NEXT: br label %[[LOOP:.*]] +; COMMON: [[LOOP]]: +; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 +; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 +; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 +; COMMON-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; COMMON: [[EXIT]]: -; COMMON-NEXT: br label %[[SCALAR_PH:.*]] -; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br label %[[EXIT1:.*]] -; COMMON: [[EXIT1]]: ; COMMON-NEXT: ret void ; entry: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index dc52e644742e2..a49f089bd2085 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -150,10 +150,11 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) -; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] -; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] -; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -191,7 +192,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT14]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll index 5970608794b55..bea34e29e3530 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -16,7 +16,7 @@ ; CM: vector.ph: ; CM: CLONE ir<%a> = extractvalue ir<%sv> ; CM: CLONE ir<%b> = extractvalue ir<%sv> -; CM: WIDEN ir<%add> = add ir<%a>, ir<%b> +; CM: CLONE ir<%add> = add ir<%a>, ir<%b> ; CM: Successor(s): vector loop ; CM: LV: Scalar loop costs: 5. 
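; (The cost-model change above means the add of two loop-invariant
; extractvalue results is no longer widened: as the FORCED checks below show,
; it is now computed once as a scalar in the preheader and broadcast.)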
@@ -30,23 +30,22 @@ define void @test1(ptr %dst, {i64, i64} %sv) { ; FORCED-NEXT: br label %[[VECTOR_PH:.*]] ; FORCED: [[VECTOR_PH]]: ; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 -; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]] +; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 ; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer -; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]] ; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] ; FORCED: [[VECTOR_BODY]]: ; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4 +; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FORCED: [[MIDDLE_BLOCK]]: -; FORCED-NEXT: br [[EXIT:label %.*]] -; FORCED: [[SCALAR_PH:.*:]] +; FORCED-NEXT: br label %[[EXIT:.*]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void ; entry: br label %loop.body @@ -99,10 +98,11 @@ define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) { ; FORCED-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 4 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FORCED: [[MIDDLE_BLOCK]]: -; FORCED-NEXT: br [[EXIT:label %.*]] -; FORCED: [[SCALAR_PH:.*:]] +; FORCED-NEXT: br label %[[EXIT:.*]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void ; entry: br label %loop.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 56a1abd2384c8..f3d649b899686 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -59,20 +59,18 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP15:%.*]] = 
freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) @@ -114,27 +112,80 @@ exit: ret float %max.next } +; TODO: Could fold pairs of `fcmp uno` together. define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmin( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_0]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr 
[[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_1]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI2]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI3]], <4 x float> [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP19]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: [[RDX_MINMAX9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP25]], <4 x float> [[TMP26]]) +; CHECK-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP19]], true +; CHECK-NEXT: [[TMP31:%.*]] = and i1 [[CMP_N]], [[TMP30]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP27]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] 
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index d4f1227a38bda..1cc4c152649b4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -59,20 +59,18 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float 
@llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index bfee39eac0ae2..068f82c7db670 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -365,8 +365,8 @@ define void @invalid_legacy_cost(i64 %N, ptr %x) #0 { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = alloca i8, i64 0, align 16 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> [[TMP7]], ptr [[TMP6]], i32 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr ptr, ptr [[X]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x ptr> [[TMP8]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 199203a9f5cb0..1164778c19070 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 16 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index cfc6cc87a2a21..4b097ba2422e4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -271,69 +271,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @iv_trunc( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: 
[[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[FOR_BODY:.*]] ; PRED: [[FOR_BODY]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 
[[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 ; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]] ; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64 @@ -341,7 +283,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -437,101 +379,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @trunc_ivs_and_store( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] -; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]] -; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0 -; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]] -; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 -; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0 -; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false -; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0 -; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] -; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label 
%[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] -; PRED: [[PRED_STORE_IF2]]: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 -; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] -; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]] -; PRED: [[PRED_STORE_CONTINUE3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_IF6]]: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 -; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]] +; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]] ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: 
[[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -627,91 +489,12 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @ivs_trunc_and_ext( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP22:%.*]] = 
extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_IF5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 ; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 @@ -720,7 +503,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -842,7 +625,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP10:![0-9]+]] +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -855,7 +638,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll index d80fdd1ce7270..9dfb987bd24a6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll @@ -11,8 +11,6 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ] @@ -43,8 +41,8 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP13]], i32 3 ; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] ; CHECK: [[PRED_UREM_CONTINUE6]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP11]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP14]], %[[PRED_UREM_IF5]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP15]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP11]], %[[PRED_UREM_CONTINUE4]] ], [ [[TMP14]], %[[PRED_UREM_IF5]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll index ea0148952f51b..0a9494e4c7ade 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll @@ -10,8 +10,8 @@ define void @licm_replicate_call(double %x, ptr %dst) { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: 
[[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index 157b78704234a..3311cbc11881b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -64,11 +64,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] ; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2:%.*]], align 8 -; TFCOMMON-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3:[0-9]+]] -; TFCOMMON-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; TFCOMMON-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP5]], i32 1 +; TFCOMMON-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR2:[0-9]+]] +; TFCOMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 +; TFCOMMON-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; TFCOMMON-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) +; TFCOMMON-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP7]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFCOMMON-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFCOMMON: pred.store.if: @@ -79,7 +80,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 ; TFCOMMON-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE6]] ; TFCOMMON: pred.store.if1: -; TFCOMMON-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1 +; TFCOMMON-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0 ; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8 ; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]] ; TFCOMMON: pred.store.continue2: @@ -105,11 +106,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[PRED_STORE_CONTINUE9]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2:%.*]], align 8 -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3:[0-9]+]] -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0 -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double 
[[TMP9]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR2:[0-9]+]] +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0 +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP7]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFA_INTERLEAVE: pred.store.if: @@ -120,7 +122,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] ; TFA_INTERLEAVE: pred.store.if3: -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP22]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE5]] ; TFA_INTERLEAVE: pred.store.continue4: @@ -134,7 +136,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] ; TFA_INTERLEAVE: pred.store.if7: -; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 +; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]] ; TFA_INTERLEAVE: pred.store.continue8: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll new file mode 100644 index 0000000000000..fe7f43f7f4b02 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll @@ -0,0 +1,187 @@ +; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + +define void @wombat(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i8 %arg6) #0 { +; CHECK-LABEL: define void @wombat( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]], ptr [[ARG3:%.*]], ptr [[ARG4:%.*]], ptr [[ARG5:%.*]], i8 [[ARG6:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[ARG]], 0 +; CHECK-NEXT: br i1 [[ICMP]], label %[[BB7:.*]], label %[[BB25:.*]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[ARG]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], 
label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[ARG5]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[ZEXT]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[ZEXT]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]] +; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]] +; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; CHECK-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]] +; CHECK-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]] +; CHECK-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]] +; CHECK-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]] +; CHECK-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[ARG6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD28:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD28]] +; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META3]], !noalias [[META5]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META11:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD28]], [[WIDE_MASKED_LOAD28]] +; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD30]], [[TMP12]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META11]], !noalias [[META12]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB24:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[BB7]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD22:%.*]], %[[BB21:.*]] ] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GETELEMENTPTR]], align 1 +; CHECK-NEXT: [[ICMP9:%.*]] = icmp ult i8 [[LOAD]], [[ARG6]] +; CHECK-NEXT: br i1 [[ICMP9]], label %[[BB21]], label %[[BB10:.*]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GETELEMENTPTR11]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG3]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GETELEMENTPTR13]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG4]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD16:%.*]] = load i8, ptr [[GETELEMENTPTR15]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LOAD16]], [[LOAD14]] +; 
CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[LOAD12]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[GETELEMENTPTR11]], align 1 +; CHECK-NEXT: [[GETELEMENTPTR17:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG2]], i64 [[PHI]] +; CHECK-NEXT: [[LOAD18:%.*]] = load i8, ptr [[GETELEMENTPTR17]], align 1 +; CHECK-NEXT: [[MUL19:%.*]] = mul i8 [[LOAD14]], [[LOAD14]] +; CHECK-NEXT: [[ADD20:%.*]] = add i8 [[LOAD18]], [[MUL19]] +; CHECK-NEXT: store i8 [[ADD20]], ptr [[GETELEMENTPTR17]], align 1 +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[ADD22]] = add nuw nsw i64 [[PHI]], 1 +; CHECK-NEXT: [[ICMP23:%.*]] = icmp eq i64 [[ADD22]], [[ZEXT]] +; CHECK-NEXT: br i1 [[ICMP23]], label %[[BB24]], label %[[BB8]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: [[BB24]]: +; CHECK-NEXT: br label %[[BB25]] +; CHECK: [[BB25]]: +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp sgt i32 %arg, 0 + br i1 %icmp, label %bb7, label %bb25 + +bb7: ; preds = %bb + %zext = zext nneg i32 %arg to i64 + br label %bb8 + +bb8: ; preds = %bb21, %bb7 + %phi = phi i64 [ 0, %bb7 ], [ %add22, %bb21 ] + %getelementptr = getelementptr inbounds nuw i8, ptr %arg5, i64 %phi + %load = load i8, ptr %getelementptr, align 1 + %icmp9 = icmp ult i8 %load, %arg6 + br i1 %icmp9, label %bb21, label %bb10 + +bb10: ; preds = %bb8 + %getelementptr11 = getelementptr inbounds nuw i8, ptr %arg1, i64 %phi + %load12 = load i8, ptr %getelementptr11, align 1 + %getelementptr13 = getelementptr inbounds nuw i8, ptr %arg3, i64 %phi + %load14 = load i8, ptr %getelementptr13, align 1 + %getelementptr15 = getelementptr inbounds nuw i8, ptr %arg4, i64 %phi + %load16 = load i8, ptr %getelementptr15, align 1 + %mul = mul i8 %load16, %load14 + %add = add i8 %mul, %load12 + store i8 %add, ptr %getelementptr11, align 1 + %getelementptr17 = getelementptr inbounds nuw i8, ptr %arg2, i64 %phi + %load18 = load i8, ptr %getelementptr17, align 1 + %mul19 = mul i8 %load14, %load14 + %add20 = add i8 %load18, %mul19 + store i8 %add20, ptr %getelementptr17, align 1 + br label %bb21 + +bb21: ; preds = %bb10, %bb8 + %add22 = add nuw nsw i64 %phi, 1 + %icmp23 = icmp eq i64 %add22, %zext + br i1 %icmp23, label %bb24, label %bb8, !llvm.loop !0 + +bb24: ; preds = %bb21 + br label %bb25 + +bb25: ; preds = %bb24, %bb + ret void +} + +attributes #0 = { uwtable vscale_range(1,16) "aarch64_pstate_sm_body" "target-features"="+fp-armv8,+neon,+sme,+v8a,-fmv" } + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 16} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} +;. 
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META1]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META2]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META2]]} +; CHECK: [[META9]] = !{[[META7]]} +; CHECK: [[META10]] = !{[[META8]]} +; CHECK: [[META11]] = !{[[META6]]} +; CHECK: [[META12]] = !{[[META1]], [[META7]], [[META8]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]} +; CHECK: [[META14]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META15]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index d8f1a86c9ebda..5b9bd0997f2fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -182,313 +182,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = 
insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: ; CHECK-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: ; CHECK-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: ; CHECK-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label 
[[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: ; CHECK-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: ; CHECK-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 
1 +; CHECK-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: ; CHECK-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: ; CHECK-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: ; CHECK-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 
x i1> [[TMP16]], i32 11 ; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: ; CHECK-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: ; CHECK-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: ; CHECK-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-NEXT: 
[[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: ; CHECK-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK: pred.load.if29: +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK: pred.load.if37: -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr 
[[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: -; CHECK-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK: pred.load.if39: -; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK: pred.load.continue40: -; CHECK-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK: pred.load.if41: -; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK: pred.load.continue42: -; CHECK-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK: pred.load.if43: -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.continue44: -; CHECK-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK: pred.load.if45: -; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK: pred.load.continue46: -; CHECK-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK: pred.load.if47: -; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK: pred.load.continue48: -; CHECK-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], 
[[PRED_LOAD_IF47]] ] -; CHECK-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK: pred.load.if49: -; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK: pred.load.continue50: -; CHECK-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK: pred.load.if51: -; CHECK-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK: pred.load.continue52: -; CHECK-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK: pred.load.if53: -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK: pred.load.continue54: -; CHECK-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK: pred.load.if55: -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK: pred.load.continue56: -; CHECK-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK: pred.load.if57: -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK: pred.load.continue58: -; CHECK-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK: pred.load.if59: -; CHECK-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP170:%.*]] = load i8, 
ptr [[TMP169]], align 1 -; CHECK-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK: pred.load.continue60: -; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.if61: ; CHECK-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.continue62: -; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 26e630f969ef3..0ee6b52a2450b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x 
i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; 
CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; 
CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index b84763142b686..6ead2a4eecbe8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { 
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: @@ -991,313 +997,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; 
CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVE1: pred.load.if: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVE1: pred.load.continue: ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVE1: pred.load.if1: +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVE1: pred.load.continue2: ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVE1: pred.load.if3: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; 
CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVE1: pred.load.continue4: ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVE1: pred.load.if5: +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-INTERLEAVE1: pred.load.continue6: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-INTERLEAVE1: pred.load.if7: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-INTERLEAVE1: pred.load.continue8: ; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-INTERLEAVE1: pred.load.if9: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 
[[INDEX]], 5 ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-INTERLEAVE1: pred.load.continue10: ; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-INTERLEAVE1: pred.load.if11: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-INTERLEAVE1: pred.load.continue12: ; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-INTERLEAVE1: pred.load.if13: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-INTERLEAVE1: pred.load.continue14: ; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-INTERLEAVE1: pred.load.if15: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-INTERLEAVE1: pred.load.continue16:
; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVE1: pred.load.if17:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-INTERLEAVE1: pred.load.continue18:
; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVE1: pred.load.if19:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-INTERLEAVE1: pred.load.continue20:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVE1: pred.load.if21:
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-INTERLEAVE1: pred.load.continue22:
; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVE1: pred.load.if23:
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-INTERLEAVE1: pred.load.continue24:
; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVE1: pred.load.if25:
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-INTERLEAVE1: pred.load.continue26:
; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVE1: pred.load.if27:
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-INTERLEAVE1: pred.load.continue28:
; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVE1: pred.load.if29:
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVE1: pred.load.continue30:
-; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if31:
-; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVE1: pred.load.continue32:
-; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if33:
-; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVE1: pred.load.continue34:
-; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if35:
-; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVE1: pred.load.continue36:
-; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if37:
-; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVE1: pred.load.continue38:
-; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if39:
-; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVE1: pred.load.continue40:
-; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if41:
-; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVE1: pred.load.continue42:
-; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if43:
-; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVE1: pred.load.continue44:
-; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if45:
-; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVE1: pred.load.continue46:
-; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if47:
-; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVE1: pred.load.continue48:
-; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if49:
-; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVE1: pred.load.continue50:
-; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if51:
-; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVE1: pred.load.continue52:
-; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if53:
-; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVE1: pred.load.continue54:
-; CHECK-INTERLEAVE1-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if55:
-; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVE1: pred.load.continue56:
-; CHECK-INTERLEAVE1-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if57:
-; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVE1: pred.load.continue58:
-; CHECK-INTERLEAVE1-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVE1: pred.load.if59:
-; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVE1: pred.load.continue60:
-; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1: pred.load.if61:
; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1: pred.load.continue62:
-; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVE1: pred.load.continue30:
+; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1327,313 +1253,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-INTERLEAVED: pred.load.if:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK-INTERLEAVED: pred.load.continue:
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-INTERLEAVED: pred.load.if1:
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK-INTERLEAVED: pred.load.continue2:
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-INTERLEAVED: pred.load.if3:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK-INTERLEAVED: pred.load.continue4:
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-INTERLEAVED: pred.load.if5:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK-INTERLEAVED: pred.load.continue6:
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-INTERLEAVED: pred.load.if7:
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]]
; CHECK-INTERLEAVED: pred.load.continue8:
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-INTERLEAVED: pred.load.if9:
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]]
; CHECK-INTERLEAVED: pred.load.continue10:
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-INTERLEAVED: pred.load.if11:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]]
; CHECK-INTERLEAVED: pred.load.continue12:
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-INTERLEAVED: pred.load.if13:
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; CHECK-INTERLEAVED: pred.load.continue14:
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-INTERLEAVED: pred.load.if15:
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-INTERLEAVED: pred.load.continue16:
; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-INTERLEAVED: pred.load.if17:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-INTERLEAVED: pred.load.continue18:
; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-INTERLEAVED: pred.load.if19:
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-INTERLEAVED: pred.load.continue20:
; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-INTERLEAVED: pred.load.if21:
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-INTERLEAVED: pred.load.continue22:
; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-INTERLEAVED: pred.load.if23:
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-INTERLEAVED: pred.load.continue24:
; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-INTERLEAVED: pred.load.if25:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-INTERLEAVED: pred.load.continue26:
; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-INTERLEAVED: pred.load.if27:
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-INTERLEAVED: pred.load.continue28:
; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-INTERLEAVED: pred.load.if29:
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVED: pred.load.continue30:
-; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVED: pred.load.if31:
-; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVED: pred.load.continue32:
-; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVED: pred.load.if33:
-; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVED: pred.load.continue34:
-; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVED: pred.load.if35:
-; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVED: pred.load.continue36:
-; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVED: pred.load.if37:
-; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVED: pred.load.continue38:
-; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVED: pred.load.if39:
-; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVED: pred.load.continue40:
-; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVED: pred.load.if41:
-; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVED: pred.load.continue42:
-; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVED: pred.load.if43:
-; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVED: pred.load.continue44:
-; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVED: pred.load.if45:
-; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVED: pred.load.continue46:
-; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVED: pred.load.if47:
-; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVED: pred.load.continue48:
-; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVED: pred.load.if49:
-; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVED: pred.load.continue50:
-; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVED: pred.load.if51:
-; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVED: pred.load.continue52:
-; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVED: pred.load.if53:
-; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVED: pred.load.continue54:
-; CHECK-INTERLEAVED-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVED: pred.load.if55:
-; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVED: pred.load.continue56:
-; CHECK-INTERLEAVED-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVED: pred.load.if57:
-; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVED: pred.load.continue58:
-; CHECK-INTERLEAVED-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVED: pred.load.if59:
-; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVED: pred.load.continue60:
-; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.if61:
; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.continue62:
-; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVED: pred.load.continue30:
+; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1663,313 +1509,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-MAXBW: pred.load.if:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK-MAXBW: pred.load.continue:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK-MAXBW: pred.load.if1:
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK-MAXBW: pred.load.continue2:
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK-MAXBW: pred.load.if3:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK-MAXBW: pred.load.continue4:
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
; CHECK-MAXBW: pred.load.if5:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK-MAXBW: pred.load.continue6:
; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
; CHECK-MAXBW: pred.load.if7:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]]
; CHECK-MAXBW: pred.load.continue8:
; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
; CHECK-MAXBW: pred.load.if9:
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]]
; CHECK-MAXBW: pred.load.continue10:
; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
; CHECK-MAXBW: pred.load.if11:
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]]
; CHECK-MAXBW: pred.load.continue12:
; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
; CHECK-MAXBW: pred.load.if13:
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; CHECK-MAXBW: pred.load.continue14:
; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
; CHECK-MAXBW: pred.load.if15:
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; CHECK-MAXBW: pred.load.continue16:
; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
; CHECK-MAXBW: pred.load.if17:
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]]
; CHECK-MAXBW: pred.load.continue18:
; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
; CHECK-MAXBW: pred.load.if19:
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]]
; CHECK-MAXBW: pred.load.continue20:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
; CHECK-MAXBW: pred.load.if21:
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]]
; CHECK-MAXBW: pred.load.continue22:
; CHECK-MAXBW-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
; CHECK-MAXBW: pred.load.if23:
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]]
; CHECK-MAXBW: pred.load.continue24:
; CHECK-MAXBW-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
; CHECK-MAXBW: pred.load.if25:
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]]
; CHECK-MAXBW: pred.load.continue26:
; CHECK-MAXBW-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
; CHECK-MAXBW: pred.load.if27:
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]]
; CHECK-MAXBW: pred.load.continue28:
; CHECK-MAXBW-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
; CHECK-MAXBW: pred.load.if29:
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-MAXBW: pred.load.continue30:
-; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x
i32> -; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-MAXBW: pred.load.if31: -; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-MAXBW: pred.load.continue32: -; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-MAXBW: pred.load.if33: -; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-MAXBW: pred.load.continue34: -; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-MAXBW: pred.load.if35: -; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-MAXBW: pred.load.continue36: -; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-MAXBW: pred.load.if37: -; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-MAXBW: pred.load.continue38: -; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-MAXBW: pred.load.if39: -; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-MAXBW: pred.load.continue40: -; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; 
CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-MAXBW: pred.load.if41: -; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-MAXBW: pred.load.continue42: -; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-MAXBW: pred.load.if43: -; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-MAXBW: pred.load.continue44: -; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-MAXBW: pred.load.if45: -; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-MAXBW: pred.load.continue46: -; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-MAXBW: pred.load.if47: -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-MAXBW: pred.load.continue48: -; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-MAXBW: pred.load.if49: -; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-MAXBW: pred.load.continue50: -; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label 
[[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-MAXBW: pred.load.if51: -; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-MAXBW: pred.load.continue52: -; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-MAXBW: pred.load.if53: -; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-MAXBW: pred.load.continue54: -; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-MAXBW: pred.load.if55: -; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-MAXBW: pred.load.continue56: -; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-MAXBW: pred.load.if57: -; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-MAXBW: pred.load.continue58: -; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-MAXBW: pred.load.if59: -; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-MAXBW: pred.load.continue60: -; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.if61: ; 
CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-MAXBW: pred.load.continue62: -; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-MAXBW: pred.load.continue30: +; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 4636c1b63da82..ab593f6f8bb6b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: 
[[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1) -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: 
[[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; 
CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]] -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP8]] ; entry: br label %for.body @@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: 
[[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) @@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: 
[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1415,10 +1370,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]]) @@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]] -; CHECK-INTERLEAVE1-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x 
i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -1650,8 +1589,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -1685,7 +1623,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1803,7 +1741,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1915,7 +1853,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1891,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1906,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1982,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2020,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2035,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = 
icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2116,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br 
i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2162,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2204,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2326,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2426,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: 
br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2596,7 +2526,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll index d80178fde45d9..866487d2620ea 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll @@ -70,3 +70,28 @@ loop: exit: ret i32 %red.next } + +define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i16 [ 0, %entry ], [ %red.next, %loop ] + %l.a = load i8, ptr %A, align 1 + %a.ext = zext i8 %l.a to i16 + store i16 %a.ext, ptr %dst, align 2 + %l.b = load i8, ptr %B, align 1 + %b.ext = zext i8 %l.b to i16 + %add = add i16 %red, %b.ext + %add.1 = add i16 %add, %a.ext + %red.next = add i16 %add.1, %b.ext + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 1024 + br i1 %ec, label %loop, label %exit + +exit: + ret i16 %red.next +} + +attributes #0 = { "target-cpu"="grace" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 70685c1c3fe12..d4fb3d70c538d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -248,3 +248,30 @@ exit: ; preds = %for.body %result = add nsw i32 %result0, %result1 ret i32 %result } + +define i64 @loop_reduction_and_store_last_element(ptr %src, ptr writeonly %dst) { +; CHECK-LABEL: LV: Checking a loop in 'loop_reduction_and_store_last_element' +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %ptr = phi ptr [ %src, %entry ], [ %ptr.next, %loop ] + %iv.next = add nuw i32 %iv, 1 + %ptr.next = getelementptr i8, ptr 
%ptr, i64 1 + store ptr %ptr, ptr %dst, align 8 + %val = load i8, ptr %ptr, align 1 + %val.ext = zext i8 %val to i64 + %red.next = or i64 %red, %val.ext + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index 7f345133f51dd..68cfc659e1e94 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -660,6 +660,114 @@ exit: ret i32 %red } + +define i32 @test_or_reduction_with_stride_2(i32 %scale, ptr %src) { +; CHECK-LABEL: define i32 @test_or_reduction_with_stride_2( +; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[SCALE]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP29:%.*]] 
= getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [32 x i8], ptr [[SRC]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> poison, i8 [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP33]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP34]], i32 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP35]], i32 3 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP36]], i32 4 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP37]], i32 5 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP38]], i32 6 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP54]], i8 [[TMP39]], i32 7 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x i8> [[TMP55]], i8 [[TMP40]], i32 8 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP41]], i32 9 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x i8> [[TMP57]], i8 [[TMP42]], i32 10 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x i8> [[TMP58]], i8 [[TMP43]], i32 11 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP59]], i8 [[TMP44]], i32 12 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x i8> [[TMP60]], i8 [[TMP45]], i32 13 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP46]], i32 14 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x i8> [[TMP62]], i8 [[TMP47]], i32 15 +; CHECK-NEXT: [[TMP64:%.*]] = sext <16 x i8> [[TMP63]] to <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = mul <16 x i32> [[BROADCAST_SPLAT]], [[TMP64]] +; CHECK-NEXT: [[TMP66]] = or <16 x i32> [[TMP65]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP66]]) +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %reduction = phi i32 [ %reduction.next, %loop ], [ 0, %entry ] + %gep = getelementptr [32 x i8], ptr %src, i64 %iv + %load = load i8, ptr %gep, align 1 + %sext = sext i8 %load to i32 + %mul = mul i32 %scale, %sext + 
%reduction.next = or i32 %mul, %reduction + %iv.next = add i64 %iv, 2 + %cmp = icmp eq i64 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret i32 %reduction.next +} + attributes #0 = { "target-cpu"="neoverse-512tvb" } !0 = !{!1, !2, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll index f2e3b708d7820..61da142ad376c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -1,6 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 +; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF +; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1 target triple = "aarch64-unknown-linux" @@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, ; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-UF4: middle.block: ; +; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32 +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5 +; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 
[[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4 +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3) +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4 +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; entry: br label %for.body @@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl ; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-UF4: middle.block: ; +; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double( +; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> 
[[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00) +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1 +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]] +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: middle.block: +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -243,14 +334,3 @@ for.end: attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" } -;. -; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -;. 
-; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll index 0c6a490ddf4ba..eceda0897b174 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll @@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]] ; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]] -; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll new file mode 100644 index 0000000000000..bba7d058d6637 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll @@ -0,0 +1,694 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx" + +define void @test_2xi64_matching_zext_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_matching_zext_interleave_group( +; VF2-SAME: ptr 
noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_zext_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_matching_sext_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void 
@test_2xi64_matching_sext_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_sext_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = sext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = sext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) 
{ +; VF2-LABEL: define void @test_2xi64_mismatching_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = sext i32 %l.0 to i64 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = 
icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_matching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_matching_cast_add_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2) +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_matching_cast_add_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2) +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = zext i32 %l.0 to i64 + %add.0 = add i64 %ext.0, 2 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %add.0, ptr %dst.0, align 8 + 
%idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + %add.1 = add i64 %ext.1, 2 + store i64 %add.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2) +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2) +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP5]], splat (i64 2) +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP6:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %ext.0 = sext i32 %l.0 to i64 + %add.0 = add i64 %ext.0, 2 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %add.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + %ext.1 = zext i32 %l.0 to i64 + %add.1 = add i64 %ext.1, 2 + store i64 %add.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_add_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_add_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_add_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], 
align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32 , ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = add i32 %l.0, 2 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mismatching_add_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64> +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 
[[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64> +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = sub i32 %l.0, 2 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_add_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = add i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = add i32 %l.0, 2 + %ext.1 = sext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(ptr noalias %dst, ptr %src) { +; VF2-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group( +; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF2-NEXT: [[TMP5:%.*]] = sub <2 x i32> splat (i32 2), [[WIDE_LOAD]] +; VF2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64> +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2:
[[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +; VF4-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group( +; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2) +; VF4-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]] +; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i32> splat (i32 2), [[WIDE_LOAD]] +; VF4-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64> +; VF4-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; VF4-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] +; VF4: [[EXIT]]: +; VF4-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %idx.0 = shl nsw i64 %iv, 1 + %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv + %l.0 = load i32, ptr %gep.src.0, align 8 + %add.0 = sub i32 %l.0, 2 + %ext.0 = zext i32 %add.0 to i64 + %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0 + store i64 %ext.0, ptr %dst.0, align 8 + %idx.1 = add i64 %idx.0, 1 + %add.1 = sub i32 2, %l.0 + %ext.1 = zext i32 %add.1 to i64 + %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1 + store i64 %ext.1, ptr %dst.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll new file mode 100644 index 0000000000000..23bc21a49a8b3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll @@ -0,0 +1,659 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx" + +define void @test_2xi64_mul_add(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br
label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mixed_opcodes1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mixed_opcodes1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> 
[[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = xor <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = xor i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mixed_opcodes2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mixed_opcodes2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = xor <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = xor i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + store i64 %add.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + store i64 %add.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + 
%l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 2 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_ops1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_ops1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 3) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 3 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_ops2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_ops2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3) +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2) +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, 3 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 %mul.1, 2 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_sub_mismatched_op_order(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_sub_mismatched_op_order( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> 
[[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> splat (i64 2), [[TMP5]] +; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %sub.0 = sub i64 %mul.0, 2 + store i64 %sub.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %sub.1 = sub i64 2, %mul.1 + store i64 %sub.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 
[[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor_mismatched_opcodes1(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_opcodes1( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + 
%mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = sub i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_2xi64_mul_add_xor_mismatched_opcodes2(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_opcodes2( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]] +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = mul <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, %l.0 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = mul i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret 
void +} + +define void @test_2xi64_mul_add_xor_mismatched_ops(ptr noalias %data, ptr noalias %factor) { +; VF2-LABEL: define void @test_2xi64_mul_add_xor_mismatched_ops( +; VF2-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; VF2-NEXT: [[ENTRY:.*:]] +; VF2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2: [[VECTOR_PH]]: +; VF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2: [[VECTOR_BODY]]: +; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1 +; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2> +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3> +; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3) +; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2) +; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]] +; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]] +; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2) +; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]] +; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> +; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF2: [[MIDDLE_BLOCK]]: +; VF2-NEXT: br label %[[EXIT:.*]] +; VF2: [[EXIT]]: +; VF2-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv + %l.factor = load i64, ptr %arrayidx, align 8 + %idx.0 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %idx.0 + %l.0 = load i64, ptr %data.0, align 8 + %mul.0 = mul i64 %l.factor, 3 + %add.0 = add i64 %mul.0, 2 + %xor.0 = xor i64 4, %add.0 + store i64 %xor.0, ptr %data.0, align 8 + %idx.1 = or disjoint i64 %idx.0, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %idx.1 + %l.1 = load i64, ptr %data.1, align 8 + %mul.1 = mul i64 %l.factor, %l.1 + %add.1 = add i64 %mul.1, 2 + %xor.1 = xor i64 4, %add.1 + store i64 %xor.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 49f663f5703b6..0c3b987a74ece 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -1,12 +1,12 @@ ; REQUIRES: asserts -; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 
-disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" ; Tests for printing VPlans that are enabled under AArch64 -define i32 @print_partial_reduction(ptr %a, ptr %b) { +define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+dotprod" { ; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF @@ -69,62 +69,107 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' { +; CHECK-NEXT: Live-in ir<1024> = vector-trip-count ; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.main.loop.iter.check> +; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.main.loop.iter.check>: -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<vector.ph>: -; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4> +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%1> = reduction-start-vector ir<0>, ir<0>, ir<4> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4) -; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]> +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%1>, ir<%add> (VF scaled by 1/4) +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> -; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> ; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> ; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 ; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> -; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<16> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> -; CHECK-NEXT: EMIT branch-on-cond ir<true> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-NEXT: EMIT vp<%3> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: Successor(s): ir-bb<exit> ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block) +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%3> from middle.block) ; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry 
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0 + +exit: + ret i32 %add +} + +; Test that we also get VPExpressions when there is predication. +define i32 @print_partial_reduction_predication(ptr %a, ptr %b, i64 %N) "target-features"="+sve" { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in ir<%N> = original trip-count ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%6> = resume-for-epilogue vp<%vec.epilog.resume.val> -; CHECK-NEXT: Successor(s): ir-bb<for.body> +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<for.body>: -; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<[[EP_MERGE]]> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv -; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 -; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 -; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv -; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 -; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 -; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a -; CHECK-NEXT: IR %add = add i32 %mul, %accum -; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 -; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 -; CHECK-NEXT: No successors +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%4> = reduction-start-vector ir<0>, ir<0>, ir<4> +; CHECK-NEXT: EMIT vp<%5> = TC > VF ? 
TC - VF : 0 ir<%N> +; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0> +; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, ir<%N>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%4>, vp<%11> (VF scaled by 1/4) +; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%8> +; CHECK-NEXT: vp<%9> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%9>, vp<%7> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%8> +; CHECK-NEXT: vp<%10> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%10>, vp<%7> +; CHECK-NEXT: EXPRESSION vp<%11> = vp<%7> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32), <badref>) +; CHECK-NEXT: EMIT vp<%index.next> = add vp<%6>, vp<%1> +; CHECK-NEXT: EMIT vp<%12> = VF * Part + vp<%6> +; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%12>, vp<%5>, ir<1> +; CHECK-NEXT: EMIT vp<%13> = not vp<%active.lane.mask.next> +; CHECK-NEXT: EMIT branch-on-cond vp<%13> +; CHECK-NEXT: No successors ; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%15> = compute-reduction-result ir<%accum>, vp<%11> +; CHECK-NEXT: Successor(s): ir-bb<exit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%15> from middle.block) +; CHECK-NEXT: No successors entry: br label %for.body @@ -140,9 +185,16 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %for.body + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !1 exit: ret i32 %add } + + +!0 = distinct !{!0, !2, !3} +!1 = distinct !{!1, !2, !4} +!2 = !{!"llvm.loop.interleave.count", i32 1} +!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} +!4 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 6ea075f76aed4..83be0708774f1 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -181,178 +181,23 @@ for.cond.cleanup: define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] -; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 
x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
-; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
-; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
-; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
-; DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2)
-; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]]
-; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; DEFAULT: [[PRED_STORE_IF]]:
-; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]]
-; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0
-; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
-; DEFAULT: [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; DEFAULT: [[PRED_STORE_IF6]]:
-; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
-; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
-; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]]
-; DEFAULT: [[PRED_STORE_CONTINUE7]]:
-; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
-; DEFAULT: [[PRED_STORE_IF8]]:
-; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
-; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
-; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]]
-; DEFAULT: [[PRED_STORE_CONTINUE9]]:
-; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
-; DEFAULT: [[PRED_STORE_IF10]]:
-; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
-; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
-; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
-; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]]
-; DEFAULT: [[PRED_STORE_CONTINUE11]]:
-; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
-; DEFAULT: [[PRED_STORE_IF12]]:
-; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4
-; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
-; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
-; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]]
-; DEFAULT: [[PRED_STORE_CONTINUE13]]:
-; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
-; DEFAULT: [[PRED_STORE_IF14]]:
-; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5
-; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
-; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
-; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
-; DEFAULT: [[PRED_STORE_CONTINUE15]]:
-; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
-; DEFAULT: [[PRED_STORE_IF16]]:
-; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6
-; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
-; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
-; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
-; DEFAULT: [[PRED_STORE_CONTINUE17]]:
-; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
-; DEFAULT: [[PRED_STORE_IF18]]:
-; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7
-; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
-; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
-; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
-; DEFAULT: [[PRED_STORE_CONTINUE19]]:
-; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
-; DEFAULT: [[PRED_STORE_IF20]]:
-; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
-; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
-; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
-; DEFAULT: [[PRED_STORE_CONTINUE21]]:
-; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
-; DEFAULT: [[PRED_STORE_IF22]]:
-; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9
-; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
-; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
-; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
-; DEFAULT: [[PRED_STORE_CONTINUE23]]:
-; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
-; DEFAULT: [[PRED_STORE_IF24]]:
-; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10
-; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]]
-; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10
-; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
-; DEFAULT: [[PRED_STORE_CONTINUE25]]:
-; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
-; DEFAULT: [[PRED_STORE_IF26]]:
-; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11
-; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]]
-; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11
-; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]]
-; DEFAULT: [[PRED_STORE_CONTINUE27]]:
-; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
-; DEFAULT: [[PRED_STORE_IF28]]:
-; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12
-; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]]
-; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12
-; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
-; DEFAULT: [[PRED_STORE_CONTINUE29]]:
-; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
-; DEFAULT: [[PRED_STORE_IF30]]:
-; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13
-; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]]
-; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13
-; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
-; DEFAULT: [[PRED_STORE_CONTINUE31]]:
-; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
-; DEFAULT: [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14
-; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]]
-; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14
-; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
-; DEFAULT: [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
+; DEFAULT: [[FOR_BODY]]:
+; DEFAULT-NEXT: [[TMP69:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[TMP69]] to i8
+; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]]
+; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1
+; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]]
+; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]]
+; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2
+; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]]
+; DEFAULT-NEXT: [[TMP71:%.*]] = add i8 [[ADD]], [[MUL9]]
 ; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]]
-; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15
 ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
-; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP69]], 1
+; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15
+; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT: ret void
 ;
@@ -449,7 +294,7 @@ define void @dont_vectorize_with_minsize() {
 ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
@@ -555,7 +400,7 @@ define void @vectorization_forced() {
 ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT: [[FOR_COND_CLEANUP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index ac9682a09bec1..a22f72fe929d1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -69,51 +69,51 @@ exit:
 define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK: [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 16 x i32> [[TMP0]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[TMP16]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP0]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP9]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 16 x i64> [[TMP9]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i64 [[TMP11]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
 ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
@@ -121,9 +121,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i8 [[R]]
 ;
 entry:
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 01b4502308c95..ebd80b2c2af4d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -276,16 +276,12 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
@@ -304,14 +300,16 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
+; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -357,16 +355,12 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
@@ -385,14 +379,16 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
+; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -575,15 +571,10 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 16 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 -128)
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[TMP16]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1), i32 [[TMP12]])
 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[WIDE_LOAD]]
 ; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[PREDPHI]], ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
@@ -648,75 +639,50 @@ for.end:
 define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 12, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 12, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 12, [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[Y:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[X:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = add i16 -12, [[DOTCAST]]
-; CHECK-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT: [[TMP5:%.*]] = add i8 -12, [[DOTCAST5]]
-; CHECK-NEXT: [[TMP6:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i8> splat (i8 1), <vscale x 2 x i8> [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i8> [[TMP8]], splat (i8 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i8> splat (i8 -12), [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP3]] to i8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[TMP10]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[C:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[X:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i8> @llvm.stepvector.nxv8i8()
+; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 8 x i8> [[TMP1]], splat (i8 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i8> splat (i8 -12), [[TMP2]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i8> [[VEC_IND]], [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[TMP11]] to <vscale x 2 x i16>
-; CHECK-NEXT: [[TMP13:%.*]] = sdiv <vscale x 2 x i16> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[TMP14]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i8> [[VEC_IND]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT5]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; CHECK-NEXT: [[TMP15:%.*]] = icmp uge <vscale x 8 x i32> [[TMP5]], [[BROADCAST_SPLAT8]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> [[BROADCAST_SPLAT2]], <vscale x 8 x i8> splat (i8 1), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 8 x i8> [[VEC_IND]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 8 x i8> [[TMP9]] to <vscale x 8 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> [[BROADCAST_SPLAT4]], <vscale x 8 x i16> splat (i16 1), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP12:%.*]] = sdiv <vscale x 8 x i16> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> [[TMP13]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i8> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[TMP17]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 2 x i32> [[PREDPHI]], i32 [[TMP18]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ]
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]]
-; CHECK: then:
-; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]]
-; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16
-; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]]
-; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0
-; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP15]], i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0
+; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = extractelement <vscale x 8 x i32> [[PREDPHI]], i64 [[TMP17]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[MERGE_LCSSA]]
 ;
 ; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors(
@@ -727,10 +693,8 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i64 0
 ; FIXED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
-; FIXED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; FIXED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
-; FIXED-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]]
-; FIXED-NEXT: [[TMP1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]]
+; FIXED-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]]
+; FIXED-NEXT: [[TMP1:%.*]] = select i1 [[C]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]]
 ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; FIXED: vector.body:
 ; FIXED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -739,7 +703,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
 ; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
 ; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i16> [[TMP3]], [[TMP1]]
 ; FIXED-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
-; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; FIXED-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; FIXED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 7eb3d7fc5a36d..0083da77dfea3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -7,55 +7,54 @@ target triple = "riscv64-unknown-linux-gnu"
 define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 0, i32 [[TMP5]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[VP_OP_LOAD]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[VP_OP_LOAD]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP15]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP8]], i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i64> [[VP_OP_LOAD]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2
+; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i64> [[VECTOR_RECUR]], i32 [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP20]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RES]]
+; CHECK-NEXT: ret i64 [[TMP26]]
 ;
 entry:
 br label %loop
@@ -81,5 +80,4 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl128b,+zvl256b" }
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 1c6954c187e5f..212a5c99676f4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -40,7 +40,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
 ; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT1]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
 ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP10]]
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 8ef53cade01ac..345f6f632158a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -295,8 +295,7 @@ define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) {
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
 ; CHECK-NEXT: [[TMP1]] = mul <8 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> [[TMP1]])
 ; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 89819f2be4967..1cbec47d72203 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-LABEL: @foo4(
 ; RV32-NEXT: entry:
 ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; RV32: vector.scevcheck:
+; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624)
+; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]]
+; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]]
+; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]]
+; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624)
+; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]]
+; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]]
+; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]]
+; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]]
+; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]]
 ; RV32: vector.memcheck:
-; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880
 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
-; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752
-; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
-; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880
+; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752
+; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
 ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
 ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
 ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
 ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
-; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV32: vector.ph:
 ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
@@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
 ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
 ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]]
 ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
 ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
 ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]]
 ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; RV32: middle.block:
 ; RV32-NEXT: br label [[FOR_END:%.*]]
 ; RV32: scalar.ph:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
 ; RV32-NEXT: br label [[FOR_BODY:%.*]]
 ; RV32: for.body:
-; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
 ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
@@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32: for.inc:
 ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
 ; RV32: for.end:
 ; RV32-NEXT: ret void
 ;
@@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64: for.inc:
 ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
 ; RV64: for.end:
 ; RV64-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 3c90908b0a08f..e09284f26f6db 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -71,7 +71,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -115,7 +115,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]])
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -159,7 +159,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -199,7 +199,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
@@ -224,43 +224,37 @@ for.end:
 define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT: br label [[ENTRY:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
 ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[TMP5]], [[IV]]
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
-; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
-; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[V]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[V_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP12]]
 ;
 entry:
 br label %for.body
@@ -299,7 +293,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index c7ba826295de8..a89435f4b24e3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -400,61 +400,54 @@ for.end: define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define i32 @FOR_reduction( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { -; IF-EVL-NEXT: [[ENTRY:.*]]: -; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP9]], 2 -; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] -; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL-NEXT: [[ENTRY:.*:]] +; IF-EVL-NEXT: br label %[[VECTOR_PH:.*]] ; IF-EVL: [[VECTOR_PH]]: ; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] -; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4 ; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]] ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP9]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; IF-EVL-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP23:%.*]] = icmp uge <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] -; IF-EVL-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1) +; IF-EVL-NEXT: [[WIDE_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP9]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], [[WIDE_LOAD]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] -; IF-EVL-NEXT: store <vscale x 4 x 
i32> [[TMP11]], ptr [[TMP12]], align 4 -; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]] -; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP11]], ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]]) +; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[INDVARS]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP23]], i1 true) +; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[TMP28]], 1 +; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 +; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 0 +; IF-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4 ; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP16]] -; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]] -; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] -; IF-EVL: [[FOR_BODY]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i32> [[VECTOR_RECUR]], i32 [[TMP16]] +; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP28]], 0 +; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP21]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: -; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], 
%[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]] ; ; NO-VP-LABEL: define i32 @FOR_reduction( @@ -570,7 +563,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[FOR_END]]: @@ -662,8 +655,7 @@ for.end: ; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ;. ; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll new file mode 100644 index 0000000000000..d4d7d398185a1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/transform-narrow-interleave-to-widen-memory.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s | FileCheck -check-prefix=CHECK %s +; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v -S %s -prefer-predicate-over-epilogue=scalar-epilogue | FileCheck -check-prefix=EPILOGUE %s + +define void @load_store_interleave_group(ptr noalias %data) { +; CHECK-LABEL: define void @load_store_interleave_group( +; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]] +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VP_LOAD]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw 
nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 4 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; EPILOGUE-LABEL: define void @load_store_interleave_group( +; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { +; EPILOGUE-NEXT: [[ENTRY:.*]]: +; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; EPILOGUE: [[VECTOR_PH]]: +; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] +; EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; EPILOGUE-NEXT: br label %[[VECTOR_BODY:.*]] +; EPILOGUE: [[VECTOR_BODY]]: +; EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; EPILOGUE-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 1 +; EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP4]] +; EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; EPILOGUE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; EPILOGUE: [[MIDDLE_BLOCK]]: +; EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; EPILOGUE: [[SCALAR_PH]]: +; EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; EPILOGUE-NEXT: br label %[[LOOP:.*]] +; EPILOGUE: [[LOOP]]: +; EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; EPILOGUE-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] +; EPILOGUE-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: store i64 [[L_0]], ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 +; EPILOGUE-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]] +; EPILOGUE-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: store i64 [[L_1]], ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 +; EPILOGUE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; EPILOGUE: [[EXIT]]: +; EPILOGUE-NEXT: 
ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul.2 = shl nsw i64 %iv, 1 + %data.0 = getelementptr inbounds i64, ptr %data, i64 %mul.2 + %l.0 = load i64, ptr %data.0, align 8 + store i64 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.2, 1 + %data.1 = getelementptr inbounds i64, ptr %data, i64 %add.1 + %l.1 = load i64, ptr %data.1, align 8 + store i64 %l.1, ptr %data.1, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +define void @load_store_interleave_group_i32(ptr noalias %data) { +; CHECK-LABEL: define void @load_store_interleave_group_i32( +; CHECK-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[EVL_BASED_IV]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP1]] +; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP0]], 4 +; CHECK-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL]]) +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VP_LOAD]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVE_EVL1:%.*]] = mul nuw nsw i32 [[TMP0]], 4 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[INTERLEAVED_VEC]], ptr align 8 [[TMP2]], <vscale x 16 x i1> splat (i1 true), i32 [[INTERLEAVE_EVL1]]) +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; EPILOGUE-LABEL: define void @load_store_interleave_group_i32( +; EPILOGUE-SAME: ptr noalias [[DATA:%.*]]) #[[ATTR0]] { +; EPILOGUE-NEXT: [[ENTRY:.*]]: +; EPILOGUE-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]] +; EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; EPILOGUE: [[VECTOR_PH]]: +; EPILOGUE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; EPILOGUE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] +; EPILOGUE-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] +; EPILOGUE-NEXT: br label %[[VECTOR_BODY:.*]] +; EPILOGUE: [[VECTOR_BODY]]: +; EPILOGUE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; EPILOGUE-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 2 +; EPILOGUE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[TMP4]] +; EPILOGUE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP5]], align 8 +; EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; EPILOGUE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; EPILOGUE: [[MIDDLE_BLOCK]]: +; EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]] +; EPILOGUE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; EPILOGUE: [[SCALAR_PH]]: +; EPILOGUE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; EPILOGUE-NEXT: br label %[[LOOP:.*]] +; EPILOGUE: [[LOOP]]: +; EPILOGUE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; EPILOGUE-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 2 +; EPILOGUE-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[MUL_2]] +; EPILOGUE-NEXT: [[L_0:%.*]] = load i32, ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: store i32 [[L_0]], ptr [[DATA_0]], align 8 +; EPILOGUE-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 +; EPILOGUE-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_1]] +; EPILOGUE-NEXT: [[L_1:%.*]] = load i32, ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: store i32 [[L_1]], ptr [[DATA_1]], align 8 +; EPILOGUE-NEXT: [[ADD_2:%.*]] = add i64 [[MUL_2]], 2 +; EPILOGUE-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_2]] +; EPILOGUE-NEXT: [[L_2:%.*]] = load i32, ptr [[DATA_2]], align 8 +; EPILOGUE-NEXT: store i32 [[L_2]], ptr [[DATA_2]], align 8 +; EPILOGUE-NEXT: [[ADD_3:%.*]] = add i64 [[MUL_2]], 3 +; EPILOGUE-NEXT: [[DATA_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[ADD_3]] +; EPILOGUE-NEXT: [[L_3:%.*]] = load i32, ptr [[DATA_3]], align 8 +; EPILOGUE-NEXT: store i32 [[L_3]], ptr [[DATA_3]], align 8 +; EPILOGUE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; EPILOGUE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 +; EPILOGUE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; EPILOGUE: [[EXIT]]: +; EPILOGUE-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul.4 = shl nsw i64 %iv, 2 + %data.0 = getelementptr inbounds i32, ptr %data, i64 %mul.4 + %l.0 = load i32, ptr %data.0, align 8 + store i32 %l.0, ptr %data.0, align 8 + %add.1 = or disjoint i64 %mul.4, 1 + %data.1 = getelementptr inbounds i32, ptr %data, i64 %add.1 + %l.1 = load i32, ptr %data.1, align 8 + store i32 %l.1, ptr %data.1, align 8 + %add.2 = add i64 %mul.4, 2 
+ %data.2 = getelementptr inbounds i32, ptr %data, i64 %add.2 + %l.2 = load i32, ptr %data.2, align 8 + store i32 %l.2, ptr %data.2, align 8 + %add.3 = add i64 %mul.4, 3 + %data.3 = getelementptr inbounds i32, ptr %data, i64 %add.3 + %l.3 = load i32, ptr %data.3, align 8 + store i32 %l.3, ptr %data.3, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 1e21c753840e9..3f404daef6965 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -109,44 +109,38 @@ for.end: define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) { ; SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SCALABLE-NEXT: [[ENTRY:.*]]: -; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SCALABLE-NEXT: [[ENTRY:.*:]] +; SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; SCALABLE: [[VECTOR_PH]]: -; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]] ; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]]) +; SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 
[[TMP0]] to i64 +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: -; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; SCALABLE: [[SCALAR_PH]]: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; SCALABLE: [[FOR_BODY]]: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 -; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true) +; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 0 +; SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; SCALABLE-NEXT: ret i64 [[TMP12]] ; ; FIXEDLEN-LABEL: define i64 @uniform_load_outside_use( ; FIXEDLEN-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { @@ -184,44 +178,38 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; ; TF-SCALABLE-LABEL: define i64 @uniform_load_outside_use( ; TF-SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; TF-SCALABLE-NEXT: [[ENTRY:.*]]: -; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; TF-SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] -; TF-SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; TF-SCALABLE-NEXT: [[ENTRY:.*:]] +; TF-SCALABLE-NEXT: br label %[[VECTOR_PH:.*]] ; TF-SCALABLE: [[VECTOR_PH]]: -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] -; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32() +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]] ; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8 -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[ARRAYIDX]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP0]]) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP5]], [[IV]] +; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: -; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] -; TF-SCALABLE: [[SCALAR_PH]]: -; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]] -; TF-SCALABLE: [[FOR_BODY]]: -; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; TF-SCALABLE-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8 -; TF-SCALABLE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] -; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8 -; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true) +; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 0 +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[LAST_ACTIVE_LANE]] +; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: -; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 
[ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ] -; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]] +; TF-SCALABLE-NEXT: ret i64 [[TMP12]] ; entry: br label %for.body @@ -269,7 +257,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -350,7 +338,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -399,7 +387,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -457,7 +445,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -499,7 +487,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -557,7 +545,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: 
[[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -608,7 +596,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -679,7 +667,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -731,7 +719,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -812,7 +800,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: @@ -860,7 +848,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]] ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: [[MIDDLE_BLOCK]]: ; SCALABLE-NEXT: br label %[[FOR_END:.*]] ; SCALABLE: [[FOR_END]]: @@ -918,7 +906,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]] ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[FOR_END]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll deleted file mode 100644 index c05878995f474..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll +++ /dev/null @@ -1,511 +0,0 @@ -; REQUIRES: asserts - -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s - -define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.smax(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; 
IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. = tail call i32 @llvm.smax.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.smin(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.smin.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.umax(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.umax.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]] -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.umin(ir<[[LD1]]>, ir<[[LD2]]>) -; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv - %1 = load i32, ptr %gep3, align 4 - %. 
= tail call i32 @llvm.umin.i32(i32 %0, i32 %1) - %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %., ptr %gep11, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[CTLZ:%.+]]> = call llvm.ctlz(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[CTLZ]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %1 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %0, i1 true) - %gep3 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %1, ptr %gep3, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_cttz(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: 
EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[CTTZ:%.+]]> = call llvm.cttz(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[CTTZ]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %1 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %0, i1 true) - %gep3 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %1, ptr %gep3, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_lrint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[LRINT:%.+]]> = call llvm.lrint(ir<[[FPEXT]]>) -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LRINT]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; 
IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %1 = tail call i64 @llvm.lrint.i64.f64(double %conv2) - %conv3 = trunc i64 %1 to i32 - %gep5 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv3, ptr %gep5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_llrint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[LLRINT:%.+]]> = call llvm.llrint(ir<[[FPEXT]]>) -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LLRINT]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %1 = tail call i64 @llvm.llrint.i64.f64(double %conv2) - %conv3 = trunc i64 %1 to i32 - %gep5 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv3, ptr %gep5, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_abs(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: 
Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ABS:%.+]]> = call llvm.abs(ir<[[LD1]]>, ir<true>) -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[ABS]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %cond = tail call i32 @llvm.abs.i32(i32 %0, i1 true) - %gep9 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %cond, ptr %gep9, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -declare i32 @llvm.smax.i32(i32, i32) -declare i32 @llvm.smin.i32(i32, i32) -declare i32 @llvm.umax.i32(i32, i32) -declare i32 @llvm.umin.i32(i32, i32) -declare i32 @llvm.ctlz.i32(i32, i1 immarg) -declare i32 @llvm.cttz.i32(i32, i1 immarg) -declare i64 @llvm.lrint.i64.f64(double) -declare i64 @llvm.llrint.i64.f64(double) -declare i32 @llvm.abs.i32(i32, i1 immarg) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll deleted file mode 100644 index 8d3fe484e6468..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll +++ /dev/null @@ -1,576 +0,0 @@ -; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ -; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s - -define void @vp_sext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; 
IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[SEXT:%.+]]> = sext ir<[[LD1]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[SEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } -; IF-EVL-NEXT: Successor(s): middle.block - - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = sext i32 %0 to i64 - %gep4 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_zext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[ZEXT:%.+]]> = zext ir<[[LD1]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[ZEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; 
IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = zext i32 %0 to i64 - %gep4 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_trunc(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[TRUNC:%.+]]> = trunc ir<[[LD1]]> to i16 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = trunc i32 %0 to i16 - %gep4 = getelementptr inbounds i16, ptr %a, i64 %iv - store i16 %conv2, ptr %gep4, align 2 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fpext(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original 
trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPEXT:%.+]]> = fpext ir<[[LD1]]> to double -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPEXT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fpext float %0 to double - %gep4 = getelementptr inbounds double, ptr %a, i64 %iv - store double %conv2, ptr %gep4, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTRUNC:%.+]]> = fptrunc ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTRUNC]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext 
vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds double, ptr %b, i64 %iv - %0 = load double, ptr %gep, align 8 - %conv2 = fptrunc double %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[SITOFP:%.+]]> = sitofp ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[SITOFP]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = sitofp i32 %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = 
vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[UITOFP:%.+]]> = uitofp ir<[[LD1]]> to float -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[UITOFP]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = load i32, ptr %gep, align 4 - %conv2 = uitofp i32 %0 to float - %gep4 = getelementptr inbounds float, ptr %a, i64 %iv - store float %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTOSI:%.+]]> = fptosi ir<[[LD1]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOSI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fptosi float %0 to i32 - %gep4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[FPTOUI:%.+]]> = fptoui ir<[[LD1]]> to i32 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOUI]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] - %gep = getelementptr inbounds float, ptr %b, i64 %iv - %0 = load float, ptr %gep, align 4 - %conv2 = fptoui float %0 to i32 - %gep4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %conv2, ptr %gep4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI -; -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF -; 
IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<%N> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> -; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[INTTOPTR:%.+]]> = inttoptr ir<[[LD1]]> to ptr -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> -; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR2]]>, ir<[[INTTOPTR]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[CAST]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } - -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %gep = getelementptr inbounds i64, ptr %b, i64 %iv - %0 = load i64, ptr %gep, align 8 - %1 = inttoptr i64 %0 to ptr - %gep2 = getelementptr inbounds ptr, ptr %a, i64 %iv - store ptr %1, ptr %gep2, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} - -define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { -; IF-EVL: VPlan 'Initial VPlan for VF={1},UF>=1' -; IF-EVL-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI - -; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' { -; IF-EVL-NEXT: Live-in vp<[[VFUF:%.+]]> = VF * UF -; IF-EVL-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count -; IF-EVL-NEXT: Live-in ir<[[N:%.+]]> = original trip-count - -; IF-EVL: vector.ph: -; IF-EVL-NEXT: Successor(s): vector loop - -; IF-EVL: <x1> vector loop: { -; IF-EVL-NEXT: vector.body: -; IF-EVL-NEXT: EMIT vp<[[INDEX:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEX_NEXT:%.+]]> -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[INDEX_EVL:%.+]]> = phi ir<0>, vp<[[INDEX_EVL_NEXT:%.+]]> -; IF-EVL-NEXT: ir<[[IV:%.+]]> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[AVL:%.+]]> = phi [ ir<%N>, vector.ph ], [ vp<[[AVL_NEXT:%.+]]>, vector.body ] -; IF-EVL-NEXT: EMIT-SCALAR vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> -; IF-EVL-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[INDEX_EVL]]>, ir<1>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN-GEP Inv[Var] ir<[[GEP:%.+]]> = getelementptr inbounds ir<%b>, ir<[[IV]]> -; IF-EVL-NEXT: WIDEN-CAST ir<[[PTRTOINT:%.+]]> = ptrtoint ir<[[GEP]]> to i64 -; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[SCALAR_STEPS]]> -; IF-EVL-NEXT: vp<[[VECTOR_PTR:%.+]]> = vector-pointer ir<[[GEP2]]> -; IF-EVL-NEXT: WIDEN vp.store 
vp<[[VECTOR_PTR]]>, ir<[[PTRTOINT]]>, vp<[[EVL]]> -; IF-EVL-NEXT: EMIT-SCALAR vp<[[ZEXT:%.+]]> = zext vp<[[EVL]]> to i64 -; IF-EVL-NEXT: EMIT vp<[[INDEX_EVL_NEXT]]> = add vp<[[ZEXT]]>, vp<[[INDEX_EVL]]> -; IF-EVL-NEXT: EMIT vp<[[AVL_NEXT]]> = sub nuw vp<[[AVL]]>, vp<[[ZEXT]]> -; IF-EVL-NEXT: EMIT vp<[[INDEX_NEXT]]> = add vp<[[INDEX]]>, vp<[[VFUF]]> -; IF-EVL-NEXT: EMIT branch-on-count vp<[[INDEX_NEXT]]>, vp<[[VTC]]> -; IF-EVL-NEXT: No successors -; IF-EVL-NEXT: } -; IF-EVL-NEXT: Successor(s): middle.block -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %gep = getelementptr inbounds i32, ptr %b, i64 %iv - %0 = ptrtoint ptr %gep to i64 - %gep2 = getelementptr inbounds i64, ptr %a, i64 %iv - store i64 %0, ptr %gep2, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %exit, label %loop - -exit: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index b26e9cf55ddbf..718e03cfa0c67 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -1231,7 +1231,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48 -; CHECK: LV: Vector loop of width 8 costs: 10. +; CHECK: LV: Vector loop of width 8 costs: 11. ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %48 @@ -1442,8 +1442,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1484,8 +1484,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1526,9 +1526,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. 
-; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1566,9 +1566,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1608,8 +1608,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1652,8 +1652,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1696,9 +1696,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1738,9 +1738,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. 
+; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1883,8 +1883,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1943,8 +1943,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2004,9 +2004,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2061,9 +2061,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 @@ -2119,8 +2119,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef 
readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2181,8 +2181,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2243,9 +2243,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2301,9 +2301,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll index a286df9bc2fc7..c2c04ce6f5ff5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll @@ -85,13 +85,13 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 
17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED-LABEL: 'test2' @@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index baedf34b5548f..6ec010cdcc248 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -1193,7 +1193,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK: pred.udiv.continue62: ; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] ; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> -; CHECK-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[BROADCAST_SPLAT]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) ; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 @@ -1289,7 +1289,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK: pred.udiv.continue84: ; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] ; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> -; CHECK-NEXT: [[PREDPHI85:%.*]] = select <8 x i1> [[BROADCAST_SPLAT64]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] +; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[C]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] ; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) ; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void 
@cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index cc84fabd00ecc..002d811d46992 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -435,67 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9 -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]] -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]] -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK: loop: +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ 4, [[ENTRY]] ], [ [[TMP18]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP18]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]] -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]] -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP18]] +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[VECTOR_BODY]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. -define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 0bc86fff9831b..7e5964ac30cba 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -37,7 +37,8 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[BROADCAST_SPLAT]], <16 x i1> [[TMP1]], <16 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -63,7 +64,8 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT12]], ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[BROADCAST_SPLAT9]], <8 x i1> [[TMP5]], <8 x i32> poison), !alias.scope [[META3]] -; CHECK-NEXT: [[PREDPHI14:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[PREDPHI14:%.*]] = select i1 [[TMP10]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX10]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index c1272e56836f8..6e3b2a5390948 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -12,27 +12,22 @@ define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1:%.*]], align 64 +; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3:%.*]], align 64 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 16, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] +; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 -; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] +; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[I]] ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; 
@@ -69,11 +64,11 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -140,7 +135,7 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -219,7 +214,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -231,7 +226,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP15]], align 64 ; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20 -; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -245,7 +240,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -281,7 +276,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store <8 x i8> 
[[STRIDED_VEC]], ptr [[TMP3]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -294,7 +289,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 8771dc9a20379..6605338771c47 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -2581,8 +2581,7 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] ; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll new file mode 100644 index 0000000000000..94a05a67a0bdc --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/narrow-to-single-scalar.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -mcpu=skylake -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +@p = external global [3952 x i8], align 8 +@q = external global [3952 x i8], align 8 + +define void @narrow_store_user_mask_operand(i32 %x) { +; CHECK-LABEL: define void @narrow_store_user_mask_operand( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_TAIL:.*]] ] +; CHECK-NEXT: [[X_POS:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[X_POS]], label %[[LOOP_BODY:.*]], label %[[LOOP_TAIL]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: [[LD_P:%.*]] = load double, ptr @p, align 8 +; CHECK-NEXT: [[GEP_Q_IV:%.*]] = getelementptr double, ptr @q, i64 [[IV]] +; CHECK-NEXT: [[GEP_Q_IV_8:%.*]] = getelementptr i8, ptr [[GEP_Q_IV]], i64 -8 +; CHECK-NEXT: store double [[LD_P]], ptr [[GEP_Q_IV_8]], align 8 +; CHECK-NEXT: br label %[[LOOP_TAIL]] +; CHECK: [[LOOP_TAIL]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; 
CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_PH]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.ph + +loop.ph: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.tail ] + %x.pos = icmp sgt i32 %x, 0 + br i1 %x.pos, label %loop.body, label %loop.tail + +loop.body: + %ld.p = load double, ptr @p + %gep.q.iv = getelementptr double, ptr @q, i64 %iv + %gep.q.iv.8 = getelementptr i8, ptr %gep.q.iv, i64 -8 + store double %ld.p, ptr %gep.q.iv.8 + br label %loop.tail + +loop.tail: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop.ph + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll index 619693abf51e4..57cbe7f4c241b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll @@ -97,8 +97,7 @@ define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK: [[PRED_SDIV_IF29]]: ; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE30]] ; CHECK: [[PRED_SDIV_CONTINUE30]]: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[BROADCAST_SPLAT]], i32 0 -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP18]], i8 0, i8 [[V]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], i8 0, i8 [[V]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll index 03087bb883464..715d6db50488f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll @@ -199,10 +199,6 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> poison, i1 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP9]], i1 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] @@ -211,7 +207,7 @@ define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) { ; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP16]] ; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP17]] ; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP10]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = 
load float, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index e99ffda9e4043..c10dc5ddba2a9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -524,22 +524,78 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; induction is used outside the loop. define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23d( +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: 1: -; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[TMP2]] = getelementptr inbounds nuw i8, ptr [[DOT04]], i64 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[TMP9]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[TMP10]], i64 6 +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TMP11]], i64 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[TMP32]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TMP6]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[DOT013:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[DOT04:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP6]] = getelementptr inbounds nuw i8, ptr [[DOT013]], i64 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4 -; CHECK-NEXT: [[TMP7]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP7]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP8:%.*]], label [[TMP1]] -; CHECK: 8: -; CHECK-NEXT: ret i64 [[TMP7]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP33]], 
i64 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK: pred.store.if9: +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[NEXT_GEP1]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i32 [[TMP14]], 7 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP33]], i64 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK: pred.store.if11: +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[NEXT_GEP2]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK: pred.store.continue12: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP33]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.if13: +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 +; CHECK-NEXT: store i32 [[TMP23]], ptr [[NEXT_GEP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.continue14: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[TMP1]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[TMP30:%.*]] +; CHECK: 25: +; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP25]], i1 true) +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[TMP28]], 1 +; CHECK-NEXT: ret i64 [[TMP29]] ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll index d4004daf8833c..8081c0e17f865 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -64,39 +64,24 @@ exit: define void @uniform_load_can_fold_users(ptr noalias %src, ptr %dst, i64 %start, double %d) { ; CHECK-LABEL: define void @uniform_load_can_fold_users( ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[START:%.*]], double [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1 -; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 9.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP5]], 9.000000e+00 ; CHECK-NEXT: [[TMP8:%.*]] = fdiv double [[TMP7]], [[D]] -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP4]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP12]], i64 [[TMP10]] -; CHECK-NEXT: store double [[TMP8]], ptr [[TMP13]], align 8 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll index 22eb0ca380033..9cd5625e5f8e6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll @@ -23,8 +23,8 @@ define void @scalarselect(i1 %cond) { %7 = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 %indvars.iv ; CHECK: cost of 1 for VF 1 {{.*}} select i1 %cond, i32 %6, i32 0 -; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant) -; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is loop invariant) +; CHECK: Cost of 2 for VF 2: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is single-scalar) +; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%sel> = select ir<%cond>, ir<%6>, ir<0> (condition is single-scalar) %sel = select i1 %cond, i32 %6, i32 zeroinitializer store i32 %sel, ptr %7, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 
[[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: 
pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index deef94aa3fe9d..67fe87a328976 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -39,7 +39,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; CHECK: pred.sdiv.continue2: ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_SDIV_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[TMP14]], [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP17]] = add <2 x i32> [[VEC_PHI]], 
[[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -127,7 +127,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; SINK-GATHER: pred.sdiv.continue14: ; SINK-GATHER-NEXT: [[TMP44:%.*]] = phi <8 x i32> [ [[TMP39]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP43]], [[PRED_SDIV_IF13]] ] ; SINK-GATHER-NEXT: [[TMP45:%.*]] = add nsw <8 x i32> [[TMP44]], [[WIDE_LOAD]] -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <8 x i32> [[TMP45]], <8 x i32> [[WIDE_LOAD]] ; SINK-GATHER-NEXT: [[TMP47]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -179,15 +179,13 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i1> poison, i1 [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT4]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: br i1 [[TMP1:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 777 @@ -199,7 +197,7 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: pred.udiv.continue: ; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] -; CHECK: pred.udiv.if3: +; CHECK: pred.udiv.if1: ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 777 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] @@ -207,9 +205,9 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; CHECK: pred.udiv.continue4: +; CHECK: pred.udiv.continue2: ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF3]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -250,8 +248,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER: vector.ph: ; SINK-GATHER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8 ; SINK-GATHER-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; SINK-GATHER-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i1> poison, i1 [[TMP1:%.*]], i64 0 -; SINK-GATHER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT16]], <8 x i1> poison, <8 x i32> zeroinitializer ; SINK-GATHER-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[X:%.*]], i64 0 ; SINK-GATHER-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; SINK-GATHER-NEXT: br label [[VECTOR_BODY:%.*]] @@ -260,7 +256,7 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE16]] ] ; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP66:%.*]], [[PRED_UDIV_CONTINUE16]] ] ; SINK-GATHER-NEXT: [[TMP0:%.*]] = mul <8 x i64> [[VEC_IND]], splat (i64 777) -; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; SINK-GATHER-NEXT: br i1 [[TMP1:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.udiv.if: ; SINK-GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 ; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] @@ -271,76 +267,76 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER: pred.udiv.continue: ; SINK-GATHER-NEXT: [[TMP8:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] -; SINK-GATHER: pred.udiv.if3: +; SINK-GATHER: pred.udiv.if1: ; SINK-GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 ; SINK-GATHER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] ; SINK-GATHER-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; SINK-GATHER-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]] ; SINK-GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP13]], i32 1 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; SINK-GATHER: pred.udiv.continue4: +; SINK-GATHER: pred.udiv.continue2: ; SINK-GATHER-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF5]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] -; SINK-GATHER: pred.udiv.if5: +; SINK-GATHER: pred.udiv.if3: ; SINK-GATHER-NEXT: [[TMP18:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 ; SINK-GATHER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] ; SINK-GATHER-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; SINK-GATHER-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[X]] ; SINK-GATHER-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP21]], i32 2 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE6]] -; SINK-GATHER: pred.udiv.continue6: +; SINK-GATHER: pred.udiv.continue4: ; SINK-GATHER-NEXT: [[TMP24:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE4]] ], [ 
[[TMP22]], [[PRED_UDIV_IF6]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] -; SINK-GATHER: pred.udiv.if7: +; SINK-GATHER: pred.udiv.if5: ; SINK-GATHER-NEXT: [[TMP26:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 ; SINK-GATHER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] ; SINK-GATHER-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 ; SINK-GATHER-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[X]] ; SINK-GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP29]], i32 3 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; SINK-GATHER: pred.udiv.continue8: +; SINK-GATHER: pred.udiv.continue6: ; SINK-GATHER-NEXT: [[TMP32:%.*]] = phi <8 x i32> [ [[TMP24]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP30]], [[PRED_UDIV_IF7]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] -; SINK-GATHER: pred.udiv.if9: +; SINK-GATHER: pred.udiv.if7: ; SINK-GATHER-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 ; SINK-GATHER-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP34]] ; SINK-GATHER-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; SINK-GATHER-NEXT: [[TMP37:%.*]] = udiv i32 [[TMP36]], [[X]] ; SINK-GATHER-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP37]], i32 4 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE10]] -; SINK-GATHER: pred.udiv.continue10: +; SINK-GATHER: pred.udiv.continue8: ; SINK-GATHER-NEXT: [[TMP40:%.*]] = phi <8 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP38]], [[PRED_UDIV_IF9]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] -; SINK-GATHER: pred.udiv.if11: +; SINK-GATHER: pred.udiv.if9: ; SINK-GATHER-NEXT: [[TMP42:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 ; SINK-GATHER-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP42]] ; SINK-GATHER-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 ; SINK-GATHER-NEXT: [[TMP45:%.*]] = udiv i32 [[TMP44]], [[X]] ; SINK-GATHER-NEXT: [[TMP46:%.*]] = insertelement <8 x i32> [[TMP40]], i32 [[TMP45]], i32 5 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE12]] -; SINK-GATHER: pred.udiv.continue12: +; SINK-GATHER: pred.udiv.continue10: ; SINK-GATHER-NEXT: [[TMP48:%.*]] = phi <8 x i32> [ [[TMP40]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP46]], [[PRED_UDIV_IF11]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] -; SINK-GATHER: pred.udiv.if13: +; SINK-GATHER: pred.udiv.if11: ; SINK-GATHER-NEXT: [[TMP50:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 ; SINK-GATHER-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP50]] ; SINK-GATHER-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 ; SINK-GATHER-NEXT: [[TMP53:%.*]] = udiv i32 [[TMP52]], [[X]] ; SINK-GATHER-NEXT: [[TMP54:%.*]] = insertelement <8 x i32> [[TMP48]], i32 [[TMP53]], i32 6 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE14]] -; SINK-GATHER: pred.udiv.continue14: +; SINK-GATHER: pred.udiv.continue12: ; SINK-GATHER-NEXT: [[TMP56:%.*]] = phi <8 x i32> [ [[TMP48]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP54]], [[PRED_UDIV_IF13]] ] ; SINK-GATHER-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16]] -; SINK-GATHER: pred.udiv.if15: +; SINK-GATHER: pred.udiv.if13: ; SINK-GATHER-NEXT: [[TMP58:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 ; SINK-GATHER-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[TMP58]] ; SINK-GATHER-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP59]], align 4 ; SINK-GATHER-NEXT: [[TMP61:%.*]] = udiv i32 [[TMP60]], [[X]] ; SINK-GATHER-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP56]], i32 [[TMP61]], i32 7 ; SINK-GATHER-NEXT: br label [[PRED_UDIV_CONTINUE16]] -; SINK-GATHER: pred.udiv.continue16: +; SINK-GATHER: pred.udiv.continue14: ; SINK-GATHER-NEXT: [[TMP64:%.*]] = phi <8 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP62]], [[PRED_UDIV_IF15]] ] -; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] +; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] ; SINK-GATHER-NEXT: [[TMP66]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SINK-GATHER-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index 40cd6b63ca8f6..03e0853d29075 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -253,10 +253,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !32 = distinct !DILexicalBlock(scope: !31, file: !5, line: 137, column: 2) !33 = !DILocation(line: 210, column: 44, scope: !32) !34 = !DILocation(line: 320, column: 44, scope: !32) -!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !36 = distinct !DILexicalBlock(scope: !35, file: !5, line: 137, column: 2) !37 = !DILocation(line: 430, column: 44, scope: !36) !38 = !DILocation(line: 540, column: 44, scope: !36) -!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !40 = distinct !DILexicalBlock(scope: !39, file: !5, line: 137, column: 2) !41 = !DILocation(line: 650, column: 44, scope: !40) diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. 
; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 9deab9063d710..b72cbd333cb79 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -49,6 +49,8 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED1]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -57,9 +59,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: @@ -102,7 +102,7 @@ exit: ret void } -define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { +define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize { ; CHECK-LABEL: sink_replicate_region_2 ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF @@ -125,16 +125,18 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%z> +; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep> @@ -143,9 +145,9 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: pred.store.continue: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.0 +; CHECK-NEXT: Successor(s): if.1 ; CHECK-EMPTY: -; CHECK-NEXT: loop.0: +; CHECK-NEXT: if.1: ; CHECK-NEXT: EMIT 
vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -162,13 +164,20 @@ entry: br label %loop loop: - %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %rem = srem i32 %recur, %x + %recur = phi i32 [ 0, %entry ], [ %recur.next, %latch ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] %recur.next = sext i8 %y to i32 + %cond = icmp eq i32 %iv, %z + br i1 %cond, label %if, label %latch + +if: + %rem = srem i32 %recur, %x %add = add i32 %rem, %recur.next %gep = getelementptr i32, ptr %ptr, i32 %iv store i32 %add, ptr %gep + br label %latch + +latch: %iv.next = add nsw i32 %iv, 1 %ec = icmp eq i32 %iv.next, 20001 br i1 %ec, label %exit, label %loop @@ -284,27 +293,44 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK: <xVFxUF> pred.store: { -; CHECK-NEXT: pred.store.entry: +; CHECK: <xVFxUF> pred.load: { +; CHECK-NEXT: pred.load.entry: ; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> -; CHECK-NEXT: REPLICATE ir<%conv.lv.2> = sext ir<%lv.2> -; CHECK-NEXT: REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem> -; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%add.1>, ir<%conv.lv.2> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.continue: +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.2 +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, ir<%rem> +; CHECK-NEXT: WIDEN-CAST ir<%conv.lv.2> = sext vp<%9> to i32 +; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: <xVFxUF> pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK: loop.2: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -368,6 +394,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; 
CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -377,7 +404,6 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, ir<%rem> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%rem.div>, ir<%gep> @@ -448,6 +474,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<[[L]]> = load ir<%src> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn>, ir<[[L]]> +; CHECK-NEXT: WIDEN ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: <xVFxUF> pred.store: { @@ -458,7 +485,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]> -; CHECK-NEXT: REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE store ir<%val>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll index e97d6e66d9d7a..58217069058f8 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll @@ -6,59 +6,276 @@ define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, 
%[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP34]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP32]] ; ; VF2IC2-LABEL: define i32 @FOR_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ 
[[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label 
%[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP67:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP68:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP68]], ptr [[TMP67]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP66]] ; ; VF1IC2-LABEL: define i32 @FOR_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi 
i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], 
true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP30]] ; entry: @@ -83,59 +300,265 @@ for.end: define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: 
[[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP30]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: ret i32 [[TMP28]] ; ; VF2IC2-LABEL: define i32 @FOR_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ 
[[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: 
[[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; 
VF2IC2-NEXT: [[TMP64:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP64]], ptr [[TMP63]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP59:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP59]] +; VF2IC2-NEXT: [[TMP61:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 [[TMP58]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: ret i32 [[TMP62]] ; ; VF1IC2-LABEL: define i32 @FOR_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr 
[[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP29]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP27:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: ret i32 [[TMP27]] ; entry: @@ -160,64 +583,287 @@ for.end: define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF2IC1-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC1-NEXT: [[ENTRY:.*]]: -; VF2IC1-NEXT: br label %[[LOOP:.*]] -; VF2IC1: [[LOOP]]: -; VF2IC1-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC1-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[LOOP]] ] +; VF2IC1-NEXT: [[ENTRY:.*:]] +; VF2IC1-NEXT: br label 
%[[VECTOR_PH:.*]] +; VF2IC1: [[VECTOR_PH]]: +; VF2IC1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC1: [[VECTOR_BODY]]: +; VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] +; VF2IC1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[PRED_STORE_CONTINUE4]] ] +; VF2IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; VF2IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; VF2IC1-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC1-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC1: [[PRED_LOAD_IF]]: ; VF2IC1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] -; VF2IC1-NEXT: [[TMP10]] = load i32, ptr [[TMP9]], align 4 -; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[FOR]], [[TMP10]] -; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; VF2IC1-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC1: [[PRED_LOAD_CONTINUE]]: +; VF2IC1-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; VF2IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC1: [[PRED_LOAD_IF1]]: +; VF2IC1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC1-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP35]], i32 1 +; VF2IC1-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC1: [[PRED_LOAD_CONTINUE2]]: +; VF2IC1-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; VF2IC1-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> <i32 1, i32 2> +; VF2IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC1: [[PRED_STORE_IF]]: +; VF2IC1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC1-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; VF2IC1-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; VF2IC1-NEXT: [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +; VF2IC1-NEXT: store i32 [[TMP18]], ptr [[TMP15]], align 4 +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC1: [[PRED_STORE_CONTINUE]]: +; VF2IC1-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF3:.*]], 
label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_IF3]]: +; VF2IC1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP4]] +; VF2IC1-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; VF2IC1-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; VF2IC1-NEXT: [[TMP23:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] ; VF2IC1-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4 -; VF2IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP1]], 1 -; VF2IC1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC1-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC1-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; VF2IC1: [[PRED_STORE_CONTINUE4]]: +; VF2IC1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VF2IC1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC1: [[MIDDLE_BLOCK]]: +; VF2IC1-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; VF2IC1-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP25]], i1 true) +; VF2IC1-NEXT: [[TMP27:%.*]] = sub i64 [[TMP26]], 1 +; VF2IC1-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1 +; VF2IC1-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP28]] +; VF2IC1-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC1-NEXT: [[TMP31:%.*]] = icmp eq i64 [[TMP27]], 0 +; VF2IC1-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP29]] +; VF2IC1-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP12]], i64 [[TMP27]] +; VF2IC1-NEXT: br label %[[FOR_END:.*]] ; VF2IC1: [[FOR_END]]: -; VF2IC1-NEXT: [[TMP32:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC1-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP10]], %[[LOOP]] ] ; VF2IC1-NEXT: [[RES:%.*]] = add i32 [[TMP32]], [[TMP33]] ; VF2IC1-NEXT: ret i32 [[RES]] ; ; VF2IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF2IC2-NEXT: [[ENTRY:.*]]: -; VF2IC2-NEXT: br label %[[LOOP:.*]] -; VF2IC2: [[LOOP]]: -; VF2IC2-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP23:%.*]], %[[LOOP]] ] +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3 +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF2IC2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 33>, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; VF2IC2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2IC2-NEXT: [[TMP3:%.*]] 
= add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; VF2IC2-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP6]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF2IC2: [[PRED_LOAD_IF]]: ; VF2IC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] -; VF2IC2-NEXT: [[TMP23]] = load i32, ptr [[TMP22]], align 4 -; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[FOR]], [[TMP23]] -; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF2IC2: [[PRED_LOAD_CONTINUE]]: +; VF2IC2-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP9]], %[[PRED_LOAD_IF]] ] +; VF2IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; VF2IC2: [[PRED_LOAD_IF1]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP13]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; VF2IC2: [[PRED_LOAD_CONTINUE2]]: +; VF2IC2-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; VF2IC2: [[PRED_LOAD_IF3]]: +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; VF2IC2: [[PRED_LOAD_CONTINUE4]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ] +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; VF2IC2: [[PRED_LOAD_IF5]]: +; VF2IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP37]], i32 1 +; VF2IC2-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; VF2IC2: [[PRED_LOAD_CONTINUE6]]: +; VF2IC2-NEXT: [[TMP25]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ] +; VF2IC2-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP15]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP25]], <2 x i32> <i32 1, i32 2> +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: 
[[TMP29:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP30]], [[TMP31]] +; VF2IC2-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF2IC2: [[PRED_STORE_IF7]]: +; VF2IC2-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; VF2IC2-NEXT: [[TMP47:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] ; VF2IC2-NEXT: store i32 [[TMP47]], ptr [[TMP44]], align 4 -; VF2IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP3]], 1 -; VF2IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF2IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF2IC2: [[PRED_STORE_CONTINUE8]]: +; VF2IC2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF2IC2: [[PRED_STORE_IF9]]: +; VF2IC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP2]] +; VF2IC2-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +; VF2IC2-NEXT: store i32 [[TMP42]], ptr [[TMP39]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF2IC2: [[PRED_STORE_CONTINUE10]]: +; VF2IC2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP43]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_IF11]]: +; VF2IC2-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; VF2IC2-NEXT: [[TMP73:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] +; VF2IC2-NEXT: store i32 [[TMP73]], ptr [[TMP72]], align 4 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF2IC2: [[PRED_STORE_CONTINUE12]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; VF2IC2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[TMP49:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; VF2IC2-NEXT: [[TMP50:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; VF2IC2-NEXT: [[TMP51:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP50]], i1 true) +; VF2IC2-NEXT: [[TMP52:%.*]] = add i64 2, [[TMP51]] +; VF2IC2-NEXT: [[TMP53:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP49]], i1 true) +; VF2IC2-NEXT: [[TMP54:%.*]] = add i64 0, [[TMP53]] +; VF2IC2-NEXT: [[TMP55:%.*]] = icmp ne i64 [[TMP53]], 2 +; VF2IC2-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i64 [[TMP54]], i64 [[TMP52]] +; VF2IC2-NEXT: [[TMP57:%.*]] = sub 
i64 [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP58:%.*]] = sub i64 [[TMP57]], 1 +; VF2IC2-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP58]] +; VF2IC2-NEXT: [[TMP60:%.*]] = sub i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP60]] +; VF2IC2-NEXT: [[TMP62:%.*]] = icmp uge i64 [[TMP58]], 2 +; VF2IC2-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP61]], i32 [[TMP59]] +; VF2IC2-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[VECTOR_RECUR]], i32 1 +; VF2IC2-NEXT: [[TMP65:%.*]] = icmp eq i64 [[TMP57]], 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP64]], i32 [[TMP63]] +; VF2IC2-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP15]], i64 [[TMP57]] +; VF2IC2-NEXT: [[TMP68:%.*]] = sub i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP69:%.*]] = extractelement <2 x i32> [[TMP25]], i64 [[TMP68]] +; VF2IC2-NEXT: [[TMP70:%.*]] = icmp uge i64 [[TMP57]], 2 +; VF2IC2-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP67]] +; VF2IC2-NEXT: br label %[[FOR_END:.*]] ; VF2IC2: [[FOR_END]]: -; VF2IC2-NEXT: [[TMP66:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF2IC2-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP23]], %[[LOOP]] ] ; VF2IC2-NEXT: [[RES:%.*]] = add i32 [[TMP66]], [[TMP71]] ; VF2IC2-NEXT: ret i32 [[RES]] ; ; VF1IC2-LABEL: define i32 @FOR_and_next_used_outside( ; VF1IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) { -; VF1IC2-NEXT: [[ENTRY:.*]]: -; VF1IC2-NEXT: br label %[[LOOP:.*]] -; VF1IC2: [[LOOP]]: -; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF1IC2-NEXT: [[FOR:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[LOOP]] ] +; VF1IC2-NEXT: [[ENTRY:.*:]] +; VF1IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF1IC2: [[VECTOR_PH]]: +; VF1IC2-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 1 +; VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; VF1IC2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 +; VF1IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF1IC2: [[VECTOR_BODY]]: +; VF1IC2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] +; VF1IC2-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 33, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[PRED_STORE_CONTINUE5]] ] +; VF1IC2-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 +; VF1IC2-NEXT: [[VEC_IV1:%.*]] = add i64 [[TMP0]], 1 +; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[FOR]], [[TMP7]] -; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; VF1IC2: [[PRED_LOAD_CONTINUE]]: +; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] +; VF1IC2: [[PRED_LOAD_IF2]]: +; VF1IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP35:%.*]] = 
load i32, ptr [[TMP34]], align 4 +; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] +; VF1IC2: [[PRED_LOAD_CONTINUE3]]: +; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP35]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF1IC2: [[PRED_STORE_IF]]: +; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] +; VF1IC2-NEXT: [[TMP10:%.*]] = add nsw i32 [[VECTOR_RECUR]], [[TMP5]] +; VF1IC2-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF1IC2: [[PRED_STORE_CONTINUE]]: +; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_IF4]]: +; VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP3]] +; VF1IC2-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP8]] ; VF1IC2-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; VF1IC2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1 -; VF1IC2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF1IC2-NEXT: br i1 [[EC]], label %[[FOR_END:.*]], label %[[LOOP]] +; VF1IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF1IC2: [[PRED_STORE_CONTINUE5]]: +; VF1IC2-NEXT: [[INDEX_NEXT]] = add i64 [[TMP0]], 2 +; VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF1IC2-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF1IC2: [[MIDDLE_BLOCK]]: +; VF1IC2-NEXT: [[TMP14:%.*]] = xor i1 [[TMP1]], true +; VF1IC2-NEXT: [[TMP15:%.*]] = xor i1 [[TMP2]], true +; VF1IC2-NEXT: [[TMP16:%.*]] = icmp eq i1 [[TMP15]], false +; VF1IC2-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i64 +; VF1IC2-NEXT: [[TMP18:%.*]] = add i64 1, [[TMP17]] +; VF1IC2-NEXT: [[TMP19:%.*]] = icmp eq i1 [[TMP14]], false +; VF1IC2-NEXT: [[TMP20:%.*]] = zext i1 [[TMP19]] to i64 +; VF1IC2-NEXT: [[TMP21:%.*]] = add i64 0, [[TMP20]] +; VF1IC2-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP20]], 1 +; VF1IC2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP18]] +; VF1IC2-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], 1 +; VF1IC2-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP26:%.*]] = sub i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP27:%.*]] = icmp uge i64 [[TMP25]], 1 +; VF1IC2-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP24]], 0 +; VF1IC2-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[VECTOR_RECUR]], i32 [[TMP28]] +; VF1IC2-NEXT: [[TMP31:%.*]] = sub i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP32:%.*]] = icmp uge i64 [[TMP24]], 1 +; VF1IC2-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP8]], i32 [[TMP5]] +; VF1IC2-NEXT: br label %[[FOR_END:.*]] ; VF1IC2: [[FOR_END]]: -; VF1IC2-NEXT: [[TMP30:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ] -; VF1IC2-NEXT: [[TMP33:%.*]] = phi i32 [ [[TMP7]], %[[LOOP]] ] ; VF1IC2-NEXT: [[RES:%.*]] = add i32 [[TMP30]], [[TMP33]] ; VF1IC2-NEXT: ret i32 [[RES]] ; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll index 198a30af814ba..372876c5faac6 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll @@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) { ; UNROLL-NO-IC: [[VECTOR_BODY]]: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 
0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104 ; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NO-IC: [[MIDDLE_BLOCK]]: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]] ; UNROLL-NO-IC: [[SCALAR_PH]]: ; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]] ; UNROLL-NO-IC: [[LOOP]]: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0 @@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) { ; SINK-AFTER: [[VECTOR_BODY]]: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 -; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer -; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer +; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108 ; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SINK-AFTER: [[MIDDLE_BLOCK]]: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]] ; SINK-AFTER: [[SCALAR_PH]]: ; SINK-AFTER-NEXT: br label %[[LOOP:.*]] ; SINK-AFTER: [[LOOP]]: ; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; SINK-AFTER-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; SINK-AFTER-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4 ; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 ; SINK-AFTER-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 
0 diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index 5b7c27a0b5f1b..ca6e5bc2d0dcb 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -59,20 +59,18 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) @@ -118,23 +116,75 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmin( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, 
%[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_0]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC_1]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP21]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI2]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], <4 x float> [[VEC_PHI3]], <4 x float> [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP19]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP24]]) +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX]]) +; CHECK-NEXT: [[RDX_MINMAX9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP25]], <4 x float> [[TMP26]]) +; CHECK-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP19]], true +; CHECK-NEXT: [[TMP31:%.*]] = and i1 [[CMP_N]], [[TMP30]] +; CHECK-NEXT: br i1 
[[TMP31]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP27]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX8]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] +; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 8b6a6e1e46101..a4f7631435bb3 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -205,16 +205,14 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: 
[[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -272,16 +270,14 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -341,16 +337,14 @@ define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 10, [[TMP9]] @@ -411,16 +405,14 @@ define float 
@fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[START]], [[TMP9]] @@ -688,23 +680,60 @@ define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-LABEL: define float @test_fmax_and_fmax( ; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV]] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[TMP2]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP9]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP9]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP13]]) +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP14]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP9]], true +; CHECK-NEXT: [[TMP19:%.*]] = and i1 [[CMP_N]], [[TMP18]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX3]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_0]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC_1]], i64 [[IV1]] +; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_3]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = tail call noundef float @llvm.maxnum.f32(float [[MAX]], float [[L_0]]) ; CHECK-NEXT: [[MIN_NEXT]] = tail call noundef float @llvm.minnum.f32(float [[MIN]], float [[L_1]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MAX_NEXT_LCSSA]], [[MIN_NEXT_LCSSA]] ; CHECK-NEXT: ret float [[SUB]] ; diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll index 211d3bf4c1f6a..368553dc2a7d2 100644 --- 
a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -205,16 +205,14 @@ define float @fminnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -272,16 +270,14 @@ define float @fminnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll new file mode 100644 index 0000000000000..8615401af34f8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll @@ -0,0 +1,247 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invariant_ptr, i32 %n) { +; CHECK-LABEL: define void @hoist_invariant_load_noalias_due_to_memchecks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %inv_val = load i32, ptr %invariant_ptr, align 4 + %gep 
= getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %inv_val, ptr %gep, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test that loads with non-invariant addresses are not hoisted. +define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) { +; CHECK-LABEL: define void @dont_hoist_variant_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[SRC2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %val = load i32, ptr %gep.src, align 4 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %val, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test that predicated loads are not hoisted. 
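+; Although the address is loop-invariant, the load only executes when the
+; condition holds, so hoisting it out of the predicated blocks would speculate
+; a potentially-faulting access.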
+define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_ptr, i32 %n) { +; CHECK-LABEL: define void @dont_hoist_predicated_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], ptr [[COND_PTR:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP20]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP22]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND_PTR]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META11:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; CHECK: [[PRED_STORE_IF6]]: +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP8]] +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; CHECK: [[PRED_STORE_CONTINUE7]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; 
CHECK: [[PRED_STORE_IF8]]: +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP12]] +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP17]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; CHECK: [[PRED_STORE_CONTINUE9]]: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]] +; CHECK: [[PRED_STORE_IF10]]: +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP21]], align 4, !alias.scope [[META16]], !noalias [[META18]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; CHECK: [[PRED_STORE_CONTINUE11]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[IV]] +; CHECK-NEXT: [[COND:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[COND]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond_ptr, i32 %iv + %cond = load i32, ptr %gep.cond, align 4 + %cmp = icmp sgt i32 %cond, 0 + br i1 %cmp, label %if.then, label %loop.latch + +if.then: + %inv_val = load i32, ptr %invariant_ptr, align 4 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %inv_val, ptr %gep, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll new file mode 100644 index 0000000000000..d447a39aafd93 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll @@ -0,0 +1,761 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_stores_noalias_via_rt_checks_after_loads( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 
[[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; CHECK: [[PRED_STORE_IF8]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; CHECK: [[PRED_STORE_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE9]] ], [ [[TMP29]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP34]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 
x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META5]], !noalias [[META7]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_aliasing_store(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_aliasing_store( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], 
[[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE21:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 99, ptr [[TMP10]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 99, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label 
%[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE13]] ], [ [[TMP29]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17:.*]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP34]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] +; CHECK: [[PRED_STORE_IF18]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE19]] +; CHECK: [[PRED_STORE_CONTINUE19]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_IF20]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_CONTINUE21]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 
+ br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 99, ptr %gep.src.else, align 4 + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_noalias_store_via_runtime_checks(ptr %dst, ptr %dst.1, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_noalias_store_via_runtime_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_1:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META22:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 10, ptr [[TMP10]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META30:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 10, ptr [[TMP16]], align 4, !alias.scope [[META25]], !noalias [[META27]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = sub <2 x i32> [[TMP20]], splat (i32 5) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META32:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF23:.*]], label %[[PRED_LOAD_CONTINUE24:.*]] +; CHECK: [[PRED_LOAD_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP30]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE24]] +; CHECK: [[PRED_LOAD_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE22]] ], [ [[TMP31]], %[[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF25:.*]], label %[[PRED_LOAD_CONTINUE26:.*]] +; CHECK: [[PRED_LOAD_IF25]]: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP35]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE26]] +; CHECK: [[PRED_LOAD_CONTINUE26]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE24]] ], [ [[TMP36]], %[[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], splat (i32 10) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_IF29]]: +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[TMP43]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_CONTINUE30]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.dst.1.else = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 10, ptr %gep.dst.1.else, align 4 + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label 
%loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_alias(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_alias( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META35:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META38:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] 
+; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: 
[[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.src.middle = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 0, ptr %gep.src.middle, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_no_alias_via_rt_checks(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_no_alias_via_rt_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], 
[[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE26:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META45:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META48:![0-9]+]], !noalias [[META50:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META53:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4, !alias.scope [[META48]], !noalias [[META50]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META54:![0-9]+]], !noalias [[META55:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 
+; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP31]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP35]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.dst.1 = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 0, ptr %gep.dst.1, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll new file mode 100644 index 0000000000000..b30d010aaf9c9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll @@ -0,0 +1,1291 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | 
FileCheck %s + +define void @test(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP34:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: 
[[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP36]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP37]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different addresses - should NOT hoist +define void @different_addresses(ptr %dst, ptr %src1, ptr %src2, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_addresses( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC15:%.*]] = ptrtoint ptr [[SRC1]] to i64 +; CHECK-NEXT: [[SRC23:%.*]] = ptrtoint ptr [[SRC2]] to i64 +; CHECK-NEXT: [[COND2:%.*]] = ptrtoint ptr [[COND]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[COND2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC23]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = icmp ult i64 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST1]], [[SRC15]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE13:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i32> [ [[TMP17]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP23:%.*]] = add <2 x i32> [[TMP22]], splat (i32 10) +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP49]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label 
%[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP52]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP27]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP33]], <2 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src.1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %gep.src.2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src.1, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Non-complementary masks - should NOT hoist +define void @non_complementary_masks(ptr %dst, ptr %src, ptr %cond1, ptr %cond2, i32 %n) { +; CHECK-LABEL: define void @non_complementary_masks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND1:%.*]], ptr [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND2]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], 
[[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[COND2]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD11]], splat (i32 20) +; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP37]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP38]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP22]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP40]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP26]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = add <2 x i32> [[TMP27]], splat (i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0 +; 
CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP30]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ [[TMP36]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP31]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP19]], <2 x i32> [[TMP28]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI18:%.*]] = select <2 x i1> [[TMP37]], <2 x i32> [[TMP32]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI18]], ptr [[TMP41]], align 4, !alias.scope [[META21:![0-9]+]], !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP63]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond1 = getelementptr inbounds i32, ptr %cond1, i32 %iv + %gep.cond2 = getelementptr inbounds i32, ptr %cond2, i32 %iv + %l.c1 = load i32, ptr %gep.cond1 + %l.c2 = load i32, ptr %gep.cond2 + %c1 = icmp ule i32 %l.c1, 11 + %c2 = icmp ule i32 %l.c2, 20 + br i1 %c1, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + br i1 %c2, label %else.then, label %loop.latch + +else.then: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else.then ], [ 0, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different access sizes - should NOT hoist +; Both loads use the same pointer but have different types (i8 vs i32) +define void @different_access_sizes(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_access_sizes( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> [[TMP8]], ptr [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META26:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope [[META29:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP15]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP16]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP6]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i8> poison, i8 [[TMP23]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i8> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ 
[[TMP24]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i8> [[TMP25]], i8 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i8> [ [[TMP25]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP30]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META33:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i8, ptr %gep.src, align 4 + %ext = zext i8 %l.src to i32 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %ext, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Positive test: Same address with different alignments - should hoist with minimum alignment +define void @different_alignments_same_address(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_alignments_same_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp 
ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META36:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META39:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP24]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label 
%[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4, !alias.scope [[META41:![0-9]+]], !noalias [[META43:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 2 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Volatile loads - should NOT hoist +define void @volatile_load(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @volatile_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[IV]] +; CHECK-NEXT: [[L_C:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[L_C]], 11 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_SRC:%.*]] = load volatile i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L_SRC_2]], 10 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_SRC]], %[[THEN]] ], [ [[ADD]], %[[ELSE]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + 
%c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load volatile i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test hoisting with duplicate GEPs: The same address is computed by different +; GEP instructions in different branches. The hoisting pass should use SCEV to +; recognize they compute the same address and hoist the load. +define void @duplicate_gep(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @duplicate_gep( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META46:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META49:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: 
[[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP23]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP24]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP29]], <2 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4, !alias.scope [[META51:![0-9]+]], !noalias [[META53:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr
inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test with non-unit-stride loads: Loads have stride 8 (the i64 GEP element +; size) instead of unit stride (4 bytes, the i32 access size). The hoisting +; optimization should still work since both loads access the same address +; with the same stride. +define void @non_unit_stride_i64(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @non_unit_stride_i64( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP6]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META56:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !alias.scope [[META59:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label
%[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP15]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP25]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP26]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ [[TMP26]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP31]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP6]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP32]], align 4, !alias.scope [[META61:![0-9]+]], !noalias [[META63:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +
+; Test that loads inside masked regions (without individual masks) are +; correctly detected and hoisted when they have complementary predicates. +define void @hoist_loads_in_masked_regions(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_loads_in_masked_regions( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD1]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %loop.latch + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ 0, %loop ] + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, %merge + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +; Test that when there are 3 or more regions with complementary predicates +; loading from the same address, all loads are hoisted and replaced, not just +; the first pair. This tests the K loop that continues searching after finding +; the initial complementary pair. 
+define void @hoist_multiple_complementary_loads(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_multiple_complementary_loads( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP64]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP70]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP28]], splat (i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 32) +; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP33]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP32]], i32 1 +; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; 
CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = mul <2 x i32> [[TMP18]], splat (i32 2) +; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP30]], i32 0 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP61]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP38]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP30]], i32 1 +; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_LOAD_IF9:.*]], label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP37]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP45:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP44]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP22]], <2 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP42:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP45]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i64 32 +; CHECK-NEXT: store <2 x i32> [[TMP42]], ptr [[TMP40]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.cond = load i32, ptr %gep.cond, align 4 + %c.1 = icmp ne i32 %l.cond, 0 + br i1 %c.1, label %check2, label %region3 + +check2: + %c.2 = icmp ne i32 %l.cond, 32 + br i1 %c.2, label %region1, label %region2 + +region1: + %gep.src.8.r1 = getelementptr inbounds i8, ptr %src, i32 %iv + %val1 = load i32, ptr %gep.src.8.r1, align 4 + br label %loop.latch + +region2: + %gep.src.8.r2 = getelementptr inbounds i8, ptr %src, i32 %iv + %val2 = load i32, ptr %gep.src.8.r2, align 4 + %mul = mul i32 %val2, 2 + br label %loop.latch + +region3: + %gep.src.8.r3 = getelementptr inbounds i8, ptr %src, i32 %iv + %val3 = load i32, ptr %gep.src.8.r3, 
align 4 + %add = add i32 %val3, 1 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %val1, %region1 ], [ %mul, %region2 ], [ %add, %region3 ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + %offset.dst = getelementptr inbounds i8, ptr %gep.dst, i64 32 + store i32 %merge, ptr %offset.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps1(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps1( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2, !alias.scope [[META70:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] 
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP16]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP18]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP14]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META73:![0-9]+]], !noalias [[META70]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %cond, label %then, label %else + +then: + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep4 = getelementptr i8, ptr %gep3, i64 8 + %l.1 = load i16, ptr %gep4, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps2(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps2( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = 
xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP8]], align 2, !alias.scope [[META77:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP13]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP11]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP17]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP19]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP20]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META80:![0-9]+]], !noalias [[META77]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP82:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + br i1 %cond, label %then, label %else + +then: + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr i8, ptr %gep1, i64 8 + %l.1 = load i16, ptr %gep3, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index 7b9fcebb34049..c236b0af2a61d 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -716,15 +716,13 @@ define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]] @@ -744,7 +742,7 @@ define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] ; CHECK: pred.udiv.continue2: ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..f9dd626e523e8 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -296,8 +296,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] ; VEC-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]] ; VEC-NEXT: [[TMP5:%.*]] = 
insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0 -; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] @@ -305,7 +303,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]] ; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[OFFSET_IDX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4 -; VEC-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[INDVARS_IV3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; VEC-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[INDVARS_IV3]] @@ -318,7 +316,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.continue2: ; VEC-NEXT: [[TMP15:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1) -; VEC-NEXT: [[PREDPHI]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP15]], <2 x i32> [[VEC_PHI]] +; VEC-NEXT: [[PREDPHI]] = select i1 [[COND_2]], <2 x i32> [[TMP15]], <2 x i32> [[VEC_PHI]] ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -384,15 +382,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. 
;
-define void @minimal_bit_widths(i1 %c) {
+define void @minimal_bit_widths(ptr %p, i1 %c) {
; UNROLL-LABEL: @minimal_bit_widths(
; UNROLL-NEXT: entry:
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
-; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]]
+; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
+; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
@@ -415,8 +413,8 @@ define void @minimal_bit_widths(i1 %c) {
; UNROLL-NOSIMPLIFY: vector.body:
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]]
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -442,16 +440,16 @@ define void @minimal_bit_widths(i1 %c) {
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
-; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
+; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]]
; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if:
; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]]
+; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1
; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]]
+; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]]
; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]]
@@ -468,7 +466,7 @@ entry:
for.body:
%tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
%tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ]
- %tmp2 = getelementptr i8, ptr undef, i64 %tmp0
+ %tmp2 = getelementptr i8, ptr %p, i64 %tmp0
%tmp3 = load i8, ptr %tmp2, align 1
br i1 %c, label %if.then, label %for.inc
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index e33995327b856..66e4de5da7955 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1959,15 +1959,13 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SMAX]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
@@ -1985,7 +1983,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]]
; CHECK: pred.udiv.continue2:
; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP16]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2024,8 +2022,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IND: vector.ph:
; IND-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
-; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2033,7 +2029,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
; IND-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; IND-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; IND-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; IND: pred.udiv.if:
; IND-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
; IND-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP2]], [[INDEX]]
@@ -2049,8 +2045,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; IND-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP8]], i64 1
; IND-NEXT: br label [[PRED_UDIV_CONTINUE2]]
; IND: pred.udiv.continue2:
-; IND-NEXT: [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP10]], <2 x i32> [[WIDE_LOAD]]
+; IND-NEXT: [[PREDPHI:%.*]] = phi <2 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
; IND-NEXT: [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; IND-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2090,8 +2085,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL: vector.ph:
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
-; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2102,7 +2095,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 8
; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
; UNROLL-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
; UNROLL-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]]
@@ -2136,9 +2129,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP19]], i64 1
; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]]
; UNROLL: pred.udiv.continue8:
-; UNROLL-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT: [[PREDPHI9:%.*]] = phi <2 x i32> [ [[WIDE_LOAD2]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NEXT: [[PREDPHI:%.*]] = phi <2 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP11]], [[PRED_UDIV_IF7]] ]
; UNROLL-NEXT: [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; UNROLL-NEXT: [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2181,8 +2173,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC: vector.ph:
; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SMAX]], 4
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2192,7 +2182,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; UNROLL-NO-IC-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
@@ -2228,8 +2218,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]]
; UNROLL-NO-IC: pred.udiv.continue8:
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP28]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP29]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <2 x i32> [[TMP17]], <2 x i32> [[WIDE_LOAD]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI9:%.*]] = select i1 [[C]], <2 x i32> [[TMP29]], <2 x i32> [[WIDE_LOAD2]]
; UNROLL-NO-IC-NEXT: [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
; UNROLL-NO-IC-NEXT: [[TMP33]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2270,8 +2260,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; INTERLEAVE: vector.ph:
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
-; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2282,7 +2270,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; INTERLEAVE-NEXT: br i1 [[C:%.*]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; INTERLEAVE: pred.udiv.if:
; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
; INTERLEAVE-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[INDEX]]
@@ -2352,9 +2340,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP39]], i64 3
; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]]
; INTERLEAVE: pred.udiv.continue16:
-; INTERLEAVE-NEXT: [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT: [[PREDPHI17:%.*]] = phi <4 x i32> [ [[WIDE_LOAD2]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
+; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = phi <4 x i32> [ [[WIDE_LOAD]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP21]], [[PRED_UDIV_IF15]] ]
; INTERLEAVE-NEXT: [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
; INTERLEAVE-NEXT: [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 7b0c366e16c7b..440309d246899 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -153,3 +153,79 @@ loop:
exit:
ret void
}
+
+define void @narrow_widen_store_user(i32 %x, ptr noalias %A, ptr noalias %B) {
+; VF4IC1-LABEL: define void @narrow_widen_store_user(
+; VF4IC1-SAME: i32 [[X:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF4IC1-NEXT: [[ENTRY:.*:]]
+; VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
+; VF4IC1: [[VECTOR_PH]]:
+; VF4IC1-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], 3
+; VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
+; VF4IC1-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF4IC1: [[VECTOR_BODY]]:
+; VF4IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
+; VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
+; VF4IC1-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
+; VF4IC1-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP3]], align 4
+; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4IC1: [[MIDDLE_BLOCK]]:
+; VF4IC1-NEXT: br label %[[EXIT:.*]]
+; VF4IC1: [[EXIT]]:
+; VF4IC1-NEXT: ret void
+;
+; VF2IC2-LABEL: define void @narrow_widen_store_user(
+; VF2IC2-SAME: i32 [[X:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; VF2IC2-NEXT: [[ENTRY:.*:]]
+; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
+; VF2IC2: [[VECTOR_PH]]:
+; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[X]], 1
+; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; VF2IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VF2IC2-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 3
+; VF2IC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0
+; VF2IC2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF2IC2: [[VECTOR_BODY]]:
+; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i32 [[INDEX]]
+; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]]
+; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP2]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 4
+; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 2
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP3]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP5]], align 4
+; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2: [[MIDDLE_BLOCK]]:
+; VF2IC2-NEXT: br label %[[EXIT:.*]]
+; VF2IC2: [[EXIT]]:
+; VF2IC2-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.A = getelementptr i32, ptr %A, i32 %iv
+ %gep.B = getelementptr i32, ptr %B, i32 %iv
+ %wide.add = add i32 %x, 1
+ %wide.mul = mul i32 %wide.add, 3
+ store i32 %wide.add, ptr %gep.A
+ store i32 %wide.mul, ptr %gep.B
+ %iv.next = add i32 %iv, 1
+ %ec = icmp ne i32 %iv.next, 1024
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
index 106054d989776..d87d9b155db1c 100644
--- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-define void @test(i1 %arg) {
+define void @test(ptr %p, i1 %arg) {
entry:
br i1 %arg, label %while.end, label %while.body.lr.ph
@@ -11,7 +11,7 @@ while.body.lr.ph:
br label %while.body
while.body:
- %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
+ %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
%incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1
%inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef.
%cmp.i11 = icmp eq ptr %incdec.ptr.i, undef
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 763072ab16f73..9931137566c1a 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -248,25 +248,27 @@ for.end: ; preds = %for.body
;
@cm_array = external global [2592 x i16], align 1
-define void @pr43371() optsize {
+define void @pr43371(i16 %val) optsize {
;
; CHECK-LABEL: define void @pr43371(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
+; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -277,22 +279,24 @@ define void @pr43371() optsize {
; CHECK-NEXT: unreachable
;
; PGSO-LABEL: define void @pr43371(
-; PGSO-SAME: ) #[[ATTR0]] {
+; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; PGSO-NEXT: [[ENTRY:.*:]]
; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; PGSO: [[VECTOR_BODY]]:
; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -303,22 +307,24 @@ define void @pr43371() optsize {
; PGSO-NEXT: unreachable
;
; NPGSO-LABEL: define void @pr43371(
-; NPGSO-SAME: ) #[[ATTR0]] {
+; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] {
; NPGSO-NEXT: [[ENTRY:.*:]]
; NPGSO-NEXT: br label %[[VECTOR_PH:.*]]
; NPGSO: [[VECTOR_PH]]:
+; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -340,7 +346,7 @@ for.cond.cleanup28:
for.body29:
%i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
- %add33 = add i16 undef, %i24.0170
+ %add33 = add i16 %val, %i24.0170
%idxprom34 = zext i16 %add33 to i32
%arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34
store i16 0, ptr %arrayidx35, align 1
@@ -349,25 +355,27 @@ for.body29:
br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
}
-define void @pr43371_pgso() !prof !14 {
+define void @pr43371_pgso(i16 %val) !prof !14 {
;
; CHECK-LABEL: define void @pr43371_pgso(
-; CHECK-SAME: ) !prof [[PROF14]] {
+; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1
+; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 {
; CHECK-NEXT: unreachable
;
; PGSO-LABEL: define void @pr43371_pgso(
-; PGSO-SAME: ) !prof [[PROF14]] {
+; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; PGSO-NEXT: [[ENTRY:.*:]]
; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
+; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; PGSO: [[VECTOR_BODY]]:
; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
+; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]]
; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
+; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]]
-; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1
+; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]]
; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1
+; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
@@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 {
; PGSO-NEXT: unreachable
;
; NPGSO-LABEL: define void @pr43371_pgso(
-; NPGSO-SAME: ) !prof [[PROF14]] {
+; NPGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] {
; NPGSO-NEXT: [[ENTRY:.*:]]
; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]]
; NPGSO: [[VECTOR_SCEVCHECK]]:
-; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755
+; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]]
+; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; NPGSO: [[VECTOR_PH]]:
; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]]
+; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]]
; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1
@@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 {
; NPGSO-NEXT: unreachable
; NPGSO: [[FOR_BODY29]]:
; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ]
-; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]]
+; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]]
; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32
; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]]
; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1
@@ -449,7 +461,7 @@ for.cond.cleanup28:
for.body29:
%i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
- %add33 = add i16 undef, %i24.0170
+ %add33 = add i16 %val, %i24.0170
%idxprom34 = zext i16 %add33 to i32
%arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34
store i16 0, ptr %arrayidx35, align 1
@@ -464,45 +476,87 @@ define i32 @pr45526() optsize {
;
; CHECK-LABEL: define i32 @pr45526(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP10]]
;
; PGSO-LABEL: define i32 @pr45526(
; PGSO-SAME: ) #[[ATTR0]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
-; PGSO-NEXT: br label %[[LOOP:.*]]
-; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; PGSO-NEXT: ret i32 [[TMP10]]
;
; NPGSO-LABEL: define i32 @pr45526(
; NPGSO-SAME: ) #[[ATTR0]] {
-; NPGSO-NEXT: [[ENTRY:.*]]:
-; NPGSO-NEXT: br label %[[LOOP:.*]]
-; NPGSO: [[LOOP]]:
-; NPGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; NPGSO-NEXT: [[ENTRY:.*:]]
+; NPGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; NPGSO: [[VECTOR_PH]]:
+; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; NPGSO: [[VECTOR_BODY]]:
+; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; NPGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; NPGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NPGSO: [[MIDDLE_BLOCK]]:
+; NPGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; NPGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; NPGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; NPGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; NPGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; NPGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; NPGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; NPGSO-NEXT: br label %[[EXIT:.*]]
; NPGSO: [[EXIT]]:
-; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; NPGSO-NEXT: ret i32 [[TMP10]]
;
entry:
br label %loop
@@ -522,31 +576,59 @@ define i32 @pr45526_pgso() !prof !14 {
;
; CHECK-LABEL: define i32 @pr45526_pgso(
; CHECK-SAME: ) !prof [[PROF14]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; CHECK-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP10]]
;
; PGSO-LABEL: define i32 @pr45526_pgso(
; PGSO-SAME: ) !prof [[PROF14]] {
-; PGSO-NEXT: [[ENTRY:.*]]:
-; PGSO-NEXT: br label %[[LOOP:.*]]
-; PGSO: [[LOOP]]:
-; PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PIVPLUS1:%.*]], %[[LOOP]] ]
-; PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, %[[ENTRY]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
-; PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; PGSO-NEXT: [[ENTRY:.*:]]
+; PGSO-NEXT: br label %[[VECTOR_PH:.*]]
+; PGSO: [[VECTOR_PH]]:
+; PGSO-NEXT: br label %[[VECTOR_BODY:.*]]
+; PGSO: [[VECTOR_BODY]]:
+; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; PGSO-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[VEC_IND]], splat (i32 510)
+; PGSO-NEXT: [[TMP1]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 512
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; PGSO: [[MIDDLE_BLOCK]]:
+; PGSO-NEXT: [[TMP4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP0]], i1 true)
+; PGSO-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PGSO-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 [[TMP6]]
+; PGSO-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; PGSO-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP5]], 0
+; PGSO-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]]
+; PGSO-NEXT: br label %[[EXIT:.*]]
; PGSO: [[EXIT]]:
-; PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
-; PGSO-NEXT: ret i32 [[FOR_LCSSA]]
+; PGSO-NEXT: ret i32 [[TMP10]]
;
; NPGSO-LABEL: define i32 @pr45526_pgso(
; NPGSO-SAME: ) !prof [[PROF14]] {
@@ -561,7 +643,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
-; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; NPGSO-NEXT: br label %[[SCALAR_PH:.*]]
@@ -572,7 +654,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP23:![0-9]+]]
+; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP24:![0-9]+]]
; NPGSO: [[EXIT]]:
; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
@@ -628,7 +710,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[FOR_END]]:
@@ -666,7 +748,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; PGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[FOR_END:.*]]
; PGSO: [[FOR_END]]:
@@ -704,7 +786,7 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize {
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; NPGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[FOR_END:.*]]
; NPGSO: [[FOR_END]]:
@@ -745,7 +827,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -758,7 +840,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -777,7 +859,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
; PGSO-NEXT: br label %[[SCALAR_PH]]
; PGSO: [[SCALAR_PH]]:
@@ -790,7 +872,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; PGSO: [[FOR_END]]:
; PGSO-NEXT: ret void
;
@@ -809,7 +891,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: br label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
@@ -822,7 +904,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret void
;
@@ -1008,7 +1090,9 @@ exit:
; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]}
; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]}
; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
-; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
+; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
+; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]}
;.
; PGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; PGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1017,7 +1101,9 @@ exit:
; PGSO: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]}
; PGSO: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]}
; PGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
-; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
+; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]], [[META17]]}
+; PGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
+; PGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]}
;.
; NPGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; NPGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1028,8 +1114,9 @@ exit:
; NPGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]}
; NPGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
; NPGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META16]]}
-; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]], [[META16]]}
; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]]}
+; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index 2d30e0c9ad10f..f65e9cab1700b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -10,13 +10,13 @@
; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ]
; Function Attrs: nounwind uwtable
-define void @main(i32 %n) #0 {
+define void @main(i32 %n, i32 %v) #0 {
entry:
br label %for.cond1.preheader.i
for.cond1.preheader.i: ; preds = %if.end.2.i, %entry
%c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ]
- %tobool.i = icmp ne i32 undef, 0
+ %tobool.i = icmp ne i32 %v, 0
br label %if.end.2.i
if.end.2.i: ; preds = %for.cond1.preheader.i
diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll
index f2dfecc341e6f..bc27e4e9b09cd 100644
--- a/llvm/test/Transforms/LoopVectorize/pr36311.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll
@@ -10,10 +10,7 @@
$test = comdat any
-declare i32 @__gxx_personality_v0(...)
-
-; Function Attrs: uwtable
-define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 {
+define void @test(ptr %p, i1 %arg) {
entry:
br label %for.body51
@@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89
for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51
%i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ]
- %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ]
+ %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ]
%add90 = add nuw i32 %i79.0179, 1
- %mul91 = mul i32 %add90, undef
+ %mul91 = mul i32 %add90, 7
br label %for.body89
for.body89: ; preds = %for.body89, %for.body89.lr.ph
@@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo
%add93 = add i32 %add92, %mul91
%inc94 = add i32 %next_index.5174, 1
%conv95 = zext i32 %next_index.5174 to i64
- %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95
+ %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95
store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1
- %cmp87 = icmp ult i32 %add92, undef
+ %cmp87 = icmp ult i32 %add92, 123
br i1 %cmp87, label %for.body89, label %for.cond80.loopexit
nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit
diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
index cbc9fccebb881..960065b09cfd1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll
@@ -39,22 +39,24 @@
define i64 @test1(i64 %y) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> splat (i64 3), [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP2]], <4 x i64> splat (i64 77), <4 x i64> [[TMP1]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: [[DIV:%.*]] = xor i64 3, [[Y]]
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP4]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP5]]
;
entry:
br label %for.body
@@ -84,21 +86,23 @@ for.cond.cleanup:
define i64 @test2(i64 %y) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i64> splat (i64 77), <4 x i64> splat (i64 55)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i64 [ 55, [[COND_FALSE]] ], [ 77, [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i64 [[COND_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP4]]
;
entry:
br label %for.body
@@ -127,21 +131,23 @@ for.cond.cleanup:
define i32 @test3(i64 %y) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[Y:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[Y:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[COND_END]], label [[COND_FALSE:%.*]]
-; CHECK: cond.false:
-; CHECK-NEXT: br label [[COND_END]]
-; CHECK: cond.end:
-; CHECK-NEXT: [[COND:%.*]] = phi i32 [ 55, [[COND_FALSE]] ], [ [[I]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> splat (i32 55)
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> <i1 false, i1 false, i1 false, i1 true>, i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[COND_END:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[COND_END]] ]
-; CHECK-NEXT: ret i32 [[COND_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP4]]
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
index a1cb361d20bee..9921f2916ce00 100644
--- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
@@ -36,7 +36,8 @@ define i16 @test_true_and_false_branch_equal() {
; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
; CHECK: pred.srem.continue2:
; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP6]], [[PRED_SREM_CONTINUE]] ], [ [[TMP9]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[TMP10]], <2 x i16> splat (i16 5786)
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP13]], <2 x i16> [[TMP10]], <2 x i16> splat (i16 5786)
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
; CHECK-NEXT: store i16 [[TMP11]], ptr @v_39, align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/pr45525.ll b/llvm/test/Transforms/LoopVectorize/pr45525.ll
index f32de2d75cdef..b05cf6ef76675 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45525.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45525.ll
@@ -9,14 +9,12 @@ define void @main(i1 %cond, ptr %arr) {
; CHECK-NEXT: [[BB_0:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], splat (i32 3)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> splat (i32 7), <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <4 x i32> splat (i32 7), <4 x i32> [[TMP5]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i32 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -24,8 +22,9 @@ define void @main(i1 %cond, ptr %arr) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[BB_4:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[BB_4:.*]]
+; CHECK: [[BB_4]]:
+; CHECK-NEXT: ret void
;
bb.0:
br label %bb.1
diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index 878fbec452220..be9110ce0093a 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -18,20 +18,16 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 7b3500933314a..ebd532aa5032c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -23,9 +23,9 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9)
+; CHECK-NEXT: [[PREDPHI5:%.*]] = select i1 [[C_2]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9)
; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]]
-; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]]
+; CHECK-NEXT: [[PREDPHI7]] = select i1 [[C_2]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
index 0656cd2b2aa94..0fdc8fd6ad519 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
@@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) {
; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]]
; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
; CHECK: loopexit:
; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ]
@@ -31,7 +31,7 @@ for.body:
%cmp2 = icmp ult ptr %q, %g
%umin = select i1 %cmp2, ptr %q, ptr %g
%iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, undef
+ %exitcond = icmp eq i64 %iv.next, 123
br i1 %exitcond, label %loopexit, label %for.body
loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 13cc1b657d231..5f54b0ac7834a 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -3,7 +3,7 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
+define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) {
; CHECK-LABEL: @PR34687(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4
@@ -13,20 +13,28 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]]
+; CHECK-NEXT: [[PREDPHI1:%.*]] = select i1 [[C]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8>
; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT]] = add
nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +44,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] = sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +70,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label %if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +82,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 @@ -86,11 +98,9 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 9a699826696ec..70adac2103feb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -84,12 +84,8 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll index a852b731ea13b..9e523be618b44 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll @@ -12,12 +12,15 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 -56) +; CHECK-NEXT: [[TMP18:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP18]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> 
[[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: @@ -26,7 +29,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: @@ -35,7 +38,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF3]]: ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: @@ -44,7 +47,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF5]]: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. 
-  %idxprom21 = zext i32 undef to i64
+  %idxprom21 = zext i32 0 to i64
-  %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17
-  store i32 undef, ptr %arrayidx23, align 4
+  %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17
+  store i32 poison, ptr %arrayidx23, align 4
   %indvars.next= add i64 %indvars.iv17, -1
   %0 = trunc i64 %indvars.next to i32
-  %cmp15 = icmp ugt i32 %0, undef
+  %cmp15 = icmp ugt i32 %0, poison
   br i1 %cmp15, label %for.body, label %loopexit
 loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 71311db33cf1a..3b515a2acb1a7 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -96,13 +96,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
+; CHECK-NEXT: [[PREDPHI2:%.*]] = select i1 [[C]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
index 3b34b75a4c511..52dbe931db8bc 100644
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck --check-prefix=FORCED-TF %s
 ; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
 
 ; This test should produce the same result as with default options, and when tail folding
@@ -13,6 +13,24 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
+; FORCED-TF-LABEL: @basic_loop(
+; FORCED-TF-NEXT: header:
+; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
+; FORCED-TF-NEXT: br label [[BODY:%.*]]
+; FORCED-TF: body:
+; FORCED-TF-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[SIZE:%.*]], [[HEADER:%.*]] ]
+; FORCED-TF-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[PTR:%.*]], [[HEADER]] ]
+; FORCED-TF-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
+; FORCED-TF-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
+; FORCED-TF-NEXT: [[TMP0:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP0]], ptr [[BUFF]], align 1
+; FORCED-TF-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; FORCED-TF-NEXT: br i1 [[TOBOOL11]], label [[END:%.*]], label [[BODY]]
+; FORCED-TF: end:
+; FORCED-TF-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ]
+; FORCED-TF-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
+; FORCED-TF-NEXT: ret void
+;
 ; CHECK-LABEL: @basic_loop(
 ; CHECK-NEXT: header:
 ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
@@ -21,36 +39,36 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
 ; CHECK-NEXT: br label [[BODY:%.*]]
 ; CHECK: body:
 ; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
 ; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[BUFF]], align 1
 ; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
 ; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -74,45 +92,162 @@ end:
 }
 
 define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
+; FORCED-TF-LABEL: @metadata(
+; FORCED-TF-NEXT: header:
+; FORCED-TF-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
+; FORCED-TF-NEXT: br label [[VECTOR_PH:%.*]]
+; FORCED-TF: vector.ph:
+; FORCED-TF-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3
+; FORCED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; FORCED-TF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; FORCED-TF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1
+; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; FORCED-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; FORCED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; FORCED-TF: vector.body:
+; FORCED-TF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; FORCED-TF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; FORCED-TF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; FORCED-TF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; FORCED-TF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; FORCED-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]]
+; FORCED-TF-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]]
+; FORCED-TF-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]]
+; FORCED-TF-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]]
+; FORCED-TF-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; FORCED-TF-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
+; FORCED-TF-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2
+; FORCED-TF-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3
+; FORCED-TF-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; FORCED-TF-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; FORCED-TF-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; FORCED-TF-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; FORCED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; FORCED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1
+; FORCED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1
+; FORCED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1
+; FORCED-TF-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0
+; FORCED-TF-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1
+; FORCED-TF-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2
+; FORCED-TF-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3
+; FORCED-TF-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; FORCED-TF-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FORCED-TF: pred.store.if:
+; FORCED-TF-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FORCED-TF: pred.store.continue:
+; FORCED-TF-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; FORCED-TF-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; FORCED-TF: pred.store.if6:
+; FORCED-TF-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; FORCED-TF: pred.store.continue7:
+; FORCED-TF-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; FORCED-TF-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; FORCED-TF: pred.store.if8:
+; FORCED-TF-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; FORCED-TF: pred.store.continue9:
+; FORCED-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; FORCED-TF-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; FORCED-TF: pred.store.if10:
+; FORCED-TF-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1
+; FORCED-TF-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1
+; FORCED-TF-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; FORCED-TF: pred.store.continue11:
+; FORCED-TF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; FORCED-TF-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; FORCED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FORCED-TF: middle.block:
+; FORCED-TF-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; FORCED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true)
+; FORCED-TF-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
+; FORCED-TF-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]]
+; FORCED-TF-NEXT: br label [[END:%.*]]
+; FORCED-TF: end:
+; FORCED-TF-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4
+; FORCED-TF-NEXT: ret void
+;
 ; CHECK-LABEL: @metadata(
 ; CHECK-NEXT: header:
 ; CHECK-NEXT: [[PTR0:%.*]] = load ptr, ptr [[POS:%.*]], align 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[SIZE:%.*]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[SIZE]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP2]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> poison, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[NEXT_GEP2]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 3
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP2]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP3]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x ptr> [[TMP13]], ptr [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 3
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: store i8 [[TMP18]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; CHECK: pred.store.if6:
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT: store i8 [[TMP20]], ptr [[NEXT_GEP1]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; CHECK: pred.store.continue7:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK: pred.store.if8:
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT: store i8 [[TMP22]], ptr [[NEXT_GEP2]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; CHECK: pred.store.continue9:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; CHECK: pred.store.if10:
+; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP12]], align 1
+; CHECK-NEXT: store i8 [[TMP24]], ptr [[NEXT_GEP3]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; CHECK: pred.store.continue11:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
-; CHECK-NEXT: br label [[BODY:%.*]]
-; CHECK: body:
-; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[BUFF:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[BUFF]], i32 1
-; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP5]], ptr [[BUFF]], align 1
-; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP26]], i1 true)
+; CHECK-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x ptr> [[TMP16]], i64 [[TMP28]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: end:
-; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END1]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store ptr [[INCDEC_PTR_LCSSA]], ptr [[POS]], align 4
+; CHECK-NEXT: store ptr [[TMP29]], ptr [[POS]], align 4
 ; CHECK-NEXT: ret void
 ;
 header:
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index e160a15ece47d..bba459f776050 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1140,18 +1140,14 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
 ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
-; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
 ; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
-; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[A]], align 1
 ; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP1]], align 1
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2: [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT: br label %[[SCALAR_PH:.*]]
 ; VF8UF2: [[SCALAR_PH]]:
@@ -1165,7 +1161,7 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
 ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF8UF2: [[EXIT]]:
 ; VF8UF2-NEXT: ret void
 ;
@@ -1177,14 +1173,10 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
 ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
 ; VF16UF1-NEXT: [[TMP1:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[A]], align 1
+; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1: [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT: br label %[[SCALAR_PH:.*]]
 ; VF16UF1: [[SCALAR_PH]]:
@@ -1198,7 +1190,7 @@ define void @test_vector_tc_eq_16(ptr %A) {
 ; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
 ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF16UF1: [[EXIT]]:
 ; VF16UF1-NEXT: ret void
 ;
@@ -1232,12 +1224,10 @@ exit:
 ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
 ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
new file mode 100644
index 0000000000000..857b9131a0b8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-metadata.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+
+define void @test_widen_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK: WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float, !fpmath !5
+  %mul = fmul float %conv, 2.0, !fpmath !5
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float)
+
+define void @test_intrinsic_with_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_intrinsic_with_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-INTRINSIC ir<%sqrt> = call llvm.sqrt(ir<%lv>)
+; CHECK: WIDEN store vp<{{.*}}>, ir<%sqrt>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i32 %i
+  %lv = load float, ptr %gep.A, align 4, !tbaa !0
+  %sqrt = call float @llvm.sqrt.f32(float %lv), !fpmath !5
+  %gep.B = getelementptr inbounds float, ptr %B, i32 %i
+  store float %sqrt, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_widen_with_multiple_metadata(ptr noalias %A, ptr noalias %B, i32 %n) {
+; CHECK-LABEL: Checking a loop in 'test_widen_with_multiple_metadata'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK: <x1> vector loop: {
+; CHECK: vector.body:
+; CHECK: WIDEN ir<%lv> = load vp<{{.*}}>
+; CHECK: WIDEN-CAST ir<%conv> = sitofp ir<%lv> to float
+; CHECK: WIDEN ir<%mul> = fmul ir<%conv>, ir<2.000000e+00>
+; CHECK: WIDEN-CAST ir<%conv.back> = fptosi ir<%mul> to i32
+; CHECK: WIDEN store vp<{{.*}}>, ir<%conv.back>
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i32 %i
+  %lv = load i32, ptr %gep.A, align 4, !tbaa !0, !range !6
+  %conv = sitofp i32 %lv to float
+  %mul = fmul float %conv, 2.0
+  %conv.back = fptosi float %mul to i32
+  %gep.B = getelementptr inbounds i32, ptr %B, i32 %i
+  store i32 %conv.back, ptr %gep.B, align 4, !tbaa !0
+  %i.next = add i32 %i, 1
+  %cond = icmp eq i32 %i.next, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2}
+!2 = !{!"root"}
+!5 = !{float 2.500000e+00}
+!6 = !{i32 0, i32 100}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 291ada86cf797..ef678ff759943 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -804,9 +804,9 @@ exit:
 define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_mulacc_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -814,107 +814,84 @@ define i32 @print_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
-; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
-; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
-; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
-; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%1> = original trip-count
+entry:
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %l = load i8, ptr %ptr.iv, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 %l.ext, 63
+  %red.next = add i32 %red, %mul
+  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+  %ec = icmp eq ptr %ptr.iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.next
+}
+
+define i32 @print_mulacc_extended_const_lhs(ptr %start, ptr %end) {
+; CHECK-LABEL: 'print_mulacc_extended_const_lhs'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<%3> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
-; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
-; CHECK-NEXT: IR %0 = add i64 %end1, 1
-; CHECK-NEXT: IR %1 = sub i64 %0, %start2
-; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
-; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64))
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
-; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
-; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63>
-; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<63>, ir<%l.ext>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = zext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 entry:
   br label %loop
@@ -923,7 +900,7 @@ loop:
   %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
   %l = load i8, ptr %ptr.iv, align 1
   %l.ext = zext i8 %l to i32
-  %mul = mul i32 %l.ext, 63
+  %mul = mul i32 63, %l.ext
   %red.next = add i32 %red, %mul
   %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
   %ec = icmp eq ptr %ptr.iv, %end
@@ -937,9 +914,9 @@ exit:
 define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_mulacc_not_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<entry>:
@@ -947,108 +924,30 @@ define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1>
-; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1>
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9>
-; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7>
-; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%8>
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
-; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul ir<%l.ext>, ir<128>)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX:%.+]]>, vp<[[RDX_NEXT]]>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in ir<%1> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64
-; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64
-; CHECK-NEXT: IR %0 = add i64 %end1, 1
-; CHECK-NEXT: IR %1 = sub i64 %0, %start2
-; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4>
-; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check>
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4>
-; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf>
-; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1>
-; CHECK-NEXT: Successor(s): vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep>
-; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128>
-; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>)
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
-; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec>
-; CHECK-NEXT: Successor(s): middle.block, vector.body
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next>
-; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec>
-; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1
-; CHECK-NEXT: IR %l.ext = sext i8 %l to i32
-; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128
-; CHECK-NEXT: IR %red.next = add i32 %red, %mul
-; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 entry:
   br label %loop
@@ -1071,9 +970,9 @@ exit:
 define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) {
 ; CHECK-LABEL: 'print_ext_mulacc_extended_const'
 ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF
-; CHECK-NEXT: Live-in vp<%1> = VF * UF
-; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = 
VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: @@ -1081,109 +980,29 @@ define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; 
CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 -; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 -; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop @@ -1207,9 +1026,9 @@ exit: define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in 
vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count ; CHECK-NEXT: vp<%3> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: @@ -1217,112 +1036,31 @@ define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> -; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: vp<[[DER_IV:%.+]]> = DERIVED-IV ir<%start> + vp<[[VTC]]> * ir<1> +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: <x1> vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> -; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> -; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: 
IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { -; CHECK-NEXT: Live-in ir<%1> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<entry>: -; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 -; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 -; CHECK-NEXT: IR %0 = add i64 %end1, 1 -; CHECK-NEXT: IR %1 = sub i64 %0, %start2 -; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> -; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> -; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> -; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> -; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> -; CHECK-NEXT: Successor(s): vector.body -; CHECK-EMPTY: -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> -; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> -; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> -; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> -; CHECK-NEXT: Successor(s): middle.block, vector.body -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<[[RDX]]>, vp<[[RDX_NEXT]]> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<exit>: -; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<scalar.ph>: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] -; CHECK-NEXT: Successor(s): ir-bb<loop> -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb<loop>: -; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) -; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 -; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 -; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 -; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 -; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext -; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 -; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end -; CHECK-NEXT: No successors -; CHECK-NEXT: } entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 994e9c1ce64fa..3161a0d5e6f5e 100644 --- 
a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -debug -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -debug -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -29,11 +29,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%x> +; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond> ; CHECK-NEXT: Successor(s): pred.store ; CHECK: <xVFxUF> pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK: pred.store.if: @@ -50,24 +52,31 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK: loop.1: +; CHECK: if.1: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; -define void @sink1(i32 %k) { +define void @sink1(i32 %k, i32 %x) { entry: br label %loop loop: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cond = icmp eq i32 %iv, %x + br i1 %cond, label %if, label %latch + +if: %gep.b = getelementptr inbounds [2048 x i32], ptr @b, i32 0, i32 %iv %lv.b = load i32, ptr %gep.b, align 4 %add = add i32 %lv.b, 10 %mul = mul i32 2, %add %gep.a = getelementptr inbounds [2048 x i32], ptr @a, i32 0, i32 %iv store i32 %mul, ptr %gep.a, align 4 + br label %latch + +latch: %iv.next = add i32 %iv, 1 %large = icmp sge i32 %iv, 8 %exitcond = icmp eq i32 %iv, %k diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll index aaabd18958fae..618ec86ebd35d 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -118,18 +118,18 @@ declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) !4 = !{i32 2, !"Debug Info Version", i32 3} !5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) !17 = !DIFile(filename: "toplevel.c", directory: "/test") -!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !18 = !DIFile(filename: "assign.h", 
directory: "/test") -!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !20 = !DIFile(filename: "add.h", directory: "/test") -!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DIFile(filename: "store.h", directory: "/test") -!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DIFile(filename: "transpose.h", directory: "/test") -!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !6 = !DISubroutineType(types: !7) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll index 628ff08b81679..ff41c57055bff 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -163,26 +163,26 @@ declare void @llvm.matrix.column.major.store(<9 x double>, ptr, i64, i1, i32, i3 !19 = !DILocation(line: 10, column: 20, scope: !5) !20 = !DILocation(line: 10, column: 10, scope: !5) -!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 30, column: 20, scope: !21) -!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DILocation(line: 40, column: 20, scope: !23) -!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = 
distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 50, column: 20, scope: !25) -!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !28 = !DILocation(line: 60, column: 20, scope: !27) -!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !30 = !DILocation(line: 70, column: 20, scope: !29) -!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 80, column: 20, scope: !31) -!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !34 = !DILocation(line: 90, column: 20, scope: !33) -!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !36 = !DILocation(line: 100, column: 20, scope: !35) diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. ; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, 
!invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll index b29834f9fe960..6e543b8c87fc7 100644 --- a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll +++ b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes='require<profile-summary>,chr' -S | FileCheck %s declare void @foo() @@ -14,21 +14,21 @@ define void @test_chr_with_lifetimes(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = select i1 true, i1 [[TMP9]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = select i1 true, i1 [[TMP9]], i1 false, !prof [[PROF15:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP8]], i1 [[TMP11]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16:![0-9]+]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17:![0-9]+]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18:![0-9]+]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -83,24 +83,24 @@ define void @test_chr_dynamic_alloca(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb4.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: 
[[TEST:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST_NONCHR:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST_NONCHR]], ptr [[I]], align 8 @@ -167,21 +167,21 @@ define void @test_no_move_allocas(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -242,4 +242,26 @@ bb3: !14 = !{!"function_entry_count", i64 100} !15 = !{!"branch_weights", i32 0, i32 1} -; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +;. 
+; CHECK: [[META0:![0-9]+]] = !{i32 1, !"ProfileSummary", [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]}
+; CHECK: [[META2]] = !{!"ProfileFormat", !"InstrProf"}
+; CHECK: [[META3]] = !{!"TotalCount", i64 10000}
+; CHECK: [[META4]] = !{!"MaxCount", i64 10}
+; CHECK: [[META5]] = !{!"MaxInternalCount", i64 1}
+; CHECK: [[META6]] = !{!"MaxFunctionCount", i64 1000}
+; CHECK: [[META7]] = !{!"NumCounts", i64 3}
+; CHECK: [[META8]] = !{!"NumFunctions", i64 3}
+; CHECK: [[META9]] = !{!"DetailedSummary", [[META10:![0-9]+]]}
+; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]}
+; CHECK: [[META11]] = !{i32 10000, i64 100, i32 1}
+; CHECK: [[META12]] = !{i32 999000, i64 100, i32 1}
+; CHECK: [[META13]] = !{i32 999999, i64 1, i32 2}
+; CHECK: [[META14:![0-9]+]] = !{!"function_entry_count", i64 100}
+; CHECK: [[PROF15]] = !{!"unknown", !"chr"}
+; CHECK: [[PROF16]] = !{!"branch_weights", i32 1000, i32 0}
+; CHECK: [[PROF17]] = !{!"branch_weights", i32 1, i32 0}
+; CHECK: [[PROF18]] = !{!"branch_weights", i32 0, i32 1}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
new file mode 100644
index 0000000000000..5213a07d13d39
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
@@ -0,0 +1,118 @@
+;; Tests that the compiler ignores smaller contexts that differ only in the
+;; IsInlineFrame bool. These map to the same full context id internally, as we
+;; ignore the inline frame status, which may differ in feedback compiles.
+;; Presumably this happens when profiles collected from different binaries are
+;; merged. If we didn't pick the largest, we would default them all to noncold.
+
+;; Avoid failures on big-endian systems that can't read the profile properly
+; REQUIRES: x86_64-linux
+
+;; Generate the profile and the IR.
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_diff_inline.yaml -o %t.memprofdata + +; RUN: opt < %t/memprof_diff_inline.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF + +; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames +; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames + +;--- memprof_diff_inline.yaml +--- +HeapProfileRecords: + - GUID: _Z3foov + AllocSites: + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Large cold, full context id 16342802530253093571, should keep + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 10000 + TotalLifetime: 200000 + TotalLifetimeAccessDensity: 0 + # Small non-cold, full context id 16342802530253093571, should ignore + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true } + - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 100 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + # Small non-cold, full context id 10194276560488437434 + - Callstack: + - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false } + - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 200 + TotalLifetime: 0 + TotalLifetimeAccessDensity: 20000 + CallSites: [] +... 
+;--- memprof_diff_inline.ll +; ModuleID = 'memprof_diff_inline.cc' +source_filename = "memprof_diff_inline.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"struct.std::nothrow_t" = type { i8 } + +@_ZSt7nothrow = external global %"struct.std::nothrow_t", align 1 + +define dso_local noundef ptr @_Z3foov() !dbg !10 { +entry: + ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]] + %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13 + ret ptr %call +} + +declare noundef ptr @_Znwm(i64 noundef) + +attributes #6 = { builtin allocsize(0) } + +; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]} + +; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]} +; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691} +;; Full context id 10194276560488437434 == -8252467513221114182 +; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200} + +; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]} +; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 -5747251260480066785} +;; Full context id 16342802530253093571 == -2103941543456458045 +;; We should have kept the large (cold) one. +; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000} + +; MEMPROF: ![[C1]] = !{i64 2732490490862098848} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 5, column: 10, scope: !10) diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 2461ca32e9821..ba53c5797208c 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -4,24 +4,50 @@ ;; Avoid failures on big-endian systems that can't read the profile properly ; REQUIRES: x86_64-linux -;; TODO: Use text profile inputs once that is available for memprof. -;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh. -;; # To generate below LLVM IR for use in matching. -;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm +; Generate the profile and the IR. 
+; RUN: split-file %s %t + +;; Generate indexed profile +; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata -; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata ;; Set the minimum lifetime threshold to 0 to ensure that one context is ;; considered cold (the other will be notcold). -; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s +; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s -;; Conservatively annotate as not cold. We get two messages as there are two -;; unrolled copies of the allocation. -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 -; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4 +;; Conservatively annotate as not cold. +; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4 ; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]] ; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" } ; CHECK-NOT: stackIds: () +;--- memprof_loop_unroll.yaml +--- +HeapProfileRecords: + - GUID: 0x7f8d88fcc70a347b + AllocSites: + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 12500000000 + - Callstack: + - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false } + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } + MemInfoBlock: + AllocCount: 1 + TotalSize: 4 + TotalLifetime: 2 + TotalLifetimeAccessDensity: 0 + - GUID: 0xdb956436e78dd5fa + CallSites: + - Frames: + - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false } +... + +;--- memprof_loop_unroll.ll ; ModuleID = 'memprof_loop_unroll_b.cc' source_filename = "memprof_loop_unroll_b.cc" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll index 3b059fd7d8800..9c5f046af47af 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; Test prof-verify for functions without entry count ; RUN: not opt -passes=prof-verify %s -o - 2>&1 | FileCheck %s diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll index 50159506e8313..75d1e6a3db571 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. 
+; UNSUPPORTED: profcheck + ; Test prof-inject and prof-verify ; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT diff --git a/llvm/test/Transforms/PGOProfile/profcheck-select.ll b/llvm/test/Transforms/PGOProfile/profcheck-select.ll index b5dc97d2d5a6d..74bcb3f52428b 100644 --- a/llvm/test/Transforms/PGOProfile/profcheck-select.ll +++ b/llvm/test/Transforms/PGOProfile/profcheck-select.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; RUN: split-file %s %t ; RUN: opt -passes=prof-inject %t/inject.ll -S -o - | FileCheck %t/inject.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll new file mode 100644 index 0000000000000..a35bcf1c5a88d --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes='default<O3>' -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +%"class.dealii::VectorizedArray" = type { [4 x double] } + +define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) { +; CHECK-LABEL: define void @hoist_invariant_load( +; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0 +; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[INVARIANT_VAL:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8 +; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8 +; CHECK-NEXT: [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]] +; CHECK-NEXT: store double [[SUM]], ptr [[GEP]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I2]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: ; preds = %loop.latch, %entry + %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ] + %cmp = icmp ult i64 %i, %num_elements + br i1 %cmp, label %loop.latch, label %exit + +loop.latch: ; preds = %loop.header + %gep = getelementptr nusw %"class.dealii::VectorizedArray", ptr %array, i64 %i + %invariant_val = load double, ptr %invariant_ptr, align 8 + %array_val = load double, ptr %gep, align 8 + %sum = fadd double %array_val, %invariant_val + store double %sum, ptr %gep, align 8 + %i.next = add i64 %i, 1 + br label %loop.header + +exit: ; preds = %loop.header + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index e3765ed541e7a..75276c0412647 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -106,23 +106,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull 
align 8 dereferencea ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP4]], i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <2 x i64> [[TMP8]], splat (i64 225) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225) -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP17]], align 8, !alias.scope [[META0:![0-9]+]] @@ -182,23 +165,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.1: ; CHECK-NEXT: [[INDEX_1:%.*]] = phi i64 [ 0, [[VECTOR_PH_1]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_1]] ] ; CHECK-NEXT: [[TMP33:%.*]] = add nuw nsw i64 [[INDEX_1]], 15 -; CHECK-NEXT: [[TMP34:%.*]] = add nuw nsw i64 [[INDEX_1]], 16 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[TMP34]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDEX_1]], 17 -; CHECK-NEXT: [[TMP38:%.*]] = add nuw nsw i64 [[INDEX_1]], 18 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], splat (i64 225) -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP41]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i1> [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225) -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i1> [[TMP42]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP43]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP44]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP46]]) ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP33]] ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP47]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr [[TMP47]], align 8, !alias.scope [[META0]] @@ -259,23 +225,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.2: ; 
CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH_2]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY_2]] ] ; CHECK-NEXT: [[TMP64:%.*]] = add nuw nsw i64 [[INDEX_2]], 30 -; CHECK-NEXT: [[TMP65:%.*]] = add nuw nsw i64 [[INDEX_2]], 31 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <2 x i64> poison, i64 [[TMP64]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <2 x i64> [[TMP66]], i64 [[TMP65]], i64 1 -; CHECK-NEXT: [[TMP68:%.*]] = add nuw nsw i64 [[INDEX_2]], 32 -; CHECK-NEXT: [[TMP69:%.*]] = add nuw nsw i64 [[INDEX_2]], 33 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1 -; CHECK-NEXT: [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], splat (i64 225) -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i1> [[TMP72]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i1> [[TMP72]], i64 1 -; CHECK-NEXT: [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i1> [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i1> [[TMP73]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP74]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP75]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP76]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP77]]) ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP78]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr [[TMP78]], align 8, !alias.scope [[META0]] @@ -336,23 +285,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.3: ; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH_3]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY_3]] ] ; CHECK-NEXT: [[TMP95:%.*]] = add nuw nsw i64 [[INDEX_3]], 45 -; CHECK-NEXT: [[TMP96:%.*]] = add nuw nsw i64 [[INDEX_3]], 46 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i64> poison, i64 [[TMP95]], i64 0 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i64> [[TMP97]], i64 [[TMP96]], i64 1 -; CHECK-NEXT: [[TMP99:%.*]] = add nuw nsw i64 [[INDEX_3]], 47 -; CHECK-NEXT: [[TMP100:%.*]] = add nuw nsw i64 [[INDEX_3]], 48 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1 -; CHECK-NEXT: [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], splat (i64 225) -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <2 x i1> [[TMP103]], i64 0 -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i1> [[TMP103]], i64 1 -; CHECK-NEXT: [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225) -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i1> [[TMP104]], i64 0 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i1> [[TMP104]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP105]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP106]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP107]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP108]]) ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP95]] ; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP109]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr [[TMP109]], align 8, !alias.scope [[META0]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll 
b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: 
[[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; 
CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> 
poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: 
[[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], 
%[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll new file mode 100644 index 0000000000000..f7918b0e0a798 --- /dev/null +++ 
b/llvm/test/Transforms/PhaseOrdering/AArch64/scalarize-load-ext-extract.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -O3 -mtriple=arm64-apple-darwinos -S %s | FileCheck %s + +define noundef i32 @load_ext_extract(ptr %src) { +; CHECK-LABEL: define noundef range(i32 0, 1021) i32 @load_ext_extract( +; CHECK-SAME: ptr readonly captures(none) [[SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP14]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 255 +; CHECK-NEXT: [[TMP18:%.*]] = lshr i32 [[TMP14]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 255 +; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP14]], 255 +; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i32 [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[ADD1]], [[TMP17]] +; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i32 [[ADD2]], [[TMP15]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %x = load <4 x i8>, ptr %src, align 4 + %ext = zext nneg <4 x i8> %x to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index 338d9259b635c..fd7b75f22cb6d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -8,7 +8,6 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -49,10 +48,10 @@ entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %0, i64 256) ] %start.ptr = load ptr, ptr %first, align 8 %1 = load i64, ptr %first, align 8 - %coerce.val.pi.i = add i64 %1, 256 - %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr - %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip - br i1 %cmp.not6.i.i, label %return, label %loop.ph + %coerce.val.p = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr + %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %ec6., label %return, label %loop.ph loop.ph: %2 = load i16, ptr %s.addr, align 2 @@ -61,13 +60,13 @@ loop.ph: loop.header: %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] %3 = load i16, ptr %ptr.iv, align 2 - %cmp2.i.i = icmp eq i16 %3, %2 - br i1 %cmp2.i.i, label %return, label %loop.latch + %cmp2. 
= icmp eq i16 %3, %2 + br i1 %cmp2., label %return, label %loop.latch loop.latch: %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 - %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip - br i1 %cmp.not.i.i, label %return, label %loop.header + %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %ec., label %return, label %loop.header return: %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] @@ -103,10 +102,10 @@ entry: %0 = load ptr, ptr %first, align 8 %start.ptr = load ptr, ptr %first, align 8 %1 = load i64, ptr %first, align 8 - %coerce.val.pi.i = add i64 %1, 256 - %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr - %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip - br i1 %cmp.not6.i.i, label %return, label %loop.ph + %coerce.val.p = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr + %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %ec6., label %return, label %loop.ph loop.ph: %2 = load i16, ptr %s.addr, align 2 @@ -115,13 +114,13 @@ loop.ph: loop.header: %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] %3 = load i16, ptr %ptr.iv, align 2 - %cmp2.i.i = icmp eq i16 %3, %2 - br i1 %cmp2.i.i, label %return, label %loop.latch + %cmp2. = icmp eq i16 %3, %2 + br i1 %cmp2., label %return, label %loop.latch loop.latch: %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 - %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip - br i1 %cmp.not.i.i, label %return, label %loop.header + %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %ec., label %return, label %loop.header return: %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] @@ -129,9 +128,107 @@ return: ret i64 %res } +define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { +; CHECK-LABEL: define noundef ptr @std_find_caller( +; CHECK-SAME: ptr noundef [[FIRST:%.*]], ptr noundef [[LAST:%.*]]) local_unnamed_addr #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ] +; CHECK-NEXT: [[PRE_I:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[PRE_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT:.*]], label %[[LOOP_HEADER_I_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_I_PREHEADER]]: +; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST3:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[PTR_SUB:%.*]] = sub i64 [[LAST_I64]], [[FIRST3]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[PTR_SUB]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST3]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 158 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_HEADER_I_PREHEADER2:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], -8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[PROL_ITER_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr 
[[NEXT_GEP]], align 2 +; CHECK-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <8 x i16> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD_FR]], splat (i16 1) +; CHECK-NEXT: [[PROL_ITER_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0 +; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[PROL_ITER_CMP_NOT]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[XTRAITER]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP9]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I_PREHEADER2]] +; CHECK: [[LOOP_HEADER_I_PREHEADER2]]: +; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER_I:.*]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label %[[STD_FIND_GENERIC_IMPL_EXIT]] +; CHECK: [[LOOP_HEADER_I]]: +; CHECK-NEXT: [[PTR_IV_I:%.*]] = phi ptr [ [[PTR_IV_NEXT_I:%.*]], %[[LOOP_LATCH_I:.*]] ], [ [[PTR_IV_I_PH]], %[[LOOP_HEADER_I_PREHEADER2]] ] +; CHECK-NEXT: [[L_I:%.*]] = load i16, ptr [[PTR_IV_I]], align 2 +; CHECK-NEXT: [[C_1_I:%.*]] = icmp eq i16 [[L_I]], 1 +; CHECK-NEXT: br i1 [[C_1_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_LATCH_I]] +; CHECK: [[LOOP_LATCH_I]]: +; CHECK-NEXT: [[PTR_IV_NEXT_I]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 2 +; CHECK-NEXT: [[EC_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT_I]], [[LAST]] +; CHECK-NEXT: br i1 [[EC_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT]]: +; CHECK-NEXT: [[RES_I:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[SCEVGEP]], %[[MIDDLE_BLOCK]] ], [ [[TMP14]], %[[VECTOR_EARLY_EXIT]] ], [ [[SCEVGEP]], %[[LOOP_LATCH_I]] ], [ [[PTR_IV_I]], %[[LOOP_HEADER_I]] ] +; CHECK-NEXT: ret ptr [[RES_I]] +; +entry: + %last.i64 = ptrtoint ptr %last to i64 + %first.i64 = ptrtoint ptr %first to i64 + %ptr.sub = sub i64 %last.i64, %first.i64 + call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %last, i64 2) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %first, i64 %ptr.sub) ] + %call = call noundef ptr @std_find_generic_impl(ptr noundef nonnull %first, ptr noundef %last, i16 noundef signext 1) + ret ptr %call +} + +define linkonce_odr noundef ptr @std_find_generic_impl(ptr noundef %first, ptr noundef %last, i16 noundef %value) { +entry: + %pre = icmp eq ptr %first, %last + br i1 %pre, label %exit, label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %first, %entry ] + %l = load i16, ptr %ptr.iv, align 2 + %c.1 = icmp eq i16 %l, %value + br i1 %c.1, label %exit, label %loop.latch + 
+loop.latch: + %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 + %ec = icmp eq ptr %ptr.iv.next, %last + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi ptr [ %first, %entry ], [ %ptr.iv, %loop.header ], [ %ptr.iv.next, %loop.latch ] + ret ptr %res +} + declare void @llvm.assume(i1 noundef) ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll index 2c1d73eaafc5e..9f3244ded92ff 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll @@ -498,11 +498,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> % ; PR58139 define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @_mm_complexmult_pd_naive( -; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]] ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2> ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer ; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]]) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll index fa6403f3d4267..de64bf2657f72 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -502,11 +502,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> % ; PR58139 define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @_mm_complexmult_pd_naive( -; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]] ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0 +; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2> ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer ; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]]) diff --git 
a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll index d16843c81144d..6629b1219cbe8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define ptr @test(ptr %d) { +define ptr @test(ptr %d, i64 %v) { ; CHECK-LABEL: define ptr @test( -; CHECK-SAME: ptr [[D:%.*]]) { +; CHECK-SAME: ptr [[D:%.*]], i64 [[V:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[D]], align 1 ; CHECK-NEXT: [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, [[V]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, [[V]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4> -; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0> +; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 4, i64 3, i64 5, i64 4> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1 @@ -31,23 +31,23 @@ define ptr @test(ptr %d) { ; CHECK-NEXT: ret ptr [[TMP20]] ; entry: - %0 = load i8, ptr null, align 1 + %0 = load i8, ptr %d, align 1 %cmp4.2 = icmp eq i8 %0, 0 - %1 = select i1 %cmp4.2, i64 0, i64 0 + %1 = select i1 %cmp4.2, i64 0, i64 4 %2 = shl i64 %1, 1 %3 = getelementptr i8, ptr %d, i64 %2 - %4 = xor i64 0, 0 - %5 = udiv i64 %4, 0 + %4 = xor i64 0, %v + %5 = udiv i64 %4, 3 %6 = mul i64 %5, 6 %7 = getelementptr i8, ptr %d, i64 %6 - %8 = shl i64 %1, 0 + %8 = shl i64 %1, 2 %scevgep42 = getelementptr i8, ptr %d, i64 %8 - 
%9 = mul i64 %5, 1 + %9 = mul i64 %5, 3 %10 = getelementptr i8, ptr %d, i64 %9 - %11 = udiv i64 1, 0 - %12 = mul i64 %11, 1 + %11 = udiv i64 1, %v + %12 = mul i64 %11, 5 %13 = getelementptr i8, ptr %d, i64 %12 - %14 = mul i64 %11, 0 + %14 = mul i64 %11, 4 %15 = getelementptr i8, ptr %d, i64 %14 ret ptr %15 } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll new file mode 100644 index 0000000000000..959b2350d9d78 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define float @test(i8 %0) { +; CHECK-LABEL: define float @test( +; CHECK-SAME: i8 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27> +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: switch i32 [[TMP8]], label %[[EXIT:.*]] [ +; CHECK-NEXT: i32 0, label %[[EXIT]] +; CHECK-NEXT: i32 1, label %[[EXIT]] +; CHECK-NEXT: ] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + %1 = sext i8 0 to i32 + %2 = lshr i32 %1, 27 + %3 = sext i8 %0 to i32 + %reass.add.epil = mul i32 %3, 2 + %4 = or i32 %reass.add.epil, %2 + switch i32 %4, label %exit [ + i32 0, label %exit + i32 1, label %exit + ] + +exit: + ret float 0.000000e+00 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll new file mode 100644 index 0000000000000..65975199e46b8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i32> @test() { +; CHECK-LABEL: define <4 x i32> @test() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32 
+; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[OR:%.*]] = or i32 [[TRUNC]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[TRUNC]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0> +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[OR]] to i64 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: ret <4 x i32> [[TMP3]] +; +bb: + %trunc = trunc i64 0 to i32 + br label %bb1 + +bb1: + %or = or i32 %trunc, 0 + %zext = zext i32 %or to i64 + %and = and i32 0, 0 + %or2 = or i32 %trunc, 0 + br label %bb3 + +bb3: + %0 = insertelement <4 x i32> zeroinitializer, i32 %trunc, i32 0 + %1 = insertelement <4 x i32> %0, i32 %and, i32 1 + %2 = insertelement <4 x i32> %1, i32 %or2, i32 2 + %3 = insertelement <4 x i32> %2, i32 %or, i32 3 + ret <4 x i32> %3 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll new file mode 100644 index 0000000000000..d4aef24962313 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-9999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ], [ [[TMP6:%.*]], %[[BB14:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB10:.*]] ] +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ zeroinitializer, %[[BB1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB10]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[PHI12:%.*]] = phi float [ 0.000000e+00, %[[BB3]] ], [ 0.000000e+00, %[[BB14]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB3]] ], [ [[TMP7:%.*]], %[[BB14]] ] +; CHECK-NEXT: switch i32 0, label %[[BB14]] [ +; CHECK-NEXT: i32 0, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> [[TMP3]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6> +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[TMP5]], <i32 poison, i32 poison, i32 0, i32 0> +; CHECK-NEXT: [[TMP7]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7> +; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB10]] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %or16, %bb14 ], [ 0, %bb10 ] + %phi2 = phi i32 [ 0, %bb ], [ %or15, %bb14 ], [ 0, %bb10 ] + br label %bb3 + +bb3: ; preds = %bb1 + %phi4 = phi i32 [ poison, %bb1 ] + %phi6 = phi i32 [ poison, %bb1 ] + %phi7 = phi i32 [ %phi, %bb1 ] + %phi9 = phi i32 [ %phi2, %bb1 ] + %0 = phi <2 x float> [ zeroinitializer, %bb1 ] + br label %bb10 + +bb10: + %phi11 = phi i32 [ 0, %bb3 ], [ %phi11, %bb14 ] + %phi12 = phi float [ 0.000000e+00, %bb3 ], [ 0.000000e+00, %bb14 ] + 
%phi13 = phi i32 [ 0, %bb3 ], [ %or15, %bb14 ] + switch i32 0, label %bb14 [ + i32 0, label %bb1 + ] + +bb14: + %or = or i32 %phi13, %phi11 + %or15 = or i32 %or, 0 + %or16 = or i32 %phi11, 0 + br i1 false, label %bb1, label %bb10 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll new file mode 100644 index 0000000000000..590b0be973002 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +@a = common global [100 x i64] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1) +; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]] +; CHECK: [[LOP_RHSCNT_I_PEEL]]: +; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 533b1f691f5ad..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,26 +1,35 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) 
-define void @test_simple_case(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_simple_case( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: guarded.us: +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { +; CHECK-LABEL: define void @test_simple_case( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: deopt: +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label 
[[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: 
[[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br 
i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop @@ -264,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. 
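; ---------------------------------------------------------------------------
; Annotation: the regenerated checks above all follow one shape.
; simple-loop-unswitch<nontrivial> hoists a loop-invariant guard by cloning
; the loop and branching on the frozen condition in the preheader; the
; updated tests additionally verify that this branch carries synthesized
; weights (PROF1, strongly favoring the guarded clone) and that the function
; entry count (PROF0) survives. A minimal input of this shape, as a sketch
; (@guard_sketch below is illustrative, not part of the patch; the RUN lines
; are assumed to be the ones used by the tests above):

declare void @llvm.experimental.guard(i1, ...)

define void @guard_sketch(i1 %cond, i32 %N) !prof !0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %guarded ]
  ; %cond does not vary with %iv, so the guard is a nontrivial unswitch
  ; candidate: the pass clones the loop and tests freeze(%cond) up front.
  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
  br label %guarded

guarded:
  %iv.next = add i32 %iv, 1
  %loop.cond = icmp slt i32 %iv.next, %N
  br i1 %loop.cond, label %loop, label %exit

exit:
  ret void
}

!0 = !{!"function_entry_count", i32 10}
; ---------------------------------------------------------------------------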
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = 
getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 
[[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = 
getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: 
[[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: 
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; 
CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. -; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. 
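; ---------------------------------------------------------------------------
; Annotation: the regenerated checks above pin down two profile behaviors.
; The dispatch on the injected condition gets the explicit "unknown" marker
; naming the pass (PROF2), since no source profile exists for it, while the
; in-loop checks keep numeric weights (PROF3, PROF11). The injection itself
; rests on a simple unsigned implication: with %limit and %x loop-invariant,
; el <u limit together with limit <=u x implies el <u x, so testing
; "icmp ule %limit, %x" once in the preheader makes the range check
; trivially true in the specialized copy. A reduced sketch of the input
; shape (@inject_sketch is hypothetical, not part of the patch, and assumes
; the -simple-loop-unswitch-inject-invariant-conditions=true RUN lines
; above):

define i32 @inject_sketch(ptr %p, i32 %n, i32 %limit, i32 %x) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
  %el.ptr = getelementptr i32, ptr %p, i32 %iv
  %el = load i32, ptr %el.ptr, align 4
  %bound_check = icmp ult i32 %el, %limit
  br i1 %bound_check, label %guarded, label %fail, !prof !0

guarded:
  ; Trivially true in the unswitched copy where limit <=u x holds.
  %range_check = icmp ult i32 %el, %x
  br i1 %range_check, label %backedge, label %fail

backedge:
  %iv.next = add i32 %iv, 1
  %loop.cond = icmp slt i32 %iv.next, %n
  br i1 %loop.cond, label %loop, label %done

done:
  ret i32 0

fail:
  ret i32 -1
}

!0 = !{!"branch_weights", i32 100, i32 1}
; ---------------------------------------------------------------------------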
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop 
[[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; 
CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. 
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll index 6def8f4eeb089..a51b816846cdc 100644 --- a/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll +++ b/llvm/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll @@ -15,8 +15,8 @@ ; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @c1, ptr @c2, ptr @c3] ; ENABLE: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @g1, ptr @g2, ptr @g3] ; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @g1, ptr @g2, ptr @g3] -; ENABLE: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @f1, ptr @f2, ptr @f3] -; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [3 x ptr] [ptr @f1, ptr @f2, ptr @f3] +; ENABLE: @{{.*}} = private unnamed_addr constant [4 x ptr] [ptr @f1, ptr @f2, ptr @f3, ptr @f4] +; DISABLE-NOT: @{{.*}} = private unnamed_addr constant [4 x ptr] [ptr @f1, ptr @f2, ptr @f3, ptr @f4] target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "armv7a--none-eabi" diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll index aa95b3fd235e5..e48c2b46a138a 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll @@ -1,8 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" +;. +; CHECK: @switch.table.switch_of_powers_two = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable = private unnamed_addr constant [7 x i32] [i32 3, i32 5, i32 5, i32 2, i32 1, i32 0, i32 42], align 4 +; CHECK: @switch.table.switch_of_powers_two_default_reachable_multipreds = private unnamed_addr constant [7 x i32] [i32 3, i32 poison, i32 poison, i32 2, i32 1, i32 0, i32 42], align 4 +;. 
define i32 @switch_of_powers_two(i32 %arg) { ; CHECK-LABEL: define i32 @switch_of_powers_two( ; CHECK-SAME: i32 [[ARG:%.*]]) { @@ -35,17 +40,17 @@ return: ret i32 %phi } -define i32 @switch_of_powers_two_default_reachable(i32 %arg) { +define i32 @switch_of_powers_two_default_reachable(i32 %arg) !prof !0 { ; CHECK-LABEL: define i32 @switch_of_powers_two_default_reachable( -; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-SAME: i32 [[ARG:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[ARG]]) ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 1 -; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[ENTRY_SPLIT:.*]], label %[[RETURN:.*]], !prof [[PROF1:![0-9]+]] ; CHECK: [[ENTRY_SPLIT]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true) ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7 -; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SWITCH_LOOKUP:.*]], label %[[RETURN]], !prof [[PROF2:![0-9]+]] ; CHECK: [[SWITCH_LOOKUP]]: ; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP2]] to i64 ; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two_default_reachable, i64 0, i64 [[TMP4]] @@ -62,7 +67,7 @@ entry: i32 16, label %bb3 i32 32, label %bb4 i32 64, label %bb5 - ] + ], !prof !1 default_case: br label %return bb1: br label %return @@ -128,3 +133,13 @@ return: %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ], [ %pn, %default_case ] ret i32 %phi } + +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 10, i32 5, i32 7, i32 11, i32 13, i32 17} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 53, i32 5} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll new file mode 100644 index 0000000000000..1df655250f57e --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s + +; Make sure there's no use after free when removing incoming values from PHI nodes + +define i32 @pr165301(i1 %cond) !prof !0 { +; CHECK-LABEL: define i32 @pr165301( +; CHECK-SAME: i1 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[SWITCHBB:.*]] +; CHECK: [[SWITCHBB]]: +; CHECK-NEXT: br label %[[SWITCHBB]] +; +entry: + br label %switchbb + +switchbb: + switch i1 %cond, label %default [ + i1 false, label %switchbb + i1 true, label %switchbb + ], !prof !1 + +default: + %phi.lcssa = phi i32 [ 0, %switchbb ] + ret i32 %phi.lcssa +} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 2, i32 3, i32 5} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +;. 
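; ---------------------------------------------------------------------------
; Annotation: the PROF1/PROF2 values checked in switch-of-powers-of-two.ll
; above are consistent with the lookup-table transform splitting the original
; default weight evenly between the two new default exits. This is a reading
; of the numbers, not a quote from the pass:
;
;   case-weight sum                    5 + 7 + 11 + 13 + 17 = 53
;   default weight 10, halved          5 per exit branch
;   ctpop dispatch   [[PROF1]]         !{!"branch_weights", i32 58, i32 5}   ; 53 + 5 vs. 5
;   cttz bounds test [[PROF2]]         !{!"branch_weights", i32 53, i32 5}
;
; pr165301.ll, which closes here, is purely a crash regression test:
; switch-range-to-icmp folds a switch whose every case re-enters its own
; block, and removing the matching PHI incomings must not touch freed memory.
; Its branch weights (!1) do not survive because the switch collapses to an
; unconditional branch; only the entry count (PROF0) remains.
; ---------------------------------------------------------------------------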
diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll new file mode 100644 index 0000000000000..c0a85c0293dd8 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +; Make sure we handle full-set ranges correctly. +define void @test_i1() { +; CHECK-LABEL: define void @test_i1() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i1 false, true + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} + +define void @test_i3() { +; CHECK-LABEL: define void @test_i3() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i3 0, 7 + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 
[[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll index 25267dcc6dbcb..48be76c19e48f 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll @@ -410,13 +410,12 @@ define i1 @single_value_with_mask(i32 %x) { ; OPTNOLUT-NEXT: i32 21, label %[[END]] ; OPTNOLUT-NEXT: i32 48, label %[[END]] ; OPTNOLUT-NEXT: i32 16, label %[[END]] +; OPTNOLUT-NEXT: i32 80, label %[[END]] ; OPTNOLUT-NEXT: ] ; OPTNOLUT: [[DEFAULT]]: -; OPTNOLUT-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80 -; OPTNOLUT-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i1 false, i1 true ; OPTNOLUT-NEXT: br label %[[END]] ; OPTNOLUT: [[END]]: -; OPTNOLUT-NEXT: [[RES:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ [[SEL]], %[[DEFAULT]] ] +; OPTNOLUT-NEXT: [[RES:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ true, %[[DEFAULT]] ], [ false, %[[ENTRY]] ] ; OPTNOLUT-NEXT: ret i1 [[RES]] ; ; TTINOLUT-LABEL: define i1 @single_value_with_mask( 
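; ---------------------------------------------------------------------------
; Annotation: the switch-transformations-no-lut.ll hunk above and the
; switch_with_icmp_select_after_it tests added to switch_create.ll below
; exercise the same new fold: when the default block of a switch computes
; "select (icmp eq %x, C), A, B" on the switch condition %x and C is not
; already a case, the select yields A only when %x == C, so SimplifyCFG can
; materialize C as an explicit case and feed the PHI constants directly.
; A hypothetical reduced example (@fold_sketch is not part of the patch):

define i1 @fold_sketch(i32 %x) {
entry:
  switch i32 %x, label %default [
    i32 21, label %end
    i32 48, label %end
  ]

default:
  ; Reaching %end from here with %x == 80 is the only way %sel can be false,
  ; and 80 is not an existing case, so "i32 80, label %end" may be split out,
  ; with the PHI receiving false for it and true for the remaining default.
  %cmp = icmp eq i32 %x, 80
  %sel = select i1 %cmp, i1 false, i1 true
  br label %end

end:
  %res = phi i1 [ false, %entry ], [ false, %entry ], [ %sel, %default ]
  ret i1 %res
}
; ---------------------------------------------------------------------------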
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 
%x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
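+; Note on the profile metadata above: the original default edge (weight 1)
+; targets an unreachable block and is dropped, while case 3 (weight 5)
+; becomes the new default once the switch is rewritten to key on %x, so the
+; weights are remapped from {1, 2, 3, 99, 5} to {5, 2, 3, 99}.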
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create.ll b/llvm/test/Transforms/SimplifyCFG/switch_create.ll index ef5aee68e268e..64016f3a4b97c 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create.ll @@ -1314,6 +1314,136 @@ if.end: ret void } +define i32 @switch_with_icmp_select_after_it(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: i32 80, label [[SWITCH_EDGE:%.*]] +; CHECK-NEXT: ] +; CHECK: switch.edge: +; CHECK-NEXT: br label [[END]] +; CHECK: default: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 3, [[DEFAULT]] ], [ 2, [[SWITCH_EDGE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Create a new switch case BB for case 80. + %sel = select i1 %cmp, i32 2, i32 3 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +define i32 @switch_with_icmp_select_after_it2(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it2( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: i32 80, label [[END]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 3, [[DEFAULT]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Should not create new case BB + %sel = select i1 %cmp, i32 1, i32 3 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +define i32 @switch_with_icmp_select_after_it3(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 80 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 3, i32 1 +; CHECK-NEXT: ret i32 [[SEL]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + ; Should not create new case BB + %sel = select i1 %cmp, i32 3, i32 1 + br label %end +end: + %res = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + ret i32 %res +} + +; TODO: support this case (multi-phis). 
+define i32 @switch_with_icmp_select_after_it_multi_phis(i32 %x) { +; CHECK-LABEL: @switch_with_icmp_select_after_it_multi_phis( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 18, label [[END:%.*]] +; CHECK-NEXT: i32 21, label [[END]] +; CHECK-NEXT: i32 48, label [[END]] +; CHECK-NEXT: i32 16, label [[END]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 2, i32 3 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[ENTRY]] ], [ 0, [[ENTRY]] ], [ 0, [[ENTRY]] ], [ 100, [[DEFAULT]] ] +; CHECK-NEXT: [[RES2:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ 1, [[ENTRY]] ], [ [[SEL]], [[DEFAULT]] ] +; CHECK-NEXT: [[RES:%.*]] = xor i32 [[RES1]], [[RES2]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + switch i32 %x, label %default [ + i32 18, label %end + i32 21, label %end + i32 48, label %end + i32 16, label %end + ] +default: + %cmp = icmp eq i32 %x, 80 + %sel = select i1 %cmp, i32 2, i32 3 + br label %end +end: + %res1 = phi i32 [ 0, %entry ], [ 0, %entry ], [ 0, %entry ], [ 0, %entry ], [ 100, %default ] + %res2 = phi i32 [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ 1, %entry ], [ %sel, %default ] + %res = xor i32 %res1, %res2 + ret i32 %res +} + !0 = !{!"function_entry_count", i32 100} !1 = !{!"branch_weights", i32 6, i32 10} ;. diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. 
+; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087).
+;
+; Note 2: while callbr and its targets remain untouched, everything else is
+; handled as usual, even if it is nested in a callbr region.
+;
+; *FIXME: this will be fixed in the future. Callbr can be handled as follows:
+; Input IR:
+; ```
+; define void @foo_callbr() {
+;   callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...]
+; fallthrough:
+;   br label %exit
+; indirect:
+;   br label %exit
+; ...
+; exit:
+;   ret void
+; }
+; ```
+;
+; Output IR:
+; ```
+; define void @foo_callbr() {
+;   callbr void asm "", "!i"()
+;           to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...]
+; fake.indirect:                                    ; preds = %0
+;   br label %Flow
+; fake.indirect1:                                   ; preds = %0
+;   br label %Flow
+; fake.indirect2:                                   ; preds = %0
+;   br label %Flow
+; ...
+; Flow:                                             ; preds = %fallthrough, %fake.indirect[0-N]
+;   %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ]
+;   br i1 %1, label %indirect, label %Flow1
+; Flow1:                                            ; preds = %Flow, %indirect
+;   %2 = phi i1 [ false, %Flow ], [ true, %fake.indirect1 ], [ false, %indirect ]
+;   br i1 %2, label %indirect1, label %Flow2
+; Flow2:                                            ; preds = %Flow, %indirect1
+;   %3 = phi i1 [ false, %Flow ], [ true, %fake.indirect2 ], [ false, %indirect1 ]
+;   br i1 %3, label %indirect2, label %Flow3
+; ...
+; fallthrough:                                      ; preds = %0
+;   br label %Flow
+; indirect:                                         ; preds = %Flow
+;   br label %Flow1
+; indirect1:                                        ; preds = %Flow1
+;   br label %Flow2
+; indirect2:                                        ; preds = %Flow2
+;   br label %Flow3
+; ...
+; exit:                                             ; preds = %indirectN, %FlowN
+;   ret void
+; }
+; ```
+;
+; Output IR as ASCII-art:
+;            %0
+;  ---------------------
+;  |      |      |     |
+;  v      v      v     v
+;  f     f.i   f.i1  f.i2
+;  |      |      |     |
+;  v      v      v     v
+;  ---------------------
+;          %Flow
+;          |  \
+;          |   %indirect
+;          |  /
+;          %Flow1
+;          |  \
+;          |   %indirect1
+;          |  /
+;          %Flow2
+;          |  \
+;          |   %indirect2
+;          |  /
+;          %exit
+;
+
+; Only callbr, nothing to do.
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll new file mode 100644 index 0000000000000..e79e89c95c14a --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll @@ -0,0 +1,29 @@ +; REQUIRES: aarch64-registered-target +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s + +; CHECK: declare <vscale x 4 x float> @armpl_svmodf_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]] + +; CHECK: declare <vscale x 2 x double> @armpl_svmodf_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincos_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]] + +; CHECK: declare void @armpl_svsincos_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincospi_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS]] + +; CHECK: declare void @armpl_svsincospi_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]] + +; CHECK: declare <4 x float> @armpl_vmodfq_f32(<4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <2 x double> @armpl_vmodfq_f64(<2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @armpl_vsincospiq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @armpl_vsincospiq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + + +; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git 
a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..4c8c829a59f3c 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -10,6 +10,15 @@ define float @sinf(float %x) { ret float %x } -; CHECK: declare void @acosf(...) +; CHECK: declare void @_Unwind_Resume(...) + ; CHECK: declare void @__umodti3(...) +; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] + +; CHECK: declare void @truncl(...) + diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..57cb016bcb7f3 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} + +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %} + +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback 
nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll new file mode 100644 index 0000000000000..ef2481111087f --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll @@ -0,0 +1,28 @@ +; REQUIRES: aarch64-registered-target +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s + +; CHECK: declare <2 x double> @_ZGVnN2vl8_modf(<2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]] + +; CHECK: declare void @_ZGVnN2vl8l8_sincos(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <4 x float> @_ZGVnN4vl4_modff(<4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN4vl4l4_sincosf(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVnN4vl4l4_sincospif(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <vscale x 4 x float> @_ZGVsNxvl4_modff(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl4l4_sincosf(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl4l4_sincospif(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare <vscale x 2 x double> @_ZGVsNxvl8_modf(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl8l8_sincos(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: declare void @_ZGVsNxvl8l8_sincospi(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]] + +; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or declarations +; with the wrong type signature. 
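+; The expected behavior (checked below) is that the pass leaves such
+; mismatched symbols untouched rather than redeclaring them with the
+; libcall signatures.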
+ +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + diff --git a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll index 0be13ee76bece..f024106b7299a 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/branch-on-same-cond.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -S -passes=print-predicateinfo < %s 2>&1 >/dev/null | FileCheck %s ; FIXME: RenamedOp should be %cmp or %x in all cases here, @@ -9,25 +9,25 @@ define i32 @test(i32 %x) { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: RenamedOp: [[CMP]] -; CHECK: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[EXIT1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2:%.*]]], RenamedOp: [[CMP]] } +; CHECK-NEXT: [[CMP_0:%.*]] = bitcast i1 [[CMP]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[EXIT1:%.*]] ; CHECK: bb2: -; CHECK: RenamedOp: [[CMP_0]] -; CHECK: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: RenamedOp: [[X_0]] -; CHECK: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3:%.*]]], RenamedOp: [[CMP_0]] } +; CHECK-NEXT: [[CMP_0_1:%.*]] = bitcast i1 [[CMP_0]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[BB3]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB2]],label [[EXIT2:%.*]]], RenamedOp: [[X_0]] } +; CHECK-NEXT: [[X_0_4:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: br i1 [[CMP_0]], label [[BB3]], label [[EXIT2]] ; CHECK: bb3: -; CHECK: RenamedOp: [[X]] -; CHECK: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK: RenamedOp: [[X_0_1]] -; CHECK: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 -; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3:%.*]], label [[EXIT4:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT3:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1_2:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB3]],label [[EXIT4:%.*]]], 
RenamedOp: [[X_0_1]] } +; CHECK-NEXT: [[X_0_1_3:%.*]] = bitcast i32 [[X_0_1]] to i32 +; CHECK-NEXT: br i1 [[CMP_0_1]], label [[EXIT3]], label [[EXIT4]] ; CHECK: exit1: ; CHECK-NEXT: ret i32 0 ; CHECK: exit2: diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll index 256d0d908ec1e..42e8ccb760b3f 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s @a = external global i32 ; <ptr> [#uses=7] @@ -98,12 +98,17 @@ define void @test3(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH_ZERO]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO]], label [[NOPE]] ; CHECK: both_zero: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -133,10 +138,11 @@ define void @test4(i1 %b, i32 %x) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]] ; CHECK: sw: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 1 Edge: [label [[SW]],label [[CASE1:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT:%.*]] [ ; CHECK-NEXT: i32 0, label [[CASE0:%.*]] -; CHECK-NEXT: i32 1, label [[CASE1:%.*]] +; CHECK-NEXT: i32 1, label [[CASE1]] ; CHECK-NEXT: i32 2, label [[CASE0]] ; CHECK-NEXT: i32 3, label [[CASE3]] ; CHECK-NEXT: i32 4, label [[DEFAULT]] @@ -180,11 +186,15 @@ case3: define i1 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: 
[[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -253,11 +263,15 @@ different: define i1 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sgt i32 [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -280,11 +294,15 @@ different: define i1 @test7_fp(float %x, float %y) { ; CHECK-LABEL: @test7_fp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] -; CHECK: [[X_0:%.*]] = bitcast float [[X]] to float -; CHECK: [[X_1:%.*]] = bitcast float [[X]] to float -; CHECK: [[Y_0:%.*]] = bitcast float [[Y]] to float -; CHECK: [[Y_1:%.*]] = bitcast float [[Y]] to float -; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0:%.*]],label [[SAME:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast float [[X]] to float +; CHECK-NEXT: ; branch 
predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[SAME]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp ogt float [[X]], [[Y]] Edge: [label [[TMP0]],label [[DIFFERENT]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_1:%.*]] = bitcast float [[Y]] to float +; CHECK-NEXT: br i1 [[CMP]], label [[SAME]], label [[DIFFERENT]] ; CHECK: same: ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]] ; CHECK-NEXT: ret i1 [[CMP2]] @@ -353,9 +371,11 @@ different: define i32 @test9(i32 %i, i32 %j) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -376,9 +396,11 @@ ret: define i32 @test10(i32 %j, i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = bitcast i32 [[I]] to i32 -; CHECK: [[J_0:%.*]] = bitcast i32 [[J]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[I]] } +; CHECK-NEXT: [[I_0:%.*]] = bitcast i32 [[I]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[I]], [[J]] Edge: [label [[TMP0]],label [[COND_TRUE]]], RenamedOp: [[J]] } +; CHECK-NEXT: [[J_0:%.*]] = bitcast i32 [[J]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[RET:%.*]] ; CHECK: cond_true: ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] ; CHECK-NEXT: ret i32 [[DIFF]] @@ -403,15 +425,18 @@ define i32 @test11(i32 %x) { ; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar() ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]] -; CHECK: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 -; CHECK: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0:%.*]],label [[NEXT:%.*]]], RenamedOp: [[V0]] } +; CHECK-NEXT: [[V0_0:%.*]] = bitcast i32 [[V0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[V0]], [[V1]] Edge: [label [[TMP0]],label [[COND_TRUE:%.*]]], RenamedOp: [[V1]] } +; CHECK-NEXT: [[V1_0:%.*]] = bitcast i32 [[V1]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[NEXT]] ; CHECK: cond_true: ; CHECK-NEXT: ret i32 [[V1_0]] ; CHECK: next: ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]] -; CHECK: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 -; CHECK-NEXT: 
br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp eq i32 [[X]], [[V0_0]] Edge: [label [[NEXT]],label [[COND_TRUE2:%.*]]], RenamedOp: [[V0_0]] } +; CHECK-NEXT: [[V0_0_1:%.*]] = bitcast i32 [[V0_0]] to i32 +; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2]], label [[NEXT2:%.*]] ; CHECK: cond_true2: ; CHECK-NEXT: ret i32 [[V0_0_1]] ; CHECK: next2: @@ -439,9 +464,11 @@ next2: define i32 @test12(i32 %x) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_1:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0:%.*]],label [[COND_TRUE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[COND_FALSE:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_1:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE]], label [[COND_FALSE]] ; CHECK: cond_true: ; CHECK-NEXT: br label [[RET:%.*]] ; CHECK: cond_false: diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll index ac2c9a1026e76..06c02d699c511 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll @@ -1,16 +1,18 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i1 @f(i32 %x, i1 %y) { ; CHECK-LABEL: @f( ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB2]], label [[BB3]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] @@ -38,12 +40,14 @@ define i1 @g(i32 %x, i1 %y) { ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp sge i32 [[X]], 0 Edge: [label [[BB0]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = 
bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2 -; CHECK: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP2]] = icmp sge i32 [[X2]], 2 Edge: [label [[BB1]],label [[BB2]]], RenamedOp: [[X2]] } +; CHECK-NEXT: [[X2_0:%.*]] = bitcast i32 [[X2]] to i32 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB3]], label [[BB2]] ; CHECK: bb2: ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ] diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll index ef757f323921a..913832696215e 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll @@ -1,16 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s define i32 @f1(i32 %x) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = icmp eq i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -29,12 +30,13 @@ define i32 @f2(i32 %x) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = icmp ne i32 [[X]], 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2]] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -52,14 +54,15 @@ bb2: define i32 @f3(i32 %x) { ; CHECK-LABEL: @f3( ; CHECK-NEXT: bb0: -; CHECK: [[X_0:%.*]] = bitcast i32 [[X:%.*]] to i32 +; CHECK-NEXT: ; switch predicate info { CaseValue: i32 0 Edge: [label [[BB0:%.*]],label [[BB2:%.*]]], RenamedOp: [[X:%.*]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 ; CHECK-NEXT: switch i32 [[X]], label [[BB1:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB2:%.*]] +; CHECK-NEXT: i32 0, label [[BB2]] ; CHECK-NEXT: ] ; CHECK: bb1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ] +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ 0, [[BB1]] ] ; CHECK-NEXT: 
[[FOO:%.*]] = add i32 [[COND]], [[X]] ; CHECK-NEXT: ret i32 [[FOO]] ; @@ -78,13 +81,14 @@ define double @fcmp_oeq_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -105,13 +109,14 @@ define double @fcmp_une_not_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_not_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], 2.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -132,13 +137,14 @@ define double @fcmp_oeq_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_oeq_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], 0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -159,13 +165,14 @@ define double @fcmp_une_zero(double %x, double %y) { ; CHECK-LABEL: @fcmp_une_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00 -; CHECK: [[Y_0:%.*]] = bitcast double [[Y]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], -0.000000e+00 Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast double [[Y]] to double +; 
CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -188,13 +195,14 @@ define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[CMP]] = fcmp oeq double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[IF:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[IF]], label [[RETURN:%.*]] ; CHECK: if: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: @@ -217,13 +225,14 @@ define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]] -; CHECK: [[Z_0:%.*]] = bitcast double [[Z]] to double -; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[CMP]] = fcmp une double [[Y]], [[Z]] Edge: [label [[ENTRY:%.*]],label [[ELSE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast double [[Z]] to double +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE]] ; CHECK: else: ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY]] ] ; CHECK-NEXT: ret double [[RETVAL]] ; entry: diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll index 36eaf6e66578d..4762d376ef5aa 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33456.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. 
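+; For example, 'br i1 %8, label %9, label %9' below has the same target on
+; both edges, so no predicate copy is materialized for its condition.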
@a = global i32 1, align 4 @d = common global i32 0, align 4 @@ -12,22 +12,27 @@ define i32 @main() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP13:%.*]] -; CHECK: [[TMP4:%.*]] = load i32, ptr @a, align 4 +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP5]], 1 ; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] -; CHECK: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9]], label [[TMP9]] -; CHECK: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] +; CHECK: 9: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @b, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[DOT0]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13]], label [[TMP13]] -; CHECK: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] +; CHECK: 13: +; CHECK-NEXT: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP9]] -; CHECK: ret i32 0 +; CHECK: 16: +; CHECK-NEXT: ret i32 0 ; %1 = load i32, ptr @d, align 4 %2 = icmp eq i32 %1, 0 diff --git a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll index bc1d39f371515..e4fd4cc6dd8a2 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/pr33457.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=print-predicateinfo < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments +; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s ; Don't insert predicate info for conditions with a single target. @a = global i32 6, align 4 @c = global i32 -1, align 4 @@ -13,26 +13,32 @@ define i32 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: store i32 6, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP1:%.*]] -; CHECK: [[TMP2:%.*]] = load i32, ptr @d, align 4 +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @d, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i32], ptr @b, i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 (ptr, ...) 
@printf(ptr @.str, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr @a, align 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label %thread-pre-split, label [[TMP9:%.*]] -; CHECK: [[TMP10:%.*]] = load i32, ptr @e, align 4 +; CHECK-NEXT: br i1 [[TMP8]], label [[THREAD_PRE_SPLIT:%.*]], label [[TMP9:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP12]] ; CHECK: thread-pre-split: ; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @e, align 4 ; CHECK-NEXT: br label [[TMP12]] -; CHECK: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], %thread-pre-split ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], [[THREAD_PRE_SPLIT]] ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 ; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP15]] -; CHECK: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; CHECK: br label [[TMP17]] -; CHECK: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] +; CHECK: 15: +; CHECK-NEXT: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] +; CHECK: 16: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[DOT0]], 8693 ; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] @@ -40,7 +46,8 @@ define i32 @main() { ; CHECK-NEXT: store i32 [[TMP21]], ptr @d, align 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[TMP20]], -2 ; CHECK-NEXT: br i1 [[TMP22]], label [[TMP1]], label [[TMP23:%.*]] -; CHECK: ret i32 0 +; CHECK: 23: +; CHECK-NEXT: ret i32 0 ; store i32 6, ptr @e, align 4 br label %1 diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll index cc1dc4e6989a1..d29aadd54128c 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments ; RUN: opt -passes=print-predicateinfo -disable-output < %s 2>&1 | FileCheck %s declare void @foo(i1) @@ -10,12 +10,17 @@ define void @test_or(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label 
[[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -55,12 +60,17 @@ define void @test_or_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 true, i1 [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 true, i1 [[YZ]] Edge: [label [[TMP0:%.*]],label [[NEITHER:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[NEITHER]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER]] ; CHECK: oneof: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -100,12 +110,17 @@ define void @test_and(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], 
RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -145,12 +160,17 @@ define void @test_and_logical(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 [[YZ]], i1 false -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = select i1 [[XZ]], i1 [[YZ]], i1 false Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[XZ_0:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[YZ_0:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[Y_0:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ_0]]) ; CHECK-NEXT: call void @foo(i1 [[YZ_0]]) @@ -190,12 +210,17 @@ define void @testandsame(i32 %x, i32 %y) { ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] -; CHECK: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 -; CHECK: [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 -; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XGT]], [[XLT]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[Z_0:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XGT]] } +; CHECK-NEXT: [[XGT_0:%.*]] = bitcast i1 [[XGT]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGT]] = icmp sgt i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; 
CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X_0]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT]] = icmp slt i32 [[X]], 100 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[XLT]] } +; CHECK-NEXT: [[XLT_0:%.*]] = bitcast i1 [[XLT]] to i1 +; CHECK-NEXT: br i1 [[Z]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XGT_0]]) ; CHECK-NEXT: call void @foo(i1 [[XLT_0]]) @@ -229,17 +254,27 @@ define void @testandassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 -; CHECK: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 -; CHECK: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 -; CHECK: [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1 -; CHECK: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 -; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[Y]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[Y]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[YZ]] = icmp eq i32 [[Y]], 0, RenamedOp: [[YZ]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[YZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[X]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: ; assume predicate info { Comparison: [[XZ]] = icmp eq i32 [[X]], 0, RenamedOp: [[XZ]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[XZ]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = and i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = and i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP5]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP5]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH:%.*]]], RenamedOp: [[XZ]] } +; CHECK-NEXT: [[DOT01:%.*]] = bitcast i1 [[TMP4]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XZ]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[DOT02:%.*]] = bitcast i32 [[TMP3]] to i32 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[YZ]] } +; CHECK-NEXT: [[DOT03:%.*]] = bitcast i1 [[TMP2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[YZ]] = icmp eq i32 [[Y]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[Y]] } +; CHECK-NEXT: [[DOT04:%.*]] = bitcast i32 [[TMP1]] to i32 +; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[DOT01]]) ; CHECK-NEXT: call void @foo(i1 [[DOT03]]) @@ -274,9 +309,11 @@ define void @testorassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[YZ:%.*]] = icmp eq 
i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 -; CHECK: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 -; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; assume predicate info { Comparison: [[Z]] = or i1 [[XZ]], [[YZ]], RenamedOp: [[Z]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[Z]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[Z]] = or i1 [[XZ]], [[YZ]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[TMP1]] } +; CHECK-NEXT: [[DOT0:%.*]] = bitcast i1 [[TMP1]] to i1 +; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @foo(i1 [[XZ]]) ; CHECK-NEXT: call void @foo(i1 [[YZ]]) @@ -307,12 +344,17 @@ define void @test_and_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_and_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = and i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[BOTH:%.*]], label [[NOPE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[BOTH:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = and i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[BOTH]], label [[NOPE]] ; CHECK: both: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -349,12 +391,17 @@ define void @test_or_one_unknown_cond(i32 %x, i1 %c1) { ; CHECK-LABEL: @test_or_one_unknown_cond( ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[A:%.*]] = or i1 [[C1:%.*]], [[C2]] -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[A_1:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 -; CHECK: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 -; CHECK: [[X_0:%.*]] = bitcast i32 [[X]] to i32 -; CHECK-NEXT: br i1 [[A]], label [[NOPE:%.*]], label [[BOTH_INVERTED:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0:%.*]],label [[NOPE:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A]] = or i1 [[C1]], [[C2]] Edge: [label [[TMP0]],label [[BOTH_INVERTED:%.*]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_1:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 
Comparison:i1 [[C1]] Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C1]] } +; CHECK-NEXT: [[C1_0:%.*]] = bitcast i1 [[C1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[C2]] } +; CHECK-NEXT: [[C2_0:%.*]] = bitcast i1 [[C2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[C2]] = icmp eq i32 [[X]], 0 Edge: [label [[TMP0]],label [[BOTH_INVERTED]]], RenamedOp: [[X]] } +; CHECK-NEXT: [[X_0:%.*]] = bitcast i32 [[X]] to i32 +; CHECK-NEXT: br i1 [[A]], label [[NOPE]], label [[BOTH_INVERTED]] ; CHECK: both_inverted: ; CHECK-NEXT: call void @bar(i32 [[X_0]]) ; CHECK-NEXT: call void @foo(i1 [[C1_0]]) @@ -391,13 +438,19 @@ define void @test_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_chain( ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] -; CHECK: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 -; CHECK: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_0:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND2]] = and i1 [[AND1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[AND2_1:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND1]] = and i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[AND1_0:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A_0]]) ; CHECK-NEXT: call void @foo(i1 [[B_0]]) @@ -438,13 +491,19 @@ define void @test_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_or_chain( ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] -; CHECK: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 -; CHECK: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 -; CHECK: [[A_0:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[B_0:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[OR2]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_0:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: 
[[OR2]] = or i1 [[OR1]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[OR2_1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[OR1]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[OR1]] } +; CHECK-NEXT: [[OR1_0:%.*]] = bitcast i1 [[OR1]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[A]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A]] } +; CHECK-NEXT: [[A_0:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[B]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[B]] } +; CHECK-NEXT: [[B_0:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[OR2]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -485,11 +544,15 @@ define void @test_and_or_mixed(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test_and_or_mixed( ; CHECK-NEXT: [[OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[OR]], [[C:%.*]] -; CHECK: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 -; CHECK: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 -; CHECK: [[C_0:%.*]] = bitcast i1 [[C]] to i1 -; CHECK-NEXT: br i1 [[AND]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_0:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[AND]] = and i1 [[OR]], [[C]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[AND]] } +; CHECK-NEXT: [[AND_1:%.*]] = bitcast i1 [[AND]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[OR]] = or i1 [[A]], [[B]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[OR]] } +; CHECK-NEXT: [[OR_0:%.*]] = bitcast i1 [[OR]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison:i1 [[C]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[C]] } +; CHECK-NEXT: [[C_0:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: br i1 [[AND]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) @@ -542,16 +605,25 @@ define void @test_deep_and_chain(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], true ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], true ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], true -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], true Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], 
true Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], true Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -656,16 +728,25 @@ define void @test_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = and i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A14]] = and i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A13]] = and i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A12]] = and i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to 
i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A11]] = and i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A10]] = and i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A9]] = and i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A8]] = and i1 [[A7]], [[A7]] Edge: [label [[TMP0]],label [[IF]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -770,16 +851,25 @@ define void @test_deep_or_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = or i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = or i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = or i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 -; CHECK: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0:%.*]],label [[IF:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_0:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A15]] = or i1 [[A14]], [[A14]] Edge: [label [[TMP0]],label [[ELSE:%.*]]], RenamedOp: [[A15]] } +; CHECK-NEXT: [[A15_1:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A14]] = or i1 [[A13]], [[A13]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A14]] } +; CHECK-NEXT: [[A14_0:%.*]] = bitcast i1 [[A14]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A13]] = or i1 [[A12]], [[A12]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A13]] } +; CHECK-NEXT: [[A13_0:%.*]] = bitcast i1 [[A13]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A12]] = or i1 [[A11]], [[A11]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A12]] } +; CHECK-NEXT: [[A12_0:%.*]] = bitcast i1 [[A12]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A11]] = or i1 [[A10]], [[A10]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[A11_0:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A10]] = or i1 [[A9]], [[A9]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[A10_0:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A9]] = or i1 [[A8]], [[A8]] Edge: [label [[TMP0]],label [[ELSE]]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[A9_0:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; branch predicate info { TrueEdge: 0 Comparison: [[A8]] = or i1 [[A7]], [[A7]] Edge: [label 
[[TMP0]],label [[ELSE]]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[A8_0:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: br i1 [[A15]], label [[IF]], label [[ELSE]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) @@ -873,11 +963,16 @@ define void @test_assume_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[AND2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[C]], RenamedOp: [[C]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[C]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[B]], RenamedOp: [[B]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[B]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison:i1 [[A]], RenamedOp: [[A]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND1]] = and i1 [[A]], [[B]], RenamedOp: [[AND1]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[AND1]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[AND2]] = and i1 [[AND1]], [[C]], RenamedOp: [[AND2]] } +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[AND2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[TMP3]]) ; CHECK-NEXT: call void @foo(i1 [[TMP2]]) ; CHECK-NEXT: call void @foo(i1 [[TMP1]]) @@ -901,7 +996,8 @@ define void @test_assume_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[OR2]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[OR2]] = or i1 [[OR1]], [[C]], RenamedOp: [[OR2]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[OR2]] to i1 ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) ; CHECK-NEXT: call void @foo(i1 [[C]]) @@ -937,14 +1033,22 @@ define void @test_assume_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] ; CHECK-NEXT: call void @llvm.assume(i1 [[A15]]) -; CHECK: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 -; CHECK: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 -; CHECK: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 -; CHECK: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 -; CHECK: [[TMP5:%.*]] = bitcast i1 [[A12]] to i1 -; CHECK: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1 -; CHECK: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1 -; CHECK: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A8]] = and i1 [[A7]], [[A7]], RenamedOp: [[A8]] } +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[A8]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A9]] = and i1 [[A8]], [[A8]], RenamedOp: [[A9]] } +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[A9]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A10]] = and i1 [[A9]], [[A9]], RenamedOp: [[A10]] } +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i1 [[A10]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A11]] = and i1 [[A10]], [[A10]], RenamedOp: [[A11]] } +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i1 [[A11]] to i1 +; CHECK-NEXT: ; assume predicate info { Comparison: [[A12]] = and i1 [[A11]], [[A11]], RenamedOp: [[A12]] } +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast i1 [[A12]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A13]] = and i1 [[A12]], [[A12]], RenamedOp: [[A13]] }
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i1 [[A13]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A14]] = and i1 [[A13]], [[A13]], RenamedOp: [[A14]] }
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i1 [[A14]] to i1
+; CHECK-NEXT: ; assume predicate info { Comparison: [[A15]] = and i1 [[A14]], [[A14]], RenamedOp: [[A15]] }
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i1 [[A15]] to i1
 ; CHECK-NEXT: call void @foo(i1 [[A1]])
 ; CHECK-NEXT: call void @foo(i1 [[A2]])
 ; CHECK-NEXT: call void @foo(i1 [[A3]])
@@ -1001,13 +1105,15 @@ define i32 @test_and_with_phinode(i32 %x) {
 ; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
 ; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
 ; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
-; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
-; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
-; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XGE1]] = icmp uge i32 [[X]], 1 Edge: [label [[ENTRY:%.*]],label [[PHI:%.*]]], RenamedOp: [[X]] }
+; CHECK-NEXT: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK-NEXT: ; branch predicate info { TrueEdge: 1 Comparison: [[XLT2]] = icmp ult i32 [[X]], 2 Edge: [label [[ENTRY]],label [[PHI]]], RenamedOp: [[X]] }
+; CHECK-NEXT: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI]], label [[NOPE:%.*]]
 ; CHECK: nope:
 ; CHECK-NEXT: br label [[PHI]]
 ; CHECK: phi:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY]] ], [ 1, [[NOPE]] ]
 ; CHECK-NEXT: ret i32 [[RES]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
index a0fa79aa7edbe..7fc72077ee5b3 100644
--- a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
+++ b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
@@ -72,5 +72,5 @@ entry:
 !14 = !{!15}
 !15 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 1, type: !10)
 !16 = !DILocation(line: 400, column: 3, scope: !7)
-!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
+!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
 !18 = !DILocation(line: 200, column: 3, scope: !17)
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
index 60700412686ea..e7b11cdf8475e 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll
@@ -346,3 +346,189 @@ entry:
   call void @use.i32(i32 %ext.3)
   ret void
 }
+
+define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
+
+define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.3
+  ret i32 %add2
+}
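
[Editor's note, not part of the patch: the two tests above pin down when the scalarized expansion needs a freeze. With a noundef result and every lane extracted, per-lane poison would already be UB, so the bitcast can read %src directly; once a lane is skipped (or the noundef guarantee is dropped, as in the next test), the source is frozen first so poison in the unused lane cannot leak into the scalar arithmetic. A minimal sketch of the guarded expansion, assuming little-endian lane numbering; all names below are illustrative:]

define i32 @sketch_skip_lane2(<4 x i8> %src) {
  ; freeze first: lane 2 is never read, so its poison must not escape
  %f = freeze <4 x i8> %src
  %w = bitcast <4 x i8> %f to i32
  %b0 = and i32 %w, 255        ; lane 0
  %s1 = lshr i32 %w, 8
  %b1 = and i32 %s1, 255       ; lane 1
  %b3 = lshr i32 %w, 24        ; lane 3: top byte, no mask needed
  %a1 = add i32 %b0, %b1
  %a2 = add i32 %a1, %b3
  ret i32 %a2
}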
+
+define i32 @zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) {
+; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
+
+define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]]
+; CHECK-NEXT: ret i32 [[ADD2]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  br i1 %cond, label %then, label %else
+
+then:
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  br label %exit
+
+else:
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+  br label %exit
+
+exit:
+  %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ]
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %phi
+  ret i32 %add2
+}
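
[Editor's note, not part of the patch: the test above shows the rewrite is not block-local. The shift/mask chain is materialized once, next to the zext, and uses in other blocks, including phi incoming values, are redirected to the matching scalar: in the checks, %phi receives [[TMP4]] (lane 2) from %then and [[TMP2]] (lane 3) from %else. A reduced sketch of the resulting merge, with hypothetical names:]

define i32 @sketch_cross_block(<4 x i8> %src, i1 %c) {
entry:
  %f = freeze <4 x i8> %src
  %w = bitcast <4 x i8> %f to i32
  %s2 = lshr i32 %w, 16
  %byte2 = and i32 %s2, 255    ; lane 2, computed once up front
  %byte3 = lshr i32 %w, 24     ; lane 3
  br i1 %c, label %then, label %else
then:
  br label %exit
else:
  br label %exit
exit:
  ; the phi's former extracts become the entry-block scalars
  %r = phi i32 [ %byte2, %then ], [ %byte3, %else ]
  ret i32 %r
}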
+
+
+declare void @may_throw() willreturn
+
+define noundef i32 @zext_v4i8_throwing_call_between(<4 x i8> %src) {
+; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between(
+; CHECK-SAME: <4 x i8> [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32>
+; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1
+; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2
+; CHECK-NEXT: call void @may_throw()
+; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]]
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]]
+; CHECK-NEXT: ret i32 [[ADD3]]
+;
+entry:
+  %ext = zext nneg <4 x i8> %src to <4 x i32>
+  %ext.0 = extractelement <4 x i32> %ext, i64 0
+  %ext.1 = extractelement <4 x i32> %ext, i64 1
+  %ext.2 = extractelement <4 x i32> %ext, i64 2
+  call void @may_throw()
+  %ext.3 = extractelement <4 x i32> %ext, i64 3
+  %add1 = add i32 %ext.0, %ext.1
+  %add2 = add i32 %add1, %ext.2
+  %add3 = add i32 %add2, %ext.3
+  ret i32 %add3
+}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
new file mode 100644
index 0000000000000..ca3df3310a795
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-bitcast-scalarization.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck %s
+
+define i32 @load_v4i8_bitcast_to_i32(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_to_i32(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: ret i32 [[R_SCALAR]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to i32
+  ret i32 %r
+}
+
+define i64 @load_v2i32_bitcast_to_i64(ptr %x) {
+; CHECK-LABEL: define i64 @load_v2i32_bitcast_to_i64(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load i64, ptr [[X]], align 8
+; CHECK-NEXT: ret i64 [[R_SCALAR]]
+;
+  %lv = load <2 x i32>, ptr %x
+  %r = bitcast <2 x i32> %lv to i64
+  ret i64 %r
+}
+
+define float @load_v4i8_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v4i8_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to float
+  ret float %r
+}
+
+define float @load_v2i16_bitcast_to_float(ptr %x) {
+; CHECK-LABEL: define float @load_v2i16_bitcast_to_float(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[R_SCALAR:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: ret float [[R_SCALAR]]
+;
+  %lv = load <2 x i16>, ptr %x
+  %r = bitcast <2 x i16> %lv to float
+  ret float %r
+}
+
+define double @load_v4i16_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v4i16_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i16>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <4 x i16> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+  %lv = load <4 x i16>, ptr %x
+  %r = bitcast <4 x i16> %lv to double
+  ret double %r
+}
+
+define double @load_v2i32_bitcast_to_double(ptr %x) {
+; CHECK-LABEL: define double @load_v2i32_bitcast_to_double(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <2 x i32>, ptr [[X]], align 8
+; CHECK-NEXT: [[R_SCALAR:%.*]] = bitcast <2 x i32> [[LV]] to double
+; CHECK-NEXT: ret double [[R_SCALAR]]
+;
+  %lv = load <2 x i32>, ptr %x
+  %r = bitcast <2 x i32> %lv to double
+  ret double %r
+}
+
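[Editor's note, not part of the patch: the file above exercises the basic load scalarization: a vector load whose only users are bitcasts to one scalar type is replaced by a scalar load of that type. The two 64-bit floating-point cases are left as-is while the 64-bit integer case is rewritten, presumably a cost or legality decision on this target. A minimal before/after sketch of the fold, under those assumptions; function names are illustrative:]

; before vector-combine
define i32 @sketch_before(ptr %x) {
  %lv = load <4 x i8>, ptr %x
  %r = bitcast <4 x i8> %lv to i32
  ret i32 %r
}

; what the pass is expected to leave behind
define i32 @sketch_after(ptr %x) {
  %r = load i32, ptr %x, align 4
  ret i32 %r
}
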
+; Multiple users with the same bitcast type should be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_same_type(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_same_type(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV_SCALAR:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_SCALAR]], [[LV_SCALAR]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = bitcast <4 x i8> %lv to i32
+  %add = add i32 %r1, %r2
+  ret i32 %add
+}
+
+; Different bitcast types should not be scalarized.
+define i32 @load_v4i8_bitcast_multiple_users_different_types(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_bitcast_multiple_users_different_types(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = bitcast <4 x i8> [[LV]] to float
+; CHECK-NEXT: [[R2_INT:%.*]] = bitcast float [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_INT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = bitcast <4 x i8> %lv to float
+  %r2.int = bitcast float %r2 to i32
+  %add = add i32 %r1, %r2.int
+  ret i32 %add
+}
+
+; Bitcast to vector should not be scalarized.
+define <2 x i16> @load_v4i8_bitcast_to_vector(ptr %x) {
+; CHECK-LABEL: define <2 x i16> @load_v4i8_bitcast_to_vector(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i8> [[LV]] to <2 x i16>
+; CHECK-NEXT: ret <2 x i16> [[R]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r = bitcast <4 x i8> %lv to <2 x i16>
+  ret <2 x i16> %r
+}
+
+; Load with both bitcast users and other users should not be scalarized.
+define i32 @load_v4i8_mixed_users(ptr %x) {
+; CHECK-LABEL: define i32 @load_v4i8_mixed_users(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[LV:%.*]] = load <4 x i8>, ptr [[X]], align 4
+; CHECK-NEXT: [[R1:%.*]] = bitcast <4 x i8> [[LV]] to i32
+; CHECK-NEXT: [[R2:%.*]] = extractelement <4 x i8> [[LV]], i32 0
+; CHECK-NEXT: [[R2_EXT:%.*]] = zext i8 [[R2]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[R1]], [[R2_EXT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+  %lv = load <4 x i8>, ptr %x
+  %r1 = bitcast <4 x i8> %lv to i32
+  %r2 = extractelement <4 x i8> %lv, i32 0
+  %r2.ext = zext i8 %r2 to i32
+  %add = add i32 %r1, %r2.ext
+  ret i32 %add
+}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
new file mode 100644
index 0000000000000..921bcf086f2bf
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
@@ -0,0 +1,11 @@
+; RUN: opt -passes=vector-combine %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 4 x i16> @interleave2_same_const_splat_nxv4i16() {
+;CHECK-LABEL: @interleave2_same_const_splat_nxv4i16(
+;CHECK: call <vscale x 4 x i16> @llvm.vector.interleave2
+;CHECK: ret <vscale x 4 x i16> %retval
+  %retval = call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3))
+  ret <vscale x 4 x i16> %retval
+}
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll
new file mode 100644
index 0000000000000..4b551fad5b43a
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll
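
[Editor's note, not part of the patch: the new AMDGPU test below, generated from amdgpu-promote-alloca output, stresses the pattern its header comment names: a long chain of extractelement/insertelement pairs copying one vector into a slice of a larger one, which VectorCombine is expected to collapse into at most two shufflevectors. A reduced sketch of the target shape, 8 lanes instead of 128, with illustrative names:]

define <8 x i8> @sketch_chain_as_shuffles(<4 x i8> %small, <8 x i8> %big) {
  ; widen %small to the destination length...
  %wide = shufflevector <4 x i8> %small, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  ; ...then blend it over lanes 0..3 of %big with one more shuffle
  %res = shufflevector <8 x i8> %big, <8 x i8> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %res
}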
@@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; 
OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x 
i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 
4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: 
[[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 
52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> 
[[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 
96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], 
i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 = extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = 
extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> %75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 
= extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement <16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = 
extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement <16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = 
insertelement <16 x i8> %262, i8 %174, i64 7
+  %264 = insertelement <16 x i8> %263, i8 %176, i64 8
+  %265 = insertelement <16 x i8> %264, i8 %178, i64 9
+  %266 = insertelement <16 x i8> %265, i8 %180, i64 10
+  %267 = insertelement <16 x i8> %266, i8 %182, i64 11
+  %268 = insertelement <16 x i8> %267, i8 %184, i64 12
+  %269 = insertelement <16 x i8> %268, i8 %186, i64 13
+  %270 = insertelement <16 x i8> %269, i8 %188, i64 14
+  %271 = insertelement <16 x i8> %270, i8 %190, i64 15
+  %sum = add <16 x i8> %271, %add
+  store <16 x i8> %sum, ptr addrspace(3) %out, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="2,2" }
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5358e0419e7a7..88fcf359f7c8e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
   ret <4 x float> %r
 }
+define <2 x float> @ext2_v4f32v2f32(<4 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @ext2_v4f32v2f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+  %e = extractelement <4 x float> %x, i32 3
+  %n = fneg float %e
+  %r = insertelement <2 x float> %y, float %n, i32 1
+  ret <2 x float> %r
+}
+
 ; Eliminating extract/insert is still profitable. Flags propagate.
 
 define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -73,17 +86,11 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
-; SSE-LABEL: @ext1_v2f64v4f64(
-; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
-; SSE-NEXT: ret <4 x double> [[R]]
-;
-; AVX-LABEL: @ext1_v2f64v4f64(
-; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
-; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; AVX-NEXT: ret <4 x double> [[R]]
+; CHECK-LABEL: @ext1_v2f64v4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
@@ -91,6 +98,19 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
   ret <4 x double> %r
 }
 
+define <2 x double> @ext1_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @ext1_v4f64v2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x double> [[R]]
+;
+  %e = extractelement <4 x double> %x, i32 3
+  %n = fneg nsz double %e
+  %r = insertelement <2 x double> %y, double %n, i32 1
+  ret <2 x double> %r
+}
+
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: @ext7_v8f32(
 ; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -105,9 +125,9 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
 ; CHECK-LABEL: @ext7_v4f32v8f32(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT: ret <8 x float> [[R]]
 ;
   %e = extractelement <4 x float> %x, i32 3
@@ -116,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Same as above with an extra use of the extracted element.
 
 define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
@@ -141,12 +174,20 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v4f32v8f32_use1(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: call void @use(float [[E]])
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
-; CHECK-NEXT: ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v4f32v8f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v4f32v8f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <8 x float> [[R]]
 ;
   %e = extractelement <4 x float> %x, i32 3
   call void @use(float %e)
@@ -155,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32_use1(<8 x float> %x, <4 x float> %y) {
+; SSE-LABEL: @ext7_v8f32v4f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: ret <4 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32v4f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  call void @use(float %e)
+  %n = fneg float %e
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Negative test - the transform is likely not profitable if the fneg has another use.
 
 define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
@@ -187,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
   ret <8 x float> %r
 }
 
+define <4 x float> @ext7_v8f32v4f32_use2(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32_use2(
+; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT: call void @use(float [[N]])
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+  %e = extractelement <8 x float> %x, i32 7
+  %n = fneg float %e
+  call void @use(float %n)
+  %r = insertelement <4 x float> %y, float %n, i32 3
+  ret <4 x float> %r
+}
+
 ; Negative test - can't convert variable index to a shuffle.
 
 define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
@@ -215,14 +294,10 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
   ret <4 x double> %r
 }
 
-; Negative test - require same extract/insert index for simple shuffle.
-; TODO: We could handle this by adjusting the cost calculation.
-
 define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_v2f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT: ret <2 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
@@ -231,12 +306,11 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 
-; Negative test - extract from an index greater than the vector width of the destination
 define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext3_v4f64v2f64(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT: ret <2 x double> [[R]]
 ;
   %e = extractelement <4 x double> %x, i32 3
@@ -246,11 +320,17 @@ define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
   ret <2 x double> %r
 }
 define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
-; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
-; CHECK-NEXT: ret <4 x double> [[R]]
+; SSE-LABEL: @ext1_v2f64v4f64_ins0(
+; SSE-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[R]]
+;
+; AVX-LABEL: @ext1_v2f64v4f64_ins0(
+; AVX-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
+; AVX-NEXT: ret <4 x double> [[R]]
 ;
   %e = extractelement <2 x double> %x, i32 1
   %n = fneg nsz double %e
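The updated checks above all reflect one rewrite: VectorCombine turns a scalar extract/fneg/insert chain into a full vector fneg plus shuffles, now also when the source and destination vector widths differ, whenever the target's cost model favors it. A minimal before/after sketch of the shape being tested, mirroring the ext3_v4f64v2f64 checks (the function names here are illustrative, not part of the patch):

; Scalar form: negate one extracted lane and reinsert it into a narrower vector.
define <2 x double> @sketch_scalar(<4 x double> %x, <2 x double> %y) {
  %e = extractelement <4 x double> %x, i32 3
  %n = fneg double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r
}

; Combined form: negate the whole source, pull lane 3 down to width 2 with a
; length-changing shuffle, then blend that lane into %y at position 1.
define <2 x double> @sketch_vector(<4 x double> %x, <2 x double> %y) {
  %neg = fneg <4 x double> %x
  %lane = shufflevector <4 x double> %neg, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
  %r = shufflevector <2 x double> %y, <2 x double> %lane, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %r
}

The SSE/AVX splits in the _use1 tests come from the same cost model pricing the extra shuffles differently per subtarget, which is why only some configurations keep the scalar form.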
extractelement <2 x double> [[X:%.*]], i32 1 +; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]] +; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 +; AVX-NEXT: ret <4 x double> [[R]] ; %e = extractelement <2 x double> %x, i32 1 %n = fneg nsz double %e diff --git a/llvm/test/Verifier/modular-format.ll b/llvm/test/Verifier/modular-format.ll new file mode 100644 index 0000000000000..abdd73d098be1 --- /dev/null +++ b/llvm/test/Verifier/modular-format.ll @@ -0,0 +1,41 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define void @test_too_few_arguments(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod" { + ret void +} +; CHECK: modular-format attribute requires at least 5 arguments +; CHECK-NEXT: ptr @test_too_few_arguments + +define void @test_first_arg_index_not_integer(i32 %arg, ...) "modular-format"="printf,1,foo,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is not an integer +; CHECK-NEXT: ptr @test_first_arg_index_not_integer + +define void @test_first_arg_index_zero(i32 %arg) "modular-format"="printf,1,0,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_zero + +define void @test_first_arg_index_out_of_bounds(i32 %arg) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds + +define void @test_first_arg_index_out_of_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,3,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds_varargs + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds +define void @test_first_arg_index_in_bounds(i32 %arg) "modular-format"="printf,1,1,basic_mod,basic_impl" { + ret void +} + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds_varargs +define void @test_first_arg_index_in_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} diff --git a/llvm/test/Verifier/reloc-none.ll b/llvm/test/Verifier/reloc-none.ll new file mode 100644 index 0000000000000..9c96799a36a36 --- /dev/null +++ b/llvm/test/Verifier/reloc-none.ll @@ -0,0 +1,13 @@ +; RUN: not llvm-as -disable-output 2>&1 %s | FileCheck %s + +; CHECK: llvm.reloc.none argument must be a metadata string +; CHECK-NEXT: call void @llvm.reloc.none(metadata !0) + +define void @test_reloc_none_bad_arg() { + call void @llvm.reloc.none(metadata !0) + ret void +} + +declare void @llvm.reloc.none(metadata) + +!0 = !{} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..94cf8bc358514 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,19 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. 
config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") + # TODO(#166655): Reenable Instrumentation tests + config.excludes.append("Instrumentation") + # profiling doesn't work quite well on GPU, excluding + config.excludes.append("AMDGPU") + + config.available_features.add("profcheck") # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) @@ -224,6 +235,7 @@ def get_asan_rtlib(): "llvm-addr2line", "llvm-bcanalyzer", "llvm-bitcode-strip", + "llvm-cas", "llvm-cgdata", "llvm-config", "llvm-cov", @@ -474,7 +486,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") @@ -579,7 +591,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") @@ -787,10 +799,19 @@ def host_unwind_supports_jit(): if config.expensive_checks: config.available_features.add("expensive_checks") +if config.have_ondisk_cas: + config.available_features.add("ondisk_cas") + if "MemoryWithOrigins" in config.llvm_use_sanitizer: config.available_features.add("use_msan_with_origins") +# Restrict the size of the on-disk CAS for tests. This allows testing in +# constrained environments (e.g. small TMPDIR). It also prevents leaving +# behind large files on file systems that do not support sparse files if a test +# crashes before resizing the file. +config.environment["LLVM_CAS_MAX_MAPPING_SIZE"] = "%d" % (100 * 1024 * 1024) + # Some tools support an environment variable "OBJECT_MODE" on AIX OS, which # controls the kind of objects they will support. 
If there is no "OBJECT_MODE" # environment variable specified, the default behaviour is to support 32-bit diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 973e0ec934a52..c5cb7160a3d40 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -66,6 +66,7 @@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@" config.has_logf128 = @LLVM_HAS_LOGF128@ +config.have_ondisk_cas = @LLVM_ENABLE_ONDISK_CAS@ import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index f515ee651d835..8d50df7050636 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 -; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> +; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new 
file mode 100644 index 0000000000000..88cb03e85204a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..0f8aaa549afa4 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,8 @@ +# REQUIRES: x86-registered-target +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. + +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll new file mode 100644 index 0000000000000..bfd216d1ced49 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -S | FileCheck %s + +; Test whether UTC checks empty lines instead of skipping them. +define i32 @test(i32 %x) { +entry: + br label %block1 + +block1: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %block2, label %exit1 + +block2: + br i1 %cmp, label %block3, label %exit2 + +block3: + br i1 %cmp, label %exit3, label %exit4 + +exit1: + ret i32 0 + +exit2: + ret i32 %x + +exit3: + ret i32 %x + +exit4: + ret i32 %x +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected new file mode 100644 index 0000000000000..c5f822d10181a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7 +; RUN: opt < %s -S | FileCheck %s + +; Test whether UTC checks empty lines instead of skipping them. 
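+; In the autogenerated assertions (see check_empty.ll.expected), each blank
+; line between basic blocks is matched by a CHECK-EMPTY directive followed by
+; a CHECK-NEXT match on the label, rather than by a bare CHECK match that
+; would silently skip over the blank line.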
+define i32 @test(i32 %x) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[BLOCK1:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK1]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK2:.*]], label %[[EXIT1:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK2]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK3:.*]], label %[[EXIT2:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[BLOCK3]]: +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT3:.*]], label %[[EXIT4:.*]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT1]]: +; CHECK-NEXT: ret i32 0 +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT2]]: +; CHECK-NEXT: ret i32 [[X]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT3]]: +; CHECK-NEXT: ret i32 [[X]] +; CHECK-EMPTY: +; CHECK-NEXT: [[EXIT4]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + br label %block1 + +block1: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %block2, label %exit1 + +block2: + br i1 %cmp, label %block3, label %exit2 + +block3: + br i1 %cmp, label %exit3, label %exit4 + +exit1: + ret i32 0 + +exit2: + ret i32 %x + +exit3: + ret i32 %x + +exit4: + ret i32 %x +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected index b1977e7ae2ee2..8cab0bbf304f3 100644 --- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected @@ -12,13 +12,17 @@ define i8 @testi8(i8 %x) { ; CHECK-NEXT: i8 2, label %[[CASE3:.*]] ; CHECK-NEXT: i8 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i8 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i8 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i8 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i8 3 ; switch i8 %x, label %default [ @@ -46,13 +50,17 @@ define i32 @testi32(i32 %x) { ; CHECK-NEXT: i32 2, label %[[CASE3:.*]] ; CHECK-NEXT: i32 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i32 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i32 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i32 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i32 3 ; switch i32 %x, label %default [ @@ -80,13 +88,17 @@ define i128 @testi128(i128 %x) { ; CHECK-NEXT: i128 2, label %[[CASE3:.*]] ; CHECK-NEXT: i128 3, label %[[CASE3]] ; CHECK-NEXT: ] -; CHECK: [[DEFAULT]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[DEFAULT]]: ; CHECK-NEXT: ret i128 0 -; CHECK: [[CASE1]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE1]]: ; CHECK-NEXT: ret i128 1 -; CHECK: [[CASE2]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE2]]: ; CHECK-NEXT: ret i128 2 -; CHECK: [[CASE3]]: +; CHECK-EMPTY: +; CHECK-NEXT: [[CASE3]]: ; CHECK-NEXT: ret i128 3 ; switch i128 %x, label %default [ diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test new file mode 100644 index 0000000000000..670bda27bb369 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test @@ -0,0 +1,3 @@ +## test whether the UTC generates CHECK-EMPTY for blank lines +# RUN: cp -f %S/Inputs/check_empty.ll 
%t.ll && %update_test_checks %t.ll --version 7 +# RUN: diff -u %t.ll %S/Inputs/check_empty.ll.expected diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test new file mode 100644 index 0000000000000..cef40b4bcf3b0 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test @@ -0,0 +1,41 @@ +#RUN: dsymutil --linker=parallel -f -oso-prepend-path=%p/../Inputs/ -y %s -o %t.dwarf +#RUN: llvm-dwarfdump %t.dwarf | FileCheck %s + +# There should be two typedef DIEs named "BarInt" in the resultant .dwarf file. +# The second should refer to the first, which refers to "Foo<int>". +# CHECK: 0x[[FIRST_BARINT_ADDR:[0-9a-f]*]]: DW_TAG_typedef +# CHECK-NEXT: DW_AT_type (0x{{([[:xdigit:]]*)}} "Foo<int>") +# CHECK-NEXT: DW_AT_name ("BarInt") +# CHECK: 0x{{([[:xdigit:]]*)}}: DW_TAG_typedef +# CHECK-NEXT: DW_AT_type (0x[[FIRST_BARINT_ADDR]] "BarInt") +# CHECK-NEXT: DW_AT_name ("BarInt") + +# Source: +# +# template <typename T> struct Foo; +# typedef Foo<int> BarInt; +# template <typename T> +# struct [[clang::preferred_name(BarInt)]] Foo{}; +# int main() { +# BarInt barInt; +# return 0; +# } +# +# Compile with: +# +# $ clang++ -g -O0 -c typedefs-with-same-name.cpp -o typedefs-with-same-name.o +# +# To generate the debug map: +# +# $ clang++ typedefs-with-same-name.o -o typedefs-with-same-name +# $ dsymutil -dump-debug-map typedefs-with-same-name + +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '/typedefs-with-same-name.o' + timestamp: 1762438746 + type: 102 + symbols: + - { sym: _main, objAddr: 0x0, binAddr: 0x100000360, size: 0x14 } +...
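An aside on the FileCheck directive exercised by check_empty.test and the switch_case.ll.expected update above: CHECK-EMPTY: matches exactly one empty line and, like CHECK-NEXT:, must match immediately after the previous match, whereas a bare CHECK: quietly skips over blank lines. A minimal hand-written .ll test of the pattern (not part of this patch; the function is hypothetical):

; RUN: opt < %s -S | FileCheck %s
define i32 @two_blocks(i32 %x) {
; CHECK: entry:
; CHECK-NEXT: br label %exit
; CHECK-EMPTY:
; CHECK-NEXT: exit:
; CHECK-NEXT: ret i32 %x
entry:
  br label %exit

exit:
  ret i32 %x
}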
diff --git a/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o new file mode 100644 index 0000000000000..6cc47c1a783b3 Binary files /dev/null and b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o differ diff --git a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test index d028194f7e83a..fd15ce3e18978 100644 --- a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test +++ b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test @@ -35,14 +35,14 @@ void foo() { Sptrptr ptr1 = 0; } // CHECK: DW_TAG_member // CHECK-NEXT: DW_AT_name{{.*}}"field" -// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef -// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *" -// CHECK-NEXT: DW_AT_name{{.*}}"Sptr" - // CHECK: 0x[[TYPEDEF_PTR_PTR_S:[a-f0-9]*]]: DW_TAG_typedef // CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_PTR_S]]} "Sptr *" // CHECK-NEXT: DW_AT_name{{.*}}"Sptrptr" +// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef +// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *" +// CHECK-NEXT: DW_AT_name{{.*}}"Sptr" + // First we confirm that first compile unit properly references type. // // CHECK: DW_TAG_compile_unit diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/test/tools/dxil-dis/llvm_assume.ll b/llvm/test/tools/dxil-dis/llvm_assume.ll deleted file mode 100644 index f5be66c0d192f..0000000000000 --- a/llvm/test/tools/dxil-dis/llvm_assume.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s - -target triple = "dxil-pc-shadermodel6.7-library" - -define void @test_llvm_assume(i1 %0) { -; CHECK-LABEL: test_llvm_assume -; CHECK-NEXT: tail call void @llvm.assume(i1 %0) -tail call void @llvm.assume(i1 %0) -ret void -} - diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll index e4c454900fd38..0e68cdbe67b63 100644 --- a/llvm/test/tools/llc/new-pm/start-stop.ll +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s --check-prefix=NULL ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ -; NULL: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify) -; OBJ: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function) +; NULL: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify) +; OBJ: 
require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function) diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll new file mode 100644 index 0000000000000..a5769f86648dc --- /dev/null +++ b/llvm/test/tools/llc/save-stats.ll @@ -0,0 +1,16 @@ +; REQUIRES: asserts + +; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir +; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s +; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: not llc --save-stats=invalid -o %t.s %s 2>&1 | FileCheck %s --check-prefix=INVALID_ARG + +; CHECK: { +; CHECK: "asm-printer.EmittedInsts": +; CHECK: } + +; INVALID_ARG: {{.*}}llc{{.*}}: for the --save-stats option: Cannot find option named 'invalid'! +define i32 @func() { + ret i32 0 +} diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline b/llvm/test/tools/llvm-cas/Inputs/oneline new file mode 100644 index 0000000000000..d95f3ad14dee6 --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline @@ -0,0 +1 @@ +content diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline new file mode 100644 index 0000000000000..6b584e8ece562 --- /dev/null +++ b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline @@ -0,0 +1 @@ +content \ No newline at end of file diff --git a/llvm/test/tools/llvm-cas/action-cache.test b/llvm/test/tools/llvm-cas/action-cache.test new file mode 100644 index 0000000000000..fcb212c24e215 --- /dev/null +++ b/llvm/test/tools/llvm-cas/action-cache.test @@ -0,0 +1,14 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline >%t/oneline.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid + +RUN: llvm-cas --cas %t.cas --put-cache-key @%t/oneline.casid @%t/oneline-nonewline.casid +RUN: llvm-cas --cas %t.cas --get-cache-result @%t/oneline.casid > %t/result.casid +RUN: diff %t/oneline-nonewline.casid %t/result.casid + +RUN: not llvm-cas --cas %t.cas --get-cache-result @%t/oneline-nonewline.casid 2>&1 | FileCheck %s +CHECK: result not found diff --git a/llvm/test/tools/llvm-cas/cache.test b/llvm/test/tools/llvm-cas/cache.test new file mode 100644 index 0000000000000..f0ce69190d418 --- /dev/null +++ b/llvm/test/tools/llvm-cas/cache.test @@ -0,0 +1,14 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null > %t/empty.casid +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid + +RUN: llvm-cas --cas %t/cas --put-cache-key @%t/abc.casid @%t/empty.casid +RUN: llvm-cas --cas %t/cas --get-cache-result @%t/abc.casid > %t/empty2.casid +RUN: diff %t/empty.casid %t/empty2.casid + +RUN: not llvm-cas --cas %t/cas --get-cache-result @%t/empty.casid diff --git a/llvm/test/tools/llvm-cas/dump.test b/llvm/test/tools/llvm-cas/dump.test new file mode 100644 index 0000000000000..f23bac6cdf849 --- /dev/null +++ b/llvm/test/tools/llvm-cas/dump.test @@ -0,0 +1,27 @@ +RUN: rm -rf %t +RUN: mkdir %t + +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data - </dev/null + +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data %s + +RUN: llvm-cas --cas %t/cas --dump | FileCheck %s + +// 
Check the dump format. +CHECK: index: +CHECK-NEXT: hash-num-bits= +CHECK-NEXT: root addr= +// it should have at least one index +CHECK-NEXT: - index= + +// two records +CHECK: record +CHECK-NEXT: - addr= +CHECK-NEXT: - addr= + +// both should be small enough to be in the data pool +CHECK: pool: +CHECK-NEXT: - addr= +CHECK-NEXT: - addr= diff --git a/llvm/test/tools/llvm-cas/lit.local.cfg b/llvm/test/tools/llvm-cas/lit.local.cfg new file mode 100644 index 0000000000000..379945b68925d --- /dev/null +++ b/llvm/test/tools/llvm-cas/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.have_ondisk_cas: + config.unsupported = True diff --git a/llvm/test/tools/llvm-cas/make-blob.test b/llvm/test/tools/llvm-cas/make-blob.test new file mode 100644 index 0000000000000..532a3a3351f80 --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-blob.test @@ -0,0 +1,41 @@ +RUN: rm -rf %t %t.cas +RUN: mkdir %t + +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - </dev/null >%t/empty.casid +RUN: sed -e 's,^.,CHECK: ,' <%t/empty.casid >%t/empty.check +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data /dev/null | FileCheck %t/empty.check +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data - >%t/abc.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline >%t/oneline.casid +RUN: llvm-cas --cas %t.cas --make-blob \ +RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +CHECK-EMPTY-NOT: {{.}} + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/abc.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ABC +CHECK-ABC: abc + +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid |\ +RUN: FileCheck %s -check-prefix CHECK-ONELINE +CHECK-ONELINE: content + +# Double-check newlines. +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid \ +RUN: >%t/oneline-nonewline +RUN: diff %S/Inputs/oneline-nonewline %t/oneline-nonewline +RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid \ +RUN: >%t/oneline +RUN: diff %S/Inputs/oneline %t/oneline + +# Validate +RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline-nonewline.casid +RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline.casid diff --git a/llvm/test/tools/llvm-cas/make-node.test b/llvm/test/tools/llvm-cas/make-node.test new file mode 100644 index 0000000000000..de548af8fa2bf --- /dev/null +++ b/llvm/test/tools/llvm-cas/make-node.test @@ -0,0 +1,37 @@ +RUN: rm -rf %t +RUN: mkdir %t + +# Make some empty objects. +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data - </dev/null >%t/empty.casid + +RUN: llvm-cas --cas %t/cas --cat-node-data @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/empty.casid |\ +RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty +CHECK-EMPTY-NOT: {{.}} + +# Make a complex object, which references existing ones. Reference a blob and +# other objects, and reference one of them twice to be sure they don't get +# deduped.
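+# Schematically, complex.refs ends up holding one CASID per line (the IDs here
+# are hypothetical placeholders):
+#   llvmcas://<id-of-empty-node>
+#   llvmcas://<id-of-empty-node>
+#   llvmcas://<id-of-empty-blob>
+# and complex.check, derived from it with sed below, asserts that ls-node-refs
+# prints the same three IDs in the same order.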
+RUN: llvm-cas --cas %t/cas --make-blob --data /dev/null \ +RUN: >%t/empty-blob.casid +RUN: cat %t/empty.casid %t/empty.casid %t/empty-blob.casid \ +RUN: >%t/complex.refs +RUN: cat %t/complex.refs | sed -e 's,^.,CHECK: ,' > %t/complex.check +RUN: llvm-cas --cas %t/cas --make-node \ +RUN: --data %S/Inputs/oneline @%t/complex.refs \ +RUN: >%t/complex.casid +RUN: llvm-cas --cas %t/cas --cat-node-data \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA +RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/complex.casid |\ +RUN: FileCheck %t/complex.check +COMPLEX-DATA: content + +RUN: llvm-cas --cas %t/cas --validate-object @%t/complex.casid + +# Import into a new CAS. +RUN: llvm-cas --cas %t/new-cas --upstream-cas %t/cas --import @%t/complex.casid +RUN: llvm-cas --cas %t/new-cas --cat-node-data \ +RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA +RUN: llvm-cas --cas %t/new-cas --validate diff --git a/llvm/test/tools/llvm-cas/print-id.test b/llvm/test/tools/llvm-cas/print-id.test new file mode 100644 index 0000000000000..5a2efd58dde11 --- /dev/null +++ b/llvm/test/tools/llvm-cas/print-id.test @@ -0,0 +1,13 @@ +RUN: rm -rf %t +RUN: mkdir %t + +RUN: llvm-cas --cas %t/cas --make-blob --data %S/Inputs/oneline > %t/id + +# Confirm that the ID has the right prefix, is well-formed, and that there's +# nothing else on the line. +RUN: FileCheck %s --match-full-lines --strict-whitespace <%t/id +CHECK:llvmcas://{{[a-z0-9]+}} + +# Confirm that there's a newline after. +RUN: wc -l <%t/id | FileCheck %s -check-prefix=NEWLINE +NEWLINE: 1 diff --git a/llvm/test/tools/llvm-cas/validation.test b/llvm/test/tools/llvm-cas/validation.test new file mode 100644 index 0000000000000..13f24f0873463 --- /dev/null +++ b/llvm/test/tools/llvm-cas/validation.test @@ -0,0 +1,31 @@ +RUN: rm -rf %t +RUN: mkdir %t + +# Ingest a blob which just fits inside the CAS data pool to make sure the validation passes. +RUN: truncate -s 7 %t/file +RUN: cat %t/file | \ +RUN: llvm-cas --cas %t/cas --make-blob \ +RUN: --data - +RUN: llvm-cas --cas %t/cas --validate --check-hash + +RUN: llvm-cas --cas %t/cas --validate +RUN: llvm-cas --cas %t/cas --validate --check-hash + +RUN: rm %t/cas/v1.1/data.v1 +RUN: not llvm-cas --cas %t/cas --validate +RUN: not llvm-cas --cas %t/cas --validate --check-hash + +RUN: mkdir %t/ac + +RUN: llvm-cas --cas %t/ac --make-blob \ +RUN: --data /dev/null > %t/empty.casid +RUN: echo "abc" | \ +RUN: llvm-cas --cas %t/ac --make-blob \ +RUN: --data - >%t/abc.casid + +RUN: llvm-cas --cas %t/ac --put-cache-key @%t/abc.casid @%t/empty.casid +RUN: llvm-cas --cas %t/ac --validate +# Note: records are 40 bytes (32 hash bytes + 8 byte value), so trim the last +# allocated record, leaving it invalid.
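+# (With GNU truncate, the relative size "-s -40" shrinks the file by exactly
+# 40 bytes, i.e. one 32 + 8 = 40 byte record.)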
+RUN: truncate -s -40 %t/ac/v1.1/actions.v1 +RUN: not llvm-cas --cas %t/ac --validate diff --git a/llvm/test/tools/llvm-config/paths.test b/llvm/test/tools/llvm-config/paths.test index 419f155ae1f83..61d86f7eb0ba1 100644 --- a/llvm/test/tools/llvm-config/paths.test +++ b/llvm/test/tools/llvm-config/paths.test @@ -4,18 +4,34 @@ RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s CHECK-BINDIR: {{.*}}{{/|\\}}bin CHECK-BINDIR-NOT: error: CHECK-BINDIR-NOT: warning +RUN: llvm-config --bindir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-BINDIR2 %s +CHECK-BINDIR2: {{.*}}{{/|\\\\}}bin +CHECK-BINDIR2-NOT: error: +CHECK-BINDIR2-NOT: warning RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include CHECK-INCLUDEDIR-NOT: error: CHECK-INCLUDEDIR-NOT: warning +RUN: llvm-config --includedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR2 %s +CHECK-INCLUDEDIR2: {{.*}}{{/|\\\\}}include +CHECK-INCLUDEDIR2-NOT: error: +CHECK-INCLUDEDIR2-NOT: warning RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}} CHECK-LIBDIR-NOT: error: CHECK-LIBDIR-NOT: warning +RUN: llvm-config --libdir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR2 %s +CHECK-LIBDIR2: {{.*}}{{/|\\\\}}lib{{.*}} +CHECK-LIBDIR2-NOT: error: +CHECK-LIBDIR2-NOT: warning RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm CHECK-CMAKEDIR-NOT: error: CHECK-CMAKEDIR-NOT: warning +RUN: llvm-config --cmakedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR2 %s +CHECK-CMAKEDIR2: {{.*}}{{/|\\\\}}cmake{{/|\\\\}}llvm +CHECK-CMAKEDIR2-NOT: error: +CHECK-CMAKEDIR2-NOT: warning diff --git a/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s new file mode 100644 index 0000000000000..6c38791b0a083 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/AArch64/DW_AT_APPLE_property.s @@ -0,0 +1,126 @@ +# Checks that we correctly display the DW_AT_APPLE_property_name of a +# referenced DW_TAG_APPLE_property. 
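+# Besides the well-formed property, the DIEs below hand-craft three malformed
+# references so the error paths are exercised: a DW_AT_APPLE_property pointing
+# at a DIE that is not a DW_TAG_APPLE_property, a reference to an invalid DIE,
+# and a property name whose DW_FORM_strp offset lies beyond the end of
+# .debug_str; each is expected to be reported as a decoding error, per the
+# ERRORS checks.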
+# +# RUN: llvm-mc -triple=aarch64--darwin -filetype=obj -o %t.o < %s +# RUN: not llvm-dwarfdump %t.o 2> %t.errs.txt | FileCheck %s +# RUN: FileCheck %s --check-prefix=ERRORS < %t.errs.txt + +# CHECK: 0x[[PROP_REF:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name ("autoSynthProp") +# +# CHECK: 0x[[NO_NAME_PROP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NOT: DW_AT_APPLE_property_name +# +# CHECK: 0x[[INVALID_STRP:[0-9a-f]+]]: DW_TAG_APPLE_property +# CHECK-NEXT: DW_AT_APPLE_property_name +# +# CHECK: DW_TAG_member +# CHECK: DW_AT_APPLE_property (0x[[PROP_REF]] "autoSynthProp") +# CHECK: DW_AT_APPLE_property (0x[[NO_NAME_PROP]] "") +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x{{.*}}) +# CHECK: DW_AT_APPLE_property (0x[[INVALID_STRP]]) + +# ERRORS: error: decoding DW_AT_APPLE_property_name: not referencing a DW_TAG_APPLE_property +# ERRORS: error: decoding DW_AT_APPLE_property_name: invalid DIE +# ERRORS: error: decoding DW_AT_APPLE_property_name: DW_FORM_strp offset 102 is beyond .debug_str bounds + + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ; Abbreviation Code + .byte 17 ; DW_TAG_compile_unit + .byte 1 ; DW_CHILDREN_yes + .byte 114 ; DW_AT_str_offsets_base + .byte 23 ; DW_FORM_sec_offset + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 2 ; Abbreviation Code + .byte 19 ; DW_TAG_structure_type + .byte 1 ; DW_CHILDREN_yes + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 3 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii "\350\177" ; DW_AT_APPLE_property_name + .byte 37 ; DW_FORM_strx1 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 4 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 5 ; Abbreviation Code + .ascii "\200\204\001" ; DW_TAG_APPLE_property + .byte 0 ; DW_CHILDREN_no + .ascii "\350\177" ; DW_AT_APPLE_property_name + .byte 14 ; DW_FORM_strp + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 6 ; Abbreviation Code + .byte 13 ; DW_TAG_member + .byte 0 ; DW_CHILDREN_no + .byte 3 ; DW_AT_name + .byte 37 ; DW_FORM_strx1 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .ascii "\355\177" ; DW_AT_APPLE_property + .byte 19 ; DW_FORM_ref4 + .byte 0 ; EOM(1) + .byte 0 ; EOM(2) + .byte 0 ; EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +Lset0 = Ldebug_info_end0-Ldebug_info_start0 ; Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 5 ; DWARF version number + .byte 1 ; DWARF Unit Type + .byte 8 ; Address Size (in bytes) +Lset1 = Lsection_abbrev-Lsection_abbrev ; Offset Into Abbrev. 
Section + .long Lset1 + .byte 1 ; Abbrev [1] DW_TAG_compile_unit +Lset2 = Lstr_offsets_base0-Lsection_str_off ; DW_AT_str_offsets_base + .long Lset2 + .byte 2 ; Abbrev [2] DW_TAG_structure_type + .byte 2 ; DW_AT_name + .byte 3 ; Abbrev [3] DW_TAG_APPLE_property + .byte 0 ; DW_AT_APPLE_property_name + .byte 4 ; Abbrev [4] DW_TAG_APPLE_property + .byte 5 ; Abbrev [5] DW_TAG_APPLE_property + .long 102 ; DW_AT_APPLE_property_name + .byte 6 ; Abbrev [6] DW_TAG_member + .byte 1 ; DW_AT_name + .long 19 ; DW_AT_APPLE_property + .long 21 ; DW_AT_APPLE_property + .long 17 ; DW_AT_APPLE_property + .long 0 ; DW_AT_APPLE_property + .long 22 ; DW_AT_APPLE_property + .byte 0 ; End Of Children Mark + .byte 0 ; End Of Children Mark +Ldebug_info_end0: + .section __DWARF,__debug_str_offs,regular,debug +Lsection_str_off: + .long 16 ; Length of String Offsets Set + .short 5 + .short 0 +Lstr_offsets_base0: + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "autoSynthProp" ; string offset=0 + .asciz "_var" ; string offset=14 + .asciz "Foo" ; string offset=19 + .section __DWARF,__debug_str_offs,regular,debug + .long 0 + .long 14 + .long 19 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. + +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: 
DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. +# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt index dfbac4ce0c4d3..ec061ff9185f2 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt +++ b/llvm/test/tools/llvm-ir2vec/output/reference_triplets.txt @@ -1,33 +1,33 @@ MAX_RELATION=4 -187 7072 1 -187 6968 2 +187 7052 1 +187 6949 2 187 187 0 -187 7072 1 -187 6969 2 +187 7052 1 +187 6950 2 187 10 0 -10 7072 1 -10 7072 2 -10 7072 3 -10 6961 4 +10 7052 1 +10 7052 2 +10 7052 3 +10 6942 4 10 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 -187 7072 1 -187 6968 2 +187 6933 1 +187 7052 2 +187 1544 0 +1544 6863 1 +1544 6933 2 +187 7052 1 +187 6949 2 187 187 0 -187 7072 1 -187 6969 2 +187 7052 1 +187 6950 2 187 601 0 -601 7072 1 -601 7072 2 -601 7072 3 -601 6961 4 +601 7052 1 +601 7052 2 +601 7052 3 +601 6942 4 601 187 0 -187 6952 1 -187 7072 2 -187 1555 0 -1555 6882 1 -1555 6952 2 +187 6933 1 +187 7052 2 +187 1544 0 +1544 6863 1 +1544 6933 2 diff --git a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt index dc436d123fd35..1b90a8a75a80e 100644 --- a/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt +++ b/llvm/test/tools/llvm-ir2vec/output/reference_x86_entities.txt @@ -1,4 +1,4 @@ -7173 +7152 AAA 0 AAD 1 AADD 2 @@ -1440,5735 +1440,5714 @@ PSUBWrm 1437 PSUBWrr 1438 PSWAPDrm 1439 PSWAPDrr 1440 -PT 1441 -PTCMMIMFP 1442 -PTCMMRLFP 1443 -PTCONJTCMMIMFP 1444 -PTCONJTFP 1445 -PTCVTROWD 1446 -PTCVTROWPS 1447 -PTDPBF 1448 -PTDPBHF 1449 -PTDPBSSD 1450 -PTDPBSSDV 1451 -PTDPBSUD 1452 -PTDPBSUDV 1453 
-PTDPBUSD 1454 -PTDPBUSDV 1455 -PTDPBUUD 1456 -PTDPBUUDV 1457 -PTDPFP 1458 -PTDPHBF 1459 -PTDPHF 1460 -PTESTrm 1461 -PTESTrr 1462 -PTILELOADD 1463 -PTILELOADDRS 1464 -PTILELOADDRST 1465 -PTILELOADDRSV 1466 -PTILELOADDT 1467 -PTILELOADDV 1468 -PTILEMOVROWrre 1469 -PTILEMOVROWrreV 1470 -PTILEMOVROWrri 1471 -PTILEMOVROWrriV 1472 -PTILEPAIRLOAD 1473 -PTILEPAIRSTORE 1474 -PTILESTORED 1475 -PTILESTOREDV 1476 -PTILEZERO 1477 -PTILEZEROV 1478 -PTMMULTF 1479 -PTTCMMIMFP 1480 -PTTCMMRLFP 1481 -PTTDPBF 1482 -PTTDPFP 1483 -PTTMMULTF 1484 -PTTRANSPOSED 1485 -PTTRANSPOSEDV 1486 -PTWRITE 1487 -PTWRITEm 1488 -PTWRITEr 1489 -PUNPCKHBWrm 1490 -PUNPCKHBWrr 1491 -PUNPCKHDQrm 1492 -PUNPCKHDQrr 1493 -PUNPCKHQDQrm 1494 -PUNPCKHQDQrr 1495 -PUNPCKHWDrm 1496 -PUNPCKHWDrr 1497 -PUNPCKLBWrm 1498 -PUNPCKLBWrr 1499 -PUNPCKLDQrm 1500 -PUNPCKLDQrr 1501 -PUNPCKLQDQrm 1502 -PUNPCKLQDQrr 1503 -PUNPCKLWDrm 1504 -PUNPCKLWDrr 1505 -PUSH 1506 -PUSHA 1507 -PUSHCS 1508 -PUSHDS 1509 -PUSHES 1510 -PUSHF 1511 -PUSHFS 1512 -PUSHGS 1513 -PUSHP 1514 -PUSHSS 1515 -PVALIDATE 1516 -PXORrm 1517 -PXORrr 1518 -RCL 1519 -RCPPSm 1520 -RCPPSr 1521 -RCPSSm 1522 -RCPSSm_Int 1523 -RCPSSr 1524 -RCPSSr_Int 1525 -RCR 1526 -RDFLAGS 1527 -RDFSBASE 1528 -RDGSBASE 1529 -RDMSR 1530 -RDMSRLIST 1531 -RDMSRri 1532 -RDMSRri_EVEX 1533 -RDPID 1534 -RDPKRUr 1535 -RDPMC 1536 -RDPRU 1537 -RDRAND 1538 -RDSEED 1539 -RDSSPD 1540 -RDSSPQ 1541 -RDTSC 1542 -RDTSCP 1543 -REG_SEQUENCE 1544 -REPNE_PREFIX 1545 -REP_MOVSB 1546 -REP_MOVSD 1547 -REP_MOVSQ 1548 -REP_MOVSW 1549 -REP_PREFIX 1550 -REP_STOSB 1551 -REP_STOSD 1552 -REP_STOSQ 1553 -REP_STOSW 1554 -RET 1555 -RETI 1556 -REX 1557 -RMPADJUST 1558 -RMPQUERY 1559 -RMPUPDATE 1560 -ROL 1561 -ROR 1562 -RORX 1563 -ROUNDPDmi 1564 -ROUNDPDri 1565 -ROUNDPSmi 1566 -ROUNDPSri 1567 -ROUNDSDmi 1568 -ROUNDSDmi_Int 1569 -ROUNDSDri 1570 -ROUNDSDri_Int 1571 -ROUNDSSmi 1572 -ROUNDSSmi_Int 1573 -ROUNDSSri 1574 -ROUNDSSri_Int 1575 -RSM 1576 -RSQRTPSm 1577 -RSQRTPSr 1578 -RSQRTSSm 1579 -RSQRTSSm_Int 1580 -RSQRTSSr 1581 -RSQRTSSr_Int 1582 -RSTORSSP 1583 -SAHF 1584 -SALC 1585 -SAR 1586 -SARX 1587 -SAVEPREVSSP 1588 -SBB 1589 -SCASB 1590 -SCASL 1591 -SCASQ 1592 -SCASW 1593 -SEAMCALL 1594 -SEAMOPS 1595 -SEAMRET 1596 -SEG_ALLOCA 1597 -SEH_BeginEpilogue 1598 -SEH_EndEpilogue 1599 -SEH_EndPrologue 1600 -SEH_PushFrame 1601 -SEH_PushReg 1602 -SEH_SaveReg 1603 -SEH_SaveXMM 1604 -SEH_SetFrame 1605 -SEH_StackAlign 1606 -SEH_StackAlloc 1607 -SEH_UnwindV 1608 -SEH_UnwindVersion 1609 -SENDUIPI 1610 -SERIALIZE 1611 -SETB_C 1612 -SETCCm 1613 -SETCCm_EVEX 1614 -SETCCr 1615 -SETCCr_EVEX 1616 -SETSSBSY 1617 -SETZUCCm 1618 -SETZUCCr 1619 -SFENCE 1620 -SGDT 1621 -SHA 1622 -SHL 1623 -SHLD 1624 -SHLDROT 1625 -SHLX 1626 -SHR 1627 -SHRD 1628 -SHRDROT 1629 -SHRX 1630 -SHUFPDrmi 1631 -SHUFPDrri 1632 -SHUFPSrmi 1633 -SHUFPSrri 1634 -SIDT 1635 -SKINIT 1636 -SLDT 1637 -SLWPCB 1638 -SMSW 1639 -SQRTPDm 1640 -SQRTPDr 1641 -SQRTPSm 1642 -SQRTPSr 1643 -SQRTSDm 1644 -SQRTSDm_Int 1645 -SQRTSDr 1646 -SQRTSDr_Int 1647 -SQRTSSm 1648 -SQRTSSm_Int 1649 -SQRTSSr 1650 -SQRTSSr_Int 1651 -SQRT_F 1652 -SQRT_Fp 1653 -SS_PREFIX 1654 -STAC 1655 -STACKALLOC_W_PROBING 1656 -STACKMAP 1657 -STATEPOINT 1658 -STC 1659 -STD 1660 -STGI 1661 -STI 1662 -STMXCSR 1663 -STOSB 1664 -STOSL 1665 -STOSQ 1666 -STOSW 1667 -STR 1668 -STRm 1669 -STTILECFG 1670 -STTILECFG_EVEX 1671 -STUI 1672 -ST_F 1673 -ST_FP 1674 -ST_FPrr 1675 -ST_Fp 1676 -ST_FpP 1677 -ST_Frr 1678 -SUB 1679 -SUBPDrm 1680 -SUBPDrr 1681 -SUBPSrm 1682 -SUBPSrr 1683 -SUBREG_TO_REG 1684 -SUBR_F 1685 -SUBR_FI 1686 -SUBR_FPrST 1687 -SUBR_FST 1688 
-SUBR_Fp 1689 -SUBR_FpI 1690 -SUBR_FrST 1691 -SUBSDrm 1692 -SUBSDrm_Int 1693 -SUBSDrr 1694 -SUBSDrr_Int 1695 -SUBSSrm 1696 -SUBSSrm_Int 1697 -SUBSSrr 1698 -SUBSSrr_Int 1699 -SUB_F 1700 -SUB_FI 1701 -SUB_FPrST 1702 -SUB_FST 1703 -SUB_Fp 1704 -SUB_FpI 1705 -SUB_FrST 1706 -SWAPGS 1707 -SYSCALL 1708 -SYSENTER 1709 -SYSEXIT 1710 -SYSRET 1711 -T 1712 -TAILJMPd 1713 -TAILJMPd_CC 1714 -TAILJMPm 1715 -TAILJMPr 1716 -TCMMIMFP 1717 -TCMMRLFP 1718 -TCONJTCMMIMFP 1719 -TCONJTFP 1720 -TCRETURN_HIPE 1721 -TCRETURN_WIN 1722 -TCRETURN_WINmi 1723 -TCRETURNdi 1724 -TCRETURNdicc 1725 -TCRETURNmi 1726 -TCRETURNri 1727 -TCVTROWD 1728 -TCVTROWPS 1729 -TDCALL 1730 -TDPBF 1731 -TDPBHF 1732 -TDPBSSD 1733 -TDPBSUD 1734 -TDPBUSD 1735 -TDPBUUD 1736 -TDPFP 1737 -TDPHBF 1738 -TDPHF 1739 -TEST 1740 -TESTUI 1741 -TILELOADD 1742 -TILELOADDRS 1743 -TILELOADDRST 1744 -TILELOADDRS_EVEX 1745 -TILELOADDT 1746 -TILELOADD_EVEX 1747 -TILEMOVROWrre 1748 -TILEMOVROWrri 1749 -TILERELEASE 1750 -TILESTORED 1751 -TILESTORED_EVEX 1752 -TILEZERO 1753 -TLBSYNC 1754 -TLSCall 1755 -TLS_addr 1756 -TLS_addrX 1757 -TLS_base_addr 1758 -TLS_base_addrX 1759 -TLS_desc 1760 -TMMULTF 1761 -TPAUSE 1762 -TRAP 1763 -TST_F 1764 -TST_Fp 1765 -TTCMMIMFP 1766 -TTCMMRLFP 1767 -TTDPBF 1768 -TTDPFP 1769 -TTMMULTF 1770 -TTRANSPOSED 1771 -TZCNT 1772 -TZMSK 1773 -UBSAN_UD 1774 -UCOMISDrm 1775 -UCOMISDrm_Int 1776 -UCOMISDrr 1777 -UCOMISDrr_Int 1778 -UCOMISSrm 1779 -UCOMISSrm_Int 1780 -UCOMISSrr 1781 -UCOMISSrr_Int 1782 -UCOM_FIPr 1783 -UCOM_FIr 1784 -UCOM_FPPr 1785 -UCOM_FPr 1786 -UCOM_FpIr 1787 -UCOM_Fpr 1788 -UCOM_Fr 1789 -UD 1790 -UIRET 1791 -UMONITOR 1792 -UMWAIT 1793 -UNPCKHPDrm 1794 -UNPCKHPDrr 1795 -UNPCKHPSrm 1796 -UNPCKHPSrr 1797 -UNPCKLPDrm 1798 -UNPCKLPDrr 1799 -UNPCKLPSrm 1800 -UNPCKLPSrr 1801 -URDMSRri 1802 -URDMSRri_EVEX 1803 -URDMSRrr 1804 -URDMSRrr_EVEX 1805 -UWRMSRir 1806 -UWRMSRir_EVEX 1807 -UWRMSRrr 1808 -UWRMSRrr_EVEX 1809 -V 1810 -VAARG 1811 -VAARG_X 1812 -VADDBF 1813 -VADDPDYrm 1814 -VADDPDYrr 1815 -VADDPDZ 1816 -VADDPDZrm 1817 -VADDPDZrmb 1818 -VADDPDZrmbk 1819 -VADDPDZrmbkz 1820 -VADDPDZrmk 1821 -VADDPDZrmkz 1822 -VADDPDZrr 1823 -VADDPDZrrb 1824 -VADDPDZrrbk 1825 -VADDPDZrrbkz 1826 -VADDPDZrrk 1827 -VADDPDZrrkz 1828 -VADDPDrm 1829 -VADDPDrr 1830 -VADDPHZ 1831 -VADDPHZrm 1832 -VADDPHZrmb 1833 -VADDPHZrmbk 1834 -VADDPHZrmbkz 1835 -VADDPHZrmk 1836 -VADDPHZrmkz 1837 -VADDPHZrr 1838 -VADDPHZrrb 1839 -VADDPHZrrbk 1840 -VADDPHZrrbkz 1841 -VADDPHZrrk 1842 -VADDPHZrrkz 1843 -VADDPSYrm 1844 -VADDPSYrr 1845 -VADDPSZ 1846 -VADDPSZrm 1847 -VADDPSZrmb 1848 -VADDPSZrmbk 1849 -VADDPSZrmbkz 1850 -VADDPSZrmk 1851 -VADDPSZrmkz 1852 -VADDPSZrr 1853 -VADDPSZrrb 1854 -VADDPSZrrbk 1855 -VADDPSZrrbkz 1856 -VADDPSZrrk 1857 -VADDPSZrrkz 1858 -VADDPSrm 1859 -VADDPSrr 1860 -VADDSDZrm 1861 -VADDSDZrm_Int 1862 -VADDSDZrmk_Int 1863 -VADDSDZrmkz_Int 1864 -VADDSDZrr 1865 -VADDSDZrr_Int 1866 -VADDSDZrrb_Int 1867 -VADDSDZrrbk_Int 1868 -VADDSDZrrbkz_Int 1869 -VADDSDZrrk_Int 1870 -VADDSDZrrkz_Int 1871 -VADDSDrm 1872 -VADDSDrm_Int 1873 -VADDSDrr 1874 -VADDSDrr_Int 1875 -VADDSHZrm 1876 -VADDSHZrm_Int 1877 -VADDSHZrmk_Int 1878 -VADDSHZrmkz_Int 1879 -VADDSHZrr 1880 -VADDSHZrr_Int 1881 -VADDSHZrrb_Int 1882 -VADDSHZrrbk_Int 1883 -VADDSHZrrbkz_Int 1884 -VADDSHZrrk_Int 1885 -VADDSHZrrkz_Int 1886 -VADDSSZrm 1887 -VADDSSZrm_Int 1888 -VADDSSZrmk_Int 1889 -VADDSSZrmkz_Int 1890 -VADDSSZrr 1891 -VADDSSZrr_Int 1892 -VADDSSZrrb_Int 1893 -VADDSSZrrbk_Int 1894 -VADDSSZrrbkz_Int 1895 -VADDSSZrrk_Int 1896 -VADDSSZrrkz_Int 1897 -VADDSSrm 1898 -VADDSSrm_Int 1899 -VADDSSrr 1900 -VADDSSrr_Int 1901 
-VADDSUBPDYrm 1902 -VADDSUBPDYrr 1903 -VADDSUBPDrm 1904 -VADDSUBPDrr 1905 -VADDSUBPSYrm 1906 -VADDSUBPSYrr 1907 -VADDSUBPSrm 1908 -VADDSUBPSrr 1909 -VAESDECLASTYrm 1910 -VAESDECLASTYrr 1911 -VAESDECLASTZ 1912 -VAESDECLASTZrm 1913 -VAESDECLASTZrr 1914 -VAESDECLASTrm 1915 -VAESDECLASTrr 1916 -VAESDECYrm 1917 -VAESDECYrr 1918 -VAESDECZ 1919 -VAESDECZrm 1920 -VAESDECZrr 1921 -VAESDECrm 1922 -VAESDECrr 1923 -VAESENCLASTYrm 1924 -VAESENCLASTYrr 1925 -VAESENCLASTZ 1926 -VAESENCLASTZrm 1927 -VAESENCLASTZrr 1928 -VAESENCLASTrm 1929 -VAESENCLASTrr 1930 -VAESENCYrm 1931 -VAESENCYrr 1932 -VAESENCZ 1933 -VAESENCZrm 1934 -VAESENCZrr 1935 -VAESENCrm 1936 -VAESENCrr 1937 -VAESIMCrm 1938 -VAESIMCrr 1939 -VAESKEYGENASSISTrmi 1940 -VAESKEYGENASSISTrri 1941 -VALIGNDZ 1942 -VALIGNDZrmbi 1943 -VALIGNDZrmbik 1944 -VALIGNDZrmbikz 1945 -VALIGNDZrmi 1946 -VALIGNDZrmik 1947 -VALIGNDZrmikz 1948 -VALIGNDZrri 1949 -VALIGNDZrrik 1950 -VALIGNDZrrikz 1951 -VALIGNQZ 1952 -VALIGNQZrmbi 1953 -VALIGNQZrmbik 1954 -VALIGNQZrmbikz 1955 -VALIGNQZrmi 1956 -VALIGNQZrmik 1957 -VALIGNQZrmikz 1958 -VALIGNQZrri 1959 -VALIGNQZrrik 1960 -VALIGNQZrrikz 1961 -VANDNPDYrm 1962 -VANDNPDYrr 1963 -VANDNPDZ 1964 -VANDNPDZrm 1965 -VANDNPDZrmb 1966 -VANDNPDZrmbk 1967 -VANDNPDZrmbkz 1968 -VANDNPDZrmk 1969 -VANDNPDZrmkz 1970 -VANDNPDZrr 1971 -VANDNPDZrrk 1972 -VANDNPDZrrkz 1973 -VANDNPDrm 1974 -VANDNPDrr 1975 -VANDNPSYrm 1976 -VANDNPSYrr 1977 -VANDNPSZ 1978 -VANDNPSZrm 1979 -VANDNPSZrmb 1980 -VANDNPSZrmbk 1981 -VANDNPSZrmbkz 1982 -VANDNPSZrmk 1983 -VANDNPSZrmkz 1984 -VANDNPSZrr 1985 -VANDNPSZrrk 1986 -VANDNPSZrrkz 1987 -VANDNPSrm 1988 -VANDNPSrr 1989 -VANDPDYrm 1990 -VANDPDYrr 1991 -VANDPDZ 1992 -VANDPDZrm 1993 -VANDPDZrmb 1994 -VANDPDZrmbk 1995 -VANDPDZrmbkz 1996 -VANDPDZrmk 1997 -VANDPDZrmkz 1998 -VANDPDZrr 1999 -VANDPDZrrk 2000 -VANDPDZrrkz 2001 -VANDPDrm 2002 -VANDPDrr 2003 -VANDPSYrm 2004 -VANDPSYrr 2005 -VANDPSZ 2006 -VANDPSZrm 2007 -VANDPSZrmb 2008 -VANDPSZrmbk 2009 -VANDPSZrmbkz 2010 -VANDPSZrmk 2011 -VANDPSZrmkz 2012 -VANDPSZrr 2013 -VANDPSZrrk 2014 -VANDPSZrrkz 2015 -VANDPSrm 2016 -VANDPSrr 2017 -VASTART_SAVE_XMM_REGS 2018 -VBCSTNEBF 2019 -VBCSTNESH 2020 -VBLENDMPDZ 2021 -VBLENDMPDZrm 2022 -VBLENDMPDZrmb 2023 -VBLENDMPDZrmbk 2024 -VBLENDMPDZrmbkz 2025 -VBLENDMPDZrmk 2026 -VBLENDMPDZrmkz 2027 -VBLENDMPDZrr 2028 -VBLENDMPDZrrk 2029 -VBLENDMPDZrrkz 2030 -VBLENDMPSZ 2031 -VBLENDMPSZrm 2032 -VBLENDMPSZrmb 2033 -VBLENDMPSZrmbk 2034 -VBLENDMPSZrmbkz 2035 -VBLENDMPSZrmk 2036 -VBLENDMPSZrmkz 2037 -VBLENDMPSZrr 2038 -VBLENDMPSZrrk 2039 -VBLENDMPSZrrkz 2040 -VBLENDPDYrmi 2041 -VBLENDPDYrri 2042 -VBLENDPDrmi 2043 -VBLENDPDrri 2044 -VBLENDPSYrmi 2045 -VBLENDPSYrri 2046 -VBLENDPSrmi 2047 -VBLENDPSrri 2048 -VBLENDVPDYrmr 2049 -VBLENDVPDYrrr 2050 -VBLENDVPDrmr 2051 -VBLENDVPDrrr 2052 -VBLENDVPSYrmr 2053 -VBLENDVPSYrrr 2054 -VBLENDVPSrmr 2055 -VBLENDVPSrrr 2056 -VBROADCASTF 2057 -VBROADCASTI 2058 -VBROADCASTSDYrm 2059 -VBROADCASTSDYrr 2060 -VBROADCASTSDZ 2061 -VBROADCASTSDZrm 2062 -VBROADCASTSDZrmk 2063 -VBROADCASTSDZrmkz 2064 -VBROADCASTSDZrr 2065 -VBROADCASTSDZrrk 2066 -VBROADCASTSDZrrkz 2067 -VBROADCASTSSYrm 2068 -VBROADCASTSSYrr 2069 -VBROADCASTSSZ 2070 -VBROADCASTSSZrm 2071 -VBROADCASTSSZrmk 2072 -VBROADCASTSSZrmkz 2073 -VBROADCASTSSZrr 2074 -VBROADCASTSSZrrk 2075 -VBROADCASTSSZrrkz 2076 -VBROADCASTSSrm 2077 -VBROADCASTSSrr 2078 -VCMPBF 2079 -VCMPPDYrmi 2080 -VCMPPDYrri 2081 -VCMPPDZ 2082 -VCMPPDZrmbi 2083 -VCMPPDZrmbik 2084 -VCMPPDZrmi 2085 -VCMPPDZrmik 2086 -VCMPPDZrri 2087 -VCMPPDZrrib 2088 -VCMPPDZrribk 2089 -VCMPPDZrrik 2090 -VCMPPDrmi 2091 
-VCMPPDrri 2092 -VCMPPHZ 2093 -VCMPPHZrmbi 2094 -VCMPPHZrmbik 2095 -VCMPPHZrmi 2096 -VCMPPHZrmik 2097 -VCMPPHZrri 2098 -VCMPPHZrrib 2099 -VCMPPHZrribk 2100 -VCMPPHZrrik 2101 -VCMPPSYrmi 2102 -VCMPPSYrri 2103 -VCMPPSZ 2104 -VCMPPSZrmbi 2105 -VCMPPSZrmbik 2106 -VCMPPSZrmi 2107 -VCMPPSZrmik 2108 -VCMPPSZrri 2109 -VCMPPSZrrib 2110 -VCMPPSZrribk 2111 -VCMPPSZrrik 2112 -VCMPPSrmi 2113 -VCMPPSrri 2114 -VCMPSDZrmi 2115 -VCMPSDZrmi_Int 2116 -VCMPSDZrmik_Int 2117 -VCMPSDZrri 2118 -VCMPSDZrri_Int 2119 -VCMPSDZrrib_Int 2120 -VCMPSDZrribk_Int 2121 -VCMPSDZrrik_Int 2122 -VCMPSDrmi 2123 -VCMPSDrmi_Int 2124 -VCMPSDrri 2125 -VCMPSDrri_Int 2126 -VCMPSHZrmi 2127 -VCMPSHZrmi_Int 2128 -VCMPSHZrmik_Int 2129 -VCMPSHZrri 2130 -VCMPSHZrri_Int 2131 -VCMPSHZrrib_Int 2132 -VCMPSHZrribk_Int 2133 -VCMPSHZrrik_Int 2134 -VCMPSSZrmi 2135 -VCMPSSZrmi_Int 2136 -VCMPSSZrmik_Int 2137 -VCMPSSZrri 2138 -VCMPSSZrri_Int 2139 -VCMPSSZrrib_Int 2140 -VCMPSSZrribk_Int 2141 -VCMPSSZrrik_Int 2142 -VCMPSSrmi 2143 -VCMPSSrmi_Int 2144 -VCMPSSrri 2145 -VCMPSSrri_Int 2146 -VCOMISBF 2147 -VCOMISDZrm 2148 -VCOMISDZrm_Int 2149 -VCOMISDZrr 2150 -VCOMISDZrr_Int 2151 -VCOMISDZrrb 2152 -VCOMISDrm 2153 -VCOMISDrm_Int 2154 -VCOMISDrr 2155 -VCOMISDrr_Int 2156 -VCOMISHZrm 2157 -VCOMISHZrm_Int 2158 -VCOMISHZrr 2159 -VCOMISHZrr_Int 2160 -VCOMISHZrrb 2161 -VCOMISSZrm 2162 -VCOMISSZrm_Int 2163 -VCOMISSZrr 2164 -VCOMISSZrr_Int 2165 -VCOMISSZrrb 2166 -VCOMISSrm 2167 -VCOMISSrm_Int 2168 -VCOMISSrr 2169 -VCOMISSrr_Int 2170 -VCOMPRESSPDZ 2171 -VCOMPRESSPDZmr 2172 -VCOMPRESSPDZmrk 2173 -VCOMPRESSPDZrr 2174 -VCOMPRESSPDZrrk 2175 -VCOMPRESSPDZrrkz 2176 -VCOMPRESSPSZ 2177 -VCOMPRESSPSZmr 2178 -VCOMPRESSPSZmrk 2179 -VCOMPRESSPSZrr 2180 -VCOMPRESSPSZrrk 2181 -VCOMPRESSPSZrrkz 2182 -VCOMXSDZrm_Int 2183 -VCOMXSDZrr_Int 2184 -VCOMXSDZrrb_Int 2185 -VCOMXSHZrm_Int 2186 -VCOMXSHZrr_Int 2187 -VCOMXSHZrrb_Int 2188 -VCOMXSSZrm_Int 2189 -VCOMXSSZrr_Int 2190 -VCOMXSSZrrb_Int 2191 -VCVT 2192 -VCVTBF 2193 -VCVTBIASPH 2194 -VCVTDQ 2195 -VCVTHF 2196 -VCVTNE 2197 -VCVTNEEBF 2198 -VCVTNEEPH 2199 -VCVTNEOBF 2200 -VCVTNEOPH 2201 -VCVTNEPS 2202 -VCVTPD 2203 -VCVTPH 2204 -VCVTPS 2205 -VCVTQQ 2206 -VCVTSD 2207 -VCVTSH 2208 -VCVTSI 2209 -VCVTSS 2210 -VCVTTBF 2211 -VCVTTPD 2212 -VCVTTPH 2213 -VCVTTPS 2214 -VCVTTSD 2215 -VCVTTSH 2216 -VCVTTSS 2217 -VCVTUDQ 2218 -VCVTUQQ 2219 -VCVTUSI 2220 -VCVTUW 2221 -VCVTW 2222 -VDBPSADBWZ 2223 -VDBPSADBWZrmi 2224 -VDBPSADBWZrmik 2225 -VDBPSADBWZrmikz 2226 -VDBPSADBWZrri 2227 -VDBPSADBWZrrik 2228 -VDBPSADBWZrrikz 2229 -VDIVBF 2230 -VDIVPDYrm 2231 -VDIVPDYrr 2232 -VDIVPDZ 2233 -VDIVPDZrm 2234 -VDIVPDZrmb 2235 -VDIVPDZrmbk 2236 -VDIVPDZrmbkz 2237 -VDIVPDZrmk 2238 -VDIVPDZrmkz 2239 -VDIVPDZrr 2240 -VDIVPDZrrb 2241 -VDIVPDZrrbk 2242 -VDIVPDZrrbkz 2243 -VDIVPDZrrk 2244 -VDIVPDZrrkz 2245 -VDIVPDrm 2246 -VDIVPDrr 2247 -VDIVPHZ 2248 -VDIVPHZrm 2249 -VDIVPHZrmb 2250 -VDIVPHZrmbk 2251 -VDIVPHZrmbkz 2252 -VDIVPHZrmk 2253 -VDIVPHZrmkz 2254 -VDIVPHZrr 2255 -VDIVPHZrrb 2256 -VDIVPHZrrbk 2257 -VDIVPHZrrbkz 2258 -VDIVPHZrrk 2259 -VDIVPHZrrkz 2260 -VDIVPSYrm 2261 -VDIVPSYrr 2262 -VDIVPSZ 2263 -VDIVPSZrm 2264 -VDIVPSZrmb 2265 -VDIVPSZrmbk 2266 -VDIVPSZrmbkz 2267 -VDIVPSZrmk 2268 -VDIVPSZrmkz 2269 -VDIVPSZrr 2270 -VDIVPSZrrb 2271 -VDIVPSZrrbk 2272 -VDIVPSZrrbkz 2273 -VDIVPSZrrk 2274 -VDIVPSZrrkz 2275 -VDIVPSrm 2276 -VDIVPSrr 2277 -VDIVSDZrm 2278 -VDIVSDZrm_Int 2279 -VDIVSDZrmk_Int 2280 -VDIVSDZrmkz_Int 2281 -VDIVSDZrr 2282 -VDIVSDZrr_Int 2283 -VDIVSDZrrb_Int 2284 -VDIVSDZrrbk_Int 2285 -VDIVSDZrrbkz_Int 2286 -VDIVSDZrrk_Int 2287 -VDIVSDZrrkz_Int 2288 -VDIVSDrm 2289 
-VDIVSDrm_Int 2290 -VDIVSDrr 2291 -VDIVSDrr_Int 2292 -VDIVSHZrm 2293 -VDIVSHZrm_Int 2294 -VDIVSHZrmk_Int 2295 -VDIVSHZrmkz_Int 2296 -VDIVSHZrr 2297 -VDIVSHZrr_Int 2298 -VDIVSHZrrb_Int 2299 -VDIVSHZrrbk_Int 2300 -VDIVSHZrrbkz_Int 2301 -VDIVSHZrrk_Int 2302 -VDIVSHZrrkz_Int 2303 -VDIVSSZrm 2304 -VDIVSSZrm_Int 2305 -VDIVSSZrmk_Int 2306 -VDIVSSZrmkz_Int 2307 -VDIVSSZrr 2308 -VDIVSSZrr_Int 2309 -VDIVSSZrrb_Int 2310 -VDIVSSZrrbk_Int 2311 -VDIVSSZrrbkz_Int 2312 -VDIVSSZrrk_Int 2313 -VDIVSSZrrkz_Int 2314 -VDIVSSrm 2315 -VDIVSSrm_Int 2316 -VDIVSSrr 2317 -VDIVSSrr_Int 2318 -VDPBF 2319 -VDPPDrmi 2320 -VDPPDrri 2321 -VDPPHPSZ 2322 -VDPPHPSZm 2323 -VDPPHPSZmb 2324 -VDPPHPSZmbk 2325 -VDPPHPSZmbkz 2326 -VDPPHPSZmk 2327 -VDPPHPSZmkz 2328 -VDPPHPSZr 2329 -VDPPHPSZrk 2330 -VDPPHPSZrkz 2331 -VDPPSYrmi 2332 -VDPPSYrri 2333 -VDPPSrmi 2334 -VDPPSrri 2335 -VERRm 2336 -VERRr 2337 -VERWm 2338 -VERWr 2339 -VEXP 2340 -VEXPANDPDZ 2341 -VEXPANDPDZrm 2342 -VEXPANDPDZrmk 2343 -VEXPANDPDZrmkz 2344 -VEXPANDPDZrr 2345 -VEXPANDPDZrrk 2346 -VEXPANDPDZrrkz 2347 -VEXPANDPSZ 2348 -VEXPANDPSZrm 2349 -VEXPANDPSZrmk 2350 -VEXPANDPSZrmkz 2351 -VEXPANDPSZrr 2352 -VEXPANDPSZrrk 2353 -VEXPANDPSZrrkz 2354 -VEXTRACTF 2355 -VEXTRACTI 2356 -VEXTRACTPSZmri 2357 -VEXTRACTPSZrri 2358 -VEXTRACTPSmri 2359 -VEXTRACTPSrri 2360 -VFCMADDCPHZ 2361 -VFCMADDCPHZm 2362 -VFCMADDCPHZmb 2363 -VFCMADDCPHZmbk 2364 -VFCMADDCPHZmbkz 2365 -VFCMADDCPHZmk 2366 -VFCMADDCPHZmkz 2367 -VFCMADDCPHZr 2368 -VFCMADDCPHZrb 2369 -VFCMADDCPHZrbk 2370 -VFCMADDCPHZrbkz 2371 -VFCMADDCPHZrk 2372 -VFCMADDCPHZrkz 2373 -VFCMADDCSHZm 2374 -VFCMADDCSHZmk 2375 -VFCMADDCSHZmkz 2376 -VFCMADDCSHZr 2377 -VFCMADDCSHZrb 2378 -VFCMADDCSHZrbk 2379 -VFCMADDCSHZrbkz 2380 -VFCMADDCSHZrk 2381 -VFCMADDCSHZrkz 2382 -VFCMULCPHZ 2383 -VFCMULCPHZrm 2384 -VFCMULCPHZrmb 2385 -VFCMULCPHZrmbk 2386 -VFCMULCPHZrmbkz 2387 -VFCMULCPHZrmk 2388 -VFCMULCPHZrmkz 2389 -VFCMULCPHZrr 2390 -VFCMULCPHZrrb 2391 -VFCMULCPHZrrbk 2392 -VFCMULCPHZrrbkz 2393 -VFCMULCPHZrrk 2394 -VFCMULCPHZrrkz 2395 -VFCMULCSHZrm 2396 -VFCMULCSHZrmk 2397 -VFCMULCSHZrmkz 2398 -VFCMULCSHZrr 2399 -VFCMULCSHZrrb 2400 -VFCMULCSHZrrbk 2401 -VFCMULCSHZrrbkz 2402 -VFCMULCSHZrrk 2403 -VFCMULCSHZrrkz 2404 -VFIXUPIMMPDZ 2405 -VFIXUPIMMPDZrmbi 2406 -VFIXUPIMMPDZrmbik 2407 -VFIXUPIMMPDZrmbikz 2408 -VFIXUPIMMPDZrmi 2409 -VFIXUPIMMPDZrmik 2410 -VFIXUPIMMPDZrmikz 2411 -VFIXUPIMMPDZrri 2412 -VFIXUPIMMPDZrrib 2413 -VFIXUPIMMPDZrribk 2414 -VFIXUPIMMPDZrribkz 2415 -VFIXUPIMMPDZrrik 2416 -VFIXUPIMMPDZrrikz 2417 -VFIXUPIMMPSZ 2418 -VFIXUPIMMPSZrmbi 2419 -VFIXUPIMMPSZrmbik 2420 -VFIXUPIMMPSZrmbikz 2421 -VFIXUPIMMPSZrmi 2422 -VFIXUPIMMPSZrmik 2423 -VFIXUPIMMPSZrmikz 2424 -VFIXUPIMMPSZrri 2425 -VFIXUPIMMPSZrrib 2426 -VFIXUPIMMPSZrribk 2427 -VFIXUPIMMPSZrribkz 2428 -VFIXUPIMMPSZrrik 2429 -VFIXUPIMMPSZrrikz 2430 -VFIXUPIMMSDZrmi 2431 -VFIXUPIMMSDZrmik 2432 -VFIXUPIMMSDZrmikz 2433 -VFIXUPIMMSDZrri 2434 -VFIXUPIMMSDZrrib 2435 -VFIXUPIMMSDZrribk 2436 -VFIXUPIMMSDZrribkz 2437 -VFIXUPIMMSDZrrik 2438 -VFIXUPIMMSDZrrikz 2439 -VFIXUPIMMSSZrmi 2440 -VFIXUPIMMSSZrmik 2441 -VFIXUPIMMSSZrmikz 2442 -VFIXUPIMMSSZrri 2443 -VFIXUPIMMSSZrrib 2444 -VFIXUPIMMSSZrribk 2445 -VFIXUPIMMSSZrribkz 2446 -VFIXUPIMMSSZrrik 2447 -VFIXUPIMMSSZrrikz 2448 -VFMADD 2449 -VFMADDCPHZ 2450 -VFMADDCPHZm 2451 -VFMADDCPHZmb 2452 -VFMADDCPHZmbk 2453 -VFMADDCPHZmbkz 2454 -VFMADDCPHZmk 2455 -VFMADDCPHZmkz 2456 -VFMADDCPHZr 2457 -VFMADDCPHZrb 2458 -VFMADDCPHZrbk 2459 -VFMADDCPHZrbkz 2460 -VFMADDCPHZrk 2461 -VFMADDCPHZrkz 2462 -VFMADDCSHZm 2463 -VFMADDCSHZmk 2464 -VFMADDCSHZmkz 2465 -VFMADDCSHZr 2466 
-VFMADDCSHZrb 2467 -VFMADDCSHZrbk 2468 -VFMADDCSHZrbkz 2469 -VFMADDCSHZrk 2470 -VFMADDCSHZrkz 2471 -VFMADDPD 2472 -VFMADDPS 2473 -VFMADDSD 2474 -VFMADDSS 2475 -VFMADDSUB 2476 -VFMADDSUBPD 2477 -VFMADDSUBPS 2478 -VFMSUB 2479 -VFMSUBADD 2480 -VFMSUBADDPD 2481 -VFMSUBADDPS 2482 -VFMSUBPD 2483 -VFMSUBPS 2484 -VFMSUBSD 2485 -VFMSUBSS 2486 -VFMULCPHZ 2487 -VFMULCPHZrm 2488 -VFMULCPHZrmb 2489 -VFMULCPHZrmbk 2490 -VFMULCPHZrmbkz 2491 -VFMULCPHZrmk 2492 -VFMULCPHZrmkz 2493 -VFMULCPHZrr 2494 -VFMULCPHZrrb 2495 -VFMULCPHZrrbk 2496 -VFMULCPHZrrbkz 2497 -VFMULCPHZrrk 2498 -VFMULCPHZrrkz 2499 -VFMULCSHZrm 2500 -VFMULCSHZrmk 2501 -VFMULCSHZrmkz 2502 -VFMULCSHZrr 2503 -VFMULCSHZrrb 2504 -VFMULCSHZrrbk 2505 -VFMULCSHZrrbkz 2506 -VFMULCSHZrrk 2507 -VFMULCSHZrrkz 2508 -VFNMADD 2509 -VFNMADDPD 2510 -VFNMADDPS 2511 -VFNMADDSD 2512 -VFNMADDSS 2513 -VFNMSUB 2514 -VFNMSUBPD 2515 -VFNMSUBPS 2516 -VFNMSUBSD 2517 -VFNMSUBSS 2518 -VFPCLASSBF 2519 -VFPCLASSPDZ 2520 -VFPCLASSPDZmbi 2521 -VFPCLASSPDZmbik 2522 -VFPCLASSPDZmi 2523 -VFPCLASSPDZmik 2524 -VFPCLASSPDZri 2525 -VFPCLASSPDZrik 2526 -VFPCLASSPHZ 2527 -VFPCLASSPHZmbi 2528 -VFPCLASSPHZmbik 2529 -VFPCLASSPHZmi 2530 -VFPCLASSPHZmik 2531 -VFPCLASSPHZri 2532 -VFPCLASSPHZrik 2533 -VFPCLASSPSZ 2534 -VFPCLASSPSZmbi 2535 -VFPCLASSPSZmbik 2536 -VFPCLASSPSZmi 2537 -VFPCLASSPSZmik 2538 -VFPCLASSPSZri 2539 -VFPCLASSPSZrik 2540 -VFPCLASSSDZmi 2541 -VFPCLASSSDZmik 2542 -VFPCLASSSDZri 2543 -VFPCLASSSDZrik 2544 -VFPCLASSSHZmi 2545 -VFPCLASSSHZmik 2546 -VFPCLASSSHZri 2547 -VFPCLASSSHZrik 2548 -VFPCLASSSSZmi 2549 -VFPCLASSSSZmik 2550 -VFPCLASSSSZri 2551 -VFPCLASSSSZrik 2552 -VFRCZPDYrm 2553 -VFRCZPDYrr 2554 -VFRCZPDrm 2555 -VFRCZPDrr 2556 -VFRCZPSYrm 2557 -VFRCZPSYrr 2558 -VFRCZPSrm 2559 -VFRCZPSrr 2560 -VFRCZSDrm 2561 -VFRCZSDrr 2562 -VFRCZSSrm 2563 -VFRCZSSrr 2564 -VGATHERDPDYrm 2565 -VGATHERDPDZ 2566 -VGATHERDPDZrm 2567 -VGATHERDPDrm 2568 -VGATHERDPSYrm 2569 -VGATHERDPSZ 2570 -VGATHERDPSZrm 2571 -VGATHERDPSrm 2572 -VGATHERPF 2573 -VGATHERQPDYrm 2574 -VGATHERQPDZ 2575 -VGATHERQPDZrm 2576 -VGATHERQPDrm 2577 -VGATHERQPSYrm 2578 -VGATHERQPSZ 2579 -VGATHERQPSZrm 2580 -VGATHERQPSrm 2581 -VGETEXPBF 2582 -VGETEXPPDZ 2583 -VGETEXPPDZm 2584 -VGETEXPPDZmb 2585 -VGETEXPPDZmbk 2586 -VGETEXPPDZmbkz 2587 -VGETEXPPDZmk 2588 -VGETEXPPDZmkz 2589 -VGETEXPPDZr 2590 -VGETEXPPDZrb 2591 -VGETEXPPDZrbk 2592 -VGETEXPPDZrbkz 2593 -VGETEXPPDZrk 2594 -VGETEXPPDZrkz 2595 -VGETEXPPHZ 2596 -VGETEXPPHZm 2597 -VGETEXPPHZmb 2598 -VGETEXPPHZmbk 2599 -VGETEXPPHZmbkz 2600 -VGETEXPPHZmk 2601 -VGETEXPPHZmkz 2602 -VGETEXPPHZr 2603 -VGETEXPPHZrb 2604 -VGETEXPPHZrbk 2605 -VGETEXPPHZrbkz 2606 -VGETEXPPHZrk 2607 -VGETEXPPHZrkz 2608 -VGETEXPPSZ 2609 -VGETEXPPSZm 2610 -VGETEXPPSZmb 2611 -VGETEXPPSZmbk 2612 -VGETEXPPSZmbkz 2613 -VGETEXPPSZmk 2614 -VGETEXPPSZmkz 2615 -VGETEXPPSZr 2616 -VGETEXPPSZrb 2617 -VGETEXPPSZrbk 2618 -VGETEXPPSZrbkz 2619 -VGETEXPPSZrk 2620 -VGETEXPPSZrkz 2621 -VGETEXPSDZm 2622 -VGETEXPSDZmk 2623 -VGETEXPSDZmkz 2624 -VGETEXPSDZr 2625 -VGETEXPSDZrb 2626 -VGETEXPSDZrbk 2627 -VGETEXPSDZrbkz 2628 -VGETEXPSDZrk 2629 -VGETEXPSDZrkz 2630 -VGETEXPSHZm 2631 -VGETEXPSHZmk 2632 -VGETEXPSHZmkz 2633 -VGETEXPSHZr 2634 -VGETEXPSHZrb 2635 -VGETEXPSHZrbk 2636 -VGETEXPSHZrbkz 2637 -VGETEXPSHZrk 2638 -VGETEXPSHZrkz 2639 -VGETEXPSSZm 2640 -VGETEXPSSZmk 2641 -VGETEXPSSZmkz 2642 -VGETEXPSSZr 2643 -VGETEXPSSZrb 2644 -VGETEXPSSZrbk 2645 -VGETEXPSSZrbkz 2646 -VGETEXPSSZrk 2647 -VGETEXPSSZrkz 2648 -VGETMANTBF 2649 -VGETMANTPDZ 2650 -VGETMANTPDZrmbi 2651 -VGETMANTPDZrmbik 2652 -VGETMANTPDZrmbikz 2653 -VGETMANTPDZrmi 2654 
-VGETMANTPDZrmik 2655 -VGETMANTPDZrmikz 2656 -VGETMANTPDZrri 2657 -VGETMANTPDZrrib 2658 -VGETMANTPDZrribk 2659 -VGETMANTPDZrribkz 2660 -VGETMANTPDZrrik 2661 -VGETMANTPDZrrikz 2662 -VGETMANTPHZ 2663 -VGETMANTPHZrmbi 2664 -VGETMANTPHZrmbik 2665 -VGETMANTPHZrmbikz 2666 -VGETMANTPHZrmi 2667 -VGETMANTPHZrmik 2668 -VGETMANTPHZrmikz 2669 -VGETMANTPHZrri 2670 -VGETMANTPHZrrib 2671 -VGETMANTPHZrribk 2672 -VGETMANTPHZrribkz 2673 -VGETMANTPHZrrik 2674 -VGETMANTPHZrrikz 2675 -VGETMANTPSZ 2676 -VGETMANTPSZrmbi 2677 -VGETMANTPSZrmbik 2678 -VGETMANTPSZrmbikz 2679 -VGETMANTPSZrmi 2680 -VGETMANTPSZrmik 2681 -VGETMANTPSZrmikz 2682 -VGETMANTPSZrri 2683 -VGETMANTPSZrrib 2684 -VGETMANTPSZrribk 2685 -VGETMANTPSZrribkz 2686 -VGETMANTPSZrrik 2687 -VGETMANTPSZrrikz 2688 -VGETMANTSDZrmi 2689 -VGETMANTSDZrmik 2690 -VGETMANTSDZrmikz 2691 -VGETMANTSDZrri 2692 -VGETMANTSDZrrib 2693 -VGETMANTSDZrribk 2694 -VGETMANTSDZrribkz 2695 -VGETMANTSDZrrik 2696 -VGETMANTSDZrrikz 2697 -VGETMANTSHZrmi 2698 -VGETMANTSHZrmik 2699 -VGETMANTSHZrmikz 2700 -VGETMANTSHZrri 2701 -VGETMANTSHZrrib 2702 -VGETMANTSHZrribk 2703 -VGETMANTSHZrribkz 2704 -VGETMANTSHZrrik 2705 -VGETMANTSHZrrikz 2706 -VGETMANTSSZrmi 2707 -VGETMANTSSZrmik 2708 -VGETMANTSSZrmikz 2709 -VGETMANTSSZrri 2710 -VGETMANTSSZrrib 2711 -VGETMANTSSZrribk 2712 -VGETMANTSSZrribkz 2713 -VGETMANTSSZrrik 2714 -VGETMANTSSZrrikz 2715 -VGF 2716 -VHADDPDYrm 2717 -VHADDPDYrr 2718 -VHADDPDrm 2719 -VHADDPDrr 2720 -VHADDPSYrm 2721 -VHADDPSYrr 2722 -VHADDPSrm 2723 -VHADDPSrr 2724 -VHSUBPDYrm 2725 -VHSUBPDYrr 2726 -VHSUBPDrm 2727 -VHSUBPDrr 2728 -VHSUBPSYrm 2729 -VHSUBPSYrr 2730 -VHSUBPSrm 2731 -VHSUBPSrr 2732 -VINSERTF 2733 -VINSERTI 2734 -VINSERTPSZrmi 2735 -VINSERTPSZrri 2736 -VINSERTPSrmi 2737 -VINSERTPSrri 2738 -VLDDQUYrm 2739 -VLDDQUrm 2740 -VLDMXCSR 2741 -VMASKMOVDQU 2742 -VMASKMOVPDYmr 2743 -VMASKMOVPDYrm 2744 -VMASKMOVPDmr 2745 -VMASKMOVPDrm 2746 -VMASKMOVPSYmr 2747 -VMASKMOVPSYrm 2748 -VMASKMOVPSmr 2749 -VMASKMOVPSrm 2750 -VMAXBF 2751 -VMAXCPDYrm 2752 -VMAXCPDYrr 2753 -VMAXCPDZ 2754 -VMAXCPDZrm 2755 -VMAXCPDZrmb 2756 -VMAXCPDZrmbk 2757 -VMAXCPDZrmbkz 2758 -VMAXCPDZrmk 2759 -VMAXCPDZrmkz 2760 -VMAXCPDZrr 2761 -VMAXCPDZrrk 2762 -VMAXCPDZrrkz 2763 -VMAXCPDrm 2764 -VMAXCPDrr 2765 -VMAXCPHZ 2766 -VMAXCPHZrm 2767 -VMAXCPHZrmb 2768 -VMAXCPHZrmbk 2769 -VMAXCPHZrmbkz 2770 -VMAXCPHZrmk 2771 -VMAXCPHZrmkz 2772 -VMAXCPHZrr 2773 -VMAXCPHZrrk 2774 -VMAXCPHZrrkz 2775 -VMAXCPSYrm 2776 -VMAXCPSYrr 2777 -VMAXCPSZ 2778 -VMAXCPSZrm 2779 -VMAXCPSZrmb 2780 -VMAXCPSZrmbk 2781 -VMAXCPSZrmbkz 2782 -VMAXCPSZrmk 2783 -VMAXCPSZrmkz 2784 -VMAXCPSZrr 2785 -VMAXCPSZrrk 2786 -VMAXCPSZrrkz 2787 -VMAXCPSrm 2788 -VMAXCPSrr 2789 -VMAXCSDZrm 2790 -VMAXCSDZrr 2791 -VMAXCSDrm 2792 -VMAXCSDrr 2793 -VMAXCSHZrm 2794 -VMAXCSHZrr 2795 -VMAXCSSZrm 2796 -VMAXCSSZrr 2797 -VMAXCSSrm 2798 -VMAXCSSrr 2799 -VMAXPDYrm 2800 -VMAXPDYrr 2801 -VMAXPDZ 2802 -VMAXPDZrm 2803 -VMAXPDZrmb 2804 -VMAXPDZrmbk 2805 -VMAXPDZrmbkz 2806 -VMAXPDZrmk 2807 -VMAXPDZrmkz 2808 -VMAXPDZrr 2809 -VMAXPDZrrb 2810 -VMAXPDZrrbk 2811 -VMAXPDZrrbkz 2812 -VMAXPDZrrk 2813 -VMAXPDZrrkz 2814 -VMAXPDrm 2815 -VMAXPDrr 2816 -VMAXPHZ 2817 -VMAXPHZrm 2818 -VMAXPHZrmb 2819 -VMAXPHZrmbk 2820 -VMAXPHZrmbkz 2821 -VMAXPHZrmk 2822 -VMAXPHZrmkz 2823 -VMAXPHZrr 2824 -VMAXPHZrrb 2825 -VMAXPHZrrbk 2826 -VMAXPHZrrbkz 2827 -VMAXPHZrrk 2828 -VMAXPHZrrkz 2829 -VMAXPSYrm 2830 -VMAXPSYrr 2831 -VMAXPSZ 2832 -VMAXPSZrm 2833 -VMAXPSZrmb 2834 -VMAXPSZrmbk 2835 -VMAXPSZrmbkz 2836 -VMAXPSZrmk 2837 -VMAXPSZrmkz 2838 -VMAXPSZrr 2839 -VMAXPSZrrb 2840 -VMAXPSZrrbk 2841 -VMAXPSZrrbkz 2842 -VMAXPSZrrk 2843
-VMAXPSZrrkz 2844 -VMAXPSrm 2845 -VMAXPSrr 2846 -VMAXSDZrm 2847 -VMAXSDZrm_Int 2848 -VMAXSDZrmk_Int 2849 -VMAXSDZrmkz_Int 2850 -VMAXSDZrr 2851 -VMAXSDZrr_Int 2852 -VMAXSDZrrb_Int 2853 -VMAXSDZrrbk_Int 2854 -VMAXSDZrrbkz_Int 2855 -VMAXSDZrrk_Int 2856 -VMAXSDZrrkz_Int 2857 -VMAXSDrm 2858 -VMAXSDrm_Int 2859 -VMAXSDrr 2860 -VMAXSDrr_Int 2861 -VMAXSHZrm 2862 -VMAXSHZrm_Int 2863 -VMAXSHZrmk_Int 2864 -VMAXSHZrmkz_Int 2865 -VMAXSHZrr 2866 -VMAXSHZrr_Int 2867 -VMAXSHZrrb_Int 2868 -VMAXSHZrrbk_Int 2869 -VMAXSHZrrbkz_Int 2870 -VMAXSHZrrk_Int 2871 -VMAXSHZrrkz_Int 2872 -VMAXSSZrm 2873 -VMAXSSZrm_Int 2874 -VMAXSSZrmk_Int 2875 -VMAXSSZrmkz_Int 2876 -VMAXSSZrr 2877 -VMAXSSZrr_Int 2878 -VMAXSSZrrb_Int 2879 -VMAXSSZrrbk_Int 2880 -VMAXSSZrrbkz_Int 2881 -VMAXSSZrrk_Int 2882 -VMAXSSZrrkz_Int 2883 -VMAXSSrm 2884 -VMAXSSrm_Int 2885 -VMAXSSrr 2886 -VMAXSSrr_Int 2887 -VMCALL 2888 -VMCLEARm 2889 -VMFUNC 2890 -VMINBF 2891 -VMINCPDYrm 2892 -VMINCPDYrr 2893 -VMINCPDZ 2894 -VMINCPDZrm 2895 -VMINCPDZrmb 2896 -VMINCPDZrmbk 2897 -VMINCPDZrmbkz 2898 -VMINCPDZrmk 2899 -VMINCPDZrmkz 2900 -VMINCPDZrr 2901 -VMINCPDZrrk 2902 -VMINCPDZrrkz 2903 -VMINCPDrm 2904 -VMINCPDrr 2905 -VMINCPHZ 2906 -VMINCPHZrm 2907 -VMINCPHZrmb 2908 -VMINCPHZrmbk 2909 -VMINCPHZrmbkz 2910 -VMINCPHZrmk 2911 -VMINCPHZrmkz 2912 -VMINCPHZrr 2913 -VMINCPHZrrk 2914 -VMINCPHZrrkz 2915 -VMINCPSYrm 2916 -VMINCPSYrr 2917 -VMINCPSZ 2918 -VMINCPSZrm 2919 -VMINCPSZrmb 2920 -VMINCPSZrmbk 2921 -VMINCPSZrmbkz 2922 -VMINCPSZrmk 2923 -VMINCPSZrmkz 2924 -VMINCPSZrr 2925 -VMINCPSZrrk 2926 -VMINCPSZrrkz 2927 -VMINCPSrm 2928 -VMINCPSrr 2929 -VMINCSDZrm 2930 -VMINCSDZrr 2931 -VMINCSDrm 2932 -VMINCSDrr 2933 -VMINCSHZrm 2934 -VMINCSHZrr 2935 -VMINCSSZrm 2936 -VMINCSSZrr 2937 -VMINCSSrm 2938 -VMINCSSrr 2939 -VMINMAXBF 2940 -VMINMAXPDZ 2941 -VMINMAXPDZrmbi 2942 -VMINMAXPDZrmbik 2943 -VMINMAXPDZrmbikz 2944 -VMINMAXPDZrmi 2945 -VMINMAXPDZrmik 2946 -VMINMAXPDZrmikz 2947 -VMINMAXPDZrri 2948 -VMINMAXPDZrrib 2949 -VMINMAXPDZrribk 2950 -VMINMAXPDZrribkz 2951 -VMINMAXPDZrrik 2952 -VMINMAXPDZrrikz 2953 -VMINMAXPHZ 2954 -VMINMAXPHZrmbi 2955 -VMINMAXPHZrmbik 2956 -VMINMAXPHZrmbikz 2957 -VMINMAXPHZrmi 2958 -VMINMAXPHZrmik 2959 -VMINMAXPHZrmikz 2960 -VMINMAXPHZrri 2961 -VMINMAXPHZrrib 2962 -VMINMAXPHZrribk 2963 -VMINMAXPHZrribkz 2964 -VMINMAXPHZrrik 2965 -VMINMAXPHZrrikz 2966 -VMINMAXPSZ 2967 -VMINMAXPSZrmbi 2968 -VMINMAXPSZrmbik 2969 -VMINMAXPSZrmbikz 2970 -VMINMAXPSZrmi 2971 -VMINMAXPSZrmik 2972 -VMINMAXPSZrmikz 2973 -VMINMAXPSZrri 2974 -VMINMAXPSZrrib 2975 -VMINMAXPSZrribk 2976 -VMINMAXPSZrribkz 2977 -VMINMAXPSZrrik 2978 -VMINMAXPSZrrikz 2979 -VMINMAXSDrmi 2980 -VMINMAXSDrmi_Int 2981 -VMINMAXSDrmik_Int 2982 -VMINMAXSDrmikz_Int 2983 -VMINMAXSDrri 2984 -VMINMAXSDrri_Int 2985 -VMINMAXSDrrib_Int 2986 -VMINMAXSDrribk_Int 2987 -VMINMAXSDrribkz_Int 2988 -VMINMAXSDrrik_Int 2989 -VMINMAXSDrrikz_Int 2990 -VMINMAXSHrmi 2991 -VMINMAXSHrmi_Int 2992 -VMINMAXSHrmik_Int 2993 -VMINMAXSHrmikz_Int 2994 -VMINMAXSHrri 2995 -VMINMAXSHrri_Int 2996 -VMINMAXSHrrib_Int 2997 -VMINMAXSHrribk_Int 2998 -VMINMAXSHrribkz_Int 2999 -VMINMAXSHrrik_Int 3000 -VMINMAXSHrrikz_Int 3001 -VMINMAXSSrmi 3002 -VMINMAXSSrmi_Int 3003 -VMINMAXSSrmik_Int 3004 -VMINMAXSSrmikz_Int 3005 -VMINMAXSSrri 3006 -VMINMAXSSrri_Int 3007 -VMINMAXSSrrib_Int 3008 -VMINMAXSSrribk_Int 3009 -VMINMAXSSrribkz_Int 3010 -VMINMAXSSrrik_Int 3011 -VMINMAXSSrrikz_Int 3012 -VMINPDYrm 3013 -VMINPDYrr 3014 -VMINPDZ 3015 -VMINPDZrm 3016 -VMINPDZrmb 3017 -VMINPDZrmbk 3018 -VMINPDZrmbkz 3019 -VMINPDZrmk 3020 -VMINPDZrmkz 3021 -VMINPDZrr 3022 -VMINPDZrrb 3023
-VMINPDZrrbk 3024 -VMINPDZrrbkz 3025 -VMINPDZrrk 3026 -VMINPDZrrkz 3027 -VMINPDrm 3028 -VMINPDrr 3029 -VMINPHZ 3030 -VMINPHZrm 3031 -VMINPHZrmb 3032 -VMINPHZrmbk 3033 -VMINPHZrmbkz 3034 -VMINPHZrmk 3035 -VMINPHZrmkz 3036 -VMINPHZrr 3037 -VMINPHZrrb 3038 -VMINPHZrrbk 3039 -VMINPHZrrbkz 3040 -VMINPHZrrk 3041 -VMINPHZrrkz 3042 -VMINPSYrm 3043 -VMINPSYrr 3044 -VMINPSZ 3045 -VMINPSZrm 3046 -VMINPSZrmb 3047 -VMINPSZrmbk 3048 -VMINPSZrmbkz 3049 -VMINPSZrmk 3050 -VMINPSZrmkz 3051 -VMINPSZrr 3052 -VMINPSZrrb 3053 -VMINPSZrrbk 3054 -VMINPSZrrbkz 3055 -VMINPSZrrk 3056 -VMINPSZrrkz 3057 -VMINPSrm 3058 -VMINPSrr 3059 -VMINSDZrm 3060 -VMINSDZrm_Int 3061 -VMINSDZrmk_Int 3062 -VMINSDZrmkz_Int 3063 -VMINSDZrr 3064 -VMINSDZrr_Int 3065 -VMINSDZrrb_Int 3066 -VMINSDZrrbk_Int 3067 -VMINSDZrrbkz_Int 3068 -VMINSDZrrk_Int 3069 -VMINSDZrrkz_Int 3070 -VMINSDrm 3071 -VMINSDrm_Int 3072 -VMINSDrr 3073 -VMINSDrr_Int 3074 -VMINSHZrm 3075 -VMINSHZrm_Int 3076 -VMINSHZrmk_Int 3077 -VMINSHZrmkz_Int 3078 -VMINSHZrr 3079 -VMINSHZrr_Int 3080 -VMINSHZrrb_Int 3081 -VMINSHZrrbk_Int 3082 -VMINSHZrrbkz_Int 3083 -VMINSHZrrk_Int 3084 -VMINSHZrrkz_Int 3085 -VMINSSZrm 3086 -VMINSSZrm_Int 3087 -VMINSSZrmk_Int 3088 -VMINSSZrmkz_Int 3089 -VMINSSZrr 3090 -VMINSSZrr_Int 3091 -VMINSSZrrb_Int 3092 -VMINSSZrrbk_Int 3093 -VMINSSZrrbkz_Int 3094 -VMINSSZrrk_Int 3095 -VMINSSZrrkz_Int 3096 -VMINSSrm 3097 -VMINSSrm_Int 3098 -VMINSSrr 3099 -VMINSSrr_Int 3100 -VMLAUNCH 3101 -VMLOAD 3102 -VMMCALL 3103 -VMOV 3104 -VMOVAPDYmr 3105 -VMOVAPDYrm 3106 -VMOVAPDYrr 3107 -VMOVAPDYrr_REV 3108 -VMOVAPDZ 3109 -VMOVAPDZmr 3110 -VMOVAPDZmrk 3111 -VMOVAPDZrm 3112 -VMOVAPDZrmk 3113 -VMOVAPDZrmkz 3114 -VMOVAPDZrr 3115 -VMOVAPDZrr_REV 3116 -VMOVAPDZrrk 3117 -VMOVAPDZrrk_REV 3118 -VMOVAPDZrrkz 3119 -VMOVAPDZrrkz_REV 3120 -VMOVAPDmr 3121 -VMOVAPDrm 3122 -VMOVAPDrr 3123 -VMOVAPDrr_REV 3124 -VMOVAPSYmr 3125 -VMOVAPSYrm 3126 -VMOVAPSYrr 3127 -VMOVAPSYrr_REV 3128 -VMOVAPSZ 3129 -VMOVAPSZmr 3130 -VMOVAPSZmrk 3131 -VMOVAPSZrm 3132 -VMOVAPSZrmk 3133 -VMOVAPSZrmkz 3134 -VMOVAPSZrr 3135 -VMOVAPSZrr_REV 3136 -VMOVAPSZrrk 3137 -VMOVAPSZrrk_REV 3138 -VMOVAPSZrrkz 3139 -VMOVAPSZrrkz_REV 3140 -VMOVAPSmr 3141 -VMOVAPSrm 3142 -VMOVAPSrr 3143 -VMOVAPSrr_REV 3144 -VMOVDDUPYrm 3145 -VMOVDDUPYrr 3146 -VMOVDDUPZ 3147 -VMOVDDUPZrm 3148 -VMOVDDUPZrmk 3149 -VMOVDDUPZrmkz 3150 -VMOVDDUPZrr 3151 -VMOVDDUPZrrk 3152 -VMOVDDUPZrrkz 3153 -VMOVDDUPrm 3154 -VMOVDDUPrr 3155 -VMOVDI 3156 -VMOVDQA 3157 -VMOVDQAYmr 3158 -VMOVDQAYrm 3159 -VMOVDQAYrr 3160 -VMOVDQAYrr_REV 3161 -VMOVDQAmr 3162 -VMOVDQArm 3163 -VMOVDQArr 3164 -VMOVDQArr_REV 3165 -VMOVDQU 3166 -VMOVDQUYmr 3167 -VMOVDQUYrm 3168 -VMOVDQUYrr 3169 -VMOVDQUYrr_REV 3170 -VMOVDQUmr 3171 -VMOVDQUrm 3172 -VMOVDQUrr 3173 -VMOVDQUrr_REV 3174 -VMOVHLPSZrr 3175 -VMOVHLPSrr 3176 -VMOVHPDZ 3177 -VMOVHPDmr 3178 -VMOVHPDrm 3179 -VMOVHPSZ 3180 -VMOVHPSmr 3181 -VMOVHPSrm 3182 -VMOVLHPSZrr 3183 -VMOVLHPSrr 3184 -VMOVLPDZ 3185 -VMOVLPDmr 3186 -VMOVLPDrm 3187 -VMOVLPSZ 3188 -VMOVLPSmr 3189 -VMOVLPSrm 3190 -VMOVMSKPDYrr 3191 -VMOVMSKPDrr 3192 -VMOVMSKPSYrr 3193 -VMOVMSKPSrr 3194 -VMOVNTDQAYrm 3195 -VMOVNTDQAZ 3196 -VMOVNTDQAZrm 3197 -VMOVNTDQArm 3198 -VMOVNTDQYmr 3199 -VMOVNTDQZ 3200 -VMOVNTDQZmr 3201 -VMOVNTDQmr 3202 -VMOVNTPDYmr 3203 -VMOVNTPDZ 3204 -VMOVNTPDZmr 3205 -VMOVNTPDmr 3206 -VMOVNTPSYmr 3207 -VMOVNTPSZ 3208 -VMOVNTPSZmr 3209 -VMOVNTPSmr 3210 -VMOVPDI 3211 -VMOVPQI 3212 -VMOVPQIto 3213 -VMOVQI 3214 -VMOVRSBZ 3215 -VMOVRSBZm 3216 -VMOVRSBZmk 3217 -VMOVRSBZmkz 3218 -VMOVRSDZ 3219 -VMOVRSDZm 3220 -VMOVRSDZmk 3221 -VMOVRSDZmkz 3222 -VMOVRSQZ 3223 -VMOVRSQZm 3224 
-VMOVRSQZmk 3225 -VMOVRSQZmkz 3226 -VMOVRSWZ 3227 -VMOVRSWZm 3228 -VMOVRSWZmk 3229 -VMOVRSWZmkz 3230 -VMOVSDZmr 3231 -VMOVSDZmrk 3232 -VMOVSDZrm 3233 -VMOVSDZrm_alt 3234 -VMOVSDZrmk 3235 -VMOVSDZrmkz 3236 -VMOVSDZrr 3237 -VMOVSDZrr_REV 3238 -VMOVSDZrrk 3239 -VMOVSDZrrk_REV 3240 -VMOVSDZrrkz 3241 -VMOVSDZrrkz_REV 3242 -VMOVSDmr 3243 -VMOVSDrm 3244 -VMOVSDrm_alt 3245 -VMOVSDrr 3246 -VMOVSDrr_REV 3247 -VMOVSDto 3248 -VMOVSH 3249 -VMOVSHDUPYrm 3250 -VMOVSHDUPYrr 3251 -VMOVSHDUPZ 3252 -VMOVSHDUPZrm 3253 -VMOVSHDUPZrmk 3254 -VMOVSHDUPZrmkz 3255 -VMOVSHDUPZrr 3256 -VMOVSHDUPZrrk 3257 -VMOVSHDUPZrrkz 3258 -VMOVSHDUPrm 3259 -VMOVSHDUPrr 3260 -VMOVSHZmr 3261 -VMOVSHZmrk 3262 -VMOVSHZrm 3263 -VMOVSHZrm_alt 3264 -VMOVSHZrmk 3265 -VMOVSHZrmkz 3266 -VMOVSHZrr 3267 -VMOVSHZrr_REV 3268 -VMOVSHZrrk 3269 -VMOVSHZrrk_REV 3270 -VMOVSHZrrkz 3271 -VMOVSHZrrkz_REV 3272 -VMOVSHtoW 3273 -VMOVSLDUPYrm 3274 -VMOVSLDUPYrr 3275 -VMOVSLDUPZ 3276 -VMOVSLDUPZrm 3277 -VMOVSLDUPZrmk 3278 -VMOVSLDUPZrmkz 3279 -VMOVSLDUPZrr 3280 -VMOVSLDUPZrrk 3281 -VMOVSLDUPZrrkz 3282 -VMOVSLDUPrm 3283 -VMOVSLDUPrr 3284 -VMOVSS 3285 -VMOVSSZmr 3286 -VMOVSSZmrk 3287 -VMOVSSZrm 3288 -VMOVSSZrm_alt 3289 -VMOVSSZrmk 3290 -VMOVSSZrmkz 3291 -VMOVSSZrr 3292 -VMOVSSZrr_REV 3293 -VMOVSSZrrk 3294 -VMOVSSZrrk_REV 3295 -VMOVSSZrrkz 3296 -VMOVSSZrrkz_REV 3297 -VMOVSSmr 3298 -VMOVSSrm 3299 -VMOVSSrm_alt 3300 -VMOVSSrr 3301 -VMOVSSrr_REV 3302 -VMOVUPDYmr 3303 -VMOVUPDYrm 3304 -VMOVUPDYrr 3305 -VMOVUPDYrr_REV 3306 -VMOVUPDZ 3307 -VMOVUPDZmr 3308 -VMOVUPDZmrk 3309 -VMOVUPDZrm 3310 -VMOVUPDZrmk 3311 -VMOVUPDZrmkz 3312 -VMOVUPDZrr 3313 -VMOVUPDZrr_REV 3314 -VMOVUPDZrrk 3315 -VMOVUPDZrrk_REV 3316 -VMOVUPDZrrkz 3317 -VMOVUPDZrrkz_REV 3318 -VMOVUPDmr 3319 -VMOVUPDrm 3320 -VMOVUPDrr 3321 -VMOVUPDrr_REV 3322 -VMOVUPSYmr 3323 -VMOVUPSYrm 3324 -VMOVUPSYrr 3325 -VMOVUPSYrr_REV 3326 -VMOVUPSZ 3327 -VMOVUPSZmr 3328 -VMOVUPSZmrk 3329 -VMOVUPSZrm 3330 -VMOVUPSZrmk 3331 -VMOVUPSZrmkz 3332 -VMOVUPSZrr 3333 -VMOVUPSZrr_REV 3334 -VMOVUPSZrrk 3335 -VMOVUPSZrrk_REV 3336 -VMOVUPSZrrkz 3337 -VMOVUPSZrrkz_REV 3338 -VMOVUPSmr 3339 -VMOVUPSrm 3340 -VMOVUPSrr 3341 -VMOVUPSrr_REV 3342 -VMOVW 3343 -VMOVWmr 3344 -VMOVWrm 3345 -VMOVZPDILo 3346 -VMOVZPQILo 3347 -VMOVZPWILo 3348 -VMPSADBWYrmi 3349 -VMPSADBWYrri 3350 -VMPSADBWZ 3351 -VMPSADBWZrmi 3352 -VMPSADBWZrmik 3353 -VMPSADBWZrmikz 3354 -VMPSADBWZrri 3355 -VMPSADBWZrrik 3356 -VMPSADBWZrrikz 3357 -VMPSADBWrmi 3358 -VMPSADBWrri 3359 -VMPTRLDm 3360 -VMPTRSTm 3361 -VMREAD 3362 -VMRESUME 3363 -VMRUN 3364 -VMSAVE 3365 -VMULBF 3366 -VMULPDYrm 3367 -VMULPDYrr 3368 -VMULPDZ 3369 -VMULPDZrm 3370 -VMULPDZrmb 3371 -VMULPDZrmbk 3372 -VMULPDZrmbkz 3373 -VMULPDZrmk 3374 -VMULPDZrmkz 3375 -VMULPDZrr 3376 -VMULPDZrrb 3377 -VMULPDZrrbk 3378 -VMULPDZrrbkz 3379 -VMULPDZrrk 3380 -VMULPDZrrkz 3381 -VMULPDrm 3382 -VMULPDrr 3383 -VMULPHZ 3384 -VMULPHZrm 3385 -VMULPHZrmb 3386 -VMULPHZrmbk 3387 -VMULPHZrmbkz 3388 -VMULPHZrmk 3389 -VMULPHZrmkz 3390 -VMULPHZrr 3391 -VMULPHZrrb 3392 -VMULPHZrrbk 3393 -VMULPHZrrbkz 3394 -VMULPHZrrk 3395 -VMULPHZrrkz 3396 -VMULPSYrm 3397 -VMULPSYrr 3398 -VMULPSZ 3399 -VMULPSZrm 3400 -VMULPSZrmb 3401 -VMULPSZrmbk 3402 -VMULPSZrmbkz 3403 -VMULPSZrmk 3404 -VMULPSZrmkz 3405 -VMULPSZrr 3406 -VMULPSZrrb 3407 -VMULPSZrrbk 3408 -VMULPSZrrbkz 3409 -VMULPSZrrk 3410 -VMULPSZrrkz 3411 -VMULPSrm 3412 -VMULPSrr 3413 -VMULSDZrm 3414 -VMULSDZrm_Int 3415 -VMULSDZrmk_Int 3416 -VMULSDZrmkz_Int 3417 -VMULSDZrr 3418 -VMULSDZrr_Int 3419 -VMULSDZrrb_Int 3420 -VMULSDZrrbk_Int 3421 -VMULSDZrrbkz_Int 3422 -VMULSDZrrk_Int 3423 -VMULSDZrrkz_Int 3424 -VMULSDrm 3425
-VMULSDrm_Int 3426 -VMULSDrr 3427 -VMULSDrr_Int 3428 -VMULSHZrm 3429 -VMULSHZrm_Int 3430 -VMULSHZrmk_Int 3431 -VMULSHZrmkz_Int 3432 -VMULSHZrr 3433 -VMULSHZrr_Int 3434 -VMULSHZrrb_Int 3435 -VMULSHZrrbk_Int 3436 -VMULSHZrrbkz_Int 3437 -VMULSHZrrk_Int 3438 -VMULSHZrrkz_Int 3439 -VMULSSZrm 3440 -VMULSSZrm_Int 3441 -VMULSSZrmk_Int 3442 -VMULSSZrmkz_Int 3443 -VMULSSZrr 3444 -VMULSSZrr_Int 3445 -VMULSSZrrb_Int 3446 -VMULSSZrrbk_Int 3447 -VMULSSZrrbkz_Int 3448 -VMULSSZrrk_Int 3449 -VMULSSZrrkz_Int 3450 -VMULSSrm 3451 -VMULSSrm_Int 3452 -VMULSSrr 3453 -VMULSSrr_Int 3454 -VMWRITE 3455 -VMXOFF 3456 -VMXON 3457 -VORPDYrm 3458 -VORPDYrr 3459 -VORPDZ 3460 -VORPDZrm 3461 -VORPDZrmb 3462 -VORPDZrmbk 3463 -VORPDZrmbkz 3464 -VORPDZrmk 3465 -VORPDZrmkz 3466 -VORPDZrr 3467 -VORPDZrrk 3468 -VORPDZrrkz 3469 -VORPDrm 3470 -VORPDrr 3471 -VORPSYrm 3472 -VORPSYrr 3473 -VORPSZ 3474 -VORPSZrm 3475 -VORPSZrmb 3476 -VORPSZrmbk 3477 -VORPSZrmbkz 3478 -VORPSZrmk 3479 -VORPSZrmkz 3480 -VORPSZrr 3481 -VORPSZrrk 3482 -VORPSZrrkz 3483 -VORPSrm 3484 -VORPSrr 3485 -VP 3486 -VPABSBYrm 3487 -VPABSBYrr 3488 -VPABSBZ 3489 -VPABSBZrm 3490 -VPABSBZrmk 3491 -VPABSBZrmkz 3492 -VPABSBZrr 3493 -VPABSBZrrk 3494 -VPABSBZrrkz 3495 -VPABSBrm 3496 -VPABSBrr 3497 -VPABSDYrm 3498 -VPABSDYrr 3499 -VPABSDZ 3500 -VPABSDZrm 3501 -VPABSDZrmb 3502 -VPABSDZrmbk 3503 -VPABSDZrmbkz 3504 -VPABSDZrmk 3505 -VPABSDZrmkz 3506 -VPABSDZrr 3507 -VPABSDZrrk 3508 -VPABSDZrrkz 3509 -VPABSDrm 3510 -VPABSDrr 3511 -VPABSQZ 3512 -VPABSQZrm 3513 -VPABSQZrmb 3514 -VPABSQZrmbk 3515 -VPABSQZrmbkz 3516 -VPABSQZrmk 3517 -VPABSQZrmkz 3518 -VPABSQZrr 3519 -VPABSQZrrk 3520 -VPABSQZrrkz 3521 -VPABSWYrm 3522 -VPABSWYrr 3523 -VPABSWZ 3524 -VPABSWZrm 3525 -VPABSWZrmk 3526 -VPABSWZrmkz 3527 -VPABSWZrr 3528 -VPABSWZrrk 3529 -VPABSWZrrkz 3530 -VPABSWrm 3531 -VPABSWrr 3532 -VPACKSSDWYrm 3533 -VPACKSSDWYrr 3534 -VPACKSSDWZ 3535 -VPACKSSDWZrm 3536 -VPACKSSDWZrmb 3537 -VPACKSSDWZrmbk 3538 -VPACKSSDWZrmbkz 3539 -VPACKSSDWZrmk 3540 -VPACKSSDWZrmkz 3541 -VPACKSSDWZrr 3542 -VPACKSSDWZrrk 3543 -VPACKSSDWZrrkz 3544 -VPACKSSDWrm 3545 -VPACKSSDWrr 3546 -VPACKSSWBYrm 3547 -VPACKSSWBYrr 3548 -VPACKSSWBZ 3549 -VPACKSSWBZrm 3550 -VPACKSSWBZrmk 3551 -VPACKSSWBZrmkz 3552 -VPACKSSWBZrr 3553 -VPACKSSWBZrrk 3554 -VPACKSSWBZrrkz 3555 -VPACKSSWBrm 3556 -VPACKSSWBrr 3557 -VPACKUSDWYrm 3558 -VPACKUSDWYrr 3559 -VPACKUSDWZ 3560 -VPACKUSDWZrm 3561 -VPACKUSDWZrmb 3562 -VPACKUSDWZrmbk 3563 -VPACKUSDWZrmbkz 3564 -VPACKUSDWZrmk 3565 -VPACKUSDWZrmkz 3566 -VPACKUSDWZrr 3567 -VPACKUSDWZrrk 3568 -VPACKUSDWZrrkz 3569 -VPACKUSDWrm 3570 -VPACKUSDWrr 3571 -VPACKUSWBYrm 3572 -VPACKUSWBYrr 3573 -VPACKUSWBZ 3574 -VPACKUSWBZrm 3575 -VPACKUSWBZrmk 3576 -VPACKUSWBZrmkz 3577 -VPACKUSWBZrr 3578 -VPACKUSWBZrrk 3579 -VPACKUSWBZrrkz 3580 -VPACKUSWBrm 3581 -VPACKUSWBrr 3582 -VPADDBYrm 3583 -VPADDBYrr 3584 -VPADDBZ 3585 -VPADDBZrm 3586 -VPADDBZrmk 3587 -VPADDBZrmkz 3588 -VPADDBZrr 3589 -VPADDBZrrk 3590 -VPADDBZrrkz 3591 -VPADDBrm 3592 -VPADDBrr 3593 -VPADDDYrm 3594 -VPADDDYrr 3595 -VPADDDZ 3596 -VPADDDZrm 3597 -VPADDDZrmb 3598 -VPADDDZrmbk 3599 -VPADDDZrmbkz 3600 -VPADDDZrmk 3601 -VPADDDZrmkz 3602 -VPADDDZrr 3603 -VPADDDZrrk 3604 -VPADDDZrrkz 3605 -VPADDDrm 3606 -VPADDDrr 3607 -VPADDQYrm 3608 -VPADDQYrr 3609 -VPADDQZ 3610 -VPADDQZrm 3611 -VPADDQZrmb 3612 -VPADDQZrmbk 3613 -VPADDQZrmbkz 3614 -VPADDQZrmk 3615 -VPADDQZrmkz 3616 -VPADDQZrr 3617 -VPADDQZrrk 3618 -VPADDQZrrkz 3619 -VPADDQrm 3620 -VPADDQrr 3621 -VPADDSBYrm 3622 -VPADDSBYrr 3623 -VPADDSBZ 3624 -VPADDSBZrm 3625 -VPADDSBZrmk 3626 -VPADDSBZrmkz 3627 -VPADDSBZrr 3628
-VPADDSBZrrk 3629 -VPADDSBZrrkz 3630 -VPADDSBrm 3631 -VPADDSBrr 3632 -VPADDSWYrm 3633 -VPADDSWYrr 3634 -VPADDSWZ 3635 -VPADDSWZrm 3636 -VPADDSWZrmk 3637 -VPADDSWZrmkz 3638 -VPADDSWZrr 3639 -VPADDSWZrrk 3640 -VPADDSWZrrkz 3641 -VPADDSWrm 3642 -VPADDSWrr 3643 -VPADDUSBYrm 3644 -VPADDUSBYrr 3645 -VPADDUSBZ 3646 -VPADDUSBZrm 3647 -VPADDUSBZrmk 3648 -VPADDUSBZrmkz 3649 -VPADDUSBZrr 3650 -VPADDUSBZrrk 3651 -VPADDUSBZrrkz 3652 -VPADDUSBrm 3653 -VPADDUSBrr 3654 -VPADDUSWYrm 3655 -VPADDUSWYrr 3656 -VPADDUSWZ 3657 -VPADDUSWZrm 3658 -VPADDUSWZrmk 3659 -VPADDUSWZrmkz 3660 -VPADDUSWZrr 3661 -VPADDUSWZrrk 3662 -VPADDUSWZrrkz 3663 -VPADDUSWrm 3664 -VPADDUSWrr 3665 -VPADDWYrm 3666 -VPADDWYrr 3667 -VPADDWZ 3668 -VPADDWZrm 3669 -VPADDWZrmk 3670 -VPADDWZrmkz 3671 -VPADDWZrr 3672 -VPADDWZrrk 3673 -VPADDWZrrkz 3674 -VPADDWrm 3675 -VPADDWrr 3676 -VPALIGNRYrmi 3677 -VPALIGNRYrri 3678 -VPALIGNRZ 3679 -VPALIGNRZrmi 3680 -VPALIGNRZrmik 3681 -VPALIGNRZrmikz 3682 -VPALIGNRZrri 3683 -VPALIGNRZrrik 3684 -VPALIGNRZrrikz 3685 -VPALIGNRrmi 3686 -VPALIGNRrri 3687 -VPANDDZ 3688 -VPANDDZrm 3689 -VPANDDZrmb 3690 -VPANDDZrmbk 3691 -VPANDDZrmbkz 3692 -VPANDDZrmk 3693 -VPANDDZrmkz 3694 -VPANDDZrr 3695 -VPANDDZrrk 3696 -VPANDDZrrkz 3697 -VPANDNDZ 3698 -VPANDNDZrm 3699 -VPANDNDZrmb 3700 -VPANDNDZrmbk 3701 -VPANDNDZrmbkz 3702 -VPANDNDZrmk 3703 -VPANDNDZrmkz 3704 -VPANDNDZrr 3705 -VPANDNDZrrk 3706 -VPANDNDZrrkz 3707 -VPANDNQZ 3708 -VPANDNQZrm 3709 -VPANDNQZrmb 3710 -VPANDNQZrmbk 3711 -VPANDNQZrmbkz 3712 -VPANDNQZrmk 3713 -VPANDNQZrmkz 3714 -VPANDNQZrr 3715 -VPANDNQZrrk 3716 -VPANDNQZrrkz 3717 -VPANDNYrm 3718 -VPANDNYrr 3719 -VPANDNrm 3720 -VPANDNrr 3721 -VPANDQZ 3722 -VPANDQZrm 3723 -VPANDQZrmb 3724 -VPANDQZrmbk 3725 -VPANDQZrmbkz 3726 -VPANDQZrmk 3727 -VPANDQZrmkz 3728 -VPANDQZrr 3729 -VPANDQZrrk 3730 -VPANDQZrrkz 3731 -VPANDYrm 3732 -VPANDYrr 3733 -VPANDrm 3734 -VPANDrr 3735 -VPAVGBYrm 3736 -VPAVGBYrr 3737 -VPAVGBZ 3738 -VPAVGBZrm 3739 -VPAVGBZrmk 3740 -VPAVGBZrmkz 3741 -VPAVGBZrr 3742 -VPAVGBZrrk 3743 -VPAVGBZrrkz 3744 -VPAVGBrm 3745 -VPAVGBrr 3746 -VPAVGWYrm 3747 -VPAVGWYrr 3748 -VPAVGWZ 3749 -VPAVGWZrm 3750 -VPAVGWZrmk 3751 -VPAVGWZrmkz 3752 -VPAVGWZrr 3753 -VPAVGWZrrk 3754 -VPAVGWZrrkz 3755 -VPAVGWrm 3756 -VPAVGWrr 3757 -VPBLENDDYrmi 3758 -VPBLENDDYrri 3759 -VPBLENDDrmi 3760 -VPBLENDDrri 3761 -VPBLENDMBZ 3762 -VPBLENDMBZrm 3763 -VPBLENDMBZrmk 3764 -VPBLENDMBZrmkz 3765 -VPBLENDMBZrr 3766 -VPBLENDMBZrrk 3767 -VPBLENDMBZrrkz 3768 -VPBLENDMDZ 3769 -VPBLENDMDZrm 3770 -VPBLENDMDZrmb 3771 -VPBLENDMDZrmbk 3772 -VPBLENDMDZrmbkz 3773 -VPBLENDMDZrmk 3774 -VPBLENDMDZrmkz 3775 -VPBLENDMDZrr 3776 -VPBLENDMDZrrk 3777 -VPBLENDMDZrrkz 3778 -VPBLENDMQZ 3779 -VPBLENDMQZrm 3780 -VPBLENDMQZrmb 3781 -VPBLENDMQZrmbk 3782 -VPBLENDMQZrmbkz 3783 -VPBLENDMQZrmk 3784 -VPBLENDMQZrmkz 3785 -VPBLENDMQZrr 3786 -VPBLENDMQZrrk 3787 -VPBLENDMQZrrkz 3788 -VPBLENDMWZ 3789 -VPBLENDMWZrm 3790 -VPBLENDMWZrmk 3791 -VPBLENDMWZrmkz 3792 -VPBLENDMWZrr 3793 -VPBLENDMWZrrk 3794 -VPBLENDMWZrrkz 3795 -VPBLENDVBYrmr 3796 -VPBLENDVBYrrr 3797 -VPBLENDVBrmr 3798 -VPBLENDVBrrr 3799 -VPBLENDWYrmi 3800 -VPBLENDWYrri 3801 -VPBLENDWrmi 3802 -VPBLENDWrri 3803 -VPBROADCASTBYrm 3804 -VPBROADCASTBYrr 3805 -VPBROADCASTBZ 3806 -VPBROADCASTBZrm 3807 -VPBROADCASTBZrmk 3808 -VPBROADCASTBZrmkz 3809 -VPBROADCASTBZrr 3810 -VPBROADCASTBZrrk 3811 -VPBROADCASTBZrrkz 3812 -VPBROADCASTBrZ 3813 -VPBROADCASTBrZrr 3814 -VPBROADCASTBrZrrk 3815 -VPBROADCASTBrZrrkz 3816 -VPBROADCASTBrm 3817 -VPBROADCASTBrr 3818 -VPBROADCASTDYrm 3819 -VPBROADCASTDYrr 3820 -VPBROADCASTDZ 3821 -VPBROADCASTDZrm 3822 
-VPBROADCASTDZrmk 3823 -VPBROADCASTDZrmkz 3824 -VPBROADCASTDZrr 3825 -VPBROADCASTDZrrk 3826 -VPBROADCASTDZrrkz 3827 -VPBROADCASTDrZ 3828 -VPBROADCASTDrZrr 3829 -VPBROADCASTDrZrrk 3830 -VPBROADCASTDrZrrkz 3831 -VPBROADCASTDrm 3832 -VPBROADCASTDrr 3833 -VPBROADCASTMB 3834 -VPBROADCASTMW 3835 -VPBROADCASTQYrm 3836 -VPBROADCASTQYrr 3837 -VPBROADCASTQZ 3838 -VPBROADCASTQZrm 3839 -VPBROADCASTQZrmk 3840 -VPBROADCASTQZrmkz 3841 -VPBROADCASTQZrr 3842 -VPBROADCASTQZrrk 3843 -VPBROADCASTQZrrkz 3844 -VPBROADCASTQrZ 3845 -VPBROADCASTQrZrr 3846 -VPBROADCASTQrZrrk 3847 -VPBROADCASTQrZrrkz 3848 -VPBROADCASTQrm 3849 -VPBROADCASTQrr 3850 -VPBROADCASTWYrm 3851 -VPBROADCASTWYrr 3852 -VPBROADCASTWZ 3853 -VPBROADCASTWZrm 3854 -VPBROADCASTWZrmk 3855 -VPBROADCASTWZrmkz 3856 -VPBROADCASTWZrr 3857 -VPBROADCASTWZrrk 3858 -VPBROADCASTWZrrkz 3859 -VPBROADCASTWrZ 3860 -VPBROADCASTWrZrr 3861 -VPBROADCASTWrZrrk 3862 -VPBROADCASTWrZrrkz 3863 -VPBROADCASTWrm 3864 -VPBROADCASTWrr 3865 -VPCLMULQDQYrmi 3866 -VPCLMULQDQYrri 3867 -VPCLMULQDQZ 3868 -VPCLMULQDQZrmi 3869 -VPCLMULQDQZrri 3870 -VPCLMULQDQrmi 3871 -VPCLMULQDQrri 3872 -VPCMOVYrmr 3873 -VPCMOVYrrm 3874 -VPCMOVYrrr 3875 -VPCMOVYrrr_REV 3876 -VPCMOVrmr 3877 -VPCMOVrrm 3878 -VPCMOVrrr 3879 -VPCMOVrrr_REV 3880 -VPCMPBZ 3881 -VPCMPBZrmi 3882 -VPCMPBZrmik 3883 -VPCMPBZrri 3884 -VPCMPBZrrik 3885 -VPCMPDZ 3886 -VPCMPDZrmbi 3887 -VPCMPDZrmbik 3888 -VPCMPDZrmi 3889 -VPCMPDZrmik 3890 -VPCMPDZrri 3891 -VPCMPDZrrik 3892 -VPCMPEQBYrm 3893 -VPCMPEQBYrr 3894 -VPCMPEQBZ 3895 -VPCMPEQBZrm 3896 -VPCMPEQBZrmk 3897 -VPCMPEQBZrr 3898 -VPCMPEQBZrrk 3899 -VPCMPEQBrm 3900 -VPCMPEQBrr 3901 -VPCMPEQDYrm 3902 -VPCMPEQDYrr 3903 -VPCMPEQDZ 3904 -VPCMPEQDZrm 3905 -VPCMPEQDZrmb 3906 -VPCMPEQDZrmbk 3907 -VPCMPEQDZrmk 3908 -VPCMPEQDZrr 3909 -VPCMPEQDZrrk 3910 -VPCMPEQDrm 3911 -VPCMPEQDrr 3912 -VPCMPEQQYrm 3913 -VPCMPEQQYrr 3914 -VPCMPEQQZ 3915 -VPCMPEQQZrm 3916 -VPCMPEQQZrmb 3917 -VPCMPEQQZrmbk 3918 -VPCMPEQQZrmk 3919 -VPCMPEQQZrr 3920 -VPCMPEQQZrrk 3921 -VPCMPEQQrm 3922 -VPCMPEQQrr 3923 -VPCMPEQWYrm 3924 -VPCMPEQWYrr 3925 -VPCMPEQWZ 3926 -VPCMPEQWZrm 3927 -VPCMPEQWZrmk 3928 -VPCMPEQWZrr 3929 -VPCMPEQWZrrk 3930 -VPCMPEQWrm 3931 -VPCMPEQWrr 3932 -VPCMPESTRIrmi 3933 -VPCMPESTRIrri 3934 -VPCMPESTRMrmi 3935 -VPCMPESTRMrri 3936 -VPCMPGTBYrm 3937 -VPCMPGTBYrr 3938 -VPCMPGTBZ 3939 -VPCMPGTBZrm 3940 -VPCMPGTBZrmk 3941 -VPCMPGTBZrr 3942 -VPCMPGTBZrrk 3943 -VPCMPGTBrm 3944 -VPCMPGTBrr 3945 -VPCMPGTDYrm 3946 -VPCMPGTDYrr 3947 -VPCMPGTDZ 3948 -VPCMPGTDZrm 3949 -VPCMPGTDZrmb 3950 -VPCMPGTDZrmbk 3951 -VPCMPGTDZrmk 3952 -VPCMPGTDZrr 3953 -VPCMPGTDZrrk 3954 -VPCMPGTDrm 3955 -VPCMPGTDrr 3956 -VPCMPGTQYrm 3957 -VPCMPGTQYrr 3958 -VPCMPGTQZ 3959 -VPCMPGTQZrm 3960 -VPCMPGTQZrmb 3961 -VPCMPGTQZrmbk 3962 -VPCMPGTQZrmk 3963 -VPCMPGTQZrr 3964 -VPCMPGTQZrrk 3965 -VPCMPGTQrm 3966 -VPCMPGTQrr 3967 -VPCMPGTWYrm 3968 -VPCMPGTWYrr 3969 -VPCMPGTWZ 3970 -VPCMPGTWZrm 3971 -VPCMPGTWZrmk 3972 -VPCMPGTWZrr 3973 -VPCMPGTWZrrk 3974 -VPCMPGTWrm 3975 -VPCMPGTWrr 3976 -VPCMPISTRIrmi 3977 -VPCMPISTRIrri 3978 -VPCMPISTRMrmi 3979 -VPCMPISTRMrri 3980 -VPCMPQZ 3981 -VPCMPQZrmbi 3982 -VPCMPQZrmbik 3983 -VPCMPQZrmi 3984 -VPCMPQZrmik 3985 -VPCMPQZrri 3986 -VPCMPQZrrik 3987 -VPCMPUBZ 3988 -VPCMPUBZrmi 3989 -VPCMPUBZrmik 3990 -VPCMPUBZrri 3991 -VPCMPUBZrrik 3992 -VPCMPUDZ 3993 -VPCMPUDZrmbi 3994 -VPCMPUDZrmbik 3995 -VPCMPUDZrmi 3996 -VPCMPUDZrmik 3997 -VPCMPUDZrri 3998 -VPCMPUDZrrik 3999 -VPCMPUQZ 4000 -VPCMPUQZrmbi 4001 -VPCMPUQZrmbik 4002 -VPCMPUQZrmi 4003 -VPCMPUQZrmik 4004 -VPCMPUQZrri 4005 -VPCMPUQZrrik 4006 -VPCMPUWZ 4007 -VPCMPUWZrmi 4008 
-VPCMPUWZrmik 4009 -VPCMPUWZrri 4010 -VPCMPUWZrrik 4011 -VPCMPWZ 4012 -VPCMPWZrmi 4013 -VPCMPWZrmik 4014 -VPCMPWZrri 4015 -VPCMPWZrrik 4016 -VPCOMBmi 4017 -VPCOMBri 4018 -VPCOMDmi 4019 -VPCOMDri 4020 -VPCOMPRESSBZ 4021 -VPCOMPRESSBZmr 4022 -VPCOMPRESSBZmrk 4023 -VPCOMPRESSBZrr 4024 -VPCOMPRESSBZrrk 4025 -VPCOMPRESSBZrrkz 4026 -VPCOMPRESSDZ 4027 -VPCOMPRESSDZmr 4028 -VPCOMPRESSDZmrk 4029 -VPCOMPRESSDZrr 4030 -VPCOMPRESSDZrrk 4031 -VPCOMPRESSDZrrkz 4032 -VPCOMPRESSQZ 4033 -VPCOMPRESSQZmr 4034 -VPCOMPRESSQZmrk 4035 -VPCOMPRESSQZrr 4036 -VPCOMPRESSQZrrk 4037 -VPCOMPRESSQZrrkz 4038 -VPCOMPRESSWZ 4039 -VPCOMPRESSWZmr 4040 -VPCOMPRESSWZmrk 4041 -VPCOMPRESSWZrr 4042 -VPCOMPRESSWZrrk 4043 -VPCOMPRESSWZrrkz 4044 -VPCOMQmi 4045 -VPCOMQri 4046 -VPCOMUBmi 4047 -VPCOMUBri 4048 -VPCOMUDmi 4049 -VPCOMUDri 4050 -VPCOMUQmi 4051 -VPCOMUQri 4052 -VPCOMUWmi 4053 -VPCOMUWri 4054 -VPCOMWmi 4055 -VPCOMWri 4056 -VPCONFLICTDZ 4057 -VPCONFLICTDZrm 4058 -VPCONFLICTDZrmb 4059 -VPCONFLICTDZrmbk 4060 -VPCONFLICTDZrmbkz 4061 -VPCONFLICTDZrmk 4062 -VPCONFLICTDZrmkz 4063 -VPCONFLICTDZrr 4064 -VPCONFLICTDZrrk 4065 -VPCONFLICTDZrrkz 4066 -VPCONFLICTQZ 4067 -VPCONFLICTQZrm 4068 -VPCONFLICTQZrmb 4069 -VPCONFLICTQZrmbk 4070 -VPCONFLICTQZrmbkz 4071 -VPCONFLICTQZrmk 4072 -VPCONFLICTQZrmkz 4073 -VPCONFLICTQZrr 4074 -VPCONFLICTQZrrk 4075 -VPCONFLICTQZrrkz 4076 -VPDPBSSDSYrm 4077 -VPDPBSSDSYrr 4078 -VPDPBSSDSZ 4079 -VPDPBSSDSZrm 4080 -VPDPBSSDSZrmb 4081 -VPDPBSSDSZrmbk 4082 -VPDPBSSDSZrmbkz 4083 -VPDPBSSDSZrmk 4084 -VPDPBSSDSZrmkz 4085 -VPDPBSSDSZrr 4086 -VPDPBSSDSZrrk 4087 -VPDPBSSDSZrrkz 4088 -VPDPBSSDSrm 4089 -VPDPBSSDSrr 4090 -VPDPBSSDYrm 4091 -VPDPBSSDYrr 4092 -VPDPBSSDZ 4093 -VPDPBSSDZrm 4094 -VPDPBSSDZrmb 4095 -VPDPBSSDZrmbk 4096 -VPDPBSSDZrmbkz 4097 -VPDPBSSDZrmk 4098 -VPDPBSSDZrmkz 4099 -VPDPBSSDZrr 4100 -VPDPBSSDZrrk 4101 -VPDPBSSDZrrkz 4102 -VPDPBSSDrm 4103 -VPDPBSSDrr 4104 -VPDPBSUDSYrm 4105 -VPDPBSUDSYrr 4106 -VPDPBSUDSZ 4107 -VPDPBSUDSZrm 4108 -VPDPBSUDSZrmb 4109 -VPDPBSUDSZrmbk 4110 -VPDPBSUDSZrmbkz 4111 -VPDPBSUDSZrmk 4112 -VPDPBSUDSZrmkz 4113 -VPDPBSUDSZrr 4114 -VPDPBSUDSZrrk 4115 -VPDPBSUDSZrrkz 4116 -VPDPBSUDSrm 4117 -VPDPBSUDSrr 4118 -VPDPBSUDYrm 4119 -VPDPBSUDYrr 4120 -VPDPBSUDZ 4121 -VPDPBSUDZrm 4122 -VPDPBSUDZrmb 4123 -VPDPBSUDZrmbk 4124 -VPDPBSUDZrmbkz 4125 -VPDPBSUDZrmk 4126 -VPDPBSUDZrmkz 4127 -VPDPBSUDZrr 4128 -VPDPBSUDZrrk 4129 -VPDPBSUDZrrkz 4130 -VPDPBSUDrm 4131 -VPDPBSUDrr 4132 -VPDPBUSDSYrm 4133 -VPDPBUSDSYrr 4134 -VPDPBUSDSZ 4135 -VPDPBUSDSZrm 4136 -VPDPBUSDSZrmb 4137 -VPDPBUSDSZrmbk 4138 -VPDPBUSDSZrmbkz 4139 -VPDPBUSDSZrmk 4140 -VPDPBUSDSZrmkz 4141 -VPDPBUSDSZrr 4142 -VPDPBUSDSZrrk 4143 -VPDPBUSDSZrrkz 4144 -VPDPBUSDSrm 4145 -VPDPBUSDSrr 4146 -VPDPBUSDYrm 4147 -VPDPBUSDYrr 4148 -VPDPBUSDZ 4149 -VPDPBUSDZrm 4150 -VPDPBUSDZrmb 4151 -VPDPBUSDZrmbk 4152 -VPDPBUSDZrmbkz 4153 -VPDPBUSDZrmk 4154 -VPDPBUSDZrmkz 4155 -VPDPBUSDZrr 4156 -VPDPBUSDZrrk 4157 -VPDPBUSDZrrkz 4158 -VPDPBUSDrm 4159 -VPDPBUSDrr 4160 -VPDPBUUDSYrm 4161 -VPDPBUUDSYrr 4162 -VPDPBUUDSZ 4163 -VPDPBUUDSZrm 4164 -VPDPBUUDSZrmb 4165 -VPDPBUUDSZrmbk 4166 -VPDPBUUDSZrmbkz 4167 -VPDPBUUDSZrmk 4168 -VPDPBUUDSZrmkz 4169 -VPDPBUUDSZrr 4170 -VPDPBUUDSZrrk 4171 -VPDPBUUDSZrrkz 4172 -VPDPBUUDSrm 4173 -VPDPBUUDSrr 4174 -VPDPBUUDYrm 4175 -VPDPBUUDYrr 4176 -VPDPBUUDZ 4177 -VPDPBUUDZrm 4178 -VPDPBUUDZrmb 4179 -VPDPBUUDZrmbk 4180 -VPDPBUUDZrmbkz 4181 -VPDPBUUDZrmk 4182 -VPDPBUUDZrmkz 4183 -VPDPBUUDZrr 4184 -VPDPBUUDZrrk 4185 -VPDPBUUDZrrkz 4186 -VPDPBUUDrm 4187 -VPDPBUUDrr 4188 -VPDPWSSDSYrm 4189 -VPDPWSSDSYrr 4190 -VPDPWSSDSZ 4191 -VPDPWSSDSZrm 4192 
-VPDPWSSDSZrmb 4193 -VPDPWSSDSZrmbk 4194 -VPDPWSSDSZrmbkz 4195 -VPDPWSSDSZrmk 4196 -VPDPWSSDSZrmkz 4197 -VPDPWSSDSZrr 4198 -VPDPWSSDSZrrk 4199 -VPDPWSSDSZrrkz 4200 -VPDPWSSDSrm 4201 -VPDPWSSDSrr 4202 -VPDPWSSDYrm 4203 -VPDPWSSDYrr 4204 -VPDPWSSDZ 4205 -VPDPWSSDZrm 4206 -VPDPWSSDZrmb 4207 -VPDPWSSDZrmbk 4208 -VPDPWSSDZrmbkz 4209 -VPDPWSSDZrmk 4210 -VPDPWSSDZrmkz 4211 -VPDPWSSDZrr 4212 -VPDPWSSDZrrk 4213 -VPDPWSSDZrrkz 4214 -VPDPWSSDrm 4215 -VPDPWSSDrr 4216 -VPDPWSUDSYrm 4217 -VPDPWSUDSYrr 4218 -VPDPWSUDSZ 4219 -VPDPWSUDSZrm 4220 -VPDPWSUDSZrmb 4221 -VPDPWSUDSZrmbk 4222 -VPDPWSUDSZrmbkz 4223 -VPDPWSUDSZrmk 4224 -VPDPWSUDSZrmkz 4225 -VPDPWSUDSZrr 4226 -VPDPWSUDSZrrk 4227 -VPDPWSUDSZrrkz 4228 -VPDPWSUDSrm 4229 -VPDPWSUDSrr 4230 -VPDPWSUDYrm 4231 -VPDPWSUDYrr 4232 -VPDPWSUDZ 4233 -VPDPWSUDZrm 4234 -VPDPWSUDZrmb 4235 -VPDPWSUDZrmbk 4236 -VPDPWSUDZrmbkz 4237 -VPDPWSUDZrmk 4238 -VPDPWSUDZrmkz 4239 -VPDPWSUDZrr 4240 -VPDPWSUDZrrk 4241 -VPDPWSUDZrrkz 4242 -VPDPWSUDrm 4243 -VPDPWSUDrr 4244 -VPDPWUSDSYrm 4245 -VPDPWUSDSYrr 4246 -VPDPWUSDSZ 4247 -VPDPWUSDSZrm 4248 -VPDPWUSDSZrmb 4249 -VPDPWUSDSZrmbk 4250 -VPDPWUSDSZrmbkz 4251 -VPDPWUSDSZrmk 4252 -VPDPWUSDSZrmkz 4253 -VPDPWUSDSZrr 4254 -VPDPWUSDSZrrk 4255 -VPDPWUSDSZrrkz 4256 -VPDPWUSDSrm 4257 -VPDPWUSDSrr 4258 -VPDPWUSDYrm 4259 -VPDPWUSDYrr 4260 -VPDPWUSDZ 4261 -VPDPWUSDZrm 4262 -VPDPWUSDZrmb 4263 -VPDPWUSDZrmbk 4264 -VPDPWUSDZrmbkz 4265 -VPDPWUSDZrmk 4266 -VPDPWUSDZrmkz 4267 -VPDPWUSDZrr 4268 -VPDPWUSDZrrk 4269 -VPDPWUSDZrrkz 4270 -VPDPWUSDrm 4271 -VPDPWUSDrr 4272 -VPDPWUUDSYrm 4273 -VPDPWUUDSYrr 4274 -VPDPWUUDSZ 4275 -VPDPWUUDSZrm 4276 -VPDPWUUDSZrmb 4277 -VPDPWUUDSZrmbk 4278 -VPDPWUUDSZrmbkz 4279 -VPDPWUUDSZrmk 4280 -VPDPWUUDSZrmkz 4281 -VPDPWUUDSZrr 4282 -VPDPWUUDSZrrk 4283 -VPDPWUUDSZrrkz 4284 -VPDPWUUDSrm 4285 -VPDPWUUDSrr 4286 -VPDPWUUDYrm 4287 -VPDPWUUDYrr 4288 -VPDPWUUDZ 4289 -VPDPWUUDZrm 4290 -VPDPWUUDZrmb 4291 -VPDPWUUDZrmbk 4292 -VPDPWUUDZrmbkz 4293 -VPDPWUUDZrmk 4294 -VPDPWUUDZrmkz 4295 -VPDPWUUDZrr 4296 -VPDPWUUDZrrk 4297 -VPDPWUUDZrrkz 4298 -VPDPWUUDrm 4299 -VPDPWUUDrr 4300 -VPERM 4301 -VPERMBZ 4302 -VPERMBZrm 4303 -VPERMBZrmk 4304 -VPERMBZrmkz 4305 -VPERMBZrr 4306 -VPERMBZrrk 4307 -VPERMBZrrkz 4308 -VPERMDYrm 4309 -VPERMDYrr 4310 -VPERMDZ 4311 -VPERMDZrm 4312 -VPERMDZrmb 4313 -VPERMDZrmbk 4314 -VPERMDZrmbkz 4315 -VPERMDZrmk 4316 -VPERMDZrmkz 4317 -VPERMDZrr 4318 -VPERMDZrrk 4319 -VPERMDZrrkz 4320 -VPERMI 4321 -VPERMIL 4322 -VPERMILPDYmi 4323 -VPERMILPDYri 4324 -VPERMILPDYrm 4325 -VPERMILPDYrr 4326 -VPERMILPDZ 4327 -VPERMILPDZmbi 4328 -VPERMILPDZmbik 4329 -VPERMILPDZmbikz 4330 -VPERMILPDZmi 4331 -VPERMILPDZmik 4332 -VPERMILPDZmikz 4333 -VPERMILPDZri 4334 -VPERMILPDZrik 4335 -VPERMILPDZrikz 4336 -VPERMILPDZrm 4337 -VPERMILPDZrmb 4338 -VPERMILPDZrmbk 4339 -VPERMILPDZrmbkz 4340 -VPERMILPDZrmk 4341 -VPERMILPDZrmkz 4342 -VPERMILPDZrr 4343 -VPERMILPDZrrk 4344 -VPERMILPDZrrkz 4345 -VPERMILPDmi 4346 -VPERMILPDri 4347 -VPERMILPDrm 4348 -VPERMILPDrr 4349 -VPERMILPSYmi 4350 -VPERMILPSYri 4351 -VPERMILPSYrm 4352 -VPERMILPSYrr 4353 -VPERMILPSZ 4354 -VPERMILPSZmbi 4355 -VPERMILPSZmbik 4356 -VPERMILPSZmbikz 4357 -VPERMILPSZmi 4358 -VPERMILPSZmik 4359 -VPERMILPSZmikz 4360 -VPERMILPSZri 4361 -VPERMILPSZrik 4362 -VPERMILPSZrikz 4363 -VPERMILPSZrm 4364 -VPERMILPSZrmb 4365 -VPERMILPSZrmbk 4366 -VPERMILPSZrmbkz 4367 -VPERMILPSZrmk 4368 -VPERMILPSZrmkz 4369 -VPERMILPSZrr 4370 -VPERMILPSZrrk 4371 -VPERMILPSZrrkz 4372 -VPERMILPSmi 4373 -VPERMILPSri 4374 -VPERMILPSrm 4375 -VPERMILPSrr 4376 -VPERMPDYmi 4377 -VPERMPDYri 4378 -VPERMPDZ 4379 -VPERMPDZmbi 4380 
-VPERMPDZmbik 4381 -VPERMPDZmbikz 4382 -VPERMPDZmi 4383 -VPERMPDZmik 4384 -VPERMPDZmikz 4385 -VPERMPDZri 4386 -VPERMPDZrik 4387 -VPERMPDZrikz 4388 -VPERMPDZrm 4389 -VPERMPDZrmb 4390 -VPERMPDZrmbk 4391 -VPERMPDZrmbkz 4392 -VPERMPDZrmk 4393 -VPERMPDZrmkz 4394 -VPERMPDZrr 4395 -VPERMPDZrrk 4396 -VPERMPDZrrkz 4397 -VPERMPSYrm 4398 -VPERMPSYrr 4399 -VPERMPSZ 4400 -VPERMPSZrm 4401 -VPERMPSZrmb 4402 -VPERMPSZrmbk 4403 -VPERMPSZrmbkz 4404 -VPERMPSZrmk 4405 -VPERMPSZrmkz 4406 -VPERMPSZrr 4407 -VPERMPSZrrk 4408 -VPERMPSZrrkz 4409 -VPERMQYmi 4410 -VPERMQYri 4411 -VPERMQZ 4412 -VPERMQZmbi 4413 -VPERMQZmbik 4414 -VPERMQZmbikz 4415 -VPERMQZmi 4416 -VPERMQZmik 4417 -VPERMQZmikz 4418 -VPERMQZri 4419 -VPERMQZrik 4420 -VPERMQZrikz 4421 -VPERMQZrm 4422 -VPERMQZrmb 4423 -VPERMQZrmbk 4424 -VPERMQZrmbkz 4425 -VPERMQZrmk 4426 -VPERMQZrmkz 4427 -VPERMQZrr 4428 -VPERMQZrrk 4429 -VPERMQZrrkz 4430 -VPERMT 4431 -VPERMWZ 4432 -VPERMWZrm 4433 -VPERMWZrmk 4434 -VPERMWZrmkz 4435 -VPERMWZrr 4436 -VPERMWZrrk 4437 -VPERMWZrrkz 4438 -VPEXPANDBZ 4439 -VPEXPANDBZrm 4440 -VPEXPANDBZrmk 4441 -VPEXPANDBZrmkz 4442 -VPEXPANDBZrr 4443 -VPEXPANDBZrrk 4444 -VPEXPANDBZrrkz 4445 -VPEXPANDDZ 4446 -VPEXPANDDZrm 4447 -VPEXPANDDZrmk 4448 -VPEXPANDDZrmkz 4449 -VPEXPANDDZrr 4450 -VPEXPANDDZrrk 4451 -VPEXPANDDZrrkz 4452 -VPEXPANDQZ 4453 -VPEXPANDQZrm 4454 -VPEXPANDQZrmk 4455 -VPEXPANDQZrmkz 4456 -VPEXPANDQZrr 4457 -VPEXPANDQZrrk 4458 -VPEXPANDQZrrkz 4459 -VPEXPANDWZ 4460 -VPEXPANDWZrm 4461 -VPEXPANDWZrmk 4462 -VPEXPANDWZrmkz 4463 -VPEXPANDWZrr 4464 -VPEXPANDWZrrk 4465 -VPEXPANDWZrrkz 4466 -VPEXTRBZmri 4467 -VPEXTRBZrri 4468 -VPEXTRBmri 4469 -VPEXTRBrri 4470 -VPEXTRDZmri 4471 -VPEXTRDZrri 4472 -VPEXTRDmri 4473 -VPEXTRDrri 4474 -VPEXTRQZmri 4475 -VPEXTRQZrri 4476 -VPEXTRQmri 4477 -VPEXTRQrri 4478 -VPEXTRWZmri 4479 -VPEXTRWZrri 4480 -VPEXTRWZrri_REV 4481 -VPEXTRWmri 4482 -VPEXTRWrri 4483 -VPEXTRWrri_REV 4484 -VPGATHERDDYrm 4485 -VPGATHERDDZ 4486 -VPGATHERDDZrm 4487 -VPGATHERDDrm 4488 -VPGATHERDQYrm 4489 -VPGATHERDQZ 4490 -VPGATHERDQZrm 4491 -VPGATHERDQrm 4492 -VPGATHERQDYrm 4493 -VPGATHERQDZ 4494 -VPGATHERQDZrm 4495 -VPGATHERQDrm 4496 -VPGATHERQQYrm 4497 -VPGATHERQQZ 4498 -VPGATHERQQZrm 4499 -VPGATHERQQrm 4500 -VPHADDBDrm 4501 -VPHADDBDrr 4502 -VPHADDBQrm 4503 -VPHADDBQrr 4504 -VPHADDBWrm 4505 -VPHADDBWrr 4506 -VPHADDDQrm 4507 -VPHADDDQrr 4508 -VPHADDDYrm 4509 -VPHADDDYrr 4510 -VPHADDDrm 4511 -VPHADDDrr 4512 -VPHADDSWYrm 4513 -VPHADDSWYrr 4514 -VPHADDSWrm 4515 -VPHADDSWrr 4516 -VPHADDUBDrm 4517 -VPHADDUBDrr 4518 -VPHADDUBQrm 4519 -VPHADDUBQrr 4520 -VPHADDUBWrm 4521 -VPHADDUBWrr 4522 -VPHADDUDQrm 4523 -VPHADDUDQrr 4524 -VPHADDUWDrm 4525 -VPHADDUWDrr 4526 -VPHADDUWQrm 4527 -VPHADDUWQrr 4528 -VPHADDWDrm 4529 -VPHADDWDrr 4530 -VPHADDWQrm 4531 -VPHADDWQrr 4532 -VPHADDWYrm 4533 -VPHADDWYrr 4534 -VPHADDWrm 4535 -VPHADDWrr 4536 -VPHMINPOSUWrm 4537 -VPHMINPOSUWrr 4538 -VPHSUBBWrm 4539 -VPHSUBBWrr 4540 -VPHSUBDQrm 4541 -VPHSUBDQrr 4542 -VPHSUBDYrm 4543 -VPHSUBDYrr 4544 -VPHSUBDrm 4545 -VPHSUBDrr 4546 -VPHSUBSWYrm 4547 -VPHSUBSWYrr 4548 -VPHSUBSWrm 4549 -VPHSUBSWrr 4550 -VPHSUBWDrm 4551 -VPHSUBWDrr 4552 -VPHSUBWYrm 4553 -VPHSUBWYrr 4554 -VPHSUBWrm 4555 -VPHSUBWrr 4556 -VPINSRBZrmi 4557 -VPINSRBZrri 4558 -VPINSRBrmi 4559 -VPINSRBrri 4560 -VPINSRDZrmi 4561 -VPINSRDZrri 4562 -VPINSRDrmi 4563 -VPINSRDrri 4564 -VPINSRQZrmi 4565 -VPINSRQZrri 4566 -VPINSRQrmi 4567 -VPINSRQrri 4568 -VPINSRWZrmi 4569 -VPINSRWZrri 4570 -VPINSRWrmi 4571 -VPINSRWrri 4572 -VPLZCNTDZ 4573 -VPLZCNTDZrm 4574 -VPLZCNTDZrmb 4575 -VPLZCNTDZrmbk 4576 -VPLZCNTDZrmbkz 4577 -VPLZCNTDZrmk 4578 
-VPLZCNTDZrmkz 4579 -VPLZCNTDZrr 4580 -VPLZCNTDZrrk 4581 -VPLZCNTDZrrkz 4582 -VPLZCNTQZ 4583 -VPLZCNTQZrm 4584 -VPLZCNTQZrmb 4585 -VPLZCNTQZrmbk 4586 -VPLZCNTQZrmbkz 4587 -VPLZCNTQZrmk 4588 -VPLZCNTQZrmkz 4589 -VPLZCNTQZrr 4590 -VPLZCNTQZrrk 4591 -VPLZCNTQZrrkz 4592 -VPMACSDDrm 4593 -VPMACSDDrr 4594 -VPMACSDQHrm 4595 -VPMACSDQHrr 4596 -VPMACSDQLrm 4597 -VPMACSDQLrr 4598 -VPMACSSDDrm 4599 -VPMACSSDDrr 4600 -VPMACSSDQHrm 4601 -VPMACSSDQHrr 4602 -VPMACSSDQLrm 4603 -VPMACSSDQLrr 4604 -VPMACSSWDrm 4605 -VPMACSSWDrr 4606 -VPMACSSWWrm 4607 -VPMACSSWWrr 4608 -VPMACSWDrm 4609 -VPMACSWDrr 4610 -VPMACSWWrm 4611 -VPMACSWWrr 4612 -VPMADCSSWDrm 4613 -VPMADCSSWDrr 4614 -VPMADCSWDrm 4615 -VPMADCSWDrr 4616 -VPMADD 4617 -VPMADDUBSWYrm 4618 -VPMADDUBSWYrr 4619 -VPMADDUBSWZ 4620 -VPMADDUBSWZrm 4621 -VPMADDUBSWZrmk 4622 -VPMADDUBSWZrmkz 4623 -VPMADDUBSWZrr 4624 -VPMADDUBSWZrrk 4625 -VPMADDUBSWZrrkz 4626 -VPMADDUBSWrm 4627 -VPMADDUBSWrr 4628 -VPMADDWDYrm 4629 -VPMADDWDYrr 4630 -VPMADDWDZ 4631 -VPMADDWDZrm 4632 -VPMADDWDZrmk 4633 -VPMADDWDZrmkz 4634 -VPMADDWDZrr 4635 -VPMADDWDZrrk 4636 -VPMADDWDZrrkz 4637 -VPMADDWDrm 4638 -VPMADDWDrr 4639 -VPMASKMOVDYmr 4640 -VPMASKMOVDYrm 4641 -VPMASKMOVDmr 4642 -VPMASKMOVDrm 4643 -VPMASKMOVQYmr 4644 -VPMASKMOVQYrm 4645 -VPMASKMOVQmr 4646 -VPMASKMOVQrm 4647 -VPMAXSBYrm 4648 -VPMAXSBYrr 4649 -VPMAXSBZ 4650 -VPMAXSBZrm 4651 -VPMAXSBZrmk 4652 -VPMAXSBZrmkz 4653 -VPMAXSBZrr 4654 -VPMAXSBZrrk 4655 -VPMAXSBZrrkz 4656 -VPMAXSBrm 4657 -VPMAXSBrr 4658 -VPMAXSDYrm 4659 -VPMAXSDYrr 4660 -VPMAXSDZ 4661 -VPMAXSDZrm 4662 -VPMAXSDZrmb 4663 -VPMAXSDZrmbk 4664 -VPMAXSDZrmbkz 4665 -VPMAXSDZrmk 4666 -VPMAXSDZrmkz 4667 -VPMAXSDZrr 4668 -VPMAXSDZrrk 4669 -VPMAXSDZrrkz 4670 -VPMAXSDrm 4671 -VPMAXSDrr 4672 -VPMAXSQZ 4673 -VPMAXSQZrm 4674 -VPMAXSQZrmb 4675 -VPMAXSQZrmbk 4676 -VPMAXSQZrmbkz 4677 -VPMAXSQZrmk 4678 -VPMAXSQZrmkz 4679 -VPMAXSQZrr 4680 -VPMAXSQZrrk 4681 -VPMAXSQZrrkz 4682 -VPMAXSWYrm 4683 -VPMAXSWYrr 4684 -VPMAXSWZ 4685 -VPMAXSWZrm 4686 -VPMAXSWZrmk 4687 -VPMAXSWZrmkz 4688 -VPMAXSWZrr 4689 -VPMAXSWZrrk 4690 -VPMAXSWZrrkz 4691 -VPMAXSWrm 4692 -VPMAXSWrr 4693 -VPMAXUBYrm 4694 -VPMAXUBYrr 4695 -VPMAXUBZ 4696 -VPMAXUBZrm 4697 -VPMAXUBZrmk 4698 -VPMAXUBZrmkz 4699 -VPMAXUBZrr 4700 -VPMAXUBZrrk 4701 -VPMAXUBZrrkz 4702 -VPMAXUBrm 4703 -VPMAXUBrr 4704 -VPMAXUDYrm 4705 -VPMAXUDYrr 4706 -VPMAXUDZ 4707 -VPMAXUDZrm 4708 -VPMAXUDZrmb 4709 -VPMAXUDZrmbk 4710 -VPMAXUDZrmbkz 4711 -VPMAXUDZrmk 4712 -VPMAXUDZrmkz 4713 -VPMAXUDZrr 4714 -VPMAXUDZrrk 4715 -VPMAXUDZrrkz 4716 -VPMAXUDrm 4717 -VPMAXUDrr 4718 -VPMAXUQZ 4719 -VPMAXUQZrm 4720 -VPMAXUQZrmb 4721 -VPMAXUQZrmbk 4722 -VPMAXUQZrmbkz 4723 -VPMAXUQZrmk 4724 -VPMAXUQZrmkz 4725 -VPMAXUQZrr 4726 -VPMAXUQZrrk 4727 -VPMAXUQZrrkz 4728 -VPMAXUWYrm 4729 -VPMAXUWYrr 4730 -VPMAXUWZ 4731 -VPMAXUWZrm 4732 -VPMAXUWZrmk 4733 -VPMAXUWZrmkz 4734 -VPMAXUWZrr 4735 -VPMAXUWZrrk 4736 -VPMAXUWZrrkz 4737 -VPMAXUWrm 4738 -VPMAXUWrr 4739 -VPMINSBYrm 4740 -VPMINSBYrr 4741 -VPMINSBZ 4742 -VPMINSBZrm 4743 -VPMINSBZrmk 4744 -VPMINSBZrmkz 4745 -VPMINSBZrr 4746 -VPMINSBZrrk 4747 -VPMINSBZrrkz 4748 -VPMINSBrm 4749 -VPMINSBrr 4750 -VPMINSDYrm 4751 -VPMINSDYrr 4752 -VPMINSDZ 4753 -VPMINSDZrm 4754 -VPMINSDZrmb 4755 -VPMINSDZrmbk 4756 -VPMINSDZrmbkz 4757 -VPMINSDZrmk 4758 -VPMINSDZrmkz 4759 -VPMINSDZrr 4760 -VPMINSDZrrk 4761 -VPMINSDZrrkz 4762 -VPMINSDrm 4763 -VPMINSDrr 4764 -VPMINSQZ 4765 -VPMINSQZrm 4766 -VPMINSQZrmb 4767 -VPMINSQZrmbk 4768 -VPMINSQZrmbkz 4769 -VPMINSQZrmk 4770 -VPMINSQZrmkz 4771 -VPMINSQZrr 4772 -VPMINSQZrrk 4773 -VPMINSQZrrkz 4774 -VPMINSWYrm 4775 -VPMINSWYrr 4776 
-VPMINSWZ 4777 -VPMINSWZrm 4778 -VPMINSWZrmk 4779 -VPMINSWZrmkz 4780 -VPMINSWZrr 4781 -VPMINSWZrrk 4782 -VPMINSWZrrkz 4783 -VPMINSWrm 4784 -VPMINSWrr 4785 -VPMINUBYrm 4786 -VPMINUBYrr 4787 -VPMINUBZ 4788 -VPMINUBZrm 4789 -VPMINUBZrmk 4790 -VPMINUBZrmkz 4791 -VPMINUBZrr 4792 -VPMINUBZrrk 4793 -VPMINUBZrrkz 4794 -VPMINUBrm 4795 -VPMINUBrr 4796 -VPMINUDYrm 4797 -VPMINUDYrr 4798 -VPMINUDZ 4799 -VPMINUDZrm 4800 -VPMINUDZrmb 4801 -VPMINUDZrmbk 4802 -VPMINUDZrmbkz 4803 -VPMINUDZrmk 4804 -VPMINUDZrmkz 4805 -VPMINUDZrr 4806 -VPMINUDZrrk 4807 -VPMINUDZrrkz 4808 -VPMINUDrm 4809 -VPMINUDrr 4810 -VPMINUQZ 4811 -VPMINUQZrm 4812 -VPMINUQZrmb 4813 -VPMINUQZrmbk 4814 -VPMINUQZrmbkz 4815 -VPMINUQZrmk 4816 -VPMINUQZrmkz 4817 -VPMINUQZrr 4818 -VPMINUQZrrk 4819 -VPMINUQZrrkz 4820 -VPMINUWYrm 4821 -VPMINUWYrr 4822 -VPMINUWZ 4823 -VPMINUWZrm 4824 -VPMINUWZrmk 4825 -VPMINUWZrmkz 4826 -VPMINUWZrr 4827 -VPMINUWZrrk 4828 -VPMINUWZrrkz 4829 -VPMINUWrm 4830 -VPMINUWrr 4831 -VPMOVB 4832 -VPMOVD 4833 -VPMOVDBZ 4834 -VPMOVDBZmr 4835 -VPMOVDBZmrk 4836 -VPMOVDBZrr 4837 -VPMOVDBZrrk 4838 -VPMOVDBZrrkz 4839 -VPMOVDWZ 4840 -VPMOVDWZmr 4841 -VPMOVDWZmrk 4842 -VPMOVDWZrr 4843 -VPMOVDWZrrk 4844 -VPMOVDWZrrkz 4845 -VPMOVM 4846 -VPMOVMSKBYrr 4847 -VPMOVMSKBrr 4848 -VPMOVQ 4849 -VPMOVQBZ 4850 -VPMOVQBZmr 4851 -VPMOVQBZmrk 4852 -VPMOVQBZrr 4853 -VPMOVQBZrrk 4854 -VPMOVQBZrrkz 4855 -VPMOVQDZ 4856 -VPMOVQDZmr 4857 -VPMOVQDZmrk 4858 -VPMOVQDZrr 4859 -VPMOVQDZrrk 4860 -VPMOVQDZrrkz 4861 -VPMOVQWZ 4862 -VPMOVQWZmr 4863 -VPMOVQWZmrk 4864 -VPMOVQWZrr 4865 -VPMOVQWZrrk 4866 -VPMOVQWZrrkz 4867 -VPMOVSDBZ 4868 -VPMOVSDBZmr 4869 -VPMOVSDBZmrk 4870 -VPMOVSDBZrr 4871 -VPMOVSDBZrrk 4872 -VPMOVSDBZrrkz 4873 -VPMOVSDWZ 4874 -VPMOVSDWZmr 4875 -VPMOVSDWZmrk 4876 -VPMOVSDWZrr 4877 -VPMOVSDWZrrk 4878 -VPMOVSDWZrrkz 4879 -VPMOVSQBZ 4880 -VPMOVSQBZmr 4881 -VPMOVSQBZmrk 4882 -VPMOVSQBZrr 4883 -VPMOVSQBZrrk 4884 -VPMOVSQBZrrkz 4885 -VPMOVSQDZ 4886 -VPMOVSQDZmr 4887 -VPMOVSQDZmrk 4888 -VPMOVSQDZrr 4889 -VPMOVSQDZrrk 4890 -VPMOVSQDZrrkz 4891 -VPMOVSQWZ 4892 -VPMOVSQWZmr 4893 -VPMOVSQWZmrk 4894 -VPMOVSQWZrr 4895 -VPMOVSQWZrrk 4896 -VPMOVSQWZrrkz 4897 -VPMOVSWBZ 4898 -VPMOVSWBZmr 4899 -VPMOVSWBZmrk 4900 -VPMOVSWBZrr 4901 -VPMOVSWBZrrk 4902 -VPMOVSWBZrrkz 4903 -VPMOVSXBDYrm 4904 -VPMOVSXBDYrr 4905 -VPMOVSXBDZ 4906 -VPMOVSXBDZrm 4907 -VPMOVSXBDZrmk 4908 -VPMOVSXBDZrmkz 4909 -VPMOVSXBDZrr 4910 -VPMOVSXBDZrrk 4911 -VPMOVSXBDZrrkz 4912 -VPMOVSXBDrm 4913 -VPMOVSXBDrr 4914 -VPMOVSXBQYrm 4915 -VPMOVSXBQYrr 4916 -VPMOVSXBQZ 4917 -VPMOVSXBQZrm 4918 -VPMOVSXBQZrmk 4919 -VPMOVSXBQZrmkz 4920 -VPMOVSXBQZrr 4921 -VPMOVSXBQZrrk 4922 -VPMOVSXBQZrrkz 4923 -VPMOVSXBQrm 4924 -VPMOVSXBQrr 4925 -VPMOVSXBWYrm 4926 -VPMOVSXBWYrr 4927 -VPMOVSXBWZ 4928 -VPMOVSXBWZrm 4929 -VPMOVSXBWZrmk 4930 -VPMOVSXBWZrmkz 4931 -VPMOVSXBWZrr 4932 -VPMOVSXBWZrrk 4933 -VPMOVSXBWZrrkz 4934 -VPMOVSXBWrm 4935 -VPMOVSXBWrr 4936 -VPMOVSXDQYrm 4937 -VPMOVSXDQYrr 4938 -VPMOVSXDQZ 4939 -VPMOVSXDQZrm 4940 -VPMOVSXDQZrmk 4941 -VPMOVSXDQZrmkz 4942 -VPMOVSXDQZrr 4943 -VPMOVSXDQZrrk 4944 -VPMOVSXDQZrrkz 4945 -VPMOVSXDQrm 4946 -VPMOVSXDQrr 4947 -VPMOVSXWDYrm 4948 -VPMOVSXWDYrr 4949 -VPMOVSXWDZ 4950 -VPMOVSXWDZrm 4951 -VPMOVSXWDZrmk 4952 -VPMOVSXWDZrmkz 4953 -VPMOVSXWDZrr 4954 -VPMOVSXWDZrrk 4955 -VPMOVSXWDZrrkz 4956 -VPMOVSXWDrm 4957 -VPMOVSXWDrr 4958 -VPMOVSXWQYrm 4959 -VPMOVSXWQYrr 4960 -VPMOVSXWQZ 4961 -VPMOVSXWQZrm 4962 -VPMOVSXWQZrmk 4963 -VPMOVSXWQZrmkz 4964 -VPMOVSXWQZrr 4965 -VPMOVSXWQZrrk 4966 -VPMOVSXWQZrrkz 4967 -VPMOVSXWQrm 4968 -VPMOVSXWQrr 4969 -VPMOVUSDBZ 4970 -VPMOVUSDBZmr 4971 -VPMOVUSDBZmrk 4972 
-VPMOVUSDBZrr 4973 -VPMOVUSDBZrrk 4974 -VPMOVUSDBZrrkz 4975 -VPMOVUSDWZ 4976 -VPMOVUSDWZmr 4977 -VPMOVUSDWZmrk 4978 -VPMOVUSDWZrr 4979 -VPMOVUSDWZrrk 4980 -VPMOVUSDWZrrkz 4981 -VPMOVUSQBZ 4982 -VPMOVUSQBZmr 4983 -VPMOVUSQBZmrk 4984 -VPMOVUSQBZrr 4985 -VPMOVUSQBZrrk 4986 -VPMOVUSQBZrrkz 4987 -VPMOVUSQDZ 4988 -VPMOVUSQDZmr 4989 -VPMOVUSQDZmrk 4990 -VPMOVUSQDZrr 4991 -VPMOVUSQDZrrk 4992 -VPMOVUSQDZrrkz 4993 -VPMOVUSQWZ 4994 -VPMOVUSQWZmr 4995 -VPMOVUSQWZmrk 4996 -VPMOVUSQWZrr 4997 -VPMOVUSQWZrrk 4998 -VPMOVUSQWZrrkz 4999 -VPMOVUSWBZ 5000 -VPMOVUSWBZmr 5001 -VPMOVUSWBZmrk 5002 -VPMOVUSWBZrr 5003 -VPMOVUSWBZrrk 5004 -VPMOVUSWBZrrkz 5005 -VPMOVW 5006 -VPMOVWBZ 5007 -VPMOVWBZmr 5008 -VPMOVWBZmrk 5009 -VPMOVWBZrr 5010 -VPMOVWBZrrk 5011 -VPMOVWBZrrkz 5012 -VPMOVZXBDYrm 5013 -VPMOVZXBDYrr 5014 -VPMOVZXBDZ 5015 -VPMOVZXBDZrm 5016 -VPMOVZXBDZrmk 5017 -VPMOVZXBDZrmkz 5018 -VPMOVZXBDZrr 5019 -VPMOVZXBDZrrk 5020 -VPMOVZXBDZrrkz 5021 -VPMOVZXBDrm 5022 -VPMOVZXBDrr 5023 -VPMOVZXBQYrm 5024 -VPMOVZXBQYrr 5025 -VPMOVZXBQZ 5026 -VPMOVZXBQZrm 5027 -VPMOVZXBQZrmk 5028 -VPMOVZXBQZrmkz 5029 -VPMOVZXBQZrr 5030 -VPMOVZXBQZrrk 5031 -VPMOVZXBQZrrkz 5032 -VPMOVZXBQrm 5033 -VPMOVZXBQrr 5034 -VPMOVZXBWYrm 5035 -VPMOVZXBWYrr 5036 -VPMOVZXBWZ 5037 -VPMOVZXBWZrm 5038 -VPMOVZXBWZrmk 5039 -VPMOVZXBWZrmkz 5040 -VPMOVZXBWZrr 5041 -VPMOVZXBWZrrk 5042 -VPMOVZXBWZrrkz 5043 -VPMOVZXBWrm 5044 -VPMOVZXBWrr 5045 -VPMOVZXDQYrm 5046 -VPMOVZXDQYrr 5047 -VPMOVZXDQZ 5048 -VPMOVZXDQZrm 5049 -VPMOVZXDQZrmk 5050 -VPMOVZXDQZrmkz 5051 -VPMOVZXDQZrr 5052 -VPMOVZXDQZrrk 5053 -VPMOVZXDQZrrkz 5054 -VPMOVZXDQrm 5055 -VPMOVZXDQrr 5056 -VPMOVZXWDYrm 5057 -VPMOVZXWDYrr 5058 -VPMOVZXWDZ 5059 -VPMOVZXWDZrm 5060 -VPMOVZXWDZrmk 5061 -VPMOVZXWDZrmkz 5062 -VPMOVZXWDZrr 5063 -VPMOVZXWDZrrk 5064 -VPMOVZXWDZrrkz 5065 -VPMOVZXWDrm 5066 -VPMOVZXWDrr 5067 -VPMOVZXWQYrm 5068 -VPMOVZXWQYrr 5069 -VPMOVZXWQZ 5070 -VPMOVZXWQZrm 5071 -VPMOVZXWQZrmk 5072 -VPMOVZXWQZrmkz 5073 -VPMOVZXWQZrr 5074 -VPMOVZXWQZrrk 5075 -VPMOVZXWQZrrkz 5076 -VPMOVZXWQrm 5077 -VPMOVZXWQrr 5078 -VPMULDQYrm 5079 -VPMULDQYrr 5080 -VPMULDQZ 5081 -VPMULDQZrm 5082 -VPMULDQZrmb 5083 -VPMULDQZrmbk 5084 -VPMULDQZrmbkz 5085 -VPMULDQZrmk 5086 -VPMULDQZrmkz 5087 -VPMULDQZrr 5088 -VPMULDQZrrk 5089 -VPMULDQZrrkz 5090 -VPMULDQrm 5091 -VPMULDQrr 5092 -VPMULHRSWYrm 5093 -VPMULHRSWYrr 5094 -VPMULHRSWZ 5095 -VPMULHRSWZrm 5096 -VPMULHRSWZrmk 5097 -VPMULHRSWZrmkz 5098 -VPMULHRSWZrr 5099 -VPMULHRSWZrrk 5100 -VPMULHRSWZrrkz 5101 -VPMULHRSWrm 5102 -VPMULHRSWrr 5103 -VPMULHUWYrm 5104 -VPMULHUWYrr 5105 -VPMULHUWZ 5106 -VPMULHUWZrm 5107 -VPMULHUWZrmk 5108 -VPMULHUWZrmkz 5109 -VPMULHUWZrr 5110 -VPMULHUWZrrk 5111 -VPMULHUWZrrkz 5112 -VPMULHUWrm 5113 -VPMULHUWrr 5114 -VPMULHWYrm 5115 -VPMULHWYrr 5116 -VPMULHWZ 5117 -VPMULHWZrm 5118 -VPMULHWZrmk 5119 -VPMULHWZrmkz 5120 -VPMULHWZrr 5121 -VPMULHWZrrk 5122 -VPMULHWZrrkz 5123 -VPMULHWrm 5124 -VPMULHWrr 5125 -VPMULLDYrm 5126 -VPMULLDYrr 5127 -VPMULLDZ 5128 -VPMULLDZrm 5129 -VPMULLDZrmb 5130 -VPMULLDZrmbk 5131 -VPMULLDZrmbkz 5132 -VPMULLDZrmk 5133 -VPMULLDZrmkz 5134 -VPMULLDZrr 5135 -VPMULLDZrrk 5136 -VPMULLDZrrkz 5137 -VPMULLDrm 5138 -VPMULLDrr 5139 -VPMULLQZ 5140 -VPMULLQZrm 5141 -VPMULLQZrmb 5142 -VPMULLQZrmbk 5143 -VPMULLQZrmbkz 5144 -VPMULLQZrmk 5145 -VPMULLQZrmkz 5146 -VPMULLQZrr 5147 -VPMULLQZrrk 5148 -VPMULLQZrrkz 5149 -VPMULLWYrm 5150 -VPMULLWYrr 5151 -VPMULLWZ 5152 -VPMULLWZrm 5153 -VPMULLWZrmk 5154 -VPMULLWZrmkz 5155 -VPMULLWZrr 5156 -VPMULLWZrrk 5157 -VPMULLWZrrkz 5158 -VPMULLWrm 5159 -VPMULLWrr 5160 -VPMULTISHIFTQBZ 5161 -VPMULTISHIFTQBZrm 5162 -VPMULTISHIFTQBZrmb 5163 
-VPMULTISHIFTQBZrmbk 5164 -VPMULTISHIFTQBZrmbkz 5165 -VPMULTISHIFTQBZrmk 5166 -VPMULTISHIFTQBZrmkz 5167 -VPMULTISHIFTQBZrr 5168 -VPMULTISHIFTQBZrrk 5169 -VPMULTISHIFTQBZrrkz 5170 -VPMULUDQYrm 5171 -VPMULUDQYrr 5172 -VPMULUDQZ 5173 -VPMULUDQZrm 5174 -VPMULUDQZrmb 5175 -VPMULUDQZrmbk 5176 -VPMULUDQZrmbkz 5177 -VPMULUDQZrmk 5178 -VPMULUDQZrmkz 5179 -VPMULUDQZrr 5180 -VPMULUDQZrrk 5181 -VPMULUDQZrrkz 5182 -VPMULUDQrm 5183 -VPMULUDQrr 5184 -VPOPCNTBZ 5185 -VPOPCNTBZrm 5186 -VPOPCNTBZrmk 5187 -VPOPCNTBZrmkz 5188 -VPOPCNTBZrr 5189 -VPOPCNTBZrrk 5190 -VPOPCNTBZrrkz 5191 -VPOPCNTDZ 5192 -VPOPCNTDZrm 5193 -VPOPCNTDZrmb 5194 -VPOPCNTDZrmbk 5195 -VPOPCNTDZrmbkz 5196 -VPOPCNTDZrmk 5197 -VPOPCNTDZrmkz 5198 -VPOPCNTDZrr 5199 -VPOPCNTDZrrk 5200 -VPOPCNTDZrrkz 5201 -VPOPCNTQZ 5202 -VPOPCNTQZrm 5203 -VPOPCNTQZrmb 5204 -VPOPCNTQZrmbk 5205 -VPOPCNTQZrmbkz 5206 -VPOPCNTQZrmk 5207 -VPOPCNTQZrmkz 5208 -VPOPCNTQZrr 5209 -VPOPCNTQZrrk 5210 -VPOPCNTQZrrkz 5211 -VPOPCNTWZ 5212 -VPOPCNTWZrm 5213 -VPOPCNTWZrmk 5214 -VPOPCNTWZrmkz 5215 -VPOPCNTWZrr 5216 -VPOPCNTWZrrk 5217 -VPOPCNTWZrrkz 5218 -VPORDZ 5219 -VPORDZrm 5220 -VPORDZrmb 5221 -VPORDZrmbk 5222 -VPORDZrmbkz 5223 -VPORDZrmk 5224 -VPORDZrmkz 5225 -VPORDZrr 5226 -VPORDZrrk 5227 -VPORDZrrkz 5228 -VPORQZ 5229 -VPORQZrm 5230 -VPORQZrmb 5231 -VPORQZrmbk 5232 -VPORQZrmbkz 5233 -VPORQZrmk 5234 -VPORQZrmkz 5235 -VPORQZrr 5236 -VPORQZrrk 5237 -VPORQZrrkz 5238 -VPORYrm 5239 -VPORYrr 5240 -VPORrm 5241 -VPORrr 5242 -VPPERMrmr 5243 -VPPERMrrm 5244 -VPPERMrrr 5245 -VPPERMrrr_REV 5246 -VPROLDZ 5247 -VPROLDZmbi 5248 -VPROLDZmbik 5249 -VPROLDZmbikz 5250 -VPROLDZmi 5251 -VPROLDZmik 5252 -VPROLDZmikz 5253 -VPROLDZri 5254 -VPROLDZrik 5255 -VPROLDZrikz 5256 -VPROLQZ 5257 -VPROLQZmbi 5258 -VPROLQZmbik 5259 -VPROLQZmbikz 5260 -VPROLQZmi 5261 -VPROLQZmik 5262 -VPROLQZmikz 5263 -VPROLQZri 5264 -VPROLQZrik 5265 -VPROLQZrikz 5266 -VPROLVDZ 5267 -VPROLVDZrm 5268 -VPROLVDZrmb 5269 -VPROLVDZrmbk 5270 -VPROLVDZrmbkz 5271 -VPROLVDZrmk 5272 -VPROLVDZrmkz 5273 -VPROLVDZrr 5274 -VPROLVDZrrk 5275 -VPROLVDZrrkz 5276 -VPROLVQZ 5277 -VPROLVQZrm 5278 -VPROLVQZrmb 5279 -VPROLVQZrmbk 5280 -VPROLVQZrmbkz 5281 -VPROLVQZrmk 5282 -VPROLVQZrmkz 5283 -VPROLVQZrr 5284 -VPROLVQZrrk 5285 -VPROLVQZrrkz 5286 -VPRORDZ 5287 -VPRORDZmbi 5288 -VPRORDZmbik 5289 -VPRORDZmbikz 5290 -VPRORDZmi 5291 -VPRORDZmik 5292 -VPRORDZmikz 5293 -VPRORDZri 5294 -VPRORDZrik 5295 -VPRORDZrikz 5296 -VPRORQZ 5297 -VPRORQZmbi 5298 -VPRORQZmbik 5299 -VPRORQZmbikz 5300 -VPRORQZmi 5301 -VPRORQZmik 5302 -VPRORQZmikz 5303 -VPRORQZri 5304 -VPRORQZrik 5305 -VPRORQZrikz 5306 -VPRORVDZ 5307 -VPRORVDZrm 5308 -VPRORVDZrmb 5309 -VPRORVDZrmbk 5310 -VPRORVDZrmbkz 5311 -VPRORVDZrmk 5312 -VPRORVDZrmkz 5313 -VPRORVDZrr 5314 -VPRORVDZrrk 5315 -VPRORVDZrrkz 5316 -VPRORVQZ 5317 -VPRORVQZrm 5318 -VPRORVQZrmb 5319 -VPRORVQZrmbk 5320 -VPRORVQZrmbkz 5321 -VPRORVQZrmk 5322 -VPRORVQZrmkz 5323 -VPRORVQZrr 5324 -VPRORVQZrrk 5325 -VPRORVQZrrkz 5326 -VPROTBmi 5327 -VPROTBmr 5328 -VPROTBri 5329 -VPROTBrm 5330 -VPROTBrr 5331 -VPROTBrr_REV 5332 -VPROTDmi 5333 -VPROTDmr 5334 -VPROTDri 5335 -VPROTDrm 5336 -VPROTDrr 5337 -VPROTDrr_REV 5338 -VPROTQmi 5339 -VPROTQmr 5340 -VPROTQri 5341 -VPROTQrm 5342 -VPROTQrr 5343 -VPROTQrr_REV 5344 -VPROTWmi 5345 -VPROTWmr 5346 -VPROTWri 5347 -VPROTWrm 5348 -VPROTWrr 5349 -VPROTWrr_REV 5350 -VPSADBWYrm 5351 -VPSADBWYrr 5352 -VPSADBWZ 5353 -VPSADBWZrm 5354 -VPSADBWZrr 5355 -VPSADBWrm 5356 -VPSADBWrr 5357 -VPSCATTERDDZ 5358 -VPSCATTERDDZmr 5359 -VPSCATTERDQZ 5360 -VPSCATTERDQZmr 5361 -VPSCATTERQDZ 5362 -VPSCATTERQDZmr 5363 -VPSCATTERQQZ 5364 
-VPSCATTERQQZmr 5365 -VPSHABmr 5366 -VPSHABrm 5367 -VPSHABrr 5368 -VPSHABrr_REV 5369 -VPSHADmr 5370 -VPSHADrm 5371 -VPSHADrr 5372 -VPSHADrr_REV 5373 -VPSHAQmr 5374 -VPSHAQrm 5375 -VPSHAQrr 5376 -VPSHAQrr_REV 5377 -VPSHAWmr 5378 -VPSHAWrm 5379 -VPSHAWrr 5380 -VPSHAWrr_REV 5381 -VPSHLBmr 5382 -VPSHLBrm 5383 -VPSHLBrr 5384 -VPSHLBrr_REV 5385 -VPSHLDDZ 5386 -VPSHLDDZrmbi 5387 -VPSHLDDZrmbik 5388 -VPSHLDDZrmbikz 5389 -VPSHLDDZrmi 5390 -VPSHLDDZrmik 5391 -VPSHLDDZrmikz 5392 -VPSHLDDZrri 5393 -VPSHLDDZrrik 5394 -VPSHLDDZrrikz 5395 -VPSHLDQZ 5396 -VPSHLDQZrmbi 5397 -VPSHLDQZrmbik 5398 -VPSHLDQZrmbikz 5399 -VPSHLDQZrmi 5400 -VPSHLDQZrmik 5401 -VPSHLDQZrmikz 5402 -VPSHLDQZrri 5403 -VPSHLDQZrrik 5404 -VPSHLDQZrrikz 5405 -VPSHLDVDZ 5406 -VPSHLDVDZm 5407 -VPSHLDVDZmb 5408 -VPSHLDVDZmbk 5409 -VPSHLDVDZmbkz 5410 -VPSHLDVDZmk 5411 -VPSHLDVDZmkz 5412 -VPSHLDVDZr 5413 -VPSHLDVDZrk 5414 -VPSHLDVDZrkz 5415 -VPSHLDVQZ 5416 -VPSHLDVQZm 5417 -VPSHLDVQZmb 5418 -VPSHLDVQZmbk 5419 -VPSHLDVQZmbkz 5420 -VPSHLDVQZmk 5421 -VPSHLDVQZmkz 5422 -VPSHLDVQZr 5423 -VPSHLDVQZrk 5424 -VPSHLDVQZrkz 5425 -VPSHLDVWZ 5426 -VPSHLDVWZm 5427 -VPSHLDVWZmk 5428 -VPSHLDVWZmkz 5429 -VPSHLDVWZr 5430 -VPSHLDVWZrk 5431 -VPSHLDVWZrkz 5432 -VPSHLDWZ 5433 -VPSHLDWZrmi 5434 -VPSHLDWZrmik 5435 -VPSHLDWZrmikz 5436 -VPSHLDWZrri 5437 -VPSHLDWZrrik 5438 -VPSHLDWZrrikz 5439 -VPSHLDmr 5440 -VPSHLDrm 5441 -VPSHLDrr 5442 -VPSHLDrr_REV 5443 -VPSHLQmr 5444 -VPSHLQrm 5445 -VPSHLQrr 5446 -VPSHLQrr_REV 5447 -VPSHLWmr 5448 -VPSHLWrm 5449 -VPSHLWrr 5450 -VPSHLWrr_REV 5451 -VPSHRDDZ 5452 -VPSHRDDZrmbi 5453 -VPSHRDDZrmbik 5454 -VPSHRDDZrmbikz 5455 -VPSHRDDZrmi 5456 -VPSHRDDZrmik 5457 -VPSHRDDZrmikz 5458 -VPSHRDDZrri 5459 -VPSHRDDZrrik 5460 -VPSHRDDZrrikz 5461 -VPSHRDQZ 5462 -VPSHRDQZrmbi 5463 -VPSHRDQZrmbik 5464 -VPSHRDQZrmbikz 5465 -VPSHRDQZrmi 5466 -VPSHRDQZrmik 5467 -VPSHRDQZrmikz 5468 -VPSHRDQZrri 5469 -VPSHRDQZrrik 5470 -VPSHRDQZrrikz 5471 -VPSHRDVDZ 5472 -VPSHRDVDZm 5473 -VPSHRDVDZmb 5474 -VPSHRDVDZmbk 5475 -VPSHRDVDZmbkz 5476 -VPSHRDVDZmk 5477 -VPSHRDVDZmkz 5478 -VPSHRDVDZr 5479 -VPSHRDVDZrk 5480 -VPSHRDVDZrkz 5481 -VPSHRDVQZ 5482 -VPSHRDVQZm 5483 -VPSHRDVQZmb 5484 -VPSHRDVQZmbk 5485 -VPSHRDVQZmbkz 5486 -VPSHRDVQZmk 5487 -VPSHRDVQZmkz 5488 -VPSHRDVQZr 5489 -VPSHRDVQZrk 5490 -VPSHRDVQZrkz 5491 -VPSHRDVWZ 5492 -VPSHRDVWZm 5493 -VPSHRDVWZmk 5494 -VPSHRDVWZmkz 5495 -VPSHRDVWZr 5496 -VPSHRDVWZrk 5497 -VPSHRDVWZrkz 5498 -VPSHRDWZ 5499 -VPSHRDWZrmi 5500 -VPSHRDWZrmik 5501 -VPSHRDWZrmikz 5502 -VPSHRDWZrri 5503 -VPSHRDWZrrik 5504 -VPSHRDWZrrikz 5505 -VPSHUFBITQMBZ 5506 -VPSHUFBITQMBZrm 5507 -VPSHUFBITQMBZrmk 5508 -VPSHUFBITQMBZrr 5509 -VPSHUFBITQMBZrrk 5510 -VPSHUFBYrm 5511 -VPSHUFBYrr 5512 -VPSHUFBZ 5513 -VPSHUFBZrm 5514 -VPSHUFBZrmk 5515 -VPSHUFBZrmkz 5516 -VPSHUFBZrr 5517 -VPSHUFBZrrk 5518 -VPSHUFBZrrkz 5519 -VPSHUFBrm 5520 -VPSHUFBrr 5521 -VPSHUFDYmi 5522 -VPSHUFDYri 5523 -VPSHUFDZ 5524 -VPSHUFDZmbi 5525 -VPSHUFDZmbik 5526 -VPSHUFDZmbikz 5527 -VPSHUFDZmi 5528 -VPSHUFDZmik 5529 -VPSHUFDZmikz 5530 -VPSHUFDZri 5531 -VPSHUFDZrik 5532 -VPSHUFDZrikz 5533 -VPSHUFDmi 5534 -VPSHUFDri 5535 -VPSHUFHWYmi 5536 -VPSHUFHWYri 5537 -VPSHUFHWZ 5538 -VPSHUFHWZmi 5539 -VPSHUFHWZmik 5540 -VPSHUFHWZmikz 5541 -VPSHUFHWZri 5542 -VPSHUFHWZrik 5543 -VPSHUFHWZrikz 5544 -VPSHUFHWmi 5545 -VPSHUFHWri 5546 -VPSHUFLWYmi 5547 -VPSHUFLWYri 5548 -VPSHUFLWZ 5549 -VPSHUFLWZmi 5550 -VPSHUFLWZmik 5551 -VPSHUFLWZmikz 5552 -VPSHUFLWZri 5553 -VPSHUFLWZrik 5554 -VPSHUFLWZrikz 5555 -VPSHUFLWmi 5556 -VPSHUFLWri 5557 -VPSIGNBYrm 5558 -VPSIGNBYrr 5559 -VPSIGNBrm 5560 -VPSIGNBrr 5561 -VPSIGNDYrm 5562 
-VPSIGNDYrr 5563 -VPSIGNDrm 5564 -VPSIGNDrr 5565 -VPSIGNWYrm 5566 -VPSIGNWYrr 5567 -VPSIGNWrm 5568 -VPSIGNWrr 5569 -VPSLLDQYri 5570 -VPSLLDQZ 5571 -VPSLLDQZmi 5572 -VPSLLDQZri 5573 -VPSLLDQri 5574 -VPSLLDYri 5575 -VPSLLDYrm 5576 -VPSLLDYrr 5577 -VPSLLDZ 5578 -VPSLLDZmbi 5579 -VPSLLDZmbik 5580 -VPSLLDZmbikz 5581 -VPSLLDZmi 5582 -VPSLLDZmik 5583 -VPSLLDZmikz 5584 -VPSLLDZri 5585 -VPSLLDZrik 5586 -VPSLLDZrikz 5587 -VPSLLDZrm 5588 -VPSLLDZrmk 5589 -VPSLLDZrmkz 5590 -VPSLLDZrr 5591 -VPSLLDZrrk 5592 -VPSLLDZrrkz 5593 -VPSLLDri 5594 -VPSLLDrm 5595 -VPSLLDrr 5596 -VPSLLQYri 5597 -VPSLLQYrm 5598 -VPSLLQYrr 5599 -VPSLLQZ 5600 -VPSLLQZmbi 5601 -VPSLLQZmbik 5602 -VPSLLQZmbikz 5603 -VPSLLQZmi 5604 -VPSLLQZmik 5605 -VPSLLQZmikz 5606 -VPSLLQZri 5607 -VPSLLQZrik 5608 -VPSLLQZrikz 5609 -VPSLLQZrm 5610 -VPSLLQZrmk 5611 -VPSLLQZrmkz 5612 -VPSLLQZrr 5613 -VPSLLQZrrk 5614 -VPSLLQZrrkz 5615 -VPSLLQri 5616 -VPSLLQrm 5617 -VPSLLQrr 5618 -VPSLLVDYrm 5619 -VPSLLVDYrr 5620 -VPSLLVDZ 5621 -VPSLLVDZrm 5622 -VPSLLVDZrmb 5623 -VPSLLVDZrmbk 5624 -VPSLLVDZrmbkz 5625 -VPSLLVDZrmk 5626 -VPSLLVDZrmkz 5627 -VPSLLVDZrr 5628 -VPSLLVDZrrk 5629 -VPSLLVDZrrkz 5630 -VPSLLVDrm 5631 -VPSLLVDrr 5632 -VPSLLVQYrm 5633 -VPSLLVQYrr 5634 -VPSLLVQZ 5635 -VPSLLVQZrm 5636 -VPSLLVQZrmb 5637 -VPSLLVQZrmbk 5638 -VPSLLVQZrmbkz 5639 -VPSLLVQZrmk 5640 -VPSLLVQZrmkz 5641 -VPSLLVQZrr 5642 -VPSLLVQZrrk 5643 -VPSLLVQZrrkz 5644 -VPSLLVQrm 5645 -VPSLLVQrr 5646 -VPSLLVWZ 5647 -VPSLLVWZrm 5648 -VPSLLVWZrmk 5649 -VPSLLVWZrmkz 5650 -VPSLLVWZrr 5651 -VPSLLVWZrrk 5652 -VPSLLVWZrrkz 5653 -VPSLLWYri 5654 -VPSLLWYrm 5655 -VPSLLWYrr 5656 -VPSLLWZ 5657 -VPSLLWZmi 5658 -VPSLLWZmik 5659 -VPSLLWZmikz 5660 -VPSLLWZri 5661 -VPSLLWZrik 5662 -VPSLLWZrikz 5663 -VPSLLWZrm 5664 -VPSLLWZrmk 5665 -VPSLLWZrmkz 5666 -VPSLLWZrr 5667 -VPSLLWZrrk 5668 -VPSLLWZrrkz 5669 -VPSLLWri 5670 -VPSLLWrm 5671 -VPSLLWrr 5672 -VPSRADYri 5673 -VPSRADYrm 5674 -VPSRADYrr 5675 -VPSRADZ 5676 -VPSRADZmbi 5677 -VPSRADZmbik 5678 -VPSRADZmbikz 5679 -VPSRADZmi 5680 -VPSRADZmik 5681 -VPSRADZmikz 5682 -VPSRADZri 5683 -VPSRADZrik 5684 -VPSRADZrikz 5685 -VPSRADZrm 5686 -VPSRADZrmk 5687 -VPSRADZrmkz 5688 -VPSRADZrr 5689 -VPSRADZrrk 5690 -VPSRADZrrkz 5691 -VPSRADri 5692 -VPSRADrm 5693 -VPSRADrr 5694 -VPSRAQZ 5695 -VPSRAQZmbi 5696 -VPSRAQZmbik 5697 -VPSRAQZmbikz 5698 -VPSRAQZmi 5699 -VPSRAQZmik 5700 -VPSRAQZmikz 5701 -VPSRAQZri 5702 -VPSRAQZrik 5703 -VPSRAQZrikz 5704 -VPSRAQZrm 5705 -VPSRAQZrmk 5706 -VPSRAQZrmkz 5707 -VPSRAQZrr 5708 -VPSRAQZrrk 5709 -VPSRAQZrrkz 5710 -VPSRAVDYrm 5711 -VPSRAVDYrr 5712 -VPSRAVDZ 5713 -VPSRAVDZrm 5714 -VPSRAVDZrmb 5715 -VPSRAVDZrmbk 5716 -VPSRAVDZrmbkz 5717 -VPSRAVDZrmk 5718 -VPSRAVDZrmkz 5719 -VPSRAVDZrr 5720 -VPSRAVDZrrk 5721 -VPSRAVDZrrkz 5722 -VPSRAVDrm 5723 -VPSRAVDrr 5724 -VPSRAVQZ 5725 -VPSRAVQZrm 5726 -VPSRAVQZrmb 5727 -VPSRAVQZrmbk 5728 -VPSRAVQZrmbkz 5729 -VPSRAVQZrmk 5730 -VPSRAVQZrmkz 5731 -VPSRAVQZrr 5732 -VPSRAVQZrrk 5733 -VPSRAVQZrrkz 5734 -VPSRAVWZ 5735 -VPSRAVWZrm 5736 -VPSRAVWZrmk 5737 -VPSRAVWZrmkz 5738 -VPSRAVWZrr 5739 -VPSRAVWZrrk 5740 -VPSRAVWZrrkz 5741 -VPSRAWYri 5742 -VPSRAWYrm 5743 -VPSRAWYrr 5744 -VPSRAWZ 5745 -VPSRAWZmi 5746 -VPSRAWZmik 5747 -VPSRAWZmikz 5748 -VPSRAWZri 5749 -VPSRAWZrik 5750 -VPSRAWZrikz 5751 -VPSRAWZrm 5752 -VPSRAWZrmk 5753 -VPSRAWZrmkz 5754 -VPSRAWZrr 5755 -VPSRAWZrrk 5756 -VPSRAWZrrkz 5757 -VPSRAWri 5758 -VPSRAWrm 5759 -VPSRAWrr 5760 -VPSRLDQYri 5761 -VPSRLDQZ 5762 -VPSRLDQZmi 5763 -VPSRLDQZri 5764 -VPSRLDQri 5765 -VPSRLDYri 5766 -VPSRLDYrm 5767 -VPSRLDYrr 5768 -VPSRLDZ 5769 -VPSRLDZmbi 5770 -VPSRLDZmbik 5771 -VPSRLDZmbikz 5772 
-VPSRLDZmi 5773 -VPSRLDZmik 5774 -VPSRLDZmikz 5775 -VPSRLDZri 5776 -VPSRLDZrik 5777 -VPSRLDZrikz 5778 -VPSRLDZrm 5779 -VPSRLDZrmk 5780 -VPSRLDZrmkz 5781 -VPSRLDZrr 5782 -VPSRLDZrrk 5783 -VPSRLDZrrkz 5784 -VPSRLDri 5785 -VPSRLDrm 5786 -VPSRLDrr 5787 -VPSRLQYri 5788 -VPSRLQYrm 5789 -VPSRLQYrr 5790 -VPSRLQZ 5791 -VPSRLQZmbi 5792 -VPSRLQZmbik 5793 -VPSRLQZmbikz 5794 -VPSRLQZmi 5795 -VPSRLQZmik 5796 -VPSRLQZmikz 5797 -VPSRLQZri 5798 -VPSRLQZrik 5799 -VPSRLQZrikz 5800 -VPSRLQZrm 5801 -VPSRLQZrmk 5802 -VPSRLQZrmkz 5803 -VPSRLQZrr 5804 -VPSRLQZrrk 5805 -VPSRLQZrrkz 5806 -VPSRLQri 5807 -VPSRLQrm 5808 -VPSRLQrr 5809 -VPSRLVDYrm 5810 -VPSRLVDYrr 5811 -VPSRLVDZ 5812 -VPSRLVDZrm 5813 -VPSRLVDZrmb 5814 -VPSRLVDZrmbk 5815 -VPSRLVDZrmbkz 5816 -VPSRLVDZrmk 5817 -VPSRLVDZrmkz 5818 -VPSRLVDZrr 5819 -VPSRLVDZrrk 5820 -VPSRLVDZrrkz 5821 -VPSRLVDrm 5822 -VPSRLVDrr 5823 -VPSRLVQYrm 5824 -VPSRLVQYrr 5825 -VPSRLVQZ 5826 -VPSRLVQZrm 5827 -VPSRLVQZrmb 5828 -VPSRLVQZrmbk 5829 -VPSRLVQZrmbkz 5830 -VPSRLVQZrmk 5831 -VPSRLVQZrmkz 5832 -VPSRLVQZrr 5833 -VPSRLVQZrrk 5834 -VPSRLVQZrrkz 5835 -VPSRLVQrm 5836 -VPSRLVQrr 5837 -VPSRLVWZ 5838 -VPSRLVWZrm 5839 -VPSRLVWZrmk 5840 -VPSRLVWZrmkz 5841 -VPSRLVWZrr 5842 -VPSRLVWZrrk 5843 -VPSRLVWZrrkz 5844 -VPSRLWYri 5845 -VPSRLWYrm 5846 -VPSRLWYrr 5847 -VPSRLWZ 5848 -VPSRLWZmi 5849 -VPSRLWZmik 5850 -VPSRLWZmikz 5851 -VPSRLWZri 5852 -VPSRLWZrik 5853 -VPSRLWZrikz 5854 -VPSRLWZrm 5855 -VPSRLWZrmk 5856 -VPSRLWZrmkz 5857 -VPSRLWZrr 5858 -VPSRLWZrrk 5859 -VPSRLWZrrkz 5860 -VPSRLWri 5861 -VPSRLWrm 5862 -VPSRLWrr 5863 -VPSUBBYrm 5864 -VPSUBBYrr 5865 -VPSUBBZ 5866 -VPSUBBZrm 5867 -VPSUBBZrmk 5868 -VPSUBBZrmkz 5869 -VPSUBBZrr 5870 -VPSUBBZrrk 5871 -VPSUBBZrrkz 5872 -VPSUBBrm 5873 -VPSUBBrr 5874 -VPSUBDYrm 5875 -VPSUBDYrr 5876 -VPSUBDZ 5877 -VPSUBDZrm 5878 -VPSUBDZrmb 5879 -VPSUBDZrmbk 5880 -VPSUBDZrmbkz 5881 -VPSUBDZrmk 5882 -VPSUBDZrmkz 5883 -VPSUBDZrr 5884 -VPSUBDZrrk 5885 -VPSUBDZrrkz 5886 -VPSUBDrm 5887 -VPSUBDrr 5888 -VPSUBQYrm 5889 -VPSUBQYrr 5890 -VPSUBQZ 5891 -VPSUBQZrm 5892 -VPSUBQZrmb 5893 -VPSUBQZrmbk 5894 -VPSUBQZrmbkz 5895 -VPSUBQZrmk 5896 -VPSUBQZrmkz 5897 -VPSUBQZrr 5898 -VPSUBQZrrk 5899 -VPSUBQZrrkz 5900 -VPSUBQrm 5901 -VPSUBQrr 5902 -VPSUBSBYrm 5903 -VPSUBSBYrr 5904 -VPSUBSBZ 5905 -VPSUBSBZrm 5906 -VPSUBSBZrmk 5907 -VPSUBSBZrmkz 5908 -VPSUBSBZrr 5909 -VPSUBSBZrrk 5910 -VPSUBSBZrrkz 5911 -VPSUBSBrm 5912 -VPSUBSBrr 5913 -VPSUBSWYrm 5914 -VPSUBSWYrr 5915 -VPSUBSWZ 5916 -VPSUBSWZrm 5917 -VPSUBSWZrmk 5918 -VPSUBSWZrmkz 5919 -VPSUBSWZrr 5920 -VPSUBSWZrrk 5921 -VPSUBSWZrrkz 5922 -VPSUBSWrm 5923 -VPSUBSWrr 5924 -VPSUBUSBYrm 5925 -VPSUBUSBYrr 5926 -VPSUBUSBZ 5927 -VPSUBUSBZrm 5928 -VPSUBUSBZrmk 5929 -VPSUBUSBZrmkz 5930 -VPSUBUSBZrr 5931 -VPSUBUSBZrrk 5932 -VPSUBUSBZrrkz 5933 -VPSUBUSBrm 5934 -VPSUBUSBrr 5935 -VPSUBUSWYrm 5936 -VPSUBUSWYrr 5937 -VPSUBUSWZ 5938 -VPSUBUSWZrm 5939 -VPSUBUSWZrmk 5940 -VPSUBUSWZrmkz 5941 -VPSUBUSWZrr 5942 -VPSUBUSWZrrk 5943 -VPSUBUSWZrrkz 5944 -VPSUBUSWrm 5945 -VPSUBUSWrr 5946 -VPSUBWYrm 5947 -VPSUBWYrr 5948 -VPSUBWZ 5949 -VPSUBWZrm 5950 -VPSUBWZrmk 5951 -VPSUBWZrmkz 5952 -VPSUBWZrr 5953 -VPSUBWZrrk 5954 -VPSUBWZrrkz 5955 -VPSUBWrm 5956 -VPSUBWrr 5957 -VPTERNLOGDZ 5958 -VPTERNLOGDZrmbi 5959 -VPTERNLOGDZrmbik 5960 -VPTERNLOGDZrmbikz 5961 -VPTERNLOGDZrmi 5962 -VPTERNLOGDZrmik 5963 -VPTERNLOGDZrmikz 5964 -VPTERNLOGDZrri 5965 -VPTERNLOGDZrrik 5966 -VPTERNLOGDZrrikz 5967 -VPTERNLOGQZ 5968 -VPTERNLOGQZrmbi 5969 -VPTERNLOGQZrmbik 5970 -VPTERNLOGQZrmbikz 5971 -VPTERNLOGQZrmi 5972 -VPTERNLOGQZrmik 5973 -VPTERNLOGQZrmikz 5974 -VPTERNLOGQZrri 5975 -VPTERNLOGQZrrik 5976 
-VPTERNLOGQZrrikz 5977 -VPTESTMBZ 5978 -VPTESTMBZrm 5979 -VPTESTMBZrmk 5980 -VPTESTMBZrr 5981 -VPTESTMBZrrk 5982 -VPTESTMDZ 5983 -VPTESTMDZrm 5984 -VPTESTMDZrmb 5985 -VPTESTMDZrmbk 5986 -VPTESTMDZrmk 5987 -VPTESTMDZrr 5988 -VPTESTMDZrrk 5989 -VPTESTMQZ 5990 -VPTESTMQZrm 5991 -VPTESTMQZrmb 5992 -VPTESTMQZrmbk 5993 -VPTESTMQZrmk 5994 -VPTESTMQZrr 5995 -VPTESTMQZrrk 5996 -VPTESTMWZ 5997 -VPTESTMWZrm 5998 -VPTESTMWZrmk 5999 -VPTESTMWZrr 6000 -VPTESTMWZrrk 6001 -VPTESTNMBZ 6002 -VPTESTNMBZrm 6003 -VPTESTNMBZrmk 6004 -VPTESTNMBZrr 6005 -VPTESTNMBZrrk 6006 -VPTESTNMDZ 6007 -VPTESTNMDZrm 6008 -VPTESTNMDZrmb 6009 -VPTESTNMDZrmbk 6010 -VPTESTNMDZrmk 6011 -VPTESTNMDZrr 6012 -VPTESTNMDZrrk 6013 -VPTESTNMQZ 6014 -VPTESTNMQZrm 6015 -VPTESTNMQZrmb 6016 -VPTESTNMQZrmbk 6017 -VPTESTNMQZrmk 6018 -VPTESTNMQZrr 6019 -VPTESTNMQZrrk 6020 -VPTESTNMWZ 6021 -VPTESTNMWZrm 6022 -VPTESTNMWZrmk 6023 -VPTESTNMWZrr 6024 -VPTESTNMWZrrk 6025 -VPTESTYrm 6026 -VPTESTYrr 6027 -VPTESTrm 6028 -VPTESTrr 6029 -VPUNPCKHBWYrm 6030 -VPUNPCKHBWYrr 6031 -VPUNPCKHBWZ 6032 -VPUNPCKHBWZrm 6033 -VPUNPCKHBWZrmk 6034 -VPUNPCKHBWZrmkz 6035 -VPUNPCKHBWZrr 6036 -VPUNPCKHBWZrrk 6037 -VPUNPCKHBWZrrkz 6038 -VPUNPCKHBWrm 6039 -VPUNPCKHBWrr 6040 -VPUNPCKHDQYrm 6041 -VPUNPCKHDQYrr 6042 -VPUNPCKHDQZ 6043 -VPUNPCKHDQZrm 6044 -VPUNPCKHDQZrmb 6045 -VPUNPCKHDQZrmbk 6046 -VPUNPCKHDQZrmbkz 6047 -VPUNPCKHDQZrmk 6048 -VPUNPCKHDQZrmkz 6049 -VPUNPCKHDQZrr 6050 -VPUNPCKHDQZrrk 6051 -VPUNPCKHDQZrrkz 6052 -VPUNPCKHDQrm 6053 -VPUNPCKHDQrr 6054 -VPUNPCKHQDQYrm 6055 -VPUNPCKHQDQYrr 6056 -VPUNPCKHQDQZ 6057 -VPUNPCKHQDQZrm 6058 -VPUNPCKHQDQZrmb 6059 -VPUNPCKHQDQZrmbk 6060 -VPUNPCKHQDQZrmbkz 6061 -VPUNPCKHQDQZrmk 6062 -VPUNPCKHQDQZrmkz 6063 -VPUNPCKHQDQZrr 6064 -VPUNPCKHQDQZrrk 6065 -VPUNPCKHQDQZrrkz 6066 -VPUNPCKHQDQrm 6067 -VPUNPCKHQDQrr 6068 -VPUNPCKHWDYrm 6069 -VPUNPCKHWDYrr 6070 -VPUNPCKHWDZ 6071 -VPUNPCKHWDZrm 6072 -VPUNPCKHWDZrmk 6073 -VPUNPCKHWDZrmkz 6074 -VPUNPCKHWDZrr 6075 -VPUNPCKHWDZrrk 6076 -VPUNPCKHWDZrrkz 6077 -VPUNPCKHWDrm 6078 -VPUNPCKHWDrr 6079 -VPUNPCKLBWYrm 6080 -VPUNPCKLBWYrr 6081 -VPUNPCKLBWZ 6082 -VPUNPCKLBWZrm 6083 -VPUNPCKLBWZrmk 6084 -VPUNPCKLBWZrmkz 6085 -VPUNPCKLBWZrr 6086 -VPUNPCKLBWZrrk 6087 -VPUNPCKLBWZrrkz 6088 -VPUNPCKLBWrm 6089 -VPUNPCKLBWrr 6090 -VPUNPCKLDQYrm 6091 -VPUNPCKLDQYrr 6092 -VPUNPCKLDQZ 6093 -VPUNPCKLDQZrm 6094 -VPUNPCKLDQZrmb 6095 -VPUNPCKLDQZrmbk 6096 -VPUNPCKLDQZrmbkz 6097 -VPUNPCKLDQZrmk 6098 -VPUNPCKLDQZrmkz 6099 -VPUNPCKLDQZrr 6100 -VPUNPCKLDQZrrk 6101 -VPUNPCKLDQZrrkz 6102 -VPUNPCKLDQrm 6103 -VPUNPCKLDQrr 6104 -VPUNPCKLQDQYrm 6105 -VPUNPCKLQDQYrr 6106 -VPUNPCKLQDQZ 6107 -VPUNPCKLQDQZrm 6108 -VPUNPCKLQDQZrmb 6109 -VPUNPCKLQDQZrmbk 6110 -VPUNPCKLQDQZrmbkz 6111 -VPUNPCKLQDQZrmk 6112 -VPUNPCKLQDQZrmkz 6113 -VPUNPCKLQDQZrr 6114 -VPUNPCKLQDQZrrk 6115 -VPUNPCKLQDQZrrkz 6116 -VPUNPCKLQDQrm 6117 -VPUNPCKLQDQrr 6118 -VPUNPCKLWDYrm 6119 -VPUNPCKLWDYrr 6120 -VPUNPCKLWDZ 6121 -VPUNPCKLWDZrm 6122 -VPUNPCKLWDZrmk 6123 -VPUNPCKLWDZrmkz 6124 -VPUNPCKLWDZrr 6125 -VPUNPCKLWDZrrk 6126 -VPUNPCKLWDZrrkz 6127 -VPUNPCKLWDrm 6128 -VPUNPCKLWDrr 6129 -VPXORDZ 6130 -VPXORDZrm 6131 -VPXORDZrmb 6132 -VPXORDZrmbk 6133 -VPXORDZrmbkz 6134 -VPXORDZrmk 6135 -VPXORDZrmkz 6136 -VPXORDZrr 6137 -VPXORDZrrk 6138 -VPXORDZrrkz 6139 -VPXORQZ 6140 -VPXORQZrm 6141 -VPXORQZrmb 6142 -VPXORQZrmbk 6143 -VPXORQZrmbkz 6144 -VPXORQZrmk 6145 -VPXORQZrmkz 6146 -VPXORQZrr 6147 -VPXORQZrrk 6148 -VPXORQZrrkz 6149 -VPXORYrm 6150 -VPXORYrr 6151 -VPXORrm 6152 -VPXORrr 6153 -VRANGEPDZ 6154 -VRANGEPDZrmbi 6155 -VRANGEPDZrmbik 6156 -VRANGEPDZrmbikz 6157 -VRANGEPDZrmi 6158 
-VRANGEPDZrmik 6159 -VRANGEPDZrmikz 6160 -VRANGEPDZrri 6161 -VRANGEPDZrrib 6162 -VRANGEPDZrribk 6163 -VRANGEPDZrribkz 6164 -VRANGEPDZrrik 6165 -VRANGEPDZrrikz 6166 -VRANGEPSZ 6167 -VRANGEPSZrmbi 6168 -VRANGEPSZrmbik 6169 -VRANGEPSZrmbikz 6170 -VRANGEPSZrmi 6171 -VRANGEPSZrmik 6172 -VRANGEPSZrmikz 6173 -VRANGEPSZrri 6174 -VRANGEPSZrrib 6175 -VRANGEPSZrribk 6176 -VRANGEPSZrribkz 6177 -VRANGEPSZrrik 6178 -VRANGEPSZrrikz 6179 -VRANGESDZrmi 6180 -VRANGESDZrmik 6181 -VRANGESDZrmikz 6182 -VRANGESDZrri 6183 -VRANGESDZrrib 6184 -VRANGESDZrribk 6185 -VRANGESDZrribkz 6186 -VRANGESDZrrik 6187 -VRANGESDZrrikz 6188 -VRANGESSZrmi 6189 -VRANGESSZrmik 6190 -VRANGESSZrmikz 6191 -VRANGESSZrri 6192 -VRANGESSZrrib 6193 -VRANGESSZrribk 6194 -VRANGESSZrribkz 6195 -VRANGESSZrrik 6196 -VRANGESSZrrikz 6197 -VRCP 6198 -VRCPBF 6199 -VRCPPHZ 6200 -VRCPPHZm 6201 -VRCPPHZmb 6202 -VRCPPHZmbk 6203 -VRCPPHZmbkz 6204 -VRCPPHZmk 6205 -VRCPPHZmkz 6206 -VRCPPHZr 6207 -VRCPPHZrk 6208 -VRCPPHZrkz 6209 -VRCPPSYm 6210 -VRCPPSYr 6211 -VRCPPSm 6212 -VRCPPSr 6213 -VRCPSHZrm 6214 -VRCPSHZrmk 6215 -VRCPSHZrmkz 6216 -VRCPSHZrr 6217 -VRCPSHZrrk 6218 -VRCPSHZrrkz 6219 -VRCPSSm 6220 -VRCPSSm_Int 6221 -VRCPSSr 6222 -VRCPSSr_Int 6223 -VREDUCEBF 6224 -VREDUCEPDZ 6225 -VREDUCEPDZrmbi 6226 -VREDUCEPDZrmbik 6227 -VREDUCEPDZrmbikz 6228 -VREDUCEPDZrmi 6229 -VREDUCEPDZrmik 6230 -VREDUCEPDZrmikz 6231 -VREDUCEPDZrri 6232 -VREDUCEPDZrrib 6233 -VREDUCEPDZrribk 6234 -VREDUCEPDZrribkz 6235 -VREDUCEPDZrrik 6236 -VREDUCEPDZrrikz 6237 -VREDUCEPHZ 6238 -VREDUCEPHZrmbi 6239 -VREDUCEPHZrmbik 6240 -VREDUCEPHZrmbikz 6241 -VREDUCEPHZrmi 6242 -VREDUCEPHZrmik 6243 -VREDUCEPHZrmikz 6244 -VREDUCEPHZrri 6245 -VREDUCEPHZrrib 6246 -VREDUCEPHZrribk 6247 -VREDUCEPHZrribkz 6248 -VREDUCEPHZrrik 6249 -VREDUCEPHZrrikz 6250 -VREDUCEPSZ 6251 -VREDUCEPSZrmbi 6252 -VREDUCEPSZrmbik 6253 -VREDUCEPSZrmbikz 6254 -VREDUCEPSZrmi 6255 -VREDUCEPSZrmik 6256 -VREDUCEPSZrmikz 6257 -VREDUCEPSZrri 6258 -VREDUCEPSZrrib 6259 -VREDUCEPSZrribk 6260 -VREDUCEPSZrribkz 6261 -VREDUCEPSZrrik 6262 -VREDUCEPSZrrikz 6263 -VREDUCESDZrmi 6264 -VREDUCESDZrmik 6265 -VREDUCESDZrmikz 6266 -VREDUCESDZrri 6267 -VREDUCESDZrrib 6268 -VREDUCESDZrribk 6269 -VREDUCESDZrribkz 6270 -VREDUCESDZrrik 6271 -VREDUCESDZrrikz 6272 -VREDUCESHZrmi 6273 -VREDUCESHZrmik 6274 -VREDUCESHZrmikz 6275 -VREDUCESHZrri 6276 -VREDUCESHZrrib 6277 -VREDUCESHZrribk 6278 -VREDUCESHZrribkz 6279 -VREDUCESHZrrik 6280 -VREDUCESHZrrikz 6281 -VREDUCESSZrmi 6282 -VREDUCESSZrmik 6283 -VREDUCESSZrmikz 6284 -VREDUCESSZrri 6285 -VREDUCESSZrrib 6286 -VREDUCESSZrribk 6287 -VREDUCESSZrribkz 6288 -VREDUCESSZrrik 6289 -VREDUCESSZrrikz 6290 -VRNDSCALEBF 6291 -VRNDSCALEPDZ 6292 -VRNDSCALEPDZrmbi 6293 -VRNDSCALEPDZrmbik 6294 -VRNDSCALEPDZrmbikz 6295 -VRNDSCALEPDZrmi 6296 -VRNDSCALEPDZrmik 6297 -VRNDSCALEPDZrmikz 6298 -VRNDSCALEPDZrri 6299 -VRNDSCALEPDZrrib 6300 -VRNDSCALEPDZrribk 6301 -VRNDSCALEPDZrribkz 6302 -VRNDSCALEPDZrrik 6303 -VRNDSCALEPDZrrikz 6304 -VRNDSCALEPHZ 6305 -VRNDSCALEPHZrmbi 6306 -VRNDSCALEPHZrmbik 6307 -VRNDSCALEPHZrmbikz 6308 -VRNDSCALEPHZrmi 6309 -VRNDSCALEPHZrmik 6310 -VRNDSCALEPHZrmikz 6311 -VRNDSCALEPHZrri 6312 -VRNDSCALEPHZrrib 6313 -VRNDSCALEPHZrribk 6314 -VRNDSCALEPHZrribkz 6315 -VRNDSCALEPHZrrik 6316 -VRNDSCALEPHZrrikz 6317 -VRNDSCALEPSZ 6318 -VRNDSCALEPSZrmbi 6319 -VRNDSCALEPSZrmbik 6320 -VRNDSCALEPSZrmbikz 6321 -VRNDSCALEPSZrmi 6322 -VRNDSCALEPSZrmik 6323 -VRNDSCALEPSZrmikz 6324 -VRNDSCALEPSZrri 6325 -VRNDSCALEPSZrrib 6326 -VRNDSCALEPSZrribk 6327 -VRNDSCALEPSZrribkz 6328 -VRNDSCALEPSZrrik 6329 -VRNDSCALEPSZrrikz 6330 
-VRNDSCALESDZrmi 6331 -VRNDSCALESDZrmi_Int 6332 -VRNDSCALESDZrmik_Int 6333 -VRNDSCALESDZrmikz_Int 6334 -VRNDSCALESDZrri 6335 -VRNDSCALESDZrri_Int 6336 -VRNDSCALESDZrrib_Int 6337 -VRNDSCALESDZrribk_Int 6338 -VRNDSCALESDZrribkz_Int 6339 -VRNDSCALESDZrrik_Int 6340 -VRNDSCALESDZrrikz_Int 6341 -VRNDSCALESHZrmi 6342 -VRNDSCALESHZrmi_Int 6343 -VRNDSCALESHZrmik_Int 6344 -VRNDSCALESHZrmikz_Int 6345 -VRNDSCALESHZrri 6346 -VRNDSCALESHZrri_Int 6347 -VRNDSCALESHZrrib_Int 6348 -VRNDSCALESHZrribk_Int 6349 -VRNDSCALESHZrribkz_Int 6350 -VRNDSCALESHZrrik_Int 6351 -VRNDSCALESHZrrikz_Int 6352 -VRNDSCALESSZrmi 6353 -VRNDSCALESSZrmi_Int 6354 -VRNDSCALESSZrmik_Int 6355 -VRNDSCALESSZrmikz_Int 6356 -VRNDSCALESSZrri 6357 -VRNDSCALESSZrri_Int 6358 -VRNDSCALESSZrrib_Int 6359 -VRNDSCALESSZrribk_Int 6360 -VRNDSCALESSZrribkz_Int 6361 -VRNDSCALESSZrrik_Int 6362 -VRNDSCALESSZrrikz_Int 6363 -VROUNDPDYmi 6364 -VROUNDPDYri 6365 -VROUNDPDmi 6366 -VROUNDPDri 6367 -VROUNDPSYmi 6368 -VROUNDPSYri 6369 -VROUNDPSmi 6370 -VROUNDPSri 6371 -VROUNDSDmi 6372 -VROUNDSDmi_Int 6373 -VROUNDSDri 6374 -VROUNDSDri_Int 6375 -VROUNDSSmi 6376 -VROUNDSSmi_Int 6377 -VROUNDSSri 6378 -VROUNDSSri_Int 6379 -VRSQRT 6380 -VRSQRTBF 6381 -VRSQRTPHZ 6382 -VRSQRTPHZm 6383 -VRSQRTPHZmb 6384 -VRSQRTPHZmbk 6385 -VRSQRTPHZmbkz 6386 -VRSQRTPHZmk 6387 -VRSQRTPHZmkz 6388 -VRSQRTPHZr 6389 -VRSQRTPHZrk 6390 -VRSQRTPHZrkz 6391 -VRSQRTPSYm 6392 -VRSQRTPSYr 6393 -VRSQRTPSm 6394 -VRSQRTPSr 6395 -VRSQRTSHZrm 6396 -VRSQRTSHZrmk 6397 -VRSQRTSHZrmkz 6398 -VRSQRTSHZrr 6399 -VRSQRTSHZrrk 6400 -VRSQRTSHZrrkz 6401 -VRSQRTSSm 6402 -VRSQRTSSm_Int 6403 -VRSQRTSSr 6404 -VRSQRTSSr_Int 6405 -VSCALEFBF 6406 -VSCALEFPDZ 6407 -VSCALEFPDZrm 6408 -VSCALEFPDZrmb 6409 -VSCALEFPDZrmbk 6410 -VSCALEFPDZrmbkz 6411 -VSCALEFPDZrmk 6412 -VSCALEFPDZrmkz 6413 -VSCALEFPDZrr 6414 -VSCALEFPDZrrb 6415 -VSCALEFPDZrrbk 6416 -VSCALEFPDZrrbkz 6417 -VSCALEFPDZrrk 6418 -VSCALEFPDZrrkz 6419 -VSCALEFPHZ 6420 -VSCALEFPHZrm 6421 -VSCALEFPHZrmb 6422 -VSCALEFPHZrmbk 6423 -VSCALEFPHZrmbkz 6424 -VSCALEFPHZrmk 6425 -VSCALEFPHZrmkz 6426 -VSCALEFPHZrr 6427 -VSCALEFPHZrrb 6428 -VSCALEFPHZrrbk 6429 -VSCALEFPHZrrbkz 6430 -VSCALEFPHZrrk 6431 -VSCALEFPHZrrkz 6432 -VSCALEFPSZ 6433 -VSCALEFPSZrm 6434 -VSCALEFPSZrmb 6435 -VSCALEFPSZrmbk 6436 -VSCALEFPSZrmbkz 6437 -VSCALEFPSZrmk 6438 -VSCALEFPSZrmkz 6439 -VSCALEFPSZrr 6440 -VSCALEFPSZrrb 6441 -VSCALEFPSZrrbk 6442 -VSCALEFPSZrrbkz 6443 -VSCALEFPSZrrk 6444 -VSCALEFPSZrrkz 6445 -VSCALEFSDZrm 6446 -VSCALEFSDZrmk 6447 -VSCALEFSDZrmkz 6448 -VSCALEFSDZrr 6449 -VSCALEFSDZrrb_Int 6450 -VSCALEFSDZrrbk_Int 6451 -VSCALEFSDZrrbkz_Int 6452 -VSCALEFSDZrrk 6453 -VSCALEFSDZrrkz 6454 -VSCALEFSHZrm 6455 -VSCALEFSHZrmk 6456 -VSCALEFSHZrmkz 6457 -VSCALEFSHZrr 6458 -VSCALEFSHZrrb_Int 6459 -VSCALEFSHZrrbk_Int 6460 -VSCALEFSHZrrbkz_Int 6461 -VSCALEFSHZrrk 6462 -VSCALEFSHZrrkz 6463 -VSCALEFSSZrm 6464 -VSCALEFSSZrmk 6465 -VSCALEFSSZrmkz 6466 -VSCALEFSSZrr 6467 -VSCALEFSSZrrb_Int 6468 -VSCALEFSSZrrbk_Int 6469 -VSCALEFSSZrrbkz_Int 6470 -VSCALEFSSZrrk 6471 -VSCALEFSSZrrkz 6472 -VSCATTERDPDZ 6473 -VSCATTERDPDZmr 6474 -VSCATTERDPSZ 6475 -VSCATTERDPSZmr 6476 -VSCATTERPF 6477 -VSCATTERQPDZ 6478 -VSCATTERQPDZmr 6479 -VSCATTERQPSZ 6480 -VSCATTERQPSZmr 6481 -VSHA 6482 -VSHUFF 6483 -VSHUFI 6484 -VSHUFPDYrmi 6485 -VSHUFPDYrri 6486 -VSHUFPDZ 6487 -VSHUFPDZrmbi 6488 -VSHUFPDZrmbik 6489 -VSHUFPDZrmbikz 6490 -VSHUFPDZrmi 6491 -VSHUFPDZrmik 6492 -VSHUFPDZrmikz 6493 -VSHUFPDZrri 6494 -VSHUFPDZrrik 6495 -VSHUFPDZrrikz 6496 -VSHUFPDrmi 6497 -VSHUFPDrri 6498 -VSHUFPSYrmi 6499 -VSHUFPSYrri 6500 -VSHUFPSZ 6501 -VSHUFPSZrmbi 
6502 -VSHUFPSZrmbik 6503 -VSHUFPSZrmbikz 6504 -VSHUFPSZrmi 6505 -VSHUFPSZrmik 6506 -VSHUFPSZrmikz 6507 -VSHUFPSZrri 6508 -VSHUFPSZrrik 6509 -VSHUFPSZrrikz 6510 -VSHUFPSrmi 6511 -VSHUFPSrri 6512 -VSM 6513 -VSQRTBF 6514 -VSQRTPDYm 6515 -VSQRTPDYr 6516 -VSQRTPDZ 6517 -VSQRTPDZm 6518 -VSQRTPDZmb 6519 -VSQRTPDZmbk 6520 -VSQRTPDZmbkz 6521 -VSQRTPDZmk 6522 -VSQRTPDZmkz 6523 -VSQRTPDZr 6524 -VSQRTPDZrb 6525 -VSQRTPDZrbk 6526 -VSQRTPDZrbkz 6527 -VSQRTPDZrk 6528 -VSQRTPDZrkz 6529 -VSQRTPDm 6530 -VSQRTPDr 6531 -VSQRTPHZ 6532 -VSQRTPHZm 6533 -VSQRTPHZmb 6534 -VSQRTPHZmbk 6535 -VSQRTPHZmbkz 6536 -VSQRTPHZmk 6537 -VSQRTPHZmkz 6538 -VSQRTPHZr 6539 -VSQRTPHZrb 6540 -VSQRTPHZrbk 6541 -VSQRTPHZrbkz 6542 -VSQRTPHZrk 6543 -VSQRTPHZrkz 6544 -VSQRTPSYm 6545 -VSQRTPSYr 6546 -VSQRTPSZ 6547 -VSQRTPSZm 6548 -VSQRTPSZmb 6549 -VSQRTPSZmbk 6550 -VSQRTPSZmbkz 6551 -VSQRTPSZmk 6552 -VSQRTPSZmkz 6553 -VSQRTPSZr 6554 -VSQRTPSZrb 6555 -VSQRTPSZrbk 6556 -VSQRTPSZrbkz 6557 -VSQRTPSZrk 6558 -VSQRTPSZrkz 6559 -VSQRTPSm 6560 -VSQRTPSr 6561 -VSQRTSDZm 6562 -VSQRTSDZm_Int 6563 -VSQRTSDZmk_Int 6564 -VSQRTSDZmkz_Int 6565 -VSQRTSDZr 6566 -VSQRTSDZr_Int 6567 -VSQRTSDZrb_Int 6568 -VSQRTSDZrbk_Int 6569 -VSQRTSDZrbkz_Int 6570 -VSQRTSDZrk_Int 6571 -VSQRTSDZrkz_Int 6572 -VSQRTSDm 6573 -VSQRTSDm_Int 6574 -VSQRTSDr 6575 -VSQRTSDr_Int 6576 -VSQRTSHZm 6577 -VSQRTSHZm_Int 6578 -VSQRTSHZmk_Int 6579 -VSQRTSHZmkz_Int 6580 -VSQRTSHZr 6581 -VSQRTSHZr_Int 6582 -VSQRTSHZrb_Int 6583 -VSQRTSHZrbk_Int 6584 -VSQRTSHZrbkz_Int 6585 -VSQRTSHZrk_Int 6586 -VSQRTSHZrkz_Int 6587 -VSQRTSSZm 6588 -VSQRTSSZm_Int 6589 -VSQRTSSZmk_Int 6590 -VSQRTSSZmkz_Int 6591 -VSQRTSSZr 6592 -VSQRTSSZr_Int 6593 -VSQRTSSZrb_Int 6594 -VSQRTSSZrbk_Int 6595 -VSQRTSSZrbkz_Int 6596 -VSQRTSSZrk_Int 6597 -VSQRTSSZrkz_Int 6598 -VSQRTSSm 6599 -VSQRTSSm_Int 6600 -VSQRTSSr 6601 -VSQRTSSr_Int 6602 -VSTMXCSR 6603 -VSUBBF 6604 -VSUBPDYrm 6605 -VSUBPDYrr 6606 -VSUBPDZ 6607 -VSUBPDZrm 6608 -VSUBPDZrmb 6609 -VSUBPDZrmbk 6610 -VSUBPDZrmbkz 6611 -VSUBPDZrmk 6612 -VSUBPDZrmkz 6613 -VSUBPDZrr 6614 -VSUBPDZrrb 6615 -VSUBPDZrrbk 6616 -VSUBPDZrrbkz 6617 -VSUBPDZrrk 6618 -VSUBPDZrrkz 6619 -VSUBPDrm 6620 -VSUBPDrr 6621 -VSUBPHZ 6622 -VSUBPHZrm 6623 -VSUBPHZrmb 6624 -VSUBPHZrmbk 6625 -VSUBPHZrmbkz 6626 -VSUBPHZrmk 6627 -VSUBPHZrmkz 6628 -VSUBPHZrr 6629 -VSUBPHZrrb 6630 -VSUBPHZrrbk 6631 -VSUBPHZrrbkz 6632 -VSUBPHZrrk 6633 -VSUBPHZrrkz 6634 -VSUBPSYrm 6635 -VSUBPSYrr 6636 -VSUBPSZ 6637 -VSUBPSZrm 6638 -VSUBPSZrmb 6639 -VSUBPSZrmbk 6640 -VSUBPSZrmbkz 6641 -VSUBPSZrmk 6642 -VSUBPSZrmkz 6643 -VSUBPSZrr 6644 -VSUBPSZrrb 6645 -VSUBPSZrrbk 6646 -VSUBPSZrrbkz 6647 -VSUBPSZrrk 6648 -VSUBPSZrrkz 6649 -VSUBPSrm 6650 -VSUBPSrr 6651 -VSUBSDZrm 6652 -VSUBSDZrm_Int 6653 -VSUBSDZrmk_Int 6654 -VSUBSDZrmkz_Int 6655 -VSUBSDZrr 6656 -VSUBSDZrr_Int 6657 -VSUBSDZrrb_Int 6658 -VSUBSDZrrbk_Int 6659 -VSUBSDZrrbkz_Int 6660 -VSUBSDZrrk_Int 6661 -VSUBSDZrrkz_Int 6662 -VSUBSDrm 6663 -VSUBSDrm_Int 6664 -VSUBSDrr 6665 -VSUBSDrr_Int 6666 -VSUBSHZrm 6667 -VSUBSHZrm_Int 6668 -VSUBSHZrmk_Int 6669 -VSUBSHZrmkz_Int 6670 -VSUBSHZrr 6671 -VSUBSHZrr_Int 6672 -VSUBSHZrrb_Int 6673 -VSUBSHZrrbk_Int 6674 -VSUBSHZrrbkz_Int 6675 -VSUBSHZrrk_Int 6676 -VSUBSHZrrkz_Int 6677 -VSUBSSZrm 6678 -VSUBSSZrm_Int 6679 -VSUBSSZrmk_Int 6680 -VSUBSSZrmkz_Int 6681 -VSUBSSZrr 6682 -VSUBSSZrr_Int 6683 -VSUBSSZrrb_Int 6684 -VSUBSSZrrbk_Int 6685 -VSUBSSZrrbkz_Int 6686 -VSUBSSZrrk_Int 6687 -VSUBSSZrrkz_Int 6688 -VSUBSSrm 6689 -VSUBSSrm_Int 6690 -VSUBSSrr 6691 -VSUBSSrr_Int 6692 -VTESTPDYrm 6693 -VTESTPDYrr 6694 -VTESTPDrm 6695 -VTESTPDrr 6696 -VTESTPSYrm 6697 -VTESTPSYrr 
6698 -VTESTPSrm 6699 -VTESTPSrr 6700 -VUCOMISDZrm 6701 -VUCOMISDZrm_Int 6702 -VUCOMISDZrr 6703 -VUCOMISDZrr_Int 6704 -VUCOMISDZrrb 6705 -VUCOMISDrm 6706 -VUCOMISDrm_Int 6707 -VUCOMISDrr 6708 -VUCOMISDrr_Int 6709 -VUCOMISHZrm 6710 -VUCOMISHZrm_Int 6711 -VUCOMISHZrr 6712 -VUCOMISHZrr_Int 6713 -VUCOMISHZrrb 6714 -VUCOMISSZrm 6715 -VUCOMISSZrm_Int 6716 -VUCOMISSZrr 6717 -VUCOMISSZrr_Int 6718 -VUCOMISSZrrb 6719 -VUCOMISSrm 6720 -VUCOMISSrm_Int 6721 -VUCOMISSrr 6722 -VUCOMISSrr_Int 6723 -VUCOMXSDZrm 6724 -VUCOMXSDZrm_Int 6725 -VUCOMXSDZrr 6726 -VUCOMXSDZrr_Int 6727 -VUCOMXSDZrrb_Int 6728 -VUCOMXSHZrm 6729 -VUCOMXSHZrm_Int 6730 -VUCOMXSHZrr 6731 -VUCOMXSHZrr_Int 6732 -VUCOMXSHZrrb_Int 6733 -VUCOMXSSZrm 6734 -VUCOMXSSZrm_Int 6735 -VUCOMXSSZrr 6736 -VUCOMXSSZrr_Int 6737 -VUCOMXSSZrrb_Int 6738 -VUNPCKHPDYrm 6739 -VUNPCKHPDYrr 6740 -VUNPCKHPDZ 6741 -VUNPCKHPDZrm 6742 -VUNPCKHPDZrmb 6743 -VUNPCKHPDZrmbk 6744 -VUNPCKHPDZrmbkz 6745 -VUNPCKHPDZrmk 6746 -VUNPCKHPDZrmkz 6747 -VUNPCKHPDZrr 6748 -VUNPCKHPDZrrk 6749 -VUNPCKHPDZrrkz 6750 -VUNPCKHPDrm 6751 -VUNPCKHPDrr 6752 -VUNPCKHPSYrm 6753 -VUNPCKHPSYrr 6754 -VUNPCKHPSZ 6755 -VUNPCKHPSZrm 6756 -VUNPCKHPSZrmb 6757 -VUNPCKHPSZrmbk 6758 -VUNPCKHPSZrmbkz 6759 -VUNPCKHPSZrmk 6760 -VUNPCKHPSZrmkz 6761 -VUNPCKHPSZrr 6762 -VUNPCKHPSZrrk 6763 -VUNPCKHPSZrrkz 6764 -VUNPCKHPSrm 6765 -VUNPCKHPSrr 6766 -VUNPCKLPDYrm 6767 -VUNPCKLPDYrr 6768 -VUNPCKLPDZ 6769 -VUNPCKLPDZrm 6770 -VUNPCKLPDZrmb 6771 -VUNPCKLPDZrmbk 6772 -VUNPCKLPDZrmbkz 6773 -VUNPCKLPDZrmk 6774 -VUNPCKLPDZrmkz 6775 -VUNPCKLPDZrr 6776 -VUNPCKLPDZrrk 6777 -VUNPCKLPDZrrkz 6778 -VUNPCKLPDrm 6779 -VUNPCKLPDrr 6780 -VUNPCKLPSYrm 6781 -VUNPCKLPSYrr 6782 -VUNPCKLPSZ 6783 -VUNPCKLPSZrm 6784 -VUNPCKLPSZrmb 6785 -VUNPCKLPSZrmbk 6786 -VUNPCKLPSZrmbkz 6787 -VUNPCKLPSZrmk 6788 -VUNPCKLPSZrmkz 6789 -VUNPCKLPSZrr 6790 -VUNPCKLPSZrrk 6791 -VUNPCKLPSZrrkz 6792 -VUNPCKLPSrm 6793 -VUNPCKLPSrr 6794 -VXORPDYrm 6795 -VXORPDYrr 6796 -VXORPDZ 6797 -VXORPDZrm 6798 -VXORPDZrmb 6799 -VXORPDZrmbk 6800 -VXORPDZrmbkz 6801 -VXORPDZrmk 6802 -VXORPDZrmkz 6803 -VXORPDZrr 6804 -VXORPDZrrk 6805 -VXORPDZrrkz 6806 -VXORPDrm 6807 -VXORPDrr 6808 -VXORPSYrm 6809 -VXORPSYrr 6810 -VXORPSZ 6811 -VXORPSZrm 6812 -VXORPSZrmb 6813 -VXORPSZrmbk 6814 -VXORPSZrmbkz 6815 -VXORPSZrmk 6816 -VXORPSZrmkz 6817 -VXORPSZrr 6818 -VXORPSZrrk 6819 -VXORPSZrrkz 6820 -VXORPSrm 6821 -VXORPSrr 6822 -VZEROALL 6823 -VZEROUPPER 6824 -V_SET 6825 -V_SETALLONES 6826 -WAIT 6827 -WBINVD 6828 -WBNOINVD 6829 -WRFLAGS 6830 -WRFSBASE 6831 -WRGSBASE 6832 -WRMSR 6833 -WRMSRLIST 6834 -WRMSRNS 6835 -WRMSRNSir 6836 -WRMSRNSir_EVEX 6837 -WRPKRUr 6838 -WRSSD 6839 -WRSSD_EVEX 6840 -WRSSQ 6841 -WRSSQ_EVEX 6842 -WRUSSD 6843 -WRUSSD_EVEX 6844 -WRUSSQ 6845 -WRUSSQ_EVEX 6846 -XABORT 6847 -XABORT_DEF 6848 -XACQUIRE_PREFIX 6849 -XADD 6850 -XAM_F 6851 -XAM_Fp 6852 -XBEGIN 6853 -XCHG 6854 -XCH_F 6855 -XCRYPTCBC 6856 -XCRYPTCFB 6857 -XCRYPTCTR 6858 -XCRYPTECB 6859 -XCRYPTOFB 6860 -XEND 6861 -XGETBV 6862 -XLAT 6863 -XOR 6864 -XORPDrm 6865 -XORPDrr 6866 -XORPSrm 6867 -XORPSrr 6868 -XRELEASE_PREFIX 6869 -XRESLDTRK 6870 -XRSTOR 6871 -XRSTORS 6872 -XSAVE 6873 -XSAVEC 6874 -XSAVEOPT 6875 -XSAVES 6876 -XSETBV 6877 -XSHA 6878 -XSTORE 6879 -XSUSLDTRK 6880 -XTEST 6881 -Immediate 6882 -CImmediate 6883 -FPImmediate 6884 -MBB 6885 -FrameIndex 6886 -ConstantPoolIndex 6887 -TargetIndex 6888 -JumpTableIndex 6889 -ExternalSymbol 6890 -GlobalAddress 6891 -BlockAddress 6892 -RegisterMask 6893 -RegisterLiveOut 6894 -Metadata 6895 -MCSymbol 6896 -CFIIndex 6897 -IntrinsicID 6898 -Predicate 6899 -ShuffleMask 6900 -PhyReg_GR8 
6901 -PhyReg_GRH8 6902 -PhyReg_GR8_NOREX2 6903 -PhyReg_GR8_NOREX 6904 -PhyReg_GR8_ABCD_H 6905 -PhyReg_GR8_ABCD_L 6906 -PhyReg_GRH16 6907 -PhyReg_GR16 6908 -PhyReg_GR16_NOREX2 6909 -PhyReg_GR16_NOREX 6910 -PhyReg_VK1 6911 -PhyReg_VK16 6912 -PhyReg_VK2 6913 -PhyReg_VK4 6914 -PhyReg_VK8 6915 -PhyReg_VK16WM 6916 -PhyReg_VK1WM 6917 -PhyReg_VK2WM 6918 -PhyReg_VK4WM 6919 -PhyReg_VK8WM 6920 -PhyReg_SEGMENT_REG 6921 -PhyReg_GR16_ABCD 6922 -PhyReg_FPCCR 6923 -PhyReg_FR16X 6924 -PhyReg_FR16 6925 -PhyReg_VK16PAIR 6926 -PhyReg_VK1PAIR 6927 -PhyReg_VK2PAIR 6928 -PhyReg_VK4PAIR 6929 -PhyReg_VK8PAIR 6930 -PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6931 -PhyReg_LOW32_ADDR_ACCESS_RBP 6932 -PhyReg_LOW32_ADDR_ACCESS 6933 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6934 -PhyReg_FR32X 6935 -PhyReg_GR32 6936 -PhyReg_GR32_NOSP 6937 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6938 -PhyReg_DEBUG_REG 6939 -PhyReg_FR32 6940 -PhyReg_GR32_NOREX2 6941 -PhyReg_GR32_NOREX2_NOSP 6942 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6943 -PhyReg_GR32_NOREX 6944 -PhyReg_VK32 6945 -PhyReg_GR32_NOREX_NOSP 6946 -PhyReg_RFP32 6947 -PhyReg_VK32WM 6948 -PhyReg_GR32_ABCD 6949 -PhyReg_GR32_TC 6950 -PhyReg_GR32_ABCD_and_GR32_TC 6951 -PhyReg_GR32_AD 6952 -PhyReg_GR32_ArgRef 6953 -PhyReg_GR32_BPSP 6954 -PhyReg_GR32_BSI 6955 -PhyReg_GR32_CB 6956 -PhyReg_GR32_DC 6957 -PhyReg_GR32_DIBP 6958 -PhyReg_GR32_SIDI 6959 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6960 -PhyReg_CCR 6961 -PhyReg_DFCCR 6962 -PhyReg_GR32_ABCD_and_GR32_BSI 6963 -PhyReg_GR32_AD_and_GR32_ArgRef 6964 -PhyReg_GR32_ArgRef_and_GR32_CB 6965 -PhyReg_GR32_BPSP_and_GR32_DIBP 6966 -PhyReg_GR32_BPSP_and_GR32_TC 6967 -PhyReg_GR32_BSI_and_GR32_SIDI 6968 -PhyReg_GR32_DIBP_and_GR32_SIDI 6969 -PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6970 -PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6971 -PhyReg_RFP64 6972 -PhyReg_GR64 6973 -PhyReg_FR64X 6974 -PhyReg_GR64_with_sub_8bit 6975 -PhyReg_GR64_NOSP 6976 -PhyReg_GR64_NOREX2 6977 -PhyReg_CONTROL_REG 6978 -PhyReg_FR64 6979 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6980 -PhyReg_GR64_NOREX2_NOSP 6981 -PhyReg_GR64PLTSafe 6982 -PhyReg_GR64_TC 6983 -PhyReg_GR64_NOREX 6984 -PhyReg_GR64_TCW64 6985 -PhyReg_GR64_TC_with_sub_8bit 6986 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6987 -PhyReg_GR64_TCW64_with_sub_8bit 6988 -PhyReg_GR64_TC_and_GR64_TCW64 6989 -PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6990 -PhyReg_VK64 6991 -PhyReg_VR64 6992 -PhyReg_GR64PLTSafe_and_GR64_TC 6993 -PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6994 -PhyReg_GR64_NOREX_NOSP 6995 -PhyReg_GR64_NOREX_and_GR64_TC 6996 -PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6997 -PhyReg_VK64WM 6998 -PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6999 -PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7000 -PhyReg_GR64PLTSafe_and_GR64_TCW64 7001 -PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7002 -PhyReg_GR64_NOREX_and_GR64_TCW64 7003 -PhyReg_GR64_ABCD 7004 -PhyReg_GR64_with_sub_32bit_in_GR32_TC 7005 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7006 -PhyReg_GR64_AD 7007 -PhyReg_GR64_ArgRef 7008 -PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7009 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 7010 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 7011 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI 7012 -PhyReg_GR64_with_sub_32bit_in_GR32_CB 7013 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 7014 -PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 7015 -PhyReg_GR64_A 7016 -PhyReg_GR64_ArgRef_and_GR64_TC 7017 -PhyReg_GR64_and_LOW32_ADDR_ACCESS 7018 -PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 
7019 -PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7020 -PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7021 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7022 -PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7023 -PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7024 -PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7025 -PhyReg_RST 7026 -PhyReg_RFP80 7027 -PhyReg_RFP80_7 7028 -PhyReg_VR128X 7029 -PhyReg_VR128 7030 -PhyReg_VR256X 7031 -PhyReg_VR256 7032 -PhyReg_VR512 7033 -PhyReg_VR512_0_15 7034 -PhyReg_TILE 7035 -PhyReg_TILEPAIR 7036 -VirtReg_GR8 7037 -VirtReg_GRH8 7038 -VirtReg_GR8_NOREX2 7039 -VirtReg_GR8_NOREX 7040 -VirtReg_GR8_ABCD_H 7041 -VirtReg_GR8_ABCD_L 7042 -VirtReg_GRH16 7043 -VirtReg_GR16 7044 -VirtReg_GR16_NOREX2 7045 -VirtReg_GR16_NOREX 7046 -VirtReg_VK1 7047 -VirtReg_VK16 7048 -VirtReg_VK2 7049 -VirtReg_VK4 7050 -VirtReg_VK8 7051 -VirtReg_VK16WM 7052 -VirtReg_VK1WM 7053 -VirtReg_VK2WM 7054 -VirtReg_VK4WM 7055 -VirtReg_VK8WM 7056 -VirtReg_SEGMENT_REG 7057 -VirtReg_GR16_ABCD 7058 -VirtReg_FPCCR 7059 -VirtReg_FR16X 7060 -VirtReg_FR16 7061 -VirtReg_VK16PAIR 7062 -VirtReg_VK1PAIR 7063 -VirtReg_VK2PAIR 7064 -VirtReg_VK4PAIR 7065 -VirtReg_VK8PAIR 7066 -VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7067 -VirtReg_LOW32_ADDR_ACCESS_RBP 7068 -VirtReg_LOW32_ADDR_ACCESS 7069 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7070 -VirtReg_FR32X 7071 -VirtReg_GR32 7072 -VirtReg_GR32_NOSP 7073 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7074 -VirtReg_DEBUG_REG 7075 -VirtReg_FR32 7076 -VirtReg_GR32_NOREX2 7077 -VirtReg_GR32_NOREX2_NOSP 7078 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7079 -VirtReg_GR32_NOREX 7080 -VirtReg_VK32 7081 -VirtReg_GR32_NOREX_NOSP 7082 -VirtReg_RFP32 7083 -VirtReg_VK32WM 7084 -VirtReg_GR32_ABCD 7085 -VirtReg_GR32_TC 7086 -VirtReg_GR32_ABCD_and_GR32_TC 7087 -VirtReg_GR32_AD 7088 -VirtReg_GR32_ArgRef 7089 -VirtReg_GR32_BPSP 7090 -VirtReg_GR32_BSI 7091 -VirtReg_GR32_CB 7092 -VirtReg_GR32_DC 7093 -VirtReg_GR32_DIBP 7094 -VirtReg_GR32_SIDI 7095 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7096 -VirtReg_CCR 7097 -VirtReg_DFCCR 7098 -VirtReg_GR32_ABCD_and_GR32_BSI 7099 -VirtReg_GR32_AD_and_GR32_ArgRef 7100 -VirtReg_GR32_ArgRef_and_GR32_CB 7101 -VirtReg_GR32_BPSP_and_GR32_DIBP 7102 -VirtReg_GR32_BPSP_and_GR32_TC 7103 -VirtReg_GR32_BSI_and_GR32_SIDI 7104 -VirtReg_GR32_DIBP_and_GR32_SIDI 7105 -VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7106 -VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7107 -VirtReg_RFP64 7108 -VirtReg_GR64 7109 -VirtReg_FR64X 7110 -VirtReg_GR64_with_sub_8bit 7111 -VirtReg_GR64_NOSP 7112 -VirtReg_GR64_NOREX2 7113 -VirtReg_CONTROL_REG 7114 -VirtReg_FR64 7115 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7116 -VirtReg_GR64_NOREX2_NOSP 7117 -VirtReg_GR64PLTSafe 7118 -VirtReg_GR64_TC 7119 -VirtReg_GR64_NOREX 7120 -VirtReg_GR64_TCW64 7121 -VirtReg_GR64_TC_with_sub_8bit 7122 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7123 -VirtReg_GR64_TCW64_with_sub_8bit 7124 -VirtReg_GR64_TC_and_GR64_TCW64 7125 -VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7126 -VirtReg_VK64 7127 -VirtReg_VR64 7128 -VirtReg_GR64PLTSafe_and_GR64_TC 7129 -VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7130 -VirtReg_GR64_NOREX_NOSP 7131 -VirtReg_GR64_NOREX_and_GR64_TC 7132 -VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7133 -VirtReg_VK64WM 7134 -VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7135 -VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7136 -VirtReg_GR64PLTSafe_and_GR64_TCW64 7137 
-VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7138 -VirtReg_GR64_NOREX_and_GR64_TCW64 7139 -VirtReg_GR64_ABCD 7140 -VirtReg_GR64_with_sub_32bit_in_GR32_TC 7141 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7142 -VirtReg_GR64_AD 7143 -VirtReg_GR64_ArgRef 7144 -VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7145 -VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7146 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7147 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7148 -VirtReg_GR64_with_sub_32bit_in_GR32_CB 7149 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7150 -VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7151 -VirtReg_GR64_A 7152 -VirtReg_GR64_ArgRef_and_GR64_TC 7153 -VirtReg_GR64_and_LOW32_ADDR_ACCESS 7154 -VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7155 -VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7156 -VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7157 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7158 -VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7159 -VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7160 -VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7161 -VirtReg_RST 7162 -VirtReg_RFP80 7163 -VirtReg_RFP80_7 7164 -VirtReg_VR128X 7165 -VirtReg_VR128 7166 -VirtReg_VR256X 7167 -VirtReg_VR256 7168 -VirtReg_VR512 7169 -VirtReg_VR512_0_15 7170 -VirtReg_TILE 7171 -VirtReg_TILEPAIR 7172 +PTCMMIMFP 1441 +PTCMMRLFP 1442 +PTCVTROWD 1443 +PTCVTROWPS 1444 +PTDPBF 1445 +PTDPBHF 1446 +PTDPBSSD 1447 +PTDPBSSDV 1448 +PTDPBSUD 1449 +PTDPBSUDV 1450 +PTDPBUSD 1451 +PTDPBUSDV 1452 +PTDPBUUD 1453 +PTDPBUUDV 1454 +PTDPFP 1455 +PTDPHBF 1456 +PTDPHF 1457 +PTESTrm 1458 +PTESTrr 1459 +PTILELOADD 1460 +PTILELOADDRS 1461 +PTILELOADDRST 1462 +PTILELOADDRSV 1463 +PTILELOADDT 1464 +PTILELOADDV 1465 +PTILEMOVROWrre 1466 +PTILEMOVROWrreV 1467 +PTILEMOVROWrri 1468 +PTILEMOVROWrriV 1469 +PTILESTORED 1470 +PTILESTOREDV 1471 +PTILEZERO 1472 +PTILEZEROV 1473 +PTMMULTF 1474 +PTWRITE 1475 +PTWRITEm 1476 +PTWRITEr 1477 +PUNPCKHBWrm 1478 +PUNPCKHBWrr 1479 +PUNPCKHDQrm 1480 +PUNPCKHDQrr 1481 +PUNPCKHQDQrm 1482 +PUNPCKHQDQrr 1483 +PUNPCKHWDrm 1484 +PUNPCKHWDrr 1485 +PUNPCKLBWrm 1486 +PUNPCKLBWrr 1487 +PUNPCKLDQrm 1488 +PUNPCKLDQrr 1489 +PUNPCKLQDQrm 1490 +PUNPCKLQDQrr 1491 +PUNPCKLWDrm 1492 +PUNPCKLWDrr 1493 +PUSH 1494 +PUSHA 1495 +PUSHCS 1496 +PUSHDS 1497 +PUSHES 1498 +PUSHF 1499 +PUSHFS 1500 +PUSHGS 1501 +PUSHP 1502 +PUSHSS 1503 +PVALIDATE 1504 +PXORrm 1505 +PXORrr 1506 +RCL 1507 +RCPPSm 1508 +RCPPSr 1509 +RCPSSm 1510 +RCPSSm_Int 1511 +RCPSSr 1512 +RCPSSr_Int 1513 +RCR 1514 +RDFLAGS 1515 +RDFSBASE 1516 +RDGSBASE 1517 +RDMSR 1518 +RDMSRLIST 1519 +RDMSRri 1520 +RDMSRri_EVEX 1521 +RDPID 1522 +RDPKRUr 1523 +RDPMC 1524 +RDPRU 1525 +RDRAND 1526 +RDSEED 1527 +RDSSPD 1528 +RDSSPQ 1529 +RDTSC 1530 +RDTSCP 1531 +REG_SEQUENCE 1532 +RELOC_NONE 1533 +REPNE_PREFIX 1534 +REP_MOVSB 1535 +REP_MOVSD 1536 +REP_MOVSQ 1537 +REP_MOVSW 1538 +REP_PREFIX 1539 +REP_STOSB 1540 +REP_STOSD 1541 +REP_STOSQ 1542 +REP_STOSW 1543 +RET 1544 +RETI 1545 +REX 1546 +RMPADJUST 1547 +RMPQUERY 1548 +RMPUPDATE 1549 +ROL 1550 +ROR 1551 +RORX 1552 +ROUNDPDmi 1553 +ROUNDPDri 1554 +ROUNDPSmi 1555 +ROUNDPSri 1556 +ROUNDSDmi 1557 +ROUNDSDmi_Int 1558 +ROUNDSDri 1559 +ROUNDSDri_Int 1560 +ROUNDSSmi 1561 +ROUNDSSmi_Int 1562 +ROUNDSSri 1563 +ROUNDSSri_Int 1564 +RSM 1565 +RSQRTPSm 1566 +RSQRTPSr 1567 +RSQRTSSm 1568 +RSQRTSSm_Int 1569 +RSQRTSSr 1570 +RSQRTSSr_Int 1571 +RSTORSSP 1572 +SAHF 1573 +SALC 1574 +SAR 1575 +SARX 1576 +SAVEPREVSSP 1577 +SBB 1578 +SCASB 1579 +SCASL 1580 +SCASQ 1581 +SCASW 1582 +SEAMCALL 1583 +SEAMOPS 1584 +SEAMRET 
1585 +SEG_ALLOCA 1586 +SEH_BeginEpilogue 1587 +SEH_EndEpilogue 1588 +SEH_EndPrologue 1589 +SEH_PushFrame 1590 +SEH_PushReg 1591 +SEH_SaveReg 1592 +SEH_SaveXMM 1593 +SEH_SetFrame 1594 +SEH_StackAlign 1595 +SEH_StackAlloc 1596 +SEH_UnwindV 1597 +SEH_UnwindVersion 1598 +SENDUIPI 1599 +SERIALIZE 1600 +SETB_C 1601 +SETCCm 1602 +SETCCm_EVEX 1603 +SETCCr 1604 +SETCCr_EVEX 1605 +SETSSBSY 1606 +SETZUCCm 1607 +SETZUCCr 1608 +SFENCE 1609 +SGDT 1610 +SHA 1611 +SHL 1612 +SHLD 1613 +SHLDROT 1614 +SHLX 1615 +SHR 1616 +SHRD 1617 +SHRDROT 1618 +SHRX 1619 +SHUFPDrmi 1620 +SHUFPDrri 1621 +SHUFPSrmi 1622 +SHUFPSrri 1623 +SIDT 1624 +SKINIT 1625 +SLDT 1626 +SLWPCB 1627 +SMSW 1628 +SQRTPDm 1629 +SQRTPDr 1630 +SQRTPSm 1631 +SQRTPSr 1632 +SQRTSDm 1633 +SQRTSDm_Int 1634 +SQRTSDr 1635 +SQRTSDr_Int 1636 +SQRTSSm 1637 +SQRTSSm_Int 1638 +SQRTSSr 1639 +SQRTSSr_Int 1640 +SQRT_F 1641 +SQRT_Fp 1642 +SS_PREFIX 1643 +STAC 1644 +STACKALLOC_W_PROBING 1645 +STACKMAP 1646 +STATEPOINT 1647 +STC 1648 +STD 1649 +STGI 1650 +STI 1651 +STMXCSR 1652 +STOSB 1653 +STOSL 1654 +STOSQ 1655 +STOSW 1656 +STR 1657 +STRm 1658 +STTILECFG 1659 +STTILECFG_EVEX 1660 +STUI 1661 +ST_F 1662 +ST_FP 1663 +ST_FPrr 1664 +ST_Fp 1665 +ST_FpP 1666 +ST_Frr 1667 +SUB 1668 +SUBPDrm 1669 +SUBPDrr 1670 +SUBPSrm 1671 +SUBPSrr 1672 +SUBREG_TO_REG 1673 +SUBR_F 1674 +SUBR_FI 1675 +SUBR_FPrST 1676 +SUBR_FST 1677 +SUBR_Fp 1678 +SUBR_FpI 1679 +SUBR_FrST 1680 +SUBSDrm 1681 +SUBSDrm_Int 1682 +SUBSDrr 1683 +SUBSDrr_Int 1684 +SUBSSrm 1685 +SUBSSrm_Int 1686 +SUBSSrr 1687 +SUBSSrr_Int 1688 +SUB_F 1689 +SUB_FI 1690 +SUB_FPrST 1691 +SUB_FST 1692 +SUB_Fp 1693 +SUB_FpI 1694 +SUB_FrST 1695 +SWAPGS 1696 +SYSCALL 1697 +SYSENTER 1698 +SYSEXIT 1699 +SYSRET 1700 +T 1701 +TAILJMPd 1702 +TAILJMPd_CC 1703 +TAILJMPm 1704 +TAILJMPr 1705 +TCMMIMFP 1706 +TCMMRLFP 1707 +TCRETURN_HIPE 1708 +TCRETURN_WIN 1709 +TCRETURN_WINmi 1710 +TCRETURNdi 1711 +TCRETURNdicc 1712 +TCRETURNmi 1713 +TCRETURNri 1714 +TCVTROWD 1715 +TCVTROWPS 1716 +TDCALL 1717 +TDPBF 1718 +TDPBHF 1719 +TDPBSSD 1720 +TDPBSUD 1721 +TDPBUSD 1722 +TDPBUUD 1723 +TDPFP 1724 +TDPHBF 1725 +TDPHF 1726 +TEST 1727 +TESTUI 1728 +TILELOADD 1729 +TILELOADDRS 1730 +TILELOADDRST 1731 +TILELOADDRS_EVEX 1732 +TILELOADDT 1733 +TILELOADD_EVEX 1734 +TILEMOVROWrre 1735 +TILEMOVROWrri 1736 +TILERELEASE 1737 +TILESTORED 1738 +TILESTORED_EVEX 1739 +TILEZERO 1740 +TLBSYNC 1741 +TLSCall 1742 +TLS_addr 1743 +TLS_addrX 1744 +TLS_base_addr 1745 +TLS_base_addrX 1746 +TLS_desc 1747 +TMMULTF 1748 +TPAUSE 1749 +TRAP 1750 +TST_F 1751 +TST_Fp 1752 +TZCNT 1753 +TZMSK 1754 +UBSAN_UD 1755 +UCOMISDrm 1756 +UCOMISDrm_Int 1757 +UCOMISDrr 1758 +UCOMISDrr_Int 1759 +UCOMISSrm 1760 +UCOMISSrm_Int 1761 +UCOMISSrr 1762 +UCOMISSrr_Int 1763 +UCOM_FIPr 1764 +UCOM_FIr 1765 +UCOM_FPPr 1766 +UCOM_FPr 1767 +UCOM_FpIr 1768 +UCOM_Fpr 1769 +UCOM_Fr 1770 +UD 1771 +UIRET 1772 +UMONITOR 1773 +UMWAIT 1774 +UNPCKHPDrm 1775 +UNPCKHPDrr 1776 +UNPCKHPSrm 1777 +UNPCKHPSrr 1778 +UNPCKLPDrm 1779 +UNPCKLPDrr 1780 +UNPCKLPSrm 1781 +UNPCKLPSrr 1782 +URDMSRri 1783 +URDMSRri_EVEX 1784 +URDMSRrr 1785 +URDMSRrr_EVEX 1786 +UWRMSRir 1787 +UWRMSRir_EVEX 1788 +UWRMSRrr 1789 +UWRMSRrr_EVEX 1790 +V 1791 +VAARG 1792 +VAARG_X 1793 +VADDBF 1794 +VADDPDYrm 1795 +VADDPDYrr 1796 +VADDPDZ 1797 +VADDPDZrm 1798 +VADDPDZrmb 1799 +VADDPDZrmbk 1800 +VADDPDZrmbkz 1801 +VADDPDZrmk 1802 +VADDPDZrmkz 1803 +VADDPDZrr 1804 +VADDPDZrrb 1805 +VADDPDZrrbk 1806 +VADDPDZrrbkz 1807 +VADDPDZrrk 1808 +VADDPDZrrkz 1809 +VADDPDrm 1810 +VADDPDrr 1811 +VADDPHZ 1812 +VADDPHZrm 1813 +VADDPHZrmb 1814 +VADDPHZrmbk 1815 +VADDPHZrmbkz 1816 
+VADDPHZrmk 1817 +VADDPHZrmkz 1818 +VADDPHZrr 1819 +VADDPHZrrb 1820 +VADDPHZrrbk 1821 +VADDPHZrrbkz 1822 +VADDPHZrrk 1823 +VADDPHZrrkz 1824 +VADDPSYrm 1825 +VADDPSYrr 1826 +VADDPSZ 1827 +VADDPSZrm 1828 +VADDPSZrmb 1829 +VADDPSZrmbk 1830 +VADDPSZrmbkz 1831 +VADDPSZrmk 1832 +VADDPSZrmkz 1833 +VADDPSZrr 1834 +VADDPSZrrb 1835 +VADDPSZrrbk 1836 +VADDPSZrrbkz 1837 +VADDPSZrrk 1838 +VADDPSZrrkz 1839 +VADDPSrm 1840 +VADDPSrr 1841 +VADDSDZrm 1842 +VADDSDZrm_Int 1843 +VADDSDZrmk_Int 1844 +VADDSDZrmkz_Int 1845 +VADDSDZrr 1846 +VADDSDZrr_Int 1847 +VADDSDZrrb_Int 1848 +VADDSDZrrbk_Int 1849 +VADDSDZrrbkz_Int 1850 +VADDSDZrrk_Int 1851 +VADDSDZrrkz_Int 1852 +VADDSDrm 1853 +VADDSDrm_Int 1854 +VADDSDrr 1855 +VADDSDrr_Int 1856 +VADDSHZrm 1857 +VADDSHZrm_Int 1858 +VADDSHZrmk_Int 1859 +VADDSHZrmkz_Int 1860 +VADDSHZrr 1861 +VADDSHZrr_Int 1862 +VADDSHZrrb_Int 1863 +VADDSHZrrbk_Int 1864 +VADDSHZrrbkz_Int 1865 +VADDSHZrrk_Int 1866 +VADDSHZrrkz_Int 1867 +VADDSSZrm 1868 +VADDSSZrm_Int 1869 +VADDSSZrmk_Int 1870 +VADDSSZrmkz_Int 1871 +VADDSSZrr 1872 +VADDSSZrr_Int 1873 +VADDSSZrrb_Int 1874 +VADDSSZrrbk_Int 1875 +VADDSSZrrbkz_Int 1876 +VADDSSZrrk_Int 1877 +VADDSSZrrkz_Int 1878 +VADDSSrm 1879 +VADDSSrm_Int 1880 +VADDSSrr 1881 +VADDSSrr_Int 1882 +VADDSUBPDYrm 1883 +VADDSUBPDYrr 1884 +VADDSUBPDrm 1885 +VADDSUBPDrr 1886 +VADDSUBPSYrm 1887 +VADDSUBPSYrr 1888 +VADDSUBPSrm 1889 +VADDSUBPSrr 1890 +VAESDECLASTYrm 1891 +VAESDECLASTYrr 1892 +VAESDECLASTZ 1893 +VAESDECLASTZrm 1894 +VAESDECLASTZrr 1895 +VAESDECLASTrm 1896 +VAESDECLASTrr 1897 +VAESDECYrm 1898 +VAESDECYrr 1899 +VAESDECZ 1900 +VAESDECZrm 1901 +VAESDECZrr 1902 +VAESDECrm 1903 +VAESDECrr 1904 +VAESENCLASTYrm 1905 +VAESENCLASTYrr 1906 +VAESENCLASTZ 1907 +VAESENCLASTZrm 1908 +VAESENCLASTZrr 1909 +VAESENCLASTrm 1910 +VAESENCLASTrr 1911 +VAESENCYrm 1912 +VAESENCYrr 1913 +VAESENCZ 1914 +VAESENCZrm 1915 +VAESENCZrr 1916 +VAESENCrm 1917 +VAESENCrr 1918 +VAESIMCrm 1919 +VAESIMCrr 1920 +VAESKEYGENASSISTrmi 1921 +VAESKEYGENASSISTrri 1922 +VALIGNDZ 1923 +VALIGNDZrmbi 1924 +VALIGNDZrmbik 1925 +VALIGNDZrmbikz 1926 +VALIGNDZrmi 1927 +VALIGNDZrmik 1928 +VALIGNDZrmikz 1929 +VALIGNDZrri 1930 +VALIGNDZrrik 1931 +VALIGNDZrrikz 1932 +VALIGNQZ 1933 +VALIGNQZrmbi 1934 +VALIGNQZrmbik 1935 +VALIGNQZrmbikz 1936 +VALIGNQZrmi 1937 +VALIGNQZrmik 1938 +VALIGNQZrmikz 1939 +VALIGNQZrri 1940 +VALIGNQZrrik 1941 +VALIGNQZrrikz 1942 +VANDNPDYrm 1943 +VANDNPDYrr 1944 +VANDNPDZ 1945 +VANDNPDZrm 1946 +VANDNPDZrmb 1947 +VANDNPDZrmbk 1948 +VANDNPDZrmbkz 1949 +VANDNPDZrmk 1950 +VANDNPDZrmkz 1951 +VANDNPDZrr 1952 +VANDNPDZrrk 1953 +VANDNPDZrrkz 1954 +VANDNPDrm 1955 +VANDNPDrr 1956 +VANDNPSYrm 1957 +VANDNPSYrr 1958 +VANDNPSZ 1959 +VANDNPSZrm 1960 +VANDNPSZrmb 1961 +VANDNPSZrmbk 1962 +VANDNPSZrmbkz 1963 +VANDNPSZrmk 1964 +VANDNPSZrmkz 1965 +VANDNPSZrr 1966 +VANDNPSZrrk 1967 +VANDNPSZrrkz 1968 +VANDNPSrm 1969 +VANDNPSrr 1970 +VANDPDYrm 1971 +VANDPDYrr 1972 +VANDPDZ 1973 +VANDPDZrm 1974 +VANDPDZrmb 1975 +VANDPDZrmbk 1976 +VANDPDZrmbkz 1977 +VANDPDZrmk 1978 +VANDPDZrmkz 1979 +VANDPDZrr 1980 +VANDPDZrrk 1981 +VANDPDZrrkz 1982 +VANDPDrm 1983 +VANDPDrr 1984 +VANDPSYrm 1985 +VANDPSYrr 1986 +VANDPSZ 1987 +VANDPSZrm 1988 +VANDPSZrmb 1989 +VANDPSZrmbk 1990 +VANDPSZrmbkz 1991 +VANDPSZrmk 1992 +VANDPSZrmkz 1993 +VANDPSZrr 1994 +VANDPSZrrk 1995 +VANDPSZrrkz 1996 +VANDPSrm 1997 +VANDPSrr 1998 +VASTART_SAVE_XMM_REGS 1999 +VBCSTNEBF 2000 +VBCSTNESH 2001 +VBLENDMPDZ 2002 +VBLENDMPDZrm 2003 +VBLENDMPDZrmb 2004 +VBLENDMPDZrmbk 2005 +VBLENDMPDZrmbkz 2006 +VBLENDMPDZrmk 2007 +VBLENDMPDZrmkz 2008 +VBLENDMPDZrr 2009 +VBLENDMPDZrrk 2010 
+VBLENDMPDZrrkz 2011 +VBLENDMPSZ 2012 +VBLENDMPSZrm 2013 +VBLENDMPSZrmb 2014 +VBLENDMPSZrmbk 2015 +VBLENDMPSZrmbkz 2016 +VBLENDMPSZrmk 2017 +VBLENDMPSZrmkz 2018 +VBLENDMPSZrr 2019 +VBLENDMPSZrrk 2020 +VBLENDMPSZrrkz 2021 +VBLENDPDYrmi 2022 +VBLENDPDYrri 2023 +VBLENDPDrmi 2024 +VBLENDPDrri 2025 +VBLENDPSYrmi 2026 +VBLENDPSYrri 2027 +VBLENDPSrmi 2028 +VBLENDPSrri 2029 +VBLENDVPDYrmr 2030 +VBLENDVPDYrrr 2031 +VBLENDVPDrmr 2032 +VBLENDVPDrrr 2033 +VBLENDVPSYrmr 2034 +VBLENDVPSYrrr 2035 +VBLENDVPSrmr 2036 +VBLENDVPSrrr 2037 +VBROADCASTF 2038 +VBROADCASTI 2039 +VBROADCASTSDYrm 2040 +VBROADCASTSDYrr 2041 +VBROADCASTSDZ 2042 +VBROADCASTSDZrm 2043 +VBROADCASTSDZrmk 2044 +VBROADCASTSDZrmkz 2045 +VBROADCASTSDZrr 2046 +VBROADCASTSDZrrk 2047 +VBROADCASTSDZrrkz 2048 +VBROADCASTSSYrm 2049 +VBROADCASTSSYrr 2050 +VBROADCASTSSZ 2051 +VBROADCASTSSZrm 2052 +VBROADCASTSSZrmk 2053 +VBROADCASTSSZrmkz 2054 +VBROADCASTSSZrr 2055 +VBROADCASTSSZrrk 2056 +VBROADCASTSSZrrkz 2057 +VBROADCASTSSrm 2058 +VBROADCASTSSrr 2059 +VCMPBF 2060 +VCMPPDYrmi 2061 +VCMPPDYrri 2062 +VCMPPDZ 2063 +VCMPPDZrmbi 2064 +VCMPPDZrmbik 2065 +VCMPPDZrmi 2066 +VCMPPDZrmik 2067 +VCMPPDZrri 2068 +VCMPPDZrrib 2069 +VCMPPDZrribk 2070 +VCMPPDZrrik 2071 +VCMPPDrmi 2072 +VCMPPDrri 2073 +VCMPPHZ 2074 +VCMPPHZrmbi 2075 +VCMPPHZrmbik 2076 +VCMPPHZrmi 2077 +VCMPPHZrmik 2078 +VCMPPHZrri 2079 +VCMPPHZrrib 2080 +VCMPPHZrribk 2081 +VCMPPHZrrik 2082 +VCMPPSYrmi 2083 +VCMPPSYrri 2084 +VCMPPSZ 2085 +VCMPPSZrmbi 2086 +VCMPPSZrmbik 2087 +VCMPPSZrmi 2088 +VCMPPSZrmik 2089 +VCMPPSZrri 2090 +VCMPPSZrrib 2091 +VCMPPSZrribk 2092 +VCMPPSZrrik 2093 +VCMPPSrmi 2094 +VCMPPSrri 2095 +VCMPSDZrmi 2096 +VCMPSDZrmi_Int 2097 +VCMPSDZrmik_Int 2098 +VCMPSDZrri 2099 +VCMPSDZrri_Int 2100 +VCMPSDZrrib_Int 2101 +VCMPSDZrribk_Int 2102 +VCMPSDZrrik_Int 2103 +VCMPSDrmi 2104 +VCMPSDrmi_Int 2105 +VCMPSDrri 2106 +VCMPSDrri_Int 2107 +VCMPSHZrmi 2108 +VCMPSHZrmi_Int 2109 +VCMPSHZrmik_Int 2110 +VCMPSHZrri 2111 +VCMPSHZrri_Int 2112 +VCMPSHZrrib_Int 2113 +VCMPSHZrribk_Int 2114 +VCMPSHZrrik_Int 2115 +VCMPSSZrmi 2116 +VCMPSSZrmi_Int 2117 +VCMPSSZrmik_Int 2118 +VCMPSSZrri 2119 +VCMPSSZrri_Int 2120 +VCMPSSZrrib_Int 2121 +VCMPSSZrribk_Int 2122 +VCMPSSZrrik_Int 2123 +VCMPSSrmi 2124 +VCMPSSrmi_Int 2125 +VCMPSSrri 2126 +VCMPSSrri_Int 2127 +VCOMISBF 2128 +VCOMISDZrm 2129 +VCOMISDZrm_Int 2130 +VCOMISDZrr 2131 +VCOMISDZrr_Int 2132 +VCOMISDZrrb 2133 +VCOMISDrm 2134 +VCOMISDrm_Int 2135 +VCOMISDrr 2136 +VCOMISDrr_Int 2137 +VCOMISHZrm 2138 +VCOMISHZrm_Int 2139 +VCOMISHZrr 2140 +VCOMISHZrr_Int 2141 +VCOMISHZrrb 2142 +VCOMISSZrm 2143 +VCOMISSZrm_Int 2144 +VCOMISSZrr 2145 +VCOMISSZrr_Int 2146 +VCOMISSZrrb 2147 +VCOMISSrm 2148 +VCOMISSrm_Int 2149 +VCOMISSrr 2150 +VCOMISSrr_Int 2151 +VCOMPRESSPDZ 2152 +VCOMPRESSPDZmr 2153 +VCOMPRESSPDZmrk 2154 +VCOMPRESSPDZrr 2155 +VCOMPRESSPDZrrk 2156 +VCOMPRESSPDZrrkz 2157 +VCOMPRESSPSZ 2158 +VCOMPRESSPSZmr 2159 +VCOMPRESSPSZmrk 2160 +VCOMPRESSPSZrr 2161 +VCOMPRESSPSZrrk 2162 +VCOMPRESSPSZrrkz 2163 +VCOMXSDZrm_Int 2164 +VCOMXSDZrr_Int 2165 +VCOMXSDZrrb_Int 2166 +VCOMXSHZrm_Int 2167 +VCOMXSHZrr_Int 2168 +VCOMXSHZrrb_Int 2169 +VCOMXSSZrm_Int 2170 +VCOMXSSZrr_Int 2171 +VCOMXSSZrrb_Int 2172 +VCVT 2173 +VCVTBF 2174 +VCVTBIASPH 2175 +VCVTDQ 2176 +VCVTHF 2177 +VCVTNE 2178 +VCVTNEEBF 2179 +VCVTNEEPH 2180 +VCVTNEOBF 2181 +VCVTNEOPH 2182 +VCVTNEPS 2183 +VCVTPD 2184 +VCVTPH 2185 +VCVTPS 2186 +VCVTQQ 2187 +VCVTSD 2188 +VCVTSH 2189 +VCVTSI 2190 +VCVTSS 2191 +VCVTTBF 2192 +VCVTTPD 2193 +VCVTTPH 2194 +VCVTTPS 2195 +VCVTTSD 2196 +VCVTTSH 2197 +VCVTTSS 2198 +VCVTUDQ 2199 +VCVTUQQ 2200 +VCVTUSI 
2201 +VCVTUW 2202 +VCVTW 2203 +VDBPSADBWZ 2204 +VDBPSADBWZrmi 2205 +VDBPSADBWZrmik 2206 +VDBPSADBWZrmikz 2207 +VDBPSADBWZrri 2208 +VDBPSADBWZrrik 2209 +VDBPSADBWZrrikz 2210 +VDIVBF 2211 +VDIVPDYrm 2212 +VDIVPDYrr 2213 +VDIVPDZ 2214 +VDIVPDZrm 2215 +VDIVPDZrmb 2216 +VDIVPDZrmbk 2217 +VDIVPDZrmbkz 2218 +VDIVPDZrmk 2219 +VDIVPDZrmkz 2220 +VDIVPDZrr 2221 +VDIVPDZrrb 2222 +VDIVPDZrrbk 2223 +VDIVPDZrrbkz 2224 +VDIVPDZrrk 2225 +VDIVPDZrrkz 2226 +VDIVPDrm 2227 +VDIVPDrr 2228 +VDIVPHZ 2229 +VDIVPHZrm 2230 +VDIVPHZrmb 2231 +VDIVPHZrmbk 2232 +VDIVPHZrmbkz 2233 +VDIVPHZrmk 2234 +VDIVPHZrmkz 2235 +VDIVPHZrr 2236 +VDIVPHZrrb 2237 +VDIVPHZrrbk 2238 +VDIVPHZrrbkz 2239 +VDIVPHZrrk 2240 +VDIVPHZrrkz 2241 +VDIVPSYrm 2242 +VDIVPSYrr 2243 +VDIVPSZ 2244 +VDIVPSZrm 2245 +VDIVPSZrmb 2246 +VDIVPSZrmbk 2247 +VDIVPSZrmbkz 2248 +VDIVPSZrmk 2249 +VDIVPSZrmkz 2250 +VDIVPSZrr 2251 +VDIVPSZrrb 2252 +VDIVPSZrrbk 2253 +VDIVPSZrrbkz 2254 +VDIVPSZrrk 2255 +VDIVPSZrrkz 2256 +VDIVPSrm 2257 +VDIVPSrr 2258 +VDIVSDZrm 2259 +VDIVSDZrm_Int 2260 +VDIVSDZrmk_Int 2261 +VDIVSDZrmkz_Int 2262 +VDIVSDZrr 2263 +VDIVSDZrr_Int 2264 +VDIVSDZrrb_Int 2265 +VDIVSDZrrbk_Int 2266 +VDIVSDZrrbkz_Int 2267 +VDIVSDZrrk_Int 2268 +VDIVSDZrrkz_Int 2269 +VDIVSDrm 2270 +VDIVSDrm_Int 2271 +VDIVSDrr 2272 +VDIVSDrr_Int 2273 +VDIVSHZrm 2274 +VDIVSHZrm_Int 2275 +VDIVSHZrmk_Int 2276 +VDIVSHZrmkz_Int 2277 +VDIVSHZrr 2278 +VDIVSHZrr_Int 2279 +VDIVSHZrrb_Int 2280 +VDIVSHZrrbk_Int 2281 +VDIVSHZrrbkz_Int 2282 +VDIVSHZrrk_Int 2283 +VDIVSHZrrkz_Int 2284 +VDIVSSZrm 2285 +VDIVSSZrm_Int 2286 +VDIVSSZrmk_Int 2287 +VDIVSSZrmkz_Int 2288 +VDIVSSZrr 2289 +VDIVSSZrr_Int 2290 +VDIVSSZrrb_Int 2291 +VDIVSSZrrbk_Int 2292 +VDIVSSZrrbkz_Int 2293 +VDIVSSZrrk_Int 2294 +VDIVSSZrrkz_Int 2295 +VDIVSSrm 2296 +VDIVSSrm_Int 2297 +VDIVSSrr 2298 +VDIVSSrr_Int 2299 +VDPBF 2300 +VDPPDrmi 2301 +VDPPDrri 2302 +VDPPHPSZ 2303 +VDPPHPSZm 2304 +VDPPHPSZmb 2305 +VDPPHPSZmbk 2306 +VDPPHPSZmbkz 2307 +VDPPHPSZmk 2308 +VDPPHPSZmkz 2309 +VDPPHPSZr 2310 +VDPPHPSZrk 2311 +VDPPHPSZrkz 2312 +VDPPSYrmi 2313 +VDPPSYrri 2314 +VDPPSrmi 2315 +VDPPSrri 2316 +VERRm 2317 +VERRr 2318 +VERWm 2319 +VERWr 2320 +VEXP 2321 +VEXPANDPDZ 2322 +VEXPANDPDZrm 2323 +VEXPANDPDZrmk 2324 +VEXPANDPDZrmkz 2325 +VEXPANDPDZrr 2326 +VEXPANDPDZrrk 2327 +VEXPANDPDZrrkz 2328 +VEXPANDPSZ 2329 +VEXPANDPSZrm 2330 +VEXPANDPSZrmk 2331 +VEXPANDPSZrmkz 2332 +VEXPANDPSZrr 2333 +VEXPANDPSZrrk 2334 +VEXPANDPSZrrkz 2335 +VEXTRACTF 2336 +VEXTRACTI 2337 +VEXTRACTPSZmri 2338 +VEXTRACTPSZrri 2339 +VEXTRACTPSmri 2340 +VEXTRACTPSrri 2341 +VFCMADDCPHZ 2342 +VFCMADDCPHZm 2343 +VFCMADDCPHZmb 2344 +VFCMADDCPHZmbk 2345 +VFCMADDCPHZmbkz 2346 +VFCMADDCPHZmk 2347 +VFCMADDCPHZmkz 2348 +VFCMADDCPHZr 2349 +VFCMADDCPHZrb 2350 +VFCMADDCPHZrbk 2351 +VFCMADDCPHZrbkz 2352 +VFCMADDCPHZrk 2353 +VFCMADDCPHZrkz 2354 +VFCMADDCSHZm 2355 +VFCMADDCSHZmk 2356 +VFCMADDCSHZmkz 2357 +VFCMADDCSHZr 2358 +VFCMADDCSHZrb 2359 +VFCMADDCSHZrbk 2360 +VFCMADDCSHZrbkz 2361 +VFCMADDCSHZrk 2362 +VFCMADDCSHZrkz 2363 +VFCMULCPHZ 2364 +VFCMULCPHZrm 2365 +VFCMULCPHZrmb 2366 +VFCMULCPHZrmbk 2367 +VFCMULCPHZrmbkz 2368 +VFCMULCPHZrmk 2369 +VFCMULCPHZrmkz 2370 +VFCMULCPHZrr 2371 +VFCMULCPHZrrb 2372 +VFCMULCPHZrrbk 2373 +VFCMULCPHZrrbkz 2374 +VFCMULCPHZrrk 2375 +VFCMULCPHZrrkz 2376 +VFCMULCSHZrm 2377 +VFCMULCSHZrmk 2378 +VFCMULCSHZrmkz 2379 +VFCMULCSHZrr 2380 +VFCMULCSHZrrb 2381 +VFCMULCSHZrrbk 2382 +VFCMULCSHZrrbkz 2383 +VFCMULCSHZrrk 2384 +VFCMULCSHZrrkz 2385 +VFIXUPIMMPDZ 2386 +VFIXUPIMMPDZrmbi 2387 +VFIXUPIMMPDZrmbik 2388 +VFIXUPIMMPDZrmbikz 2389 +VFIXUPIMMPDZrmi 2390 +VFIXUPIMMPDZrmik 2391 
+VFIXUPIMMPDZrmikz 2392 +VFIXUPIMMPDZrri 2393 +VFIXUPIMMPDZrrib 2394 +VFIXUPIMMPDZrribk 2395 +VFIXUPIMMPDZrribkz 2396 +VFIXUPIMMPDZrrik 2397 +VFIXUPIMMPDZrrikz 2398 +VFIXUPIMMPSZ 2399 +VFIXUPIMMPSZrmbi 2400 +VFIXUPIMMPSZrmbik 2401 +VFIXUPIMMPSZrmbikz 2402 +VFIXUPIMMPSZrmi 2403 +VFIXUPIMMPSZrmik 2404 +VFIXUPIMMPSZrmikz 2405 +VFIXUPIMMPSZrri 2406 +VFIXUPIMMPSZrrib 2407 +VFIXUPIMMPSZrribk 2408 +VFIXUPIMMPSZrribkz 2409 +VFIXUPIMMPSZrrik 2410 +VFIXUPIMMPSZrrikz 2411 +VFIXUPIMMSDZrmi 2412 +VFIXUPIMMSDZrmik 2413 +VFIXUPIMMSDZrmikz 2414 +VFIXUPIMMSDZrri 2415 +VFIXUPIMMSDZrrib 2416 +VFIXUPIMMSDZrribk 2417 +VFIXUPIMMSDZrribkz 2418 +VFIXUPIMMSDZrrik 2419 +VFIXUPIMMSDZrrikz 2420 +VFIXUPIMMSSZrmi 2421 +VFIXUPIMMSSZrmik 2422 +VFIXUPIMMSSZrmikz 2423 +VFIXUPIMMSSZrri 2424 +VFIXUPIMMSSZrrib 2425 +VFIXUPIMMSSZrribk 2426 +VFIXUPIMMSSZrribkz 2427 +VFIXUPIMMSSZrrik 2428 +VFIXUPIMMSSZrrikz 2429 +VFMADD 2430 +VFMADDCPHZ 2431 +VFMADDCPHZm 2432 +VFMADDCPHZmb 2433 +VFMADDCPHZmbk 2434 +VFMADDCPHZmbkz 2435 +VFMADDCPHZmk 2436 +VFMADDCPHZmkz 2437 +VFMADDCPHZr 2438 +VFMADDCPHZrb 2439 +VFMADDCPHZrbk 2440 +VFMADDCPHZrbkz 2441 +VFMADDCPHZrk 2442 +VFMADDCPHZrkz 2443 +VFMADDCSHZm 2444 +VFMADDCSHZmk 2445 +VFMADDCSHZmkz 2446 +VFMADDCSHZr 2447 +VFMADDCSHZrb 2448 +VFMADDCSHZrbk 2449 +VFMADDCSHZrbkz 2450 +VFMADDCSHZrk 2451 +VFMADDCSHZrkz 2452 +VFMADDPD 2453 +VFMADDPS 2454 +VFMADDSD 2455 +VFMADDSS 2456 +VFMADDSUB 2457 +VFMADDSUBPD 2458 +VFMADDSUBPS 2459 +VFMSUB 2460 +VFMSUBADD 2461 +VFMSUBADDPD 2462 +VFMSUBADDPS 2463 +VFMSUBPD 2464 +VFMSUBPS 2465 +VFMSUBSD 2466 +VFMSUBSS 2467 +VFMULCPHZ 2468 +VFMULCPHZrm 2469 +VFMULCPHZrmb 2470 +VFMULCPHZrmbk 2471 +VFMULCPHZrmbkz 2472 +VFMULCPHZrmk 2473 +VFMULCPHZrmkz 2474 +VFMULCPHZrr 2475 +VFMULCPHZrrb 2476 +VFMULCPHZrrbk 2477 +VFMULCPHZrrbkz 2478 +VFMULCPHZrrk 2479 +VFMULCPHZrrkz 2480 +VFMULCSHZrm 2481 +VFMULCSHZrmk 2482 +VFMULCSHZrmkz 2483 +VFMULCSHZrr 2484 +VFMULCSHZrrb 2485 +VFMULCSHZrrbk 2486 +VFMULCSHZrrbkz 2487 +VFMULCSHZrrk 2488 +VFMULCSHZrrkz 2489 +VFNMADD 2490 +VFNMADDPD 2491 +VFNMADDPS 2492 +VFNMADDSD 2493 +VFNMADDSS 2494 +VFNMSUB 2495 +VFNMSUBPD 2496 +VFNMSUBPS 2497 +VFNMSUBSD 2498 +VFNMSUBSS 2499 +VFPCLASSBF 2500 +VFPCLASSPDZ 2501 +VFPCLASSPDZmbi 2502 +VFPCLASSPDZmbik 2503 +VFPCLASSPDZmi 2504 +VFPCLASSPDZmik 2505 +VFPCLASSPDZri 2506 +VFPCLASSPDZrik 2507 +VFPCLASSPHZ 2508 +VFPCLASSPHZmbi 2509 +VFPCLASSPHZmbik 2510 +VFPCLASSPHZmi 2511 +VFPCLASSPHZmik 2512 +VFPCLASSPHZri 2513 +VFPCLASSPHZrik 2514 +VFPCLASSPSZ 2515 +VFPCLASSPSZmbi 2516 +VFPCLASSPSZmbik 2517 +VFPCLASSPSZmi 2518 +VFPCLASSPSZmik 2519 +VFPCLASSPSZri 2520 +VFPCLASSPSZrik 2521 +VFPCLASSSDZmi 2522 +VFPCLASSSDZmik 2523 +VFPCLASSSDZri 2524 +VFPCLASSSDZrik 2525 +VFPCLASSSHZmi 2526 +VFPCLASSSHZmik 2527 +VFPCLASSSHZri 2528 +VFPCLASSSHZrik 2529 +VFPCLASSSSZmi 2530 +VFPCLASSSSZmik 2531 +VFPCLASSSSZri 2532 +VFPCLASSSSZrik 2533 +VFRCZPDYrm 2534 +VFRCZPDYrr 2535 +VFRCZPDrm 2536 +VFRCZPDrr 2537 +VFRCZPSYrm 2538 +VFRCZPSYrr 2539 +VFRCZPSrm 2540 +VFRCZPSrr 2541 +VFRCZSDrm 2542 +VFRCZSDrr 2543 +VFRCZSSrm 2544 +VFRCZSSrr 2545 +VGATHERDPDYrm 2546 +VGATHERDPDZ 2547 +VGATHERDPDZrm 2548 +VGATHERDPDrm 2549 +VGATHERDPSYrm 2550 +VGATHERDPSZ 2551 +VGATHERDPSZrm 2552 +VGATHERDPSrm 2553 +VGATHERPF 2554 +VGATHERQPDYrm 2555 +VGATHERQPDZ 2556 +VGATHERQPDZrm 2557 +VGATHERQPDrm 2558 +VGATHERQPSYrm 2559 +VGATHERQPSZ 2560 +VGATHERQPSZrm 2561 +VGATHERQPSrm 2562 +VGETEXPBF 2563 +VGETEXPPDZ 2564 +VGETEXPPDZm 2565 +VGETEXPPDZmb 2566 +VGETEXPPDZmbk 2567 +VGETEXPPDZmbkz 2568 +VGETEXPPDZmk 2569 +VGETEXPPDZmkz 2570 +VGETEXPPDZr 2571 +VGETEXPPDZrb 2572 +VGETEXPPDZrbk 
2573 +VGETEXPPDZrbkz 2574 +VGETEXPPDZrk 2575 +VGETEXPPDZrkz 2576 +VGETEXPPHZ 2577 +VGETEXPPHZm 2578 +VGETEXPPHZmb 2579 +VGETEXPPHZmbk 2580 +VGETEXPPHZmbkz 2581 +VGETEXPPHZmk 2582 +VGETEXPPHZmkz 2583 +VGETEXPPHZr 2584 +VGETEXPPHZrb 2585 +VGETEXPPHZrbk 2586 +VGETEXPPHZrbkz 2587 +VGETEXPPHZrk 2588 +VGETEXPPHZrkz 2589 +VGETEXPPSZ 2590 +VGETEXPPSZm 2591 +VGETEXPPSZmb 2592 +VGETEXPPSZmbk 2593 +VGETEXPPSZmbkz 2594 +VGETEXPPSZmk 2595 +VGETEXPPSZmkz 2596 +VGETEXPPSZr 2597 +VGETEXPPSZrb 2598 +VGETEXPPSZrbk 2599 +VGETEXPPSZrbkz 2600 +VGETEXPPSZrk 2601 +VGETEXPPSZrkz 2602 +VGETEXPSDZm 2603 +VGETEXPSDZmk 2604 +VGETEXPSDZmkz 2605 +VGETEXPSDZr 2606 +VGETEXPSDZrb 2607 +VGETEXPSDZrbk 2608 +VGETEXPSDZrbkz 2609 +VGETEXPSDZrk 2610 +VGETEXPSDZrkz 2611 +VGETEXPSHZm 2612 +VGETEXPSHZmk 2613 +VGETEXPSHZmkz 2614 +VGETEXPSHZr 2615 +VGETEXPSHZrb 2616 +VGETEXPSHZrbk 2617 +VGETEXPSHZrbkz 2618 +VGETEXPSHZrk 2619 +VGETEXPSHZrkz 2620 +VGETEXPSSZm 2621 +VGETEXPSSZmk 2622 +VGETEXPSSZmkz 2623 +VGETEXPSSZr 2624 +VGETEXPSSZrb 2625 +VGETEXPSSZrbk 2626 +VGETEXPSSZrbkz 2627 +VGETEXPSSZrk 2628 +VGETEXPSSZrkz 2629 +VGETMANTBF 2630 +VGETMANTPDZ 2631 +VGETMANTPDZrmbi 2632 +VGETMANTPDZrmbik 2633 +VGETMANTPDZrmbikz 2634 +VGETMANTPDZrmi 2635 +VGETMANTPDZrmik 2636 +VGETMANTPDZrmikz 2637 +VGETMANTPDZrri 2638 +VGETMANTPDZrrib 2639 +VGETMANTPDZrribk 2640 +VGETMANTPDZrribkz 2641 +VGETMANTPDZrrik 2642 +VGETMANTPDZrrikz 2643 +VGETMANTPHZ 2644 +VGETMANTPHZrmbi 2645 +VGETMANTPHZrmbik 2646 +VGETMANTPHZrmbikz 2647 +VGETMANTPHZrmi 2648 +VGETMANTPHZrmik 2649 +VGETMANTPHZrmikz 2650 +VGETMANTPHZrri 2651 +VGETMANTPHZrrib 2652 +VGETMANTPHZrribk 2653 +VGETMANTPHZrribkz 2654 +VGETMANTPHZrrik 2655 +VGETMANTPHZrrikz 2656 +VGETMANTPSZ 2657 +VGETMANTPSZrmbi 2658 +VGETMANTPSZrmbik 2659 +VGETMANTPSZrmbikz 2660 +VGETMANTPSZrmi 2661 +VGETMANTPSZrmik 2662 +VGETMANTPSZrmikz 2663 +VGETMANTPSZrri 2664 +VGETMANTPSZrrib 2665 +VGETMANTPSZrribk 2666 +VGETMANTPSZrribkz 2667 +VGETMANTPSZrrik 2668 +VGETMANTPSZrrikz 2669 +VGETMANTSDZrmi 2670 +VGETMANTSDZrmik 2671 +VGETMANTSDZrmikz 2672 +VGETMANTSDZrri 2673 +VGETMANTSDZrrib 2674 +VGETMANTSDZrribk 2675 +VGETMANTSDZrribkz 2676 +VGETMANTSDZrrik 2677 +VGETMANTSDZrrikz 2678 +VGETMANTSHZrmi 2679 +VGETMANTSHZrmik 2680 +VGETMANTSHZrmikz 2681 +VGETMANTSHZrri 2682 +VGETMANTSHZrrib 2683 +VGETMANTSHZrribk 2684 +VGETMANTSHZrribkz 2685 +VGETMANTSHZrrik 2686 +VGETMANTSHZrrikz 2687 +VGETMANTSSZrmi 2688 +VGETMANTSSZrmik 2689 +VGETMANTSSZrmikz 2690 +VGETMANTSSZrri 2691 +VGETMANTSSZrrib 2692 +VGETMANTSSZrribk 2693 +VGETMANTSSZrribkz 2694 +VGETMANTSSZrrik 2695 +VGETMANTSSZrrikz 2696 +VGF 2697 +VHADDPDYrm 2698 +VHADDPDYrr 2699 +VHADDPDrm 2700 +VHADDPDrr 2701 +VHADDPSYrm 2702 +VHADDPSYrr 2703 +VHADDPSrm 2704 +VHADDPSrr 2705 +VHSUBPDYrm 2706 +VHSUBPDYrr 2707 +VHSUBPDrm 2708 +VHSUBPDrr 2709 +VHSUBPSYrm 2710 +VHSUBPSYrr 2711 +VHSUBPSrm 2712 +VHSUBPSrr 2713 +VINSERTF 2714 +VINSERTI 2715 +VINSERTPSZrmi 2716 +VINSERTPSZrri 2717 +VINSERTPSrmi 2718 +VINSERTPSrri 2719 +VLDDQUYrm 2720 +VLDDQUrm 2721 +VLDMXCSR 2722 +VMASKMOVDQU 2723 +VMASKMOVPDYmr 2724 +VMASKMOVPDYrm 2725 +VMASKMOVPDmr 2726 +VMASKMOVPDrm 2727 +VMASKMOVPSYmr 2728 +VMASKMOVPSYrm 2729 +VMASKMOVPSmr 2730 +VMASKMOVPSrm 2731 +VMAXBF 2732 +VMAXCPDYrm 2733 +VMAXCPDYrr 2734 +VMAXCPDZ 2735 +VMAXCPDZrm 2736 +VMAXCPDZrmb 2737 +VMAXCPDZrmbk 2738 +VMAXCPDZrmbkz 2739 +VMAXCPDZrmk 2740 +VMAXCPDZrmkz 2741 +VMAXCPDZrr 2742 +VMAXCPDZrrk 2743 +VMAXCPDZrrkz 2744 +VMAXCPDrm 2745 +VMAXCPDrr 2746 +VMAXCPHZ 2747 +VMAXCPHZrm 2748 +VMAXCPHZrmb 2749 +VMAXCPHZrmbk 2750 +VMAXCPHZrmbkz 2751 +VMAXCPHZrmk 2752 +VMAXCPHZrmkz 
2753 +VMAXCPHZrr 2754 +VMAXCPHZrrk 2755 +VMAXCPHZrrkz 2756 +VMAXCPSYrm 2757 +VMAXCPSYrr 2758 +VMAXCPSZ 2759 +VMAXCPSZrm 2760 +VMAXCPSZrmb 2761 +VMAXCPSZrmbk 2762 +VMAXCPSZrmbkz 2763 +VMAXCPSZrmk 2764 +VMAXCPSZrmkz 2765 +VMAXCPSZrr 2766 +VMAXCPSZrrk 2767 +VMAXCPSZrrkz 2768 +VMAXCPSrm 2769 +VMAXCPSrr 2770 +VMAXCSDZrm 2771 +VMAXCSDZrr 2772 +VMAXCSDrm 2773 +VMAXCSDrr 2774 +VMAXCSHZrm 2775 +VMAXCSHZrr 2776 +VMAXCSSZrm 2777 +VMAXCSSZrr 2778 +VMAXCSSrm 2779 +VMAXCSSrr 2780 +VMAXPDYrm 2781 +VMAXPDYrr 2782 +VMAXPDZ 2783 +VMAXPDZrm 2784 +VMAXPDZrmb 2785 +VMAXPDZrmbk 2786 +VMAXPDZrmbkz 2787 +VMAXPDZrmk 2788 +VMAXPDZrmkz 2789 +VMAXPDZrr 2790 +VMAXPDZrrb 2791 +VMAXPDZrrbk 2792 +VMAXPDZrrbkz 2793 +VMAXPDZrrk 2794 +VMAXPDZrrkz 2795 +VMAXPDrm 2796 +VMAXPDrr 2797 +VMAXPHZ 2798 +VMAXPHZrm 2799 +VMAXPHZrmb 2800 +VMAXPHZrmbk 2801 +VMAXPHZrmbkz 2802 +VMAXPHZrmk 2803 +VMAXPHZrmkz 2804 +VMAXPHZrr 2805 +VMAXPHZrrb 2806 +VMAXPHZrrbk 2807 +VMAXPHZrrbkz 2808 +VMAXPHZrrk 2809 +VMAXPHZrrkz 2810 +VMAXPSYrm 2811 +VMAXPSYrr 2812 +VMAXPSZ 2813 +VMAXPSZrm 2814 +VMAXPSZrmb 2815 +VMAXPSZrmbk 2816 +VMAXPSZrmbkz 2817 +VMAXPSZrmk 2818 +VMAXPSZrmkz 2819 +VMAXPSZrr 2820 +VMAXPSZrrb 2821 +VMAXPSZrrbk 2822 +VMAXPSZrrbkz 2823 +VMAXPSZrrk 2824 +VMAXPSZrrkz 2825 +VMAXPSrm 2826 +VMAXPSrr 2827 +VMAXSDZrm 2828 +VMAXSDZrm_Int 2829 +VMAXSDZrmk_Int 2830 +VMAXSDZrmkz_Int 2831 +VMAXSDZrr 2832 +VMAXSDZrr_Int 2833 +VMAXSDZrrb_Int 2834 +VMAXSDZrrbk_Int 2835 +VMAXSDZrrbkz_Int 2836 +VMAXSDZrrk_Int 2837 +VMAXSDZrrkz_Int 2838 +VMAXSDrm 2839 +VMAXSDrm_Int 2840 +VMAXSDrr 2841 +VMAXSDrr_Int 2842 +VMAXSHZrm 2843 +VMAXSHZrm_Int 2844 +VMAXSHZrmk_Int 2845 +VMAXSHZrmkz_Int 2846 +VMAXSHZrr 2847 +VMAXSHZrr_Int 2848 +VMAXSHZrrb_Int 2849 +VMAXSHZrrbk_Int 2850 +VMAXSHZrrbkz_Int 2851 +VMAXSHZrrk_Int 2852 +VMAXSHZrrkz_Int 2853 +VMAXSSZrm 2854 +VMAXSSZrm_Int 2855 +VMAXSSZrmk_Int 2856 +VMAXSSZrmkz_Int 2857 +VMAXSSZrr 2858 +VMAXSSZrr_Int 2859 +VMAXSSZrrb_Int 2860 +VMAXSSZrrbk_Int 2861 +VMAXSSZrrbkz_Int 2862 +VMAXSSZrrk_Int 2863 +VMAXSSZrrkz_Int 2864 +VMAXSSrm 2865 +VMAXSSrm_Int 2866 +VMAXSSrr 2867 +VMAXSSrr_Int 2868 +VMCALL 2869 +VMCLEARm 2870 +VMFUNC 2871 +VMINBF 2872 +VMINCPDYrm 2873 +VMINCPDYrr 2874 +VMINCPDZ 2875 +VMINCPDZrm 2876 +VMINCPDZrmb 2877 +VMINCPDZrmbk 2878 +VMINCPDZrmbkz 2879 +VMINCPDZrmk 2880 +VMINCPDZrmkz 2881 +VMINCPDZrr 2882 +VMINCPDZrrk 2883 +VMINCPDZrrkz 2884 +VMINCPDrm 2885 +VMINCPDrr 2886 +VMINCPHZ 2887 +VMINCPHZrm 2888 +VMINCPHZrmb 2889 +VMINCPHZrmbk 2890 +VMINCPHZrmbkz 2891 +VMINCPHZrmk 2892 +VMINCPHZrmkz 2893 +VMINCPHZrr 2894 +VMINCPHZrrk 2895 +VMINCPHZrrkz 2896 +VMINCPSYrm 2897 +VMINCPSYrr 2898 +VMINCPSZ 2899 +VMINCPSZrm 2900 +VMINCPSZrmb 2901 +VMINCPSZrmbk 2902 +VMINCPSZrmbkz 2903 +VMINCPSZrmk 2904 +VMINCPSZrmkz 2905 +VMINCPSZrr 2906 +VMINCPSZrrk 2907 +VMINCPSZrrkz 2908 +VMINCPSrm 2909 +VMINCPSrr 2910 +VMINCSDZrm 2911 +VMINCSDZrr 2912 +VMINCSDrm 2913 +VMINCSDrr 2914 +VMINCSHZrm 2915 +VMINCSHZrr 2916 +VMINCSSZrm 2917 +VMINCSSZrr 2918 +VMINCSSrm 2919 +VMINCSSrr 2920 +VMINMAXBF 2921 +VMINMAXPDZ 2922 +VMINMAXPDZrmbi 2923 +VMINMAXPDZrmbik 2924 +VMINMAXPDZrmbikz 2925 +VMINMAXPDZrmi 2926 +VMINMAXPDZrmik 2927 +VMINMAXPDZrmikz 2928 +VMINMAXPDZrri 2929 +VMINMAXPDZrrib 2930 +VMINMAXPDZrribk 2931 +VMINMAXPDZrribkz 2932 +VMINMAXPDZrrik 2933 +VMINMAXPDZrrikz 2934 +VMINMAXPHZ 2935 +VMINMAXPHZrmbi 2936 +VMINMAXPHZrmbik 2937 +VMINMAXPHZrmbikz 2938 +VMINMAXPHZrmi 2939 +VMINMAXPHZrmik 2940 +VMINMAXPHZrmikz 2941 +VMINMAXPHZrri 2942 +VMINMAXPHZrrib 2943 +VMINMAXPHZrribk 2944 +VMINMAXPHZrribkz 2945 +VMINMAXPHZrrik 2946 +VMINMAXPHZrrikz 2947 +VMINMAXPSZ 2948 
+VMINMAXPSZrmbi 2949 +VMINMAXPSZrmbik 2950 +VMINMAXPSZrmbikz 2951 +VMINMAXPSZrmi 2952 +VMINMAXPSZrmik 2953 +VMINMAXPSZrmikz 2954 +VMINMAXPSZrri 2955 +VMINMAXPSZrrib 2956 +VMINMAXPSZrribk 2957 +VMINMAXPSZrribkz 2958 +VMINMAXPSZrrik 2959 +VMINMAXPSZrrikz 2960 +VMINMAXSDrmi 2961 +VMINMAXSDrmi_Int 2962 +VMINMAXSDrmik_Int 2963 +VMINMAXSDrmikz_Int 2964 +VMINMAXSDrri 2965 +VMINMAXSDrri_Int 2966 +VMINMAXSDrrib_Int 2967 +VMINMAXSDrribk_Int 2968 +VMINMAXSDrribkz_Int 2969 +VMINMAXSDrrik_Int 2970 +VMINMAXSDrrikz_Int 2971 +VMINMAXSHrmi 2972 +VMINMAXSHrmi_Int 2973 +VMINMAXSHrmik_Int 2974 +VMINMAXSHrmikz_Int 2975 +VMINMAXSHrri 2976 +VMINMAXSHrri_Int 2977 +VMINMAXSHrrib_Int 2978 +VMINMAXSHrribk_Int 2979 +VMINMAXSHrribkz_Int 2980 +VMINMAXSHrrik_Int 2981 +VMINMAXSHrrikz_Int 2982 +VMINMAXSSrmi 2983 +VMINMAXSSrmi_Int 2984 +VMINMAXSSrmik_Int 2985 +VMINMAXSSrmikz_Int 2986 +VMINMAXSSrri 2987 +VMINMAXSSrri_Int 2988 +VMINMAXSSrrib_Int 2989 +VMINMAXSSrribk_Int 2990 +VMINMAXSSrribkz_Int 2991 +VMINMAXSSrrik_Int 2992 +VMINMAXSSrrikz_Int 2993 +VMINPDYrm 2994 +VMINPDYrr 2995 +VMINPDZ 2996 +VMINPDZrm 2997 +VMINPDZrmb 2998 +VMINPDZrmbk 2999 +VMINPDZrmbkz 3000 +VMINPDZrmk 3001 +VMINPDZrmkz 3002 +VMINPDZrr 3003 +VMINPDZrrb 3004 +VMINPDZrrbk 3005 +VMINPDZrrbkz 3006 +VMINPDZrrk 3007 +VMINPDZrrkz 3008 +VMINPDrm 3009 +VMINPDrr 3010 +VMINPHZ 3011 +VMINPHZrm 3012 +VMINPHZrmb 3013 +VMINPHZrmbk 3014 +VMINPHZrmbkz 3015 +VMINPHZrmk 3016 +VMINPHZrmkz 3017 +VMINPHZrr 3018 +VMINPHZrrb 3019 +VMINPHZrrbk 3020 +VMINPHZrrbkz 3021 +VMINPHZrrk 3022 +VMINPHZrrkz 3023 +VMINPSYrm 3024 +VMINPSYrr 3025 +VMINPSZ 3026 +VMINPSZrm 3027 +VMINPSZrmb 3028 +VMINPSZrmbk 3029 +VMINPSZrmbkz 3030 +VMINPSZrmk 3031 +VMINPSZrmkz 3032 +VMINPSZrr 3033 +VMINPSZrrb 3034 +VMINPSZrrbk 3035 +VMINPSZrrbkz 3036 +VMINPSZrrk 3037 +VMINPSZrrkz 3038 +VMINPSrm 3039 +VMINPSrr 3040 +VMINSDZrm 3041 +VMINSDZrm_Int 3042 +VMINSDZrmk_Int 3043 +VMINSDZrmkz_Int 3044 +VMINSDZrr 3045 +VMINSDZrr_Int 3046 +VMINSDZrrb_Int 3047 +VMINSDZrrbk_Int 3048 +VMINSDZrrbkz_Int 3049 +VMINSDZrrk_Int 3050 +VMINSDZrrkz_Int 3051 +VMINSDrm 3052 +VMINSDrm_Int 3053 +VMINSDrr 3054 +VMINSDrr_Int 3055 +VMINSHZrm 3056 +VMINSHZrm_Int 3057 +VMINSHZrmk_Int 3058 +VMINSHZrmkz_Int 3059 +VMINSHZrr 3060 +VMINSHZrr_Int 3061 +VMINSHZrrb_Int 3062 +VMINSHZrrbk_Int 3063 +VMINSHZrrbkz_Int 3064 +VMINSHZrrk_Int 3065 +VMINSHZrrkz_Int 3066 +VMINSSZrm 3067 +VMINSSZrm_Int 3068 +VMINSSZrmk_Int 3069 +VMINSSZrmkz_Int 3070 +VMINSSZrr 3071 +VMINSSZrr_Int 3072 +VMINSSZrrb_Int 3073 +VMINSSZrrbk_Int 3074 +VMINSSZrrbkz_Int 3075 +VMINSSZrrk_Int 3076 +VMINSSZrrkz_Int 3077 +VMINSSrm 3078 +VMINSSrm_Int 3079 +VMINSSrr 3080 +VMINSSrr_Int 3081 +VMLAUNCH 3082 +VMLOAD 3083 +VMMCALL 3084 +VMOV 3085 +VMOVAPDYmr 3086 +VMOVAPDYrm 3087 +VMOVAPDYrr 3088 +VMOVAPDYrr_REV 3089 +VMOVAPDZ 3090 +VMOVAPDZmr 3091 +VMOVAPDZmrk 3092 +VMOVAPDZrm 3093 +VMOVAPDZrmk 3094 +VMOVAPDZrmkz 3095 +VMOVAPDZrr 3096 +VMOVAPDZrr_REV 3097 +VMOVAPDZrrk 3098 +VMOVAPDZrrk_REV 3099 +VMOVAPDZrrkz 3100 +VMOVAPDZrrkz_REV 3101 +VMOVAPDmr 3102 +VMOVAPDrm 3103 +VMOVAPDrr 3104 +VMOVAPDrr_REV 3105 +VMOVAPSYmr 3106 +VMOVAPSYrm 3107 +VMOVAPSYrr 3108 +VMOVAPSYrr_REV 3109 +VMOVAPSZ 3110 +VMOVAPSZmr 3111 +VMOVAPSZmrk 3112 +VMOVAPSZrm 3113 +VMOVAPSZrmk 3114 +VMOVAPSZrmkz 3115 +VMOVAPSZrr 3116 +VMOVAPSZrr_REV 3117 +VMOVAPSZrrk 3118 +VMOVAPSZrrk_REV 3119 +VMOVAPSZrrkz 3120 +VMOVAPSZrrkz_REV 3121 +VMOVAPSmr 3122 +VMOVAPSrm 3123 +VMOVAPSrr 3124 +VMOVAPSrr_REV 3125 +VMOVDDUPYrm 3126 +VMOVDDUPYrr 3127 +VMOVDDUPZ 3128 +VMOVDDUPZrm 3129 +VMOVDDUPZrmk 3130 +VMOVDDUPZrmkz 3131 +VMOVDDUPZrr 3132 +VMOVDDUPZrrk 
3133 +VMOVDDUPZrrkz 3134 +VMOVDDUPrm 3135 +VMOVDDUPrr 3136 +VMOVDI 3137 +VMOVDQA 3138 +VMOVDQAYmr 3139 +VMOVDQAYrm 3140 +VMOVDQAYrr 3141 +VMOVDQAYrr_REV 3142 +VMOVDQAmr 3143 +VMOVDQArm 3144 +VMOVDQArr 3145 +VMOVDQArr_REV 3146 +VMOVDQU 3147 +VMOVDQUYmr 3148 +VMOVDQUYrm 3149 +VMOVDQUYrr 3150 +VMOVDQUYrr_REV 3151 +VMOVDQUmr 3152 +VMOVDQUrm 3153 +VMOVDQUrr 3154 +VMOVDQUrr_REV 3155 +VMOVHLPSZrr 3156 +VMOVHLPSrr 3157 +VMOVHPDZ 3158 +VMOVHPDmr 3159 +VMOVHPDrm 3160 +VMOVHPSZ 3161 +VMOVHPSmr 3162 +VMOVHPSrm 3163 +VMOVLHPSZrr 3164 +VMOVLHPSrr 3165 +VMOVLPDZ 3166 +VMOVLPDmr 3167 +VMOVLPDrm 3168 +VMOVLPSZ 3169 +VMOVLPSmr 3170 +VMOVLPSrm 3171 +VMOVMSKPDYrr 3172 +VMOVMSKPDrr 3173 +VMOVMSKPSYrr 3174 +VMOVMSKPSrr 3175 +VMOVNTDQAYrm 3176 +VMOVNTDQAZ 3177 +VMOVNTDQAZrm 3178 +VMOVNTDQArm 3179 +VMOVNTDQYmr 3180 +VMOVNTDQZ 3181 +VMOVNTDQZmr 3182 +VMOVNTDQmr 3183 +VMOVNTPDYmr 3184 +VMOVNTPDZ 3185 +VMOVNTPDZmr 3186 +VMOVNTPDmr 3187 +VMOVNTPSYmr 3188 +VMOVNTPSZ 3189 +VMOVNTPSZmr 3190 +VMOVNTPSmr 3191 +VMOVPDI 3192 +VMOVPQI 3193 +VMOVPQIto 3194 +VMOVQI 3195 +VMOVRSBZ 3196 +VMOVRSBZm 3197 +VMOVRSBZmk 3198 +VMOVRSBZmkz 3199 +VMOVRSDZ 3200 +VMOVRSDZm 3201 +VMOVRSDZmk 3202 +VMOVRSDZmkz 3203 +VMOVRSQZ 3204 +VMOVRSQZm 3205 +VMOVRSQZmk 3206 +VMOVRSQZmkz 3207 +VMOVRSWZ 3208 +VMOVRSWZm 3209 +VMOVRSWZmk 3210 +VMOVRSWZmkz 3211 +VMOVSDZmr 3212 +VMOVSDZmrk 3213 +VMOVSDZrm 3214 +VMOVSDZrm_alt 3215 +VMOVSDZrmk 3216 +VMOVSDZrmkz 3217 +VMOVSDZrr 3218 +VMOVSDZrr_REV 3219 +VMOVSDZrrk 3220 +VMOVSDZrrk_REV 3221 +VMOVSDZrrkz 3222 +VMOVSDZrrkz_REV 3223 +VMOVSDmr 3224 +VMOVSDrm 3225 +VMOVSDrm_alt 3226 +VMOVSDrr 3227 +VMOVSDrr_REV 3228 +VMOVSDto 3229 +VMOVSH 3230 +VMOVSHDUPYrm 3231 +VMOVSHDUPYrr 3232 +VMOVSHDUPZ 3233 +VMOVSHDUPZrm 3234 +VMOVSHDUPZrmk 3235 +VMOVSHDUPZrmkz 3236 +VMOVSHDUPZrr 3237 +VMOVSHDUPZrrk 3238 +VMOVSHDUPZrrkz 3239 +VMOVSHDUPrm 3240 +VMOVSHDUPrr 3241 +VMOVSHZmr 3242 +VMOVSHZmrk 3243 +VMOVSHZrm 3244 +VMOVSHZrm_alt 3245 +VMOVSHZrmk 3246 +VMOVSHZrmkz 3247 +VMOVSHZrr 3248 +VMOVSHZrr_REV 3249 +VMOVSHZrrk 3250 +VMOVSHZrrk_REV 3251 +VMOVSHZrrkz 3252 +VMOVSHZrrkz_REV 3253 +VMOVSHtoW 3254 +VMOVSLDUPYrm 3255 +VMOVSLDUPYrr 3256 +VMOVSLDUPZ 3257 +VMOVSLDUPZrm 3258 +VMOVSLDUPZrmk 3259 +VMOVSLDUPZrmkz 3260 +VMOVSLDUPZrr 3261 +VMOVSLDUPZrrk 3262 +VMOVSLDUPZrrkz 3263 +VMOVSLDUPrm 3264 +VMOVSLDUPrr 3265 +VMOVSS 3266 +VMOVSSZmr 3267 +VMOVSSZmrk 3268 +VMOVSSZrm 3269 +VMOVSSZrm_alt 3270 +VMOVSSZrmk 3271 +VMOVSSZrmkz 3272 +VMOVSSZrr 3273 +VMOVSSZrr_REV 3274 +VMOVSSZrrk 3275 +VMOVSSZrrk_REV 3276 +VMOVSSZrrkz 3277 +VMOVSSZrrkz_REV 3278 +VMOVSSmr 3279 +VMOVSSrm 3280 +VMOVSSrm_alt 3281 +VMOVSSrr 3282 +VMOVSSrr_REV 3283 +VMOVUPDYmr 3284 +VMOVUPDYrm 3285 +VMOVUPDYrr 3286 +VMOVUPDYrr_REV 3287 +VMOVUPDZ 3288 +VMOVUPDZmr 3289 +VMOVUPDZmrk 3290 +VMOVUPDZrm 3291 +VMOVUPDZrmk 3292 +VMOVUPDZrmkz 3293 +VMOVUPDZrr 3294 +VMOVUPDZrr_REV 3295 +VMOVUPDZrrk 3296 +VMOVUPDZrrk_REV 3297 +VMOVUPDZrrkz 3298 +VMOVUPDZrrkz_REV 3299 +VMOVUPDmr 3300 +VMOVUPDrm 3301 +VMOVUPDrr 3302 +VMOVUPDrr_REV 3303 +VMOVUPSYmr 3304 +VMOVUPSYrm 3305 +VMOVUPSYrr 3306 +VMOVUPSYrr_REV 3307 +VMOVUPSZ 3308 +VMOVUPSZmr 3309 +VMOVUPSZmrk 3310 +VMOVUPSZrm 3311 +VMOVUPSZrmk 3312 +VMOVUPSZrmkz 3313 +VMOVUPSZrr 3314 +VMOVUPSZrr_REV 3315 +VMOVUPSZrrk 3316 +VMOVUPSZrrk_REV 3317 +VMOVUPSZrrkz 3318 +VMOVUPSZrrkz_REV 3319 +VMOVUPSmr 3320 +VMOVUPSrm 3321 +VMOVUPSrr 3322 +VMOVUPSrr_REV 3323 +VMOVW 3324 +VMOVWmr 3325 +VMOVWrm 3326 +VMOVZPDILo 3327 +VMOVZPQILo 3328 +VMOVZPWILo 3329 +VMPSADBWYrmi 3330 +VMPSADBWYrri 3331 +VMPSADBWZ 3332 +VMPSADBWZrmi 3333 +VMPSADBWZrmik 3334 +VMPSADBWZrmikz 3335 
+VMPSADBWZrri 3336 +VMPSADBWZrrik 3337 +VMPSADBWZrrikz 3338 +VMPSADBWrmi 3339 +VMPSADBWrri 3340 +VMPTRLDm 3341 +VMPTRSTm 3342 +VMREAD 3343 +VMRESUME 3344 +VMRUN 3345 +VMSAVE 3346 +VMULBF 3347 +VMULPDYrm 3348 +VMULPDYrr 3349 +VMULPDZ 3350 +VMULPDZrm 3351 +VMULPDZrmb 3352 +VMULPDZrmbk 3353 +VMULPDZrmbkz 3354 +VMULPDZrmk 3355 +VMULPDZrmkz 3356 +VMULPDZrr 3357 +VMULPDZrrb 3358 +VMULPDZrrbk 3359 +VMULPDZrrbkz 3360 +VMULPDZrrk 3361 +VMULPDZrrkz 3362 +VMULPDrm 3363 +VMULPDrr 3364 +VMULPHZ 3365 +VMULPHZrm 3366 +VMULPHZrmb 3367 +VMULPHZrmbk 3368 +VMULPHZrmbkz 3369 +VMULPHZrmk 3370 +VMULPHZrmkz 3371 +VMULPHZrr 3372 +VMULPHZrrb 3373 +VMULPHZrrbk 3374 +VMULPHZrrbkz 3375 +VMULPHZrrk 3376 +VMULPHZrrkz 3377 +VMULPSYrm 3378 +VMULPSYrr 3379 +VMULPSZ 3380 +VMULPSZrm 3381 +VMULPSZrmb 3382 +VMULPSZrmbk 3383 +VMULPSZrmbkz 3384 +VMULPSZrmk 3385 +VMULPSZrmkz 3386 +VMULPSZrr 3387 +VMULPSZrrb 3388 +VMULPSZrrbk 3389 +VMULPSZrrbkz 3390 +VMULPSZrrk 3391 +VMULPSZrrkz 3392 +VMULPSrm 3393 +VMULPSrr 3394 +VMULSDZrm 3395 +VMULSDZrm_Int 3396 +VMULSDZrmk_Int 3397 +VMULSDZrmkz_Int 3398 +VMULSDZrr 3399 +VMULSDZrr_Int 3400 +VMULSDZrrb_Int 3401 +VMULSDZrrbk_Int 3402 +VMULSDZrrbkz_Int 3403 +VMULSDZrrk_Int 3404 +VMULSDZrrkz_Int 3405 +VMULSDrm 3406 +VMULSDrm_Int 3407 +VMULSDrr 3408 +VMULSDrr_Int 3409 +VMULSHZrm 3410 +VMULSHZrm_Int 3411 +VMULSHZrmk_Int 3412 +VMULSHZrmkz_Int 3413 +VMULSHZrr 3414 +VMULSHZrr_Int 3415 +VMULSHZrrb_Int 3416 +VMULSHZrrbk_Int 3417 +VMULSHZrrbkz_Int 3418 +VMULSHZrrk_Int 3419 +VMULSHZrrkz_Int 3420 +VMULSSZrm 3421 +VMULSSZrm_Int 3422 +VMULSSZrmk_Int 3423 +VMULSSZrmkz_Int 3424 +VMULSSZrr 3425 +VMULSSZrr_Int 3426 +VMULSSZrrb_Int 3427 +VMULSSZrrbk_Int 3428 +VMULSSZrrbkz_Int 3429 +VMULSSZrrk_Int 3430 +VMULSSZrrkz_Int 3431 +VMULSSrm 3432 +VMULSSrm_Int 3433 +VMULSSrr 3434 +VMULSSrr_Int 3435 +VMWRITE 3436 +VMXOFF 3437 +VMXON 3438 +VORPDYrm 3439 +VORPDYrr 3440 +VORPDZ 3441 +VORPDZrm 3442 +VORPDZrmb 3443 +VORPDZrmbk 3444 +VORPDZrmbkz 3445 +VORPDZrmk 3446 +VORPDZrmkz 3447 +VORPDZrr 3448 +VORPDZrrk 3449 +VORPDZrrkz 3450 +VORPDrm 3451 +VORPDrr 3452 +VORPSYrm 3453 +VORPSYrr 3454 +VORPSZ 3455 +VORPSZrm 3456 +VORPSZrmb 3457 +VORPSZrmbk 3458 +VORPSZrmbkz 3459 +VORPSZrmk 3460 +VORPSZrmkz 3461 +VORPSZrr 3462 +VORPSZrrk 3463 +VORPSZrrkz 3464 +VORPSrm 3465 +VORPSrr 3466 +VP 3467 +VPABSBYrm 3468 +VPABSBYrr 3469 +VPABSBZ 3470 +VPABSBZrm 3471 +VPABSBZrmk 3472 +VPABSBZrmkz 3473 +VPABSBZrr 3474 +VPABSBZrrk 3475 +VPABSBZrrkz 3476 +VPABSBrm 3477 +VPABSBrr 3478 +VPABSDYrm 3479 +VPABSDYrr 3480 +VPABSDZ 3481 +VPABSDZrm 3482 +VPABSDZrmb 3483 +VPABSDZrmbk 3484 +VPABSDZrmbkz 3485 +VPABSDZrmk 3486 +VPABSDZrmkz 3487 +VPABSDZrr 3488 +VPABSDZrrk 3489 +VPABSDZrrkz 3490 +VPABSDrm 3491 +VPABSDrr 3492 +VPABSQZ 3493 +VPABSQZrm 3494 +VPABSQZrmb 3495 +VPABSQZrmbk 3496 +VPABSQZrmbkz 3497 +VPABSQZrmk 3498 +VPABSQZrmkz 3499 +VPABSQZrr 3500 +VPABSQZrrk 3501 +VPABSQZrrkz 3502 +VPABSWYrm 3503 +VPABSWYrr 3504 +VPABSWZ 3505 +VPABSWZrm 3506 +VPABSWZrmk 3507 +VPABSWZrmkz 3508 +VPABSWZrr 3509 +VPABSWZrrk 3510 +VPABSWZrrkz 3511 +VPABSWrm 3512 +VPABSWrr 3513 +VPACKSSDWYrm 3514 +VPACKSSDWYrr 3515 +VPACKSSDWZ 3516 +VPACKSSDWZrm 3517 +VPACKSSDWZrmb 3518 +VPACKSSDWZrmbk 3519 +VPACKSSDWZrmbkz 3520 +VPACKSSDWZrmk 3521 +VPACKSSDWZrmkz 3522 +VPACKSSDWZrr 3523 +VPACKSSDWZrrk 3524 +VPACKSSDWZrrkz 3525 +VPACKSSDWrm 3526 +VPACKSSDWrr 3527 +VPACKSSWBYrm 3528 +VPACKSSWBYrr 3529 +VPACKSSWBZ 3530 +VPACKSSWBZrm 3531 +VPACKSSWBZrmk 3532 +VPACKSSWBZrmkz 3533 +VPACKSSWBZrr 3534 +VPACKSSWBZrrk 3535 +VPACKSSWBZrrkz 3536 +VPACKSSWBrm 3537 +VPACKSSWBrr 3538 +VPACKUSDWYrm 3539 
+VPACKUSDWYrr 3540 +VPACKUSDWZ 3541 +VPACKUSDWZrm 3542 +VPACKUSDWZrmb 3543 +VPACKUSDWZrmbk 3544 +VPACKUSDWZrmbkz 3545 +VPACKUSDWZrmk 3546 +VPACKUSDWZrmkz 3547 +VPACKUSDWZrr 3548 +VPACKUSDWZrrk 3549 +VPACKUSDWZrrkz 3550 +VPACKUSDWrm 3551 +VPACKUSDWrr 3552 +VPACKUSWBYrm 3553 +VPACKUSWBYrr 3554 +VPACKUSWBZ 3555 +VPACKUSWBZrm 3556 +VPACKUSWBZrmk 3557 +VPACKUSWBZrmkz 3558 +VPACKUSWBZrr 3559 +VPACKUSWBZrrk 3560 +VPACKUSWBZrrkz 3561 +VPACKUSWBrm 3562 +VPACKUSWBrr 3563 +VPADDBYrm 3564 +VPADDBYrr 3565 +VPADDBZ 3566 +VPADDBZrm 3567 +VPADDBZrmk 3568 +VPADDBZrmkz 3569 +VPADDBZrr 3570 +VPADDBZrrk 3571 +VPADDBZrrkz 3572 +VPADDBrm 3573 +VPADDBrr 3574 +VPADDDYrm 3575 +VPADDDYrr 3576 +VPADDDZ 3577 +VPADDDZrm 3578 +VPADDDZrmb 3579 +VPADDDZrmbk 3580 +VPADDDZrmbkz 3581 +VPADDDZrmk 3582 +VPADDDZrmkz 3583 +VPADDDZrr 3584 +VPADDDZrrk 3585 +VPADDDZrrkz 3586 +VPADDDrm 3587 +VPADDDrr 3588 +VPADDQYrm 3589 +VPADDQYrr 3590 +VPADDQZ 3591 +VPADDQZrm 3592 +VPADDQZrmb 3593 +VPADDQZrmbk 3594 +VPADDQZrmbkz 3595 +VPADDQZrmk 3596 +VPADDQZrmkz 3597 +VPADDQZrr 3598 +VPADDQZrrk 3599 +VPADDQZrrkz 3600 +VPADDQrm 3601 +VPADDQrr 3602 +VPADDSBYrm 3603 +VPADDSBYrr 3604 +VPADDSBZ 3605 +VPADDSBZrm 3606 +VPADDSBZrmk 3607 +VPADDSBZrmkz 3608 +VPADDSBZrr 3609 +VPADDSBZrrk 3610 +VPADDSBZrrkz 3611 +VPADDSBrm 3612 +VPADDSBrr 3613 +VPADDSWYrm 3614 +VPADDSWYrr 3615 +VPADDSWZ 3616 +VPADDSWZrm 3617 +VPADDSWZrmk 3618 +VPADDSWZrmkz 3619 +VPADDSWZrr 3620 +VPADDSWZrrk 3621 +VPADDSWZrrkz 3622 +VPADDSWrm 3623 +VPADDSWrr 3624 +VPADDUSBYrm 3625 +VPADDUSBYrr 3626 +VPADDUSBZ 3627 +VPADDUSBZrm 3628 +VPADDUSBZrmk 3629 +VPADDUSBZrmkz 3630 +VPADDUSBZrr 3631 +VPADDUSBZrrk 3632 +VPADDUSBZrrkz 3633 +VPADDUSBrm 3634 +VPADDUSBrr 3635 +VPADDUSWYrm 3636 +VPADDUSWYrr 3637 +VPADDUSWZ 3638 +VPADDUSWZrm 3639 +VPADDUSWZrmk 3640 +VPADDUSWZrmkz 3641 +VPADDUSWZrr 3642 +VPADDUSWZrrk 3643 +VPADDUSWZrrkz 3644 +VPADDUSWrm 3645 +VPADDUSWrr 3646 +VPADDWYrm 3647 +VPADDWYrr 3648 +VPADDWZ 3649 +VPADDWZrm 3650 +VPADDWZrmk 3651 +VPADDWZrmkz 3652 +VPADDWZrr 3653 +VPADDWZrrk 3654 +VPADDWZrrkz 3655 +VPADDWrm 3656 +VPADDWrr 3657 +VPALIGNRYrmi 3658 +VPALIGNRYrri 3659 +VPALIGNRZ 3660 +VPALIGNRZrmi 3661 +VPALIGNRZrmik 3662 +VPALIGNRZrmikz 3663 +VPALIGNRZrri 3664 +VPALIGNRZrrik 3665 +VPALIGNRZrrikz 3666 +VPALIGNRrmi 3667 +VPALIGNRrri 3668 +VPANDDZ 3669 +VPANDDZrm 3670 +VPANDDZrmb 3671 +VPANDDZrmbk 3672 +VPANDDZrmbkz 3673 +VPANDDZrmk 3674 +VPANDDZrmkz 3675 +VPANDDZrr 3676 +VPANDDZrrk 3677 +VPANDDZrrkz 3678 +VPANDNDZ 3679 +VPANDNDZrm 3680 +VPANDNDZrmb 3681 +VPANDNDZrmbk 3682 +VPANDNDZrmbkz 3683 +VPANDNDZrmk 3684 +VPANDNDZrmkz 3685 +VPANDNDZrr 3686 +VPANDNDZrrk 3687 +VPANDNDZrrkz 3688 +VPANDNQZ 3689 +VPANDNQZrm 3690 +VPANDNQZrmb 3691 +VPANDNQZrmbk 3692 +VPANDNQZrmbkz 3693 +VPANDNQZrmk 3694 +VPANDNQZrmkz 3695 +VPANDNQZrr 3696 +VPANDNQZrrk 3697 +VPANDNQZrrkz 3698 +VPANDNYrm 3699 +VPANDNYrr 3700 +VPANDNrm 3701 +VPANDNrr 3702 +VPANDQZ 3703 +VPANDQZrm 3704 +VPANDQZrmb 3705 +VPANDQZrmbk 3706 +VPANDQZrmbkz 3707 +VPANDQZrmk 3708 +VPANDQZrmkz 3709 +VPANDQZrr 3710 +VPANDQZrrk 3711 +VPANDQZrrkz 3712 +VPANDYrm 3713 +VPANDYrr 3714 +VPANDrm 3715 +VPANDrr 3716 +VPAVGBYrm 3717 +VPAVGBYrr 3718 +VPAVGBZ 3719 +VPAVGBZrm 3720 +VPAVGBZrmk 3721 +VPAVGBZrmkz 3722 +VPAVGBZrr 3723 +VPAVGBZrrk 3724 +VPAVGBZrrkz 3725 +VPAVGBrm 3726 +VPAVGBrr 3727 +VPAVGWYrm 3728 +VPAVGWYrr 3729 +VPAVGWZ 3730 +VPAVGWZrm 3731 +VPAVGWZrmk 3732 +VPAVGWZrmkz 3733 +VPAVGWZrr 3734 +VPAVGWZrrk 3735 +VPAVGWZrrkz 3736 +VPAVGWrm 3737 +VPAVGWrr 3738 +VPBLENDDYrmi 3739 +VPBLENDDYrri 3740 +VPBLENDDrmi 3741 +VPBLENDDrri 3742 +VPBLENDMBZ 3743 +VPBLENDMBZrm 
3744 +VPBLENDMBZrmk 3745 +VPBLENDMBZrmkz 3746 +VPBLENDMBZrr 3747 +VPBLENDMBZrrk 3748 +VPBLENDMBZrrkz 3749 +VPBLENDMDZ 3750 +VPBLENDMDZrm 3751 +VPBLENDMDZrmb 3752 +VPBLENDMDZrmbk 3753 +VPBLENDMDZrmbkz 3754 +VPBLENDMDZrmk 3755 +VPBLENDMDZrmkz 3756 +VPBLENDMDZrr 3757 +VPBLENDMDZrrk 3758 +VPBLENDMDZrrkz 3759 +VPBLENDMQZ 3760 +VPBLENDMQZrm 3761 +VPBLENDMQZrmb 3762 +VPBLENDMQZrmbk 3763 +VPBLENDMQZrmbkz 3764 +VPBLENDMQZrmk 3765 +VPBLENDMQZrmkz 3766 +VPBLENDMQZrr 3767 +VPBLENDMQZrrk 3768 +VPBLENDMQZrrkz 3769 +VPBLENDMWZ 3770 +VPBLENDMWZrm 3771 +VPBLENDMWZrmk 3772 +VPBLENDMWZrmkz 3773 +VPBLENDMWZrr 3774 +VPBLENDMWZrrk 3775 +VPBLENDMWZrrkz 3776 +VPBLENDVBYrmr 3777 +VPBLENDVBYrrr 3778 +VPBLENDVBrmr 3779 +VPBLENDVBrrr 3780 +VPBLENDWYrmi 3781 +VPBLENDWYrri 3782 +VPBLENDWrmi 3783 +VPBLENDWrri 3784 +VPBROADCASTBYrm 3785 +VPBROADCASTBYrr 3786 +VPBROADCASTBZ 3787 +VPBROADCASTBZrm 3788 +VPBROADCASTBZrmk 3789 +VPBROADCASTBZrmkz 3790 +VPBROADCASTBZrr 3791 +VPBROADCASTBZrrk 3792 +VPBROADCASTBZrrkz 3793 +VPBROADCASTBrZ 3794 +VPBROADCASTBrZrr 3795 +VPBROADCASTBrZrrk 3796 +VPBROADCASTBrZrrkz 3797 +VPBROADCASTBrm 3798 +VPBROADCASTBrr 3799 +VPBROADCASTDYrm 3800 +VPBROADCASTDYrr 3801 +VPBROADCASTDZ 3802 +VPBROADCASTDZrm 3803 +VPBROADCASTDZrmk 3804 +VPBROADCASTDZrmkz 3805 +VPBROADCASTDZrr 3806 +VPBROADCASTDZrrk 3807 +VPBROADCASTDZrrkz 3808 +VPBROADCASTDrZ 3809 +VPBROADCASTDrZrr 3810 +VPBROADCASTDrZrrk 3811 +VPBROADCASTDrZrrkz 3812 +VPBROADCASTDrm 3813 +VPBROADCASTDrr 3814 +VPBROADCASTMB 3815 +VPBROADCASTMW 3816 +VPBROADCASTQYrm 3817 +VPBROADCASTQYrr 3818 +VPBROADCASTQZ 3819 +VPBROADCASTQZrm 3820 +VPBROADCASTQZrmk 3821 +VPBROADCASTQZrmkz 3822 +VPBROADCASTQZrr 3823 +VPBROADCASTQZrrk 3824 +VPBROADCASTQZrrkz 3825 +VPBROADCASTQrZ 3826 +VPBROADCASTQrZrr 3827 +VPBROADCASTQrZrrk 3828 +VPBROADCASTQrZrrkz 3829 +VPBROADCASTQrm 3830 +VPBROADCASTQrr 3831 +VPBROADCASTWYrm 3832 +VPBROADCASTWYrr 3833 +VPBROADCASTWZ 3834 +VPBROADCASTWZrm 3835 +VPBROADCASTWZrmk 3836 +VPBROADCASTWZrmkz 3837 +VPBROADCASTWZrr 3838 +VPBROADCASTWZrrk 3839 +VPBROADCASTWZrrkz 3840 +VPBROADCASTWrZ 3841 +VPBROADCASTWrZrr 3842 +VPBROADCASTWrZrrk 3843 +VPBROADCASTWrZrrkz 3844 +VPBROADCASTWrm 3845 +VPBROADCASTWrr 3846 +VPCLMULQDQYrmi 3847 +VPCLMULQDQYrri 3848 +VPCLMULQDQZ 3849 +VPCLMULQDQZrmi 3850 +VPCLMULQDQZrri 3851 +VPCLMULQDQrmi 3852 +VPCLMULQDQrri 3853 +VPCMOVYrmr 3854 +VPCMOVYrrm 3855 +VPCMOVYrrr 3856 +VPCMOVYrrr_REV 3857 +VPCMOVrmr 3858 +VPCMOVrrm 3859 +VPCMOVrrr 3860 +VPCMOVrrr_REV 3861 +VPCMPBZ 3862 +VPCMPBZrmi 3863 +VPCMPBZrmik 3864 +VPCMPBZrri 3865 +VPCMPBZrrik 3866 +VPCMPDZ 3867 +VPCMPDZrmbi 3868 +VPCMPDZrmbik 3869 +VPCMPDZrmi 3870 +VPCMPDZrmik 3871 +VPCMPDZrri 3872 +VPCMPDZrrik 3873 +VPCMPEQBYrm 3874 +VPCMPEQBYrr 3875 +VPCMPEQBZ 3876 +VPCMPEQBZrm 3877 +VPCMPEQBZrmk 3878 +VPCMPEQBZrr 3879 +VPCMPEQBZrrk 3880 +VPCMPEQBrm 3881 +VPCMPEQBrr 3882 +VPCMPEQDYrm 3883 +VPCMPEQDYrr 3884 +VPCMPEQDZ 3885 +VPCMPEQDZrm 3886 +VPCMPEQDZrmb 3887 +VPCMPEQDZrmbk 3888 +VPCMPEQDZrmk 3889 +VPCMPEQDZrr 3890 +VPCMPEQDZrrk 3891 +VPCMPEQDrm 3892 +VPCMPEQDrr 3893 +VPCMPEQQYrm 3894 +VPCMPEQQYrr 3895 +VPCMPEQQZ 3896 +VPCMPEQQZrm 3897 +VPCMPEQQZrmb 3898 +VPCMPEQQZrmbk 3899 +VPCMPEQQZrmk 3900 +VPCMPEQQZrr 3901 +VPCMPEQQZrrk 3902 +VPCMPEQQrm 3903 +VPCMPEQQrr 3904 +VPCMPEQWYrm 3905 +VPCMPEQWYrr 3906 +VPCMPEQWZ 3907 +VPCMPEQWZrm 3908 +VPCMPEQWZrmk 3909 +VPCMPEQWZrr 3910 +VPCMPEQWZrrk 3911 +VPCMPEQWrm 3912 +VPCMPEQWrr 3913 +VPCMPESTRIrmi 3914 +VPCMPESTRIrri 3915 +VPCMPESTRMrmi 3916 +VPCMPESTRMrri 3917 +VPCMPGTBYrm 3918 +VPCMPGTBYrr 3919 +VPCMPGTBZ 3920 +VPCMPGTBZrm 3921 +VPCMPGTBZrmk 
3922 +VPCMPGTBZrr 3923 +VPCMPGTBZrrk 3924 +VPCMPGTBrm 3925 +VPCMPGTBrr 3926 +VPCMPGTDYrm 3927 +VPCMPGTDYrr 3928 +VPCMPGTDZ 3929 +VPCMPGTDZrm 3930 +VPCMPGTDZrmb 3931 +VPCMPGTDZrmbk 3932 +VPCMPGTDZrmk 3933 +VPCMPGTDZrr 3934 +VPCMPGTDZrrk 3935 +VPCMPGTDrm 3936 +VPCMPGTDrr 3937 +VPCMPGTQYrm 3938 +VPCMPGTQYrr 3939 +VPCMPGTQZ 3940 +VPCMPGTQZrm 3941 +VPCMPGTQZrmb 3942 +VPCMPGTQZrmbk 3943 +VPCMPGTQZrmk 3944 +VPCMPGTQZrr 3945 +VPCMPGTQZrrk 3946 +VPCMPGTQrm 3947 +VPCMPGTQrr 3948 +VPCMPGTWYrm 3949 +VPCMPGTWYrr 3950 +VPCMPGTWZ 3951 +VPCMPGTWZrm 3952 +VPCMPGTWZrmk 3953 +VPCMPGTWZrr 3954 +VPCMPGTWZrrk 3955 +VPCMPGTWrm 3956 +VPCMPGTWrr 3957 +VPCMPISTRIrmi 3958 +VPCMPISTRIrri 3959 +VPCMPISTRMrmi 3960 +VPCMPISTRMrri 3961 +VPCMPQZ 3962 +VPCMPQZrmbi 3963 +VPCMPQZrmbik 3964 +VPCMPQZrmi 3965 +VPCMPQZrmik 3966 +VPCMPQZrri 3967 +VPCMPQZrrik 3968 +VPCMPUBZ 3969 +VPCMPUBZrmi 3970 +VPCMPUBZrmik 3971 +VPCMPUBZrri 3972 +VPCMPUBZrrik 3973 +VPCMPUDZ 3974 +VPCMPUDZrmbi 3975 +VPCMPUDZrmbik 3976 +VPCMPUDZrmi 3977 +VPCMPUDZrmik 3978 +VPCMPUDZrri 3979 +VPCMPUDZrrik 3980 +VPCMPUQZ 3981 +VPCMPUQZrmbi 3982 +VPCMPUQZrmbik 3983 +VPCMPUQZrmi 3984 +VPCMPUQZrmik 3985 +VPCMPUQZrri 3986 +VPCMPUQZrrik 3987 +VPCMPUWZ 3988 +VPCMPUWZrmi 3989 +VPCMPUWZrmik 3990 +VPCMPUWZrri 3991 +VPCMPUWZrrik 3992 +VPCMPWZ 3993 +VPCMPWZrmi 3994 +VPCMPWZrmik 3995 +VPCMPWZrri 3996 +VPCMPWZrrik 3997 +VPCOMBmi 3998 +VPCOMBri 3999 +VPCOMDmi 4000 +VPCOMDri 4001 +VPCOMPRESSBZ 4002 +VPCOMPRESSBZmr 4003 +VPCOMPRESSBZmrk 4004 +VPCOMPRESSBZrr 4005 +VPCOMPRESSBZrrk 4006 +VPCOMPRESSBZrrkz 4007 +VPCOMPRESSDZ 4008 +VPCOMPRESSDZmr 4009 +VPCOMPRESSDZmrk 4010 +VPCOMPRESSDZrr 4011 +VPCOMPRESSDZrrk 4012 +VPCOMPRESSDZrrkz 4013 +VPCOMPRESSQZ 4014 +VPCOMPRESSQZmr 4015 +VPCOMPRESSQZmrk 4016 +VPCOMPRESSQZrr 4017 +VPCOMPRESSQZrrk 4018 +VPCOMPRESSQZrrkz 4019 +VPCOMPRESSWZ 4020 +VPCOMPRESSWZmr 4021 +VPCOMPRESSWZmrk 4022 +VPCOMPRESSWZrr 4023 +VPCOMPRESSWZrrk 4024 +VPCOMPRESSWZrrkz 4025 +VPCOMQmi 4026 +VPCOMQri 4027 +VPCOMUBmi 4028 +VPCOMUBri 4029 +VPCOMUDmi 4030 +VPCOMUDri 4031 +VPCOMUQmi 4032 +VPCOMUQri 4033 +VPCOMUWmi 4034 +VPCOMUWri 4035 +VPCOMWmi 4036 +VPCOMWri 4037 +VPCONFLICTDZ 4038 +VPCONFLICTDZrm 4039 +VPCONFLICTDZrmb 4040 +VPCONFLICTDZrmbk 4041 +VPCONFLICTDZrmbkz 4042 +VPCONFLICTDZrmk 4043 +VPCONFLICTDZrmkz 4044 +VPCONFLICTDZrr 4045 +VPCONFLICTDZrrk 4046 +VPCONFLICTDZrrkz 4047 +VPCONFLICTQZ 4048 +VPCONFLICTQZrm 4049 +VPCONFLICTQZrmb 4050 +VPCONFLICTQZrmbk 4051 +VPCONFLICTQZrmbkz 4052 +VPCONFLICTQZrmk 4053 +VPCONFLICTQZrmkz 4054 +VPCONFLICTQZrr 4055 +VPCONFLICTQZrrk 4056 +VPCONFLICTQZrrkz 4057 +VPDPBSSDSYrm 4058 +VPDPBSSDSYrr 4059 +VPDPBSSDSZ 4060 +VPDPBSSDSZrm 4061 +VPDPBSSDSZrmb 4062 +VPDPBSSDSZrmbk 4063 +VPDPBSSDSZrmbkz 4064 +VPDPBSSDSZrmk 4065 +VPDPBSSDSZrmkz 4066 +VPDPBSSDSZrr 4067 +VPDPBSSDSZrrk 4068 +VPDPBSSDSZrrkz 4069 +VPDPBSSDSrm 4070 +VPDPBSSDSrr 4071 +VPDPBSSDYrm 4072 +VPDPBSSDYrr 4073 +VPDPBSSDZ 4074 +VPDPBSSDZrm 4075 +VPDPBSSDZrmb 4076 +VPDPBSSDZrmbk 4077 +VPDPBSSDZrmbkz 4078 +VPDPBSSDZrmk 4079 +VPDPBSSDZrmkz 4080 +VPDPBSSDZrr 4081 +VPDPBSSDZrrk 4082 +VPDPBSSDZrrkz 4083 +VPDPBSSDrm 4084 +VPDPBSSDrr 4085 +VPDPBSUDSYrm 4086 +VPDPBSUDSYrr 4087 +VPDPBSUDSZ 4088 +VPDPBSUDSZrm 4089 +VPDPBSUDSZrmb 4090 +VPDPBSUDSZrmbk 4091 +VPDPBSUDSZrmbkz 4092 +VPDPBSUDSZrmk 4093 +VPDPBSUDSZrmkz 4094 +VPDPBSUDSZrr 4095 +VPDPBSUDSZrrk 4096 +VPDPBSUDSZrrkz 4097 +VPDPBSUDSrm 4098 +VPDPBSUDSrr 4099 +VPDPBSUDYrm 4100 +VPDPBSUDYrr 4101 +VPDPBSUDZ 4102 +VPDPBSUDZrm 4103 +VPDPBSUDZrmb 4104 +VPDPBSUDZrmbk 4105 +VPDPBSUDZrmbkz 4106 +VPDPBSUDZrmk 4107 +VPDPBSUDZrmkz 4108 +VPDPBSUDZrr 4109 
+VPDPBSUDZrrk 4110 +VPDPBSUDZrrkz 4111 +VPDPBSUDrm 4112 +VPDPBSUDrr 4113 +VPDPBUSDSYrm 4114 +VPDPBUSDSYrr 4115 +VPDPBUSDSZ 4116 +VPDPBUSDSZrm 4117 +VPDPBUSDSZrmb 4118 +VPDPBUSDSZrmbk 4119 +VPDPBUSDSZrmbkz 4120 +VPDPBUSDSZrmk 4121 +VPDPBUSDSZrmkz 4122 +VPDPBUSDSZrr 4123 +VPDPBUSDSZrrk 4124 +VPDPBUSDSZrrkz 4125 +VPDPBUSDSrm 4126 +VPDPBUSDSrr 4127 +VPDPBUSDYrm 4128 +VPDPBUSDYrr 4129 +VPDPBUSDZ 4130 +VPDPBUSDZrm 4131 +VPDPBUSDZrmb 4132 +VPDPBUSDZrmbk 4133 +VPDPBUSDZrmbkz 4134 +VPDPBUSDZrmk 4135 +VPDPBUSDZrmkz 4136 +VPDPBUSDZrr 4137 +VPDPBUSDZrrk 4138 +VPDPBUSDZrrkz 4139 +VPDPBUSDrm 4140 +VPDPBUSDrr 4141 +VPDPBUUDSYrm 4142 +VPDPBUUDSYrr 4143 +VPDPBUUDSZ 4144 +VPDPBUUDSZrm 4145 +VPDPBUUDSZrmb 4146 +VPDPBUUDSZrmbk 4147 +VPDPBUUDSZrmbkz 4148 +VPDPBUUDSZrmk 4149 +VPDPBUUDSZrmkz 4150 +VPDPBUUDSZrr 4151 +VPDPBUUDSZrrk 4152 +VPDPBUUDSZrrkz 4153 +VPDPBUUDSrm 4154 +VPDPBUUDSrr 4155 +VPDPBUUDYrm 4156 +VPDPBUUDYrr 4157 +VPDPBUUDZ 4158 +VPDPBUUDZrm 4159 +VPDPBUUDZrmb 4160 +VPDPBUUDZrmbk 4161 +VPDPBUUDZrmbkz 4162 +VPDPBUUDZrmk 4163 +VPDPBUUDZrmkz 4164 +VPDPBUUDZrr 4165 +VPDPBUUDZrrk 4166 +VPDPBUUDZrrkz 4167 +VPDPBUUDrm 4168 +VPDPBUUDrr 4169 +VPDPWSSDSYrm 4170 +VPDPWSSDSYrr 4171 +VPDPWSSDSZ 4172 +VPDPWSSDSZrm 4173 +VPDPWSSDSZrmb 4174 +VPDPWSSDSZrmbk 4175 +VPDPWSSDSZrmbkz 4176 +VPDPWSSDSZrmk 4177 +VPDPWSSDSZrmkz 4178 +VPDPWSSDSZrr 4179 +VPDPWSSDSZrrk 4180 +VPDPWSSDSZrrkz 4181 +VPDPWSSDSrm 4182 +VPDPWSSDSrr 4183 +VPDPWSSDYrm 4184 +VPDPWSSDYrr 4185 +VPDPWSSDZ 4186 +VPDPWSSDZrm 4187 +VPDPWSSDZrmb 4188 +VPDPWSSDZrmbk 4189 +VPDPWSSDZrmbkz 4190 +VPDPWSSDZrmk 4191 +VPDPWSSDZrmkz 4192 +VPDPWSSDZrr 4193 +VPDPWSSDZrrk 4194 +VPDPWSSDZrrkz 4195 +VPDPWSSDrm 4196 +VPDPWSSDrr 4197 +VPDPWSUDSYrm 4198 +VPDPWSUDSYrr 4199 +VPDPWSUDSZ 4200 +VPDPWSUDSZrm 4201 +VPDPWSUDSZrmb 4202 +VPDPWSUDSZrmbk 4203 +VPDPWSUDSZrmbkz 4204 +VPDPWSUDSZrmk 4205 +VPDPWSUDSZrmkz 4206 +VPDPWSUDSZrr 4207 +VPDPWSUDSZrrk 4208 +VPDPWSUDSZrrkz 4209 +VPDPWSUDSrm 4210 +VPDPWSUDSrr 4211 +VPDPWSUDYrm 4212 +VPDPWSUDYrr 4213 +VPDPWSUDZ 4214 +VPDPWSUDZrm 4215 +VPDPWSUDZrmb 4216 +VPDPWSUDZrmbk 4217 +VPDPWSUDZrmbkz 4218 +VPDPWSUDZrmk 4219 +VPDPWSUDZrmkz 4220 +VPDPWSUDZrr 4221 +VPDPWSUDZrrk 4222 +VPDPWSUDZrrkz 4223 +VPDPWSUDrm 4224 +VPDPWSUDrr 4225 +VPDPWUSDSYrm 4226 +VPDPWUSDSYrr 4227 +VPDPWUSDSZ 4228 +VPDPWUSDSZrm 4229 +VPDPWUSDSZrmb 4230 +VPDPWUSDSZrmbk 4231 +VPDPWUSDSZrmbkz 4232 +VPDPWUSDSZrmk 4233 +VPDPWUSDSZrmkz 4234 +VPDPWUSDSZrr 4235 +VPDPWUSDSZrrk 4236 +VPDPWUSDSZrrkz 4237 +VPDPWUSDSrm 4238 +VPDPWUSDSrr 4239 +VPDPWUSDYrm 4240 +VPDPWUSDYrr 4241 +VPDPWUSDZ 4242 +VPDPWUSDZrm 4243 +VPDPWUSDZrmb 4244 +VPDPWUSDZrmbk 4245 +VPDPWUSDZrmbkz 4246 +VPDPWUSDZrmk 4247 +VPDPWUSDZrmkz 4248 +VPDPWUSDZrr 4249 +VPDPWUSDZrrk 4250 +VPDPWUSDZrrkz 4251 +VPDPWUSDrm 4252 +VPDPWUSDrr 4253 +VPDPWUUDSYrm 4254 +VPDPWUUDSYrr 4255 +VPDPWUUDSZ 4256 +VPDPWUUDSZrm 4257 +VPDPWUUDSZrmb 4258 +VPDPWUUDSZrmbk 4259 +VPDPWUUDSZrmbkz 4260 +VPDPWUUDSZrmk 4261 +VPDPWUUDSZrmkz 4262 +VPDPWUUDSZrr 4263 +VPDPWUUDSZrrk 4264 +VPDPWUUDSZrrkz 4265 +VPDPWUUDSrm 4266 +VPDPWUUDSrr 4267 +VPDPWUUDYrm 4268 +VPDPWUUDYrr 4269 +VPDPWUUDZ 4270 +VPDPWUUDZrm 4271 +VPDPWUUDZrmb 4272 +VPDPWUUDZrmbk 4273 +VPDPWUUDZrmbkz 4274 +VPDPWUUDZrmk 4275 +VPDPWUUDZrmkz 4276 +VPDPWUUDZrr 4277 +VPDPWUUDZrrk 4278 +VPDPWUUDZrrkz 4279 +VPDPWUUDrm 4280 +VPDPWUUDrr 4281 +VPERM 4282 +VPERMBZ 4283 +VPERMBZrm 4284 +VPERMBZrmk 4285 +VPERMBZrmkz 4286 +VPERMBZrr 4287 +VPERMBZrrk 4288 +VPERMBZrrkz 4289 +VPERMDYrm 4290 +VPERMDYrr 4291 +VPERMDZ 4292 +VPERMDZrm 4293 +VPERMDZrmb 4294 +VPERMDZrmbk 4295 +VPERMDZrmbkz 4296 +VPERMDZrmk 4297 +VPERMDZrmkz 
4298 +VPERMDZrr 4299 +VPERMDZrrk 4300 +VPERMDZrrkz 4301 +VPERMI 4302 +VPERMIL 4303 +VPERMILPDYmi 4304 +VPERMILPDYri 4305 +VPERMILPDYrm 4306 +VPERMILPDYrr 4307 +VPERMILPDZ 4308 +VPERMILPDZmbi 4309 +VPERMILPDZmbik 4310 +VPERMILPDZmbikz 4311 +VPERMILPDZmi 4312 +VPERMILPDZmik 4313 +VPERMILPDZmikz 4314 +VPERMILPDZri 4315 +VPERMILPDZrik 4316 +VPERMILPDZrikz 4317 +VPERMILPDZrm 4318 +VPERMILPDZrmb 4319 +VPERMILPDZrmbk 4320 +VPERMILPDZrmbkz 4321 +VPERMILPDZrmk 4322 +VPERMILPDZrmkz 4323 +VPERMILPDZrr 4324 +VPERMILPDZrrk 4325 +VPERMILPDZrrkz 4326 +VPERMILPDmi 4327 +VPERMILPDri 4328 +VPERMILPDrm 4329 +VPERMILPDrr 4330 +VPERMILPSYmi 4331 +VPERMILPSYri 4332 +VPERMILPSYrm 4333 +VPERMILPSYrr 4334 +VPERMILPSZ 4335 +VPERMILPSZmbi 4336 +VPERMILPSZmbik 4337 +VPERMILPSZmbikz 4338 +VPERMILPSZmi 4339 +VPERMILPSZmik 4340 +VPERMILPSZmikz 4341 +VPERMILPSZri 4342 +VPERMILPSZrik 4343 +VPERMILPSZrikz 4344 +VPERMILPSZrm 4345 +VPERMILPSZrmb 4346 +VPERMILPSZrmbk 4347 +VPERMILPSZrmbkz 4348 +VPERMILPSZrmk 4349 +VPERMILPSZrmkz 4350 +VPERMILPSZrr 4351 +VPERMILPSZrrk 4352 +VPERMILPSZrrkz 4353 +VPERMILPSmi 4354 +VPERMILPSri 4355 +VPERMILPSrm 4356 +VPERMILPSrr 4357 +VPERMPDYmi 4358 +VPERMPDYri 4359 +VPERMPDZ 4360 +VPERMPDZmbi 4361 +VPERMPDZmbik 4362 +VPERMPDZmbikz 4363 +VPERMPDZmi 4364 +VPERMPDZmik 4365 +VPERMPDZmikz 4366 +VPERMPDZri 4367 +VPERMPDZrik 4368 +VPERMPDZrikz 4369 +VPERMPDZrm 4370 +VPERMPDZrmb 4371 +VPERMPDZrmbk 4372 +VPERMPDZrmbkz 4373 +VPERMPDZrmk 4374 +VPERMPDZrmkz 4375 +VPERMPDZrr 4376 +VPERMPDZrrk 4377 +VPERMPDZrrkz 4378 +VPERMPSYrm 4379 +VPERMPSYrr 4380 +VPERMPSZ 4381 +VPERMPSZrm 4382 +VPERMPSZrmb 4383 +VPERMPSZrmbk 4384 +VPERMPSZrmbkz 4385 +VPERMPSZrmk 4386 +VPERMPSZrmkz 4387 +VPERMPSZrr 4388 +VPERMPSZrrk 4389 +VPERMPSZrrkz 4390 +VPERMQYmi 4391 +VPERMQYri 4392 +VPERMQZ 4393 +VPERMQZmbi 4394 +VPERMQZmbik 4395 +VPERMQZmbikz 4396 +VPERMQZmi 4397 +VPERMQZmik 4398 +VPERMQZmikz 4399 +VPERMQZri 4400 +VPERMQZrik 4401 +VPERMQZrikz 4402 +VPERMQZrm 4403 +VPERMQZrmb 4404 +VPERMQZrmbk 4405 +VPERMQZrmbkz 4406 +VPERMQZrmk 4407 +VPERMQZrmkz 4408 +VPERMQZrr 4409 +VPERMQZrrk 4410 +VPERMQZrrkz 4411 +VPERMT 4412 +VPERMWZ 4413 +VPERMWZrm 4414 +VPERMWZrmk 4415 +VPERMWZrmkz 4416 +VPERMWZrr 4417 +VPERMWZrrk 4418 +VPERMWZrrkz 4419 +VPEXPANDBZ 4420 +VPEXPANDBZrm 4421 +VPEXPANDBZrmk 4422 +VPEXPANDBZrmkz 4423 +VPEXPANDBZrr 4424 +VPEXPANDBZrrk 4425 +VPEXPANDBZrrkz 4426 +VPEXPANDDZ 4427 +VPEXPANDDZrm 4428 +VPEXPANDDZrmk 4429 +VPEXPANDDZrmkz 4430 +VPEXPANDDZrr 4431 +VPEXPANDDZrrk 4432 +VPEXPANDDZrrkz 4433 +VPEXPANDQZ 4434 +VPEXPANDQZrm 4435 +VPEXPANDQZrmk 4436 +VPEXPANDQZrmkz 4437 +VPEXPANDQZrr 4438 +VPEXPANDQZrrk 4439 +VPEXPANDQZrrkz 4440 +VPEXPANDWZ 4441 +VPEXPANDWZrm 4442 +VPEXPANDWZrmk 4443 +VPEXPANDWZrmkz 4444 +VPEXPANDWZrr 4445 +VPEXPANDWZrrk 4446 +VPEXPANDWZrrkz 4447 +VPEXTRBZmri 4448 +VPEXTRBZrri 4449 +VPEXTRBmri 4450 +VPEXTRBrri 4451 +VPEXTRDZmri 4452 +VPEXTRDZrri 4453 +VPEXTRDmri 4454 +VPEXTRDrri 4455 +VPEXTRQZmri 4456 +VPEXTRQZrri 4457 +VPEXTRQmri 4458 +VPEXTRQrri 4459 +VPEXTRWZmri 4460 +VPEXTRWZrri 4461 +VPEXTRWZrri_REV 4462 +VPEXTRWmri 4463 +VPEXTRWrri 4464 +VPEXTRWrri_REV 4465 +VPGATHERDDYrm 4466 +VPGATHERDDZ 4467 +VPGATHERDDZrm 4468 +VPGATHERDDrm 4469 +VPGATHERDQYrm 4470 +VPGATHERDQZ 4471 +VPGATHERDQZrm 4472 +VPGATHERDQrm 4473 +VPGATHERQDYrm 4474 +VPGATHERQDZ 4475 +VPGATHERQDZrm 4476 +VPGATHERQDrm 4477 +VPGATHERQQYrm 4478 +VPGATHERQQZ 4479 +VPGATHERQQZrm 4480 +VPGATHERQQrm 4481 +VPHADDBDrm 4482 +VPHADDBDrr 4483 +VPHADDBQrm 4484 +VPHADDBQrr 4485 +VPHADDBWrm 4486 +VPHADDBWrr 4487 +VPHADDDQrm 4488 +VPHADDDQrr 4489 +VPHADDDYrm 
4490 +VPHADDDYrr 4491 +VPHADDDrm 4492 +VPHADDDrr 4493 +VPHADDSWYrm 4494 +VPHADDSWYrr 4495 +VPHADDSWrm 4496 +VPHADDSWrr 4497 +VPHADDUBDrm 4498 +VPHADDUBDrr 4499 +VPHADDUBQrm 4500 +VPHADDUBQrr 4501 +VPHADDUBWrm 4502 +VPHADDUBWrr 4503 +VPHADDUDQrm 4504 +VPHADDUDQrr 4505 +VPHADDUWDrm 4506 +VPHADDUWDrr 4507 +VPHADDUWQrm 4508 +VPHADDUWQrr 4509 +VPHADDWDrm 4510 +VPHADDWDrr 4511 +VPHADDWQrm 4512 +VPHADDWQrr 4513 +VPHADDWYrm 4514 +VPHADDWYrr 4515 +VPHADDWrm 4516 +VPHADDWrr 4517 +VPHMINPOSUWrm 4518 +VPHMINPOSUWrr 4519 +VPHSUBBWrm 4520 +VPHSUBBWrr 4521 +VPHSUBDQrm 4522 +VPHSUBDQrr 4523 +VPHSUBDYrm 4524 +VPHSUBDYrr 4525 +VPHSUBDrm 4526 +VPHSUBDrr 4527 +VPHSUBSWYrm 4528 +VPHSUBSWYrr 4529 +VPHSUBSWrm 4530 +VPHSUBSWrr 4531 +VPHSUBWDrm 4532 +VPHSUBWDrr 4533 +VPHSUBWYrm 4534 +VPHSUBWYrr 4535 +VPHSUBWrm 4536 +VPHSUBWrr 4537 +VPINSRBZrmi 4538 +VPINSRBZrri 4539 +VPINSRBrmi 4540 +VPINSRBrri 4541 +VPINSRDZrmi 4542 +VPINSRDZrri 4543 +VPINSRDrmi 4544 +VPINSRDrri 4545 +VPINSRQZrmi 4546 +VPINSRQZrri 4547 +VPINSRQrmi 4548 +VPINSRQrri 4549 +VPINSRWZrmi 4550 +VPINSRWZrri 4551 +VPINSRWrmi 4552 +VPINSRWrri 4553 +VPLZCNTDZ 4554 +VPLZCNTDZrm 4555 +VPLZCNTDZrmb 4556 +VPLZCNTDZrmbk 4557 +VPLZCNTDZrmbkz 4558 +VPLZCNTDZrmk 4559 +VPLZCNTDZrmkz 4560 +VPLZCNTDZrr 4561 +VPLZCNTDZrrk 4562 +VPLZCNTDZrrkz 4563 +VPLZCNTQZ 4564 +VPLZCNTQZrm 4565 +VPLZCNTQZrmb 4566 +VPLZCNTQZrmbk 4567 +VPLZCNTQZrmbkz 4568 +VPLZCNTQZrmk 4569 +VPLZCNTQZrmkz 4570 +VPLZCNTQZrr 4571 +VPLZCNTQZrrk 4572 +VPLZCNTQZrrkz 4573 +VPMACSDDrm 4574 +VPMACSDDrr 4575 +VPMACSDQHrm 4576 +VPMACSDQHrr 4577 +VPMACSDQLrm 4578 +VPMACSDQLrr 4579 +VPMACSSDDrm 4580 +VPMACSSDDrr 4581 +VPMACSSDQHrm 4582 +VPMACSSDQHrr 4583 +VPMACSSDQLrm 4584 +VPMACSSDQLrr 4585 +VPMACSSWDrm 4586 +VPMACSSWDrr 4587 +VPMACSSWWrm 4588 +VPMACSSWWrr 4589 +VPMACSWDrm 4590 +VPMACSWDrr 4591 +VPMACSWWrm 4592 +VPMACSWWrr 4593 +VPMADCSSWDrm 4594 +VPMADCSSWDrr 4595 +VPMADCSWDrm 4596 +VPMADCSWDrr 4597 +VPMADD 4598 +VPMADDUBSWYrm 4599 +VPMADDUBSWYrr 4600 +VPMADDUBSWZ 4601 +VPMADDUBSWZrm 4602 +VPMADDUBSWZrmk 4603 +VPMADDUBSWZrmkz 4604 +VPMADDUBSWZrr 4605 +VPMADDUBSWZrrk 4606 +VPMADDUBSWZrrkz 4607 +VPMADDUBSWrm 4608 +VPMADDUBSWrr 4609 +VPMADDWDYrm 4610 +VPMADDWDYrr 4611 +VPMADDWDZ 4612 +VPMADDWDZrm 4613 +VPMADDWDZrmk 4614 +VPMADDWDZrmkz 4615 +VPMADDWDZrr 4616 +VPMADDWDZrrk 4617 +VPMADDWDZrrkz 4618 +VPMADDWDrm 4619 +VPMADDWDrr 4620 +VPMASKMOVDYmr 4621 +VPMASKMOVDYrm 4622 +VPMASKMOVDmr 4623 +VPMASKMOVDrm 4624 +VPMASKMOVQYmr 4625 +VPMASKMOVQYrm 4626 +VPMASKMOVQmr 4627 +VPMASKMOVQrm 4628 +VPMAXSBYrm 4629 +VPMAXSBYrr 4630 +VPMAXSBZ 4631 +VPMAXSBZrm 4632 +VPMAXSBZrmk 4633 +VPMAXSBZrmkz 4634 +VPMAXSBZrr 4635 +VPMAXSBZrrk 4636 +VPMAXSBZrrkz 4637 +VPMAXSBrm 4638 +VPMAXSBrr 4639 +VPMAXSDYrm 4640 +VPMAXSDYrr 4641 +VPMAXSDZ 4642 +VPMAXSDZrm 4643 +VPMAXSDZrmb 4644 +VPMAXSDZrmbk 4645 +VPMAXSDZrmbkz 4646 +VPMAXSDZrmk 4647 +VPMAXSDZrmkz 4648 +VPMAXSDZrr 4649 +VPMAXSDZrrk 4650 +VPMAXSDZrrkz 4651 +VPMAXSDrm 4652 +VPMAXSDrr 4653 +VPMAXSQZ 4654 +VPMAXSQZrm 4655 +VPMAXSQZrmb 4656 +VPMAXSQZrmbk 4657 +VPMAXSQZrmbkz 4658 +VPMAXSQZrmk 4659 +VPMAXSQZrmkz 4660 +VPMAXSQZrr 4661 +VPMAXSQZrrk 4662 +VPMAXSQZrrkz 4663 +VPMAXSWYrm 4664 +VPMAXSWYrr 4665 +VPMAXSWZ 4666 +VPMAXSWZrm 4667 +VPMAXSWZrmk 4668 +VPMAXSWZrmkz 4669 +VPMAXSWZrr 4670 +VPMAXSWZrrk 4671 +VPMAXSWZrrkz 4672 +VPMAXSWrm 4673 +VPMAXSWrr 4674 +VPMAXUBYrm 4675 +VPMAXUBYrr 4676 +VPMAXUBZ 4677 +VPMAXUBZrm 4678 +VPMAXUBZrmk 4679 +VPMAXUBZrmkz 4680 +VPMAXUBZrr 4681 +VPMAXUBZrrk 4682 +VPMAXUBZrrkz 4683 +VPMAXUBrm 4684 +VPMAXUBrr 4685 +VPMAXUDYrm 4686 +VPMAXUDYrr 4687 +VPMAXUDZ 4688 +VPMAXUDZrm 
4689 +VPMAXUDZrmb 4690 +VPMAXUDZrmbk 4691 +VPMAXUDZrmbkz 4692 +VPMAXUDZrmk 4693 +VPMAXUDZrmkz 4694 +VPMAXUDZrr 4695 +VPMAXUDZrrk 4696 +VPMAXUDZrrkz 4697 +VPMAXUDrm 4698 +VPMAXUDrr 4699 +VPMAXUQZ 4700 +VPMAXUQZrm 4701 +VPMAXUQZrmb 4702 +VPMAXUQZrmbk 4703 +VPMAXUQZrmbkz 4704 +VPMAXUQZrmk 4705 +VPMAXUQZrmkz 4706 +VPMAXUQZrr 4707 +VPMAXUQZrrk 4708 +VPMAXUQZrrkz 4709 +VPMAXUWYrm 4710 +VPMAXUWYrr 4711 +VPMAXUWZ 4712 +VPMAXUWZrm 4713 +VPMAXUWZrmk 4714 +VPMAXUWZrmkz 4715 +VPMAXUWZrr 4716 +VPMAXUWZrrk 4717 +VPMAXUWZrrkz 4718 +VPMAXUWrm 4719 +VPMAXUWrr 4720 +VPMINSBYrm 4721 +VPMINSBYrr 4722 +VPMINSBZ 4723 +VPMINSBZrm 4724 +VPMINSBZrmk 4725 +VPMINSBZrmkz 4726 +VPMINSBZrr 4727 +VPMINSBZrrk 4728 +VPMINSBZrrkz 4729 +VPMINSBrm 4730 +VPMINSBrr 4731 +VPMINSDYrm 4732 +VPMINSDYrr 4733 +VPMINSDZ 4734 +VPMINSDZrm 4735 +VPMINSDZrmb 4736 +VPMINSDZrmbk 4737 +VPMINSDZrmbkz 4738 +VPMINSDZrmk 4739 +VPMINSDZrmkz 4740 +VPMINSDZrr 4741 +VPMINSDZrrk 4742 +VPMINSDZrrkz 4743 +VPMINSDrm 4744 +VPMINSDrr 4745 +VPMINSQZ 4746 +VPMINSQZrm 4747 +VPMINSQZrmb 4748 +VPMINSQZrmbk 4749 +VPMINSQZrmbkz 4750 +VPMINSQZrmk 4751 +VPMINSQZrmkz 4752 +VPMINSQZrr 4753 +VPMINSQZrrk 4754 +VPMINSQZrrkz 4755 +VPMINSWYrm 4756 +VPMINSWYrr 4757 +VPMINSWZ 4758 +VPMINSWZrm 4759 +VPMINSWZrmk 4760 +VPMINSWZrmkz 4761 +VPMINSWZrr 4762 +VPMINSWZrrk 4763 +VPMINSWZrrkz 4764 +VPMINSWrm 4765 +VPMINSWrr 4766 +VPMINUBYrm 4767 +VPMINUBYrr 4768 +VPMINUBZ 4769 +VPMINUBZrm 4770 +VPMINUBZrmk 4771 +VPMINUBZrmkz 4772 +VPMINUBZrr 4773 +VPMINUBZrrk 4774 +VPMINUBZrrkz 4775 +VPMINUBrm 4776 +VPMINUBrr 4777 +VPMINUDYrm 4778 +VPMINUDYrr 4779 +VPMINUDZ 4780 +VPMINUDZrm 4781 +VPMINUDZrmb 4782 +VPMINUDZrmbk 4783 +VPMINUDZrmbkz 4784 +VPMINUDZrmk 4785 +VPMINUDZrmkz 4786 +VPMINUDZrr 4787 +VPMINUDZrrk 4788 +VPMINUDZrrkz 4789 +VPMINUDrm 4790 +VPMINUDrr 4791 +VPMINUQZ 4792 +VPMINUQZrm 4793 +VPMINUQZrmb 4794 +VPMINUQZrmbk 4795 +VPMINUQZrmbkz 4796 +VPMINUQZrmk 4797 +VPMINUQZrmkz 4798 +VPMINUQZrr 4799 +VPMINUQZrrk 4800 +VPMINUQZrrkz 4801 +VPMINUWYrm 4802 +VPMINUWYrr 4803 +VPMINUWZ 4804 +VPMINUWZrm 4805 +VPMINUWZrmk 4806 +VPMINUWZrmkz 4807 +VPMINUWZrr 4808 +VPMINUWZrrk 4809 +VPMINUWZrrkz 4810 +VPMINUWrm 4811 +VPMINUWrr 4812 +VPMOVB 4813 +VPMOVD 4814 +VPMOVDBZ 4815 +VPMOVDBZmr 4816 +VPMOVDBZmrk 4817 +VPMOVDBZrr 4818 +VPMOVDBZrrk 4819 +VPMOVDBZrrkz 4820 +VPMOVDWZ 4821 +VPMOVDWZmr 4822 +VPMOVDWZmrk 4823 +VPMOVDWZrr 4824 +VPMOVDWZrrk 4825 +VPMOVDWZrrkz 4826 +VPMOVM 4827 +VPMOVMSKBYrr 4828 +VPMOVMSKBrr 4829 +VPMOVQ 4830 +VPMOVQBZ 4831 +VPMOVQBZmr 4832 +VPMOVQBZmrk 4833 +VPMOVQBZrr 4834 +VPMOVQBZrrk 4835 +VPMOVQBZrrkz 4836 +VPMOVQDZ 4837 +VPMOVQDZmr 4838 +VPMOVQDZmrk 4839 +VPMOVQDZrr 4840 +VPMOVQDZrrk 4841 +VPMOVQDZrrkz 4842 +VPMOVQWZ 4843 +VPMOVQWZmr 4844 +VPMOVQWZmrk 4845 +VPMOVQWZrr 4846 +VPMOVQWZrrk 4847 +VPMOVQWZrrkz 4848 +VPMOVSDBZ 4849 +VPMOVSDBZmr 4850 +VPMOVSDBZmrk 4851 +VPMOVSDBZrr 4852 +VPMOVSDBZrrk 4853 +VPMOVSDBZrrkz 4854 +VPMOVSDWZ 4855 +VPMOVSDWZmr 4856 +VPMOVSDWZmrk 4857 +VPMOVSDWZrr 4858 +VPMOVSDWZrrk 4859 +VPMOVSDWZrrkz 4860 +VPMOVSQBZ 4861 +VPMOVSQBZmr 4862 +VPMOVSQBZmrk 4863 +VPMOVSQBZrr 4864 +VPMOVSQBZrrk 4865 +VPMOVSQBZrrkz 4866 +VPMOVSQDZ 4867 +VPMOVSQDZmr 4868 +VPMOVSQDZmrk 4869 +VPMOVSQDZrr 4870 +VPMOVSQDZrrk 4871 +VPMOVSQDZrrkz 4872 +VPMOVSQWZ 4873 +VPMOVSQWZmr 4874 +VPMOVSQWZmrk 4875 +VPMOVSQWZrr 4876 +VPMOVSQWZrrk 4877 +VPMOVSQWZrrkz 4878 +VPMOVSWBZ 4879 +VPMOVSWBZmr 4880 +VPMOVSWBZmrk 4881 +VPMOVSWBZrr 4882 +VPMOVSWBZrrk 4883 +VPMOVSWBZrrkz 4884 +VPMOVSXBDYrm 4885 +VPMOVSXBDYrr 4886 +VPMOVSXBDZ 4887 +VPMOVSXBDZrm 4888 +VPMOVSXBDZrmk 4889 +VPMOVSXBDZrmkz 4890 
+VPMOVSXBDZrr 4891 +VPMOVSXBDZrrk 4892 +VPMOVSXBDZrrkz 4893 +VPMOVSXBDrm 4894 +VPMOVSXBDrr 4895 +VPMOVSXBQYrm 4896 +VPMOVSXBQYrr 4897 +VPMOVSXBQZ 4898 +VPMOVSXBQZrm 4899 +VPMOVSXBQZrmk 4900 +VPMOVSXBQZrmkz 4901 +VPMOVSXBQZrr 4902 +VPMOVSXBQZrrk 4903 +VPMOVSXBQZrrkz 4904 +VPMOVSXBQrm 4905 +VPMOVSXBQrr 4906 +VPMOVSXBWYrm 4907 +VPMOVSXBWYrr 4908 +VPMOVSXBWZ 4909 +VPMOVSXBWZrm 4910 +VPMOVSXBWZrmk 4911 +VPMOVSXBWZrmkz 4912 +VPMOVSXBWZrr 4913 +VPMOVSXBWZrrk 4914 +VPMOVSXBWZrrkz 4915 +VPMOVSXBWrm 4916 +VPMOVSXBWrr 4917 +VPMOVSXDQYrm 4918 +VPMOVSXDQYrr 4919 +VPMOVSXDQZ 4920 +VPMOVSXDQZrm 4921 +VPMOVSXDQZrmk 4922 +VPMOVSXDQZrmkz 4923 +VPMOVSXDQZrr 4924 +VPMOVSXDQZrrk 4925 +VPMOVSXDQZrrkz 4926 +VPMOVSXDQrm 4927 +VPMOVSXDQrr 4928 +VPMOVSXWDYrm 4929 +VPMOVSXWDYrr 4930 +VPMOVSXWDZ 4931 +VPMOVSXWDZrm 4932 +VPMOVSXWDZrmk 4933 +VPMOVSXWDZrmkz 4934 +VPMOVSXWDZrr 4935 +VPMOVSXWDZrrk 4936 +VPMOVSXWDZrrkz 4937 +VPMOVSXWDrm 4938 +VPMOVSXWDrr 4939 +VPMOVSXWQYrm 4940 +VPMOVSXWQYrr 4941 +VPMOVSXWQZ 4942 +VPMOVSXWQZrm 4943 +VPMOVSXWQZrmk 4944 +VPMOVSXWQZrmkz 4945 +VPMOVSXWQZrr 4946 +VPMOVSXWQZrrk 4947 +VPMOVSXWQZrrkz 4948 +VPMOVSXWQrm 4949 +VPMOVSXWQrr 4950 +VPMOVUSDBZ 4951 +VPMOVUSDBZmr 4952 +VPMOVUSDBZmrk 4953 +VPMOVUSDBZrr 4954 +VPMOVUSDBZrrk 4955 +VPMOVUSDBZrrkz 4956 +VPMOVUSDWZ 4957 +VPMOVUSDWZmr 4958 +VPMOVUSDWZmrk 4959 +VPMOVUSDWZrr 4960 +VPMOVUSDWZrrk 4961 +VPMOVUSDWZrrkz 4962 +VPMOVUSQBZ 4963 +VPMOVUSQBZmr 4964 +VPMOVUSQBZmrk 4965 +VPMOVUSQBZrr 4966 +VPMOVUSQBZrrk 4967 +VPMOVUSQBZrrkz 4968 +VPMOVUSQDZ 4969 +VPMOVUSQDZmr 4970 +VPMOVUSQDZmrk 4971 +VPMOVUSQDZrr 4972 +VPMOVUSQDZrrk 4973 +VPMOVUSQDZrrkz 4974 +VPMOVUSQWZ 4975 +VPMOVUSQWZmr 4976 +VPMOVUSQWZmrk 4977 +VPMOVUSQWZrr 4978 +VPMOVUSQWZrrk 4979 +VPMOVUSQWZrrkz 4980 +VPMOVUSWBZ 4981 +VPMOVUSWBZmr 4982 +VPMOVUSWBZmrk 4983 +VPMOVUSWBZrr 4984 +VPMOVUSWBZrrk 4985 +VPMOVUSWBZrrkz 4986 +VPMOVW 4987 +VPMOVWBZ 4988 +VPMOVWBZmr 4989 +VPMOVWBZmrk 4990 +VPMOVWBZrr 4991 +VPMOVWBZrrk 4992 +VPMOVWBZrrkz 4993 +VPMOVZXBDYrm 4994 +VPMOVZXBDYrr 4995 +VPMOVZXBDZ 4996 +VPMOVZXBDZrm 4997 +VPMOVZXBDZrmk 4998 +VPMOVZXBDZrmkz 4999 +VPMOVZXBDZrr 5000 +VPMOVZXBDZrrk 5001 +VPMOVZXBDZrrkz 5002 +VPMOVZXBDrm 5003 +VPMOVZXBDrr 5004 +VPMOVZXBQYrm 5005 +VPMOVZXBQYrr 5006 +VPMOVZXBQZ 5007 +VPMOVZXBQZrm 5008 +VPMOVZXBQZrmk 5009 +VPMOVZXBQZrmkz 5010 +VPMOVZXBQZrr 5011 +VPMOVZXBQZrrk 5012 +VPMOVZXBQZrrkz 5013 +VPMOVZXBQrm 5014 +VPMOVZXBQrr 5015 +VPMOVZXBWYrm 5016 +VPMOVZXBWYrr 5017 +VPMOVZXBWZ 5018 +VPMOVZXBWZrm 5019 +VPMOVZXBWZrmk 5020 +VPMOVZXBWZrmkz 5021 +VPMOVZXBWZrr 5022 +VPMOVZXBWZrrk 5023 +VPMOVZXBWZrrkz 5024 +VPMOVZXBWrm 5025 +VPMOVZXBWrr 5026 +VPMOVZXDQYrm 5027 +VPMOVZXDQYrr 5028 +VPMOVZXDQZ 5029 +VPMOVZXDQZrm 5030 +VPMOVZXDQZrmk 5031 +VPMOVZXDQZrmkz 5032 +VPMOVZXDQZrr 5033 +VPMOVZXDQZrrk 5034 +VPMOVZXDQZrrkz 5035 +VPMOVZXDQrm 5036 +VPMOVZXDQrr 5037 +VPMOVZXWDYrm 5038 +VPMOVZXWDYrr 5039 +VPMOVZXWDZ 5040 +VPMOVZXWDZrm 5041 +VPMOVZXWDZrmk 5042 +VPMOVZXWDZrmkz 5043 +VPMOVZXWDZrr 5044 +VPMOVZXWDZrrk 5045 +VPMOVZXWDZrrkz 5046 +VPMOVZXWDrm 5047 +VPMOVZXWDrr 5048 +VPMOVZXWQYrm 5049 +VPMOVZXWQYrr 5050 +VPMOVZXWQZ 5051 +VPMOVZXWQZrm 5052 +VPMOVZXWQZrmk 5053 +VPMOVZXWQZrmkz 5054 +VPMOVZXWQZrr 5055 +VPMOVZXWQZrrk 5056 +VPMOVZXWQZrrkz 5057 +VPMOVZXWQrm 5058 +VPMOVZXWQrr 5059 +VPMULDQYrm 5060 +VPMULDQYrr 5061 +VPMULDQZ 5062 +VPMULDQZrm 5063 +VPMULDQZrmb 5064 +VPMULDQZrmbk 5065 +VPMULDQZrmbkz 5066 +VPMULDQZrmk 5067 +VPMULDQZrmkz 5068 +VPMULDQZrr 5069 +VPMULDQZrrk 5070 +VPMULDQZrrkz 5071 +VPMULDQrm 5072 +VPMULDQrr 5073 +VPMULHRSWYrm 5074 +VPMULHRSWYrr 5075 +VPMULHRSWZ 5076 +VPMULHRSWZrm 5077 
+VPMULHRSWZrmk 5078 +VPMULHRSWZrmkz 5079 +VPMULHRSWZrr 5080 +VPMULHRSWZrrk 5081 +VPMULHRSWZrrkz 5082 +VPMULHRSWrm 5083 +VPMULHRSWrr 5084 +VPMULHUWYrm 5085 +VPMULHUWYrr 5086 +VPMULHUWZ 5087 +VPMULHUWZrm 5088 +VPMULHUWZrmk 5089 +VPMULHUWZrmkz 5090 +VPMULHUWZrr 5091 +VPMULHUWZrrk 5092 +VPMULHUWZrrkz 5093 +VPMULHUWrm 5094 +VPMULHUWrr 5095 +VPMULHWYrm 5096 +VPMULHWYrr 5097 +VPMULHWZ 5098 +VPMULHWZrm 5099 +VPMULHWZrmk 5100 +VPMULHWZrmkz 5101 +VPMULHWZrr 5102 +VPMULHWZrrk 5103 +VPMULHWZrrkz 5104 +VPMULHWrm 5105 +VPMULHWrr 5106 +VPMULLDYrm 5107 +VPMULLDYrr 5108 +VPMULLDZ 5109 +VPMULLDZrm 5110 +VPMULLDZrmb 5111 +VPMULLDZrmbk 5112 +VPMULLDZrmbkz 5113 +VPMULLDZrmk 5114 +VPMULLDZrmkz 5115 +VPMULLDZrr 5116 +VPMULLDZrrk 5117 +VPMULLDZrrkz 5118 +VPMULLDrm 5119 +VPMULLDrr 5120 +VPMULLQZ 5121 +VPMULLQZrm 5122 +VPMULLQZrmb 5123 +VPMULLQZrmbk 5124 +VPMULLQZrmbkz 5125 +VPMULLQZrmk 5126 +VPMULLQZrmkz 5127 +VPMULLQZrr 5128 +VPMULLQZrrk 5129 +VPMULLQZrrkz 5130 +VPMULLWYrm 5131 +VPMULLWYrr 5132 +VPMULLWZ 5133 +VPMULLWZrm 5134 +VPMULLWZrmk 5135 +VPMULLWZrmkz 5136 +VPMULLWZrr 5137 +VPMULLWZrrk 5138 +VPMULLWZrrkz 5139 +VPMULLWrm 5140 +VPMULLWrr 5141 +VPMULTISHIFTQBZ 5142 +VPMULTISHIFTQBZrm 5143 +VPMULTISHIFTQBZrmb 5144 +VPMULTISHIFTQBZrmbk 5145 +VPMULTISHIFTQBZrmbkz 5146 +VPMULTISHIFTQBZrmk 5147 +VPMULTISHIFTQBZrmkz 5148 +VPMULTISHIFTQBZrr 5149 +VPMULTISHIFTQBZrrk 5150 +VPMULTISHIFTQBZrrkz 5151 +VPMULUDQYrm 5152 +VPMULUDQYrr 5153 +VPMULUDQZ 5154 +VPMULUDQZrm 5155 +VPMULUDQZrmb 5156 +VPMULUDQZrmbk 5157 +VPMULUDQZrmbkz 5158 +VPMULUDQZrmk 5159 +VPMULUDQZrmkz 5160 +VPMULUDQZrr 5161 +VPMULUDQZrrk 5162 +VPMULUDQZrrkz 5163 +VPMULUDQrm 5164 +VPMULUDQrr 5165 +VPOPCNTBZ 5166 +VPOPCNTBZrm 5167 +VPOPCNTBZrmk 5168 +VPOPCNTBZrmkz 5169 +VPOPCNTBZrr 5170 +VPOPCNTBZrrk 5171 +VPOPCNTBZrrkz 5172 +VPOPCNTDZ 5173 +VPOPCNTDZrm 5174 +VPOPCNTDZrmb 5175 +VPOPCNTDZrmbk 5176 +VPOPCNTDZrmbkz 5177 +VPOPCNTDZrmk 5178 +VPOPCNTDZrmkz 5179 +VPOPCNTDZrr 5180 +VPOPCNTDZrrk 5181 +VPOPCNTDZrrkz 5182 +VPOPCNTQZ 5183 +VPOPCNTQZrm 5184 +VPOPCNTQZrmb 5185 +VPOPCNTQZrmbk 5186 +VPOPCNTQZrmbkz 5187 +VPOPCNTQZrmk 5188 +VPOPCNTQZrmkz 5189 +VPOPCNTQZrr 5190 +VPOPCNTQZrrk 5191 +VPOPCNTQZrrkz 5192 +VPOPCNTWZ 5193 +VPOPCNTWZrm 5194 +VPOPCNTWZrmk 5195 +VPOPCNTWZrmkz 5196 +VPOPCNTWZrr 5197 +VPOPCNTWZrrk 5198 +VPOPCNTWZrrkz 5199 +VPORDZ 5200 +VPORDZrm 5201 +VPORDZrmb 5202 +VPORDZrmbk 5203 +VPORDZrmbkz 5204 +VPORDZrmk 5205 +VPORDZrmkz 5206 +VPORDZrr 5207 +VPORDZrrk 5208 +VPORDZrrkz 5209 +VPORQZ 5210 +VPORQZrm 5211 +VPORQZrmb 5212 +VPORQZrmbk 5213 +VPORQZrmbkz 5214 +VPORQZrmk 5215 +VPORQZrmkz 5216 +VPORQZrr 5217 +VPORQZrrk 5218 +VPORQZrrkz 5219 +VPORYrm 5220 +VPORYrr 5221 +VPORrm 5222 +VPORrr 5223 +VPPERMrmr 5224 +VPPERMrrm 5225 +VPPERMrrr 5226 +VPPERMrrr_REV 5227 +VPROLDZ 5228 +VPROLDZmbi 5229 +VPROLDZmbik 5230 +VPROLDZmbikz 5231 +VPROLDZmi 5232 +VPROLDZmik 5233 +VPROLDZmikz 5234 +VPROLDZri 5235 +VPROLDZrik 5236 +VPROLDZrikz 5237 +VPROLQZ 5238 +VPROLQZmbi 5239 +VPROLQZmbik 5240 +VPROLQZmbikz 5241 +VPROLQZmi 5242 +VPROLQZmik 5243 +VPROLQZmikz 5244 +VPROLQZri 5245 +VPROLQZrik 5246 +VPROLQZrikz 5247 +VPROLVDZ 5248 +VPROLVDZrm 5249 +VPROLVDZrmb 5250 +VPROLVDZrmbk 5251 +VPROLVDZrmbkz 5252 +VPROLVDZrmk 5253 +VPROLVDZrmkz 5254 +VPROLVDZrr 5255 +VPROLVDZrrk 5256 +VPROLVDZrrkz 5257 +VPROLVQZ 5258 +VPROLVQZrm 5259 +VPROLVQZrmb 5260 +VPROLVQZrmbk 5261 +VPROLVQZrmbkz 5262 +VPROLVQZrmk 5263 +VPROLVQZrmkz 5264 +VPROLVQZrr 5265 +VPROLVQZrrk 5266 +VPROLVQZrrkz 5267 +VPRORDZ 5268 +VPRORDZmbi 5269 +VPRORDZmbik 5270 +VPRORDZmbikz 5271 +VPRORDZmi 5272 +VPRORDZmik 5273 +VPRORDZmikz 5274 
+VPRORDZri 5275 +VPRORDZrik 5276 +VPRORDZrikz 5277 +VPRORQZ 5278 +VPRORQZmbi 5279 +VPRORQZmbik 5280 +VPRORQZmbikz 5281 +VPRORQZmi 5282 +VPRORQZmik 5283 +VPRORQZmikz 5284 +VPRORQZri 5285 +VPRORQZrik 5286 +VPRORQZrikz 5287 +VPRORVDZ 5288 +VPRORVDZrm 5289 +VPRORVDZrmb 5290 +VPRORVDZrmbk 5291 +VPRORVDZrmbkz 5292 +VPRORVDZrmk 5293 +VPRORVDZrmkz 5294 +VPRORVDZrr 5295 +VPRORVDZrrk 5296 +VPRORVDZrrkz 5297 +VPRORVQZ 5298 +VPRORVQZrm 5299 +VPRORVQZrmb 5300 +VPRORVQZrmbk 5301 +VPRORVQZrmbkz 5302 +VPRORVQZrmk 5303 +VPRORVQZrmkz 5304 +VPRORVQZrr 5305 +VPRORVQZrrk 5306 +VPRORVQZrrkz 5307 +VPROTBmi 5308 +VPROTBmr 5309 +VPROTBri 5310 +VPROTBrm 5311 +VPROTBrr 5312 +VPROTBrr_REV 5313 +VPROTDmi 5314 +VPROTDmr 5315 +VPROTDri 5316 +VPROTDrm 5317 +VPROTDrr 5318 +VPROTDrr_REV 5319 +VPROTQmi 5320 +VPROTQmr 5321 +VPROTQri 5322 +VPROTQrm 5323 +VPROTQrr 5324 +VPROTQrr_REV 5325 +VPROTWmi 5326 +VPROTWmr 5327 +VPROTWri 5328 +VPROTWrm 5329 +VPROTWrr 5330 +VPROTWrr_REV 5331 +VPSADBWYrm 5332 +VPSADBWYrr 5333 +VPSADBWZ 5334 +VPSADBWZrm 5335 +VPSADBWZrr 5336 +VPSADBWrm 5337 +VPSADBWrr 5338 +VPSCATTERDDZ 5339 +VPSCATTERDDZmr 5340 +VPSCATTERDQZ 5341 +VPSCATTERDQZmr 5342 +VPSCATTERQDZ 5343 +VPSCATTERQDZmr 5344 +VPSCATTERQQZ 5345 +VPSCATTERQQZmr 5346 +VPSHABmr 5347 +VPSHABrm 5348 +VPSHABrr 5349 +VPSHABrr_REV 5350 +VPSHADmr 5351 +VPSHADrm 5352 +VPSHADrr 5353 +VPSHADrr_REV 5354 +VPSHAQmr 5355 +VPSHAQrm 5356 +VPSHAQrr 5357 +VPSHAQrr_REV 5358 +VPSHAWmr 5359 +VPSHAWrm 5360 +VPSHAWrr 5361 +VPSHAWrr_REV 5362 +VPSHLBmr 5363 +VPSHLBrm 5364 +VPSHLBrr 5365 +VPSHLBrr_REV 5366 +VPSHLDDZ 5367 +VPSHLDDZrmbi 5368 +VPSHLDDZrmbik 5369 +VPSHLDDZrmbikz 5370 +VPSHLDDZrmi 5371 +VPSHLDDZrmik 5372 +VPSHLDDZrmikz 5373 +VPSHLDDZrri 5374 +VPSHLDDZrrik 5375 +VPSHLDDZrrikz 5376 +VPSHLDQZ 5377 +VPSHLDQZrmbi 5378 +VPSHLDQZrmbik 5379 +VPSHLDQZrmbikz 5380 +VPSHLDQZrmi 5381 +VPSHLDQZrmik 5382 +VPSHLDQZrmikz 5383 +VPSHLDQZrri 5384 +VPSHLDQZrrik 5385 +VPSHLDQZrrikz 5386 +VPSHLDVDZ 5387 +VPSHLDVDZm 5388 +VPSHLDVDZmb 5389 +VPSHLDVDZmbk 5390 +VPSHLDVDZmbkz 5391 +VPSHLDVDZmk 5392 +VPSHLDVDZmkz 5393 +VPSHLDVDZr 5394 +VPSHLDVDZrk 5395 +VPSHLDVDZrkz 5396 +VPSHLDVQZ 5397 +VPSHLDVQZm 5398 +VPSHLDVQZmb 5399 +VPSHLDVQZmbk 5400 +VPSHLDVQZmbkz 5401 +VPSHLDVQZmk 5402 +VPSHLDVQZmkz 5403 +VPSHLDVQZr 5404 +VPSHLDVQZrk 5405 +VPSHLDVQZrkz 5406 +VPSHLDVWZ 5407 +VPSHLDVWZm 5408 +VPSHLDVWZmk 5409 +VPSHLDVWZmkz 5410 +VPSHLDVWZr 5411 +VPSHLDVWZrk 5412 +VPSHLDVWZrkz 5413 +VPSHLDWZ 5414 +VPSHLDWZrmi 5415 +VPSHLDWZrmik 5416 +VPSHLDWZrmikz 5417 +VPSHLDWZrri 5418 +VPSHLDWZrrik 5419 +VPSHLDWZrrikz 5420 +VPSHLDmr 5421 +VPSHLDrm 5422 +VPSHLDrr 5423 +VPSHLDrr_REV 5424 +VPSHLQmr 5425 +VPSHLQrm 5426 +VPSHLQrr 5427 +VPSHLQrr_REV 5428 +VPSHLWmr 5429 +VPSHLWrm 5430 +VPSHLWrr 5431 +VPSHLWrr_REV 5432 +VPSHRDDZ 5433 +VPSHRDDZrmbi 5434 +VPSHRDDZrmbik 5435 +VPSHRDDZrmbikz 5436 +VPSHRDDZrmi 5437 +VPSHRDDZrmik 5438 +VPSHRDDZrmikz 5439 +VPSHRDDZrri 5440 +VPSHRDDZrrik 5441 +VPSHRDDZrrikz 5442 +VPSHRDQZ 5443 +VPSHRDQZrmbi 5444 +VPSHRDQZrmbik 5445 +VPSHRDQZrmbikz 5446 +VPSHRDQZrmi 5447 +VPSHRDQZrmik 5448 +VPSHRDQZrmikz 5449 +VPSHRDQZrri 5450 +VPSHRDQZrrik 5451 +VPSHRDQZrrikz 5452 +VPSHRDVDZ 5453 +VPSHRDVDZm 5454 +VPSHRDVDZmb 5455 +VPSHRDVDZmbk 5456 +VPSHRDVDZmbkz 5457 +VPSHRDVDZmk 5458 +VPSHRDVDZmkz 5459 +VPSHRDVDZr 5460 +VPSHRDVDZrk 5461 +VPSHRDVDZrkz 5462 +VPSHRDVQZ 5463 +VPSHRDVQZm 5464 +VPSHRDVQZmb 5465 +VPSHRDVQZmbk 5466 +VPSHRDVQZmbkz 5467 +VPSHRDVQZmk 5468 +VPSHRDVQZmkz 5469 +VPSHRDVQZr 5470 +VPSHRDVQZrk 5471 +VPSHRDVQZrkz 5472 +VPSHRDVWZ 5473 +VPSHRDVWZm 5474 +VPSHRDVWZmk 5475 +VPSHRDVWZmkz 5476 
+VPSHRDVWZr 5477 +VPSHRDVWZrk 5478 +VPSHRDVWZrkz 5479 +VPSHRDWZ 5480 +VPSHRDWZrmi 5481 +VPSHRDWZrmik 5482 +VPSHRDWZrmikz 5483 +VPSHRDWZrri 5484 +VPSHRDWZrrik 5485 +VPSHRDWZrrikz 5486 +VPSHUFBITQMBZ 5487 +VPSHUFBITQMBZrm 5488 +VPSHUFBITQMBZrmk 5489 +VPSHUFBITQMBZrr 5490 +VPSHUFBITQMBZrrk 5491 +VPSHUFBYrm 5492 +VPSHUFBYrr 5493 +VPSHUFBZ 5494 +VPSHUFBZrm 5495 +VPSHUFBZrmk 5496 +VPSHUFBZrmkz 5497 +VPSHUFBZrr 5498 +VPSHUFBZrrk 5499 +VPSHUFBZrrkz 5500 +VPSHUFBrm 5501 +VPSHUFBrr 5502 +VPSHUFDYmi 5503 +VPSHUFDYri 5504 +VPSHUFDZ 5505 +VPSHUFDZmbi 5506 +VPSHUFDZmbik 5507 +VPSHUFDZmbikz 5508 +VPSHUFDZmi 5509 +VPSHUFDZmik 5510 +VPSHUFDZmikz 5511 +VPSHUFDZri 5512 +VPSHUFDZrik 5513 +VPSHUFDZrikz 5514 +VPSHUFDmi 5515 +VPSHUFDri 5516 +VPSHUFHWYmi 5517 +VPSHUFHWYri 5518 +VPSHUFHWZ 5519 +VPSHUFHWZmi 5520 +VPSHUFHWZmik 5521 +VPSHUFHWZmikz 5522 +VPSHUFHWZri 5523 +VPSHUFHWZrik 5524 +VPSHUFHWZrikz 5525 +VPSHUFHWmi 5526 +VPSHUFHWri 5527 +VPSHUFLWYmi 5528 +VPSHUFLWYri 5529 +VPSHUFLWZ 5530 +VPSHUFLWZmi 5531 +VPSHUFLWZmik 5532 +VPSHUFLWZmikz 5533 +VPSHUFLWZri 5534 +VPSHUFLWZrik 5535 +VPSHUFLWZrikz 5536 +VPSHUFLWmi 5537 +VPSHUFLWri 5538 +VPSIGNBYrm 5539 +VPSIGNBYrr 5540 +VPSIGNBrm 5541 +VPSIGNBrr 5542 +VPSIGNDYrm 5543 +VPSIGNDYrr 5544 +VPSIGNDrm 5545 +VPSIGNDrr 5546 +VPSIGNWYrm 5547 +VPSIGNWYrr 5548 +VPSIGNWrm 5549 +VPSIGNWrr 5550 +VPSLLDQYri 5551 +VPSLLDQZ 5552 +VPSLLDQZmi 5553 +VPSLLDQZri 5554 +VPSLLDQri 5555 +VPSLLDYri 5556 +VPSLLDYrm 5557 +VPSLLDYrr 5558 +VPSLLDZ 5559 +VPSLLDZmbi 5560 +VPSLLDZmbik 5561 +VPSLLDZmbikz 5562 +VPSLLDZmi 5563 +VPSLLDZmik 5564 +VPSLLDZmikz 5565 +VPSLLDZri 5566 +VPSLLDZrik 5567 +VPSLLDZrikz 5568 +VPSLLDZrm 5569 +VPSLLDZrmk 5570 +VPSLLDZrmkz 5571 +VPSLLDZrr 5572 +VPSLLDZrrk 5573 +VPSLLDZrrkz 5574 +VPSLLDri 5575 +VPSLLDrm 5576 +VPSLLDrr 5577 +VPSLLQYri 5578 +VPSLLQYrm 5579 +VPSLLQYrr 5580 +VPSLLQZ 5581 +VPSLLQZmbi 5582 +VPSLLQZmbik 5583 +VPSLLQZmbikz 5584 +VPSLLQZmi 5585 +VPSLLQZmik 5586 +VPSLLQZmikz 5587 +VPSLLQZri 5588 +VPSLLQZrik 5589 +VPSLLQZrikz 5590 +VPSLLQZrm 5591 +VPSLLQZrmk 5592 +VPSLLQZrmkz 5593 +VPSLLQZrr 5594 +VPSLLQZrrk 5595 +VPSLLQZrrkz 5596 +VPSLLQri 5597 +VPSLLQrm 5598 +VPSLLQrr 5599 +VPSLLVDYrm 5600 +VPSLLVDYrr 5601 +VPSLLVDZ 5602 +VPSLLVDZrm 5603 +VPSLLVDZrmb 5604 +VPSLLVDZrmbk 5605 +VPSLLVDZrmbkz 5606 +VPSLLVDZrmk 5607 +VPSLLVDZrmkz 5608 +VPSLLVDZrr 5609 +VPSLLVDZrrk 5610 +VPSLLVDZrrkz 5611 +VPSLLVDrm 5612 +VPSLLVDrr 5613 +VPSLLVQYrm 5614 +VPSLLVQYrr 5615 +VPSLLVQZ 5616 +VPSLLVQZrm 5617 +VPSLLVQZrmb 5618 +VPSLLVQZrmbk 5619 +VPSLLVQZrmbkz 5620 +VPSLLVQZrmk 5621 +VPSLLVQZrmkz 5622 +VPSLLVQZrr 5623 +VPSLLVQZrrk 5624 +VPSLLVQZrrkz 5625 +VPSLLVQrm 5626 +VPSLLVQrr 5627 +VPSLLVWZ 5628 +VPSLLVWZrm 5629 +VPSLLVWZrmk 5630 +VPSLLVWZrmkz 5631 +VPSLLVWZrr 5632 +VPSLLVWZrrk 5633 +VPSLLVWZrrkz 5634 +VPSLLWYri 5635 +VPSLLWYrm 5636 +VPSLLWYrr 5637 +VPSLLWZ 5638 +VPSLLWZmi 5639 +VPSLLWZmik 5640 +VPSLLWZmikz 5641 +VPSLLWZri 5642 +VPSLLWZrik 5643 +VPSLLWZrikz 5644 +VPSLLWZrm 5645 +VPSLLWZrmk 5646 +VPSLLWZrmkz 5647 +VPSLLWZrr 5648 +VPSLLWZrrk 5649 +VPSLLWZrrkz 5650 +VPSLLWri 5651 +VPSLLWrm 5652 +VPSLLWrr 5653 +VPSRADYri 5654 +VPSRADYrm 5655 +VPSRADYrr 5656 +VPSRADZ 5657 +VPSRADZmbi 5658 +VPSRADZmbik 5659 +VPSRADZmbikz 5660 +VPSRADZmi 5661 +VPSRADZmik 5662 +VPSRADZmikz 5663 +VPSRADZri 5664 +VPSRADZrik 5665 +VPSRADZrikz 5666 +VPSRADZrm 5667 +VPSRADZrmk 5668 +VPSRADZrmkz 5669 +VPSRADZrr 5670 +VPSRADZrrk 5671 +VPSRADZrrkz 5672 +VPSRADri 5673 +VPSRADrm 5674 +VPSRADrr 5675 +VPSRAQZ 5676 +VPSRAQZmbi 5677 +VPSRAQZmbik 5678 +VPSRAQZmbikz 5679 +VPSRAQZmi 5680 +VPSRAQZmik 5681 +VPSRAQZmikz 5682 
+VPSRAQZri 5683 +VPSRAQZrik 5684 +VPSRAQZrikz 5685 +VPSRAQZrm 5686 +VPSRAQZrmk 5687 +VPSRAQZrmkz 5688 +VPSRAQZrr 5689 +VPSRAQZrrk 5690 +VPSRAQZrrkz 5691 +VPSRAVDYrm 5692 +VPSRAVDYrr 5693 +VPSRAVDZ 5694 +VPSRAVDZrm 5695 +VPSRAVDZrmb 5696 +VPSRAVDZrmbk 5697 +VPSRAVDZrmbkz 5698 +VPSRAVDZrmk 5699 +VPSRAVDZrmkz 5700 +VPSRAVDZrr 5701 +VPSRAVDZrrk 5702 +VPSRAVDZrrkz 5703 +VPSRAVDrm 5704 +VPSRAVDrr 5705 +VPSRAVQZ 5706 +VPSRAVQZrm 5707 +VPSRAVQZrmb 5708 +VPSRAVQZrmbk 5709 +VPSRAVQZrmbkz 5710 +VPSRAVQZrmk 5711 +VPSRAVQZrmkz 5712 +VPSRAVQZrr 5713 +VPSRAVQZrrk 5714 +VPSRAVQZrrkz 5715 +VPSRAVWZ 5716 +VPSRAVWZrm 5717 +VPSRAVWZrmk 5718 +VPSRAVWZrmkz 5719 +VPSRAVWZrr 5720 +VPSRAVWZrrk 5721 +VPSRAVWZrrkz 5722 +VPSRAWYri 5723 +VPSRAWYrm 5724 +VPSRAWYrr 5725 +VPSRAWZ 5726 +VPSRAWZmi 5727 +VPSRAWZmik 5728 +VPSRAWZmikz 5729 +VPSRAWZri 5730 +VPSRAWZrik 5731 +VPSRAWZrikz 5732 +VPSRAWZrm 5733 +VPSRAWZrmk 5734 +VPSRAWZrmkz 5735 +VPSRAWZrr 5736 +VPSRAWZrrk 5737 +VPSRAWZrrkz 5738 +VPSRAWri 5739 +VPSRAWrm 5740 +VPSRAWrr 5741 +VPSRLDQYri 5742 +VPSRLDQZ 5743 +VPSRLDQZmi 5744 +VPSRLDQZri 5745 +VPSRLDQri 5746 +VPSRLDYri 5747 +VPSRLDYrm 5748 +VPSRLDYrr 5749 +VPSRLDZ 5750 +VPSRLDZmbi 5751 +VPSRLDZmbik 5752 +VPSRLDZmbikz 5753 +VPSRLDZmi 5754 +VPSRLDZmik 5755 +VPSRLDZmikz 5756 +VPSRLDZri 5757 +VPSRLDZrik 5758 +VPSRLDZrikz 5759 +VPSRLDZrm 5760 +VPSRLDZrmk 5761 +VPSRLDZrmkz 5762 +VPSRLDZrr 5763 +VPSRLDZrrk 5764 +VPSRLDZrrkz 5765 +VPSRLDri 5766 +VPSRLDrm 5767 +VPSRLDrr 5768 +VPSRLQYri 5769 +VPSRLQYrm 5770 +VPSRLQYrr 5771 +VPSRLQZ 5772 +VPSRLQZmbi 5773 +VPSRLQZmbik 5774 +VPSRLQZmbikz 5775 +VPSRLQZmi 5776 +VPSRLQZmik 5777 +VPSRLQZmikz 5778 +VPSRLQZri 5779 +VPSRLQZrik 5780 +VPSRLQZrikz 5781 +VPSRLQZrm 5782 +VPSRLQZrmk 5783 +VPSRLQZrmkz 5784 +VPSRLQZrr 5785 +VPSRLQZrrk 5786 +VPSRLQZrrkz 5787 +VPSRLQri 5788 +VPSRLQrm 5789 +VPSRLQrr 5790 +VPSRLVDYrm 5791 +VPSRLVDYrr 5792 +VPSRLVDZ 5793 +VPSRLVDZrm 5794 +VPSRLVDZrmb 5795 +VPSRLVDZrmbk 5796 +VPSRLVDZrmbkz 5797 +VPSRLVDZrmk 5798 +VPSRLVDZrmkz 5799 +VPSRLVDZrr 5800 +VPSRLVDZrrk 5801 +VPSRLVDZrrkz 5802 +VPSRLVDrm 5803 +VPSRLVDrr 5804 +VPSRLVQYrm 5805 +VPSRLVQYrr 5806 +VPSRLVQZ 5807 +VPSRLVQZrm 5808 +VPSRLVQZrmb 5809 +VPSRLVQZrmbk 5810 +VPSRLVQZrmbkz 5811 +VPSRLVQZrmk 5812 +VPSRLVQZrmkz 5813 +VPSRLVQZrr 5814 +VPSRLVQZrrk 5815 +VPSRLVQZrrkz 5816 +VPSRLVQrm 5817 +VPSRLVQrr 5818 +VPSRLVWZ 5819 +VPSRLVWZrm 5820 +VPSRLVWZrmk 5821 +VPSRLVWZrmkz 5822 +VPSRLVWZrr 5823 +VPSRLVWZrrk 5824 +VPSRLVWZrrkz 5825 +VPSRLWYri 5826 +VPSRLWYrm 5827 +VPSRLWYrr 5828 +VPSRLWZ 5829 +VPSRLWZmi 5830 +VPSRLWZmik 5831 +VPSRLWZmikz 5832 +VPSRLWZri 5833 +VPSRLWZrik 5834 +VPSRLWZrikz 5835 +VPSRLWZrm 5836 +VPSRLWZrmk 5837 +VPSRLWZrmkz 5838 +VPSRLWZrr 5839 +VPSRLWZrrk 5840 +VPSRLWZrrkz 5841 +VPSRLWri 5842 +VPSRLWrm 5843 +VPSRLWrr 5844 +VPSUBBYrm 5845 +VPSUBBYrr 5846 +VPSUBBZ 5847 +VPSUBBZrm 5848 +VPSUBBZrmk 5849 +VPSUBBZrmkz 5850 +VPSUBBZrr 5851 +VPSUBBZrrk 5852 +VPSUBBZrrkz 5853 +VPSUBBrm 5854 +VPSUBBrr 5855 +VPSUBDYrm 5856 +VPSUBDYrr 5857 +VPSUBDZ 5858 +VPSUBDZrm 5859 +VPSUBDZrmb 5860 +VPSUBDZrmbk 5861 +VPSUBDZrmbkz 5862 +VPSUBDZrmk 5863 +VPSUBDZrmkz 5864 +VPSUBDZrr 5865 +VPSUBDZrrk 5866 +VPSUBDZrrkz 5867 +VPSUBDrm 5868 +VPSUBDrr 5869 +VPSUBQYrm 5870 +VPSUBQYrr 5871 +VPSUBQZ 5872 +VPSUBQZrm 5873 +VPSUBQZrmb 5874 +VPSUBQZrmbk 5875 +VPSUBQZrmbkz 5876 +VPSUBQZrmk 5877 +VPSUBQZrmkz 5878 +VPSUBQZrr 5879 +VPSUBQZrrk 5880 +VPSUBQZrrkz 5881 +VPSUBQrm 5882 +VPSUBQrr 5883 +VPSUBSBYrm 5884 +VPSUBSBYrr 5885 +VPSUBSBZ 5886 +VPSUBSBZrm 5887 +VPSUBSBZrmk 5888 +VPSUBSBZrmkz 5889 +VPSUBSBZrr 5890 +VPSUBSBZrrk 5891 +VPSUBSBZrrkz 5892 
+VPSUBSBrm 5893 +VPSUBSBrr 5894 +VPSUBSWYrm 5895 +VPSUBSWYrr 5896 +VPSUBSWZ 5897 +VPSUBSWZrm 5898 +VPSUBSWZrmk 5899 +VPSUBSWZrmkz 5900 +VPSUBSWZrr 5901 +VPSUBSWZrrk 5902 +VPSUBSWZrrkz 5903 +VPSUBSWrm 5904 +VPSUBSWrr 5905 +VPSUBUSBYrm 5906 +VPSUBUSBYrr 5907 +VPSUBUSBZ 5908 +VPSUBUSBZrm 5909 +VPSUBUSBZrmk 5910 +VPSUBUSBZrmkz 5911 +VPSUBUSBZrr 5912 +VPSUBUSBZrrk 5913 +VPSUBUSBZrrkz 5914 +VPSUBUSBrm 5915 +VPSUBUSBrr 5916 +VPSUBUSWYrm 5917 +VPSUBUSWYrr 5918 +VPSUBUSWZ 5919 +VPSUBUSWZrm 5920 +VPSUBUSWZrmk 5921 +VPSUBUSWZrmkz 5922 +VPSUBUSWZrr 5923 +VPSUBUSWZrrk 5924 +VPSUBUSWZrrkz 5925 +VPSUBUSWrm 5926 +VPSUBUSWrr 5927 +VPSUBWYrm 5928 +VPSUBWYrr 5929 +VPSUBWZ 5930 +VPSUBWZrm 5931 +VPSUBWZrmk 5932 +VPSUBWZrmkz 5933 +VPSUBWZrr 5934 +VPSUBWZrrk 5935 +VPSUBWZrrkz 5936 +VPSUBWrm 5937 +VPSUBWrr 5938 +VPTERNLOGDZ 5939 +VPTERNLOGDZrmbi 5940 +VPTERNLOGDZrmbik 5941 +VPTERNLOGDZrmbikz 5942 +VPTERNLOGDZrmi 5943 +VPTERNLOGDZrmik 5944 +VPTERNLOGDZrmikz 5945 +VPTERNLOGDZrri 5946 +VPTERNLOGDZrrik 5947 +VPTERNLOGDZrrikz 5948 +VPTERNLOGQZ 5949 +VPTERNLOGQZrmbi 5950 +VPTERNLOGQZrmbik 5951 +VPTERNLOGQZrmbikz 5952 +VPTERNLOGQZrmi 5953 +VPTERNLOGQZrmik 5954 +VPTERNLOGQZrmikz 5955 +VPTERNLOGQZrri 5956 +VPTERNLOGQZrrik 5957 +VPTERNLOGQZrrikz 5958 +VPTESTMBZ 5959 +VPTESTMBZrm 5960 +VPTESTMBZrmk 5961 +VPTESTMBZrr 5962 +VPTESTMBZrrk 5963 +VPTESTMDZ 5964 +VPTESTMDZrm 5965 +VPTESTMDZrmb 5966 +VPTESTMDZrmbk 5967 +VPTESTMDZrmk 5968 +VPTESTMDZrr 5969 +VPTESTMDZrrk 5970 +VPTESTMQZ 5971 +VPTESTMQZrm 5972 +VPTESTMQZrmb 5973 +VPTESTMQZrmbk 5974 +VPTESTMQZrmk 5975 +VPTESTMQZrr 5976 +VPTESTMQZrrk 5977 +VPTESTMWZ 5978 +VPTESTMWZrm 5979 +VPTESTMWZrmk 5980 +VPTESTMWZrr 5981 +VPTESTMWZrrk 5982 +VPTESTNMBZ 5983 +VPTESTNMBZrm 5984 +VPTESTNMBZrmk 5985 +VPTESTNMBZrr 5986 +VPTESTNMBZrrk 5987 +VPTESTNMDZ 5988 +VPTESTNMDZrm 5989 +VPTESTNMDZrmb 5990 +VPTESTNMDZrmbk 5991 +VPTESTNMDZrmk 5992 +VPTESTNMDZrr 5993 +VPTESTNMDZrrk 5994 +VPTESTNMQZ 5995 +VPTESTNMQZrm 5996 +VPTESTNMQZrmb 5997 +VPTESTNMQZrmbk 5998 +VPTESTNMQZrmk 5999 +VPTESTNMQZrr 6000 +VPTESTNMQZrrk 6001 +VPTESTNMWZ 6002 +VPTESTNMWZrm 6003 +VPTESTNMWZrmk 6004 +VPTESTNMWZrr 6005 +VPTESTNMWZrrk 6006 +VPTESTYrm 6007 +VPTESTYrr 6008 +VPTESTrm 6009 +VPTESTrr 6010 +VPUNPCKHBWYrm 6011 +VPUNPCKHBWYrr 6012 +VPUNPCKHBWZ 6013 +VPUNPCKHBWZrm 6014 +VPUNPCKHBWZrmk 6015 +VPUNPCKHBWZrmkz 6016 +VPUNPCKHBWZrr 6017 +VPUNPCKHBWZrrk 6018 +VPUNPCKHBWZrrkz 6019 +VPUNPCKHBWrm 6020 +VPUNPCKHBWrr 6021 +VPUNPCKHDQYrm 6022 +VPUNPCKHDQYrr 6023 +VPUNPCKHDQZ 6024 +VPUNPCKHDQZrm 6025 +VPUNPCKHDQZrmb 6026 +VPUNPCKHDQZrmbk 6027 +VPUNPCKHDQZrmbkz 6028 +VPUNPCKHDQZrmk 6029 +VPUNPCKHDQZrmkz 6030 +VPUNPCKHDQZrr 6031 +VPUNPCKHDQZrrk 6032 +VPUNPCKHDQZrrkz 6033 +VPUNPCKHDQrm 6034 +VPUNPCKHDQrr 6035 +VPUNPCKHQDQYrm 6036 +VPUNPCKHQDQYrr 6037 +VPUNPCKHQDQZ 6038 +VPUNPCKHQDQZrm 6039 +VPUNPCKHQDQZrmb 6040 +VPUNPCKHQDQZrmbk 6041 +VPUNPCKHQDQZrmbkz 6042 +VPUNPCKHQDQZrmk 6043 +VPUNPCKHQDQZrmkz 6044 +VPUNPCKHQDQZrr 6045 +VPUNPCKHQDQZrrk 6046 +VPUNPCKHQDQZrrkz 6047 +VPUNPCKHQDQrm 6048 +VPUNPCKHQDQrr 6049 +VPUNPCKHWDYrm 6050 +VPUNPCKHWDYrr 6051 +VPUNPCKHWDZ 6052 +VPUNPCKHWDZrm 6053 +VPUNPCKHWDZrmk 6054 +VPUNPCKHWDZrmkz 6055 +VPUNPCKHWDZrr 6056 +VPUNPCKHWDZrrk 6057 +VPUNPCKHWDZrrkz 6058 +VPUNPCKHWDrm 6059 +VPUNPCKHWDrr 6060 +VPUNPCKLBWYrm 6061 +VPUNPCKLBWYrr 6062 +VPUNPCKLBWZ 6063 +VPUNPCKLBWZrm 6064 +VPUNPCKLBWZrmk 6065 +VPUNPCKLBWZrmkz 6066 +VPUNPCKLBWZrr 6067 +VPUNPCKLBWZrrk 6068 +VPUNPCKLBWZrrkz 6069 +VPUNPCKLBWrm 6070 +VPUNPCKLBWrr 6071 +VPUNPCKLDQYrm 6072 +VPUNPCKLDQYrr 6073 +VPUNPCKLDQZ 6074 +VPUNPCKLDQZrm 6075 +VPUNPCKLDQZrmb 6076 
+VPUNPCKLDQZrmbk 6077 +VPUNPCKLDQZrmbkz 6078 +VPUNPCKLDQZrmk 6079 +VPUNPCKLDQZrmkz 6080 +VPUNPCKLDQZrr 6081 +VPUNPCKLDQZrrk 6082 +VPUNPCKLDQZrrkz 6083 +VPUNPCKLDQrm 6084 +VPUNPCKLDQrr 6085 +VPUNPCKLQDQYrm 6086 +VPUNPCKLQDQYrr 6087 +VPUNPCKLQDQZ 6088 +VPUNPCKLQDQZrm 6089 +VPUNPCKLQDQZrmb 6090 +VPUNPCKLQDQZrmbk 6091 +VPUNPCKLQDQZrmbkz 6092 +VPUNPCKLQDQZrmk 6093 +VPUNPCKLQDQZrmkz 6094 +VPUNPCKLQDQZrr 6095 +VPUNPCKLQDQZrrk 6096 +VPUNPCKLQDQZrrkz 6097 +VPUNPCKLQDQrm 6098 +VPUNPCKLQDQrr 6099 +VPUNPCKLWDYrm 6100 +VPUNPCKLWDYrr 6101 +VPUNPCKLWDZ 6102 +VPUNPCKLWDZrm 6103 +VPUNPCKLWDZrmk 6104 +VPUNPCKLWDZrmkz 6105 +VPUNPCKLWDZrr 6106 +VPUNPCKLWDZrrk 6107 +VPUNPCKLWDZrrkz 6108 +VPUNPCKLWDrm 6109 +VPUNPCKLWDrr 6110 +VPXORDZ 6111 +VPXORDZrm 6112 +VPXORDZrmb 6113 +VPXORDZrmbk 6114 +VPXORDZrmbkz 6115 +VPXORDZrmk 6116 +VPXORDZrmkz 6117 +VPXORDZrr 6118 +VPXORDZrrk 6119 +VPXORDZrrkz 6120 +VPXORQZ 6121 +VPXORQZrm 6122 +VPXORQZrmb 6123 +VPXORQZrmbk 6124 +VPXORQZrmbkz 6125 +VPXORQZrmk 6126 +VPXORQZrmkz 6127 +VPXORQZrr 6128 +VPXORQZrrk 6129 +VPXORQZrrkz 6130 +VPXORYrm 6131 +VPXORYrr 6132 +VPXORrm 6133 +VPXORrr 6134 +VRANGEPDZ 6135 +VRANGEPDZrmbi 6136 +VRANGEPDZrmbik 6137 +VRANGEPDZrmbikz 6138 +VRANGEPDZrmi 6139 +VRANGEPDZrmik 6140 +VRANGEPDZrmikz 6141 +VRANGEPDZrri 6142 +VRANGEPDZrrib 6143 +VRANGEPDZrribk 6144 +VRANGEPDZrribkz 6145 +VRANGEPDZrrik 6146 +VRANGEPDZrrikz 6147 +VRANGEPSZ 6148 +VRANGEPSZrmbi 6149 +VRANGEPSZrmbik 6150 +VRANGEPSZrmbikz 6151 +VRANGEPSZrmi 6152 +VRANGEPSZrmik 6153 +VRANGEPSZrmikz 6154 +VRANGEPSZrri 6155 +VRANGEPSZrrib 6156 +VRANGEPSZrribk 6157 +VRANGEPSZrribkz 6158 +VRANGEPSZrrik 6159 +VRANGEPSZrrikz 6160 +VRANGESDZrmi 6161 +VRANGESDZrmik 6162 +VRANGESDZrmikz 6163 +VRANGESDZrri 6164 +VRANGESDZrrib 6165 +VRANGESDZrribk 6166 +VRANGESDZrribkz 6167 +VRANGESDZrrik 6168 +VRANGESDZrrikz 6169 +VRANGESSZrmi 6170 +VRANGESSZrmik 6171 +VRANGESSZrmikz 6172 +VRANGESSZrri 6173 +VRANGESSZrrib 6174 +VRANGESSZrribk 6175 +VRANGESSZrribkz 6176 +VRANGESSZrrik 6177 +VRANGESSZrrikz 6178 +VRCP 6179 +VRCPBF 6180 +VRCPPHZ 6181 +VRCPPHZm 6182 +VRCPPHZmb 6183 +VRCPPHZmbk 6184 +VRCPPHZmbkz 6185 +VRCPPHZmk 6186 +VRCPPHZmkz 6187 +VRCPPHZr 6188 +VRCPPHZrk 6189 +VRCPPHZrkz 6190 +VRCPPSYm 6191 +VRCPPSYr 6192 +VRCPPSm 6193 +VRCPPSr 6194 +VRCPSHZrm 6195 +VRCPSHZrmk 6196 +VRCPSHZrmkz 6197 +VRCPSHZrr 6198 +VRCPSHZrrk 6199 +VRCPSHZrrkz 6200 +VRCPSSm 6201 +VRCPSSm_Int 6202 +VRCPSSr 6203 +VRCPSSr_Int 6204 +VREDUCEBF 6205 +VREDUCEPDZ 6206 +VREDUCEPDZrmbi 6207 +VREDUCEPDZrmbik 6208 +VREDUCEPDZrmbikz 6209 +VREDUCEPDZrmi 6210 +VREDUCEPDZrmik 6211 +VREDUCEPDZrmikz 6212 +VREDUCEPDZrri 6213 +VREDUCEPDZrrib 6214 +VREDUCEPDZrribk 6215 +VREDUCEPDZrribkz 6216 +VREDUCEPDZrrik 6217 +VREDUCEPDZrrikz 6218 +VREDUCEPHZ 6219 +VREDUCEPHZrmbi 6220 +VREDUCEPHZrmbik 6221 +VREDUCEPHZrmbikz 6222 +VREDUCEPHZrmi 6223 +VREDUCEPHZrmik 6224 +VREDUCEPHZrmikz 6225 +VREDUCEPHZrri 6226 +VREDUCEPHZrrib 6227 +VREDUCEPHZrribk 6228 +VREDUCEPHZrribkz 6229 +VREDUCEPHZrrik 6230 +VREDUCEPHZrrikz 6231 +VREDUCEPSZ 6232 +VREDUCEPSZrmbi 6233 +VREDUCEPSZrmbik 6234 +VREDUCEPSZrmbikz 6235 +VREDUCEPSZrmi 6236 +VREDUCEPSZrmik 6237 +VREDUCEPSZrmikz 6238 +VREDUCEPSZrri 6239 +VREDUCEPSZrrib 6240 +VREDUCEPSZrribk 6241 +VREDUCEPSZrribkz 6242 +VREDUCEPSZrrik 6243 +VREDUCEPSZrrikz 6244 +VREDUCESDZrmi 6245 +VREDUCESDZrmik 6246 +VREDUCESDZrmikz 6247 +VREDUCESDZrri 6248 +VREDUCESDZrrib 6249 +VREDUCESDZrribk 6250 +VREDUCESDZrribkz 6251 +VREDUCESDZrrik 6252 +VREDUCESDZrrikz 6253 +VREDUCESHZrmi 6254 +VREDUCESHZrmik 6255 +VREDUCESHZrmikz 6256 +VREDUCESHZrri 6257 +VREDUCESHZrrib 6258 
+VREDUCESHZrribk 6259 +VREDUCESHZrribkz 6260 +VREDUCESHZrrik 6261 +VREDUCESHZrrikz 6262 +VREDUCESSZrmi 6263 +VREDUCESSZrmik 6264 +VREDUCESSZrmikz 6265 +VREDUCESSZrri 6266 +VREDUCESSZrrib 6267 +VREDUCESSZrribk 6268 +VREDUCESSZrribkz 6269 +VREDUCESSZrrik 6270 +VREDUCESSZrrikz 6271 +VRNDSCALEBF 6272 +VRNDSCALEPDZ 6273 +VRNDSCALEPDZrmbi 6274 +VRNDSCALEPDZrmbik 6275 +VRNDSCALEPDZrmbikz 6276 +VRNDSCALEPDZrmi 6277 +VRNDSCALEPDZrmik 6278 +VRNDSCALEPDZrmikz 6279 +VRNDSCALEPDZrri 6280 +VRNDSCALEPDZrrib 6281 +VRNDSCALEPDZrribk 6282 +VRNDSCALEPDZrribkz 6283 +VRNDSCALEPDZrrik 6284 +VRNDSCALEPDZrrikz 6285 +VRNDSCALEPHZ 6286 +VRNDSCALEPHZrmbi 6287 +VRNDSCALEPHZrmbik 6288 +VRNDSCALEPHZrmbikz 6289 +VRNDSCALEPHZrmi 6290 +VRNDSCALEPHZrmik 6291 +VRNDSCALEPHZrmikz 6292 +VRNDSCALEPHZrri 6293 +VRNDSCALEPHZrrib 6294 +VRNDSCALEPHZrribk 6295 +VRNDSCALEPHZrribkz 6296 +VRNDSCALEPHZrrik 6297 +VRNDSCALEPHZrrikz 6298 +VRNDSCALEPSZ 6299 +VRNDSCALEPSZrmbi 6300 +VRNDSCALEPSZrmbik 6301 +VRNDSCALEPSZrmbikz 6302 +VRNDSCALEPSZrmi 6303 +VRNDSCALEPSZrmik 6304 +VRNDSCALEPSZrmikz 6305 +VRNDSCALEPSZrri 6306 +VRNDSCALEPSZrrib 6307 +VRNDSCALEPSZrribk 6308 +VRNDSCALEPSZrribkz 6309 +VRNDSCALEPSZrrik 6310 +VRNDSCALEPSZrrikz 6311 +VRNDSCALESDZrmi 6312 +VRNDSCALESDZrmi_Int 6313 +VRNDSCALESDZrmik_Int 6314 +VRNDSCALESDZrmikz_Int 6315 +VRNDSCALESDZrri 6316 +VRNDSCALESDZrri_Int 6317 +VRNDSCALESDZrrib_Int 6318 +VRNDSCALESDZrribk_Int 6319 +VRNDSCALESDZrribkz_Int 6320 +VRNDSCALESDZrrik_Int 6321 +VRNDSCALESDZrrikz_Int 6322 +VRNDSCALESHZrmi 6323 +VRNDSCALESHZrmi_Int 6324 +VRNDSCALESHZrmik_Int 6325 +VRNDSCALESHZrmikz_Int 6326 +VRNDSCALESHZrri 6327 +VRNDSCALESHZrri_Int 6328 +VRNDSCALESHZrrib_Int 6329 +VRNDSCALESHZrribk_Int 6330 +VRNDSCALESHZrribkz_Int 6331 +VRNDSCALESHZrrik_Int 6332 +VRNDSCALESHZrrikz_Int 6333 +VRNDSCALESSZrmi 6334 +VRNDSCALESSZrmi_Int 6335 +VRNDSCALESSZrmik_Int 6336 +VRNDSCALESSZrmikz_Int 6337 +VRNDSCALESSZrri 6338 +VRNDSCALESSZrri_Int 6339 +VRNDSCALESSZrrib_Int 6340 +VRNDSCALESSZrribk_Int 6341 +VRNDSCALESSZrribkz_Int 6342 +VRNDSCALESSZrrik_Int 6343 +VRNDSCALESSZrrikz_Int 6344 +VROUNDPDYmi 6345 +VROUNDPDYri 6346 +VROUNDPDmi 6347 +VROUNDPDri 6348 +VROUNDPSYmi 6349 +VROUNDPSYri 6350 +VROUNDPSmi 6351 +VROUNDPSri 6352 +VROUNDSDmi 6353 +VROUNDSDmi_Int 6354 +VROUNDSDri 6355 +VROUNDSDri_Int 6356 +VROUNDSSmi 6357 +VROUNDSSmi_Int 6358 +VROUNDSSri 6359 +VROUNDSSri_Int 6360 +VRSQRT 6361 +VRSQRTBF 6362 +VRSQRTPHZ 6363 +VRSQRTPHZm 6364 +VRSQRTPHZmb 6365 +VRSQRTPHZmbk 6366 +VRSQRTPHZmbkz 6367 +VRSQRTPHZmk 6368 +VRSQRTPHZmkz 6369 +VRSQRTPHZr 6370 +VRSQRTPHZrk 6371 +VRSQRTPHZrkz 6372 +VRSQRTPSYm 6373 +VRSQRTPSYr 6374 +VRSQRTPSm 6375 +VRSQRTPSr 6376 +VRSQRTSHZrm 6377 +VRSQRTSHZrmk 6378 +VRSQRTSHZrmkz 6379 +VRSQRTSHZrr 6380 +VRSQRTSHZrrk 6381 +VRSQRTSHZrrkz 6382 +VRSQRTSSm 6383 +VRSQRTSSm_Int 6384 +VRSQRTSSr 6385 +VRSQRTSSr_Int 6386 +VSCALEFBF 6387 +VSCALEFPDZ 6388 +VSCALEFPDZrm 6389 +VSCALEFPDZrmb 6390 +VSCALEFPDZrmbk 6391 +VSCALEFPDZrmbkz 6392 +VSCALEFPDZrmk 6393 +VSCALEFPDZrmkz 6394 +VSCALEFPDZrr 6395 +VSCALEFPDZrrb 6396 +VSCALEFPDZrrbk 6397 +VSCALEFPDZrrbkz 6398 +VSCALEFPDZrrk 6399 +VSCALEFPDZrrkz 6400 +VSCALEFPHZ 6401 +VSCALEFPHZrm 6402 +VSCALEFPHZrmb 6403 +VSCALEFPHZrmbk 6404 +VSCALEFPHZrmbkz 6405 +VSCALEFPHZrmk 6406 +VSCALEFPHZrmkz 6407 +VSCALEFPHZrr 6408 +VSCALEFPHZrrb 6409 +VSCALEFPHZrrbk 6410 +VSCALEFPHZrrbkz 6411 +VSCALEFPHZrrk 6412 +VSCALEFPHZrrkz 6413 +VSCALEFPSZ 6414 +VSCALEFPSZrm 6415 +VSCALEFPSZrmb 6416 +VSCALEFPSZrmbk 6417 +VSCALEFPSZrmbkz 6418 +VSCALEFPSZrmk 6419 +VSCALEFPSZrmkz 6420 +VSCALEFPSZrr 6421 +VSCALEFPSZrrb 6422 
+VSCALEFPSZrrbk 6423 +VSCALEFPSZrrbkz 6424 +VSCALEFPSZrrk 6425 +VSCALEFPSZrrkz 6426 +VSCALEFSDZrm 6427 +VSCALEFSDZrmk 6428 +VSCALEFSDZrmkz 6429 +VSCALEFSDZrr 6430 +VSCALEFSDZrrb_Int 6431 +VSCALEFSDZrrbk_Int 6432 +VSCALEFSDZrrbkz_Int 6433 +VSCALEFSDZrrk 6434 +VSCALEFSDZrrkz 6435 +VSCALEFSHZrm 6436 +VSCALEFSHZrmk 6437 +VSCALEFSHZrmkz 6438 +VSCALEFSHZrr 6439 +VSCALEFSHZrrb_Int 6440 +VSCALEFSHZrrbk_Int 6441 +VSCALEFSHZrrbkz_Int 6442 +VSCALEFSHZrrk 6443 +VSCALEFSHZrrkz 6444 +VSCALEFSSZrm 6445 +VSCALEFSSZrmk 6446 +VSCALEFSSZrmkz 6447 +VSCALEFSSZrr 6448 +VSCALEFSSZrrb_Int 6449 +VSCALEFSSZrrbk_Int 6450 +VSCALEFSSZrrbkz_Int 6451 +VSCALEFSSZrrk 6452 +VSCALEFSSZrrkz 6453 +VSCATTERDPDZ 6454 +VSCATTERDPDZmr 6455 +VSCATTERDPSZ 6456 +VSCATTERDPSZmr 6457 +VSCATTERPF 6458 +VSCATTERQPDZ 6459 +VSCATTERQPDZmr 6460 +VSCATTERQPSZ 6461 +VSCATTERQPSZmr 6462 +VSHA 6463 +VSHUFF 6464 +VSHUFI 6465 +VSHUFPDYrmi 6466 +VSHUFPDYrri 6467 +VSHUFPDZ 6468 +VSHUFPDZrmbi 6469 +VSHUFPDZrmbik 6470 +VSHUFPDZrmbikz 6471 +VSHUFPDZrmi 6472 +VSHUFPDZrmik 6473 +VSHUFPDZrmikz 6474 +VSHUFPDZrri 6475 +VSHUFPDZrrik 6476 +VSHUFPDZrrikz 6477 +VSHUFPDrmi 6478 +VSHUFPDrri 6479 +VSHUFPSYrmi 6480 +VSHUFPSYrri 6481 +VSHUFPSZ 6482 +VSHUFPSZrmbi 6483 +VSHUFPSZrmbik 6484 +VSHUFPSZrmbikz 6485 +VSHUFPSZrmi 6486 +VSHUFPSZrmik 6487 +VSHUFPSZrmikz 6488 +VSHUFPSZrri 6489 +VSHUFPSZrrik 6490 +VSHUFPSZrrikz 6491 +VSHUFPSrmi 6492 +VSHUFPSrri 6493 +VSM 6494 +VSQRTBF 6495 +VSQRTPDYm 6496 +VSQRTPDYr 6497 +VSQRTPDZ 6498 +VSQRTPDZm 6499 +VSQRTPDZmb 6500 +VSQRTPDZmbk 6501 +VSQRTPDZmbkz 6502 +VSQRTPDZmk 6503 +VSQRTPDZmkz 6504 +VSQRTPDZr 6505 +VSQRTPDZrb 6506 +VSQRTPDZrbk 6507 +VSQRTPDZrbkz 6508 +VSQRTPDZrk 6509 +VSQRTPDZrkz 6510 +VSQRTPDm 6511 +VSQRTPDr 6512 +VSQRTPHZ 6513 +VSQRTPHZm 6514 +VSQRTPHZmb 6515 +VSQRTPHZmbk 6516 +VSQRTPHZmbkz 6517 +VSQRTPHZmk 6518 +VSQRTPHZmkz 6519 +VSQRTPHZr 6520 +VSQRTPHZrb 6521 +VSQRTPHZrbk 6522 +VSQRTPHZrbkz 6523 +VSQRTPHZrk 6524 +VSQRTPHZrkz 6525 +VSQRTPSYm 6526 +VSQRTPSYr 6527 +VSQRTPSZ 6528 +VSQRTPSZm 6529 +VSQRTPSZmb 6530 +VSQRTPSZmbk 6531 +VSQRTPSZmbkz 6532 +VSQRTPSZmk 6533 +VSQRTPSZmkz 6534 +VSQRTPSZr 6535 +VSQRTPSZrb 6536 +VSQRTPSZrbk 6537 +VSQRTPSZrbkz 6538 +VSQRTPSZrk 6539 +VSQRTPSZrkz 6540 +VSQRTPSm 6541 +VSQRTPSr 6542 +VSQRTSDZm 6543 +VSQRTSDZm_Int 6544 +VSQRTSDZmk_Int 6545 +VSQRTSDZmkz_Int 6546 +VSQRTSDZr 6547 +VSQRTSDZr_Int 6548 +VSQRTSDZrb_Int 6549 +VSQRTSDZrbk_Int 6550 +VSQRTSDZrbkz_Int 6551 +VSQRTSDZrk_Int 6552 +VSQRTSDZrkz_Int 6553 +VSQRTSDm 6554 +VSQRTSDm_Int 6555 +VSQRTSDr 6556 +VSQRTSDr_Int 6557 +VSQRTSHZm 6558 +VSQRTSHZm_Int 6559 +VSQRTSHZmk_Int 6560 +VSQRTSHZmkz_Int 6561 +VSQRTSHZr 6562 +VSQRTSHZr_Int 6563 +VSQRTSHZrb_Int 6564 +VSQRTSHZrbk_Int 6565 +VSQRTSHZrbkz_Int 6566 +VSQRTSHZrk_Int 6567 +VSQRTSHZrkz_Int 6568 +VSQRTSSZm 6569 +VSQRTSSZm_Int 6570 +VSQRTSSZmk_Int 6571 +VSQRTSSZmkz_Int 6572 +VSQRTSSZr 6573 +VSQRTSSZr_Int 6574 +VSQRTSSZrb_Int 6575 +VSQRTSSZrbk_Int 6576 +VSQRTSSZrbkz_Int 6577 +VSQRTSSZrk_Int 6578 +VSQRTSSZrkz_Int 6579 +VSQRTSSm 6580 +VSQRTSSm_Int 6581 +VSQRTSSr 6582 +VSQRTSSr_Int 6583 +VSTMXCSR 6584 +VSUBBF 6585 +VSUBPDYrm 6586 +VSUBPDYrr 6587 +VSUBPDZ 6588 +VSUBPDZrm 6589 +VSUBPDZrmb 6590 +VSUBPDZrmbk 6591 +VSUBPDZrmbkz 6592 +VSUBPDZrmk 6593 +VSUBPDZrmkz 6594 +VSUBPDZrr 6595 +VSUBPDZrrb 6596 +VSUBPDZrrbk 6597 +VSUBPDZrrbkz 6598 +VSUBPDZrrk 6599 +VSUBPDZrrkz 6600 +VSUBPDrm 6601 +VSUBPDrr 6602 +VSUBPHZ 6603 +VSUBPHZrm 6604 +VSUBPHZrmb 6605 +VSUBPHZrmbk 6606 +VSUBPHZrmbkz 6607 +VSUBPHZrmk 6608 +VSUBPHZrmkz 6609 +VSUBPHZrr 6610 +VSUBPHZrrb 6611 +VSUBPHZrrbk 6612 +VSUBPHZrrbkz 6613 +VSUBPHZrrk 6614 
+VSUBPHZrrkz 6615 +VSUBPSYrm 6616 +VSUBPSYrr 6617 +VSUBPSZ 6618 +VSUBPSZrm 6619 +VSUBPSZrmb 6620 +VSUBPSZrmbk 6621 +VSUBPSZrmbkz 6622 +VSUBPSZrmk 6623 +VSUBPSZrmkz 6624 +VSUBPSZrr 6625 +VSUBPSZrrb 6626 +VSUBPSZrrbk 6627 +VSUBPSZrrbkz 6628 +VSUBPSZrrk 6629 +VSUBPSZrrkz 6630 +VSUBPSrm 6631 +VSUBPSrr 6632 +VSUBSDZrm 6633 +VSUBSDZrm_Int 6634 +VSUBSDZrmk_Int 6635 +VSUBSDZrmkz_Int 6636 +VSUBSDZrr 6637 +VSUBSDZrr_Int 6638 +VSUBSDZrrb_Int 6639 +VSUBSDZrrbk_Int 6640 +VSUBSDZrrbkz_Int 6641 +VSUBSDZrrk_Int 6642 +VSUBSDZrrkz_Int 6643 +VSUBSDrm 6644 +VSUBSDrm_Int 6645 +VSUBSDrr 6646 +VSUBSDrr_Int 6647 +VSUBSHZrm 6648 +VSUBSHZrm_Int 6649 +VSUBSHZrmk_Int 6650 +VSUBSHZrmkz_Int 6651 +VSUBSHZrr 6652 +VSUBSHZrr_Int 6653 +VSUBSHZrrb_Int 6654 +VSUBSHZrrbk_Int 6655 +VSUBSHZrrbkz_Int 6656 +VSUBSHZrrk_Int 6657 +VSUBSHZrrkz_Int 6658 +VSUBSSZrm 6659 +VSUBSSZrm_Int 6660 +VSUBSSZrmk_Int 6661 +VSUBSSZrmkz_Int 6662 +VSUBSSZrr 6663 +VSUBSSZrr_Int 6664 +VSUBSSZrrb_Int 6665 +VSUBSSZrrbk_Int 6666 +VSUBSSZrrbkz_Int 6667 +VSUBSSZrrk_Int 6668 +VSUBSSZrrkz_Int 6669 +VSUBSSrm 6670 +VSUBSSrm_Int 6671 +VSUBSSrr 6672 +VSUBSSrr_Int 6673 +VTESTPDYrm 6674 +VTESTPDYrr 6675 +VTESTPDrm 6676 +VTESTPDrr 6677 +VTESTPSYrm 6678 +VTESTPSYrr 6679 +VTESTPSrm 6680 +VTESTPSrr 6681 +VUCOMISDZrm 6682 +VUCOMISDZrm_Int 6683 +VUCOMISDZrr 6684 +VUCOMISDZrr_Int 6685 +VUCOMISDZrrb 6686 +VUCOMISDrm 6687 +VUCOMISDrm_Int 6688 +VUCOMISDrr 6689 +VUCOMISDrr_Int 6690 +VUCOMISHZrm 6691 +VUCOMISHZrm_Int 6692 +VUCOMISHZrr 6693 +VUCOMISHZrr_Int 6694 +VUCOMISHZrrb 6695 +VUCOMISSZrm 6696 +VUCOMISSZrm_Int 6697 +VUCOMISSZrr 6698 +VUCOMISSZrr_Int 6699 +VUCOMISSZrrb 6700 +VUCOMISSrm 6701 +VUCOMISSrm_Int 6702 +VUCOMISSrr 6703 +VUCOMISSrr_Int 6704 +VUCOMXSDZrm 6705 +VUCOMXSDZrm_Int 6706 +VUCOMXSDZrr 6707 +VUCOMXSDZrr_Int 6708 +VUCOMXSDZrrb_Int 6709 +VUCOMXSHZrm 6710 +VUCOMXSHZrm_Int 6711 +VUCOMXSHZrr 6712 +VUCOMXSHZrr_Int 6713 +VUCOMXSHZrrb_Int 6714 +VUCOMXSSZrm 6715 +VUCOMXSSZrm_Int 6716 +VUCOMXSSZrr 6717 +VUCOMXSSZrr_Int 6718 +VUCOMXSSZrrb_Int 6719 +VUNPCKHPDYrm 6720 +VUNPCKHPDYrr 6721 +VUNPCKHPDZ 6722 +VUNPCKHPDZrm 6723 +VUNPCKHPDZrmb 6724 +VUNPCKHPDZrmbk 6725 +VUNPCKHPDZrmbkz 6726 +VUNPCKHPDZrmk 6727 +VUNPCKHPDZrmkz 6728 +VUNPCKHPDZrr 6729 +VUNPCKHPDZrrk 6730 +VUNPCKHPDZrrkz 6731 +VUNPCKHPDrm 6732 +VUNPCKHPDrr 6733 +VUNPCKHPSYrm 6734 +VUNPCKHPSYrr 6735 +VUNPCKHPSZ 6736 +VUNPCKHPSZrm 6737 +VUNPCKHPSZrmb 6738 +VUNPCKHPSZrmbk 6739 +VUNPCKHPSZrmbkz 6740 +VUNPCKHPSZrmk 6741 +VUNPCKHPSZrmkz 6742 +VUNPCKHPSZrr 6743 +VUNPCKHPSZrrk 6744 +VUNPCKHPSZrrkz 6745 +VUNPCKHPSrm 6746 +VUNPCKHPSrr 6747 +VUNPCKLPDYrm 6748 +VUNPCKLPDYrr 6749 +VUNPCKLPDZ 6750 +VUNPCKLPDZrm 6751 +VUNPCKLPDZrmb 6752 +VUNPCKLPDZrmbk 6753 +VUNPCKLPDZrmbkz 6754 +VUNPCKLPDZrmk 6755 +VUNPCKLPDZrmkz 6756 +VUNPCKLPDZrr 6757 +VUNPCKLPDZrrk 6758 +VUNPCKLPDZrrkz 6759 +VUNPCKLPDrm 6760 +VUNPCKLPDrr 6761 +VUNPCKLPSYrm 6762 +VUNPCKLPSYrr 6763 +VUNPCKLPSZ 6764 +VUNPCKLPSZrm 6765 +VUNPCKLPSZrmb 6766 +VUNPCKLPSZrmbk 6767 +VUNPCKLPSZrmbkz 6768 +VUNPCKLPSZrmk 6769 +VUNPCKLPSZrmkz 6770 +VUNPCKLPSZrr 6771 +VUNPCKLPSZrrk 6772 +VUNPCKLPSZrrkz 6773 +VUNPCKLPSrm 6774 +VUNPCKLPSrr 6775 +VXORPDYrm 6776 +VXORPDYrr 6777 +VXORPDZ 6778 +VXORPDZrm 6779 +VXORPDZrmb 6780 +VXORPDZrmbk 6781 +VXORPDZrmbkz 6782 +VXORPDZrmk 6783 +VXORPDZrmkz 6784 +VXORPDZrr 6785 +VXORPDZrrk 6786 +VXORPDZrrkz 6787 +VXORPDrm 6788 +VXORPDrr 6789 +VXORPSYrm 6790 +VXORPSYrr 6791 +VXORPSZ 6792 +VXORPSZrm 6793 +VXORPSZrmb 6794 +VXORPSZrmbk 6795 +VXORPSZrmbkz 6796 +VXORPSZrmk 6797 +VXORPSZrmkz 6798 +VXORPSZrr 6799 +VXORPSZrrk 6800 +VXORPSZrrkz 6801 +VXORPSrm 6802 
+VXORPSrr 6803 +VZEROALL 6804 +VZEROUPPER 6805 +V_SET 6806 +V_SETALLONES 6807 +WAIT 6808 +WBINVD 6809 +WBNOINVD 6810 +WRFLAGS 6811 +WRFSBASE 6812 +WRGSBASE 6813 +WRMSR 6814 +WRMSRLIST 6815 +WRMSRNS 6816 +WRMSRNSir 6817 +WRMSRNSir_EVEX 6818 +WRPKRUr 6819 +WRSSD 6820 +WRSSD_EVEX 6821 +WRSSQ 6822 +WRSSQ_EVEX 6823 +WRUSSD 6824 +WRUSSD_EVEX 6825 +WRUSSQ 6826 +WRUSSQ_EVEX 6827 +XABORT 6828 +XABORT_DEF 6829 +XACQUIRE_PREFIX 6830 +XADD 6831 +XAM_F 6832 +XAM_Fp 6833 +XBEGIN 6834 +XCHG 6835 +XCH_F 6836 +XCRYPTCBC 6837 +XCRYPTCFB 6838 +XCRYPTCTR 6839 +XCRYPTECB 6840 +XCRYPTOFB 6841 +XEND 6842 +XGETBV 6843 +XLAT 6844 +XOR 6845 +XORPDrm 6846 +XORPDrr 6847 +XORPSrm 6848 +XORPSrr 6849 +XRELEASE_PREFIX 6850 +XRESLDTRK 6851 +XRSTOR 6852 +XRSTORS 6853 +XSAVE 6854 +XSAVEC 6855 +XSAVEOPT 6856 +XSAVES 6857 +XSETBV 6858 +XSHA 6859 +XSTORE 6860 +XSUSLDTRK 6861 +XTEST 6862 +Immediate 6863 +CImmediate 6864 +FPImmediate 6865 +MBB 6866 +FrameIndex 6867 +ConstantPoolIndex 6868 +TargetIndex 6869 +JumpTableIndex 6870 +ExternalSymbol 6871 +GlobalAddress 6872 +BlockAddress 6873 +RegisterMask 6874 +RegisterLiveOut 6875 +Metadata 6876 +MCSymbol 6877 +CFIIndex 6878 +IntrinsicID 6879 +Predicate 6880 +ShuffleMask 6881 +PhyReg_GR8 6882 +PhyReg_GRH8 6883 +PhyReg_GR8_NOREX2 6884 +PhyReg_GR8_NOREX 6885 +PhyReg_GR8_ABCD_H 6886 +PhyReg_GR8_ABCD_L 6887 +PhyReg_GRH16 6888 +PhyReg_GR16 6889 +PhyReg_GR16_NOREX2 6890 +PhyReg_GR16_NOREX 6891 +PhyReg_VK1 6892 +PhyReg_VK16 6893 +PhyReg_VK2 6894 +PhyReg_VK4 6895 +PhyReg_VK8 6896 +PhyReg_VK16WM 6897 +PhyReg_VK1WM 6898 +PhyReg_VK2WM 6899 +PhyReg_VK4WM 6900 +PhyReg_VK8WM 6901 +PhyReg_SEGMENT_REG 6902 +PhyReg_GR16_ABCD 6903 +PhyReg_FPCCR 6904 +PhyReg_FR16X 6905 +PhyReg_FR16 6906 +PhyReg_VK16PAIR 6907 +PhyReg_VK1PAIR 6908 +PhyReg_VK2PAIR 6909 +PhyReg_VK4PAIR 6910 +PhyReg_VK8PAIR 6911 +PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM 6912 +PhyReg_LOW32_ADDR_ACCESS_RBP 6913 +PhyReg_LOW32_ADDR_ACCESS 6914 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 6915 +PhyReg_FR32X 6916 +PhyReg_GR32 6917 +PhyReg_GR32_NOSP 6918 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 6919 +PhyReg_DEBUG_REG 6920 +PhyReg_FR32 6921 +PhyReg_GR32_NOREX2 6922 +PhyReg_GR32_NOREX2_NOSP 6923 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 6924 +PhyReg_GR32_NOREX 6925 +PhyReg_VK32 6926 +PhyReg_GR32_NOREX_NOSP 6927 +PhyReg_RFP32 6928 +PhyReg_VK32WM 6929 +PhyReg_GR32_ABCD 6930 +PhyReg_GR32_TC 6931 +PhyReg_GR32_ABCD_and_GR32_TC 6932 +PhyReg_GR32_AD 6933 +PhyReg_GR32_ArgRef 6934 +PhyReg_GR32_BPSP 6935 +PhyReg_GR32_BSI 6936 +PhyReg_GR32_CB 6937 +PhyReg_GR32_DC 6938 +PhyReg_GR32_DIBP 6939 +PhyReg_GR32_SIDI 6940 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 6941 +PhyReg_CCR 6942 +PhyReg_DFCCR 6943 +PhyReg_GR32_ABCD_and_GR32_BSI 6944 +PhyReg_GR32_AD_and_GR32_ArgRef 6945 +PhyReg_GR32_ArgRef_and_GR32_CB 6946 +PhyReg_GR32_BPSP_and_GR32_DIBP 6947 +PhyReg_GR32_BPSP_and_GR32_TC 6948 +PhyReg_GR32_BSI_and_GR32_SIDI 6949 +PhyReg_GR32_DIBP_and_GR32_SIDI 6950 +PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 6951 +PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit 6952 +PhyReg_RFP64 6953 +PhyReg_GR64 6954 +PhyReg_FR64X 6955 +PhyReg_GR64_with_sub_8bit 6956 +PhyReg_GR64_NOSP 6957 +PhyReg_GR64_NOREX2 6958 +PhyReg_CONTROL_REG 6959 +PhyReg_FR64 6960 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2 6961 +PhyReg_GR64_NOREX2_NOSP 6962 +PhyReg_GR64PLTSafe 6963 +PhyReg_GR64_TC 6964 +PhyReg_GR64_NOREX 6965 +PhyReg_GR64_TCW64 6966 +PhyReg_GR64_TC_with_sub_8bit 6967 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TC 6968 +PhyReg_GR64_TCW64_with_sub_8bit 6969 
+PhyReg_GR64_TC_and_GR64_TCW64 6970 +PhyReg_GR64_with_sub_16bit_in_GR16_NOREX 6971 +PhyReg_VK64 6972 +PhyReg_VR64 6973 +PhyReg_GR64PLTSafe_and_GR64_TC 6974 +PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64 6975 +PhyReg_GR64_NOREX_NOSP 6976 +PhyReg_GR64_NOREX_and_GR64_TC 6977 +PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 6978 +PhyReg_VK64WM 6979 +PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 6980 +PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 6981 +PhyReg_GR64PLTSafe_and_GR64_TCW64 6982 +PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 6983 +PhyReg_GR64_NOREX_and_GR64_TCW64 6984 +PhyReg_GR64_ABCD 6985 +PhyReg_GR64_with_sub_32bit_in_GR32_TC 6986 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 6987 +PhyReg_GR64_AD 6988 +PhyReg_GR64_ArgRef 6989 +PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP 6990 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef 6991 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP 6992 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI 6993 +PhyReg_GR64_with_sub_32bit_in_GR32_CB 6994 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP 6995 +PhyReg_GR64_with_sub_32bit_in_GR32_SIDI 6996 +PhyReg_GR64_A 6997 +PhyReg_GR64_ArgRef_and_GR64_TC 6998 +PhyReg_GR64_and_LOW32_ADDR_ACCESS 6999 +PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7000 +PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7001 +PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7002 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7003 +PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7004 +PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7005 +PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7006 +PhyReg_RST 7007 +PhyReg_RFP80 7008 +PhyReg_RFP80_7 7009 +PhyReg_VR128X 7010 +PhyReg_VR128 7011 +PhyReg_VR256X 7012 +PhyReg_VR256 7013 +PhyReg_VR512 7014 +PhyReg_VR512_0_15 7015 +PhyReg_TILE 7016 +VirtReg_GR8 7017 +VirtReg_GRH8 7018 +VirtReg_GR8_NOREX2 7019 +VirtReg_GR8_NOREX 7020 +VirtReg_GR8_ABCD_H 7021 +VirtReg_GR8_ABCD_L 7022 +VirtReg_GRH16 7023 +VirtReg_GR16 7024 +VirtReg_GR16_NOREX2 7025 +VirtReg_GR16_NOREX 7026 +VirtReg_VK1 7027 +VirtReg_VK16 7028 +VirtReg_VK2 7029 +VirtReg_VK4 7030 +VirtReg_VK8 7031 +VirtReg_VK16WM 7032 +VirtReg_VK1WM 7033 +VirtReg_VK2WM 7034 +VirtReg_VK4WM 7035 +VirtReg_VK8WM 7036 +VirtReg_SEGMENT_REG 7037 +VirtReg_GR16_ABCD 7038 +VirtReg_FPCCR 7039 +VirtReg_FR16X 7040 +VirtReg_FR16 7041 +VirtReg_VK16PAIR 7042 +VirtReg_VK1PAIR 7043 +VirtReg_VK2PAIR 7044 +VirtReg_VK4PAIR 7045 +VirtReg_VK8PAIR 7046 +VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM 7047 +VirtReg_LOW32_ADDR_ACCESS_RBP 7048 +VirtReg_LOW32_ADDR_ACCESS 7049 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit 7050 +VirtReg_FR32X 7051 +VirtReg_GR32 7052 +VirtReg_GR32_NOSP 7053 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2 7054 +VirtReg_DEBUG_REG 7055 +VirtReg_FR32 7056 +VirtReg_GR32_NOREX2 7057 +VirtReg_GR32_NOREX2_NOSP 7058 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX 7059 +VirtReg_GR32_NOREX 7060 +VirtReg_VK32 7061 +VirtReg_GR32_NOREX_NOSP 7062 +VirtReg_RFP32 7063 +VirtReg_VK32WM 7064 +VirtReg_GR32_ABCD 7065 +VirtReg_GR32_TC 7066 +VirtReg_GR32_ABCD_and_GR32_TC 7067 +VirtReg_GR32_AD 7068 +VirtReg_GR32_ArgRef 7069 +VirtReg_GR32_BPSP 7070 +VirtReg_GR32_BSI 7071 +VirtReg_GR32_CB 7072 +VirtReg_GR32_DC 7073 +VirtReg_GR32_DIBP 7074 +VirtReg_GR32_SIDI 7075 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit 7076 +VirtReg_CCR 7077 +VirtReg_DFCCR 7078 +VirtReg_GR32_ABCD_and_GR32_BSI 7079 +VirtReg_GR32_AD_and_GR32_ArgRef 7080 +VirtReg_GR32_ArgRef_and_GR32_CB 7081 +VirtReg_GR32_BPSP_and_GR32_DIBP 7082 +VirtReg_GR32_BPSP_and_GR32_TC 7083 
+VirtReg_GR32_BSI_and_GR32_SIDI 7084 +VirtReg_GR32_DIBP_and_GR32_SIDI 7085 +VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit 7086 +VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit 7087 +VirtReg_RFP64 7088 +VirtReg_GR64 7089 +VirtReg_FR64X 7090 +VirtReg_GR64_with_sub_8bit 7091 +VirtReg_GR64_NOSP 7092 +VirtReg_GR64_NOREX2 7093 +VirtReg_CONTROL_REG 7094 +VirtReg_FR64 7095 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2 7096 +VirtReg_GR64_NOREX2_NOSP 7097 +VirtReg_GR64PLTSafe 7098 +VirtReg_GR64_TC 7099 +VirtReg_GR64_NOREX 7100 +VirtReg_GR64_TCW64 7101 +VirtReg_GR64_TC_with_sub_8bit 7102 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TC 7103 +VirtReg_GR64_TCW64_with_sub_8bit 7104 +VirtReg_GR64_TC_and_GR64_TCW64 7105 +VirtReg_GR64_with_sub_16bit_in_GR16_NOREX 7106 +VirtReg_VK64 7107 +VirtReg_VR64 7108 +VirtReg_GR64PLTSafe_and_GR64_TC 7109 +VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64 7110 +VirtReg_GR64_NOREX_NOSP 7111 +VirtReg_GR64_NOREX_and_GR64_TC 7112 +VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit 7113 +VirtReg_VK64WM 7114 +VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64 7115 +VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX 7116 +VirtReg_GR64PLTSafe_and_GR64_TCW64 7117 +VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC 7118 +VirtReg_GR64_NOREX_and_GR64_TCW64 7119 +VirtReg_GR64_ABCD 7120 +VirtReg_GR64_with_sub_32bit_in_GR32_TC 7121 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC 7122 +VirtReg_GR64_AD 7123 +VirtReg_GR64_ArgRef 7124 +VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP 7125 +VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef 7126 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP 7127 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI 7128 +VirtReg_GR64_with_sub_32bit_in_GR32_CB 7129 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP 7130 +VirtReg_GR64_with_sub_32bit_in_GR32_SIDI 7131 +VirtReg_GR64_A 7132 +VirtReg_GR64_ArgRef_and_GR64_TC 7133 +VirtReg_GR64_and_LOW32_ADDR_ACCESS 7134 +VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI 7135 +VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef 7136 +VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB 7137 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP 7138 +VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC 7139 +VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI 7140 +VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI 7141 +VirtReg_RST 7142 +VirtReg_RFP80 7143 +VirtReg_RFP80_7 7144 +VirtReg_VR128X 7145 +VirtReg_VR128 7146 +VirtReg_VR256X 7147 +VirtReg_VR256 7148 +VirtReg_VR512 7149 +VirtReg_VR512_0_15 7150 +VirtReg_TILE 7151 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s index f59c7987b615b..311a13c9427b1 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s @@ -2911,65 +2911,65 @@ vfwsub.wv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMADD_VV vfmadd.vv v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S 
vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_F_S vfmv.f.s fs0, v8 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_F_S vfmv.f.s fs0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 
SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VFMV_S_F vfmv.s.f v8, fs0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VFMV_S_F vfmv.s.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMV_V_F vfmv.v.f v8, fs0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu @@ -3763,7 +3763,7 @@ vfwsub.wv v8, v16, v24 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 915.00 - - - 885.00 30.00 - +# CHECK-NEXT: - 915.00 - - - 885.00 120.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: @@ -4758,65 +4758,65 @@ vfwsub.wv v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: - - - - - 1.00 - - vfmadd.vv v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 
1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.f.s fs0, v8 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.f.s fs0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 
1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vfmv.s.f v8, fs0 +# CHECK-NEXT: - - - - - - 4.00 - vfmv.s.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu # CHECK-NEXT: - - - - - 1.00 - - vfmv.v.f v8, fs0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s index 5ae0d43b42d10..de1a5971fcd1d 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s @@ -1330,93 +1330,93 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 16 4.00 16 SMX60_VIEU[4] VMV_V_I vmv.v.i v8, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] 
VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_X_S vmv.x.s s0, v8 +# CHECK-NEXT: 1 6 6.00 6 SMX60_VIEU[6] VMV_X_S vmv.x.s s0, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 
SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA 
VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV_S_X vmv.s.x v8, s0 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMV_S_X vmv.s.x v8, s0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VMV1R_V vmv1r.v v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu @@ -1638,487 +1638,487 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VIOTA_M viota.m v8, v16 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, 
zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 10 10.00 10 SMX60_VIEU[10] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 36 36.00 36 SMX60_VIEU[36] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VCOMPRESS_VM vcompress.vm v8, v16, v24 +# CHECK-NEXT: 1 136 136.00 136 SMX60_VIEU[136] VCOMPRESS_VM vcompress.vm v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, 
v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, 
zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1UP_VX vslide1up.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1UP_VX vslide1up.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX 
vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDE1DOWN_VX vslide1down.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 
4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 
+# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VX vslideup.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VSLIDEUP_VX vslideup.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU 
VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEUP_VI vslideup.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEUP_VI vslideup.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, 
e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# 
CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VX vslidedown.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 
1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VIEU[3] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VIEU[5] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VIEU[9] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VIEU[17] VSLIDEDOWN_VI vslidedown.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 
SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 
SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VV vrgather.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHER_VV vrgather.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU 
VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VX vrgather.vx v8, v16, t5 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VX vrgather.vx v8, v16, t5 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# 
CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI 
vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHER_VI vrgather.vi v8, v16, 12 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHER_VI vrgather.vi v8, v16, 12 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 2.00 4 SMX60_VIEU[2] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 8 8.00 8 SMX60_VIEU[8] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 32 32.00 32 SMX60_VIEU[32] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 128 128.00 128 SMX60_VIEU[128] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 
4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 16 16.00 16 SMX60_VIEU[16] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 64 64.00 64 SMX60_VIEU[64] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VIEU[256] VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu # CHECK-NEXT: 1 4 4.00 4 SMX60_VIEU[4] VMERGE_VIM vmerge.vim v8, v8, 12, v0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu @@ -2282,65 +2282,65 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFMERGE_VFM vfmerge.vfm v8, v8, ft0, v0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP 
VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1DOWN_VF vfslide1down.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 1.00 4 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 4 3.00 4 SMX60_VFP[3] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 5 5.00 5 
SMX60_VFP[5] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 9 9.00 9 SMX60_VFP[9] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 +# CHECK-NEXT: 1 17 17.00 17 SMX60_VFP[17] VFSLIDE1UP_VF vfslide1up.vf v8, v16, ft0 # CHECK: Resources: # CHECK-NEXT: [0] - SMX60_FP @@ -2354,7 +2354,7 @@ vfslide1up.vf v8, v16, ft0 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 572.00 - - - 45.00 923.00 - +# CHECK-NEXT: - 572.00 - - - 225.00 5253.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: @@ -2491,93 +2491,93 @@ vfslide1up.vf v8, v16, ft0 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu # CHECK-NEXT: - - - - - - 4.00 - vmv.v.i v8, 12 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8 +# CHECK-NEXT: - - - - 
- - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.x.s s0, v8
+# CHECK-NEXT: - - - - - - 6.00 - vmv.x.s s0, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vmv.s.x v8, s0
+# CHECK-NEXT: - - - - - - 4.00 - vmv.s.x v8, s0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vmv1r.v v8, v8
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2805,43 +2805,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 3.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 10.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 36.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vcompress.vm v8, v16, v24
+# CHECK-NEXT: - - - - - - 136.00 - vcompress.vm v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2849,43 +2849,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1up.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1up.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2893,43 +2893,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslide1down.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslide1down.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2937,43 +2937,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vslideup.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -2981,43 +2981,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslideup.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslideup.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3025,43 +3025,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3069,43 +3069,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 3.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 5.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 9.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vslidedown.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 17.00 - vslidedown.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3113,43 +3113,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgather.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3157,43 +3157,43 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vx v8, v16, t5
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vx v8, v16, t5
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3201,85 +3201,85 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 2.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 4.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 8.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgather.vi v8, v16, 12
+# CHECK-NEXT: - - - - - - 16.00 - vrgather.vi v8, v16, 12
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 2.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 8.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 32.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 128.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 4.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 16.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 64.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - - 1.00 - vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: - - - - - - 256.00 - vrgatherei16.vv v8, v16, v24
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu
# CHECK-NEXT: - - - - - - 4.00 - vmerge.vim v8, v8, 12, v0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu
@@ -3447,58 +3447,58 @@ vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1down.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1down.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu
# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 3.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 5.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 9.00 - - vfslide1up.vf v8, v16, ft0
# CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: - - - - - 1.00 - - vfslide1up.vf v8, v16, ft0
+# CHECK-NEXT: - - - - - 17.00 - - vfslide1up.vf v8, v16, ft0
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
index 3d7a67d8ba161..621cad6e121ab 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s
@@ -630,593 +630,593 @@ vfwredusum.vs v8, v8, v8
# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDAND_VS vredand.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDAND_VS vredand.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAXU_VS vredmaxu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAXU_VS vredmaxu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMAX_VS vredmax.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMAX_VS vredmax.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8
+# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8
# CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU
VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMINU_VS vredminu.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMINU_VS vredminu.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS 
vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDMIN_VS vredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDMIN_VS vredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 
1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 
SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDOR_VS vredor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDOR_VS vredor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu 
-# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDSUM_VS vredsum.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDSUM_VS vredsum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU 
VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, 
v8 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VREDXOR_VS vredxor.vs v8, v8, v8 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VREDXOR_VS vredxor.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUMU_VS vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUMU_VS vwredsumu.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, mf8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 1.00 5 
SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 5 2.00 5 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 7 2.00 7 SMX60_VIEU[2] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 11 4.00 11 SMX60_VIEU[4] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 19 10.00 19 SMX60_VIEU[10] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VIEU VWREDSUM_VS vwredsum.vs v8, v16, v24 +# CHECK-NEXT: 1 35 35.00 35 SMX60_VIEU[35] VWREDSUM_VS vwredsum.vs v8, v16, v24 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP 
VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMAX_VS vfredmax.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMAX_VS vfredmax.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 
1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDMIN_VS vfredmin.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDMIN_VS vfredmin.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 20.00 24 SMX60_VFP[20] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 192 96.00 192 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI 
vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 384 384.00 384 SMX60_VFP[384] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 48.00 96 SMX60_VFP[48] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 192 192.00 192 SMX60_VFP[192] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 6.00 12 SMX60_VFP[6] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 24 12.00 24 SMX60_VFP[12] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 48 24.00 48 SMX60_VFP[24] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDOSUM_VS vfredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 96 96.00 96 SMX60_VFP[96] VFREDOSUM_VS vfredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] 
VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 12 8.00 12 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 15 8.00 15 SMX60_VFP[8] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 21 14.00 21 SMX60_VFP[14] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 33 20.00 33 SMX60_VFP[20] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFREDUSUM_VS vfredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 57 57.00 57 SMX60_VFP[57] VFREDUSUM_VS vfredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# 
CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDOSUM_VS vfwredosum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 32 27.00 32 SMX60_VFP[27] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 128.00 256 SMX60_VFP[128] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 512 512.00 512 SMX60_VFP[512] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 16 11.00 16 SMX60_VFP[11] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, 
v8 +# CHECK-NEXT: 1 32 16.00 32 SMX60_VFP[16] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 64 32.00 64 SMX60_VFP[32] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 128 64.00 128 SMX60_VFP[64] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK-NEXT: 1 1 1.00 U 1 SMX60_IEU,SMX60_IEUA VSETVLI vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: 1 1 1.00 1 SMX60_VFP VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: 1 256 256.00 256 SMX60_VFP[256] VFWREDUSUM_VS vfwredusum.vs v8, v8, v8 # CHECK: Resources: # CHECK-NEXT: [0] - SMX60_FP @@ -1230,595 +1230,595 @@ vfwredusum.vs v8, v8, v8 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] -# CHECK-NEXT: - 294.00 - - - 82.00 212.00 - +# CHECK-NEXT: - 294.00 - - - 4271.00 2028.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, 
v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredand.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredand.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs 
v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmaxu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmaxu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, 
e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, 
v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredminu.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredminu.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, 
e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - 
- - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # 
CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredsum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredsum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - 
- - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 2.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 4.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 10.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vredxor.vs v8, v8, v8 +# CHECK-NEXT: - - - - - - 35.00 - vredxor.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: 
- - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsumu.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsumu.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, mf8, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m4, tu, mu -# CHECK-NEXT: - 
- - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e8, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu # CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 2.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 4.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 10.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - - 1.00 - vwredsum.vs v8, v16, v24 +# CHECK-NEXT: - - - - - - 35.00 - vwredsum.vs v8, v16, v24 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # 
CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmax.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmax.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - 
vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredmin.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredmin.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 384.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 48.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 192.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 6.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 12.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 24.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 96.00 - - vfredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - 
vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 8.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 14.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 20.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e64, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 57.00 - - vfredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 27.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 128.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - 
- - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 512.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 16.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredosum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 256.00 - - vfwredosum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 27.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, mf4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 128.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e16, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 512.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, mf2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 11.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m1, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 16.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m2, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 32.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m4, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 64.00 - - vfwredusum.vs v8, v8, v8 # CHECK-NEXT: - 1.00 - - - - - - vsetvli t3, zero, e32, m8, tu, mu -# CHECK-NEXT: - - - - - 1.00 - - vfwredusum.vs v8, v8, v8 +# CHECK-NEXT: - - - - - 256.00 - - vfwredusum.vs v8, v8, v8 diff --git a/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll b/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll new file mode 100644 index 0000000000000..32aad0b6cf64e --- /dev/null +++ b/llvm/test/tools/llvm-offload-wrapper/offload-wrapper.ll @@ -0,0 +1,81 @@ +; RUN: llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=openmp %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s 
--check-prefix=OMP + +; OMP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; OMP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; OMP-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; OMP-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; OMP-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"{{.*}}", section ".llvm.offloading", align 8 +; OMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 0, i64 [[SIZE]]), ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries }] +; OMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_llvm_offload_entries, ptr @__stop_llvm_offload_entries } +; OMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }] + +; OMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" { +; OMP-NEXT: entry: +; OMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor) +; OMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg) +; OMP-NEXT: ret void +; OMP-NEXT: } + +; OMP: define internal void @.omp_offloading.descriptor_unreg() section ".text.startup" { +; OMP-NEXT: entry: +; OMP-NEXT: call void @__tgt_unregister_lib(ptr @.omp_offloading.descriptor) +; OMP-NEXT: ret void +; OMP-NEXT: } + +; RUN: llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=hip %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=HIP + +; HIP: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; HIP-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; HIP-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; HIP-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; HIP-NEXT: @.fatbin_image = internal constant {{.*}}, section ".hip_fatbin" +; HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 +; HIP-NEXT: @.hip.binary_handle = internal global ptr null +; HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }] + +; HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { +; HIP-NEXT: entry: +; HIP-NEXT: %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper) +; HIP-NEXT: store ptr %0, ptr @.hip.binary_handle, align 8 +; HIP-NEXT: call void @.hip.globals_reg(ptr %0) +; HIP-NEXT: %1 = call i32 @atexit(ptr @.hip.fatbin_unreg) +; HIP-NEXT: ret void +; HIP-NEXT: } + +; HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" { +; HIP-NEXT: entry: +; HIP-NEXT: %0 = load ptr, ptr @.hip.binary_handle, align 8 +; HIP-NEXT: call void @__hipUnregisterFatBinary(ptr %0) +; HIP-NEXT: ret void +; HIP-NEXT: } + +; RUN: 
llvm-offload-wrapper --triple=x86_64-unknown-linux-gnu -kind=cuda %s -o %t.bc +; RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=CUDA + +; CUDA: @__start_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; CUDA-NEXT: @__stop_llvm_offload_entries = external hidden constant [0 x %struct.__tgt_offload_entry] +; CUDA-NEXT: @__dummy.llvm_offload_entries = internal constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "llvm_offload_entries", align 8 +; CUDA-NEXT: @llvm.compiler.used = appending global [1 x ptr] [ptr @__dummy.llvm_offload_entries], section "llvm.metadata" +; CUDA-NEXT: @.fatbin_image = internal constant {{.*}}, section ".nv_fatbin" +; CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 +; CUDA-NEXT: @.cuda.binary_handle = internal global ptr null +; CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] + +; CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { +; CUDA-NEXT: entry: +; CUDA-NEXT: %0 = call ptr @__cudaRegisterFatBinary(ptr @.fatbin_wrapper) +; CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8 +; CUDA-NEXT: call void @.cuda.globals_reg(ptr %0) +; CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(ptr %0) +; CUDA-NEXT: %1 = call i32 @atexit(ptr @.cuda.fatbin_unreg) +; CUDA-NEXT: ret void +; CUDA-NEXT: } + +; CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" { +; CUDA-NEXT: entry: +; CUDA-NEXT: %0 = load ptr, ptr @.cuda.binary_handle, align 8 +; CUDA-NEXT: call void @__cudaUnregisterFatBinary(ptr %0) +; CUDA-NEXT: ret void +; CUDA-NEXT: } diff --git a/llvm/test/tools/llvm-profdata/profile-version.test b/llvm/test/tools/llvm-profdata/profile-version.test index cb68a648d5e5a..e811699ac63ed 100644 --- a/llvm/test/tools/llvm-profdata/profile-version.test +++ b/llvm/test/tools/llvm-profdata/profile-version.test @@ -2,7 +2,7 @@ Test the profile version. 
RUN: llvm-profdata merge -o %t.profdata %p/Inputs/basic.proftext RUN: llvm-profdata show --profile-version %t.profdata | FileCheck %s -CHECK: Profile version: 12 +CHECK: Profile version: 13 RUN: llvm-profdata merge -o %t.prev.profdata %p/Inputs/basic.proftext --write-prev-version RUN: llvm-profdata show --profile-version %t.prev.profdata | FileCheck %s --check-prefix=PREV diff --git a/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc b/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc new file mode 100644 index 0000000000000..8327ef9be9f5c --- /dev/null +++ b/llvm/test/tools/llvm-rc/Inputs/octal-in-range.rc @@ -0,0 +1,4 @@ +1 VERSIONINFO +FILEVERSION 0010,0010,0010,0010 +BEGIN +END diff --git a/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc b/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc new file mode 100644 index 0000000000000..ce520f245a48d --- /dev/null +++ b/llvm/test/tools/llvm-rc/Inputs/octal-out-of-range.rc @@ -0,0 +1,4 @@ +1 VERSIONINFO +FILEVERSION 9,08,09,1 +BEGIN +END diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc index 20f77912477d9..caf01aeff45fe 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc @@ -1,4 +1,4 @@ -1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End +1 + 2 - 3214L & 0x120894 032173 -0042 009 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End 1*3/4 He11o LLVM identifier-with-dashes diff --git a/llvm/test/tools/llvm-rc/octal.test b/llvm/test/tools/llvm-rc/octal.test new file mode 100644 index 0000000000000..686c1fcf1608e --- /dev/null +++ b/llvm/test/tools/llvm-rc/octal.test @@ -0,0 +1,38 @@ +; RUN: llvm-rc -no-preprocess /FO %t.in-range-rc.res -- %p/Inputs/octal-in-range.rc +; RUN: llvm-readobj %t.in-range-rc.res | FileCheck %s --check-prefix=IN-RANGE-RC +; RUN: llvm-windres --no-preprocess %p/Inputs/octal-in-range.rc %t.in-range-windres.res +; RUN: llvm-readobj %t.in-range-windres.res | FileCheck %s --check-prefix=IN-RANGE-WINDRES + +; IN-RANGE-RC: Data: ( +; IN-RANGE-RC-NEXT: 0000: 5C003400 00005600 53005F00 56004500 |\.4...V.S._.V.E.| +; IN-RANGE-RC-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; IN-RANGE-RC-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; IN-RANGE-RC-NEXT: 0030: 0A000A00 0A000A00 00000000 00000000 |................| +; IN-RANGE-RC-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; IN-RANGE-RC-NEXT: 0050: 00000000 00000000 00000000 |............| +; IN-RANGE-RC-NEXT: ) + +; IN-RANGE-WINDRES: Data: ( +; IN-RANGE-WINDRES-NEXT: 0000: 5C003400 00005600 53005F00 56004500 |\.4...V.S._.V.E.| +; IN-RANGE-WINDRES-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; IN-RANGE-WINDRES-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; IN-RANGE-WINDRES-NEXT: 0030: 08000800 08000800 00000000 00000000 |................| +; IN-RANGE-WINDRES-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; IN-RANGE-WINDRES-NEXT: 0050: 00000000 00000000 00000000 |............| +; IN-RANGE-WINDRES-NEXT: ) + +; RUN: llvm-rc -no-preprocess /FO %t.out-of-range-rc.res -- %p/Inputs/octal-out-of-range.rc +; RUN: llvm-readobj %t.out-of-range-rc.res | FileCheck %s --check-prefix=OUT-OF-RANGE-RC +; RUN: not llvm-windres --no-preprocess %p/Inputs/octal-out-of-range.rc %t.out-of-range-windres.res 2>&1 | FileCheck %s --check-prefix OUT-OF-RANGE-WINDRES + +; OUT-OF-RANGE-RC: Data: ( +; OUT-OF-RANGE-RC-NEXT: 0000: 5C003400 00005600 53005F00 56004500 
|\.4...V.S._.V.E.| +; OUT-OF-RANGE-RC-NEXT: 0010: 52005300 49004F00 4E005F00 49004E00 |R.S.I.O.N._.I.N.| +; OUT-OF-RANGE-RC-NEXT: 0020: 46004F00 00000000 BD04EFFE 00000100 |F.O.............| +; OUT-OF-RANGE-RC-NEXT: 0030: 08000900 01000900 00000000 00000000 |................| +; OUT-OF-RANGE-RC-NEXT: 0040: 00000000 00000000 00000000 00000000 |................| +; OUT-OF-RANGE-RC-NEXT: 0050: 00000000 00000000 00000000 |............| +; OUT-OF-RANGE-RC-NEXT: ) + + +; OUT-OF-RANGE-WINDRES: llvm-rc: Error parsing file: Integer invalid or too large: 08 diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test index 3062e2bf64629..953b0ca8c1b57 100644 --- a/llvm/test/tools/llvm-rc/tokenizer.test +++ b/llvm/test/tools/llvm-rc/tokenizer.test @@ -9,7 +9,10 @@ ; CHECK-NEXT: Int: 3214L; int value = 3214 ; CHECK-NEXT: Amp: & ; CHECK-NEXT: Int: 0x120894; int value = 1181844 -; CHECK-NEXT: Int: 032173; int value = 13435 +; CHECK-NEXT: Int: 32173; int value = 32173 +; CHECK-NEXT: Minus: - +; CHECK-NEXT: Int: 42; int value = 42 +; CHECK-NEXT: Int: 9; int value = 9 ; CHECK-NEXT: Int: 2; int value = 2 ; CHECK-NEXT: Pipe: | ; CHECK-NEXT: Amp: & diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test new file mode 100644 index 0000000000000..24726c34d3509 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-feature-warning.test @@ -0,0 +1,37 @@ +## This test checks that we output a warning when the specified version is too old to support the given features. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj --bb-addr-map %t 2>&1 | FileCheck -DFILE=%t %s + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 1: version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when callsite offsets feature is enabled: version = 2 feature = 32 +Sections: + - Name: '.llvm_bb_addr_map (1)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 2 + Feature: 0x20 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 2: version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when basic block hash feature is enabled: version = 3 feature = 64 + + - Name: '.llvm_bb_addr_map (2)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 3 + Feature: 0x40 + +# CHECK: BBAddrMap [ +# CHECK-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when post link cfg feature is enabled: version = 4 feature = 128 + + - Name: '.llvm_bb_addr_map (3)' + Type: SHT_LLVM_BB_ADDR_MAP + Entries: + - Version: 4 + Feature: 0x80 diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test index 5faafd4d83b2f..8e9d2271b8721 100644 --- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test @@ -15,7 +15,7 @@ ## Check that a malformed section can be handled. 
# RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o -# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED +# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000015 -DFILE=%t2.o --check-prefix=TRUNCATED ## Check that missing features can be handled. # RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o @@ -59,17 +59,20 @@ # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 10 # CHECK-NEXT: Successors [ # CHECK-NEXT: { # CHECK-NEXT: ID: 2 # RAW-NEXT: Probability: 0x80000000 # PRETTY-NEXT: Probability: 0x80000000 / 0x80000000 = 100.00% +# CHECK-NEXT: PostLink Probability: 7 # CHECK-NEXT: } # CHECK-NEXT: ] # CHECK-NEXT: } # CHECK-NEXT: { # RAW-NEXT: Frequency: 100 # PRETTY-NEXT: Frequency: 1.0 +# CHECK-NEXT: PostLink Frequency: 0 # CHECK-NEXT: Successors [ # CHECK-NEXT: ] # CHECK-NEXT: } @@ -172,8 +175,8 @@ Sections: ShSize: [[SIZE=<none>]] Link: .text Entries: - - Version: 2 - Feature: 0x7 + - Version: 5 + Feature: 0x87 BBRanges: - BaseAddress: [[ADDR=0x11111]] BBEntries: @@ -197,10 +200,12 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 - BBFreq: 100 Successors: [] - FuncEntryCount: 8888 diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 51a5a63ba370c..ff2c9ae00bdb9 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 271 +# CHECK: == Total TLI yes SDK yes: 277 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 270 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 276 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. 
-# AVAIL: TLI knows 524 symbols, 289 available -# AVAIL-COUNT-289: {{^}} available +# AVAIL: TLI knows 530 symbols, 295 available +# AVAIL-COUNT-295: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-235: not available # UNAVAIL-NOT: not available @@ -778,6 +778,30 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: nextafter + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nextafterf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nextafterl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttoward + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttowardf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: nexttowardl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: perror Type: STT_FUNC Section: .text diff --git a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml index 299bf463cf4bc..645507af080cb 100644 --- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -15,7 +15,7 @@ # VALID-NEXT: Type: SHT_LLVM_BB_ADDR_MAP # VALID-NEXT: Entries: # VALID-NEXT: - Version: 2 -# VALID-NEXT: Feature: 0x7 +# VALID-NEXT: Feature: 0x87 ## The 'BaseAddress' field is omitted when it's zero. # VALID-NEXT: BBRanges: # VALID-NEXT: - BBEntries: @@ -43,17 +43,23 @@ # VALID-NEXT: PGOAnalyses: # VALID-NEXT: - FuncEntryCount: 100 # VALID-NEXT: PGOBBEntries: -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 10 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 2 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0x80000000 -# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: - ID: 2 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 7 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0x80000000 +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 50 +# VALID-NEXT: PostLinkBBFreq: 0 # VALID-NEXT: Successors: -# VALID-NEXT: - ID: 4 -# VALID-NEXT: BrProb: 0xFFFFFFFF -# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: - ID: 4 +# VALID-NEXT: BrProb: 0xFFFFFFFF +# VALID-NEXT: PostLinkBrFreq: 0 +# VALID-NEXT: - BBFreq: 100 +# VALID-NEXT: PostLinkBBFreq: 3 # VALID-NEXT: Successors: [] # VALID-NEXT: PGOBBEntries: # VALID-NEXT: - BBFreq: 20 @@ -69,7 +75,7 @@ Sections: ShSize: [[SIZE=<none>]] Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0 BBEntries: @@ -97,17 +103,20 @@ Sections: PGOAnalyses: - FuncEntryCount: 100 PGOBBEntries: - - BBFreq: 100 + - BBFreq: 100 + PostLinkBBFreq: 10 Successors: - - ID: 2 - BrProb: 0x80000000 - - ID: 4 - BrProb: 0x80000000 - - BBFreq: 50 + - ID: 2 + BrProb: 0x80000000 + PostLinkBrFreq: 7 + - ID: 4 + BrProb: 0x80000000 + - BBFreq: 50 Successors: - - ID: 4 - BrProb: 0xFFFFFFFF - - BBFreq: 100 + - ID: 4 + BrProb: 0xFFFFFFFF + - BBFreq: 100 + PostLinkBBFreq: 3 Successors: [] - PGOBBEntries: - BBFreq: 20 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml index a4cb572e6d993..ac9c8d402b0a6 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map-pgo-analysis-map.yaml @@ -6,8 +6,9 @@ # Case 4: Specify Entries. 
# CHECK: Name: .llvm_bb_addr_map (1) # CHECK: SectionData ( -# CHECK-NEXT: 0000: 02072000 00000000 0000010B 010203E8 -# CHECK-NEXT: 0010: 07E80702 0CEEDDBB F70E0D91 A2C48801 +# CHECK-NEXT: 0000: 02872000 00000000 0000010B 010203E8 +# CHECK-NEXT: 0010: 07E80764 020CEEDD BBF70E28 0D91A2C4 +# CHECK-NEXT: 0020: 880100 # CHECK-NEXT: ) # Case 7: Not including a field which is enabled in feature doesn't emit value @@ -26,12 +27,12 @@ Sections: ## Test the following cases: ## 1) We can produce an .llvm_bb_addr_map section from a description with -## Entries and PGO Analysis data. +## Entries and PGO Analysis and Post Link data. - Name: '.llvm_bb_addr_map (1)' Type: SHT_LLVM_BB_ADDR_MAP Entries: - Version: 2 - Feature: 0x7 + Feature: 0x87 BBRanges: - BaseAddress: 0x0000000000000020 BBEntries: @@ -42,12 +43,14 @@ Sections: PGOAnalyses: - FuncEntryCount: 1000 PGOBBEntries: - - BBFreq: 1000 + - BBFreq: 1000 + PostLinkBBFreq: 100 Successors: - - ID: 12 - BrProb: 0xeeeeeeee - - ID: 13 - BrProb: 0x11111111 + - ID: 12 + BrProb: 0xeeeeeeee + PostLinkBrFreq: 40 + - ID: 13 + BrProb: 0x11111111 ## 2) According to feature we have FuncEntryCount but none is provided in yaml - Name: '.llvm_bb_addr_map (2)' @@ -66,7 +69,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported feature. # RUN: yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INVALID-FEATURE -# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0xf0 +# INVALID-FEATURE: warning: invalid encoding for BBAddrMap::Features: 0x100 --- !ELF FileHeader: @@ -79,4 +82,4 @@ Sections: Entries: - Version: 2 ## Specify unsupported feature - Feature: 0xF0 + Feature: 0x100 diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml index 339e419b39458..05d77d67e4468 100644 --- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml +++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml @@ -220,7 +220,7 @@ Sections: ## Check that yaml2obj generates a warning when we use unsupported versions. # RUN: yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION -# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 5; encoding using the most recent version +# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 6; encoding using the most recent version --- !ELF FileHeader: @@ -232,4 +232,4 @@ Sections: Type: SHT_LLVM_BB_ADDR_MAP Entries: ## Specify unsupported version - - Version: 5 + - Version: 6 diff --git a/llvm/tools/bugpoint/ListReducer.h b/llvm/tools/bugpoint/ListReducer.h index 06f8ddb255346..ceee85325129e 100644 --- a/llvm/tools/bugpoint/ListReducer.h +++ b/llvm/tools/bugpoint/ListReducer.h @@ -32,7 +32,7 @@ template <typename ElTy> struct ListReducer { KeepPrefix // The prefix alone satisfies the predicate }; - virtual ~ListReducer() {} + virtual ~ListReducer() = default; /// This virtual function should be overriden by subclasses to implement the /// test desired. The testcase is only required to test to see if the Kept diff --git a/llvm/tools/bugpoint/ToolRunner.h b/llvm/tools/bugpoint/ToolRunner.h index c9da9afba0e46..9ff06639d311d 100644 --- a/llvm/tools/bugpoint/ToolRunner.h +++ b/llvm/tools/bugpoint/ToolRunner.h @@ -105,7 +105,7 @@ class AbstractInterpreter { createCustomExecutor(const char *Argv0, std::string &Message, const std::string &ExecCommandLine); - virtual ~AbstractInterpreter() {} + virtual ~AbstractInterpreter() = default; /// compileProgram - Compile the specified program from bitcode to executable /// code. 
This does not produce any output, it is only used when debugging diff --git a/llvm/tools/dsymutil/BinaryHolder.h b/llvm/tools/dsymutil/BinaryHolder.h index cb5bd95978144..27d71514cb73e 100644 --- a/llvm/tools/dsymutil/BinaryHolder.h +++ b/llvm/tools/dsymutil/BinaryHolder.h @@ -110,7 +110,7 @@ class BinaryHolder { std::string Filename; TimestampTy Timestamp; - KeyTy() {} + KeyTy() = default; KeyTy(StringRef Filename, TimestampTy Timestamp) : Filename(Filename.str()), Timestamp(Timestamp) {} }; diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..1fc5bba602d8b 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -90,7 +90,6 @@ #include <cstdlib> #include <cstring> #include <limits> -#include <map> #include <memory> #include <optional> #include <string> @@ -794,9 +793,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. 
Defaults to 'classic'">,
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 913077eb0b06d..688f6aaf3d0c9 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
   Options.LinkOpts.RemarksKeepAll =
       !Args.hasArg(OPT_remarks_drop_without_debug);

+  Options.LinkOpts.IncludeSwiftModulesFromInterface =
+      Args.hasArg(OPT_include_swiftmodules_from_interface);
+
   if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix))
     Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue();

diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index 256933d3f53f9..06045a66ad3e8 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -36,7 +36,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Host.h"
 #include <list>
-#include <map>
 #include <plugin-api.h>
 #include <string>
 #include <system_error>
diff --git a/llvm/tools/llc/NewPMDriver.h b/llvm/tools/llc/NewPMDriver.h
index c8a60223cb296..0dbd46797dabc 100644
--- a/llvm/tools/llc/NewPMDriver.h
+++ b/llvm/tools/llc/NewPMDriver.h
@@ -22,7 +22,6 @@
 #include "llvm/IR/DiagnosticHandler.h"
 #include "llvm/Support/CodeGen.h"
 #include <memory>
-#include <vector>

 namespace llvm {
 class Module;
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 152f7db0719a1..dc2f878830863 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -15,6 +15,7 @@
 #include "NewPMDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
@@ -45,6 +46,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/PGOOptions.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
@@ -57,6 +59,7 @@
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <cassert>
 #include <memory>
 #include <optional>
 using namespace llvm;
@@ -208,6 +211,20 @@ static cl::opt<std::string> RemarksFormat(
     cl::desc("The format used for serializing remarks (default: YAML)"),
     cl::value_desc("format"), cl::init("yaml"));

+enum SaveStatsMode { None, Cwd, Obj };
+
+static cl::opt<SaveStatsMode> SaveStats(
+    "save-stats",
+    cl::desc("Save LLVM statistics to a file in the current directory "
+             "(`-save-stats`/`-save-stats=cwd`) or the directory of the output "
+             "file (`-save-stats=obj`). 
(default: cwd)"), + cl::values(clEnumValN(SaveStatsMode::Cwd, "cwd", + "Save to the current working directory"), + clEnumValN(SaveStatsMode::Cwd, "", ""), + clEnumValN(SaveStatsMode::Obj, "obj", + "Save to the output file directory")), + cl::init(SaveStatsMode::None), cl::ValueOptional); + static cl::opt<bool> EnableNewPassManager( "enable-new-pm", cl::desc("Enable the new pass manager"), cl::init(false)); @@ -281,7 +298,8 @@ static void setPGOOptions(TargetMachine &TM) { TM.setPGOOption(PGOOpt); } -static int compileModule(char **, LLVMContext &); +static int compileModule(char **argv, LLVMContext &Context, + std::string &OutputFilename); [[noreturn]] static void reportError(Twine Msg, StringRef Filename = "") { SmallString<256> Prefix; @@ -360,6 +378,45 @@ static std::unique_ptr<ToolOutputFile> GetOutputStream(const char *TargetName, return FDOut; } +static int MaybeEnableStats() { + if (SaveStats == SaveStatsMode::None) + return 0; + + llvm::EnableStatistics(false); + return 0; +} + +static int MaybeSaveStats(std::string &&OutputFilename) { + if (SaveStats == SaveStatsMode::None) + return 0; + + SmallString<128> StatsFilename; + if (SaveStats == SaveStatsMode::Obj) { + StatsFilename = OutputFilename; + llvm::sys::path::remove_filename(StatsFilename); + } else { + assert(SaveStats == SaveStatsMode::Cwd && + "Should have been a valid --save-stats value"); + } + + auto BaseName = llvm::sys::path::filename(OutputFilename); + llvm::sys::path::append(StatsFilename, BaseName); + llvm::sys::path::replace_extension(StatsFilename, "stats"); + + auto FileFlags = llvm::sys::fs::OF_TextWithCRLF; + std::error_code EC; + auto StatsOS = + std::make_unique<llvm::raw_fd_ostream>(StatsFilename, EC, FileFlags); + if (EC) { + WithColor::error(errs(), "llc") + << "Unable to open statistics file: " << EC.message() << "\n"; + return 1; + } + + llvm::PrintStatisticsJSON(*StatsOS); + return 0; +} + // main - Entry point for the llc compiler. // int main(int argc, char **argv) { @@ -437,18 +494,23 @@ int main(int argc, char **argv) { reportError(std::move(E), RemarksFilename); LLVMRemarkFileHandle RemarksFile = std::move(*RemarksFileOrErr); + if (int RetVal = MaybeEnableStats()) + return RetVal; + std::string OutputFilename; + if (InputLanguage != "" && InputLanguage != "ir" && InputLanguage != "mir") reportError("input language must be '', 'IR' or 'MIR'"); // Compile the module TimeCompilations times to give better compile time // metrics. for (unsigned I = TimeCompilations; I; --I) - if (int RetVal = compileModule(argv, Context)) + if (int RetVal = compileModule(argv, Context, OutputFilename)) return RetVal; if (RemarksFile) RemarksFile->keep(); - return 0; + + return MaybeSaveStats(std::move(OutputFilename)); } static bool addPass(PassManagerBase &PM, const char *argv0, StringRef PassName, @@ -480,7 +542,8 @@ static bool addPass(PassManagerBase &PM, const char *argv0, StringRef PassName, return false; } -static int compileModule(char **argv, LLVMContext &Context) { +static int compileModule(char **argv, LLVMContext &Context, + std::string &OutputFilename) { // Load the module to be compiled... SMDiagnostic Err; std::unique_ptr<Module> M; @@ -664,6 +727,9 @@ static int compileModule(char **argv, LLVMContext &Context) { // Ensure the filename is passed down to CodeViewDebug. 
Target->Options.ObjectFilenameForDebug = Out->outputFilename(); + // Return a copy of the output filename via the output param + OutputFilename = Out->outputFilename(); + // Tell target that this tool is not necessarily used with argument ABI // compliance (i.e. narrow integer argument extensions). Target->Options.VerifyArgABICompliance = 0; diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 7fee06b5d7b4f..017e2102348b7 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -305,7 +305,7 @@ class LLIObjectCache : public ObjectCache { this->CacheDir[this->CacheDir.size() - 1] != '/') this->CacheDir += '/'; } - ~LLIObjectCache() override {} + ~LLIObjectCache() override = default; void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override { const std::string &ModuleID = M->getModuleIdentifier(); diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 9db7aa0929aab..677722fea1a98 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -364,6 +364,43 @@ int llvm_test_dibuilder(void) { assert(AddDbgRecordUnderTheRange == NULL); (void)AddDbgRecordUnderTheRange; + // Test that we can read the first debug record. + LLVMMetadataRef AddDbgRecordFirstDebugLoc = + LLVMDbgRecordGetDebugLoc(AddDbgRecordFirst); + (void)AddDbgRecordFirstDebugLoc; + assert(LLVMDILocationGetLine(AddDbgRecordFirstDebugLoc) == 43); + assert(LLVMDbgRecordGetKind(AddDbgRecordFirst) == LLVMDbgRecordValue); + LLVMValueRef AddDbgRecordFirstValue = + LLVMDbgVariableRecordGetValue(AddDbgRecordFirst, 0); + (void)AddDbgRecordFirstValue; + assert(LLVMGetValueKind(AddDbgRecordFirstValue) == LLVMConstantIntValueKind); + assert(LLVMConstIntGetZExtValue(AddDbgRecordFirstValue) == 0); + LLVMMetadataRef AddDbgRecordFirstVariable = + LLVMDbgVariableRecordGetVariable(AddDbgRecordFirst); + (void)AddDbgRecordFirstVariable; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariable) == + LLVMDILocalVariableMetadataKind); + // TODO: For now, there is no way to get the name. 
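+  // Instead, walk from the variable to its enclosing scope and on to the
+  // file, which are reachable through the C API.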
+ LLVMMetadataRef AddDbgRecordFirstVariableScope = + LLVMDIVariableGetScope(AddDbgRecordFirstVariable); + (void)AddDbgRecordFirstVariableScope; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableScope) == + LLVMDILexicalBlockMetadataKind); + LLVMMetadataRef AddDbgRecordFirstVariableFile = + LLVMDIScopeGetFile(AddDbgRecordFirstVariableScope); + (void)AddDbgRecordFirstVariableFile; + assert(LLVMGetMetadataKind(AddDbgRecordFirstVariableFile) == + LLVMDIFileMetadataKind); + unsigned FileLen = 0; + assert(strcmp(LLVMDIFileGetFilename(AddDbgRecordFirstVariableFile, &FileLen), + "debuginfo.c") == 0); + (void)FileLen; + LLVMMetadataRef AddDbgRecordFirstExpr = + LLVMDbgVariableRecordGetExpression(AddDbgRecordFirst); + assert(LLVMGetMetadataKind(AddDbgRecordFirstExpr) == + LLVMDIExpressionMetadataKind); + (void)AddDbgRecordFirstExpr; + char *MStr = LLVMPrintModuleToString(M); puts(MStr); LLVMDisposeMessage(MStr); diff --git a/llvm/tools/llvm-cas/CMakeLists.txt b/llvm/tools/llvm-cas/CMakeLists.txt new file mode 100644 index 0000000000000..e9d40cb49e015 --- /dev/null +++ b/llvm/tools/llvm-cas/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(LLVMCASToolTableGen) + +set(LLVM_LINK_COMPONENTS + Support + CAS + Option + ) + +add_llvm_tool(llvm-cas + llvm-cas.cpp + + DEPENDS + ${tablegen_deps} + LLVMCASToolTableGen + ) diff --git a/llvm/tools/llvm-cas/Options.td b/llvm/tools/llvm-cas/Options.td new file mode 100644 index 0000000000000..5ae64c104fdb6 --- /dev/null +++ b/llvm/tools/llvm-cas/Options.td @@ -0,0 +1,63 @@ +include "llvm/Option/OptParser.td" + +class F<string name> : Flag<["--", "-"], name>; + +def grp_action : OptionGroup<"Actions">, HelpText<"llvm-cas actions">; + +def help : F<"help">, HelpText<"Prints this help output">; +def : Flag<["-"], "h">, Alias<help>, HelpText<"Alias for --help">; + +// Tool actions + +def cas_dump : F<"dump">, HelpText<"Dump internal contents">, Group<grp_action>; +def cat_node_data : F<"cat-node-data">, + HelpText<"Cat node data">, + Group<grp_action>; +def make_blob : F<"make-blob">, HelpText<"Make blob">, Group<grp_action>; +def make_node : F<"make-node">, HelpText<"Make node">, Group<grp_action>; +def ls_node_refs : F<"ls-node-refs">, + HelpText<"List node refs">, + Group<grp_action>; +def import : F<"import">, + HelpText<"Import objects from another CAS">, + Group<grp_action>; +def put_cache_key : F<"put-cache-key">, + HelpText<"Set a value for a cache key">, + Group<grp_action>; +def get_cache_result : F<"get-cache-result">, + HelpText<"Get the result value from a cache key">, + Group<grp_action>; +def validate : F<"validate">, + HelpText<"Validate ObjectStore">, + Group<grp_action>; +def validate_object : F<"validate-object">, + HelpText<"Validate the object for CASID">, + Group<grp_action>; +def validate_if_needed : F<"validate-if-needed">, + HelpText<"Validate cas contents if needed">, + Group<grp_action>; +def prune : F<"prune">, HelpText<"Prune local cas storage">, Group<grp_action>; + +// Tool options + +def cas_path : Separate<["-", "--"], "cas">, + MetaVarName<"<path>">, + HelpText<"Path to CAS on disk">; + +def upstream_cas : Separate<["-", "--"], "upstream-cas">, + MetaVarName<"<path>">, + HelpText<"Path to another upstream CAS">; + +def data : Separate<["-", "--"], "data">, + MetaVarName<"<path>">, + HelpText<"Path to data or '-' for stdin">; + +def check_hash : F<"check-hash">, + HelpText<"Check all hashes during validation">; + +def allow_recovery 
: F<"allow-recovery">, + HelpText<"Allow recovery of CAS data">; + +def force : F<"force">, HelpText<"Force validation even if unnecessary">; + +def in_process : F<"in-process">, HelpText<"Validation in-process">; diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp new file mode 100644 index 0000000000000..e72ee470d2319 --- /dev/null +++ b/llvm/tools/llvm-cas/llvm-cas.cpp @@ -0,0 +1,405 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file A utility for operating on LLVM CAS. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; + +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "Options.inc" +#undef OPTION +}; + +#define OPTTABLE_STR_TABLE_CODE +#include "Options.inc" +#undef OPTTABLE_STR_TABLE_CODE + +#define OPTTABLE_PREFIXES_TABLE_CODE +#include "Options.inc" +#undef OPTTABLE_PREFIXES_TABLE_CODE + +using namespace llvm::opt; +static constexpr opt::OptTable::Info InfoTable[] = { +#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "Options.inc" +#undef OPTION +}; + +class LLVMCASOptTable : public opt::GenericOptTable { +public: + LLVMCASOptTable() + : opt::GenericOptTable(OptionStrTable, OptionPrefixesTable, InfoTable) {} +}; + +enum class CommandKind { + Invalid, + Dump, + CatNodeData, + MakeBlob, + MakeNode, + ListObjectReferences, + Import, + PutCacheKey, + GetCacheResult, + Validate, + ValidateObject, + ValidateIfNeeded, + Prune, +}; + +struct CommandOptions { + CommandKind Command = CommandKind::Invalid; + std::vector<std::string> Inputs; + std::string CASPath; + std::string UpstreamCASPath; + std::string DataPath; + bool CheckHash; + bool AllowRecovery; + bool Force; + bool InProcess; + + static CommandKind getCommandKind(opt::Arg &A) { + switch (A.getOption().getID()) { + case OPT_cas_dump: + return CommandKind::Dump; + case OPT_cat_node_data: + return CommandKind::CatNodeData; + case OPT_make_blob: + return CommandKind::MakeBlob; + case OPT_make_node: + return CommandKind::MakeNode; + case OPT_ls_node_refs: + return CommandKind::ListObjectReferences; + case OPT_import: + return CommandKind::Import; + case OPT_put_cache_key: + return CommandKind::PutCacheKey; + case OPT_get_cache_result: + return CommandKind::GetCacheResult; + case OPT_validate: + return CommandKind::Validate; + case OPT_validate_object: + return CommandKind::ValidateObject; + case OPT_validate_if_needed: + return CommandKind::ValidateIfNeeded; + case OPT_prune: + return CommandKind::Prune; + } + return CommandKind::Invalid; + } + + // Command requires input. 
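+  // (Creation and whole-store actions take no <input> CASIDs; every other
+  // action needs at least one.)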
+  static bool requiresInput(CommandKind Kind) {
+    return Kind != CommandKind::ValidateIfNeeded &&
+           Kind != CommandKind::Validate && Kind != CommandKind::MakeBlob &&
+           Kind != CommandKind::MakeNode && Kind != CommandKind::Dump &&
+           Kind != CommandKind::Prune;
+  }
+};
+} // namespace
+
+static int dump(ObjectStore &CAS);
+static int listObjectReferences(ObjectStore &CAS, const CASID &ID);
+static int catNodeData(ObjectStore &CAS, const CASID &ID);
+static int makeBlob(ObjectStore &CAS, StringRef DataPath);
+static int makeNode(ObjectStore &CAS, ArrayRef<std::string> References,
+                    StringRef DataPath);
+static int import(ObjectStore &FromCAS, ObjectStore &ToCAS,
+                  ArrayRef<std::string> Objects);
+static int putCacheKey(ObjectStore &CAS, ActionCache &AC,
+                       ArrayRef<std::string> Objects);
+static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID);
+static int validateObject(ObjectStore &CAS, const CASID &ID);
+static int validate(ObjectStore &CAS, ActionCache &AC, bool CheckHash);
+static int validateIfNeeded(StringRef Path, bool CheckHash, bool Force,
+                            bool AllowRecovery, bool InProcess,
+                            const char *Argv0);
+static int prune(cas::ObjectStore &CAS);
+
+static Expected<CommandOptions> parseOptions(int Argc, char **Argv) {
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+  SmallVector<const char *> ExpandedArgs;
+  if (!cl::expandResponseFiles(Argc, Argv, nullptr, Saver, ExpandedArgs))
+    return createStringError("cannot expand response file");
+
+  LLVMCASOptTable T;
+  unsigned MI, MC;
+  opt::InputArgList Args = T.ParseArgs(ExpandedArgs, MI, MC);
+
+  for (auto *Arg : Args.filtered(OPT_UNKNOWN)) {
+    llvm::errs() << "ignoring unknown option: " << Arg->getSpelling() << '\n';
+  }
+
+  if (Args.hasArg(OPT_help)) {
+    T.printHelp(
+        outs(),
+        (std::string(Argv[0]) + " [action] [options] <input files>").c_str(),
+        "llvm-cas tool that performs CAS actions.", false);
+    exit(0);
+  }
+
+  CommandOptions Opts;
+  for (auto *A : Args.filtered(OPT_grp_action))
+    Opts.Command = CommandOptions::getCommandKind(*A);
+
+  if (Opts.Command == CommandKind::Invalid)
+    return createStringError("no command action is specified");
+
+  for (auto *File : Args.filtered(OPT_INPUT))
+    Opts.Inputs.push_back(File->getValue());
+  Opts.CASPath = Args.getLastArgValue(OPT_cas_path);
+  Opts.UpstreamCASPath = Args.getLastArgValue(OPT_upstream_cas);
+  Opts.DataPath = Args.getLastArgValue(OPT_data);
+  Opts.CheckHash = Args.hasArg(OPT_check_hash);
+  Opts.AllowRecovery = Args.hasArg(OPT_allow_recovery);
+  Opts.Force = Args.hasArg(OPT_force);
+  Opts.InProcess = Args.hasArg(OPT_in_process);
+
+  // Validate options.
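+  // (--cas is required for every action.)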
+ if (Opts.CASPath.empty()) + return createStringError("missing --cas <path>"); + + if (Opts.Inputs.empty() && CommandOptions::requiresInput(Opts.Command)) + return createStringError("missing <input> to operate on"); + + return Opts; +} + +int main(int Argc, char **Argv) { + InitLLVM X(Argc, Argv); + + ExitOnError ExitOnErr; + auto Opts = ExitOnErr(parseOptions(Argc, Argv)); + + if (Opts.Command == CommandKind::ValidateIfNeeded) + return validateIfNeeded(Opts.CASPath, Opts.CheckHash, Opts.Force, + Opts.AllowRecovery, Opts.InProcess, Argv[0]); + + auto [CAS, AC] = ExitOnErr(createOnDiskUnifiedCASDatabases(Opts.CASPath)); + assert(CAS); + + if (Opts.Command == CommandKind::Dump) + return dump(*CAS); + + if (Opts.Command == CommandKind::Validate) + return validate(*CAS, *AC, Opts.CheckHash); + + if (Opts.Command == CommandKind::MakeBlob) + return makeBlob(*CAS, Opts.DataPath); + + if (Opts.Command == CommandKind::MakeNode) + return makeNode(*CAS, Opts.Inputs, Opts.DataPath); + + if (Opts.Command == CommandKind::Prune) + return prune(*CAS); + + if (Opts.Command == CommandKind::Import) { + if (Opts.UpstreamCASPath.empty()) + ExitOnErr(createStringError("missing '-upstream-cas'")); + + auto [UpstreamCAS, _] = + ExitOnErr(createOnDiskUnifiedCASDatabases(Opts.UpstreamCASPath)); + return import(*UpstreamCAS, *CAS, Opts.Inputs); + } + + if (Opts.Command == CommandKind::PutCacheKey || + Opts.Command == CommandKind::GetCacheResult) { + if (!AC) + ExitOnErr(createStringError("no action-cache available")); + } + + if (Opts.Command == CommandKind::PutCacheKey) + return putCacheKey(*CAS, *AC, Opts.Inputs); + + // Remaining commands need exactly one CAS object. + if (Opts.Inputs.size() > 1) + ExitOnErr(createStringError("too many <object>s, expected 1")); + CASID ID = ExitOnErr(CAS->parseID(Opts.Inputs.front())); + + if (Opts.Command == CommandKind::GetCacheResult) + return getCacheResult(*CAS, *AC, ID); + + if (Opts.Command == CommandKind::ListObjectReferences) + return listObjectReferences(*CAS, ID); + + if (Opts.Command == CommandKind::CatNodeData) + return catNodeData(*CAS, ID); + + assert(Opts.Command == CommandKind::ValidateObject); + return validateObject(*CAS, ID); +} + +static Expected<std::unique_ptr<MemoryBuffer>> openBuffer(StringRef DataPath) { + if (DataPath.empty()) + return createStringError("--data missing"); + return errorOrToExpected(DataPath == "-" + ? 
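+      // ("-" selects stdin; anything else is opened as a file.)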
llvm::MemoryBuffer::getSTDIN() + : llvm::MemoryBuffer::getFile(DataPath)); +} + +int dump(ObjectStore &CAS) { + ExitOnError ExitOnErr("llvm-cas: dump: "); + CAS.print(llvm::outs()); + return 0; +} + +int makeBlob(ObjectStore &CAS, StringRef DataPath) { + ExitOnError ExitOnErr("llvm-cas: make-blob: "); + std::unique_ptr<MemoryBuffer> Buffer = ExitOnErr(openBuffer(DataPath)); + + ObjectProxy Blob = ExitOnErr(CAS.createProxy({}, Buffer->getBuffer())); + llvm::outs() << Blob.getID() << "\n"; + return 0; +} + +int catNodeData(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: cat-node-data: "); + llvm::outs() << ExitOnErr(CAS.getProxy(ID)).getData(); + return 0; +} + +int listObjectReferences(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: ls-node-refs: "); + + ObjectProxy Object = ExitOnErr(CAS.getProxy(ID)); + ExitOnErr(Object.forEachReference([&](ObjectRef Ref) -> Error { + llvm::outs() << CAS.getID(Ref) << "\n"; + return Error::success(); + })); + + return 0; +} + +static int makeNode(ObjectStore &CAS, ArrayRef<std::string> Objects, + StringRef DataPath) { + std::unique_ptr<MemoryBuffer> Data = + ExitOnError("llvm-cas: make-node: data: ")(openBuffer(DataPath)); + + SmallVector<ObjectRef> IDs; + for (StringRef Object : Objects) { + ExitOnError ObjectErr("llvm-cas: make-node: ref: "); + std::optional<ObjectRef> ID = + CAS.getReference(ObjectErr(CAS.parseID(Object))); + if (!ID) + ObjectErr(createStringError("unknown object '" + Object + "'")); + IDs.push_back(*ID); + } + + ExitOnError ExitOnErr("llvm-cas: make-node: "); + ObjectProxy Object = ExitOnErr(CAS.createProxy(IDs, Data->getBuffer())); + llvm::outs() << Object.getID() << "\n"; + return 0; +} + +static int import(ObjectStore &FromCAS, ObjectStore &ToCAS, + ArrayRef<std::string> Objects) { + ExitOnError ExitOnErr("llvm-cas: import: "); + + for (StringRef Object : Objects) { + CASID ID = ExitOnErr(FromCAS.parseID(Object)); + auto Ref = FromCAS.getReference(ID); + if (!Ref) + ExitOnErr(createStringError("input not found: " + ID.toString())); + + auto Imported = ExitOnErr(ToCAS.importObject(FromCAS, *Ref)); + llvm::outs() << ToCAS.getID(Imported).toString() << "\n"; + } + return 0; +} + +static int putCacheKey(ObjectStore &CAS, ActionCache &AC, + ArrayRef<std::string> Objects) { + ExitOnError ExitOnErr("llvm-cas: put-cache-key: "); + + if (Objects.size() % 2 != 0) + ExitOnErr(createStringError("expected pairs of inputs")); + while (!Objects.empty()) { + CASID Key = ExitOnErr(CAS.parseID(Objects[0])); + CASID Result = ExitOnErr(CAS.parseID(Objects[1])); + Objects = Objects.drop_front(2); + ExitOnErr(AC.put(Key, Result)); + } + return 0; +} + +static int getCacheResult(ObjectStore &CAS, ActionCache &AC, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: get-cache-result: "); + + auto Result = ExitOnErr(AC.get(ID)); + if (!Result) { + outs() << "result not found\n"; + return 1; + } + outs() << *Result << "\n"; + return 0; +} + +int validateObject(ObjectStore &CAS, const CASID &ID) { + ExitOnError ExitOnErr("llvm-cas: validate-object: "); + ExitOnErr(CAS.validateObject(ID)); + outs() << ID << ": validated successfully\n"; + return 0; +} + +int validate(ObjectStore &CAS, ActionCache &AC, bool CheckHash) { + ExitOnError ExitOnErr("llvm-cas: validate: "); + ExitOnErr(CAS.validate(CheckHash)); + ExitOnErr(AC.validate()); + outs() << "validated successfully\n"; + return 0; +} + +int validateIfNeeded(StringRef Path, bool CheckHash, bool Force, + bool AllowRecovery, bool InProcess, const char *Argv0) { 
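+  // Out-of-process validation re-executes this binary, so resolve its own
+  // path up front. Example invocation (hypothetical paths):
+  //   llvm-cas --cas /path/to/cas --validate-if-needed --allow-recovery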
+ ExitOnError ExitOnErr("llvm-cas: validate-if-needed: "); + std::string ExecStorage; + std::optional<StringRef> Exec; + if (!InProcess) { + ExecStorage = sys::fs::getMainExecutable(Argv0, (void *)validateIfNeeded); + Exec = ExecStorage; + } + ValidationResult Result = ExitOnErr(validateOnDiskUnifiedCASDatabasesIfNeeded( + Path, CheckHash, AllowRecovery, Force, Exec)); + switch (Result) { + case ValidationResult::Valid: + outs() << "validated successfully\n"; + break; + case ValidationResult::Recovered: + outs() << "recovered from invalid data\n"; + break; + case ValidationResult::Skipped: + outs() << "validation skipped\n"; + break; + } + return 0; +} + +static int prune(cas::ObjectStore &CAS) { + ExitOnError ExitOnErr("llvm-cas: prune: "); + ExitOnErr(CAS.pruneStorageData()); + return 0; +} diff --git a/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h b/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h index 55e628ab1a8de..4ee3e7c22c97c 100644 --- a/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h +++ b/llvm/tools/llvm-cfi-verify/lib/GraphBuilder.h @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include <functional> -#include <set> using Instr = llvm::cfi_verify::FileAnalysis::Instr; diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp index 020b1b5e093d5..5300c5c83e5ce 100644 --- a/llvm/tools/llvm-config/llvm-config.cpp +++ b/llvm/tools/llvm-config/llvm-config.cpp @@ -24,6 +24,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" @@ -232,6 +233,7 @@ Options:\n\ --link-static Link the component libraries statically.\n\ --obj-root Print the object root used to build LLVM.\n\ --prefix Print the installation prefix.\n\ + --quote-paths Quote and escape paths when needed.\n\ --shared-mode Print how the provided components can be collectively linked (`shared` or `static`).\n\ --system-libs System Libraries needed to link against LLVM components.\n\ --targets-built List of all targets currently built.\n\ @@ -324,7 +326,7 @@ int main(int argc, char **argv) { // information. std::string ActivePrefix, ActiveBinDir, ActiveIncludeDir, ActiveLibDir, ActiveCMakeDir; - std::string ActiveIncludeOption; + std::vector<std::string> ActiveIncludeOptions; if (IsInDevelopmentTree) { ActiveIncludeDir = std::string(LLVM_SRC_ROOT) + "/include"; ActivePrefix = CurrentExecPrefix; @@ -350,8 +352,8 @@ int main(int argc, char **argv) { } // We need to include files from both the source and object trees. 
- ActiveIncludeOption = - ("-I" + ActiveIncludeDir + " " + "-I" + ActiveObjRoot + "/include"); + ActiveIncludeOptions.push_back(ActiveIncludeDir); + ActiveIncludeOptions.push_back(ActiveObjRoot + "/include"); } else { ActivePrefix = CurrentExecPrefix; { @@ -370,7 +372,7 @@ int main(int argc, char **argv) { sys::path::make_absolute(ActivePrefix, Path); ActiveCMakeDir = std::string(Path); } - ActiveIncludeOption = "-I" + ActiveIncludeDir; + ActiveIncludeOptions.push_back(ActiveIncludeDir); } /// We only use `shared library` mode in cases where the static library form @@ -399,7 +401,9 @@ int main(int argc, char **argv) { llvm::replace(ActiveBinDir, '/', '\\'); llvm::replace(ActiveLibDir, '/', '\\'); llvm::replace(ActiveCMakeDir, '/', '\\'); - llvm::replace(ActiveIncludeOption, '/', '\\'); + llvm::replace(ActiveIncludeDir, '/', '\\'); + for (auto &Include : ActiveIncludeOptions) + llvm::replace(Include, '/', '\\'); } SharedDir = ActiveBinDir; StaticDir = ActiveLibDir; @@ -501,6 +505,32 @@ int main(int argc, char **argv) { }; raw_ostream &OS = outs(); + + // Check if we want quoting and escaping. + bool QuotePaths = std::any_of(&argv[0], &argv[argc], [](const char *Arg) { + return StringRef(Arg) == "--quote-paths"; + }); + + auto MaybePrintQuoted = [&](StringRef Str) { + if (QuotePaths) + sys::printArg(OS, Str, /*Quote=*/false); // only add quotes if necessary + else + OS << Str; + }; + + // Render include paths and associated flags + auto RenderFlags = [&](StringRef Flags) { + bool First = true; + for (auto &Include : ActiveIncludeOptions) { + if (!First) + OS << ' '; + std::string FlagsStr = "-I" + Include; + MaybePrintQuoted(FlagsStr); + First = false; + } + OS << ' ' << Flags << '\n'; + }; + for (int i = 1; i != argc; ++i) { StringRef Arg = argv[i]; @@ -509,24 +539,32 @@ int main(int argc, char **argv) { if (Arg == "--version") { OS << PACKAGE_VERSION << '\n'; } else if (Arg == "--prefix") { - OS << ActivePrefix << '\n'; + MaybePrintQuoted(ActivePrefix); + OS << '\n'; } else if (Arg == "--bindir") { - OS << ActiveBinDir << '\n'; + MaybePrintQuoted(ActiveBinDir); + OS << '\n'; } else if (Arg == "--includedir") { - OS << ActiveIncludeDir << '\n'; + MaybePrintQuoted(ActiveIncludeDir); + OS << '\n'; } else if (Arg == "--libdir") { - OS << ActiveLibDir << '\n'; + MaybePrintQuoted(ActiveLibDir); + OS << '\n'; } else if (Arg == "--cmakedir") { - OS << ActiveCMakeDir << '\n'; + MaybePrintQuoted(ActiveCMakeDir); + OS << '\n'; } else if (Arg == "--cppflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CPPFLAGS << '\n'; + RenderFlags(LLVM_CPPFLAGS); } else if (Arg == "--cflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CFLAGS << '\n'; + RenderFlags(LLVM_CFLAGS); } else if (Arg == "--cxxflags") { - OS << ActiveIncludeOption << ' ' << LLVM_CXXFLAGS << '\n'; + RenderFlags(LLVM_CXXFLAGS); } else if (Arg == "--ldflags") { - OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L") - << ActiveLibDir << ' ' << LLVM_LDFLAGS << '\n'; + std::string LDFlags = + HostTriple.isWindowsMSVCEnvironment() ? 
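+        // (MSVC link.exe expects -LIBPATH:<dir>; GNU-style linkers expect -L<dir>.)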
"-LIBPATH:" : "-L"; + LDFlags += ActiveLibDir; + MaybePrintQuoted(LDFlags); + OS << ' ' << LLVM_LDFLAGS << '\n'; } else if (Arg == "--system-libs") { PrintSystemLibs = true; } else if (Arg == "--libs") { @@ -580,7 +618,8 @@ int main(int argc, char **argv) { } else if (Arg == "--shared-mode") { PrintSharedMode = true; } else if (Arg == "--obj-root") { - OS << ActivePrefix << '\n'; + MaybePrintQuoted(ActivePrefix); + OS << '\n'; } else if (Arg == "--ignore-libllvm") { LinkDyLib = false; LinkMode = BuiltSharedLibs ? LinkModeShared : LinkModeAuto; @@ -590,6 +629,8 @@ int main(int argc, char **argv) { LinkMode = LinkModeStatic; } else if (Arg == "--help") { usage(false); + } else if (Arg == "--quote-paths") { + // Was already handled above this loop. } else { usage(); } @@ -682,26 +723,30 @@ int main(int argc, char **argv) { auto PrintForLib = [&](const StringRef &Lib) { const bool Shared = LinkMode == LinkModeShared; + std::string LibFileName; if (PrintLibNames) { - OS << GetComponentLibraryFileName(Lib, Shared); + LibFileName = GetComponentLibraryFileName(Lib, Shared); } else if (PrintLibFiles) { - OS << GetComponentLibraryPath(Lib, Shared); + LibFileName = GetComponentLibraryPath(Lib, Shared); } else if (PrintLibs) { // On Windows, output full path to library without parameters. // Elsewhere, if this is a typical library name, include it using -l. if (HostTriple.isWindowsMSVCEnvironment()) { - OS << GetComponentLibraryPath(Lib, Shared); + LibFileName = GetComponentLibraryPath(Lib, Shared); } else { + LibFileName = "-l"; StringRef LibName; if (GetComponentLibraryNameSlice(Lib, LibName)) { // Extract library name (remove prefix and suffix). - OS << "-l" << LibName; + LibFileName += LibName; } else { // Lib is already a library name without prefix and suffix. - OS << "-l" << Lib; + LibFileName += Lib; } } } + if (!LibFileName.empty()) + MaybePrintQuoted(LibFileName); }; if (LinkMode == LinkModeShared && LinkDyLib) diff --git a/llvm/tools/llvm-cov/CoverageExporter.h b/llvm/tools/llvm-cov/CoverageExporter.h index 751e55dc09161..ba946a14e6e5c 100644 --- a/llvm/tools/llvm-cov/CoverageExporter.h +++ b/llvm/tools/llvm-cov/CoverageExporter.h @@ -37,7 +37,7 @@ class CoverageExporter { : Coverage(CoverageMapping), Options(Options), OS(OS) {} public: - virtual ~CoverageExporter(){}; + virtual ~CoverageExporter() = default; /// Render the CoverageMapping object. virtual void renderRoot(const CoverageFilters &IgnoreFilters) = 0; diff --git a/llvm/tools/llvm-cov/CoverageFilters.h b/llvm/tools/llvm-cov/CoverageFilters.h index 5345b0c87cc27..3cee23ae50dbf 100644 --- a/llvm/tools/llvm-cov/CoverageFilters.h +++ b/llvm/tools/llvm-cov/CoverageFilters.h @@ -28,7 +28,7 @@ struct FunctionRecord; /// Matches specific functions that pass the requirement of this filter. class CoverageFilter { public: - virtual ~CoverageFilter() {} + virtual ~CoverageFilter() = default; /// Return true if the function passes the requirements of this filter. 
virtual bool matches(const coverage::CoverageMapping &CM, diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h index 43fb890ad7687..bde187ea35ed1 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.h +++ b/llvm/tools/llvm-cov/SourceCoverageView.h @@ -122,7 +122,7 @@ class CoveragePrinter { static std::unique_ptr<CoveragePrinter> create(const CoverageViewOptions &Opts); - virtual ~CoveragePrinter() {} + virtual ~CoveragePrinter() = default; /// @name File Creation Interface /// @{ @@ -288,7 +288,7 @@ class SourceCoverageView { create(StringRef SourceName, const MemoryBuffer &File, const CoverageViewOptions &Options, CoverageData &&CoverageInfo); - virtual ~SourceCoverageView() {} + virtual ~SourceCoverageView() = default; /// Return the source name formatted for the host OS. std::string getSourceName() const; diff --git a/llvm/tools/llvm-diff/lib/DiffConsumer.h b/llvm/tools/llvm-diff/lib/DiffConsumer.h index 08c3afcbe111e..d4f339bd560f4 100644 --- a/llvm/tools/llvm-diff/lib/DiffConsumer.h +++ b/llvm/tools/llvm-diff/lib/DiffConsumer.h @@ -49,7 +49,7 @@ class StringRef; virtual void logd(const DiffLogBuilder &Log) = 0; protected: - virtual ~Consumer() {} + virtual ~Consumer() = default; }; class DiffConsumer : public Consumer { diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.h b/llvm/tools/llvm-diff/lib/DifferenceEngine.h index 436a35566360f..01fd0d9540dc2 100644 --- a/llvm/tools/llvm-diff/lib/DifferenceEngine.h +++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.h @@ -17,7 +17,6 @@ #include "DiffConsumer.h" #include "DiffLog.h" #include "llvm/ADT/StringRef.h" -#include <utility> namespace llvm { class Function; @@ -54,7 +53,7 @@ namespace llvm { virtual bool operator()(const Value *L, const Value *R) = 0; protected: - virtual ~Oracle() {} + virtual ~Oracle() = default; }; DifferenceEngine(Consumer &consumer) diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp index 2126b91f75ae1..45b8ed91ce52c 100644 --- a/llvm/tools/llvm-diff/llvm-diff.cpp +++ b/llvm/tools/llvm-diff/llvm-diff.cpp @@ -20,11 +20,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" #include <string> -#include <utility> - using namespace llvm; diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index 35c540963a487..90ae3ef077ae9 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -101,13 +101,26 @@ static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) { } } class CommentWriter : public AssemblyAnnotationWriter { +private: + bool canSafelyAccessUses(const Value &V) { + // Can't safely access uses, if module not materialized. 
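+    // A lazily loaded GlobalValue has an incomplete use list until its module
+    // is materialized, so any use count printed for it would be misleading.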
+ const GlobalValue *GV = dyn_cast<GlobalValue>(&V); + return !GV || (GV->getParent() && GV->getParent()->isMaterialized()); + } + public: void emitFunctionAnnot(const Function *F, formatted_raw_ostream &OS) override { + if (!canSafelyAccessUses(*F)) + return; + OS << "; [#uses=" << F->getNumUses() << ']'; // Output # uses OS << '\n'; } void printInfoComment(const Value &V, formatted_raw_ostream &OS) override { + if (!canSafelyAccessUses(V)) + return; + bool Padded = false; if (!V.getType()->isVoidTy()) { OS.PadToColumn(50); diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt index aeb1b8f14d830..7a0adf32e938c 100644 --- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt +++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + BinaryFormat DebugInfoDWARF DebugInfoDWARFLowLevel AllTargetsDescs diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp index 2c13dd514a744..0e73adab15d86 100644 --- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp @@ -112,7 +112,7 @@ namespace { // Use X19 as the loop counter register since it's a callee-saved register // that's available for temporary use. 
-constexpr const MCPhysReg kDefaultLoopCounterReg = AArch64::X19; +constexpr MCPhysReg kDefaultLoopCounterReg = AArch64::X19; class ExegesisAArch64Target : public ExegesisTarget { public: diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp index fb843285ada2a..f3bf9690d2a6e 100644 --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -446,7 +446,7 @@ void Analysis::printClusterRawHtml(const BenchmarkClustering::ClusterId &Id, } // namespace exegesis -static constexpr const char kHtmlHead[] = R"( +static constexpr char kHtmlHead[] = R"( <head> <title>llvm-exegesis Analysis Results
Number | Section | Status | Issue title | Available in Clang?
1 | [dcl.fct.default] | TC1 | What if two using-declarations refer to the same function but the declarations introduce different default-arguments? | No
2 | [temp.dep.res] | drafting | How can dependent names be used in member declarations that appear outside of the class template definition? | Not resolved
3 | [temp.expl.spec] | NAD | The template compilation model rules render some explicit specialization declarations not visible during instantiation | Clang 2.7
4 | [dcl.link] | CD1 | Does extern "C" affect the linkage of function names with internal linkage? | Clang 2.8
5 | [dcl.init] | CD1 | CV-qualifiers and type conversions | Clang 3.1
6 | [class.copy.elision] | NAD | Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? | Unknown Yes
7 | [class.access.base] | NAD | Can a class with a private virtual base class be derived from? | Clang 3.4
8 | [class.access] | CD1 | Access to template arguments used in a function return type and in the nested name specifier | Duplicate of 45
9 | [class.access.base] | CD1 | Clarification of access to base class members | Clang 2.8
10 | [class.access.nest] | CD1 | Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? | Duplicate of 45
11 | [namespace.udecl] | CD1 | How do the keywords typename/template interact with using-declarations? | Clang 2.7
12 | [basic.lookup.argdep] | dup | Default arguments on different declarations for the same function and the Koenig lookup | Superseded by 239
13 | [dcl.link] | NAD | extern "C" for Parameters of Function Templates | No
14 | [dcl.link] | NAD | extern "C" functions and declarations in different namespaces | Clang 3.4
15 | [dcl.fct.default] | dup | Default arguments for parameters of function templates | Clang 2.7
16 | [class.access.base] | CD1 | Access to members of indirect private base classes | Clang 2.8
17 | [class.access.base] | NAD | Footnote 99 should discuss the naming class when describing members that can be accessed from friends | Clang 2.7
18 | [dcl.fct] | NAD | f(TYPE) where TYPE is void should be allowed | Superseded by 577
19 | [class.protected] | NAD | Clarify protected member access | Clang 3.1
20 | [class.copy.ctor] | TC1 | Some clarifications needed for 12.8 para 15 | Clang 2.8
21 | [temp.param] | TC1 | Can a default argument for a template parameter appear in a friend declaration? | Clang 3.4
22 | [temp.dep.res] | TC1 | Template parameter with a default argument that refers to itself | Superseded by 481
23 | [temp.func.order] | NAD | Some questions regarding partial ordering of function templates | Clang 2.7
24 | [temp.expl.spec] | TC1 | Errors in examples in 14.7.3 | N/A
25 | [except.spec] | TC1 | Exception specifications and pointers to members | Clang 4
26 | [class.copy.ctor] | NAD | Copy constructors and default arguments | Clang 2.7
27 | [over.built] | NAD | Overload ambiguities for builtin ?: prototypes | Clang 2.7
28 | [basic.start.dynamic] | CD1 | 'exit', 'signal' and static object destruction | N/A (Library DR)
29 | [dcl.link] | CD1 | Linkage of locally declared functions | Clang 3.4
30 | [temp.names] | TC1 | Valid uses of "::template" | Superseded by 468 (C++11 onwards)
31 | [expr.new] | NAD | Looking up new/delete | Clang 2.8
32 | [temp] | TC1 | Clarification of explicit instantiation of non-exported templates | N/A
33 | [basic.lookup.argdep] | TC1 | Argument dependent lookup and overloaded functions | Clang 9
34 | [temp.inst] | NAD | Argument dependent lookup and points of instantiation | N/A
35 | [dcl.init] | TC1 | Definition of default-initialization | Duplicate of 178
36 | [namespace.udecl] | CD6 | using-declarations in multiple-declaration contexts | Clang 2.8
37 | [except.uncaught] | NAD | When is uncaught_exception() true? | Superseded by 475
38 | [temp.names] | TC1 | Explicit template arguments and operator functions | Clang 2.7
39 | [class.member.lookup] | CD1 | Conflicting ambiguity rules | No
40 | [dcl.meaning] | TC1 | Syntax of declarator-id | N/A
41 | [basic.lookup.unqual] | TC1 | Clarification of lookup of names after declarator-id | Clang 2.7
42 | [basic.scope.class] | NAD | Redefining names from base classes | Clang 2.7
43 | [basic.types] | TC1 | Copying base classes (PODs) using memcpy | N/A
44 | [temp.expl.spec] | CD1 | Member specializations | Superseded by 727
45 | [class.access.nest] | CD1 | Access to nested classes | Clang 2.7
46 | [temp.explicit] | NAD | Explicit instantiation of member templates | Clang 2.7
47 | [temp.friend] | NAD | Template friend issues | Superseded by 329
48 | [class.static.data] | TC1 | Definitions of unused static members | Clang 2.7
    49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
    50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
    51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
    52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
    53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
    54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
    55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
    56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
    57[class.union] open Empty unions Not resolved
    58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
    59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
    60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
    61[over.over] NAD Address of static member function "&p->f" Clang 3.4
    62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
    63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
    64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
    65[dcl.fct.default] TC1 Typo in default argument example N/A
    66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
    67[class.static] TC1 Evaluation of left side of object-expression N/A
    68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
    69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
    70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
    71[expr] NAD Incorrect cross reference N/A
    72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
    73[expr.eq] TC1 Pointer equality Superseded by 1652
    74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
    75[class.mem] TC1 In-class initialized members must be const Clang 2.7
    76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
    77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
    78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
    79[new.delete.placement] dup Alignment and placement new N/A
    80[class.mem] TC1 Class members with same name as class Clang 2.9
    81[diff] NAD Null pointers and C compatibility N/A
    82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
    83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
    84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
    85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
    86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
    87[except.spec] CD1 Exception specifications on function parameters No
    88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
    89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
    90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
    91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
    92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
    93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
    94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
    95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
    96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
    97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
    98[except] TC1 Branching into try block Clang 2.7
    99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
    100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
    101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
    102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
    103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition? N/A
    104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
    105[temp] TC1 Meaning of "template function" N/A
    106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
    107[dcl.link] NAD Linkage of operator functions Clang 2.7
    108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
    109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
    110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
    111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
    112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
    113[expr.call] CD1 Visibility of called function Clang 2.7
    114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
    115[over.over] CD1 Address of template-id Clang 3.0
    116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
    117[class.temporary] NAD Timing of destruction of temporaries N/A
    118[expr.call] CD1 Calls via pointers to virtual member functions Yes
    119[basic.life] CD1 Object lifetime and aggregate initialization N/A
    120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
    121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
    122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
    123[expr.prim.general] TC1 Bad cross-reference N/A
    124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
    125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
    126[except.spec] TC1 Exception specifications and const Partial
    127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
    128[expr.static.cast] TC1 Casting between enum types Clang 2.7
    129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
    130[expr.new] NAD Sequence points and new-expressions N/A
    131[extendid] TC1 Typo in Lao characters Superseded by P1949
    132[basic.link] NAD Local types and linkage No
    133[except.spec] dup Exception specifications and checking Duplicate of 87
    134[temp] TC1 Template classes and declarator-ids N/A
    135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
    136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
    137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
    138[namespace.memdef] CD6 Friend declaration name lookup Partial
    139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
    140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
    141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
    142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
    143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
    144[dcl.type.elab] open Position of friend specifier Not resolved
    145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
    146[basic.fundamental] open Floating-point zero Not resolved
    147[expr.prim.general] TC1 Naming the constructor Clang 2.7
    148[class] TC1 POD classes and pointers to members Clang 2.7
    149[conv.ptr] TC1 Accessibility and ambiguity N/A
    150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
    151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
    152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
    153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
    154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
    155[dcl.init] dup Brace initializer for scalar Duplicate of 632
    156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
    157[dcl.pre] open Omitted typedef declarator Not resolved
    158[basic.lval] CD1 Aliasing and qualification conversions Yes
    159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
    160[dcl.ambig.res] CD1 Missing std:: qualification N/A
    161[class.protected] TC1 Access to protected nested type Clang 3.1
    162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
    163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
    164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
    165[namespace.memdef] NAD Definitions of friends and block-scope externs No
    166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
    167[depr.static] NAD Deprecating static functions Superseded by 1012
    168[dcl.link] NAD C linkage for static member functions No
    169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
    170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
    171[basic.namespace] TC1 Global namespace scope Clang 3.4
    172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
    173[lex.charset] TC1 Constraints on execution character set Clang 2.7
    174[depr.static] NAD Undeprecating global static Superseded by 1012
    175[class] CD1 Class name injection and base name access Clang 2.8
    176[class] TC1 Name injection and templates Clang 3.1
    177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
    178[dcl.init] TC1 More on value-initialization Clang 3.1
    179[expr.add] TC1 Function pointers and subtraction Clang 2.7
    180[temp.res] CD1 typename and elaborated types Clang 2.8
    181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
    182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
    183[temp.res] TC1 typename in explicit specializations Superseded by 382
    184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
    185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
    186[temp.local] open Name hiding and template template-parameters Not resolved
    187[temp.param] TC1 Scope of template parameter names Superseded by 481
    188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
    189[lex.operators] open Definition of operator and punctuator Not resolved
    190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
    191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
    192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
    193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
    194[class.ctor] TC1 Identifying constructors Clang 2.7
    195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
    196[expr.delete] open Arguments to deallocation functions Not resolved
    197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
    198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
    199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
    200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
    201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
    202[over.over] TC1 Use of overloaded function name Clang 3.1
    203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
    204[temp] CD1 Exported class templates Superseded by 820
    205[temp] drafting Templates and static data members Not resolved
    206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
    207[class.access.base] CD1 using-declarations and protected access Clang 2.7
    208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
    209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
    210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
    211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
    212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
    213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
    214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
    215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
    216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
    217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
    218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
    219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
    220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
    221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
    222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
    223[depr] CD3 The meaning of deprecation N/A
    224[temp.dep.type] CD1 Definition of dependent names Clang 16
    225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
    226[temp.param] CD1 Default template arguments for function templates No
    227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
    228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
    229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
    230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
    231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
    232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
    233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
    234[basic.life] NAD Reuse of base class subobjects N/A
    235[class.base.init] TC1 Assignment vs initialization N/A
    236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
    237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
    238[expr] CD4 Precision and accuracy constraints on floating point Unknown
    239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
    240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
    241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
    242[expr.cast] CD4 Interpretation of old-style casts Unknown
    243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
    244[class.dtor] CD1 Destructor lookup Clang 11
    245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
    246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
    247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
    248[extendid] C++11 Identifier characters Superseded by P1949
    249[temp.mem.func] TC1 What is a member function template? Clang 2.7
    250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
    251[basic.fundamental] open How many signed integer types are there? Not resolved
    252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
    253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
    254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
    255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
    256[expr.new] CD1 Overflow in size calculations Duplicate of 624
    257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
    258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
    259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
    260[over.built] open User-defined conversions and built-in operator= Not resolved
    261[basic.def.odr] CD1 When is a deallocation function "used?" No
    262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
    263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
    264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
    265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
    266[gram] NAD No grammar sentence symbol N/A
    267[expr.new] open Alignment requirement for new-expressions Not resolved
    268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
    269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
    270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
    271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
    272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
    273[class] CD1 POD classes and operator&() Clang 2.7
    274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
    275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
    276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
    277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
    278[basic.link] NAD External linkage and nameless entities Unknown
    279[basic.link] CD6 Correspondence of "names for linkage purposes" No
    280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
    281[dcl.fct.spec] CD1 inline specifier in friend declarations No
    282[expr.typeid] open Namespace for extended_type_info Not resolved
    283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
    284[class] CD1 qualified-ids in class declarations No
    285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
    286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
    287[temp.point] drafting Order dependencies in template instantiation Not resolved
    288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
    289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
    290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
    291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
    292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
    293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
    294[expr.static.cast] NAD Can static_cast drop exception specifications? No
    295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
    296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
    297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
    298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
    299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
    300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
    301[temp.names] CD1 Syntax for template-name Clang 3.5
    302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
    303[conv.prom] NAD Integral promotions on bit-fields N/A
    304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
    305[basic.lookup.classref] CD1 Name lookup in destructor call No
    306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
    307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
    308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
    309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
    310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
    311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
    312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
    313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
    314[temp.names] C++17 template in base class specifier No
    315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
    316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
    317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
    318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
    319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
    320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
    321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
    322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
    323[temp] CD1 Where must export appear? Superseded by 820
    324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
    325[dcl.fct.default] open When are default arguments parsed? Not resolved
    326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
    327[class] CD1 Use of "structure" without definition Duplicate of 538
    328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
    329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
    330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
    331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
    332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
    333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
    334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
    335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
    336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
    337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
    338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
    339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
    340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
    341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
    342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
    343[temp.names] C++17 Make template optional in contexts that require a type No
    344[class.dtor] CD3 Naming destructors Duplicate of 1435
    345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
    346[except.spec] NAD Typo in 15.4 N/A
    347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
    348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
    349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
    350[basic.types] open signed char underlying representation for objects Not resolved
    351[expr] CD1 Sequence point error: unspecified or undefined? N/A
    352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
    353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
    354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
    355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
    356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
    357[intro.defs] CD1 Definition of signature should include name Clang 2.7
    358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
    359[class.union] NAD Type definition in anonymous union Clang 3.3
    360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
    361[dcl.fct.default] open Forward reference to default argument Not resolved
    362[lex.phases] CD1 Order of initialization in instantiation units N/A
    363[class.expl.init] NAD Initialization of class from self N/A
    364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
    365[basic.stc] open Storage duration and temporaries Not resolved
    366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
    367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
    368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
    369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
    370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
    371[basic.start.static] open Interleaving of constructor calls Not resolved
    372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
    373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
    374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
    375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
    376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
    377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
    378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
    379[class] CD1 Change "class declaration" to "class definition" N/A
    380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
    381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
    382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
    383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
    384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
    385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
    386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
    387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
    388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
    389[basic.link] CD1 Unnamed types in entities with linkage No
    390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
    391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
    392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
    393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
    394[cpp.pre] CD1 identifier-list is never defined N/A
    395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
    396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
    397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
    398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
    399[class.dtor] CD6 Destructor lookup redux Clang 11
    400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
    401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
    402[temp.func.order] open More on partial ordering of function templates Not resolved
    403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
    404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
    405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
    406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
    407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
    408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
    409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
    410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
    411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
    412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
    413[class] CD1 Definition of "empty class" Clang 2.7
    414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
    415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
    416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
    417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
    418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
    419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
    420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
    421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
    422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
    423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
    424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
    425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
    426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
    427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
    428[except.throw] CD1 Mention of expression with reference type Clang 2.7
    429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
    430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
    431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
    432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
    433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
    434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
    435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
    436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
    437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
    438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
    439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
    440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
    441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
    442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
    443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
    444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
    445[class.friend] NAD Wording issue on friend declarations Clang 3.2
    446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
    447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
    448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
    449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
    450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
    451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
    452[class.this] CD1 Wording nit on description of this Clang 2.7
    453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
    454[class.static.data] CD1 When is a definition of a static data member required? Unknown
    455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
    456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
    457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
    458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
    459[temp.local] NAD Hiding of template parameters by base class members Unknown
    460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
    461[dcl.asm] NAD Make asm conditionally-supported N/A
    462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
    463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
    464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
    465[basic.start.static] NAD May constructors of global objects call exit()? N/A
    466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
    467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
    468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
    469[temp.deduct.type] NAD Const template specializations and reference arguments No
    470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
    471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
    472[class.protected] open Casting across protected inheritance Not resolved
    473[expr.new] NAD Block-scope declarations of allocator functions Unknown
    474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
    475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
    476[expr.new] CD5 Determining the buffer size for placement new Unknown
    477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
    478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
    479[except.throw] CD1 Copy elision in exception handling Clang 2.8
    480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
    481[basic.scope] CD2 Scope of template parameters Clang 2.8
    482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
    483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
    484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
    485[basic.pre] CD1 What is a “name”? Clang 2.7
    486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
    487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
    488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
    489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
    490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
    491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
    492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
    493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
    494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
    495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
    496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
    497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
    498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
    499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
    500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
    501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
    502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
    503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
    504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
    505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
    506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
    507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
    508[dcl.init] C++11 Non-constructed value-initialized objects N/A
    509[dcl.init] CD1 Dead code in the specification of default initialization N/A
    510[class.init] CD1 Default initialization of POD classes? N/A
    511[class.prop] NAD POD-structs with template assignment operators Unknown
    512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
    513[intro.object] CD1 Non-class “most-derived” objects N/A
    514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
    515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
    516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
    517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
    518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
    519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
    520[expr.cast] CD1 Old-style casts between incomplete class types N/A
    521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
    522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
    523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
    524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
    525[temp.inst] CD1 Missing * in example Clang 2.7
    526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
    527[basic.link] CD2 Problems with linkage of types N/A
    528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
    529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
    530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
    531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
    532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
    533[cpp.include] NAD Special treatment for C-style header names N/A
    534[temp] CD1 template-names and operator-function-ids Clang 2.9
    535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
    536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
    537[intro.defs] CD1 Definition of “signature” N/A
    538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
    539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
    540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
    541[temp.dep.expr] CD2 Dependent function types Clang 2.7
    542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
    543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
    544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
    545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
    546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
    547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
    548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
    549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
    550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
    551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
    552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
    553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
    554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
    555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
    556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
    557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
    558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
    559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
    560[temp.res] NAD Use of the typename keyword in return types Clang 16
    561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
    562[class.qual] CD6 qualified-ids in non-expression contexts N/A
    563[dcl.link] CD6 Linkage specification for objects Clang 3.3
    564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
    565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
    566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
    567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
    568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
    569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
    570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
    571[basic.link] CD2 References declared const Clang 2.7
    572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
    573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
    574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
    575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
    576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
    577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
    578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
    579[temp.names] open What is a “nested” > or >>? Not resolved
    580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
    581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
    582[temp.mem] CD1 Template conversion functions N/A
    583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
    584[basic.lval] NAD Unions and aliasing N/A
    585[class.friend] NAD Friend template template parameters Clang 3.0
    586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
    587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
    588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
    589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
    590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
    591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
    592[except.ctor] CD1 Exceptions during construction of local static objects N/A
    593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
    594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
    595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
    596[except.unexpected] NAD Replacing an exception object Unknown
    597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
    598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
    599[expr.delete] CD2 Deleting a null function pointer Partial
    600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
    601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
    602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
    603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
    604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
    605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
    606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
    607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
    608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
    609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
    610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
    611[dcl.init] CD2 Zero-initializing references Clang 2.7
    612[intro.execution] CD2 Requirements on a conforming implementation N/A
    613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
    614[expr.mul] CD1 Results of integer / and % Clang 2.7
    615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
    616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
    617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
    618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
    619[basic.types] C++11 Completeness of array types Clang 3.4
    620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
    621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
    622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
    623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
    624[expr.new] CD1 Overflow in calculating size of allocation Unknown
    625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
    626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
    627[basic.fundamental] NAD Values behaving as types Clang 2.7
    628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
    629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
    630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
    631[stmt.if] CD3 Jumping into a “then” clause N/A
    632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
    633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
    634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
    635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
    636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
    637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
    638[temp.friend] CD2 Explicit specialization and friendship No
    639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
    640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
    641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
    642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
    643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
    644[basic.types] CD1 Should a trivial class type be a literal type? Partial
    645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
    646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
    647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
    648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
    649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
    650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
    651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
    652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
    653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
    654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
    655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
    656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
    657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
    658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
    659[expr.alignof] CD1 Alignment of function types Clang 3.0
    660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
    661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
    662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
    663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
    664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
    665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
    666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
    667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
    668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
    669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
    670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
    671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
    672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
    673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
    674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
    675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
    676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
    677[class.dtor] CD1 Deleted operator delete and virtual destructors No
    678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
    679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
    680[class.copy.ctor] CD2 What is a move constructor? N/A
    681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
    682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
    683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
    684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
    685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
    686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
    687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
    688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
    689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
    690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
    691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
    692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
    693[conv.array] CD2 New string types and deprecated conversion Unknown
    694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
    695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
    696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
    697[temp.deduct] open Deduction rules apply to more than functions Not resolved
    698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
    699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
    700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
    701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
    702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
    703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
    704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
    705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
    706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
    707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
    708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
    709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
    710[class.cdtor] CD2 Data races during construction Unknown
    711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
    712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
    713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
    714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
    715[expr.const] CD2 Class member access constant expressions Unknown
    716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
    717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
    718[class.friend] NAD Non-class, non-function friend declarations Unknown
    719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
    720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
    721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
    722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
    726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
    727[temp.expl.spec] C++17 In-class explicit specializations Partial
    728[temp] NAD Restrictions on local classes Unknown
    729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
    730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
    731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
    732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
    733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
    734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
    735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
    736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
    737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
    738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
    739[class.bit] CD3 Signedness of plain bit-fields Unknown
    740[intro.multithread] CD2 Incorrect note on data races Unknown
    741[class.bit] C++11 “plain” long long bit-fields Unknown
    742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
    743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
    744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
    745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
    746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
    747[class.access.base] dup Access of protected base classes Unknown
    749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
    750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
    751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
    752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
    753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
    754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
    755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
    756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
    757[basic.link] CD2 Types without linkage in declarations Unknown
    758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
    759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
    760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
    761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
    762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
    763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
    764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
    765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
    766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
    767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
    768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
    769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
    770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
    771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
    772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
    773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
    774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
    775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
    776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
    777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
    778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
    779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
    782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
    783[intro.defs] open Definition of “argument” Not resolved
    784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
    785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
    786[intro.multithread] CD2 Definition of “thread” Unknown
    787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
    788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
    789[lex.trigraph] CD2 Deprecating trigraphs Unknown
    790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
    792[basic.start.main] CD2 Effects of std::quick_exit Unknown
    793[basic.life] CD2 Use of class members during destruction Unknown
    794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
    795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
    796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
    797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
    798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
    799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
    800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
    801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
    803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
    804[expr.new] CD2 Deducing the type in new auto(x) Unknown
    805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
    806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
    807[expr.const] NAD typeid expressions in constant expressions Unknown
    808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
    809[dcl.stc] CD2 Deprecation of the register keyword Unknown
    810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
    811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
    812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
    813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
    814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
    815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
    816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
    817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
    818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
    819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
    820[temp] CD2 Deprecation of export Clang 2.7
    822[temp] NAD Additional contexts for template aliases Unknown
    823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
    828[except.throw] CD2 Destruction of exception objects Unknown
    829[except.spec] NAD At what point is std::unexpected called? Unknown
    830[except.spec] CD2 Deprecating exception specifications Unknown
    831[implimits] CD2 Limit on recursively nested template instantiations Unknown
    832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
    833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
    834[lex.string] CD2 What is an “ordinary string literal”? Unknown
    835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
    836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
    837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
    838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
    839[expr.sizeof] dup sizeof with opaque enumerations Unknown
    840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
    842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
    845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
    846[basic.lval] CD2 Rvalue references to functions Unknown
    847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
    850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
    852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
    853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
    854[expr.shift] CD2 Left shift and unsigned extended types Unknown
    855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
    858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
    860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
    861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
    862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
    863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
    864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
    865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
    869[dcl.init] CD2 Uninitialized thread_local objects Unknown
    872[lex.string] CD2 Lexical issues with raw strings Unknown
    873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
    874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
    876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
    877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
    879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
    880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
    882[basic.start.main] CD2 Defining main as deleted Clang 3.5
    883[basic.types] CD2 std::memcpy vs std::memmove Unknown
    884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
    885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
    886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
    887[class.copy.ctor] CD2 Move construction of thrown object Unknown
    888[class.base.init] CD2 Union member initializers Unknown
    891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
    892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
    893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
    896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
    897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
    898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
    899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
    900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
    901[expr.new] open Deleted operator delete Not resolved
    902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
    903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
    904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
    905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
    906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
    908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
    909[expr.cast] NAD Old-style casts with conversion functions Unknown
    910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
    912[lex.ccon] CD3 Character literals and universal-character-names Unknown
    913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
    914[expr.type.conv] open Value-initialization of array types Not resolved
    915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
    919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
    920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
    921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
    922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
    923[temp.expl.spec] CD2 Inline explicit specializations Unknown
    924[class.mem] C++11 alias-declaration as a class member Unknown
    925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
    926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
    927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
    928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
    929[temp.alias] CD2 What is a template alias? Unknown
    930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
    931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
    932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
    933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
    934[dcl.init.list] CD2 List-initialization of references Unknown
    935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
    936[dcl.init.string] CD2 Array initialization with new string literals Unknown
    937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
    938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
    939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
    940[dcl.stc] CD2 Global anonymous unions Unknown
    941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
    942[basic.pre] CD2 Is this an entity? Unknown
    943[expr.type.conv] CD5 Is T() a temporary? Unknown
    944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
    945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
    946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
    947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
    948[stmt.select] C++11 constexpr in conditions Clang 3.7
    949[intro.compliance] open Requirements for freestanding implementations Not resolved
    950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
    951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
    952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
    953[over.ics.ref] CD2 Rvalue references and function viability Unknown
    954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
    955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
    956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
    957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
    958[expr.prim.lambda] NAD Lambdas and decltype Unknown
    959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
    960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
    961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
    962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
    963[expr.rel] CD2 Comparing nullptr with 0 Unknown
    964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
    965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
    966[basic.link] CD2 Nested types without linkage Unknown
    967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
    968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
    969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
    970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
    971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
    972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
    973[except.spec] CD2 Function types in exception-specifications Unknown
    974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
    975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
    976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
    977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
    978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
    979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
    980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
    981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
    982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
    983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
    984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
    985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
    986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
    987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
    988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
    989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
    990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
    991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
    992[class.copy.ctor] NAD Inheriting explicitness Unknown
    993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
    994[dcl.fct] C++11 braced-init-list as a default argument Unknown
    995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
    996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
    997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
    998[dcl.fct] dup Function parameter transformations and template functions Unknown
    999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
    1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
    1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
    1002[temp.variadic] NAD Pack expansion for function arguments Unknown
    1003[basic.start.main] CD3 Acceptable definitions of main Unknown
    1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
    1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
    1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
    1007[class.protected] NAD Protected access and pointers to members Unknown
    1008[expr.alignof] NAD Querying the alignment of an object Unknown
    1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
    1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
    1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
    1012[namespace.unnamed] C++11 Undeprecating static Unknown
    1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
    1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
    1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
    1016[over] C++11 Overloadable declarations, function templates, and references Unknown
    1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
    1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
    1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
    1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
    1021[namespace.memdef] CD4 Definitions of namespace members Unknown
    1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
    1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
    1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
    1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
    1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
    1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
    1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
    1029[class.dtor] C++11 Type of a destructor call Unknown
    1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
    1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
    1032[temp.variadic] C++11 Empty pack expansions Unknown
    1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
    1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
    1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
    1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
    1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
    1038[over.over] CD7 Overload resolution of &x.static_func Unknown
    1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
    1040[intro.multithread] NAD Memory model issues Unknown
    1041[class.mem] dup alias-declarations as class members Unknown
    1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
    1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
    1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
    1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
    1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
    1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
    1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
    1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
    1050[basic.life] NAD Effects of thread support on object lifetime Unknown
    1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
    1052[class.copy.ctor] dup const non-static data member and PODness Unknown
    1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
    1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
    1055[basic.fundamental] C++11 Permissible uses of void Unknown
    1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
    1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
    1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
    1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
    1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
    1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
    1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
    1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
    1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
    1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
    1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
    1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
    1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
    1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
    1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
    1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
    1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
    1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
    1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
    1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
    1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
    1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
    1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
    1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
    1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
    1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
    1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
    1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
    1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
    1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
    1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
    1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
    1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
    1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
    1090[basic.align] C++11 Alignment of subobjects Unknown
    1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
    1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
    1093[dcl.init] CD3 Value-initializing non-objects Unknown
    1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
    1095[dcl.init.list] C++11 List-initialization of references Unknown
    1096[temp] C++11 Missing requirement for template definitions Unknown
    1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
    1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
    1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
    1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
    1101[class.static.data] C++11 Non-integral initialized static data members Unknown
    1102[intro.execution] C++11 Better example of undefined behavior Unknown
    1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
    1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
    1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
    1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
    1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
    1108[lex.ext] NAD User-defined literals have not been implemented Unknown
    1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
    1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
    1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
    1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
    1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
    1114[basic.life] C++11 Incorrect use of placement new in example Unknown
    1115[basic.align] C++11 C-compatible alignment specification Unknown
    1116[basic.life] CD4 Aliasing of union members Unknown
    1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
    1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
    1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
    1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
    1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
    1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
    1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
    1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
    1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
    1126[expr.const] C++11 constexpr functions in const initializers Unknown
    1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
    1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
    1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
    1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
    1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
    1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
    1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
    1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
    1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
    1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
    1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
    1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
    1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
    1140[class] C++11 Incorrect redefinition of POD class Unknown
    1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
    1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
    1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
    1144[class.access.dcl] C++11 Remove access declarations Unknown
    1145[class.ctor] C++11 Defaulting and triviality Unknown
    1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
    1147[class.dtor] C++11 Destructors should be default nothrow Unknown
    1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
    1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
    1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
    1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
    1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
    1153[over.over] C++11 Type matching in address of overloaded function Unknown
    1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
    1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
    1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
    1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
    1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
    1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
    1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
    1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
    1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
    1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
    1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
    1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
    1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
    1167[except.spec] C++11 function-try-blocks for destructors Unknown
    1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
    1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
    1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
    1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
    1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
    1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
    1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
    1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
    1176[intro.multithread] C++11 Definition of release sequence Unknown
    1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
    1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
    1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
    1180[basic.align] C++11 Over-aligned class types Unknown
    1181[basic.types] C++11 What is a “built-in type?” Unknown
    1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
    1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
    1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
    1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
    1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
    1187[basic.start.static] C++11 Problems in initialization example Unknown
    1188[expr.const] C++11 Type punning in constant expressions Unknown
    1189[intro.object] C++11 Address of distinct base class subobjects Unknown
    1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
    1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
    1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
    1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
    1194[dcl.constexpr] C++11 Constexpr references Unknown
    1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
    1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
    1197[expr.const] C++11 Constexpr arrays Unknown
    1198[basic.types] C++11 Literal types and copy constructors Unknown
    1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
    1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
    1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
    1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
    1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
    1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
    1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
    1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
    1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
    1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
    1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
    1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
    1211[basic.align] open Misaligned lvalues Not resolved
    1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
    1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
    1214[dcl.init] C++11 Kinds of initializers Unknown
    1215[class] C++11 Definition of POD struct Unknown
    1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
    1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
    1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
    1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
    1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
    1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
    1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
    1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
    1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
    1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
    1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
    1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
    1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
    1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
    1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
    1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
    1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
    1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
    1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
    1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
    1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
    1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
    1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
    1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
    1240[dcl.name] C++11 constexpr defaulted constructors Unknown
    1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
    1242[class.base.init] C++11 Initializing variant class members Unknown
    1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
    1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
    1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
    1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
    1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
    1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
    1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
    1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
    1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
    1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
    1253[temp.spec] C++17 Generic non-template members Unknown
    1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
    1255[expr.const] drafting Definition problems with constexpr functions Not resolved
    1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
    1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
    1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
    1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
    1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
    1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
    1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
    1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
    1264[expr.const] CD3 Use of this in constexpr constructor Unknown
    1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
    1266[lex.ext] open user-defined-integer-literal overflow Not resolved
    1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
    1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
    1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
    1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
    1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
    1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
    1273[temp.deduct] NAD Accessibility and function signatures Unknown
    1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
    1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
    1276[basic.fundamental] NAD Reference to stdint.h Unknown
    1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
    1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
    1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
    1280[basic.life] NAD Object reallocation and reference members Unknown
    1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
    1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
    1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
    1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
    1285[basic.life] NAD Trivial destructors and object lifetime Unknown
    1286[temp.alias] open Equivalence of alias templates Not resolved
    1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
    1288[dcl.init.list] CD3 Reference list initialization Unknown
    1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
    1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
    1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
    1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
    1293[expr.const] CD3 String literals in constant expressions Unknown
    1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
    1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
    1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
    1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
    1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
    1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
    1300[expr.type.conv] dup T() for array types Unknown
    1301[dcl.init] CD3 Value initialization of union Unknown
    1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
    1303[temp] NAD C language linkage for template with internal linkage Unknown
    1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
    1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
    1306[class.this] CD3 Modifying an object within a const member function Unknown
    1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
    1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
    1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
    1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
    1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
    1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
    1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
    1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
    1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
    1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
    1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
    1318[class] CD3 Syntactic ambiguities with final Unknown
    1319[temp.param] NAD Error in pack expansion example Unknown
    1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
    1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
    1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
    1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
    1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
    1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
    1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
    1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
    1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
    1329[implimits] CD3 Recursive deduction substitutions Unknown
    1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
    1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
    1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
    1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
    1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
    1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
    1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
    1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
    1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
    1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
    1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
    1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
    1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
    1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
    1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
    1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
    1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
    1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
    1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
    1349[temp.alias] dup Consistency of alias template redeclarations Unknown
    1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
    1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
    1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
    1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
    1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
    1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
    1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
    1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
    1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
    1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
    1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
    1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
    1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
    1363[class] CD3 Triviality vs multiple default constructors Unknown
    1364[expr.const] CD3 constexpr function parameters Unknown
    1365[expr.const] CD3 Calling undefined constexpr functions Unknown
    1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
    1367[expr.const] CD3 Use of this in a constant expression Unknown
    1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
    1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
    1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
    1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
    1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
    1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
    1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
    1375[class.union] CD3 Reference to anonymous union? Unknown
    1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
    1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
    1378[temp.inst] CD5 When is an instantiation required? Unknown
    1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
    1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
    1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
    1382[dcl.decl] CD3 Dead code for constructor names Unknown
    1383[expr] CD3 Clarifying discarded-value expressions Unknown
    1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
    1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
    1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
    1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
    1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
    1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
    1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
    1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
    1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
    1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
    1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
    1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
    1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
    1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
    1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
    1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
    1400[expr.eq] NAD Function pointer equality Unknown
    1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
    1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
    1403[lex.comment] CD6 Universal-character-names in comments Unknown
    1404[class.union] open Object reallocation in unions Not resolved
    1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
    1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
    1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
    1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
    1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
    1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
    1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
    1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
    1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
    1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
    1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
    1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
    1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
    1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
    1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
    1420[class.abstract] NAD Abstract final classes Unknown
    1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
    1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
    1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
    1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
    1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
    1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
    1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
    1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
    1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
    1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
    1431[except] CD3 Exceptions from other than throw-expressions Unknown
    1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
    1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
    1434[dcl.init] NAD Parenthesized braced-init-list Unknown
    1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
    1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
    1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
    1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
    1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
    1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
    1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
    1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
    1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
    1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
    1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
    1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
    1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
    1448[basic.fundamental] NAD Integral values of type bool Unknown
    1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
    1450[expr.mul] CD3 INT_MIN % -1 Unknown
    1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
    1452[expr.const] NAD Value-initialized objects may be constants Unknown
    1453[basic.types] CD3 Volatile members in literal classes? Unknown
    1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
    1455[expr.const] CD3 Lvalue converted constant expressions Unknown
    1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
    1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
    1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
    1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
    1460[class.union] C++14 What is an empty union? Clang 3.5
    1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
    1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
    1463[temp.pre] drafting extern "C" alias templates Not resolved
    1464[expr.new] CD3 Negative array bound in a new-expression Unknown
    1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
    1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
    1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
    1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
    1469[expr.new] CD5 Omitted bound in array new-expression Unknown
    1470[intro.multithread] NAD Thread migration Unknown
    1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
    1472[basic.def.odr] CD3 odr-use of reference variables Unknown
    1473[over.literal] CD3 Syntax of literal-operator-id Unknown
    1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
    1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
    1476[intro.defs] CD3 Definition of user-defined type Unknown
    1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
    1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
    1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
    1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
    1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
    1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
    1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
    1484[temp.inst] CD4 Unused local classes of function templates Unknown
    1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
    1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
    1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
    1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
    1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
    1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
    1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
    1492[class.dtor] CD4 Exception specifications on template destructors Unknown
    1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
    1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
    1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
    1496[class.name] CD4 Triviality with deleted and missing default constructors No
    1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
    1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
    1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
    1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
    1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
    1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
    1503[except.throw] CD3 Exceptions during copy to exception object Unknown
    1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
    1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
    1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
    1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
    1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
    1509[intro.defs] C++14 Definition of “non-template function” Unknown
    1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
    1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
    1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
    1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
    1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
    1515[basic.fundamental] CD3 Modulo 2^n arithmetic for implicitly-unsigned types Unknown
    1516[expr.call] CD3 Definition of “virtual function call” Unknown
    1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
    1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
    1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
    1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
    1521[expr.type.conv] dup T{expr} with reference types Unknown
    1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
    1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
    1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
    1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
    1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
    1527[expr.assign] CD3 Assignment from braced-init-list Unknown
    1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
    1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
    1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
    1531[intro.defs] CD3 Definition of “access” (verb) Unknown
    1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
    1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
    1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
    1535[expr.const] CD3 typeid in core constant expressions Unknown
    1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
    1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
    1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
    1539[basic.fundamental] CD3 Definition of “character type” Unknown
    1540[expr.const] NAD Use of address constants in constant expressions Unknown
    1541[stmt.return] CD3 cv void return types Unknown
    1542[expr.assign] open Compound assignment of braced-init-list Not resolved
    1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
    1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
    1545[temp.friend] NAD friend function templates defined in class templates Unknown
    1546[temp.deduct] NAD Errors in function template default arguments Unknown
    1547[temp.res] NAD typename keyword in alias-declarations Unknown
    1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
    1549[over.binary] open Overloaded comma operator with void operand Not resolved
    1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
    1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
    1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
    1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
    1554[temp.alias] drafting Access and alias templates Not resolved
    1555[expr.call] NAD Language linkage and function type compatibility Unknown
    1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
    1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
    1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
    1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
    1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
    1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
    1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
    1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
    1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
    1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
    1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
    1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
    1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
    1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
    1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
    1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
    1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
    1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
    1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
    1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
    1576[expr] C++14 Discarded-value volatile xvalues Unknown
    1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
    1578[dcl.init] NAD Value-initialization of aggregates Unknown
    1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
    1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
    1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
    1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
    1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
    1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
    1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
    1586[class.dtor] NAD Naming a destructor via decltype Unknown
    1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
    1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
    1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
    1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
    1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
    1592[temp.arg.template] C++14 When do template parameters match? Unknown
    1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
    1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
    1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
    1596[expr.rel] CD4 Non-array objects as array[1] Unknown
    1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
    1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
    1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
    1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
    1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
    1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
    1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
    1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
    1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
    1606[expr.sizeof] NAD sizeof closure class Clang 3.1
    1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
    1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
    1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
    1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
    1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
    1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
    1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
    1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
    1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
    1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
    1617[dcl.align] open alignas and non-defining declarations Not resolved
    1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
    1619[temp.dep.type] open Definition of current instantiation Not resolved
    1620[over.literal] open User-defined literals and extended integer types Not resolved
    1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
    1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
    1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
    1624[except.ctor] NAD Destruction of union members with member initializers Unknown
    1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
    1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
    1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
    1628[expr.new] open Deallocation function templates Not resolved
    1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
    1630[dcl.init] CD4 Multiple default constructor templates Unknown
    1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
    1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
    1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
    1634[basic.stc] open Temporary storage duration Not resolved
    1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
    1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
    1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
    1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
    1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
    1640[dcl.array] CD5 Array of abstract instance of class template Unknown
    1641[class.base.init] NAD Assignment in member initializer Unknown
    1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
    1643[temp.param] NAD Default arguments for template parameter packs Unknown
    1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
    1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
    1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
    1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
    1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
    1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
    1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
    1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
    1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
    1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
    1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
    1655[lex.pptoken] open Line endings in raw string literals Not resolved
    1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
    1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
    1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
    1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
    1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
    1661[intro.multithread] NAD Preservation of infinite loops Unknown
    1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
    1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
    1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
    1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
    1666[temp.arg.nontype] C++14 Address constant expressions Unknown
    1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
    1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
    1669[basic.start.main] C++14 auto return type for main Unknown
    1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
    1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
    1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
    1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
    1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
    1675[implimits] NAD Size limit for automatic array object Unknown
    1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
    1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
    1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
    1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
    1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
    1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
    1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
    1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
    1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
    1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
    1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
    1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
    1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
    1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
    1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
    1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
    1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
    1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
    1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
    1695[class.temporary] NAD Lifetime extension via init-capture Unknown
    1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
    1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
    1698[lex.phases] CD7 Files ending in \ Unknown
    1699[class.friend] open Does befriending a class befriend its friends? Not resolved
    1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
    1701[basic.types] open Array vs sequence in object representation Not resolved
    1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
    1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
    1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
    1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
    1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
    1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
    1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
    1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
    1710[class.derived] C++17 Missing template keyword in class-or-decltype No
    1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
    1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
    1713[dcl.link] dup Linkage of variable template specializations Unknown
    1714[class.local] NAD odr-use of this from a local class Unknown
    1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
    1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
    1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
    1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
    1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
    1720[cpp.include] NAD Macro invocation in #include directive Unknown
    1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
    1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
    1723[lex.ext] open Multicharacter user-defined character literals Not resolved
    1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
    1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
    1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
    1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
    1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
    1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
    1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
    1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
    1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
    1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
    1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
    1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
    1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
    1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
    1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
    1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
    1740[except.spec] C++14 Disambiguation of noexcept Unknown
    1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
    1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
    1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
    1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
    1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
    1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
    1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
    1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
    1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
    1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
    1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
    1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
    1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
    1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
    1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
    1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
    1757[expr.const] CD4 Const integral subobjects Unknown
    1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
    1759[lex.string] C++14 UTF-8 code units in plain char Unknown
    1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
    1761[dcl.array] NAD Runtime check on size of automatic array Unknown
    1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
    1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
    1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
    1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
    1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
    1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
    1768[dcl.array] NAD Zero-element array of runtime bound Unknown
    1769[except.handle] C++14 Catching a base class of the exception object Unknown
    1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
    1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
    1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
    1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
    1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
    1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
    1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
    1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
    1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
    1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
    1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
    1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
    1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
    1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
    1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
    1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
    1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
    1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
    1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
    1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
    1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
    1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
    1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
    1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
    1794[temp.names] C++17 template keyword and alias templates Clang 2.7
    1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
    1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
    1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
    1798[except.spec] NAD exception-specifications of template arguments Unknown
    1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
    1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
    1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
    1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
    1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
    1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
    1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
    1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
    1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
    1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
    1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
    1810[lex.ext] CD4 Invalid ud-suffixes Unknown
    1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
    1812[temp.names] C++17 Omission of template in a typename-specifier No
    1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
    1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
    1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
    1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
    1817[dcl.link] open Linkage specifications and nested scopes Not resolved
    1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
    1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
    1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
    1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
    1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
    1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
    1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
    1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
    1826[expr.const] NAD const floating-point in constant expressions Unknown
    1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
    1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
    1829[temp.dep.type] CD6 Dependent unnamed types Unknown
    1830[dcl.pre] CD4 Repeated specifiers Unknown
    1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
    1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
    1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
    1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
    1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
    1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
    1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
    1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
    1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
    1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
    1841[temp.local] CD6 < following template injected-class-name Unknown
    1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
    1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
    1844[temp.deduct] open Defining “immediate context” Not resolved
    1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
    1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
    1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
    1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
    1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
    1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
    1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
    1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
    1853[basic.life] dup Defining “allocated storage” Unknown
    1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
    1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
    1856[temp.inst] open Indirect nested classes of class templates Not resolved
    1857[expr.shift] CD5 Additional questions about bits Unknown
    1858[expr.eq] CD4 Comparing pointers to union members Unknown
    1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
    1860[class.union] C++17 What is a “direct member?” Unknown
    1861[class.bit] CD4 Values of a bit-field Unknown
    1862[temp.friend] CD5 Determining “corresponding members” for friendship No
    1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
    1864[dcl.init.list] NAD List-initialization of array objects Unknown
    1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
    1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
    1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
    1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
    1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
    1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
    1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
    1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
    1873[class.access.base] CD4 Protected member access from derived class friends Unknown
    1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
    1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
    1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
    1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
    1878[dcl.spec.auto] CD4 operator auto template Clang 18
    1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
    1880[expr.call] CD4 When are parameter objects destroyed? Unknown
    1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
    1882[global.names] CD4 Reserved names without library use Unknown
    1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
    1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
    1885[expr.call] CD4 Return value of a function is underspecified Unknown
    1886[basic.start.main] CD4 Language linkage for main() Unknown
    1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
    1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
    1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
    1890[class.mem] drafting Member type depending on definition of member function Not resolved
    1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
    1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
    1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
    1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
    1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
    1896[temp.alias] CD6 Repeated alias templates Unknown
    1897[basic.def.odr] review ODR vs alternative tokens Not resolved
    1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
    1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
    1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
    1901[lex.token] open punctuator referenced but not defined Not resolved
    1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
    1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
    1904[temp.param] NAD Default template arguments for members of class templates Unknown
    1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
    1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
    1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
    1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
    1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
    1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
    1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
    1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
    1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
    1914[dcl.attr] extension Duplicate standard attributes Extension
    1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
    1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
    1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
    1918[temp.friend] CD5 friend templates with dependent scopes No
    1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
    1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
    1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
    1922[temp.local] CD4 Injected class template names and default arguments Unknown
    1923[expr.unary.op] NAD Lvalues of type void Unknown
    1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
    1925[expr.comma] CD4 Bit-field prvalues Unknown
    1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
    1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
    1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
    1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
    1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
    1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
    1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
    1933[implimits] NAD Implementation limit for initializer-list elements Unknown
    1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
    1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
    1936[temp.dep] CD6 Dependent qualified-ids Unknown
    1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
    1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
    1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
    1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
    1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
    1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
    1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
    1944[diff] open New C incompatibilities Not resolved
    1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
    1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
    1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
    1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
    1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
    1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
    1951[basic.types] CD4 Cv-qualification and literal types Unknown
    1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
    1953[intro.memory] CD7 Data races and common initial sequence Unknown
    1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
    1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
    1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
    1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
    1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
    1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
    1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
    1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
    1962[dcl.fct.def.general] open Type of __func__ Not resolved
    1963[lex.name] CD4 Implementation-defined identifier characters Unknown
    1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
    1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
    1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
    1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
    1968[expr.const] NAD Address of typeid in constant expressions No
    1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
    1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
    1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
    1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
    1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
    1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
    1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
    1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
    1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
    1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
    1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
    1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
    1981[conv] CD4 Implicit contextual conversions and explicit Unknown
    1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
    1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
    1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
    1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
    1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
    1987[class.static.data] NAD constexpr static data members across translation units Unknown
    1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
    1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
    1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
    1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
    1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
    1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
    1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
    1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
    1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
    1997[basic.indet] CD7 Placement new and previous initialization Unknown
    1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
    1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
    2000[lex.pptoken] CD4 header-name outside #include directive Unknown
    2001[cpp.pre] CD4 non-directive is underspecified Unknown
    2002[cpp.pre] open White space within preprocessing directives Not resolved
    2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
    2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
    2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
    2006[basic.compound] CD4 Cv-qualified void types Unknown
    2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
    2008[temp.arg] CD4 Default template-arguments underspecified Unknown
    2009[basic.scope.class] CD6 Unclear specification of class scope N/A
    2010[except.spec] CD4 exception-specifications and conversion operators Unknown
    2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
    2012[basic.stc] CD4 Lifetime of references Unknown
    2013[expr.add] drafting Pointer subtraction in large array Not resolved
    2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
    2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
    2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
    2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
    2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
    2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
    2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
    2021[temp.over.link] dup Function template redeclaration via alias template Unknown
    2022[expr.const] CD4 Copy elision in constant expressions Unknown
    2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
    2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
    2025[temp.over.link] dup Declaration matching via alias templates Unknown
    2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
    2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
    2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
    2029[expr.call] dup Abstract class return type in decltype operand Unknown
    2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
    2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
    2032[temp.param] CD4 Default template-arguments of variable templates Unknown
    2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
    2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
    2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
    2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
    2037[temp.type] drafting Alias templates and template declaration matching Not resolved
    2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
    2039[except.spec] CD4 Constant conversions to bool Unknown
    2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
    2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
    2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
    2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
    2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
    2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
    2046[intro.multithread] C++17 Incomplete thread specifications Unknown
    2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
    2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
    2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
    2050[dcl.stc] NAD Consolidate specification of linkage Unknown
    2051[basic.lval] CD5 Simplifying alias rules Unknown
    2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
    2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
    2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
    2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
    2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
    2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
    2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
    2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
    2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
    2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
    2062[temp.class] CD6 Class template redeclaration requirements Unknown
    2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
    2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
    2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
    2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
    2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
    2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
    2069[class.dtor] CD4 Do destructors have names? Unknown
    2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
    2071[dcl.typedef] CD4 typedef with no declarator Unknown
    2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
    2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
    2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
    2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
    2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
    2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
    2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
    2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
    2080[class.union] CD5 Example with empty anonymous union member Unknown
    2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
    2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
    2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
    2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
    2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
    2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
    2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
    2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
    2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
    2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
    2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
    2092[temp.over] CD5 Deduction failure and overload resolution Unknown
    2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
    2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
    2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
    2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
    2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
    2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
    2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
    2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
    2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
    2102[expr.new] CD7 Constructor checking in new-expression Unknown
    2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
    2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
    2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
    2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
    2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
    2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
    2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
    2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
    2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
    2112[expr.new] CD5 new auto{x} Unknown
    2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
    2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
    2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
    2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
    2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
    2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
    2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
    2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
    2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
    2122[basic.lval] CD4 Glvalues of void type Unknown
    2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
    2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
    2125[class.copy.elision] NAD Copy elision and comma operator Unknown
    2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
    2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
    2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
    2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
    2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
    2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
    2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
    2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
    2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
    2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
    2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
    2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
    2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
    2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
    2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
    2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
    2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
    2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
    2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
    2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
    2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
    2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
    2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
    2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
    2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
    2151[intro.object] CD4 Exception object is not created Unknown
    2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
    2153[class.mem] CD4 pure-specifier in friend declaration Unknown
    2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
    2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
    2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
    2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
    2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
    2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
    2160[temp.func.order] open Issues with partial ordering Not resolved
    2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
    2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
    2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
    2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
    2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
    2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
    2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
    2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
    2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
    2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
    2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
    2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
    2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
    2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
    2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
    2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
    2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
    2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
    2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
    2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
    2181[implimits] C++20 Normative requirements in an informative Annex Unknown
    2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
    2183[except.spec] NAD Problems in description of potential exceptions Unknown
    2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
    2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
    2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
    2187[class.protected] drafting Protected members and access via qualified-id Not resolved
    2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
    2189[over.call.object] open Surrogate call template Not resolved
    2190[cpp.cond] open Insufficient specification of __has_include Not resolved
    2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
    2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
    2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
    2194[over.match.list] drafting Impossible case in list initialization Not resolved
    2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
    2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
    2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
    2198[basic.link] C++17 Linkage of enumerators Unknown
    2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
    2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
    2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
    2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
    2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
    2204[class.base.init] NAD Naming delegated constructors Unknown
    2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
    2206[expr] C++17 Composite type of object and function pointers Unknown
    2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
    2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
    2209[except.ctor] NAD Destruction of constructed array elements Unknown
    2210[except.ctor] NAD Principal/target constructor confusion Unknown
    2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
    2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
    2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
    2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
    2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
    2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
    2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
    2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
    2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
    2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
    2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
    2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
    2223[dcl.align] drafting Multiple alignas specifiers Not resolved
    2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
    2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
    2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
    2227[class.base.init] CD5 Destructor access and default member initializers Unknown
    2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
    2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
    2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
    2231[expr.ref] NAD Class member access to static data member template Unknown
    2232[dcl.stc] open thread_local anonymous unions Not resolved
    2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
    2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
    2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
    2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
    2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
    2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
    2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
    2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
    2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
    2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
    2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
    2244[class.protected] open Base class access in aggregate initialization Not resolved
    2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
    2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
    2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
    2248[expr.delete] C++17 Problems with sized delete Unknown
    2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
    2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
    2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
    2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
    2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
    2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
    2255[temp.spec] CD5 Instantiated static data member templates Unknown
    2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
    2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
    2258[basic.life] open Storage deallocation during period of destruction Not resolved
    2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
    2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
    2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
    2262[dcl.asm] C++17 Attributes for asm-definition Unknown
    2263[temp.inst] drafting Default argument instantiation for friends Not resolved
    2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
    2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
    2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
    2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
    2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
    2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
    2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
    2271[class.ctor] C++17 Aliasing this Unknown
    2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
    2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
    2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
    2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
    2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
    2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
    2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
    2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
    2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
    2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
    2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
    2283[expr.call] CD7 Missing complete type requirements Unknown
    2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
    2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
    2286[expr.assign] NAD Assignment evaluation order Unknown
    2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
    2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
    2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
    2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
    2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
    2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
    2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
    2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
    2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
    2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
    2297[intro.races] open Unclear specification of atomic operations Not resolved
    2298[intro.races] open Actions and expression evaluation Not resolved
    2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
    2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
    2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
    2302[expr.eq] NAD Address comparison between different member subobjects Unknown
    2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
    2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
    2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
    2306[temp.friend] NAD Nested friend templates of class templates Unknown
    2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
    2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
    2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
    2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
    2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
    2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
    2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
    2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
    2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
    2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
    2317[class.base.init] CD5 Self-referential default member initializers Unknown
    2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
    2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
    2320[stmt.if] extension constexpr if and boolean conversions Extension
    2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
    2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
    2323[basic.types] C++20 Expunge POD Unknown
    2324[intro.object] drafting Size of base class subobject Not resolved
    2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
    2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
    2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
    2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
    2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
    2330[temp.spec] CD5 Missing references to variable templates Unknown
    2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
    2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
    2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
    2334[intro.object] open Creation of objects by typeid Not resolved
    2335[class.static.data] drafting Deduced return types vs member types Not resolved
    2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
    2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
    2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
    2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
    2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
    2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
    2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
    2343[temp.param] C++20 void* non-type template parameters Unknown
    2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
    2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
    2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
    2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
    2348[stmt.if] NAD Non-templated constexpr if Unknown
    2349[stmt] NAD Class/enumeration names vs conditions Unknown
    2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
    2351[expr.type.conv] CD5 void{} Clang 20
    2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
    2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
    2354[basic.align] CD5 Extended alignment and object representation Clang 15
    2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
    2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
    2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
    2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
    2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
    2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
    2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
    2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
    2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
    2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
    2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
    2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
    2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
    2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
    2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
    2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
    2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
    2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
    2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
    2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
    2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
    2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
    2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
    2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
    2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
    2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
    2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
    2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
    2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
    2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
    2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
    2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
    2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
    2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
    2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
    2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
    2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
    2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
    2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
    2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
    2395[temp.param] drafting Parameters following a pack expansion Not resolved
    2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
    2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
    2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
    2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
    2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
    2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
    2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
    2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
    2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
    2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
    2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
    2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
    2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
    2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
    2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
    2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
    2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
    2413[temp.res] CD6 typename in conversion-function-ids Unknown
    2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
    2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
    2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
    2417[except.spec] open Explicit instantiation and exception specifications Not resolved
    2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
    2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
    2420[except.spec] dup Exception specifications in explicit instantiation Unknown
    2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
    2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
    2423[basic.pre] NAD Typedefs, names, and entities Unknown
    2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
    2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
    2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
    2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
    2428[temp.concept] C++23 Deprecating a concept Clang 19
    2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
    2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
    2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
    2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
    2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
    2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
    2435[temp.spec] open Alias template specializations Not resolved
    2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
    2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
    2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
    2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
    2440[expr.const] C++23 Allocation in core constant expressions Unknown
    2441[dcl.inline] C++20 Inline function parameters Unknown
    2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
    2443[module.interface] C++23 Meaningless template exports Unknown
    2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
    2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
    2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
    2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
    2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
    2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
    2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
    2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
    2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
    2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
    2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
    2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
    2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
    2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
    2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
    2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
    2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
    2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
    2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
    2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
    2464[ptr.launder] CD6 Constexpr launder and unions Unknown
    2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
    2466[expr.await] CD6 co_await should be a single evaluation Unknown
    2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
    2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
    2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
    2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
    2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
    2472[expr.await] NAD Value categories in await-expressions Unknown
    2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
    2474[expr.delete] CD6 Cv-qualification and deletion Unknown
    2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
    2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
    2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
    2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
    2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
    2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
    2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
    2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
    2483[dcl.link] C++23 Language linkage of static member functions Unknown
    2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
    2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
    2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
    2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
    2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
    2489[intro.object] C++23 Storage provided by array of char Unknown
    2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
    2491[module.interface] CD6 Export of typedef after its first declaration Unknown
    2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
    2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
    2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
    2495[stmt.return] open Glvalue result of a function call Not resolved
    2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
    2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
    2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
    2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
    2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
    2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
    2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
    2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
    2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
    2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
    2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
    2507[over.oper.general] CD6 Default arguments for operator[] Unknown
    2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
    2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
    2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
    2511[class.bit] CD6 cv-qualified bit-fields Unknown
    2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
    2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
    2514[basic.life] open Modifying const subobjects Not resolved
    2515[expr.call] open Result of a function call Not resolved
    2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
    2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
    2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
    2519[basic.types.general] CD7 Object representation of a bit-field Unknown
    2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
    2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
    2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
    2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
    2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
    2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
    2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
    2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
    2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
    2529[expr.const] C++23 Constant destruction of constexpr references Unknown
    2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
    2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
    2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
    2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
    2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
    2535[expr.ref] CD6 Type punning in class member access Unknown
    2536[expr.const] open Partially initialized variables during constant initialization Not resolved
    2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
    2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
    2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
    2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
    2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
    2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
    2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
    2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
    2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
    2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
    2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
    2548[expr.add] NAD Array prvalues and additive operators Unknown
    2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
    2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
    2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
    2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
    2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
    2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
    2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
    2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
    2557[expr.ref] review Class member access referring to an unrelated class Not resolved
    2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
    2559[expr.const] open Defaulted consteval functions Not resolved
    2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
    2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
    2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
    2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
    2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
    2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
    2566[expr.new] review Matching deallocation for uncaught exception Not resolved
    2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
    2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
    2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
    2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
    2571[expr.sub] CD6 Evaluation order for subscripting Unknown
    2572[over.over] review Address of overloaded function with no target Not resolved
    2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
    2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
    2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
    2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
    2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
    2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
    2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
    2580[cpp.line] CD7 Undefined behavior with #line Unknown
    2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
    2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
    2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
    2584[temp.over.link] open Equivalent types in function template declarations Not resolved
    2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
    2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
    2587[intro.races] review Visible side effects and initial value of an object Not resolved
    2588[class.friend] CD7 friend declarations and module linkage Unknown
    2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
    2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
    2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
    2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
    2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
    2594[basic.start.main] CD6 Disallowing a global function template main Unknown
    2595[special] CD7 "More constrained" for eligible special member functions Unknown
    2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
    2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
    2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
    2599[expr.call] C++23 What does initializing a parameter include? Unknown
    2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
    2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
    2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
    2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
    2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
    2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
    2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
    2607[module.interface] drafting Visibility of enumerator names Not resolved
    2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
    2609[expr.sizeof] open Padding in class types Not resolved
    2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
    2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
    2612[dcl.init.general] C++23 Incorrect comment in example Unknown
    2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
    2614[expr.ref] C++23 Unspecified results for class member access Unknown
    2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
    2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
    2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
    2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
    2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
    2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
    2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
    2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
    2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
    2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
    2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
    2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
    2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
    2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
    2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
    2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
    2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
    2632[intro.defs] drafting 'user-declared' is not defined Not resolved
    2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
    2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
    2635[dcl.pre] C++23 Constrained structured bindings Clang 16
    2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
    2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
    2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
    2639[lex.phases] C++23 new-lines after phase 1 Unknown
    2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
    2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
    2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
    2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
    2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
    2645[expr.call] C++23 Unused term "default argument promotions" Unknown
    2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
    2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
    2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
    2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
    2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
    2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
    2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
    2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
    2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
    2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
    2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
    2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
    2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
    2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
    2660[expr.call] open Confusing term "this parameter" Not resolved
    2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
    2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
    2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
    2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
    2665[basic.life] NAD Replacing a subobject with a complete object Unknown
    2666[class.temporary] open Lifetime extension through static_cast Not resolved
    2667[cpp.import] C++23 Named module imports do not import macros Unknown
    2668[expr.await] CD7 co_await in a lambda-expression Unknown
    2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
    2670[basic.link] open Programs and translation units Not resolved
    2671[dcl.meaning.general] open friend named by a template-id Not resolved
    2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
    2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
    2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
    2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
    2676[basic.life] open Replacing a complete object having base subobjects Not resolved
    2677[basic.life] review Replacing union subobjects Not resolved
    2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
    2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
    2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
    2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
    2682[temp.pre] C++23 Templated function vs. function template Unknown
    2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
    2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
    2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
    2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
    2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
    2688[expr.call] open Calling explicit object member functions Not resolved
    2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
    2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
    2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
    2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
    2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
    2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
    2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
    2696[expr.rel] dup Relational comparisons of pointers to void Unknown
    2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
    2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
    2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
    2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
    2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
    2702[expr.const] open Constant destruction of reference members Not resolved
    2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
    2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
    2705[expr.ref] open Accessing ambiguous subobjects Not resolved
    2706[basic.link] open Repeated structured binding declarations Not resolved
    2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
    2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
    2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
    2710[expr.const] CD7 Loops in constant expressions Unknown
    2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
    2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
    2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
    2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
    2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
    2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
    2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
    2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
    2719[basic.align] CD7 Creating objects in misaligned storage Unknown
    2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
    2721[basic.life] CD7 When exactly is storage reused? Unknown
    2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
    2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
    2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
    2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
    2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
    2727[module.import] open Importing header units synthesized from source files Not resolved
    2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
    2729[expr.new] CD7 Meaning of new-type-id Unknown
    2730[over.match.oper] open Comparison templates on enumeration types Not resolved
    2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
    2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
    2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
    2734[expr.const] open Immediate forward-declared function templates Not resolved
    2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
    2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
    2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
    2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
    2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
    2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
    2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
    2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
    2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
    2744[intro.object] open Multiple objects of the same type at the same address Not resolved
    2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
    2746[temp.res.general] CD7 Checking of default template arguments Unknown
    2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
    2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
    2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
    2750[expr.const] CD7 construct_at without constructor call Unknown
    2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
    2752[lex.fcon] open Excess-precision floating-point literals Not resolved
    2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
    2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
    2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
    2756[class.init] review Completion of initialization by delegating constructor Not resolved
    2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
    2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
    2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
    2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
    2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
    2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
    2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
    2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
    2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
    2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
    2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
    2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
    2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
    2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
    2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
    2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
    2773[class.union.anon] open Naming anonymous union members as class members Not resolved
    2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
    2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
    2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
    2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
    2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
    2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
    2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
    2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
    2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
    2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
    2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
    2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
    2786[expr.eq] open Comparing pointers to complete objects Not resolved
    2787[special] open Kind of explicit object copy/move assignment function Not resolved
    2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
    2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
    2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
    2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
    2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
    2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
    2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
    2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
    2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
    2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
    2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
    2799[class.default.ctor] drafting Inheriting default constructors Not resolved
    2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
    2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
    2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
    2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
    2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
    2805[expr.delete] open Underspecified selection of deallocation function Not resolved
    2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
    2807[class.dtor] CD7 Destructors declared consteval Unknown
    2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
    2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
    2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
    2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
    2812[expr.new] open Allocation with explicit alignment Not resolved
    2813[expr.ref] CD7 Class member access with prvalues Clang 20
    2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
    2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
    2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
    2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
    2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
    2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
    2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
    2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
    2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
    2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
    2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
    2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
    2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
    2827[basic.fundamental] review Representation of unsigned integral types Not resolved
    2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
    2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
    2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
    2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
    2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
    2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
    2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
    2835[basic.scope.scope] open Name-independent declarations Not resolved
    2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
    2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
    2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
    2839[class.dtor] open Explicit destruction of base classes Not resolved
    2840[basic.align] open Missing requirements for fundamental alignments Not resolved
    2841[class.ctor.general] open When do const objects start being const? Not resolved
    2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
    2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
    2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
    2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
    2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
    2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
    2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
    2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
    2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
    2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
    2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
    2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
    2854[except.throw] CD7 Storage duration of exception objects Unknown
    2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
    2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
    2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
    2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
    2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
    2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
    2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
    2862[temp.pre] review Unclear boundaries of template declarations Not resolved
    2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
    2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
    2865[expr.cond] CD7 Regression on result of conditional operator Unknown
    2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
    2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
    2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
    2869[expr.prim.this] CD7 this in local classes Unknown
    2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
    2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
    2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
    2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
    2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
    2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
    2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
    2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
    2878[expr.cast] open C-style casts to reference types Not resolved
    2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
    2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
    2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
    2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
    2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
    2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
    2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
    2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
    2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
    2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
    2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
    2890[class.local] CD7 Defining members of local classes Unknown
    2891[implimits] CD7 Normative status of implementation limits Unknown
    2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
    2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
    2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
    2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
    2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
    2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
    2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
    2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
    2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
    2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
    2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
    2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
    2904[temp.pre] open Introducing template-names Not resolved
    2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
    2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
    2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
    2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
    2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
    2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
    2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
    2912[expr.new] open Too-large value for size in array new Not resolved
    2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
    2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
    2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
    2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
    2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
    2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
    2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
    2920[temp.names] open The template keyword for base classes Not resolved
    2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
    2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
    2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
    2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
    2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
    2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
    2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
    2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
    2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
    2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
    2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
    2932[dcl.enum] review Value range of empty enumeration Not resolved
    2933[expr.type] CD7 Dangling references Unknown
    2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
    2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
    2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
    2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
    2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
    2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
    2940[intro.object] review Definition of "object" Not resolved
    2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
    2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
    2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
    2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
    2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
    2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
    2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
    2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
    2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
    2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
    2951[temp.decls.general] open Distinguishing a primary template Not resolved
    2952[basic.life] open Vacuous initialization for subobjects Not resolved
    2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
    2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
    2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
    2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
    2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
    2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
    2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
    2960[basic.life] open Introduce discontiguous object lifetime Not resolved
    2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
    2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
    2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
    2964[conv.lval] open Reading "invalid pointer values" Not resolved
    2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
    2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
    2967[over.match.ref] open Explicit conversion functions Not resolved
    2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
    2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
    2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
    2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
    2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
    2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
    2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
    2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
    2976[stmt.dcl] review Transferring control out of a function Not resolved
    2977[dcl.init.general] review Initialization with string literals Not resolved
    2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
    2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
    2980[temp.names] open Constraints on template template parameters Not resolved
    2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
    2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
    2983[basic.pre] review Non-type template parameters are not variables Not resolved
    2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
    2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
    2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
    2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
    2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
    2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
    2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
    2991[dcl.init.general] open "array size" is vague Not resolved
    2992[basic.pre] open Labels do not have names Not resolved
    2993[dcl.fct.def.general] open Body of a destructor Not resolved
    2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
    2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
    2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
    2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
    2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
    2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
    3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
    3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
    3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
    3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
    3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
    3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
    3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
    3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
    3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
    3009[expr.const] open Unclear rules for constant initialization Not resolved
    3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
    3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
    3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
    3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
    3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
    3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
    3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
    3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
    3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
    3019[lex.header] open Restrictions on character sequences in header-names Not resolved
    3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
    3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
    3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
    3023[dcl.init.list] open Default arguments in list-initialization Not resolved
    3024[dcl.align] open Alignment of references Not resolved
    3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
    3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
    3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
    3028[namespace.udecl] open A using-declarator should bind a name Not resolved
    3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
    3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
    3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
    3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
    3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
    3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
    3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
    3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
    3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
    3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
    3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
    3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
    3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
    3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
    3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
    3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
    3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
    3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
    3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
    3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
    3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
    3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
    3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
    3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
    3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
    3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
    3055[over.call.object] open Misleading body for surrogate call function Not resolved
    3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
    3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
    3058[basic.lookup.general] open "Program point" is not defined Not resolved
    3059[expr.const] open throw; in constant expressions Not resolved
    3060[basic.start.main] open Change in behavior for noexcept main Not resolved
    3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
    3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
    3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
    3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
    3065[basic.types.general] open Reachability and completeness of types Not resolved
    3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
    3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
    3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
    3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
    3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
    3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
    3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
    3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
    3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
    3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
    3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
    3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
    3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
    3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
    3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
    3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
    3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
    3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
    3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
    3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
    3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
    3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
    3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
    3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
    3090[module.interface] tentatively ready Internal linkage from header units Not resolved
    3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
    3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
    3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
    3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
    3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
    3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
    3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
    3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
    3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
    3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved